qa: Improve failure reporting for watchdog timeouts (wip_tentacle_stat_mismatch_logging_invalidate_only)
author Bill Scales <bill_scales@uk.ibm.com>
Thu, 14 Aug 2025 12:02:49 +0000 (13:02 +0100)
committer Jon <jonathan.bailey1@ibm.com>
Fri, 3 Oct 2025 13:31:28 +0000 (14:31 +0100)
A daemon such as an osd or mon crashing can lead to teuthology tests
failing; these failures are detected by the daemon watchdog. Currently
the failure message just indicates that a watchdog timeout occurred.

Add an AssertTracker class that processes the stderr output from
daemon processes, taking inspiration from teuthology's scrape.py to
capture the assert string and backtrace if the daemon crashes.
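
For illustration, here is a minimal, self-contained sketch of the
parsing idea (not the patch itself; the sample stderr lines are
illustrative only, chosen to match the markers the tracker keys off:
the assert message, the " ceph version" line that opens a backtrace,
and the clone() frame that ends it):

    import io

    class MiniAssertTracker(io.StringIO):
        """Toy version of the AssertTracker added in this patch."""
        def __init__(self):
            super().__init__()
            self.assertion = None    # last assert/abort line seen
            self.bt_lines = []       # backtrace being accumulated
            self.in_bt = False
            self.backtrace = None    # final assertion + backtrace text

        def write(self, line):
            if "FAILED assert" in line or "__ceph_assert_fail" in line:
                self.assertion = line.strip()
            if line.startswith(" ceph version"):
                # a backtrace starts with the ceph version banner
                self.bt_lines = [line]
                self.in_bt = True
            elif self.in_bt and "clone()" in line:
                # the clone() frame marks the end of the backtrace
                if self.assertion is not None:
                    self.bt_lines.insert(0, self.assertion + "\n")
                self.backtrace = "".join(self.bt_lines).strip()
                self.in_bt = False
            elif self.in_bt:
                self.bt_lines.append(line)
            return len(line)

    # Illustrative stderr lines (format only, not from a real run):
    t = MiniAssertTracker()
    for l in ['osd.0: FAILED assert(0 == "example")\n',
              " ceph version 20.0.0 (hypothetical)\n",
              " 1: (ceph::__ceph_assert_fail(...)+0x12c) [0x5600deadbeef]\n",
              " 2: clone()\n"]:
        t.write(l)
    print(t.backtrace)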

Enhance the watchdog to report the assertion and backtrace if a
daemon has crashed, so this gets reported as the teuthology job
failure reason.
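
The resulting failure reason is roughly "Daemon X has timed out"
followed by "First crash: Daemon Y at <time>" and the captured
assertion/backtrace. A condensed sketch of the selection logic (the
helper name and arguments are invented for illustration; the real
code lives in daemonwatchdog.py below):

    import time

    def build_bark_reason(timed_out_names, trackers):
        """Pick the earliest recorded crash among the trackers and summarise it."""
        reasons = [f"Daemon {n} has timed out" for n in timed_out_names]
        earliest = None
        for t in trackers:
            if t.get_exception() is None:
                continue
            if earliest is None or t.get_failure_time() < earliest.get_failure_time():
                earliest = t
        if earliest is not None:
            reasons.append(f"First crash: Daemon {earliest.role}.{earliest.id_} "
                           f"at {time.ctime(earliest.get_failure_time())}")
            reasons.append(str(earliest.get_exception()))
        return "\n".join(reasons)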

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
qa/tasks/ceph.py
qa/tasks/daemonwatchdog.py

index ea860a135f9911c6f40188cdf4a7afee0889dc21..c566cafe2d9b8f94a8c7449f011d7d83c25d2645 100644 (file)
@@ -778,6 +778,7 @@ def cluster(ctx, config):
     ctx.ceph[cluster_name] = argparse.Namespace()
     ctx.ceph[cluster_name].conf = conf
     ctx.ceph[cluster_name].mons = mons
+    ctx.ceph[cluster_name].assert_trackers = []
 
     default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
     keyring_path = config.get('keyring_path', default_keyring)
@@ -1403,6 +1404,69 @@ def run_daemon(ctx, config, type_):
     :param config: Configuration
     :param type_: Role type
     """
+    class AssertTracker(StringIO):
+        MAX_BT_LINES = 100
+        DISPLAY_BT_LINES = 6
+
+        def __init__(self, cluster_name, type_, id_, logger):
+            super(AssertTracker, self).__init__()
+            self.role = cluster_name + '.' + type_
+            self.id_ = id_
+            self.bt_lines = []
+            self.assertion = None
+            self.backtrace = None
+            self.in_bt = False
+            self.logger = logger
+            self.failure_time = None
+
+        def write(self, line):
+            if "FAILED assert" in line or "__ceph_assert_fail" in line:
+                self.assertion = line.strip()
+            if line.startswith(" ceph version"):
+                # The start of a backtrace!
+                self.bt_lines = [line]
+                self.in_bt = True
+                self.failure_time = time.time()
+            elif line.startswith(" NOTE: a copy of the executable") or \
+                 line.strip().endswith("clone()") or \
+                 "clone()+0x" in line:
+                # The backtrace terminated
+                if len(self.bt_lines):
+                    if self.assertion is not None:
+                        self.bt_lines.insert(0, self.assertion)
+                        self.assertion = None
+                    # Limit number of lines of backtrace in the failure reason
+                    self.backtrace = ("".join(self.bt_lines[:self.DISPLAY_BT_LINES])).strip()
+                else:
+                    self.logger.warning("Saw end of backtrace but not start")
+                self.in_bt = False
+            elif self.in_bt:
+                # We're in a backtrace, push the line onto the list
+                if len(self.bt_lines) > self.MAX_BT_LINES:
+                    self.logger.warning("Failed to capture backtrace")
+                    self.bt_lines = ["Failed to capture backtrace"]
+                    self.in_bt = False
+                else:
+                    # Backtrace filtering - skip lines for libraries, but include
+                    # libraries with ceph or rados in their path or name
+                    if "ceph" in line or "rados" in line or not re.match(r" [0-9]+: /.*",line):
+                        # Truncate long lines
+                        line = re.sub(r"([0-9]+: .{80}).*(\+0x[0-9a-fA-F]*\) \[0x[0-9a-fA-F]*\])",r"\1 ... \2",line)
+                        self.bt_lines.append(line)
+
+        def get_exception(self):
+            if self.backtrace is not None:
+                return Exception(self.backtrace)
+            if self.assertion is not None:
+                return Exception(self.assertion)
+            return None
+
+        def get_failure_time(self):
+            return self.failure_time
+
+        def match_id(self, role, id_):
+            return self.role == role and self.id_ == id_
+
     cluster_name = config['cluster']
     log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
     testdir = teuthology.get_testdir(ctx)
@@ -1503,13 +1567,17 @@ def run_daemon(ctx, config, type_):
                 maybe_redirect_stderr(config, type_, run_cmd, log_path)
             if create_log_cmd:
                 remote.sh(create_log_cmd)
+
             # always register mgr; don't necessarily start
+            asserttracker = AssertTracker(cluster_name, type_, id_, log.getChild(role))
+            ctx.ceph[cluster_name].assert_trackers.append(asserttracker)
             ctx.daemons.register_daemon(
                 remote, type_, id_,
                 cluster=cluster_name,
                 args=run_cmd,
                 logger=log.getChild(role),
                 stdin=run.PIPE,
+                stderr=asserttracker,
                 wait=False
             )
             if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
index 7003a6b798598ea4950e979a160b002babb4437c..c81e2d3300add4c6d0e6a356c55f53687a4b1213 100644 (file)
@@ -1,6 +1,7 @@
 import logging
 import signal
 import time
+import traceback
 
 from gevent import sleep
 from gevent.greenlet import Greenlet
@@ -15,7 +16,7 @@ class BarkError(Exception):
     """
 
     def __init__(self, bark_reason):
-        self.message = f"The WatchDog barked due to {bark_reason}"
+        self.message = f"Watchdog timeout: {bark_reason}"
         super().__init__(self.message)
 
 class DaemonWatchdog(Greenlet):
@@ -33,8 +34,8 @@ class DaemonWatchdog(Greenlet):
     If an extended failure is detected in a daemon process (i.e. not intentional), then
     the watchdog will bark. It will also bark is an assert is raised during the running
     of any monitored Thrashers or WatchedProcesses
-    
-    When the watchdog barks it will 
+
+    When the watchdog barks it will
      - unmount file systems and send SIGTERM to all daemons.
      - stop all thrasher processes by calling their stop_and_join() method
      - stop any watched processes by calling their stop() method
@@ -61,6 +62,7 @@ class DaemonWatchdog(Greenlet):
         self.stopping = Event()
         self.thrashers = ctx.ceph[config["cluster"]].thrashers
         self.watched_processes = ctx.ceph[config["cluster"]].watched_processes
+        self.assert_trackers = ctx.ceph[config["cluster"]].assert_trackers
 
     def _run(self):
         try:
@@ -113,7 +115,7 @@ class DaemonWatchdog(Greenlet):
         daemon_timeout = int(self.config.get('daemon_timeout', 300))
         daemon_restart = self.config.get('daemon_restart', False)
         daemon_failure_time = {}
-        bark_reason: str = ""
+        bark_reason = []
         while not self.stopping.is_set():
             bark = False
             now = time.time()
@@ -131,39 +133,106 @@ class DaemonWatchdog(Greenlet):
             daemon_failures.extend(filter(lambda daemon: daemon.finished(), rgws))
             daemon_failures.extend(filter(lambda daemon: daemon.finished(), mgrs))
 
-            for daemon in daemon_failures:
-                name = daemon.role + '.' + daemon.id_
-                dt = daemon_failure_time.setdefault(name, (daemon, now))
-                assert dt[0] is daemon
-                delta = now-dt[1]
-                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
-                if delta > daemon_timeout:
-                    bark = True
-                    bark_reason = f"Daemon {name} has failed"
-                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
-                    self.log(f"attempting to restart daemon {name}")
-                    daemon.restart()
-
-            # If a daemon is no longer failed, remove it from tracking:
-            for name in list(daemon_failure_time.keys()):
-                if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
-                    self.log("daemon {name} has been restored".format(name=name))
-                    del daemon_failure_time[name]
-
-            for thrasher in self.thrashers:
-                if thrasher.exception is not None:
-                    self.log("{name} failed".format(name=thrasher.name))
-                    bark_reason = f"Thrasher {name} threw exception {thrasher.exception}"
-                    bark = True
-
-            for proc in self.watched_processes:
-                if proc.exception is not None:
-                    self.log("Remote process %s failed" % proc.id)
-                    bark_reason = f"Remote process {proc.id} threw exception {proc.exception}"
-                    bark = True
+            try:
+                earliest = None
+                failures = []
+                for daemon in daemon_failures:
+                    name = daemon.role + '.' + daemon.id_
+                    dt = daemon_failure_time.setdefault(name, (daemon, now))
+                    assert dt[0] is daemon
+                    delta = now-dt[1]
+                    self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+                    if delta > daemon_timeout:
+                        bark = True
+                        failures.append(f"{name}")
+                        for at in self.assert_trackers:
+                            if at.match_id(daemon.role, daemon.id_):
+                                exception = at.get_exception()
+                                if exception is not None:
+                                    if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+                                        earliest = at
+
+                    if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+                        self.log(f"attempting to restart daemon {name}")
+                        daemon.restart()
+
+                if len(failures) > 1:
+                    # Remove common prefixes from failure list
+                    last = ""
+                    failures2 = []
+                    for f in failures:
+                        p = last.split(".")
+                        n = f.split(".")
+                        if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+                            failures2.append(f)
+                        elif p[1] != n[1] or len(n) == 2:
+                            failures2.append(".".join(n[1:]))
+                        else:
+                            failures2.append(".".join(n[2:]))
+                        last = f
+                    failure_list = ",".join(failures2)
+                    bark_reason.append(f"Multiple daemons {failure_list} have timed out")
+                elif len(failures) == 1:
+                    bark_reason.append(f"Daemon {failures[0]} has timed out")
+
+                # If a daemon is no longer failed, remove it from tracking:
+                for name in list(daemon_failure_time.keys()):
+                    if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+                        self.log("daemon {name} has been restored".format(name=name))
+                        del daemon_failure_time[name]
+
+                for thrasher in self.thrashers:
+                    if thrasher.exception is not None:
+                        self.log("{name} failed".format(name=thrasher.name))
+                        bark_reason.append(f"Thrasher {thrasher.name} threw exception {thrasher.exception}")
+                        bark = True
+
+                for proc in self.watched_processes:
+                    if proc.exception is not None:
+                        self.log("Remote process %s failed" % proc.id)
+                        bark_reason.append(f"Remote process {proc.id} threw exception {proc.exception}")
+                        bark = True
+
+                # If a thrasher or watched process failed then check for the earliest daemon failure
+                failures = []
+                if bark and earliest is None:
+                    for at in self.assert_trackers:
+                        exception = at.get_exception()
+                        if exception is not None:
+                            failures.append(f"{at.role}.{at.id_}")
+                            if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+                                earliest = at
+
+                if len(failures) > 1:
+                    # Remove common prefixes from failure list
+                    last = ""
+                    failures2 = []
+                    for f in failures:
+                        p = last.split(".")
+                        n = f.split(".")
+                        if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+                            failures2.append(f)
+                        elif p[1] != n[1] or len(n) == 2:
+                            failures2.append(".".join(n[1:]))
+                        else:
+                            failures2.append(".".join(n[2:]))
+                        last = f
+                    failure_list = ",".join(failures2)
+                    bark_reason.append(f"Multiple daemons {failure_list} have crashed")
+                elif len(failures) == 1:
+                    bark_reason.append(f"Daemon {failures[0]} has crashed")
+
+                # Only report details about one daemon failure
+                if earliest is not None:
+                    bark_reason.append(f"First crash: Daemon {earliest.role}.{earliest.id_} at {time.ctime(earliest.get_failure_time())}")
+                    bark_reason.append(f"{earliest.get_exception()}")
+
+            except Exception:
+                bark = True
+                bark_reason.append(f"Watchdog bug: {traceback.format_exc()}")
 
             if bark:
-                self.bark(bark_reason)
+                self.bark("\n".join(bark_reason))
                 return
 
             sleep(5)