From 63238d6e36da8c0663a3694d852a94c090857960 Mon Sep 17 00:00:00 2001
From: Bill Scales
Date: Thu, 14 Aug 2025 13:02:49 +0100
Subject: [PATCH] qa: Improve failure reporting for watchdog timeouts

A daemon such as an osd or mon crashing can lead to teuthology tests
failing; these failures are detected by the daemon watchdog. Currently
the failure message just indicates that a watchdog timeout occurred.

Add an AssertTracker class that processes the stderr output from daemon
processes, taking inspiration from teuthology's scrape.py, to capture the
assert string and backtrace if the daemon crashes.

Enhance the watchdog to report the assertion and backtrace if a daemon
has crashed, so this gets reported as the teuthology job failure reason.

Signed-off-by: Bill Scales
---
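Note for reviewers: the snippet below is a minimal, standalone sketch (not
code from this patch) of the stderr state machine that AssertTracker
implements, so it is easy to see what ends up in the failure reason. The
crash output, daemon names and addresses are invented, and the sketch leaves
out the StringIO plumbing, failure timestamps and long-line truncation of
the real class.

import re

class MiniAssertTracker:
    """Simplified stand-in for the AssertTracker added by this patch."""
    DISPLAY_BT_LINES = 6

    def __init__(self):
        self.assertion = None
        self.backtrace = None
        self.bt_lines = []
        self.in_bt = False

    def write(self, line):
        if "FAILED assert" in line or "__ceph_assert_fail" in line:
            self.assertion = line.strip()
        if line.startswith(" ceph version"):
            # " ceph version ..." marks the start of a dumped backtrace
            self.bt_lines = [line]
            self.in_bt = True
        elif line.startswith(" NOTE: a copy of the executable") or "clone()" in line:
            # end of the backtrace: keep the assert plus the first few frames
            if self.assertion is not None:
                self.bt_lines.insert(0, self.assertion + "\n")
            self.backtrace = "".join(self.bt_lines[:self.DISPLAY_BT_LINES]).strip()
            self.in_bt = False
        elif self.in_bt:
            # drop frames from unrelated libraries, as the real class does
            if "ceph" in line or "rados" in line or not re.match(r" [0-9]+: /.*", line):
                self.bt_lines.append(line)

sample_stderr = [  # entirely made-up crash output
    './src/osd/OSD.cc: 4242: FAILED assert(0 == "example bug")\n',
    " ceph version 20.0.0 (0000000) example (dev)\n",
    " 1: (OSD::do_example_thing()+0x4d2) [0x5555cafef00d]\n",
    " 2: /lib64/libc.so.6(+0x3cb20) [0x7f00deadbeef]\n",
    " 3: (Example::another_frame()+0x1b) [0x5555feedface]\n",
    " NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.\n",
]
tracker = MiniAssertTracker()
for line in sample_stderr:
    tracker.write(line)
print(tracker.backtrace)

Running it prints the assert line followed by the retained frames; the libc
frame is dropped by the library filter.
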
 qa/tasks/ceph.py           |  68 ++++++++++++++++++
 qa/tasks/daemonwatchdog.py | 139 +++++++++++++++++++++++++++----------
 2 files changed, 172 insertions(+), 35 deletions(-)

diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index ea860a135f9..c566cafe2d9 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -778,6 +778,7 @@ def cluster(ctx, config):
     ctx.ceph[cluster_name] = argparse.Namespace()
     ctx.ceph[cluster_name].conf = conf
     ctx.ceph[cluster_name].mons = mons
+    ctx.ceph[cluster_name].assert_trackers = []
 
     default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
     keyring_path = config.get('keyring_path', default_keyring)
@@ -1403,6 +1404,69 @@ def run_daemon(ctx, config, type_):
     :param config: Configuration
     :param type_: Role type
     """
+    class AssertTracker(StringIO):
+        MAX_BT_LINES = 100
+        DISPLAY_BT_LINES = 6
+
+        def __init__(self, cluster_name, type_, id_, logger):
+            super(AssertTracker, self).__init__()
+            self.role = cluster_name + '.' + type_
+            self.id_ = id_
+            self.bt_lines = []
+            self.assertion = None
+            self.backtrace = None
+            self.in_bt = False
+            self.logger = logger
+            self.failure_time = None
+
+        def write(self, line):
+            if "FAILED assert" in line or "__ceph_assert_fail" in line:
+                self.assertion = line.strip()
+            if line.startswith(" ceph version"):
+                # The start of a backtrace!
+                self.bt_lines = [line]
+                self.in_bt = True
+                self.failure_time = time.time()
+            elif line.startswith(" NOTE: a copy of the executable") or \
+                 line.strip().endswith("clone()") or \
+                 "clone()+0x" in line:
+                # The backtrace terminated
+                if len(self.bt_lines):
+                    if self.assertion is not None:
+                        self.bt_lines.insert(0, self.assertion)
+                        self.assertion = None
+                    # Limit number of lines of backtrace in the failure reason
+                    self.backtrace = ("".join(self.bt_lines[:self.DISPLAY_BT_LINES])).strip()
+                else:
+                    self.logger.warning("Saw end of backtrace but not start")
+                self.in_bt = False
+            elif self.in_bt:
+                # We're in a backtrace, push the line onto the list
+                if len(self.bt_lines) > self.MAX_BT_LINES:
+                    self.logger.warning("Failed to capture backtrace")
+                    self.bt_lines = ["Failed to capture backtrace", self.assertion]
+                    self.in_bt = False
+                else:
+                    # Backtrace filtering - skip lines for libraries, but include
+                    # libraries with ceph or rados in their path or name
+                    if "ceph" in line or "rados" in line or not re.match(r" [0-9]+: /.*", line):
+                        # Truncate long lines
+                        line = re.sub(r"([0-9]+: .{80}).*(\+0x[0-9a-fA-F]*\) \[0x[0-9a-fA-F]*\])", r"\1 ... \2", line)
+                        self.bt_lines.append(line)
+
+        def get_exception(self):
+            if self.backtrace is not None:
+                return Exception(self.backtrace)
+            if self.assertion is not None:
+                return Exception(self.assertion)
+            return None
+
+        def get_failure_time(self):
+            return self.failure_time
+
+        def match_id(self, role, id_):
+            return self.role == role and self.id_ == id_
+
     cluster_name = config['cluster']
     log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
     testdir = teuthology.get_testdir(ctx)
@@ -1503,13 +1567,17 @@ def run_daemon(ctx, config, type_):
         maybe_redirect_stderr(config, type_, run_cmd, log_path)
         if create_log_cmd:
             remote.sh(create_log_cmd)
+
         # always register mgr; don't necessarily start
+        asserttracker = AssertTracker(cluster_name, type_, id_, log.getChild(role))
+        ctx.ceph[cluster_name].assert_trackers.append(asserttracker)
        ctx.daemons.register_daemon(
            remote, type_, id_,
            cluster=cluster_name,
            args=run_cmd,
            logger=log.getChild(role),
            stdin=run.PIPE,
+           stderr=asserttracker,
            wait=False
        )
        if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
diff --git a/qa/tasks/daemonwatchdog.py b/qa/tasks/daemonwatchdog.py
index 7003a6b7985..c81e2d3300a 100644
--- a/qa/tasks/daemonwatchdog.py
+++ b/qa/tasks/daemonwatchdog.py
@@ -1,6 +1,7 @@
 import logging
 import signal
 import time
+import traceback
 
 from gevent import sleep
 from gevent.greenlet import Greenlet
@@ -15,7 +16,7 @@ class BarkError(Exception):
     """
     def __init__(self, bark_reason):
-        self.message = f"The WatchDog barked due to {bark_reason}"
+        self.message = f"Watchdog timeout: {bark_reason}"
         super().__init__(self.message)
 
 
 class DaemonWatchdog(Greenlet):
@@ -33,8 +34,8 @@ class DaemonWatchdog(Greenlet):
     If an extended failure is detected in a daemon process (i.e. not intentional),
     then the watchdog will bark. It will also bark is an assert is raised during
     the running of any monitored Thrashers or WatchedProcesses
-
-    When the watchdog barks it will 
+
+    When the watchdog barks it will
     - unmount file systems and send SIGTERM to all daemons.
     - stop all thrasher processes by calling their stop_and_join() method
     - stop any watched processes by calling their stop() method
@@ -61,6 +62,7 @@ class DaemonWatchdog(Greenlet):
         self.stopping = Event()
         self.thrashers = ctx.ceph[config["cluster"]].thrashers
         self.watched_processes = ctx.ceph[config["cluster"]].watched_processes
+        self.assert_trackers = ctx.ceph[config["cluster"]].assert_trackers
 
     def _run(self):
         try:
@@ -113,7 +115,7 @@
         daemon_timeout = int(self.config.get('daemon_timeout', 300))
         daemon_restart = self.config.get('daemon_restart', False)
         daemon_failure_time = {}
-        bark_reason: str = ""
+        bark_reason = []
         while not self.stopping.is_set():
             bark = False
             now = time.time()
@@ -131,39 +133,106 @@
             daemon_failures.extend(filter(lambda daemon: daemon.finished(), rgws))
             daemon_failures.extend(filter(lambda daemon: daemon.finished(), mgrs))
 
-            for daemon in daemon_failures:
-                name = daemon.role + '.' + daemon.id_
-                dt = daemon_failure_time.setdefault(name, (daemon, now))
-                assert dt[0] is daemon
-                delta = now-dt[1]
-                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
-                if delta > daemon_timeout:
-                    bark = True
-                    bark_reason = f"Daemon {name} has failed"
-                    if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
-                        self.log(f"attempting to restart daemon {name}")
-                        daemon.restart()
-
-            # If a daemon is no longer failed, remove it from tracking:
-            for name in list(daemon_failure_time.keys()):
-                if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
-                    self.log("daemon {name} has been restored".format(name=name))
-                    del daemon_failure_time[name]
-
-            for thrasher in self.thrashers:
-                if thrasher.exception is not None:
-                    self.log("{name} failed".format(name=thrasher.name))
-                    bark_reason = f"Thrasher {name} threw exception {thrasher.exception}"
-                    bark = True
-
-            for proc in self.watched_processes:
-                if proc.exception is not None:
-                    self.log("Remote process %s failed" % proc.id)
-                    bark_reason = f"Remote process {proc.id} threw exception {proc.exception}"
-                    bark = True
+            try:
+                earliest = None
+                failures = []
+                for daemon in daemon_failures:
+                    name = daemon.role + '.' + daemon.id_
+                    dt = daemon_failure_time.setdefault(name, (daemon, now))
+                    assert dt[0] is daemon
+                    delta = now-dt[1]
+                    self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+                    if delta > daemon_timeout:
+                        bark = True
+                        failures.append(f"{name}")
+                        for at in self.assert_trackers:
+                            if at.match_id(daemon.role, daemon.id_):
+                                exception = at.get_exception()
+                                if exception is not None:
+                                    if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+                                        earliest = at
+
+                        if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+                            self.log(f"attempting to restart daemon {name}")
+                            daemon.restart()
+
+                if len(failures) > 1:
+                    # Remove common prefixes from failure list
+                    last = ""
+                    failures2 = []
+                    for f in failures:
+                        p = last.split(".")
+                        n = f.split(".")
+                        if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+                            failures2.append(f)
+                        elif p[1] != n[1] or len(n) == 2:
+                            failures2.append(".".join(n[1:]))
+                        else:
+                            failures2.append(".".join(n[2:]))
+                        last = f
+                    failure_list = ",".join(failures2)
+                    bark_reason.append(f"Multiple daemons {failure_list} have timed out")
+                elif len(failures) == 1:
+                    bark_reason.append(f"Daemon {failures[0]} has timed out")
+
+                # If a daemon is no longer failed, remove it from tracking:
+                for name in list(daemon_failure_time.keys()):
+                    if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+                        self.log("daemon {name} has been restored".format(name=name))
+                        del daemon_failure_time[name]
+
+                for thrasher in self.thrashers:
+                    if thrasher.exception is not None:
+                        self.log("{name} failed".format(name=thrasher.name))
+                        bark_reason.append(f"Thrasher {thrasher.name} threw exception {thrasher.exception}")
+                        bark = True
+
+                for proc in self.watched_processes:
+                    if proc.exception is not None:
+                        self.log("Remote process %s failed" % proc.id)
+                        bark_reason.append(f"Remote process {proc.id} threw exception {proc.exception}")
+                        bark = True
+
+                # If a thrasher or watched process failed then check for the earliest daemon failure
+                failures = []
+                if bark and earliest is None:
+                    for at in self.assert_trackers:
+                        exception = at.get_exception()
+                        if exception is not None:
+                            failures.append(f"{at.role}.{at.id_}")
+                            if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+                                earliest = at
+
+                    if len(failures) > 1:
+                        # Remove common prefixes from failure list
+                        last = ""
+                        failures2 = []
+                        for f in failures:
+                            p = last.split(".")
+                            n = f.split(".")
+                            if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+                                failures2.append(f)
+                            elif p[1] != n[1] or len(n) == 2:
+                                failures2.append(".".join(n[1:]))
+                            else:
+                                failures2.append(".".join(n[2:]))
+                            last = f
+                        failure_list = ",".join(failures2)
+                        bark_reason.append(f"Multiple daemons {failure_list} have crashed")
+                    elif len(failures) == 1:
+                        bark_reason.append(f"Daemon {failures[0]} has crashed")
+
+                # Only report details about one daemon failure
+                if earliest is not None:
+                    bark_reason.append(f"First crash: Daemon {earliest.role}.{earliest.id_} at {time.ctime(earliest.get_failure_time())}")
+                    bark_reason.append(f"{earliest.get_exception()}")
+
+            except Exception:
+                bark = True
+                bark_reason.append(f"Watchdog bug {traceback.format_exc()}")
 
             if bark:
-                self.bark(bark_reason)
+                self.bark("\n".join(bark_reason))
                 return
 
             sleep(5)
-- 
2.39.5
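When several daemons time out, the watchdog now compresses the list of names
before building the failure reason. The helper below is a standalone
extraction of that loop (not code from the patch itself; the function name
and daemon names are invented) so its effect is easy to check:

def compress_names(failures):
    # Same "remove common prefixes" loop as in the watchdog, lifted out
    # so it can be run on its own.
    last = ""
    compressed = []
    for f in failures:
        p = last.split(".")
        n = f.split(".")
        if p[0] != n[0] or len(p) == 1 or len(n) == 1:
            compressed.append(f)                # different cluster (or no dots): keep the full name
        elif p[1] != n[1] or len(n) == 2:
            compressed.append(".".join(n[1:]))  # same cluster: drop the cluster prefix
        else:
            compressed.append(".".join(n[2:]))  # same cluster and type: keep only the id
        last = f
    return ",".join(compressed)

print(compress_names(["ceph.osd.0", "ceph.osd.1", "ceph.mon.a"]))
# prints: ceph.osd.0,1,mon.a

so the bark reason reads "Multiple daemons ceph.osd.0,1,mon.a have timed out"
rather than repeating the cluster name for every entry.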
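For reference, a rough illustration of the failure reason the watchdog can
now assemble. The entries follow the formats added by this patch, but the
daemon name, timestamp and assert text are invented; the joined string is
what gets passed to bark(), and if that reason ends up in a BarkError the
reworded message would read "Watchdog timeout: ...".

bark_reason = [
    "Daemon ceph.osd.3 has timed out",
    "First crash: Daemon ceph.osd.3 at Thu Aug 14 12:34:56 2025",
    './src/osd/OSD.cc: 4242: FAILED assert(0 == "example bug")',
]
print("Watchdog timeout: " + "\n".join(bark_reason))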