ctx.ceph[cluster_name] = argparse.Namespace()
ctx.ceph[cluster_name].conf = conf
ctx.ceph[cluster_name].mons = mons
+ ctx.ceph[cluster_name].assert_trackers = []
default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
keyring_path = config.get('keyring_path', default_keyring)
:param config: Configuration
:param type_: Role type
"""
+ class AssertTracker(StringIO):
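+ """
+ File-like sink attached to a daemon's stderr stream. write() expects to be
+ fed one line at a time; it watches for assertion failures and backtraces,
+ capturing at most MAX_BT_LINES lines and reporting the first
+ DISPLAY_BT_LINES of them as the failure reason via get_exception().
+ """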
+ MAX_BT_LINES = 100
+ DISPLAY_BT_LINES = 6
+
+ def __init__(self, cluster_name, type_, id_, logger):
+ super(AssertTracker, self).__init__()
+ self.role = cluster_name + '.' + type_
+ self.id_ = id_
+ self.bt_lines = []
+ self.assertion = None
+ self.backtrace = None
+ self.in_bt = False
+ self.logger = logger
+ self.failure_time = None
+
+ def write(self, line):
+ if "FAILED assert" in line or "__ceph_assert_fail" in line:
+ self.assertion = line.strip()
+ if line.startswith(" ceph version"):
+ # The start of a backtrace!
+ self.bt_lines = [line]
+ self.in_bt = True
+ self.failure_time = time.time()
+ elif line.startswith(" NOTE: a copy of the executable") or \
+ line.strip().endswith("clone()") or \
+ "clone()+0x" in line:
+ # The backtrace terminated
+ if len(self.bt_lines):
+ if self.assertion is not None:
+ self.bt_lines.insert(0, self.assertion + "\n")
+ self.assertion = None
+ # Limit number of lines of backtrace in the failure reason
+ self.backtrace = ("".join(self.bt_lines[:self.DISPLAY_BT_LINES])).strip()
+ else:
+ self.logger.warning("Saw end of backtrace but not start")
+ self.in_bt = False
+ elif self.in_bt:
+ # We're in a backtrace, push the line onto the list
+ if len(self.bt_lines) > self.MAX_BT_LINES:
+ self.logger.warning("Failed to capture backtrace")
+ # Drop the oversized trace; the assertion (if any) is re-attached
+ # when the end-of-backtrace marker arrives.
+ self.bt_lines = ["Failed to capture backtrace\n"]
+ self.in_bt = False
+ else:
+ # Backtrace filtering - skip lines for libraries, but include
+ # libraries with ceph or rados in their path or name
+ if "ceph" in line or "rados" in line or not re.match(r" [0-9]+: /.*",line):
+ # Truncate long lines
+ line = re.sub(r"([0-9]+: .{80}).*(\+0x[0-9a-fA-F]*\) \[0x[0-9a-fA-F]*\])",r"\1 ... \2",line)
+ self.bt_lines.append(line)
+
+ def get_exception(self):
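+ """Return an Exception wrapping the captured backtrace or assertion, or None."""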
+ if self.backtrace is not None:
+ return Exception(self.backtrace)
+ if self.assertion is not None:
+ return Exception(self.assertion)
+ return None
+
+ def get_failure_time(self):
+ return self.failure_time
+
+ def match_id(self, role, id_):
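+ """Return True if this tracker belongs to the daemon with the given role and id."""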
+ return self.role == role and self.id_ == id_
+
cluster_name = config['cluster']
log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
testdir = teuthology.get_testdir(ctx)
maybe_redirect_stderr(config, type_, run_cmd, log_path)
if create_log_cmd:
remote.sh(create_log_cmd)
+
# always register mgr; don't necessarily start
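+ # Attach an AssertTracker to the daemon's stderr so assertion failures and
+ # backtraces can be reported by the DaemonWatchdog as the failure reason.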
+ asserttracker = AssertTracker(cluster_name, type_, id_, log.getChild(role))
+ ctx.ceph[cluster_name].assert_trackers.append(asserttracker)
ctx.daemons.register_daemon(
remote, type_, id_,
cluster=cluster_name,
args=run_cmd,
logger=log.getChild(role),
stdin=run.PIPE,
+ stderr=asserttracker,
wait=False
)
if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
import logging
import signal
import time
+import traceback
from gevent import sleep
from gevent.greenlet import Greenlet
"""
def __init__(self, bark_reason):
- self.message = f"The WatchDog barked due to {bark_reason}"
+ self.message = f"Watchdog timeout: {bark_reason}"
super().__init__(self.message)
class DaemonWatchdog(Greenlet):
If an extended failure is detected in a daemon process (i.e. not intentional), then
the watchdog will bark. It will also bark if an assert is raised during the running
of any monitored Thrashers or WatchedProcesses
-
- When the watchdog barks it will
+
+ When the watchdog barks it will
- unmount file systems and send SIGTERM to all daemons.
- stop all thrasher processes by calling their stop_and_join() method
- stop any watched processes by calling their stop() method
self.stopping = Event()
self.thrashers = ctx.ceph[config["cluster"]].thrashers
self.watched_processes = ctx.ceph[config["cluster"]].watched_processes
+ self.assert_trackers = ctx.ceph[config["cluster"]].assert_trackers
def _run(self):
try:
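+ # Config knobs read below: 'daemon_timeout' is how long a daemon may stay
+ # failed before the watchdog barks (default 300s); 'daemon_restart' set to
+ # 'normal' restarts daemons that exited cleanly (exit status 0).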
daemon_timeout = int(self.config.get('daemon_timeout', 300))
daemon_restart = self.config.get('daemon_restart', False)
daemon_failure_time = {}
- bark_reason: str = ""
+ bark_reason = []
while not self.stopping.is_set():
bark = False
now = time.time()
daemon_failures.extend(filter(lambda daemon: daemon.finished(), rgws))
daemon_failures.extend(filter(lambda daemon: daemon.finished(), mgrs))
- for daemon in daemon_failures:
- name = daemon.role + '.' + daemon.id_
- dt = daemon_failure_time.setdefault(name, (daemon, now))
- assert dt[0] is daemon
- delta = now-dt[1]
- self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
- if delta > daemon_timeout:
- bark = True
- bark_reason = f"Daemon {name} has failed"
- if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
- self.log(f"attempting to restart daemon {name}")
- daemon.restart()
-
- # If a daemon is no longer failed, remove it from tracking:
- for name in list(daemon_failure_time.keys()):
- if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
- self.log("daemon {name} has been restored".format(name=name))
- del daemon_failure_time[name]
-
- for thrasher in self.thrashers:
- if thrasher.exception is not None:
- self.log("{name} failed".format(name=thrasher.name))
- bark_reason = f"Thrasher {name} threw exception {thrasher.exception}"
- bark = True
-
- for proc in self.watched_processes:
- if proc.exception is not None:
- self.log("Remote process %s failed" % proc.id)
- bark_reason = f"Remote process {proc.id} threw exception {proc.exception}"
- bark = True
+ try:
+ earliest = None
+ failures = []
+ for daemon in daemon_failures:
+ name = daemon.role + '.' + daemon.id_
+ dt = daemon_failure_time.setdefault(name, (daemon, now))
+ assert dt[0] is daemon
+ delta = now-dt[1]
+ self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+ if delta > daemon_timeout:
+ bark = True
+ failures.append(name)
+ for at in self.assert_trackers:
+ if at.match_id(daemon.role, daemon.id_):
+ exception = at.get_exception()
+ if exception is not None:
+ if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+ earliest = at
+
+ if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+ self.log(f"attempting to restart daemon {name}")
+ daemon.restart()
+
+ if len(failures) > 1:
+ # Remove common prefixes from failure list
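+ # e.g. ["ceph.osd.0", "ceph.osd.1", "ceph.mon.a"] -> "ceph.osd.0,1,mon.a"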
+ last=""
+ failures2 = []
+ for f in failures:
+ p = last.split(".")
+ n = f.split(".")
+ if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+ failures2.append(f)
+ elif p[1] != n[1] or len(n) == 2:
+ failures2.append(".".join(n[1:]))
+ else:
+ failures2.append(".".join(n[2:]))
+ last = f
+ failure_list = ",".join(failures2)
+ bark_reason.append(f"Multiple daemons {failure_list} have timed out")
+ elif len(failures) == 1:
+ bark_reason.append(f"Daemon {failures[0]} has timed out")
+
+ # If a daemon is no longer failed, remove it from tracking:
+ for name in list(daemon_failure_time.keys()):
+ if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+ self.log("daemon {name} has been restored".format(name=name))
+ del daemon_failure_time[name]
+
+ for thrasher in self.thrashers:
+ if thrasher.exception is not None:
+ self.log("{name} failed".format(name=thrasher.name))
+ bark_reason.append(f"Thrasher {thrasher.name} threw exception {thrasher.exception}")
+ bark = True
+
+ for proc in self.watched_processes:
+ if proc.exception is not None:
+ self.log("Remote process %s failed" % proc.id)
+ bark_reason.append(f"Remote process {proc.id} threw exception {proc.exception}")
+ bark = True
+
+ # If a thrasher or watched process failed then check for the earliest daemon failure
+ failures = []
+ if bark and earliest is None:
+ for at in self.assert_trackers:
+ exception = at.get_exception()
+ if exception is not None:
+ failures.append(f"{at.role}.{at.id_}")
+ if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+ earliest = at
+
+ if len(failures) > 1:
+ # Remove common prefixes from failure list
+ last=""
+ failures2 = []
+ for f in failures:
+ p = last.split(".")
+ n = f.split(".")
+ if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+ failures2.append(f)
+ elif p[1] != n[1] or len(n) == 2:
+ failures2.append(".".join(n[1:]))
+ else:
+ failures2.append(".".join(n[2:]))
+ last = f
+ failure_list = ",".join(failures2)
+ bark_reason.append(f"Multiple daemons {failure_list} have crashed")
+ elif len(failures) == 1:
+ bark_reason.append(f"Daemon {failures[0]} has crashed")
+
+ # Only report details about one daemon failure
+ if earliest is not None:
+ bark_reason.append(f"First crash: Daemon {earliest.role}.{earliest.id_} at {time.ctime(earliest.get_failure_time())}")
+ bark_reason.append(f"{earliest.get_exception()}")
+
+ except Exception:
+ # Don't let a bug in the watchdog itself go unnoticed
+ bark = True
+ bark_reason.append(f"Watchdog bug: {traceback.format_exc()}")
if bark:
- self.bark(bark_reason)
+ self.bark("\n".join(bark_reason))
return
sleep(5)