osd heartbeat grace: 60
mds heartbeat grace: 60
mds beacon grace: 60
+ mds:
+ mds valgrind exit: true
mon:
mon osd crush smoke test: false
osd:
valgrind:
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
mds: [--tool=memcheck]
+ watchdog:
+ daemon_restart: normal
ceph-fuse:
client.0:
valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
SIGTERM to all daemons. The duration of an extended failure is configurable
with the watchdog daemon_timeout option.
- watchdog_daemon_timeout [default: 300]: number of seconds a daemon
- is allowed to be failed before the watchdog will bark.
+ ceph:
+ watchdog:
+ daemon_restart [default: no]: if set to "normal", restart a daemon that exited normally (status==0).
+
+ daemon_timeout [default: 300]: number of seconds a daemon
+ may remain failed before the
+ watchdog barks.
"""
def __init__(self, ctx, config, thrashers):
super(DaemonWatchdog, self).__init__()
+ self.config = ctx.config.get('watchdog', {})
self.ctx = ctx
- self.config = config
self.e = None
self.logger = log.getChild('daemon_watchdog')
self.cluster = config.get('cluster', 'ceph')
def watch(self):
self.log("watchdog starting")
- daemon_timeout = int(self.config.get('watchdog_daemon_timeout', 300))
+ daemon_timeout = int(self.config.get('daemon_timeout', 300))
+ daemon_restart = self.config.get('daemon_restart', False)
daemon_failure_time = {}
while not self.stopping.is_set():
bark = False
self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
if delta > daemon_timeout:
bark = True
+ if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+ self.log(f"attempting to restart daemon {name}")
+ daemon.restart()
# If a daemon is no longer failed, remove it from tracking:
for name in list(daemon_failure_time.keys()):
.set_flag(Option::FLAG_RUNTIME)
.set_description("set the maximum length of alternate names for dentries"),
+ Option("mds_valgrind_exit", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_flag(Option::FLAG_RUNTIME),
+
Option("mds_numa_node", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(-1)
.set_flag(Option::FLAG_STARTUP)
* be removed from the MDSMap leading to respawn. */
g_ceph_context->_log->dump_recent();
+ /* valgrind can't handle execve; exit with status 0 instead and let the QA infra restart the MDS */
+ if (g_conf().get_val<bool>("mds_valgrind_exit")) {
+ _exit(0);
+ }
+
char *new_argv[orig_argc+1];
dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
for (int i=0; i<orig_argc; i++) {