From: Patrick Donnelly Date: Wed, 3 Mar 2021 03:39:09 +0000 (-0800) Subject: mds,qa: exit instead of respawn under valgrind X-Git-Tag: v16.2.0~30^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=da9ed46cdcdca2b351553955b8452afd28489318;p=ceph.git mds,qa: exit instead of respawn under valgrind valgrind can't handle execve of /proc/self/exe: 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== execve(0x18546740(/proc/self/exe), 0x18546670, 0x133ef310) failed, errno 2 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== EXEC FAILED: I can't recover from execve() failing, so I'm dying. 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== Add more stringent tests in PRE(sys_execve), or work out how to recover. So configure the MDS to just exit so it can be restarted by QA infra (the daemon watchdog). Signed-off-by: Patrick Donnelly (cherry picked from commit 5faf0ee0f367f66e25a3927e6ac4c40fb83568ce) --- diff --git a/qa/suites/fs/verify/validater/valgrind.yaml b/qa/suites/fs/verify/validater/valgrind.yaml index 14c06ae5e1d3..930872fc7aaa 100644 --- a/qa/suites/fs/verify/validater/valgrind.yaml +++ b/qa/suites/fs/verify/validater/valgrind.yaml @@ -13,6 +13,8 @@ overrides: osd heartbeat grace: 60 mds heartbeat grace: 60 mds beacon grace: 60 + mds: + mds valgrind exit: true mon: mon osd crush smoke test: false osd: @@ -20,6 +22,8 @@ overrides: valgrind: mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes] mds: [--tool=memcheck] + watchdog: + daemon_restart: normal ceph-fuse: client.0: valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes] diff --git a/qa/tasks/daemonwatchdog.py b/qa/tasks/daemonwatchdog.py index b95c29920ddd..f72ccd7cef38 100644 --- a/qa/tasks/daemonwatchdog.py +++ b/qa/tasks/daemonwatchdog.py @@ -17,14 +17,19 @@ class DaemonWatchdog(Greenlet): SIGTERM to all daemons. The duration of an extended failure is configurable with watchdog_daemon_timeout. - watchdog_daemon_timeout [default: 300]: number of seconds a daemon - is allowed to be failed before the watchdog will bark. + ceph: + watchdog: + daemon_restart [default: no]: restart daemon if "normal" exit (status==0). + + daemon_timeout [default: 300]: number of seconds a daemon + is allowed to be failed before the + watchdog will bark. """ def __init__(self, ctx, config, thrashers): super(DaemonWatchdog, self).__init__() + self.config = ctx.config.get('watchdog', {}) self.ctx = ctx - self.config = config self.e = None self.logger = log.getChild('daemon_watchdog') self.cluster = config.get('cluster', 'ceph') @@ -70,7 +75,8 @@ class DaemonWatchdog(Greenlet): def watch(self): self.log("watchdog starting") - daemon_timeout = int(self.config.get('watchdog_daemon_timeout', 300)) + daemon_timeout = int(self.config.get('daemon_timeout', 300)) + daemon_restart = self.config.get('daemon_restart', False) daemon_failure_time = {} while not self.stopping.is_set(): bark = False @@ -97,6 +103,9 @@ class DaemonWatchdog(Greenlet): self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta)) if delta > daemon_timeout: bark = True + if daemon_restart == 'normal' and daemon.proc.exitstatus == 0: + self.log(f"attempting to restart daemon {name}") + daemon.restart() # If a daemon is no longer failed, remove it from tracking: for name in list(daemon_failure_time.keys()): diff --git a/src/common/options.cc b/src/common/options.cc index e315b267a2a1..aebaea279622 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -8030,6 +8030,10 @@ std::vector