mds,qa: exit instead of respawn under valgrind

author Patrick Donnelly <pdonnell@redhat.com>

Wed, 3 Mar 2021 03:39:09 +0000 (19:39 -0800)

committer Patrick Donnelly <pdonnell@redhat.com>

Wed, 3 Mar 2021 17:30:21 +0000 (09:30 -0800)
author Patrick Donnelly <pdonnell@redhat.com>
Wed, 3 Mar 2021 03:39:09 +0000 (19:39 -0800)
committer Patrick Donnelly <pdonnell@redhat.com>
Wed, 3 Mar 2021 17:30:21 +0000 (09:30 -0800)
diff --git a/qa/suites/fs/verify/validater/valgrind.yaml b/qa/suites/fs/verify/validater/valgrind.yaml

index 14c06ae5e1d3de15f01bf4417f9011c73e5b7ecf..930872fc7aaa96288c4471ea7ffce5279f9bc65f 100644 (file)
--- a/qa/suites/fs/verify/validater/valgrind.yaml
+++ b/qa/suites/fs/verify/validater/valgrind.yaml
@@ -13,6 +13,8 @@ overrides:
          osd heartbeat grace: 60
          mds heartbeat grace: 60
          mds beacon grace: 60
+      mds:
+        mds valgrind exit: true
        mon:
          mon osd crush smoke test: false
        osd:
@@ -20,6 +22,8 @@ overrides:
      valgrind:
        mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
        mds: [--tool=memcheck]
+    watchdog:
+      daemon_restart: normal
    ceph-fuse:
      client.0:
        valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
diff --git a/qa/tasks/daemonwatchdog.py b/qa/tasks/daemonwatchdog.py

index b95c29920ddd50733690f075f22c553e1ee2b1e3..f72ccd7cef3883edf4277426262a914068af348e 100644 (file)
--- a/qa/tasks/daemonwatchdog.py
+++ b/qa/tasks/daemonwatchdog.py
@@ -17,14 +17,19 @@ class DaemonWatchdog(Greenlet):
      SIGTERM to all daemons. The duration of an extended failure is configurable
-    with watchdog_daemon_timeout.
+    with daemon_timeout (see below).
  
-    watchdog_daemon_timeout [default: 300]: number of seconds a daemon
-        is allowed to be failed before the watchdog will bark.
+    ceph:
+      watchdog:
+        daemon_restart [default: no]: if set to "normal", restart a daemon that exited normally (status==0).
+
+        daemon_timeout [default: 300]: number of seconds a daemon
+                                              is allowed to be failed before the
+                                              watchdog will bark.
      """
  
      def __init__(self, ctx, config, thrashers):
          super(DaemonWatchdog, self).__init__()
+        self.config = ctx.config.get('watchdog', {})
          self.ctx = ctx
-        self.config = config
          self.e = None
          self.logger = log.getChild('daemon_watchdog')
          self.cluster = config.get('cluster', 'ceph')
@@ -70,7 +75,8 @@ class DaemonWatchdog(Greenlet):
  
      def watch(self):
          self.log("watchdog starting")
-        daemon_timeout = int(self.config.get('watchdog_daemon_timeout', 300))
+        daemon_timeout = int(self.config.get('daemon_timeout', 300))
+        daemon_restart = self.config.get('daemon_restart', False)
          daemon_failure_time = {}
          while not self.stopping.is_set():
              bark = False
@@ -97,6 +103,9 @@ class DaemonWatchdog(Greenlet):
                  self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
                  if delta > daemon_timeout:
                      bark = True
+                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+                    self.log(f"attempting to restart daemon {name}")
+                    daemon.restart()
  
              # If a daemon is no longer failed, remove it from tracking:
              for name in list(daemon_failure_time.keys()):
diff --git a/src/common/options.cc b/src/common/options.cc

index f2c460dfc5cba9993ee719bb0e0bfa2d93f55f76..69bc228dae291a4b6b1ac515d4f21f9a6a32adcb 100644 (file)
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -8054,6 +8054,10 @@ std::vector<Option> get_mds_options() {
      .set_flag(Option::FLAG_RUNTIME)
      .set_description("set the maximum length of alternate names for dentries"),
  
+    Option("mds_valgrind_exit", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_RUNTIME),
+
      Option("mds_numa_node", Option::TYPE_INT, Option::LEVEL_ADVANCED)
      .set_default(-1)
      .set_flag(Option::FLAG_STARTUP)
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc

index 85a70412f1d2058ba35bee646a9266e6c509fb5d..b18a6b7573673fd968cfdf1e33c13a4c2cd7fb0a 100644 (file)
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -845,6 +845,11 @@ void MDSDaemon::respawn()
     * be removed from the MDSMap leading to respawn. */
    g_ceph_context->_log->dump_recent();
  
+  /* valgrind can't handle execve; just exit and let QA infra restart */
+  if (g_conf().get_val<bool>("mds_valgrind_exit")) {
+    _exit(0);
+  }
+
    char *new_argv[orig_argc+1];
    dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
    for (int i=0; i<orig_argc; i++) {
author	Patrick Donnelly <pdonnell@redhat.com>
	Wed, 3 Mar 2021 03:39:09 +0000 (19:39 -0800)
committer	Patrick Donnelly <pdonnell@redhat.com>
	Wed, 3 Mar 2021 17:30:21 +0000 (09:30 -0800)
qa/suites/fs/verify/validater/valgrind.yaml		patch \| blob \| history
qa/tasks/daemonwatchdog.py		patch \| blob \| history
src/common/options.cc		patch \| blob \| history
src/mds/MDSDaemon.cc		patch \| blob \| history