]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: add DaemonWatchdog to stop tests on failure
authorPatrick Donnelly <pdonnell@redhat.com>
Sat, 4 Feb 2017 17:00:45 +0000 (12:00 -0500)
committerPatrick Donnelly <pdonnell@redhat.com>
Mon, 6 Feb 2017 19:07:14 +0000 (14:07 -0500)
Thrashing MDS will often result in failures which often do not stop the
test. The failure may also cause the test to stall which will force the
machines to needlessly be locked until a timeout is reached. This
watchdog will unmount mounts and kill daemons when a failure is
detected.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
qa/tasks/mds_thrash.py

index 6e95aae80a58b23b170092b65830be7745419c98..8714967b95229880835be0acdcbf16d707db2a61 100644 (file)
@@ -6,6 +6,7 @@ import contextlib
 import ceph_manager
 import itertools
 import random
+import signal
 import time
 
 from gevent import sleep
@@ -17,6 +18,109 @@ from tasks.cephfs.filesystem import MDSCluster, Filesystem
 
 log = logging.getLogger(__name__)
 
+class DaemonWatchdog(Greenlet):
+    """
+    DaemonWatchdog::
+
+    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
+    not intentional), then the watchdog will unmount file systems and send
+    SIGTERM to all daemons. The duration of an extended failure is configurable
+    with watchdog_daemon_timeout.
+
+    watchdog_daemon_timeout [default: 300]: number of seconds a daemon
+        is allowed to be failed before the watchdog will bark.
+    """
+
+    def __init__(self, ctx, manager, config, thrashers):
+        Greenlet.__init__(self)
+        self.ctx = ctx
+        self.config = config
+        self.e = None
+        self.logger = log.getChild('daemon_watchdog')
+        self.manager = manager
+        self.name = 'watchdog'
+        self.stopping = Event()
+        self.thrashers = thrashers
+
+    def _run(self):
+        try:
+            self.watch()
+        except Exception as e:
+            # See _run exception comment for MDSThrasher
+            self.e = e
+            self.logger.exception("exception:")
+            # allow successful completion so gevent doesn't see an exception...
+
+    def log(self, x):
+        """Write data to logger"""
+        self.logger.info(x)
+
+    def stop(self):
+        self.stopping.set()
+
+    def bark(self):
+        self.log("BARK! unmounting mounts and killing all daemons")
+        for mount in self.ctx.mounts.values():
+            try:
+                mount.umount_wait(force=True)
+            except:
+                self.logger.exception("ignoring exception:")
+        daemons = []
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.manager.cluster)))
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.manager.cluster)))
+        for daemon in daemons:
+            try:
+                daemon.signal(signal.SIGTERM)
+            except:
+                self.logger.exception("ignoring exception:")
+
+    def watch(self):
+        self.log("watchdog starting")
+        daemon_timeout = int(self.config.get('watchdog_daemon_timeout', 300))
+        daemon_failure_time = {}
+        while not self.stopping.is_set():
+            bark = False
+            now = time.time()
+
+            mons = self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.manager.cluster)
+            mdss = self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.manager.cluster)
+            clients = self.ctx.daemons.iter_daemons_of_role('client', cluster=self.manager.cluster)
+
+            #for daemon in mons:
+            #    self.log("mon daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished))
+            #for daemon in mdss:
+            #    self.log("mds daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished))
+
+            daemon_failures = []
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mons))
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mdss))
+            for daemon in daemon_failures:
+                name = daemon.role + '.' + daemon.id_
+                dt = daemon_failure_time.setdefault(name, (daemon, now))
+                assert dt[0] is daemon
+                delta = now-dt[1]
+                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+                if delta > daemon_timeout:
+                    bark = True
+
+            # If a daemon is no longer failed, remove it from tracking:
+            for name in daemon_failure_time.keys():
+                if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+                    self.log("daemon {name} has been restored".format(name=name))
+                    del daemon_failure_time[name]
+
+            for thrasher in self.thrashers:
+                if thrasher.e is not None:
+                    self.log("thrasher on fs.{name} failed".format(name=thrasher.fs.name))
+                    bark = True
+
+            if bark:
+                self.bark()
+                return
+
+            sleep(5)
+
+        self.log("watchdog finished")
 
 class MDSThrasher(Greenlet):
     """
@@ -98,17 +202,17 @@ class MDSThrasher(Greenlet):
 
     """
 
-    def __init__(self, ctx, manager, config, logger, fs, max_mds):
-        super(MDSThrasher, self).__init__()
+    def __init__(self, ctx, manager, config, fs, max_mds):
+        Greenlet.__init__(self)
 
-        self.ctx = ctx
-        self.manager = manager
-        assert self.manager.is_clean()
         self.config = config
-        self.logger = logger
+        self.ctx = ctx
+        self.e = None
+        self.logger = log.getChild('fs.[{f}]'.format(f = fs.name))
         self.fs = fs
+        self.manager = manager
         self.max_mds = max_mds
-
+        self.name = 'thrasher.fs.[{f}]'.format(f = fs.name)
         self.stopping = Event()
 
         self.randomize = bool(self.config.get('randomize', True))
@@ -124,11 +228,23 @@ class MDSThrasher(Greenlet):
     def _run(self):
         try:
             self.do_thrash()
-        except:
-            # Log exceptions here so we get the full backtrace (it's lost
-            # by the time someone does a .get() on this greenlet)
-            self.logger.exception("Exception in do_thrash:")
-            raise
+        except Exception as e:
+            # Log exceptions here so we get the full backtrace (gevent loses them).
+            # Also allow succesful completion as gevent exception handling is a broken mess:
+            #
+            # 2017-02-03T14:34:01.259 CRITICAL:root:  File "gevent.libev.corecext.pyx", line 367, in gevent.libev.corecext.loop.handle_error (src/gevent/libev/gevent.corecext.c:5051)
+            #   File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 558, in handle_error
+            #     self.print_exception(context, type, value, tb)
+            #   File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 605, in print_exception
+            #     traceback.print_exception(type, value, tb, file=errstream)
+            #   File "/usr/lib/python2.7/traceback.py", line 124, in print_exception
+            #     _print(file, 'Traceback (most recent call last):')
+            #   File "/usr/lib/python2.7/traceback.py", line 13, in _print
+            #     file.write(str+terminator)
+            # 2017-02-03T14:34:01.261 CRITICAL:root:IOError
+            self.e = e
+            self.logger.exception("exception:")
+            # allow successful completion so gevent doesn't see an exception...
 
     def log(self, x):
         """Write data to logger assigned to this MDThrasher"""
@@ -186,9 +302,6 @@ class MDSThrasher(Greenlet):
                     self.log('cluster is considered unstable while MDS are in up:stopping (!thrash_while_stopping)')
             else:
                 if rank is not None:
-                    if len(actives) >= max_mds:
-                        # no replacement can occur!
-                        return status
                     try:
                         info = status.get_rank(self.fs.id, rank)
                         if info['gid'] != gid and "up:active" == info['state']:
@@ -196,6 +309,10 @@ class MDSThrasher(Greenlet):
                             return status
                     except:
                         pass # no rank present
+                    if len(actives) >= max_mds:
+                        # no replacement can occur!
+                        self.log("cluster has %d actives (max_mds is %d), no MDS can replace rank %d".format(len(actives), max_mds, rank))
+                        return status
                 else:
                     if len(actives) >= max_mds:
                         self.log('mds cluster has {count} alive and active, now stable!'.format(count = len(actives)))
@@ -203,7 +320,9 @@ class MDSThrasher(Greenlet):
             if itercount > 300/2: # 5 minutes
                  raise RuntimeError('timeout waiting for cluster to stabilize')
             elif itercount % 5 == 0:
-                self.log('mds map: {status}'.format(status=self.fs.status()))
+                self.log('mds map: {status}'.format(status=status))
+            else:
+                self.log('no change')
             sleep(2)
 
     def do_thrash(self):
@@ -406,27 +525,29 @@ def task(ctx, config):
         status = mds_cluster.status()
     log.info('Ready to start thrashing')
 
+    thrashers = []
+
+    watchdog = DaemonWatchdog(ctx, manager, config, thrashers)
+    watchdog.start()
+
     manager.wait_for_clean()
-    thrashers = {}
+    assert manager.is_clean()
     for fs in status.get_filesystems():
-        name = fs['mdsmap']['fs_name']
-        log.info('Running thrasher against FS {f}'.format(f = name))
-        thrasher = MDSThrasher(
-            ctx, manager, config,
-            log.getChild('fs.[{f}]'.format(f = name)),
-            Filesystem(ctx, fs['id']), fs['mdsmap']['max_mds']
-            )
+        thrasher = MDSThrasher(ctx, manager, config, Filesystem(ctx, fs['id']), fs['mdsmap']['max_mds'])
         thrasher.start()
-        thrashers[name] = thrasher
+        thrashers.append(thrasher)
 
     try:
         log.debug('Yielding')
         yield
     finally:
         log.info('joining mds_thrashers')
-        for name in thrashers:
-            log.info('join thrasher mds_thrasher.fs.[{f}]'.format(f=name))
-            thrashers[name].stop()
-            thrashers[name].get()  # Raise any exception from _run()
-            thrashers[name].join()
+        for thrasher in thrashers:
+            thrasher.stop()
+            if thrasher.e:
+                raise RuntimeError('error during thrashing')
+            thrasher.join()
         log.info('done joining')
+
+        watchdog.stop()
+        watchdog.join()