git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
task: die on ceph error or coredump
author Mike Ryan <mike.ryan@inktank.com>
Tue, 4 Sep 2012 16:52:38 +0000 (09:52 -0700)
committer Mike Ryan <mike.ryan@inktank.com>
Tue, 4 Sep 2012 16:52:38 +0000 (09:52 -0700)
This task allows ceph to signal to teuth that it should die immediately
by touching a file under /tmp/cephtest

Signed-off-by: Mike Ryan <mike.ryan@inktank.com>
teuthology/task/die_on_err.py [new file with mode: 0644]

diff --git a/teuthology/task/die_on_err.py b/teuthology/task/die_on_err.py
new file mode 100644 (file)
index 0000000..4a7112d
--- /dev/null
@@ -0,0 +1,68 @@
+import contextlib
+import logging
+import os
+import time
+from ..orchestra import run
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Die if /tmp/cephtest/err exists or if an OSD dumps core
+
+    First waits until the manager's OSD status lists at least as many
+    'up' OSDs as there are 'osd' roles in the cluster.  It then checks the
+    remote of every OSD in turn, sleeping 5 seconds between passes, for
+    two failure signals:
+
+    * the marker file /tmp/cephtest/err exists on that remote
+    * the last line of /tmp/cephtest/archive/log/osd.<id>.log contains
+      'end dump'
+
+    NOTE: the body contains no ``yield``; the polling loop is only ever
+    left by raising.
+
+    :param ctx: run context; ``ctx.cluster`` is used to locate the first
+                monitor and the remote of each ``osd.<id>`` role
+    :param config: task configuration; ``None`` is treated as ``{}``
+    :raises Exception: as soon as either failure signal is seen for an OSD
+    """
+    if config is None:
+        config = {}
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s', num_osds)
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    # Hold off until the manager reports at least num_osds OSDs as up.
+    while len(manager.get_osd_status()['up']) < num_osds:
+        time.sleep(10)
+
+    # Resolve each OSD role to its remote once, instead of on every pass
+    # of the polling loop below.
+    osd_remotes = []
+    for i in range(num_osds):
+        (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys()
+        osd_remotes.append(osd_remote)
+
+    while True:
+        for i, osd_remote in enumerate(osd_remotes):
+            # `test -e` exits 0 only when the marker file exists;
+            # check_status=False lets us read the exit status ourselves.
+            p = osd_remote.run(
+                args=['test', '-e', '/tmp/cephtest/err'],
+                wait=True,
+                check_status=False,
+            )
+            if p.exitstatus == 0:
+                log.info("osd %d has an error", i)
+                raise Exception("osd %d error" % i)
+
+            log_path = '/tmp/cephtest/archive/log/osd.%d.log' % i
+
+            # Only the final log line is inspected; `grep -q` exits 0
+            # when that line contains 'end dump'.
+            p = osd_remote.run(
+                args=[
+                    'tail', '-1', log_path,
+                    run.Raw('|'),
+                    'grep', '-q', 'end dump',
+                ],
+                wait=True,
+                check_status=False,
+            )
+            if p.exitstatus == 0:
+                log.info("osd %d dumped core", i)
+                raise Exception("osd %d dumped core" % i)
+
+        time.sleep(5)