git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
tasks/ceph_manager: add bdev_inject_crash_probability support
author Sage Weil <sage@redhat.com>
Wed, 21 Sep 2016 21:01:57 +0000 (17:01 -0400)
committer Sage Weil <sage@redhat.com>
Wed, 21 Sep 2016 21:07:28 +0000 (17:07 -0400)
Only do the failure injection 50% of the time; otherwise, just
kill as usual.

Signed-off-by: Sage Weil <sage@redhat.com>
# Conflicts:
# tasks/ceph_manager.py

objectstore/bluestore.yaml
tasks/ceph_manager.py
tasks/thrashosds.py

index 8baaff77e31625ae24b279024dbcc511f5678578..782de567094ce61f4bfb23506481e256aaedada6 100644 (file)
@@ -1,6 +1,7 @@
 overrides:
   thrashosds:
     bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
   ceph:
     conf:
       osd:
index c735f4e366dbe361fb5dc0bfa464c28c0c540d5a..1adec25bbd3c0d5fcf310c49fe520bc6c019c788 100644 (file)
@@ -1926,18 +1926,21 @@ class CephManager:
                      'doing powercycle of {s}'.format(o=osd, s=remote.name))
             self._assert_ipmi(remote)
             remote.console.power_off()
-        elif self.config.get('bdev_inject_crash'):
-            self.raw_cluster_cmd(
-                '--', 'tell', 'osd.%d' % osd,
-                'injectargs',
-                '--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'),
-            )
-            try:
-                self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait()
-            except:
-                pass
+        elif self.config.get('bdev_inject_crash') and self.config.get('bdev_inject_crash_probability'):
+            if random.uniform(0, 1) < self.config.get('bdev_inject_crash_probability', .5):
+                self.raw_cluster_cmd(
+                    '--', 'tell', 'osd.%d' % osd,
+                    'injectargs',
+                    '--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'),
+                )
+                try:
+                    self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait()
+                except:
+                    pass
+                else:
+                    raise RuntimeError('osd.%s did not fail' % osd)
             else:
-                raise RuntimeError('osd.%s did not fail' % osd)
+                self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
         else:
             self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
 
index d7cbc93f2c635e1e45bf59b931fd7d926cfbb427..eff9a63fcb5b9e5a831e252258a73c0e41266e72 100644 (file)
@@ -93,6 +93,14 @@ def task(ctx, config):
         of just the osd process. Note that this assumes that a single
         osd is the only important process on the node.
 
+    bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
+        the delay lets the BlockDevice "accept" more aio operations but blocks
+        any flush, and then eventually crashes (losing some or all ios).  If 0,
+        no bdev failure injection is enabled.
+
+    bdev_inject_crash_probability: (.5) probability of doing a bdev failure
+        injection crash vs a normal OSD kill.
+
     chance_test_backfill_full: (0) chance to simulate full disks stopping
         backfill