From: Sage Weil Date: Wed, 21 Sep 2016 21:01:57 +0000 (-0400) Subject: tasks/ceph_manager: add bdev_inject_crash_probability support X-Git-Tag: v11.1.1~58^2^2~93^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=dccdb2eb0fd7db5fd908cecc6978fb18a0d5ec03;p=ceph.git tasks/ceph_manager: add bdev_inject_crash_probability support Only do the failure injection 50% of the time; otherwise, just kill as usual. Signed-off-by: Sage Weil # Conflicts: # tasks/ceph_manager.py --- diff --git a/objectstore/bluestore.yaml b/objectstore/bluestore.yaml index 8baaff77e316..782de567094c 100644 --- a/objectstore/bluestore.yaml +++ b/objectstore/bluestore.yaml @@ -1,6 +1,7 @@ overrides: thrashosds: bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 ceph: conf: osd: diff --git a/tasks/ceph_manager.py b/tasks/ceph_manager.py index c735f4e366db..1adec25bbd3c 100644 --- a/tasks/ceph_manager.py +++ b/tasks/ceph_manager.py @@ -1926,18 +1926,21 @@ class CephManager: 'doing powercycle of {s}'.format(o=osd, s=remote.name)) self._assert_ipmi(remote) remote.console.power_off() - elif self.config.get('bdev_inject_crash'): - self.raw_cluster_cmd( - '--', 'tell', 'osd.%d' % osd, - 'injectargs', - '--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'), - ) - try: - self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait() - except: - pass + elif self.config.get('bdev_inject_crash') and self.config.get('bdev_inject_crash_probability'): + if random.uniform(0, 1) < self.config.get('bdev_inject_crash_probability', .5): + self.raw_cluster_cmd( + '--', 'tell', 'osd.%d' % osd, + 'injectargs', + '--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'), + ) + try: + self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait() + except: + pass + else: + raise RuntimeError('osd.%s did not fail' % osd) else: - raise RuntimeError('osd.%s did not fail' % osd) + self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop() else: 
self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop() diff --git a/tasks/thrashosds.py b/tasks/thrashosds.py index d7cbc93f2c63..eff9a63fcb5b 100644 --- a/tasks/thrashosds.py +++ b/tasks/thrashosds.py @@ -93,6 +93,14 @@ def task(ctx, config): of just the osd process. Note that this assumes that a single osd is the only important process on the node. + bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash. + The delay lets the BlockDevice "accept" more aio operations but blocks + any flush, and then eventually crashes (losing some or all I/Os). If 0, + no bdev failure injection is enabled. + + bdev_inject_crash_probability: (.5) probability of doing a bdev failure + injection crash vs a normal OSD kill. + chance_test_backfill_full: (0) chance to simulate full disks stopping backfill