From: Samuel Just
Date: Wed, 7 Nov 2012 20:36:37 +0000 (-0800)
Subject: ceph_manager: add test_min_size action
X-Git-Tag: 1.1.0~2425
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bd83ed70dc2e369e8e75bd550bd0bad4c8fe7724;p=teuthology.git

ceph_manager: add test_min_size action

Thrasher can now, with configurable frequency, test min_size by
taking down all but one osd, waiting, killing that osd, bringing
back the others, and verifying that the cluster goes clean.

Signed-off-by: Samuel Just
---

diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index c7697d423..4266a870f 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -82,8 +82,33 @@ class Thrasher:
         self.stopping = True
         self.thread.get()
 
+    def test_pool_min_size(self):
+        self.log("test_pool_min_size")
+        self.all_up()
+        self.ceph_manager.wait_for_recovery(
+            timeout=self.config.get('timeout')
+            )
+        the_one = random.choice(self.in_osds)
+        self.log("Killing everyone but %s", the_one)
+        to_kill = filter(lambda x: x != the_one, self.in_osds)
+        [self.kill_osd(i) for i in to_kill]
+        [self.out_osd(i) for i in to_kill]
+        time.sleep(self.config.get("test_pool_min_size_time", 10))
+        self.log("Killing %s"%(the_one,))
+        self.kill_osd(the_one)
+        self.out_osd(the_one)
+        self.log("Reviving everyone but %s"%(the_one,))
+        [self.revive_osd(i) for i in to_kill]
+        [self.in_osd(i) for i in to_kill]
+        self.log("Revived everyone but %s"%(the_one,))
+        self.log("Waiting for clean")
+        self.ceph_manager.wait_for_recovery(
+            timeout=self.config.get('timeout')
+            )
+
     def choose_action(self):
         chance_down = self.config.get("chance_down", 0)
+        chance_test_min_size = self.config.get("chance_test_min_size", 0)
         if isinstance(chance_down, int):
             chance_down = float(chance_down) / 100
         minin = self.config.get("min_in", 2)
@@ -102,6 +127,7 @@ class Thrasher:
             actions.append((self.in_osd, 1.7,))
         if len(self.dead_osds) > mindead:
             actions.append((self.revive_osd, 1.0,))
+        actions.append((self.test_pool_min_size, chance_test_min_size,))
 
         total = sum([y for (x,y) in actions])
         val = random.uniform(0, total)
diff --git a/teuthology/task/thrashosds.py b/teuthology/task/thrashosds.py
index 2678e60e5..2548c0a2b 100644
--- a/teuthology/task/thrashosds.py
+++ b/teuthology/task/thrashosds.py
@@ -45,6 +45,14 @@ def task(ctx, config):
        can be either an integer (eg, 75) or a float probability (eg
        0.75).
 
+    chance_test_min_size: (0) chance to run test_pool_min_size,
+       which:
+       - kills all but one osd
+       - waits
+       - kills that osd
+       - revives all other osds
+       - verifies that the osds fully recover
+
     timeout: (360) the number of seconds to wait for the cluster
        to become clean after each cluster change. If this doesn't
        happen within the timeout, an exception will be raised.