git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph_manager: add test_min_size action
author Samuel Just <sam.just@inktank.com>
Wed, 7 Nov 2012 20:36:37 +0000 (12:36 -0800)
committer Samuel Just <sam.just@inktank.com>
Wed, 7 Nov 2012 20:56:31 +0000 (12:56 -0800)
Thrasher can now test min_size, with configurable frequency, by
taking down all but one osd, waiting, killing that osd, bringing
back the others, and verifying that the cluster goes clean.

Signed-off-by: Samuel Just <sam.just@inktank.com>
teuthology/task/ceph_manager.py
teuthology/task/thrashosds.py

index c7697d423268ff9b4dcb39816c457ae2ff54788f..4266a870f669ce47de61ddff8b0bf912966d18da 100644 (file)
@@ -82,8 +82,33 @@ class Thrasher:
         self.stopping = True
         self.thread.get()
 
+    def test_pool_min_size(self):
+        """
+        Run the min_size scenario: leave a single osd alive, then swap
+        which osds are alive and wait for the cluster to recover.
+
+        Sequence:
+        - call all_up() and wait for recovery
+        - pick one member of self.in_osds at random ("the one")
+        - kill, then mark out, every other member of self.in_osds
+        - sleep config["test_pool_min_size_time"] seconds (default 10)
+        - kill and mark out "the one"
+        - revive, then mark in, the osds killed earlier
+        - wait for recovery again
+
+        Both recovery waits are passed config["timeout"] as their timeout.
+        """
+        self.log("test_pool_min_size")
+        self.all_up()
+        self.ceph_manager.wait_for_recovery(
+            timeout=self.config.get('timeout')
+            )
+        the_one = random.choice(self.in_osds)
+        # Format the message before logging, like every other self.log call
+        # in this method; the osd id was previously passed as a second
+        # positional argument instead of being interpolated.
+        self.log("Killing everyone but %s"%(the_one,))
+        # Snapshot the victims as a real list: it is walked four times below,
+        # and a lazy filter() object would be exhausted after the first pass.
+        to_kill = [osd for osd in self.in_osds if osd != the_one]
+        # Kill all of them first, then mark them out (same order as before).
+        for osd in to_kill:
+            self.kill_osd(osd)
+        for osd in to_kill:
+            self.out_osd(osd)
+        time.sleep(self.config.get("test_pool_min_size_time", 10))
+        self.log("Killing %s"%(the_one,))
+        self.kill_osd(the_one)
+        self.out_osd(the_one)
+        self.log("Reviving everyone but %s"%(the_one,))
+        # Revive all of them first, then mark them back in.
+        for osd in to_kill:
+            self.revive_osd(osd)
+        for osd in to_kill:
+            self.in_osd(osd)
+        self.log("Revived everyone but %s"%(the_one,))
+        self.log("Waiting for clean")
+        self.ceph_manager.wait_for_recovery(
+            timeout=self.config.get('timeout')
+            )
+
     def choose_action(self):
         chance_down = self.config.get("chance_down", 0)
+        chance_test_min_size = self.config.get("chance_test_min_size", 0)
         if isinstance(chance_down, int):
             chance_down = float(chance_down) / 100
         minin = self.config.get("min_in", 2)
@@ -102,6 +127,7 @@ class Thrasher:
             actions.append((self.in_osd, 1.7,))
         if len(self.dead_osds) > mindead:
             actions.append((self.revive_osd, 1.0,))
+        actions.append((self.test_pool_min_size, chance_test_min_size,))
 
         total = sum([y for (x,y) in actions])
         val = random.uniform(0, total)
index 2678e60e56b5eed6aecb58133b1e0648cc1a2fd2..2548c0a2b8e34fc2a4df16a40810723064372dee 100644 (file)
@@ -45,6 +45,14 @@ def task(ctx, config):
        can be either an integer (eg, 75) or a float probability (eg
        0.75).
 
+    chance_test_min_size: (0) chance to run test_pool_min_size,
+       which:
+       - kills all but one osd
+       - waits
+       - kills that osd
+       - revives all other osds
+       - verifies that the osds fully recover
+
     timeout: (360) the number of seconds to wait for the cluster
        to become clean after each cluster change. If this doesn't
        happen within the timeout, an exception will be raised.