git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph_manager: add filestore and heartbeat stalls
author Samuel Just <sam.just@inktank.com>
Thu, 24 Jan 2013 00:13:22 +0000 (16:13 -0800)
committer Samuel Just <sam.just@inktank.com>
Thu, 24 Jan 2013 01:40:40 +0000 (17:40 -0800)
Signed-off-by: Samuel Just <sam.just@inktank.com>
teuthology/task/ceph_manager.py
teuthology/task/thrashosds.py

index 5748eba801dd79228472bd6ef968b77db9e19abb..49f82f07b4c5c8336fe1819a6fe06ff293908b66 100644 (file)
@@ -124,6 +124,29 @@ class Thrasher:
             timeout=self.config.get('timeout')
             )
 
+    def inject_pause(self, conf_key, duration, check_after, should_be_down):  # set conf_key=duration on one randomly chosen in-OSD; optionally verify it is marked down and later recovers
+        the_one = random.choice(self.in_osds)  # target OSD, picked at random from self.in_osds
+        self.log("inject_pause on {osd}".format(osd = the_one))
+        self.log(
+            "Testing {key} pause injection for duration {duration}".format(
+                key = conf_key,
+                duration = duration
+                ))
+        self.log(
+            "Checking after {after}, should_be_down={shouldbedown}".format(
+                after = check_after,
+                shouldbedown = should_be_down
+                ))
+        self.ceph_manager.set_config(the_one, **{conf_key:duration})  # the option name is dynamic, so it is passed as an unpacked keyword argument
+        if not should_be_down:
+            return  # no verification requested; check_after is unused on this path
+        time.sleep(check_after)
+        status = self.ceph_manager.get_osd_status()
+        assert the_one in status['down']  # after check_after seconds the OSD is expected to be listed as down
+        time.sleep(duration - check_after + 20)  # wait out the rest of the pause plus 20 extra seconds; treats duration and check_after as seconds, since time.sleep takes seconds
+        status = self.ceph_manager.get_osd_status()
+        assert not the_one in status['down']  # once the pause has elapsed the OSD must no longer be listed as down
+
     def choose_action(self):
         chance_down = self.config.get('chance_down', 0)
         chance_test_min_size = self.config.get('chance_test_min_size', 0)
@@ -148,6 +171,19 @@ class Thrasher:
         actions.append((self.grow_pool, self.config.get('chance_pgnum_grow', 0),))
         actions.append((self.fix_pgp_num, self.config.get('chance_pgpnum_fix', 0),))
         actions.append((self.test_pool_min_size, chance_test_min_size,))
+        for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
+            for scenario in [
+                (lambda: self.inject_pause(key,
+                                           self.config.get('pause_short', 3),
+                                           0,
+                                           False),
+                 self.config.get('chance_inject_pause_short', 1),),
+                (lambda: self.inject_pause(key,
+                                           self.config.get('pause_long', 150),
+                                           self.config.get('pause_check_after', 120),
+                                           True),
+                 self.config.get('chance_inject_pause_long', 0.2),)]:
+                actions.append(scenario)
 
         total = sum([y for (x,y) in actions])
         val = random.uniform(0, total)
index 3325cef3cd49702c18a128bdcc43c2fb58017f0c..a945dbc475a8b3bba19b7e7e89da583d172d6a7e 100644 (file)
@@ -62,6 +62,12 @@ def task(ctx, config):
     pool_grow_by: (10) amount to increase pgnum by
     max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd
 
+    pause_short: (3) duration of short pause
+    pause_long: (150) duration of long pause
+    pause_check_after: (120) assert osd down after this long
+    chance_inject_pause_short: (1) chance of injecting short stall
+    chance_inject_pause_long: (0.2) chance of injecting long stall
+
     example:
 
     tasks: