From: Sage Weil <sage@newdream.net>
Date: Tue, 10 Jan 2012 21:57:55 +0000 (-0800)
Subject: thrasher: add max_dead
X-Git-Tag: 1.1.0~2682
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=fb74b901527537a98d511c38ca8169af2b34e85b;p=teuthology.git

thrasher: add max_dead

Add a max_dead option, and revive OSDs until at most max_dead remain dead
before waiting for clean.  Otherwise we can leave too many OSDs down and the
cluster will never go clean.
---

diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index 4d5680b1e..7902b2ece 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -106,6 +106,7 @@ class Thrasher(gevent.Greenlet):
 
     def do_thrash(self):
         cleanint = self.config.get("clean_interval", 60)
+        maxdead = self.config.get("max_dead", 1);
         delay = self.config.get("op_delay", 5)
         self.log("starting do_thrash")
         while not self.stopping:
@@ -113,6 +114,8 @@ class Thrasher(gevent.Greenlet):
                                                 "dead_osds: ", self.dead_osds, "live_osds: ",
                                                 self.live_osds]]))
             if random.uniform(0,1) < (float(delay) / cleanint):
+                while len(self.dead_osds) > maxdead:
+                    self.revive_osd()
                 self.ceph_manager.wait_till_clean(
                     timeout=self.config.get('timeout')
                     )
diff --git a/teuthology/task/thrashosds.py b/teuthology/task/thrashosds.py
index 6da7416c5..7e252ca39 100644
--- a/teuthology/task/thrashosds.py
+++ b/teuthology/task/thrashosds.py
@@ -28,6 +28,9 @@ def task(ctx, config):
     op_delay: (5) the length of time to sleep between changing an
        OSD's status
 
+    max_dead: (1) maximum number of osds to leave down/dead before waiting
+       for clean.  This should probably be num_replicas - 1.
+
     clean_interval: (60) the approximate length of time to loop before
        waiting until the cluster goes clean. (In reality this is used
        to probabilistically choose when to wait, and the method used