From: Sage Weil
Date: Tue, 10 Jan 2012 21:57:55 +0000 (-0800)
Subject: thrasher: add max_dead
X-Git-Tag: 1.1.0~2682
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=fb74b901527537a98d511c38ca8169af2b34e85b;p=teuthology.git

thrasher: add max_dead

Add max_dead, and revive osds prior to waiting for clean. Otherwise we
can leave too many OSDs down and the cluster will never go clean.
---

diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index 4d5680b1e..7902b2ece 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -106,6 +106,7 @@ class Thrasher(gevent.Greenlet):
     def do_thrash(self):
         cleanint = self.config.get("clean_interval", 60)
+        maxdead = self.config.get("max_dead", 1);
         delay = self.config.get("op_delay", 5)
         self.log("starting do_thrash")
         while not self.stopping:
@@ -113,6 +114,8 @@ class Thrasher(gevent.Greenlet):
                                                 "dead_osds: ", self.dead_osds, "live_osds: ", self.live_osds]]))
             if random.uniform(0,1) < (float(delay) / cleanint):
+                while len(self.dead_osds) > maxdead:
+                    self.revive_osd()
                 self.ceph_manager.wait_till_clean(
                     timeout=self.config.get('timeout')
                     )
diff --git a/teuthology/task/thrashosds.py b/teuthology/task/thrashosds.py
index 6da7416c5..7e252ca39 100644
--- a/teuthology/task/thrashosds.py
+++ b/teuthology/task/thrashosds.py
@@ -28,6 +28,9 @@ def task(ctx, config):
     op_delay: (5) the length of time to sleep between changing an OSD's
        status
 
+    max_dead: (1) maximum number of osds to leave down/dead before waiting
+       for clean. This should probably be num_replicas - 1.
+
     clean_interval: (60) the approximate length of time to loop before
        waiting until the cluster goes clean. (In reality this is used to
        probabilistically choose when to wait, and the method used