ceph_manager: add timeout option to revive, increase for power_cycle

author Samuel Just <sam.just@inktank.com>

Mon, 6 May 2013 21:10:11 +0000 (14:10 -0700)

committer Samuel Just <sam.just@inktank.com>

Tue, 7 May 2013 22:51:36 +0000 (15:51 -0700)
author Samuel Just <sam.just@inktank.com>
Mon, 6 May 2013 21:10:11 +0000 (14:10 -0700)
committer Samuel Just <sam.just@inktank.com>
Tue, 7 May 2013 22:51:36 +0000 (15:51 -0700)
diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py

index 3f6b75162fb7d8790bc784e67f10046df749481f..c5c3723fc2e638ee89f7097d50773d1c687a1b16 100644 (file)
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -20,6 +20,10 @@ class Thrasher:
          self.stopping = False
          self.logger = logger
          self.config = config
+        self.revive_timeout = self.config.get("revive_timeout", 75)
+        if self.config.get('powercycle'):
+            self.revive_timeout += 120
+
          num_osds = self.in_osds + self.out_osds
          self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
          if self.logger is not None:
@@ -57,7 +61,7 @@ class Thrasher:
          self.log("Reviving osd %s" % (str(osd),))
          self.live_osds.append(osd)
          self.dead_osds.remove(osd)
-        self.ceph_manager.revive_osd(osd)
+        self.ceph_manager.revive_osd(osd, self.revive_timeout)
  
      def out_osd(self, osd=None):
          if osd is None:
@@ -412,7 +416,7 @@ class CephManager:
              'kick_recovery_wq',
              '0')
  
-    def wait_run_admin_socket(self, osdnum, args=['version']):
+    def wait_run_admin_socket(self, osdnum, args=['version'], timeout=75):
          tries = 0
          while True:
              proc = self.osd_admin_socket(
@@ -422,7 +426,7 @@ class CephManager:
                  break
              else:
                  tries += 1
-                if tries > 15:
+                if (tries * 5) > timeout:
                      raise Exception('timed out waiting for admin_socket to appear after osd.{o} restart'.format(o=osdnum))
                  self.log(
                      "waiting on admin_socket for {osdnum}, {command}".format(
@@ -817,7 +821,7 @@ class CephManager:
          time.sleep(2)
          self.ctx.daemons.get_daemon('osd', osd).stop()
  
-    def revive_osd(self, osd):
+    def revive_osd(self, osd, timeout=75):
          if self.config.get('powercycle'):
              (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
              self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
@@ -830,7 +834,7 @@ class CephManager:
              ceph_task.make_admin_daemon_dir(self.ctx, remote)
              self.ctx.daemons.get_daemon('osd', osd).reset()
          self.ctx.daemons.get_daemon('osd', osd).restart()
-        self.wait_run_admin_socket(osd)
+        self.wait_run_admin_socket(osd, timeout=timeout)
  
      def mark_down_osd(self, osd):
          self.raw_cluster_cmd('osd', 'down', str(osd))
diff --git a/teuthology/task/thrashosds.py b/teuthology/task/thrashosds.py

index 7e4e04d339bc980938cfdcb0f2a0f1bb8c24150f..5dcfb821dc51ba4ea2acf66699f49c932fb40bee 100644 (file)
--- a/teuthology/task/thrashosds.py
+++ b/teuthology/task/thrashosds.py
@@ -57,6 +57,9 @@ def task(ctx, config):
         to become clean after each cluster change. If this doesn't
         happen within the timeout, an exception will be raised.
  
+    revive_timeout: (75) number of seconds to wait for an osd asok to
+       appear after attempting to revive the osd
+
      chance_pgnum_grow: (0) chance to increase a pool's size
      chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
      pool_grow_by: (10) amount to increase pgnum by
author	Samuel Just <sam.just@inktank.com>
	Mon, 6 May 2013 21:10:11 +0000 (14:10 -0700)
committer	Samuel Just <sam.just@inktank.com>
	Tue, 7 May 2013 22:51:36 +0000 (15:51 -0700)
teuthology/task/ceph_manager.py		patch | blob | history
teuthology/task/thrashosds.py		patch | blob | history