From 5741228f605d30640881a0dc3aa885e94061bbe1 Mon Sep 17 00:00:00 2001
From: Samuel Just
Date: Mon, 6 May 2013 14:10:11 -0700
Subject: [PATCH] ceph_manager: add timeout option to revive, increase for
 power_cycle

Signed-off-by: Samuel Just
---
 teuthology/task/ceph_manager.py | 14 +++++++++-----
 teuthology/task/thrashosds.py   |  3 +++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index 3f6b75162..c5c3723fc 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -20,6 +20,10 @@ class Thrasher:
         self.stopping = False
         self.logger = logger
         self.config = config
+        self.revive_timeout = self.config.get("revive_timeout", 75)
+        if self.config.get('powercycle'):
+            self.revive_timeout += 120
+
         num_osds = self.in_osds + self.out_osds
         self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
         if self.logger is not None:
@@ -57,7 +61,7 @@ class Thrasher:
         self.log("Reviving osd %s" % (str(osd),))
         self.live_osds.append(osd)
         self.dead_osds.remove(osd)
-        self.ceph_manager.revive_osd(osd)
+        self.ceph_manager.revive_osd(osd, self.revive_timeout)
 
     def out_osd(self, osd=None):
         if osd is None:
@@ -412,7 +416,7 @@ class CephManager:
             'kick_recovery_wq',
             '0')
 
-    def wait_run_admin_socket(self, osdnum, args=['version']):
+    def wait_run_admin_socket(self, osdnum, args=['version'], timeout=75):
         tries = 0
         while True:
             proc = self.osd_admin_socket(
@@ -422,7 +426,7 @@
                 break
             else:
                 tries += 1
-                if tries > 15:
+                if (tries * 5) > timeout:
                     raise Exception('timed out waiting for admin_socket to appear after osd.{o} restart'.format(o=osdnum))
                 self.log(
                     "waiting on admin_socket for {osdnum}, {command}".format(
@@ -817,7 +821,7 @@ class CephManager:
             time.sleep(2)
         self.ctx.daemons.get_daemon('osd', osd).stop()
 
-    def revive_osd(self, osd):
+    def revive_osd(self, osd, timeout=75):
         if self.config.get('powercycle'):
             (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
             self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
@@ -830,7 +834,7 @@ class CephManager:
             ceph_task.make_admin_daemon_dir(self.ctx, remote)
         self.ctx.daemons.get_daemon('osd', osd).reset()
         self.ctx.daemons.get_daemon('osd', osd).restart()
-        self.wait_run_admin_socket(osd)
+        self.wait_run_admin_socket(osd, timeout=timeout)
 
     def mark_down_osd(self, osd):
         self.raw_cluster_cmd('osd', 'down', str(osd))
diff --git a/teuthology/task/thrashosds.py b/teuthology/task/thrashosds.py
index 7e4e04d33..5dcfb821d 100644
--- a/teuthology/task/thrashosds.py
+++ b/teuthology/task/thrashosds.py
@@ -57,6 +57,9 @@ def task(ctx, config):
     to become clean after each cluster change. If this doesn't
     happen within the timeout, an exception will be raised.
 
+    revive_timeout: (75) number of seconds to wait for an osd asok to
+    appear after attempting to revive the osd
+
     chance_pgnum_grow: (0) chance to increase a pool's size
     chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
     pool_grow_by: (10) amount to increase pgnum by
-- 
2.47.3
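
For context, and not part of the patch itself: the hunk in wait_run_admin_socket replaces a fixed retry count (tries > 15) with a time budget, which is what makes it meaningful to add 120 seconds to revive_timeout when powercycle is in use. Below is a minimal standalone sketch of that polling pattern, assuming each attempt is separated by a 5-second sleep (the sleep itself sits outside the hunk shown); check_admin_socket is a hypothetical stand-in for CephManager.osd_admin_socket.

    import time

    POLL_INTERVAL = 5  # seconds between attempts; matches the (tries * 5) accounting in the hunk

    def wait_for_admin_socket(check_admin_socket, osdnum, timeout=75):
        """Poll until the osd admin socket responds, or raise after roughly timeout seconds."""
        tries = 0
        while True:
            # check_admin_socket is a hypothetical callable returning True once
            # the asok answers (stand-in for the real osd_admin_socket call).
            if check_admin_socket(osdnum):
                return
            tries += 1
            if tries * POLL_INTERVAL > timeout:
                raise Exception(
                    'timed out waiting for admin_socket to appear '
                    'after osd.{o} restart'.format(o=osdnum))
            time.sleep(POLL_INTERVAL)

With the defaults in the patch, an ordinary revive waits up to about 75 seconds, while a powercycle run gets 75 + 120 = 195 seconds before the Thrasher gives up on the osd.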