# NOTE(review): diff-style fragment -- '-'/'+' prefixed lines are the old and
# new forms of the rename from wait_till_clean() to the more specific
# wait_for_clean() / wait_for_recovery() calls.
# Flush per-OSD PG stats so the monitors have a current view before waiting.
manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_till_clean()
+ manager.wait_for_clean()
# write some data
p = rados_start(mon, ['-p', 'data', 'bench', '15', 'write', '-b', '4096'])
# wait for everything to peer and be happy...
manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_till_clean()
+ manager.wait_for_recovery()
# write some new data
p = rados_start(mon, ['-p', 'data', 'bench', '30', 'write', '-b', '4096'])
# cluster must recover
manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_till_clean()
+ manager.wait_for_recovery()
# re-add osd.0
manager.revive_osd(0)
manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
# after the revived OSD rejoins, wait for fully clean (not just recovered)
- manager.wait_till_clean()
+ manager.wait_for_clean()
# NOTE(review): fragment of the Thrasher greenlet -- indentation was lost in
# this paste and the body is elided between hunks; only the diff-relevant
# lines are visible here.
class Thrasher(gevent.Greenlet):
def __init__(self, manager, config, logger=None):
self.ceph_manager = manager
# establish a clean baseline before any thrashing starts
- self.ceph_manager.wait_till_clean()
+ self.ceph_manager.wait_for_clean()
osd_status = self.ceph_manager.get_osd_status()
self.in_osds = osd_status['in']
self.live_osds = osd_status['live']
# presumably part of the thrash loop: act with probability delay/cleanint
# per tick -- TODO confirm against the full source
if random.uniform(0,1) < (float(delay) / cleanint):
# never leave more than maxdead OSDs down at once
while len(self.dead_osds) > maxdead:
self.revive_osd()
# after reviving, wait only for recovery (not full clean) before
# continuing to thrash
- self.ceph_manager.wait_till_clean(
+ self.ceph_manager.wait_for_recovery(
timeout=self.config.get('timeout')
)
self.choose_action()()
num += 1
return num
def get_num_active_recovered(self):
    """Return the number of PGs that are active and not recovering.

    Unlike the active+clean count, this is satisfied as soon as recovery
    finishes (the PG may still be dirty), so it is the right predicate for
    wait_for_recovery().
    """
    pgs = self.get_pg_stats()
    # 'in' is the idiomatic substring test; the original used str.count()
    # as a boolean, which needlessly scans the whole state string.
    return sum(1 for pg in pgs
               if 'active' in pg['state'] and 'recovering' not in pg['state'])
# NOTE(review): fragment -- the counting loop and return statement of
# get_num_active() fall outside this hunk; indentation was flattened.
def get_num_active(self):
pgs = self.get_pg_stats()
num = 0
def is_clean(self):
    """Return True when every PG is counted as active+clean."""
    total = self.get_num_pgs()
    return self.get_num_active_clean() == total
def is_recovered(self):
    """Return True once no PG is still recovering (all are active)."""
    return self.get_num_pgs() == self.get_num_active_recovered()
+ def wait_for_clean(self, timeout=None):
+ self.log("waiting for clean")
# NOTE(review): the timeout bookkeeping between these lines is elided by the
# hunk -- as shown, `timeout`, `start` and `num_active_clean` appear unused;
# presumably the full body mirrors wait_for_recovery()'s progress-aware
# timeout check. TODO confirm against the full source.
start = time.time()
num_active_clean = self.get_num_active_clean()
# poll every 3 seconds until every PG is active+clean
while not self.is_clean():
time.sleep(3)
self.log("clean!")
def wait_for_recovery(self, timeout=None):
    """Block until every PG is active and no longer recovering.

    The timeout window is measured since the last observed progress:
    whenever the active-and-recovered PG count changes, the clock is
    restarted, so a slowly-progressing recovery does not trip the assert.
    """
    self.log("waiting for recovery to complete")
    last_progress = time.time()
    prev_count = self.get_num_active_recovered()
    while not self.is_recovered():
        if timeout is not None:
            assert time.time() - last_progress < timeout, \
                'failed to recover before timeout expired'
        cur_count = self.get_num_active_recovered()
        if cur_count != prev_count:
            # progress was made -- grant a fresh timeout window
            last_progress = time.time()
            prev_count = cur_count
        time.sleep(3)
    self.log("recovered!")
def osd_is_up(self, osd):
    """Return True if OSD `osd` is marked up in the current osd dump."""
    dump = self.get_osd_dump()
    entry = dump[osd]
    return entry['up'] > 0
# NOTE(review): further diff hunks from the test scripts; '-'/'+' lines are
# the wait_till_clean() -> wait_for_clean()/wait_for_recovery() rename.
# Flush stats on all three OSDs, then wait for a fully clean cluster.
manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_till_clean()
+ manager.wait_for_clean()
# something that is always there
dummyfile = '/etc/fstab'
manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_till_clean()
+ manager.wait_for_recovery()
# create old objects
for f in range(1, 10):
# nudge osd.2's recovery work queue so it makes progress
manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_till_clean()
+ manager.wait_for_recovery()
# verify result
for f in range(1, 10):
# see if osd.1 can cope
manager.revive_osd(1)
manager.wait_till_osd_is_up(1)
- manager.wait_till_clean()
+ manager.wait_for_clean()
# NOTE(review): the matching try: for this finally: is outside the hunk.
finally:
log.info('joining thrashosds')
thrash_proc.do_join()
# final wait: recovery (not full clean) with a bounded timeout
- manager.wait_till_clean(config.get('timeout', 360))
+ manager.wait_for_recovery(config.get('timeout', 360))