"-w"],
wait=False, stdout=StringIO(), stdin=run.PIPE)
def flush_pg_stats(self, osds, wait_for_mon=3*5):
    """
    Flush pg stats from a list of OSD ids, ensuring they are reflected
    all the way to the monitor. Luminous and later only.

    Every OSD is first told to ``flush_pg_stats``; the value that command
    yields is remembered as the sequence number that has to be reached.
    ``osd last-stat-seq`` is then polled once per second for each OSD
    until the reported sequence number has caught up.

    :param osds: iterable of OSD ids (ints) to flush
    :param wait_for_mon: wait for mon to be synced with mgr, in seconds.
                         The budget is shared by all OSDs of this call.
                         0 to disable it.
                         (3 * mon_mgr_digest_period, by default)
    :raises Exception: when the budget is used up before the sequence
                       number of an OSD is reported back
    """
    # Convert to int so sequence numbers are compared numerically. If
    # raw_cluster_cmd returns the command output as text (presumably it
    # does -- TODO confirm), a plain ``>=`` would compare
    # lexicographically and e.g. '10' >= '9' would be False.
    seq = {osd: int(self.raw_cluster_cmd('tell', 'osd.%d' % osd,
                                         'flush_pg_stats'))
           for osd in osds}
    if not wait_for_mon:
        return
    # polling interval, in seconds
    A_WHILE = 1
    # items() instead of iteritems(): available on python 2 and python 3
    for osd, need in seq.items():
        got = 0
        while wait_for_mon > 0:
            got = int(self.raw_cluster_cmd('osd', 'last-stat-seq',
                                           'osd.%d' % osd))
            self.log('need seq {need} got {got} for osd.{osd}'.format(
                need=need, got=got, osd=osd))
            if got >= need:
                break
            time.sleep(A_WHILE)
            wait_for_mon -= A_WHILE
        else:
            # only reached when the budget ran out without a ``break``
            raise Exception('timed out waiting for mon to be updated with '
                            'osd.{osd}: {got} < {need}'.
                            format(osd=osd, got=got, need=need))
+
def do_rados(self, remote, cmd, check_status=True):
"""
Execute a remote rados command.
while len(manager.get_osd_status()['up']) < 3:
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.raw_cluster_cmd('osd', 'set', 'noout')
manager.raw_cluster_cmd('osd', 'set', 'noin')
manager.raw_cluster_cmd('osd', 'set', 'nodown')
while len(manager.get_osd_status()['up']) < 3:
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.raw_cluster_cmd('osd', 'set', 'noout')
manager.raw_cluster_cmd('osd', 'set', 'noin')
manager.raw_cluster_cmd('osd', 'set', 'nodown')
logger=log.getChild('ceph_manager'),
)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_for_clean(timeout)
manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
manager.mark_out_osd(0)
time.sleep(timeout)
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([1])
manager.wait_for_recovery(timeout)
check_stuck(
)
manager.mark_in_osd(0)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_for_clean(timeout)
check_stuck(
manager.mark_in_osd(id_)
while True:
try:
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
break
except Exception:
log.exception('osds must not be started yet, waiting...')
# kludge to make sure they get a map
rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_for_recovery()
# create old objects
manager.revive_osd(3)
manager.wait_till_osd_is_up(3)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2, 3])
manager.wait_till_active()
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2, 3])
# verify that there are unfound objects
unfound = manager.get_num_unfound_objects()
manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2, 3])
manager.wait_for_recovery()
if not config.get('parallel_bench', True):
# kludge to make sure they get a map
rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_for_recovery()
# create old objects
manager.mark_in_osd(0)
manager.wait_till_osd_is_up(0)
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.flush_pg_stats([1, 0])
manager.wait_till_active()
# take out osd.1 and the only copy of those objects.
manager.mark_in_osd(2)
manager.wait_till_osd_is_up(2)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2])
manager.wait_till_active()
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2])
# verify that there are unfound objects
unfound = manager.get_num_unfound_objects()
manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2])
manager.wait_for_recovery()
# verify result
manager.mark_in_osd(0)
manager.wait_till_active()
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.flush_pg_stats([2, 0])
manager.mark_out_osd(2)
manager.wait_till_active()
manager.mark_in_osd(1)
manager.wait_till_active()
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
log.info("Getting unfound objects")
unfound = manager.get_num_unfound_objects()
assert not unfound
manager.kill_osd(3)
manager.mark_down_osd(3)
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
log.info("Getting unfound objects")
unfound = manager.get_num_unfound_objects()
assert unfound
while len(manager.get_osd_status()['up']) < 3:
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.wait_for_clean()
# write some data
manager.mark_down_osd(0)
# wait for everything to peer and be happy...
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([1, 2])
manager.wait_for_recovery()
# write some new data
manager.wait_till_osd_is_up(2)
# cluster must recover
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([1, 2])
manager.wait_for_recovery()
# re-add osd.0
manager.revive_osd(0)
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([1, 2])
manager.wait_for_clean()
while len(manager.get_osd_status()['up']) < 3:
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.wait_for_clean()
# test some osdmap flags
log.info('err is %d' % err)
# cluster must repeer
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_for_active_or_down()
# write some more (make sure osd.2 really is divergent)
log.info('3 are up!')
# cluster must recover
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.wait_for_clean()
while len(manager.get_osd_status()['up']) < 4:
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2, 3])
manager.wait_for_clean()
log.info('Testing incomplete pgs...')
# move data off of osd.0, osd.1
manager.raw_cluster_cmd('osd', 'out', '0', '1')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2, 3])
manager.wait_for_clean()
# lots of objects in rbd (no pg log, will backfill)
manager.raw_cluster_cmd('osd', 'in', '0', '1')
manager.raw_cluster_cmd('osd', 'out', '2', '3')
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2, 3])
time.sleep(10)
manager.wait_for_active()
manager.kill_osd(3)
log.info('...')
manager.raw_cluster_cmd('osd', 'down', '2', '3')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_for_active_or_down()
assert manager.get_num_down() > 0
while len(manager.get_osd_status()['up']) < 3:
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.wait_for_clean()
for i in range(3):
# kludge to make sure they get a map
rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-'])
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_for_recovery()
# kill another and revive 2, so that some pgs can't peer.
manager.revive_osd(2)
manager.wait_till_osd_is_up(2)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2])
manager.wait_for_active_or_down()
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2])
# look for down pgs
num_down_pgs = 0
# bring it all back
manager.revive_osd(1)
manager.wait_till_osd_is_up(1)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.wait_for_clean()
while len(manager.get_osd_status()['up']) < 3:
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.raw_cluster_cmd('osd', 'set', 'noout')
manager.raw_cluster_cmd('osd', 'set', 'noin')
manager.raw_cluster_cmd('osd', 'set', 'nodown')
while len(manager.get_osd_status()['up']) < 3:
time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1, 2])
manager.wait_for_clean()
manager.create_pool(POOL)
# kludge to make sure they get a map
rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_for_recovery()
# create old objects
manager.mark_in_osd(0)
manager.wait_till_osd_is_up(0)
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 1])
manager.wait_till_active()
# take out osd.1 and the only copy of those objects.
manager.mark_in_osd(2)
manager.wait_till_osd_is_up(2)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2])
manager.wait_till_active()
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2])
# verify that there are unfound objects
unfound = manager.get_num_unfound_objects()
manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.flush_pg_stats([0, 2])
manager.wait_for_recovery()
# verify result
for i in range(num_osds):
manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
'--', '--osd-objectstore-fuse')
- for i in range(num_osds):
- manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
+ manager.flush_pg_stats(range(num_osds))
manager.wait_for_clean()
# write some data