From: Sage Weil
Date: Thu, 18 May 2017 22:16:55 +0000 (-0400)
Subject: qa/tasks: use new reliable flush_pg_stats helper
X-Git-Tag: ses5-milestone6~8^2~19^2~66
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ab1b78ae00956561975d340b385e78d9d5f841dd;p=ceph.git

qa/tasks: use new reliable flush_pg_stats helper

The helper gets a sequence number from the osd (or osds), and then polls
the mon until that seq is reflected there.

This is overkill in some cases, since many tests only require that the
stats be reflected on the mgr (not the mon), but waiting for it to also
reach the mon is sufficient!

Signed-off-by: Sage Weil
---

diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index ab3df5773f4..1fceb9b9529 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -1148,6 +1148,35 @@ class CephManager:
                   "-w"],
             wait=False, stdout=StringIO(), stdin=run.PIPE)
 
+    def flush_pg_stats(self, osds, wait_for_mon=3*5):
+        """
+        Flush pg stats from a list of OSD ids, ensuring they are reflected
+        all the way to the monitor. Luminous and later only.
+
+        :param osds: list of OSDs to flush
+        :param wait_for_mon: wait for mon to be synced with mgr. 0 to disable
+                             it. (3 * mon_mgr_digest_period, by default)
+        """
+        seq = {osd: self.raw_cluster_cmd('tell', 'osd.%d' % osd, 'flush_pg_stats')
+               for osd in osds}
+        if not wait_for_mon:
+            return
+        for osd, need in seq.iteritems():
+            got = 0
+            while wait_for_mon > 0:
+                got = self.raw_cluster_cmd('osd', 'last-stat-seq', 'osd.%d' % osd)
+                self.log('need seq {need} got {got} for osd.{osd}'.format(
+                    need=need, got=got, osd=osd))
+                if got >= need:
+                    break
+                A_WHILE = 1
+                time.sleep(A_WHILE)
+                wait_for_mon -= A_WHILE
+            else:
+                raise Exception('timed out waiting for mon to be updated with '
+                                'osd.{osd}: {got} < {need}'.
+                                format(osd=osd, got=got, need=need))
+
     def do_rados(self, remote, cmd, check_status=True):
         """
        Execute a remote rados command.
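
As an aside, the two commands the new helper builds on can be exercised
directly against a Luminous or later cluster: "ceph tell osd.N flush_pg_stats"
reports the stat sequence number the OSD just published, and
"ceph osd last-stat-seq osd.N" reports the latest sequence the mon has
recorded for that OSD, which is what the helper compares. The standalone
sketch below illustrates the same flush-then-poll pattern outside of
teuthology; the script, its function name, the int() conversion, and the
15-second timeout are illustrative assumptions, not part of this patch.

#!/usr/bin/env python
# Standalone sketch (not part of this patch): the same flush-then-poll
# pattern the new helper uses, driven through the ceph CLI.  Assumes a
# reachable Luminous+ cluster and admin credentials on this host.
import subprocess
import time


def ceph(*args):
    # Run a ceph CLI command and return its stdout, stripped.
    return subprocess.check_output(('ceph',) + args).decode().strip()


def flush_pg_stats(osd_ids, timeout=15):
    # Ask each OSD to publish its pg stats; the command reports the stat seq.
    need = {osd: int(ceph('tell', 'osd.%d' % osd, 'flush_pg_stats'))
            for osd in osd_ids}
    deadline = time.time() + timeout
    for osd, seq in need.items():
        # Poll the mon until its last-stat-seq for this OSD catches up.
        while int(ceph('osd', 'last-stat-seq', 'osd.%d' % osd)) < seq:
            if time.time() > deadline:
                raise RuntimeError('mon did not catch up with osd.%d' % osd)
            time.sleep(1)


if __name__ == '__main__':
    flush_pg_stats([0, 1, 2])
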
diff --git a/qa/tasks/divergent_priors.py b/qa/tasks/divergent_priors.py
index 6ee376d0a92..12ea93365bf 100644
--- a/qa/tasks/divergent_priors.py
+++ b/qa/tasks/divergent_priors.py
@@ -33,9 +33,7 @@ def task(ctx, config):
     while len(manager.get_osd_status()['up']) < 3:
         time.sleep(10)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.raw_cluster_cmd('osd', 'set', 'noout')
     manager.raw_cluster_cmd('osd', 'set', 'noin')
     manager.raw_cluster_cmd('osd', 'set', 'nodown')
diff --git a/qa/tasks/divergent_priors2.py b/qa/tasks/divergent_priors2.py
index 51a8a68f2e2..26b8120f034 100644
--- a/qa/tasks/divergent_priors2.py
+++ b/qa/tasks/divergent_priors2.py
@@ -35,9 +35,7 @@ def task(ctx, config):
     while len(manager.get_osd_status()['up']) < 3:
         time.sleep(10)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.raw_cluster_cmd('osd', 'set', 'noout')
     manager.raw_cluster_cmd('osd', 'set', 'noin')
     manager.raw_cluster_cmd('osd', 'set', 'nodown')
diff --git a/qa/tasks/dump_stuck.py b/qa/tasks/dump_stuck.py
index 86e04a9ca88..8da634b5d9d 100644
--- a/qa/tasks/dump_stuck.py
+++ b/qa/tasks/dump_stuck.py
@@ -56,8 +56,7 @@ def task(ctx, config):
         logger=log.getChild('ceph_manager'),
         )
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_for_clean(timeout)
 
     manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
@@ -74,7 +73,7 @@ def task(ctx, config):
     manager.mark_out_osd(0)
     time.sleep(timeout)
 
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([1])
     manager.wait_for_recovery(timeout)
 
     check_stuck(
@@ -85,8 +84,7 @@ def task(ctx, config):
         )
 
     manager.mark_in_osd(0)
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_for_clean(timeout)
 
     check_stuck(
@@ -145,8 +143,7 @@ def task(ctx, config):
         manager.mark_in_osd(id_)
     while True:
         try:
-            manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-            manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+            manager.flush_pg_stats([0, 1])
             break
         except Exception:
             log.exception('osds must not be started yet, waiting...')
diff --git a/qa/tasks/ec_lost_unfound.py b/qa/tasks/ec_lost_unfound.py
index 0dd4edcd791..d7a55fcab6a 100644
--- a/qa/tasks/ec_lost_unfound.py
+++ b/qa/tasks/ec_lost_unfound.py
@@ -48,8 +48,7 @@ def task(ctx, config):
     # kludge to make sure they get a map
     rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_for_recovery()
 
     # create old objects
@@ -84,13 +83,9 @@ def task(ctx, config):
     manager.revive_osd(3)
     manager.wait_till_osd_is_up(3)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2, 3])
     manager.wait_till_active()
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2, 3])
     # verify that there are unfound objects
     unfound = manager.get_num_unfound_objects()
@@ -141,9 +136,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
     manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
     manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2, 3])
     manager.wait_for_recovery()
 
     if not config.get('parallel_bench', True):
diff --git a/qa/tasks/lost_unfound.py b/qa/tasks/lost_unfound.py
index 713688fa726..1cc588b4016 100644
--- a/qa/tasks/lost_unfound.py
+++ b/qa/tasks/lost_unfound.py
@@ -48,8 +48,7 @@ def task(ctx, config):
     # kludge to make sure they get a map
     rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_for_recovery()
 
     # create old objects
@@ -84,8 +83,7 @@ def task(ctx, config):
     manager.mark_in_osd(0)
     manager.wait_till_osd_is_up(0)
 
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.flush_pg_stats([1, 0])
     manager.wait_till_active()
 
     # take out osd.1 and the only copy of those objects.
@@ -99,11 +97,9 @@ def task(ctx, config):
     manager.mark_in_osd(2)
     manager.wait_till_osd_is_up(2)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2])
     manager.wait_till_active()
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2])
     # verify that there are unfound objects
     unfound = manager.get_num_unfound_objects()
@@ -160,8 +156,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
     manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2])
     manager.wait_for_recovery()
 
     # verify result
diff --git a/qa/tasks/object_source_down.py b/qa/tasks/object_source_down.py
index bea3d18c8d1..9705d7c7375 100644
--- a/qa/tasks/object_source_down.py
+++ b/qa/tasks/object_source_down.py
@@ -76,8 +76,7 @@ def task(ctx, config):
     manager.mark_in_osd(0)
     manager.wait_till_active()
 
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.flush_pg_stats([2, 0])
 
     manager.mark_out_osd(2)
     manager.wait_till_active()
@@ -86,8 +85,7 @@ def task(ctx, config):
     manager.mark_in_osd(1)
     manager.wait_till_active()
 
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     log.info("Getting unfound objects")
     unfound = manager.get_num_unfound_objects()
     assert not unfound
@@ -97,8 +95,7 @@ def task(ctx, config):
     manager.kill_osd(3)
     manager.mark_down_osd(3)
 
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     log.info("Getting unfound objects")
     unfound = manager.get_num_unfound_objects()
     assert unfound
diff --git a/qa/tasks/osd_backfill.py b/qa/tasks/osd_backfill.py
index dbed30fd412..04658d20569 100644
--- a/qa/tasks/osd_backfill.py
+++ b/qa/tasks/osd_backfill.py
@@ -52,9 +52,7 @@ def task(ctx, config):
     while len(manager.get_osd_status()['up']) < 3:
         time.sleep(10)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.wait_for_clean()
 
     # write some data
@@ -71,8 +69,7 @@ def task(ctx, config):
     manager.mark_down_osd(0)
 
     # wait for everything to peer and be happy...
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([1, 2])
     manager.wait_for_recovery()
 
     # write some new data
@@ -96,14 +93,12 @@ def task(ctx, config):
     manager.wait_till_osd_is_up(2)
 
     # cluster must recover
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([1, 2])
     manager.wait_for_recovery()
 
     # re-add osd.0
     manager.revive_osd(0)
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([1, 2])
     manager.wait_for_clean()
 
 
diff --git a/qa/tasks/osd_recovery.py b/qa/tasks/osd_recovery.py
index db46ade9985..02a1cc1ba92 100644
--- a/qa/tasks/osd_recovery.py
+++ b/qa/tasks/osd_recovery.py
@@ -52,9 +52,7 @@ def task(ctx, config):
     while len(manager.get_osd_status()['up']) < 3:
         time.sleep(10)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.wait_for_clean()
 
     # test some osdmap flags
@@ -86,8 +84,7 @@ def task(ctx, config):
     log.info('err is %d' % err)
 
     # cluster must repeer
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_for_active_or_down()
 
     # write some more (make sure osd.2 really is divergent)
@@ -103,9 +100,7 @@ def task(ctx, config):
     log.info('3 are up!')
 
     # cluster must recover
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.wait_for_clean()
 
 
@@ -134,10 +129,7 @@ def test_incomplete_pgs(ctx, config):
     while len(manager.get_osd_status()['up']) < 4:
         time.sleep(10)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2, 3])
     manager.wait_for_clean()
 
     log.info('Testing incomplete pgs...')
@@ -149,10 +141,7 @@ def test_incomplete_pgs(ctx, config):
 
     # move data off of osd.0, osd.1
     manager.raw_cluster_cmd('osd', 'out', '0', '1')
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2, 3])
     manager.wait_for_clean()
 
     # lots of objects in rbd (no pg log, will backfill)
@@ -171,10 +160,7 @@ def test_incomplete_pgs(ctx, config):
     manager.raw_cluster_cmd('osd', 'in', '0', '1')
     manager.raw_cluster_cmd('osd', 'out', '2', '3')
     time.sleep(10)
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2, 3])
     time.sleep(10)
     manager.wait_for_active()
 
@@ -187,8 +173,7 @@ def test_incomplete_pgs(ctx, config):
     manager.kill_osd(3)
     log.info('...')
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_for_active_or_down()
 
     assert manager.get_num_down() > 0
diff --git a/qa/tasks/peer.py b/qa/tasks/peer.py
index f2279d75be2..9850da18e55 100644
--- a/qa/tasks/peer.py
+++ b/qa/tasks/peer.py
@@ -30,9 +30,7 @@ def task(ctx, config):
     while len(manager.get_osd_status()['up']) < 3:
         time.sleep(10)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.wait_for_clean()
 
     for i in range(3):
@@ -47,8 +45,7 @@ def task(ctx, config):
     # kludge to make sure they get a map
     rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-'])
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_for_recovery()
 
     # kill another and revive 2, so that some pgs can't peer.
@@ -57,13 +54,11 @@ def task(ctx, config):
     manager.revive_osd(2)
     manager.wait_till_osd_is_up(2)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2])
 
     manager.wait_for_active_or_down()
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2])
 
     # look for down pgs
     num_down_pgs = 0
@@ -91,7 +86,5 @@ def task(ctx, config):
     # bring it all back
     manager.revive_osd(1)
     manager.wait_till_osd_is_up(1)
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.wait_for_clean()
diff --git a/qa/tasks/reg11184.py b/qa/tasks/reg11184.py
index ebddcc3bd94..03db1b0641e 100644
--- a/qa/tasks/reg11184.py
+++ b/qa/tasks/reg11184.py
@@ -40,9 +40,7 @@ def task(ctx, config):
     while len(manager.get_osd_status()['up']) < 3:
         time.sleep(10)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.raw_cluster_cmd('osd', 'set', 'noout')
     manager.raw_cluster_cmd('osd', 'set', 'noin')
     manager.raw_cluster_cmd('osd', 'set', 'nodown')
diff --git a/qa/tasks/rep_lost_unfound_delete.py b/qa/tasks/rep_lost_unfound_delete.py
index b0ba3dc0ed0..4e5678d0858 100644
--- a/qa/tasks/rep_lost_unfound_delete.py
+++ b/qa/tasks/rep_lost_unfound_delete.py
@@ -32,9 +32,7 @@ def task(ctx, config):
     while len(manager.get_osd_status()['up']) < 3:
         time.sleep(10)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1, 2])
     manager.wait_for_clean()
 
     manager.create_pool(POOL)
@@ -50,8 +48,7 @@ def task(ctx, config):
     # kludge to make sure they get a map
     rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_for_recovery()
 
     # create old objects
@@ -86,8 +83,7 @@ def task(ctx, config):
     manager.mark_in_osd(0)
     manager.wait_till_osd_is_up(0)
 
-    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 1])
     manager.wait_till_active()
 
     # take out osd.1 and the only copy of those objects.
@@ -101,11 +97,9 @@ def task(ctx, config):
     manager.mark_in_osd(2)
     manager.wait_till_osd_is_up(2)
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2])
     manager.wait_till_active()
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2])
     # verify that there are unfound objects
     unfound = manager.get_num_unfound_objects()
@@ -162,8 +156,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
     manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
 
-    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.flush_pg_stats([0, 2])
     manager.wait_for_recovery()
 
     # verify result
diff --git a/qa/tasks/scrub_test.py b/qa/tasks/scrub_test.py
index 2c22bb6e4bf..a545c9b89e2 100644
--- a/qa/tasks/scrub_test.py
+++ b/qa/tasks/scrub_test.py
@@ -375,8 +375,7 @@ def task(ctx, config):
     for i in range(num_osds):
         manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs', '--',
                                 '--osd-objectstore-fuse')
-    for i in range(num_osds):
-        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
+    manager.flush_pg_stats(range(num_osds))
     manager.wait_for_clean()
 
     # write some data