From 995dc1f7510c71ce8dc38088eb805dfe42b965cc Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Tue, 21 Feb 2012 13:11:05 -0800
Subject: [PATCH] Add a task for testing stuck pg visibility.

---
 teuthology/task/ceph_manager.py |  12 +++
 teuthology/task/dump_stuck.py   | 116 ++++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 teuthology/task/dump_stuck.py

diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index 1fac703800..a49ce8aae8 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -207,6 +207,18 @@ class CephManager:
         j = json.loads('\n'.join(out.split('\n')[1:]))
         return j['osds']
 
+    def get_stuck_pgs(self, type_, threshold):
+        """
+        Run 'pg dump_stuck' for the given type_ and return the parsed JSON.
+
+        threshold is stringified and passed to the command via -t.
+        """
+        out = self.raw_cluster_cmd('--', 'pg', 'dump_stuck', type_,
+                                   '--format=json', '-t', str(threshold))
+        # The first line of output is dropped before parsing; presumably
+        # it is not part of the JSON document (TODO confirm).
+        return json.loads('\n'.join(out.split('\n')[1:]))
+
     def get_num_unfound_objects(self):
         status = self.raw_cluster_status()
         self.log(status)
diff --git a/teuthology/task/dump_stuck.py b/teuthology/task/dump_stuck.py
new file mode 100644
index 0000000000..7bf4e5aa8f
--- /dev/null
+++ b/teuthology/task/dump_stuck.py
@@ -0,0 +1,116 @@
+import logging
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
+    """
+    Assert that the cluster reports the expected numbers of stuck pgs.
+
+    Asks manager.get_stuck_pgs() about the 'inactive', 'unclean' and
+    'stale' states in that order, passing timeout as the threshold, and
+    raises AssertionError at the first state whose count is unexpected.
+    """
+    expected = (
+        ('inactive', num_inactive),
+        ('unclean', num_unclean),
+        ('stale', num_stale),
+        )
+    for state, num_expected in expected:
+        stuck = manager.get_stuck_pgs(state, timeout)
+        assert len(stuck) == num_expected, \
+            'expected %s stuck %s pgs, found %d' % (
+                num_expected, state, len(stuck))
+
+def task(ctx, config):
+    """
+    Test the dump_stuck command.
+
+    Requires no configuration and a cluster with exactly 2 osds.  Checks
+    the stuck pg counts in four situations: once the cluster is clean,
+    after osd 0 is marked out (all pgs unclean), after it is marked back
+    in (none stuck), and after every osd is killed and marked down (all
+    pgs stale).  Finally the osds are revived and rechecked when clean.
+    """
+    assert config is None, \
+        'dump_stuck requires no configuration'
+    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
+        'dump_stuck requires exactly 2 osds'
+
+    timeout = 60
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    manager.wait_for_clean(timeout)
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
+    num_pgs = manager.get_num_pgs()
+
+    manager.mark_out_osd(0)
+    time.sleep(timeout)
+    manager.wait_for_recovery(timeout)
+
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=num_pgs,
+        num_stale=0,
+        )
+
+    manager.mark_in_osd(0)
+    manager.wait_for_clean(timeout)
+
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
+
+    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
+        manager.kill_osd(id_)
+        manager.mark_down_osd(id_)
+
+    # wait up to 15 minutes to become stale, pausing between polls so
+    # the cluster is not flooded with back-to-back dump_stuck commands
+    stale_deadline = time.time() + 900
+    poll_interval = 5
+    while True:
+        try:
+            check_stuck(
+                manager,
+                num_inactive=0,
+                num_unclean=0,
+                num_stale=num_pgs,
+                )
+            break
+        except AssertionError:
+            if time.time() > stale_deadline:
+                raise
+            time.sleep(poll_interval)
+
+    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
+        manager.revive_osd(id_)
+        manager.mark_in_osd(id_)
+    manager.wait_for_clean(timeout)
+
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
-- 
2.39.5