From: Samuel Just
Date: Wed, 23 Apr 2014 00:38:12 +0000 (-0700)
Subject: task/: add tests for ec and rep mark_unfound_lost delete
X-Git-Tag: 1.1.0~1525
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e64d8314ac81c38cbf5234d42dc503e766b339b2;p=teuthology.git

task/: add tests for ec and rep mark_unfound_lost delete

Signed-off-by: Samuel Just
---

diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index 3952950b..481c2173 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -681,7 +681,7 @@ class CephManager:
         self.log(status)
         return status['pgmap']['num_pgs']
 
-    def create_pool_with_unique_name(self, pg_num=16, ec_pool=False):
+    def create_pool_with_unique_name(self, pg_num=16, ec_pool=False, ec_m=1, ec_k=2):
         """
         Create a pool named unique_pool_X where X is unique.
         """
@@ -689,10 +689,15 @@
         with self.lock:
             name = "unique_pool_%s" % (str(self.next_pool_id),)
             self.next_pool_id += 1
-            self.create_pool(name, pg_num, ec_pool=ec_pool)
+            self.create_pool(
+                name,
+                pg_num,
+                ec_pool=ec_pool,
+                ec_m=ec_m,
+                ec_k=ec_k)
             return name
 
-    def create_pool(self, pool_name, pg_num=16, ec_pool=False):
+    def create_pool(self, pool_name, pg_num=16, ec_pool=False, ec_m=1, ec_k=2):
         """
         Create a pool named from the pool_name parameter.
         :param pool_name: name of the pool being created.
@@ -705,7 +710,7 @@
         self.log("creating pool_name %s"%(pool_name,))
         if ec_pool and not self.created_erasure_pool:
             self.created_erasure_pool = True
-            self.raw_cluster_cmd('osd', 'erasure-code-profile', 'set', 'teuthologyprofile', 'ruleset-failure-domain=osd', 'm=1', 'k=2')
+            self.raw_cluster_cmd('osd', 'erasure-code-profile', 'set', 'teuthologyprofile', 'ruleset-failure-domain=osd', 'm='+str(ec_m), 'k='+str(ec_k))
         if ec_pool:
             self.raw_cluster_cmd('osd', 'pool', 'create', pool_name, str(pg_num), str(pg_num), 'erasure', 'teuthologyprofile')
diff --git a/teuthology/task/ec_lost_unfound.py b/teuthology/task/ec_lost_unfound.py
new file mode 100644
index 00000000..25bac6cb
--- /dev/null
+++ b/teuthology/task/ec_lost_unfound.py
@@ -0,0 +1,134 @@
+"""
+Lost_unfound
+"""
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+from teuthology.task_util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of lost objects on an ec pool.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.wait_for_clean()
+
+
+    pool = manager.create_pool_with_unique_name(
+        ec_pool=True,
+        ec_m=2,
+        ec_k=2)
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.wait_for_recovery()
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f])
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+        'tell', 'osd.1',
+        'injectargs',
+        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+        )
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+    manager.kill_osd(3)
+    manager.mark_down_osd(3)
+
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
+
+    # take out osd.1 and a necessary shard of those objects.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+    manager.revive_osd(0)
+    manager.wait_till_osd_is_up(0)
+    manager.revive_osd(3)
+    manager.wait_till_osd_is_up(3)
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.wait_till_active()
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+
+    # verify that there are unfound objects
+    unfound = manager.get_num_unfound_objects()
+    log.info("there are %d unfound objects" % unfound)
+    assert unfound
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        if pg['stat_sum']['num_objects_unfound'] > 0:
+            # verify that i can list them direct from the osd
+            log.info('listing missing/lost in %s state %s', pg['pgid'],
+                     pg['state'])
+            m = manager.list_pg_missing(pg['pgid'])
+            log.info('%s' % m)
+            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+
+            log.info("reverting unfound in %s", pg['pgid'])
+            manager.raw_cluster_cmd('pg', pg['pgid'],
+                                    'mark_unfound_lost', 'delete')
+        else:
+            log.info("no unfound in %s", pg['pgid'])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+    manager.wait_for_recovery()
+
+    # verify result
+    for f in range(1, 10):
+        err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-'])
+        assert err
+
+    # see if osd.1 can cope
+    manager.revive_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.wait_for_clean()
diff --git a/teuthology/task/rep_lost_unfound_delete.py b/teuthology/task/rep_lost_unfound_delete.py
new file mode 100644
index 00000000..f75a4d2b
--- /dev/null
+++ b/teuthology/task/rep_lost_unfound_delete.py
@@ -0,0 +1,153 @@
+"""
+Lost_unfound
+"""
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+from teuthology.task_util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of lost objects.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        manager.sleep(10)
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+
+    # take an osd out until the very end
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+    manager.mark_out_osd(2)
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.wait_for_recovery()
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', 'data', 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', 'data', 'rm', 'existed_%d' % f])
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+        'tell', 'osd.1',
+        'injectargs',
+        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+        )
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', 'data', 'put', 'new_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', 'data', 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])
+
+    # bring osd.0 back up, let it peer, but don't replicate the new
+    # objects...
+    log.info('osd.0 command_args is %s' % 'foo')
+    log.info(ctx.daemons.get_daemon('osd', 0).command_args)
+    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
+        '--osd-recovery-delay-start', '1000'
+        ])
+    manager.revive_osd(0)
+    manager.mark_in_osd(0)
+    manager.wait_till_osd_is_up(0)
+
+    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.wait_till_active()
+
+    # take out osd.1 and the only copy of those objects.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.mark_out_osd(1)
+    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+
+    # bring up osd.2 so that things would otherwise, in theory, recover fully
+    manager.revive_osd(2)
+    manager.mark_in_osd(2)
+    manager.wait_till_osd_is_up(2)
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.wait_till_active()
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+
+    # verify that there are unfound objects
+    unfound = manager.get_num_unfound_objects()
+    log.info("there are %d unfound objects" % unfound)
+    assert unfound
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        if pg['stat_sum']['num_objects_unfound'] > 0:
+            primary = 'osd.%d' % pg['acting'][0]
+
+            # verify that i can list them direct from the osd
+            log.info('listing missing/lost in %s state %s', pg['pgid'],
+                     pg['state'])
+            m = manager.list_pg_missing(pg['pgid'])
+            #log.info('%s' % m)
+            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+            num_unfound = 0
+            for o in m['objects']:
+                if len(o['locations']) == 0:
+                    num_unfound += 1
+            assert m['num_unfound'] == num_unfound
+
+            log.info("reverting unfound in %s on %s", pg['pgid'], primary)
+            manager.raw_cluster_cmd('pg', pg['pgid'],
+                                    'mark_unfound_lost', 'delete')
+        else:
+            log.info("no unfound in %s", pg['pgid'])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+    manager.wait_for_recovery()
+
+    # verify result
+    for f in range(1, 10):
+        err = rados(ctx, mon, ['-p', 'data', 'get', 'new_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', 'data', 'get', 'existed_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', 'data', 'get', 'existing_%d' % f, '-'])
+        assert err
+
+    # see if osd.1 can cope
+    manager.revive_osd(1)
+    manager.mark_in_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.wait_for_clean()
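
Usage note: the new ec_m/ec_k arguments to CephManager are exercised by the ec_lost_unfound task roughly as in the minimal sketch below, which assumes an already-constructed CephManager instance named `manager` (as built at the top of both tasks) and is illustrative rather than part of the patch:

    # Create a uniquely named erasure-coded pool. CephManager first defines
    # the 'teuthologyprofile' erasure-code profile with the given k/m values
    # and then creates the pool against that profile.
    pool = manager.create_pool_with_unique_name(
        ec_pool=True,
        ec_m=2,  # coding (parity) chunks
        ec_k=2)  # data chunks

With k=2/m=2 every PG stores four shards and any two of them suffice to reconstruct an object, so once the task leaves only a single surviving shard of the newly written objects they become unfound, which is the state 'ceph pg <pgid> mark_unfound_lost delete' is then run against.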