From 502e43978dd5b823207be47710008c028e7b8187 Mon Sep 17 00:00:00 2001
From: Samuel Just
Date: Wed, 27 Mar 2013 12:11:04 -0700
Subject: [PATCH] repair_test: add test for repairing read errs and truncations

Signed-off-by: Samuel Just
---
 teuthology/task/ceph_manager.py |  80 +++++++++++++++++++-
 teuthology/task/repair_test.py  | 127 ++++++++++++++++++++++++++++++++
 2 files changed, 205 insertions(+), 2 deletions(-)
 create mode 100644 teuthology/task/repair_test.py

diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index d8a6f8eb5eb44..83825cb1474a3 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -298,6 +298,32 @@ class CephManager:
             )
         return proc
 
+    def do_put(self, pool, obj, fname):
+        """Run 'rados put' to store local file fname as obj in pool."""
+        return self.do_rados(
+            self.controller,
+            [
+                '-p',
+                pool,
+                'put',
+                obj,
+                fname
+                ]
+            )
+
+    def do_get(self, pool, obj, fname='/dev/null'):
+        """Run 'rados get' to read obj from pool into local file fname."""
+        return self.do_rados(
+            self.controller,
+            [
+                '-p',
+                pool,
+                'get',
+                obj,
+                fname
+                ]
+            )
+
     def osd_admin_socket(self, osdnum, command, check_status=True):
         testdir = teuthology.get_testdir(self.ctx)
         remote = None
@@ -323,14 +349,33 @@ class CephManager:
             check_status=check_status
             )
 
+    def get_pgid(self, pool, pgnum):
+        """Return the pg id string "<poolnum>.<pgnum>" for pool, pgnum."""
+        poolnum = self.get_pool_num(pool)
+        pg_str = "{poolnum}.{pgnum}".format(
+            poolnum=poolnum,
+            pgnum=pgnum)
+        return pg_str
+
+    def get_pg_replica(self, pool, pgnum):
+        """
+        get replica for pool, pgnum (e.g. (data, 0)->0)
+        """
+        output = self.raw_cluster_cmd("pg", "dump", '--format=json')
+        j = json.loads('\n'.join(output.split('\n')[1:]))
+        pg_str = self.get_pgid(pool, pgnum)
+        for pg in j['pg_stats']:
+            if pg['pgid'] == pg_str:
+                return int(pg['acting'][-1])
+        assert False
+
     def get_pg_primary(self, pool, pgnum):
         """
         get primary for pool, pgnum (e.g. (data, 0)->0
         """
-        poolnum = self.get_pool_num(pool)
         output = self.raw_cluster_cmd("pg", "dump", '--format=json')
         j = json.loads('\n'.join(output.split('\n')[1:]))
-        pg_str = "%d.%d" % (poolnum, pgnum)
+        pg_str = self.get_pgid(pool, pgnum)
         for pg in j['pg_stats']:
             if pg['pgid'] == pg_str:
                 return int(pg['acting'][0])
@@ -529,6 +574,37 @@ class CephManager:
                 ret[status] += 1
         return ret
 
+    def pg_scrubbing(self, pool, pgnum):
+        """Return True if 'scrub' appears in the pg's reported state."""
+        pgstr = self.get_pgid(pool, pgnum)
+        stats = self.get_single_pg_stats(pgstr)
+        return 'scrub' in stats['state']
+
+    def pg_repairing(self, pool, pgnum):
+        """Return True if 'repair' appears in the pg's reported state."""
+        pgstr = self.get_pgid(pool, pgnum)
+        stats = self.get_single_pg_stats(pgstr)
+        return 'repair' in stats['state']
+
+    def pg_inconsistent(self, pool, pgnum):
+        """Return True if 'inconsistent' appears in the pg's state."""
+        pgstr = self.get_pgid(pool, pgnum)
+        stats = self.get_single_pg_stats(pgstr)
+        return 'inconsistent' in stats['state']
+
+    def get_last_scrub_stamp(self, pool, pgnum):
+        """Return the last_scrub_stamp reported for pool, pgnum."""
+        stats = self.get_single_pg_stats(self.get_pgid(pool, pgnum))
+        return stats["last_scrub_stamp"]
+
+    def do_pg_scrub(self, pool, pgnum, stype):
+        """Issue 'pg <stype>' and block until last_scrub_stamp changes."""
+        init = self.get_last_scrub_stamp(pool, pgnum)
+        self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
+        while init == self.get_last_scrub_stamp(pool, pgnum):
+            self.log("waiting for scrub type %s"%(stype,))
+            time.sleep(10)
+
     def get_single_pg_stats(self, pgid):
         all_stats = self.get_pg_stats()
 
diff --git a/teuthology/task/repair_test.py b/teuthology/task/repair_test.py
new file mode 100644
index 0000000000000..49ba575b473ab
--- /dev/null
+++ b/teuthology/task/repair_test.py
@@ -0,0 +1,127 @@
+import logging
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def setup(ctx, config):
+    """Wait for clean, create repair_test_pool, and return its name."""
+    ctx.manager.wait_for_clean()
+    ctx.manager.create_pool("repair_test_pool", 1)
+    return "repair_test_pool"
+
+def teardown(ctx, config, pool):
+    """Remove pool, then wait for the cluster to be clean again."""
+    ctx.manager.remove_pool(pool)
+    ctx.manager.wait_for_clean()
+
+def run_test(ctx, config, test):
+    """Run test(ctx, config, pool) between setup() and teardown()."""
+    s = setup(ctx, config)
+    test(ctx, config, s)
+    teardown(ctx, config, s)
+
+def choose_primary(ctx):
+    """Return a chooser(pool, num) giving the pg's first acting osd."""
+    def ret(pool, num):
+        log.info("Choosing primary")
+        return ctx.manager.get_pg_primary(pool, num)
+    return ret
+
+def choose_replica(ctx):
+    """Return a chooser(pool, num) giving the pg's last acting osd."""
+    def ret(pool, num):
+        log.info("Choosing replica")
+        return ctx.manager.get_pg_replica(pool, num)
+    return ret
+
+def trunc(ctx):
+    """Return a corrupter(osd, pool, obj) sending 'truncobj ... 1'."""
+    def ret(osd, pool, obj):
+        log.info("truncating object")
+        return ctx.manager.osd_admin_socket(
+            osd,
+            ['truncobj', pool, obj, '1'])
+    return ret
+
+def dataerr(ctx):
+    """Return a corrupter(osd, pool, obj) sending 'injectdataerr'."""
+    def ret(osd, pool, obj):
+        log.info("injecting data err on object")
+        return ctx.manager.osd_admin_socket(
+            osd,
+            ['injectdataerr', pool, obj])
+    return ret
+
+def mdataerr(ctx):
+    """Return a corrupter(osd, pool, obj) sending 'injectmdataerr'."""
+    def ret(osd, pool, obj):
+        log.info("injecting mdata err on object")
+        return ctx.manager.osd_admin_socket(
+            osd,
+            ['injectmdataerr', pool, obj])
+    return ret
+
+def gen_repair_test(corrupter, chooser, scrub_type):
+    """Build a test: corrupt, scrub, assert inconsistent, repair, recheck."""
+    def ret(ctx, config, pool):
+        log.info("starting repair test")
+        victim_osd = chooser(pool, 0)
+
+        # create object
+        log.info("doing put")
+        ctx.manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
+
+        # corrupt object
+        log.info("corrupting object")
+        corrupter(victim_osd, pool, 'repair_test_obj')
+
+        # verify inconsistent
+        log.info("scrubbing")
+        ctx.manager.do_pg_scrub(pool, 0, scrub_type)
+
+        assert ctx.manager.pg_inconsistent(pool, 0)
+
+        # repair
+        log.info("repairing")
+        ctx.manager.do_pg_scrub(pool, 0, "repair")
+
+        log.info("re-scrubbing")
+        ctx.manager.do_pg_scrub(pool, 0, scrub_type)
+
+        # verify consistent
+        assert not ctx.manager.pg_inconsistent(pool, 0)
+        log.info("done")
+    return ret
+
+def task(ctx, config):
+    """
+    Test [deep] repair in several situations:
+       Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'repair_test task only accepts a dict for config'
+
+    if not hasattr(ctx, 'manager'):
+        first_mon = teuthology.get_first_mon(ctx, config)
+        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+        ctx.manager = ceph_manager.CephManager(
+            mon,
+            ctx=ctx,
+            logger=log.getChild('ceph_manager')
+            )
+
+    tests = [
+        gen_repair_test(mdataerr(ctx), choose_primary(ctx), "scrub"),
+        gen_repair_test(mdataerr(ctx), choose_replica(ctx), "scrub"),
+        gen_repair_test(dataerr(ctx), choose_primary(ctx), "deep-scrub"),
+        gen_repair_test(dataerr(ctx), choose_replica(ctx), "deep-scrub"),
+        gen_repair_test(trunc(ctx), choose_primary(ctx), "scrub"),
+        gen_repair_test(trunc(ctx), choose_replica(ctx), "scrub")
+        ]
+
+    for test in tests:
+        run_test(ctx, config, test)
-- 
2.39.5