From f81b1b61c97dba4c3ff6aa2ba08c3d1fcc00f7c1 Mon Sep 17 00:00:00 2001
From: Sungmin Lee
Date: Tue, 23 Aug 2022 13:51:31 +0900
Subject: [PATCH] qa: add validation stage for deduplication.py

To validate that sample-dedup actually works, validate() runs in a
thread separate from sample-dedup and verifies the following two
things:

1. sample-dedup starts properly.
2. the references of all chunk objects in the chunk tier exist in the
   designated base pool.

This routine repeats up to max_validation_cnt times while sample-dedup
is running. If no failure is raised during the loop, we can assume that
sample-dedup works correctly. If not, an assert() stops the test.

In case a reference of a chunk object does not exist in the base pool,
validate() gives it a second chance after repairing it (via the
chunk-repair op), to handle false-positive reference inconsistencies.

Signed-off-by: Sungmin Lee
---
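Note: the check that validate() automates corresponds roughly to the
following manual sequence (pool names are taken from the example config
in the docstring below; <chunk_obj>, <oid> and <base_pool_id> are
placeholders):

    # confirm sample-dedup has started writing chunk objects
    rados ls -p low_tier
    # dump the reference list of one chunk object
    ceph-dedup-tool --op dump-chunk-refs --chunk-pool low_tier --object <chunk_obj>
    # verify a referenced object exists in the base pool
    rados -p default.rgw.buckets.data stat <oid>
    # if the reference is missing, repair it and re-check once
    ceph-dedup-tool --op chunk-repair --chunk-pool low_tier --object <chunk_obj> \
        --target-ref <oid> --target-ref-pool-id <base_pool_id>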
 qa/tasks/deduplication.py | 172 +++++++++++++++++++++++++++++++-------
 1 file changed, 142 insertions(+), 30 deletions(-)

diff --git a/qa/tasks/deduplication.py b/qa/tasks/deduplication.py
index 056c91f4391..ace0c86e144 100644
--- a/qa/tasks/deduplication.py
+++ b/qa/tasks/deduplication.py
@@ -5,6 +5,9 @@
 import contextlib
 import logging
 import gevent
 from teuthology import misc as teuthology
+import json
+import time
+from io import StringIO
 from teuthology.orchestra import run
 
@@ -15,9 +18,7 @@ log = logging.getLogger(__name__)
 def task(ctx, config):
     """
     Run ceph-dedup-tool.
-
     The config should be as follows::
-
       ceph-dedup-tool:
         clients: [client list]
         op:
@@ -26,13 +27,10 @@ def task(ctx, config):
         chunk_size:
         chunk_algorithm:
         fingerprint_algorithm:
-        object_dedup_threashold: <#% of duplicate chunks within an object to trigger object dedup>
         chunk_dedup_threashold:
         max_thread:
         wakeup_period:
-
     For example::
-
         tasks:
         - exec:
             client.0:
@@ -42,14 +40,13 @@ def task(ctx, config):
             op: 'sample-dedup'
             pool: 'default.rgw.buckets.data'
             chunk_pool: 'low_tier'
-            chunk_size: 8192
+            chunk_size: 131072
             chunk_algorithm: 'fastcdc'
             fingerprint_algorithm: 'sha1'
-            object_dedup_threshold: 15
             chunk_dedup_threshold: 5
             max_thread: 2
             wakeup_period: 20
-
+            sampling_ratio: 100
     """
     log.info('Beginning deduplication...')
     assert isinstance(config, dict), \
@@ -58,59 +55,174 @@ def task(ctx, config):
     #assert hasattr(ctx, 'rgw')
     testdir = teuthology.get_testdir(ctx)
     args = [
-        'adjust-ulimits',
-        'ceph-coverage',
-        '{tdir}/archive/coverage'.format(tdir=testdir),
         'ceph-dedup-tool']
     if config.get('op', None):
         args.extend(['--op', config.get('op', None)])
     if config.get('chunk_pool', None):
         args.extend(['--chunk-pool', config.get('chunk_pool', None)])
     if config.get('chunk_size', False):
-        args.extend(['--chunk-size', str(config.get('chunk_size', 8192))])
+        args.extend(['--chunk-size', str(config.get('chunk_size', 131072))])
     if config.get('chunk_algorithm', False):
         args.extend(['--chunk-algorithm', config.get('chunk_algorithm', None)])
     if config.get('fingerprint_algorithm', False):
         args.extend(['--fingerprint-algorithm', config.get('fingerprint_algorithm', None)])
-    if config.get('object_dedup_threshold', False):
-        args.extend(['--object-dedup-threshold', str(config.get('object_dedup_threshold', 50))])
     if config.get('chunk_dedup_threshold', False):
-        args.extend(['--chunk-dedup-threshold', str(config.get('chunk_dedup_threshold', 5))])
+        args.extend(['--chunk-dedup-threshold', str(config.get('chunk_dedup_threshold', 1))])
     if config.get('max_thread', False):
         args.extend(['--max-thread', str(config.get('max_thread', 2))])
+    if config.get('sampling_ratio', False):
+        args.extend(['--sampling-ratio', str(config.get('sampling_ratio', 100))])
     if config.get('wakeup_period', False):
-        args.extend(['"--wakeup-period"', str(config.get('wakeup_period', 30))])
+        args.extend(['--wakeup-period', str(config.get('wakeup_period', 20))])
     if config.get('pool', False):
         args.extend(['--pool', config.get('pool', None)])
     args.extend([
         '--debug',
-        '--deamon',
-        '--iterative'])
+        '--daemon',
+        '--loop'])
 
     def thread():
+        run_remote(args, False, 0)
+
+    def run_remote(args, need_wait, client_num):
         clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
         log.info('clients are %s' % clients)
 
-        manager = ctx.managers['ceph']
-        tests = {}
-        log.info("args %s", args)
-        for role in config.get('clients', clients):
-            assert isinstance(role, str)
-            PREFIX = 'client.'
-            assert role.startswith(PREFIX)
-            id_ = role[len(PREFIX):]
+        role = 'client.{id}'.format(id=client_num)
+        if role not in clients:
+            raise Exception('wrong client {c}'.format(c=role))
+        assert isinstance(role, str)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        testdir = teuthology.get_testdir(ctx)
+        cmd_args = [
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir)]
+        cmd_args.extend(args)
+        log.info("cmd: %s", cmd_args)
+        tries = 0
+        while True:
             (remote,) = ctx.cluster.only(role).remotes.keys()
             proc = remote.run(
-                args=args,
-                stdin=run.PIPE,
-                wait=False
+                args=cmd_args,
+                wait=need_wait, check_status=False,
+                stdout=StringIO(),
             )
-            tests[id_] = proc
+            log.info('exitstatus {r}'.format(r=proc.exitstatus))
+            if proc.exitstatus == 0 or not need_wait:
+                log.info('proc stdout %s', proc.stdout.getvalue())
+                return proc.stdout.getvalue().strip()
+            tries += 1
+            if tries > 30:
+                raise Exception('timed out getting correct exitstatus')
+            time.sleep(30)
+
+    def get_chunk_objs(chunk_pool):
+        chunk_obj_list = run_remote(('rados ls -p ' + chunk_pool).split(), True, 1).split()
+        if not chunk_obj_list:
+            return None
+        else:
+            return chunk_obj_list
+
+    def get_ref_list(chunk_pool, chunk_obj):
+        # get the reference list of the chunk object
+        dump_str = run_remote(
+            ('ceph-dedup-tool --op dump-chunk-refs --chunk-pool '
+             + chunk_pool + ' --object ' + chunk_obj).split(),
+            True, 1
+        )
+        # fail if the reference object has not been written
+        assert len(dump_str) > 0
+        log.info('{0} obj has {1} refs'
+                 .format(chunk_obj, json.loads(dump_str)['count']))
+
+        # check if the chunk object's reference objects exist in the base tier
+        ref_list = json.loads(dump_str)['refs']
+        return ref_list
+
+    # To validate whether the sample-dedup operation works well, this function checks if
+    # 1. sample-dedup has been started and
+    # 2. the references of the chunk objects exist in the correct base pool
+    def validate():
+        log.info('start validating sample-dedup')
+        base_pool = config.get('pool', None)
+        chunk_pool = config.get('chunk_pool', None)
+        max_validation_cnt = 15
+        retry_cnt = 0
+        # chunk objs for re-validation after chunk-repair
+        retry_chunk_objs = list()
+
+        # check whether sample-dedup has been started
+        chunk_obj_list = get_chunk_objs(chunk_pool)
+        while not chunk_obj_list and retry_cnt < max_validation_cnt:
+            # retry getting chunk objs after 30 secs of sleep
+            time.sleep(30)
+            chunk_obj_list = get_chunk_objs(chunk_pool)
+            retry_cnt += 1
+            log.info('chunk pool empty. retry %d', retry_cnt)
+        assert retry_cnt < max_validation_cnt
+
+        log.info('sample-dedup started successfully')
+
+        retry_cnt = 0
+        max_validation_cnt = 5
+        # validate the chunk pool max_validation_cnt times
+        while retry_cnt < max_validation_cnt:
+            for chunk_obj in chunk_obj_list:
+                ref_list = get_ref_list(chunk_pool, chunk_obj)
+                for ref in ref_list:
+                    ret = run_remote(
+                        ('rados -p ' + base_pool + ' stat ' + ref['oid'])
+                        .split(), True, 1
+                    )
+                    # check if the ref exists in the base pool
+                    if not ret:
+                        # if the ref does not exist in the base pool, try repair in
+                        # order to avoid a false-positive inconsistent reference
+                        ret = run_remote(('ceph osd pool stats ' + base_pool).split(), True, 1)
+                        assert len(ret) > 0
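+                        # NOTE: assumes the first line of 'ceph osd pool stats' output
+                        # is 'pool <name> id <id>', i.e. the pool id is the 4th token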
+                        base_pool_id = ret.split()[3]
+                        ret = run_remote(
+                            ('ceph-dedup-tool --op chunk-repair --chunk-pool '
+                             + chunk_pool + ' --object ' + chunk_obj + ' --target-ref '
+                             + ref['oid'] + ' --target-ref-pool-id ' + base_pool_id)
+                            .split(), True, 1
+                        )
+                        retry_chunk_objs.append(chunk_obj)
+                    else:
+                        log.info('{0} obj exists in {1}'.format(ref['oid'], base_pool))
+
+            # retry validation for the repaired objects
+            for chunk_obj in retry_chunk_objs:
+                ref_list = get_ref_list(chunk_pool, chunk_obj)
+                for ref in ref_list:
+                    ret = run_remote(
+                        ('rados -p ' + base_pool + ' stat ' + ref['oid'])
+                        .split(), True, 1
+                    )
+                    assert len(ret) > 0
+                    log.info(
+                        '{0} obj exists in {1} after repair'.format(ref['oid'],
+                                                                    base_pool)
+                    )
+            retry_chunk_objs = list()
+
+            # get the chunk objects for the next loop
+            chunk_obj_list = get_chunk_objs(chunk_pool)
+            retry_cnt += 1
+            time.sleep(30)
+        return True
+
     running = gevent.spawn(thread)
+    checker = gevent.spawn(validate)
 
     try:
         yield
     finally:
         log.info('joining ceph-dedup-tool')
         running.get()
+        checker.get()
-- 
2.39.5