From: Vasu Kulkarni
Date: Sat, 27 Feb 2016 04:16:10 +0000 (-0800)
Subject: Fix recent failures due to HEALTH_WARN check issues
X-Git-Tag: v10.2.6~165^2^2~166^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1c24f58180c66940a3f4cd746e34c6123d3f2359;p=ceph.git

Fix recent failures due to HEALTH_WARN check issues

Recent failures were caused by a HEALTH_WARN check unrelated to the
script. This change bypasses the issue by setting "osd pool default
size: 1" in ceph.conf, and the test now checks for HEALTH_OK instead
of HEALTH_WARN.

It also adds meta information to the tasks describing the test.

Signed-off-by: Vasu Kulkarni
---

diff --git a/clusters/fixed-1.yaml b/clusters/fixed-1.yaml
index 5f3a8f992335..4259c313dac2 100644
--- a/clusters/fixed-1.yaml
+++ b/clusters/fixed-1.yaml
@@ -1,2 +1,8 @@
+overrides:
+  ceph-deploy:
+    conf:
+      global:
+        mon pg warn min per osd: 1
+        osd pool default size: 1
 roles:
-- [mon.a, mon.b, mon.c, osd.0, osd.1, osd.2, client.0]
+- [mon.a, osd.0, osd.1, osd.2, client.0]

diff --git a/suites/smoke/1node/tasks/ceph-deploy.yaml b/suites/smoke/1node/tasks/ceph-deploy.yaml
index df54a6b7d5dc..5a3092304f02 100644
--- a/suites/smoke/1node/tasks/ceph-deploy.yaml
+++ b/suites/smoke/1node/tasks/ceph-deploy.yaml
@@ -1,2 +1,7 @@
+meta:
+- desc: |
+    Run ceph-deploy cli tests on one node
+    and verify all the cli works and the cluster can reach
+    HEALTH_OK state (implicitly verifying the daemons via init).
 tasks:
 - ceph_deploy.single_node_test: null

diff --git a/tasks/ceph_deploy.py b/tasks/ceph_deploy.py
index cb103f0db6a6..b82ed8411138 100644
--- a/tasks/ceph_deploy.py
+++ b/tasks/ceph_deploy.py
@@ -463,36 +463,6 @@ def build_ceph_cluster(ctx, config):
         execute_ceph_deploy(purgedata_nodes)
 
 
-def execute_cdeploy(admin, cmd, path):
-    """Execute ceph-deploy commands """
-    """Either use git path or repo path """
-    if path is not None:
-        ec = admin.run(
-            args=[
-                'cd',
-                run.Raw('~/cdtest'),
-                run.Raw(';'),
-                '{path}/ceph-deploy/ceph-deploy'.format(path=path),
-                run.Raw(cmd),
-            ],
-            check_status=False,
-        ).exitstatus
-    else:
-        ec = admin.run(
-            args=[
-                'cd',
-                run.Raw('~/cdtest'),
-                run.Raw(';'),
-                'ceph-deploy',
-                run.Raw(cmd),
-            ],
-            check_status=False,
-        ).exitstatus
-    if ec != 0:
-        raise RuntimeError(
-            "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))
-
-
 @contextlib.contextmanager
 def cli_test(ctx, config):
     """
@@ -503,8 +473,23 @@ def cli_test(ctx, config):
     log.info('Ceph-deploy Test')
     if config is None:
         config = {}
-    test_branch = ''
+    conf_dir = teuthology.get_testdir(ctx) + "/cdtest"
+
+    def execute_cdeploy(admin, cmd, path):
+        """Execute a ceph-deploy command, from the git checkout under
+        path if given, else from the installed ceph-deploy."""
+        args = ['cd', conf_dir, run.Raw(';')]
+        if path:
+            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
+        else:
+            args.append('ceph-deploy')
+        args.append(run.Raw(cmd))
+        ec = admin.run(args=args, check_status=False).exitstatus
+        if ec != 0:
+            raise RuntimeError(
+                "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))
+
 
     if config.get('rhbuild'):
         path = None
     else:
@@ -517,7 +502,7 @@ def cli_test(ctx, config):
     mons = ctx.cluster.only(teuthology.is_type('mon'))
     for node, role in mons.remotes.iteritems():
         admin = node
-        admin.run(args=['mkdir', '~/', 'cdtest'], check_status=False)
+        admin.run(args=['mkdir', conf_dir], check_status=False)
         nodename = admin.shortname
         system_type = teuthology.get_system_type(admin)
        if config.get('rhbuild'):
@@ -534,14 +519,26 @@ def cli_test(ctx, config):
                 str(devs))
RuntimeError("Needs minimum of 3 devices ") + conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir) new_cmd = 'new ' + nodename + execute_cdeploy(admin, new_cmd, path) + if config.get('conf') is not None: + confp = config.get('conf') + for section, keys in confp.iteritems(): + lines = '[{section}]\n'.format(section=section) + teuthology.append_lines_to_file(admin, conf_path, lines, + sudo=True) + for key, value in keys.iteritems(): + log.info("[%s] %s = %s" % (section, key, value)) + lines = '{key} = {value}\n'.format(key=key, value=value) + teuthology.append_lines_to_file(admin, conf_path, lines, + sudo=True) new_mon_install = 'install {branch} --mon '.format( branch=test_branch) + nodename new_osd_install = 'install {branch} --osd '.format( branch=test_branch) + nodename new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename - create_initial = '--overwrite-conf mon create-initial ' - execute_cdeploy(admin, new_cmd, path) + create_initial = 'mon create-initial ' execute_cdeploy(admin, new_mon_install, path) execute_cdeploy(admin, new_osd_install, path) execute_cdeploy(admin, new_admin, path) @@ -553,17 +550,25 @@ def cli_test(ctx, config): execute_cdeploy(admin, zap_disk, path) execute_cdeploy(admin, prepare, path) - admin.run(args=['ls', run.Raw('-lt'), run.Raw('~/cdtest/')]) - time.sleep(4) + log.info("list files for debugging purpose to check file permissions") + admin.run(args=['ls', run.Raw('-lt'), conf_dir]) remote.run(args=['sudo', 'ceph', '-s'], check_status=False) r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO()) out = r.stdout.getvalue() log.info('Ceph health: %s', out.rstrip('\n')) - if out.split(None, 1)[0] == 'HEALTH_WARN': - log.info('All ceph-deploy cli tests passed') + log.info("Waiting for cluster to become healthy") + retry = 1 + while (out.split(None, 1)[0] != 'HEALTH_OK') and (retry <= 6): + r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO()) + log.info('Retry: %d Ceph health: %s', retry, out.rstrip('\n')) + time.sleep(10) + retry += 1 + if (retry > 6): + raise RuntimeError( + "Failed to reach HEALTH_OK state after {r} retries".format( + r=retry)) else: - raise RuntimeError("Failed to reach HEALTH_WARN State") - + log.info('All ceph-deploy cli tests passed') # test rgw cli rgw_install = 'install {branch} --rgw {node}'.format( branch=test_branch, @@ -588,8 +593,13 @@ def cli_test(ctx, config): execute_cdeploy(admin, cmd, path) cmd = 'purgedata ' + nodename execute_cdeploy(admin, cmd, path) - admin.run(args=['rm', run.Raw('-rf'), run.Raw('~/cdtest/*')]) - admin.run(args=['rmdir', run.Raw('~/cdtest')]) + log.info("Removing temporary dir") + admin.run( + args=[ + 'rm', + run.Raw('-rf'), + run.Raw(conf_dir)], + check_status=False) if config.get('rhbuild'): admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y']) @@ -607,6 +617,8 @@ def single_node_test(ctx, config): log.info("Testing ceph-deploy on single node") if config is None: config = {} + overrides = ctx.config.get('overrides', {}) + teuthology.deep_merge(config, overrides.get('ceph-deploy', {})) if config.get('rhbuild'): log.info("RH Build, Skip Download")