From 91942df5a690809ed872f5aa8c35b56e8048e485 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Mon, 30 Apr 2018 14:43:28 -0700
Subject: [PATCH] qa: add test for snap format upgrade

Signed-off-by: Patrick Donnelly
---
 qa/suites/fs/upgrade/snaps/%                  |  0
 .../fs/upgrade/snaps/clusters/3-mds.yaml      |  1 +
 qa/suites/fs/upgrade/snaps/objectstore-ec     |  1 +
 qa/suites/fs/upgrade/snaps/overrides/+        |  0
 .../fs/upgrade/snaps/overrides/debug.yaml     |  1 +
 .../upgrade/snaps/overrides/frag_enable.yaml  |  1 +
 .../upgrade/snaps/overrides/no_multimds.yaml  |  3 +
 .../snaps/overrides/whitelist_health.yaml     |  1 +
 .../whitelist_wrongly_marked_down.yaml        |  1 +
 qa/suites/fs/upgrade/snaps/tasks/%            |  0
 .../fs/upgrade/snaps/tasks/0-luminous.yaml    | 30 ++++++++++
 .../fs/upgrade/snaps/tasks/1-client.yaml      | 13 +++++
 .../fs/upgrade/snaps/tasks/2-upgrade.yaml     | 12 ++++
 .../fs/upgrade/snaps/tasks/3-sanity.yaml      | 10 ++++
 .../snaps/tasks/4-client-upgrade/no.yaml      |  0
 .../snaps/tasks/4-client-upgrade/yes.yaml     | 10 ++++
 .../upgrade/snaps/tasks/5-client-sanity.yaml  | 10 ++++
 .../upgrade/snaps/tasks/6-snap-upgrade.yaml   | 16 ++++++
 .../upgrade/snaps/tasks/7-client-sanity.yaml  |  1 +
 qa/tasks/ceph.py                              |  2 +-
 qa/tasks/cephfs/filesystem.py                 | 14 +++++
 qa/tasks/cephfs_upgrade_snap.py               | 45 +++++++++++++++
 qa/tasks/mds_pre_upgrade.py                   | 56 +++++++++++++++++++
 qa/tasks/workunit.py                          | 33 ++++++-----
 qa/workunits/fs/snaps/snap-hierarchy.sh       | 24 ++++++++
 25 files changed, 271 insertions(+), 14 deletions(-)
 create mode 100644 qa/suites/fs/upgrade/snaps/%
 create mode 120000 qa/suites/fs/upgrade/snaps/clusters/3-mds.yaml
 create mode 120000 qa/suites/fs/upgrade/snaps/objectstore-ec
 create mode 100644 qa/suites/fs/upgrade/snaps/overrides/+
 create mode 120000 qa/suites/fs/upgrade/snaps/overrides/debug.yaml
 create mode 120000 qa/suites/fs/upgrade/snaps/overrides/frag_enable.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/overrides/no_multimds.yaml
 create mode 120000 qa/suites/fs/upgrade/snaps/overrides/whitelist_health.yaml
 create mode 120000 qa/suites/fs/upgrade/snaps/overrides/whitelist_wrongly_marked_down.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/%
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/0-luminous.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/1-client.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/2-upgrade.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/3-sanity.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/4-client-upgrade/no.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/4-client-upgrade/yes.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml
 create mode 100644 qa/suites/fs/upgrade/snaps/tasks/6-snap-upgrade.yaml
 create mode 120000 qa/suites/fs/upgrade/snaps/tasks/7-client-sanity.yaml
 create mode 100644 qa/tasks/cephfs_upgrade_snap.py
 create mode 100644 qa/tasks/mds_pre_upgrade.py
 create mode 100755 qa/workunits/fs/snaps/snap-hierarchy.sh

diff --git a/qa/suites/fs/upgrade/snaps/% b/qa/suites/fs/upgrade/snaps/%
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/qa/suites/fs/upgrade/snaps/clusters/3-mds.yaml b/qa/suites/fs/upgrade/snaps/clusters/3-mds.yaml
new file mode 120000
index 00000000000..e3aff189c62
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/clusters/3-mds.yaml
@@ -0,0 +1 @@
+../../../../../cephfs/clusters/3-mds.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/snaps/objectstore-ec b/qa/suites/fs/upgrade/snaps/objectstore-ec
new file mode 120000
index 00000000000..0545524961e
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/objectstore-ec
@@ -0,0 +1 @@
+../../../../cephfs/objectstore-ec/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/snaps/overrides/+ b/qa/suites/fs/upgrade/snaps/overrides/+
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/qa/suites/fs/upgrade/snaps/overrides/debug.yaml b/qa/suites/fs/upgrade/snaps/overrides/debug.yaml
new file mode 120000
index 00000000000..4fdb9dd1213
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/overrides/debug.yaml
@@ -0,0 +1 @@
+../../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/snaps/overrides/frag_enable.yaml b/qa/suites/fs/upgrade/snaps/overrides/frag_enable.yaml
new file mode 120000
index 00000000000..9e0f15fc29a
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/overrides/frag_enable.yaml
@@ -0,0 +1 @@
+../../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/snaps/overrides/no_multimds.yaml b/qa/suites/fs/upgrade/snaps/overrides/no_multimds.yaml
new file mode 100644
index 00000000000..c740a450a6f
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/overrides/no_multimds.yaml
@@ -0,0 +1,3 @@
+overrides:
+  ceph:
+    max_mds: 1
diff --git a/qa/suites/fs/upgrade/snaps/overrides/whitelist_health.yaml b/qa/suites/fs/upgrade/snaps/overrides/whitelist_health.yaml
new file mode 120000
index 00000000000..42fa3ea7a1f
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/overrides/whitelist_health.yaml
@@ -0,0 +1 @@
+../../../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/snaps/overrides/whitelist_wrongly_marked_down.yaml b/qa/suites/fs/upgrade/snaps/overrides/whitelist_wrongly_marked_down.yaml
new file mode 120000
index 00000000000..3728aacfd15
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/overrides/whitelist_wrongly_marked_down.yaml
@@ -0,0 +1 @@
+../../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/snaps/tasks/% b/qa/suites/fs/upgrade/snaps/tasks/%
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/qa/suites/fs/upgrade/snaps/tasks/0-luminous.yaml b/qa/suites/fs/upgrade/snaps/tasks/0-luminous.yaml
new file mode 100644
index 00000000000..bf627dc394c
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/tasks/0-luminous.yaml
@@ -0,0 +1,30 @@
+meta:
+- desc: |
+   install ceph/luminous latest
+tasks:
+- install:
+    branch: luminous
+- print: "**** done installing luminous"
+- ceph:
+    log-whitelist:
+    - overall HEALTH_
+    - \(FS_
+    - \(MDS_
+    - \(OSD_
+    - \(MON_DOWN\)
+    - \(CACHE_POOL_
+    - \(POOL_
+    - \(MGR_DOWN\)
+    - \(PG_
+    - \(SMALLER_PGP_NUM\)
+    - Monitor daemon marked osd
+    - Behind on trimming
+    - Manager daemon
+    conf:
+      global:
+        mon warn on pool no app: false
+- exec:
+    osd.0:
+    - ceph osd require-osd-release luminous
+    - ceph osd set-require-min-compat-client luminous
+- print: "**** done ceph"
diff --git a/qa/suites/fs/upgrade/snaps/tasks/1-client.yaml b/qa/suites/fs/upgrade/snaps/tasks/1-client.yaml
new file mode 100644
index 00000000000..e9dea8f4e30
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/tasks/1-client.yaml
@@ -0,0 +1,13 @@
+tasks:
+- ceph-fuse:
+- print: "**** done luminous client"
+- exec:
+    mon.a:
+    - ceph fs set cephfs allow_new_snaps true --yes-i-really-mean-it
+- workunit:
+    timeout: 5m
+    cleanup: false
+    clients:
+      client.0:
+      - fs/snaps/snap-hierarchy.sh
+- print: "**** done snap hierarchy"
diff --git a/qa/suites/fs/upgrade/snaps/tasks/2-upgrade.yaml b/qa/suites/fs/upgrade/snaps/tasks/2-upgrade.yaml
new file mode 100644
index 00000000000..5c50d60c460
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/tasks/2-upgrade.yaml
@@ -0,0 +1,12 @@
+tasks:
+- mds_pre_upgrade:
+- print: "**** done mds pre-upgrade sequence"
+- install.upgrade:
+    mon.a:
+    mon.b:
+- print: "**** done install.upgrade both hosts"
+- ceph.stop: [mds.*]
+- ceph.restart:
+    daemons: [mon.*, mgr.*, osd.*, mds.*]
+    mon-health-to-clog: false
+- print: "**** done ceph.restart"
diff --git a/qa/suites/fs/upgrade/snaps/tasks/3-sanity.yaml b/qa/suites/fs/upgrade/snaps/tasks/3-sanity.yaml
new file mode 100644
index 00000000000..d93dc3ba467
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/tasks/3-sanity.yaml
@@ -0,0 +1,10 @@
+tasks:
+- exec:
+    mon.a:
+    - ceph status
+    - ceph fs dump --format=json-pretty
+    - ceph fs set cephfs max_mds 2 && exit 1 || true
+- print: "**** confirmed cannot set max_mds=2"
+- exec:
+    mon.a:
+    - ceph fs set cephfs allow_new_snaps true
diff --git a/qa/suites/fs/upgrade/snaps/tasks/4-client-upgrade/no.yaml b/qa/suites/fs/upgrade/snaps/tasks/4-client-upgrade/no.yaml
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/qa/suites/fs/upgrade/snaps/tasks/4-client-upgrade/yes.yaml b/qa/suites/fs/upgrade/snaps/tasks/4-client-upgrade/yes.yaml
new file mode 100644
index 00000000000..13b590e25af
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/tasks/4-client-upgrade/yes.yaml
@@ -0,0 +1,10 @@
+tasks:
+- install.upgrade:
+    client.0:
+- print: "**** done install.upgrade on client.0"
+- ceph-fuse:
+    client.0:
+      mounted: false
+- ceph-fuse:
+    client.0:
+- print: "**** done remount client"
diff --git a/qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml b/qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml
new file mode 100644
index 00000000000..f32a89da452
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml
@@ -0,0 +1,10 @@
+tasks:
+- workunit:
+    timeout: 5m
+    cleanup: false
+    env:
+      VERIFY: verify
+    clients:
+      client.0:
+      - fs/snaps/snap-hierarchy.sh
+- print: "**** done verify snap hierarchy"
diff --git a/qa/suites/fs/upgrade/snaps/tasks/6-snap-upgrade.yaml b/qa/suites/fs/upgrade/snaps/tasks/6-snap-upgrade.yaml
new file mode 100644
index 00000000000..fe0b17e13aa
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/tasks/6-snap-upgrade.yaml
@@ -0,0 +1,16 @@
+overrides:
+  ceph:
+    log-whitelist:
+    - bad backtrace on inode
+tasks:
+- cephfs_upgrade_snap:
+- print: "**** upgraded snapshot metadata"
+- exec:
+    mon.a:
+    - ceph fs set cephfs max_mds 2
+- print: "**** increased max_mds=2"
+- sleep:
+    duration: 10
+- exec:
+    mon.a:
+    - ceph fs dump | grep '^max_mds.*2'
diff --git a/qa/suites/fs/upgrade/snaps/tasks/7-client-sanity.yaml b/qa/suites/fs/upgrade/snaps/tasks/7-client-sanity.yaml
new file mode 120000
index 00000000000..4ad65e45c57
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/tasks/7-client-sanity.yaml
@@ -0,0 +1 @@
+5-client-sanity.yaml
\ No newline at end of file
diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index 1990e10c231..e53adcf31a7 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -376,7 +376,7 @@ def cephfs_setup(ctx, config):
         all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
         num_active = len([r for r in all_roles if is_active_mds(r)])
 
-        fs.set_max_mds(num_active)
+        fs.set_max_mds(config.get('max_mds', num_active))
 
     yield
 
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 4687c392571..d22126a9baa 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -728,6 +728,16 @@ class Filesystem(MDSCluster):
 
         return result
 
+    def get_rank(self, rank=0, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_rank(self.id, rank)
+
+    def get_ranks(self, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_ranks(self.id)
+
     def get_rank_names(self, status=None):
         """
         Return MDS daemon names of those daemons holding a rank,
@@ -854,6 +864,10 @@ class Filesystem(MDSCluster):
 
         return self.json_asok(command, 'mds', mds_id)
 
+    def rank_asok(self, command, rank=0):
+        info = self.get_rank(rank=rank)
+        return self.json_asok(command, 'mds', info['name'])
+
     def read_cache(self, path, depth=None):
         cmd = ["dump", "tree", path]
         if depth is not None:
diff --git a/qa/tasks/cephfs_upgrade_snap.py b/qa/tasks/cephfs_upgrade_snap.py
new file mode 100644
index 00000000000..a11b1d7ee75
--- /dev/null
+++ b/qa/tasks/cephfs_upgrade_snap.py
@@ -0,0 +1,45 @@
+"""
+Upgrade cluster snap format.
+"""
+
+import logging
+import time
+
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Upgrade CephFS file system snap format.
+    """
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'snap-upgrade task only accepts a dict for configuration'
+
+    fs = Filesystem(ctx)
+
+    mds_map = fs.get_mds_map()
+    assert(mds_map['max_mds'] == 1)
+
+    json = fs.rank_asok(["scrub_path", "/", "force", "recursive", "repair"])
+    if not json or json['return_code'] == 0:
+        log.info("scrub / completed")
+    else:
+        log.info("scrub / failed: {}".format(json))
+
+    json = fs.rank_asok(["scrub_path", "~mdsdir", "force", "recursive", "repair"])
+    if not json or json['return_code'] == 0:
+        log.info("scrub ~mdsdir completed")
+    else:
+        log.info("scrub ~mdsdir failed: {}".format(json))
+
+    for i in range(0, 10):
+        mds_map = fs.get_mds_map()
+        if (mds_map['flags'] & (1<<1)) != 0 and (mds_map['flags'] & (1<<4)) != 0:
+            break
+        time.sleep(10)
+    assert((mds_map['flags'] & (1<<1)) != 0) # Test CEPH_MDSMAP_ALLOW_SNAPS
+    assert((mds_map['flags'] & (1<<4)) != 0) # Test CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
diff --git a/qa/tasks/mds_pre_upgrade.py b/qa/tasks/mds_pre_upgrade.py
new file mode 100644
index 00000000000..5193f92eefa
--- /dev/null
+++ b/qa/tasks/mds_pre_upgrade.py
@@ -0,0 +1,56 @@
+"""
+Prepare MDS cluster for upgrade.
+"""
+
+import logging
+import time
+
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Prepare MDS cluster for upgrade.
+
+    This task reduces ranks to 1 and stops all standbys.
+ """ + + if config is None: + config = {} + assert isinstance(config, dict), \ + 'snap-upgrade task only accepts a dict for configuration' + + fs = Filesystem(ctx) + status = fs.getinfo() + + fs.set_max_mds(1) + status = fs.getinfo() + targets = filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status)) + if len(targets) > 0: + # deactivate mds in decending order + targets = sorted(targets, key=lambda r: r['rank'], reverse=True) + for target in targets: + self.log("deactivating rank %d" % target['rank']) + self.fs.deactivate(target['rank']) + status = self.wait_for_stable()[0] + else: + status = self.wait_for_stable()[0] + + assert(fs.get_mds_map(status=status)['max_mds'] == 1) + assert(fs.get_mds_map(status=status)['in'] == [0]) + + # Stop standbys now to minimize time rank 0 is down in subsequent: + # tasks: + # - ceph.stop: [mds.*] + rank0 = fs.get_rank(rank=0, status=status) + for daemon in ctx.daemons.iter_daemons_of_role('mds', fs.mon_manager.cluster): + if rank0['name'] != daemon.id_: + daemon.stop() + + for i in range(1, 10): + time.sleep(5) # time for FSMap to update + status = fs.getinfo() + if len(list(status.get_standbys())) == 0: + break + assert(len(list(status.get_standbys())) == 0) diff --git a/qa/tasks/workunit.py b/qa/tasks/workunit.py index f69b3960a4c..e9b97e1f4c2 100644 --- a/qa/tasks/workunit.py +++ b/qa/tasks/workunit.py @@ -160,6 +160,7 @@ def task(ctx, config): refspec = Head() timeout = config.get('timeout', '3h') + cleanup = config.get('cleanup', True) log.info('Pulling workunits from ref %s', refspec) @@ -181,24 +182,28 @@ def task(ctx, config): created_mountpoint[role] = created_mnt_dir # Execute any non-all workunits + log.info("timeout={}".format(timeout)) + log.info("cleanup={}".format(cleanup)) with parallel() as p: for role, tests in clients.iteritems(): if role != "all": p.spawn(_run_tests, ctx, refspec, role, tests, config.get('env'), basedir=config.get('basedir','qa/workunits'), - timeout=timeout) + timeout=timeout,cleanup=cleanup) - # Clean up dirs from any non-all workunits - for role, created in created_mountpoint.items(): - _delete_dir(ctx, role, created) + if cleanup: + # Clean up dirs from any non-all workunits + for role, created in created_mountpoint.items(): + _delete_dir(ctx, role, created) # Execute any 'all' workunits if 'all' in clients: all_tasks = clients["all"] _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'), config.get('basedir', 'qa/workunits'), - config.get('subdir'), timeout=timeout) + config.get('subdir'), timeout=timeout, + cleanup=cleanup) def _client_mountpoint(ctx, cluster, id_): @@ -326,7 +331,7 @@ def _make_scratch_dir(ctx, role, subdir): return created_mountpoint -def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None): +def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None, cleanup=True): """ Make a scratch directory for each client in the cluster, and then for each test spawn _run_tests() for each role. @@ -351,12 +356,13 @@ def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=Non timeout=timeout) # cleanup the generated client directories - for role, _ in client_remotes.items(): - _delete_dir(ctx, role, created_mountpoint[role]) + if cleanup: + for role, _ in client_remotes.items(): + _delete_dir(ctx, role, created_mountpoint[role]) def _run_tests(ctx, refspec, role, tests, env, basedir, - subdir=None, timeout=None): + subdir=None, timeout=None, cleanup=True): """ Run the individual test. 
     Run the individual test. Create a scratch directory and then extract the
     workunits from git. Make the executables, and then run the tests.
@@ -472,10 +478,11 @@ def _run_tests(ctx, refspec, role, tests, env, basedir,
                     args=args,
                     label="workunit test {workunit}".format(workunit=workunit)
                 )
-                remote.run(
-                    logger=log.getChild(role),
-                    args=['sudo', 'rm', '-rf', '--', scratch_tmp],
-                )
+                if cleanup:
+                    remote.run(
+                        logger=log.getChild(role),
+                        args=['sudo', 'rm', '-rf', '--', scratch_tmp],
+                    )
     finally:
         log.info('Stopping %s on %s...', tests, role)
         remote.run(
diff --git a/qa/workunits/fs/snaps/snap-hierarchy.sh b/qa/workunits/fs/snaps/snap-hierarchy.sh
new file mode 100755
index 00000000000..67f0e014bd0
--- /dev/null
+++ b/qa/workunits/fs/snaps/snap-hierarchy.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+set -ex
+
+if [ -d "$1" ]; then
+  mkdir -p -- "$1" && cd "$1"
+fi
+
+[ "$VERIFY" != verify ] && mkdir 1
+[ "$VERIFY" != verify ] && mkdir 1/.snap/first
+stat 1/.snap/first
+[ "$VERIFY" != verify ] && mkdir 1/2
+stat 1/.snap/first/2 && exit 1
+[ "$VERIFY" != verify ] && mkdir 1/2/.snap/second
+stat 1/2/.snap/second
+[ "$VERIFY" != verify ] && touch 1/foo
+stat 1/.snap/first/foo && exit 1
+[ "$VERIFY" != verify ] && mkdir 1/.snap/third
+stat 1/.snap/third/foo || exit 1
+[ "$VERIFY" != verify ] && mkdir 1/2/3
+[ "$VERIFY" != verify ] && mkdir 1/2/.snap/fourth
+stat 1/2/.snap/fourth/3
+
+exit 0
-- 
2.39.5
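
The cephfs_upgrade_snap task above verifies the post-upgrade MDSMap by testing raw flag
bits: 1<<1 for CEPH_MDSMAP_ALLOW_SNAPS and 1<<4 for CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS,
polling get_mds_map() until both are set. A minimal sketch of that polling check, with the
bit positions pulled out into named constants, is below. The helper name and constant names
are illustrative only (they are not part of the Ceph QA code); the bit values come from the
comments in the task itself, and the sketch assumes an object like
tasks.cephfs.filesystem.Filesystem whose get_mds_map() returns a dict with an integer
'flags' field.

    # Sketch only: not the actual QA helper, just the same check factored out.
    import time

    ALLOW_SNAPS = 1 << 1            # CEPH_MDSMAP_ALLOW_SNAPS
    ALLOW_MULTIMDS_SNAPS = 1 << 4   # CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS

    def wait_for_snap_flags(fs, attempts=10, interval=10):
        """Poll the MDS map until both snapshot flags are set, else raise."""
        for _ in range(attempts):
            flags = fs.get_mds_map()['flags']
            if flags & ALLOW_SNAPS and flags & ALLOW_MULTIMDS_SNAPS:
                return flags
            time.sleep(interval)
        raise RuntimeError("snapshot feature flags not set after upgrade")

The task could then call wait_for_snap_flags(fs) in place of its inline loop and the two
trailing asserts, keeping the same ten-attempt, ten-second polling behaviour.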