From d6e3099e4ac133e2aa3afa31e16a485c6fd41eb8 Mon Sep 17 00:00:00 2001
From: Douglas Fuller
Date: Wed, 24 Jan 2018 11:11:40 -0500
Subject: [PATCH] qa/cephfs: Add tests to validate scrub functionality

Add tests to ensure the scrub operation is not adversely affected by
certain metadata pathologies.

Signed-off-by: Douglas Fuller
---
 .../tasks/cephfs_scrub_tests.yaml |   1 +
 qa/tasks/cephfs/filesystem.py     |   7 +-
 qa/tasks/cephfs/test_scrub.py     | 157 ++++++++++++++++++
 3 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 qa/tasks/cephfs/test_scrub.py

diff --git a/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml b/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
index 30b3a96e20760..8d4cbeab9b987 100644
--- a/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
+++ b/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
@@ -14,3 +14,4 @@ tasks:
 - cephfs_test_runner:
     modules:
       - tasks.cephfs.test_scrub_checks
+      - tasks.cephfs.test_scrub
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 9638fd55c96ab..6d4b91bfff6bf 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -1055,7 +1055,8 @@ class Filesystem(MDSCluster):
         else:
             return True
 
-    def rados(self, args, pool=None, namespace=None, stdin_data=None):
+    def rados(self, args, pool=None, namespace=None, stdin_data=None,
+              stdin_file=None):
         """
         Call into the `rados` CLI from an MDS
         """
@@ -1073,6 +1074,10 @@ class Filesystem(MDSCluster):
         args = ([os.path.join(self._prefix, "rados"), "-p", pool] +
                 (["--namespace", namespace] if namespace else []) +
                 args)
+
+        if stdin_file is not None:
+            args = ["bash", "-c", "cat " + stdin_file + " | " + " ".join(args)]
+
         p = remote.run(
             args=args,
             stdin=stdin_data,
diff --git a/qa/tasks/cephfs/test_scrub.py b/qa/tasks/cephfs/test_scrub.py
new file mode 100644
index 0000000000000..32371dd67be94
--- /dev/null
+++ b/qa/tasks/cephfs/test_scrub.py
@@ -0,0 +1,157 @@
+"""
+Test CephFS scrub (distinct from OSD scrub) functionality
+"""
+import logging
+import os
+import traceback
+from collections import namedtuple
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class Workload(object):
+    def __init__(self, filesystem, mount):
+        self._mount = mount
+        self._filesystem = filesystem
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them. Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from. By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        objects = self._filesystem.rados(["ls"]).split("\n")
+        for o in objects:
+            self._filesystem.rados(["rm", o])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._filesystem.mds_asok(["flush", "journal"])
+
+
+class BacktraceWorkload(Workload):
+    """
+    Single file, single directory, wipe the backtrace and check it.
+    """
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+
+    def validate(self):
+        st = self._mount.stat("subdir/sixmegs")
+        self._filesystem.mds_asok(["flush", "journal"])
+        bt = self._filesystem.read_backtrace(st['st_ino'])
+        parent = bt['ancestors'][0]['dname']
+        self.assert_equal(parent, "sixmegs")
+        return self._errors
+
+    def damage(self):
+        st = self._mount.stat("subdir/sixmegs")
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._filesystem._write_data_xattr(st['st_ino'], "parent", "")
+
+
+class DupInodeWorkload(Workload):
+    """
+    Duplicate an inode and try scrubbing it twice.
+    """
+
+    def write(self):
+        self._mount.run_shell(["mkdir", "parent"])
+        self._mount.run_shell(["mkdir", "parent/child"])
+        self._mount.write_n_mb("parent/parentfile", 6)
+        self._mount.write_n_mb("parent/child/childfile", 6)
+
+    def damage(self):
+        temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
+        self._mount.umount()
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._filesystem.mds_stop()
+        self._filesystem.rados(["getomapval", "10000000000.00000000",
+                                "parentfile_head", temp_bin_path])
+        self._filesystem.rados(["setomapval", "10000000000.00000000",
+                                "shadow_head"], stdin_file=temp_bin_path)
+        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
+        self._filesystem.mds_restart()
+        self._filesystem.wait_for_daemons()
+
+    def validate(self):
+        self._filesystem.mds_asok(["scrub_path", "/", "recursive", "repair"])
+        self.assert_equal(self._filesystem.are_daemons_healthy(), True)
+        return self._errors
+
+
+class TestScrub(CephFSTestCase):
+    MDSS_REQUIRED = 1
+
+    def _scrub(self, workload, workers=1):
+        """
+        Apply the workload, damage the filesystem as the workload dictates, run a
+        recursive scrub with repair, and validate that the workload's checks pass.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # These MDS stat checks are off by default in production, but in QA we need to explicitly disable them
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        self.fs.mds_asok(["scrub_path", "/", "recursive", "repair"])
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_scrub_backtrace(self):
+        self._scrub(BacktraceWorkload(self.fs, self.mount_a))
+
+    def test_scrub_dup_inode(self):
+        self._scrub(DupInodeWorkload(self.fs, self.mount_a))
-- 
2.39.5
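
A minimal sketch of the duplicate-inode damage performed by DupInodeWorkload.damage() and the new stdin_file plumbing in Filesystem.rados(), runnable outside the teuthology harness against a test cluster: it copies the omap value of the primary dentry key "parentfile_head" into a second key "shadow_head" on the same dirfrag object, so two dentries reference one inode. The pool name "cephfs_metadata" and the use of the plain `rados` CLI via subprocess are assumptions for illustration, not part of the patch.

    import subprocess

    METADATA_POOL = "cephfs_metadata"      # assumed metadata pool name; adjust to your cluster
    DIR_OBJECT = "10000000000.00000000"    # dirfrag object used by the test
    TEMP_BIN = "/tmp/10000000000.00000000_omap.bin"

    def rados(args, stdin_file=None):
        # Mirror Filesystem.rados(): run the rados CLI against the metadata pool;
        # when stdin_file is given, pipe it in through bash, exactly as the patched
        # helper composes its command line.
        cmd = ["rados", "-p", METADATA_POOL] + args
        if stdin_file is not None:
            cmd = ["bash", "-c", "cat " + stdin_file + " | " + " ".join(cmd)]
        subprocess.check_call(cmd)

    # Save the primary dentry's omap value, then write it back under a new key,
    # creating a second dentry that points at the same inode.
    rados(["getomapval", DIR_OBJECT, "parentfile_head", TEMP_BIN])
    rados(["setomapval", DIR_OBJECT, "shadow_head"], stdin_file=TEMP_BIN)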