From 293b90fe63665666ca2de8b65c38f987a4d0b2a6 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Tue, 17 Jan 2023 21:22:10 -0500
Subject: [PATCH] qa/tasks/cephfs: test damage to dentry's first is caught

Signed-off-by: Patrick Donnelly
---
 qa/suites/fs/functional/tasks/damage.yaml |  1 +
 qa/tasks/cephfs/filesystem.py             |  3 +
 qa/tasks/cephfs/test_damage.py            | 97 +++++++++++++++++++++++
 3 files changed, 101 insertions(+)

diff --git a/qa/suites/fs/functional/tasks/damage.yaml b/qa/suites/fs/functional/tasks/damage.yaml
index 917c4b1a3d63e..ff8b3a58a767a 100644
--- a/qa/suites/fs/functional/tasks/damage.yaml
+++ b/qa/suites/fs/functional/tasks/damage.yaml
@@ -18,6 +18,7 @@ overrides:
       - Metadata damage detected
       - MDS_READ_ONLY
       - force file system read-only
+      - with standby daemon mds
 tasks:
 - cephfs_test_runner:
     modules:
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index d9ccfc51266b0..111a99fc4c873 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -1648,6 +1648,9 @@ class Filesystem(MDSCluster):
     def get_scrub_status(self, rank=0):
         return self.run_scrub(["status"], rank)
 
+    def flush(self, rank=0):
+        return self.rank_tell(["flush", "journal"], rank=rank)
+
     def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30, timeout=300,
                                   reverse=False):
         # time out after "timeout" seconds and assume as done
diff --git a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py
index 4d019442976b6..d83187017e3ff 100644
--- a/qa/tasks/cephfs/test_damage.py
+++ b/qa/tasks/cephfs/test_damage.py
@@ -3,6 +3,7 @@ import json
 import logging
 import errno
 import re
+import time
 from teuthology.contextutil import MaxWhileTries
 from teuthology.exceptions import CommandFailedError
 from teuthology.orchestra.run import wait
@@ -562,3 +563,99 @@ class TestDamage(CephFSTestCase):
         self.fs.mon_manager.raw_cluster_cmd(
             'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
             "damage", "rm", str(entry['id']))
+
+    def test_dentry_first_existing(self):
+        """
+        That the MDS won't abort when the dentry is already known to be damaged.
+        """
+
+        def verify_corrupt():
+            info = self.fs.read_cache("/a", 0)
+            log.debug('%s', info)
+            self.assertEqual(len(info), 1)
+            dirfrags = info[0]['dirfrags']
+            self.assertEqual(len(dirfrags), 1)
+            dentries = dirfrags[0]['dentries']
+            self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c'])
+            self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD
+
+        self.mount_a.run_shell_payload("mkdir -p a/b")
+        self.fs.flush()
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        self.mount_a.run_shell_payload("mv a/b a/c; sync .")
+        self.mount_a.umount()
+        verify_corrupt()
+        self.fs.fail()
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", True)
+        self.fs.set_joinable()
+        status = self.fs.status()
+        self.fs.flush()
+        self.assertFalse(self.fs.status().hadfailover(status))
+        verify_corrupt()
+
+    def test_dentry_first_preflush(self):
+        """
+        That the MDS won't write a dentry with new damage to CDentry::first
+        to the journal.
+ """ + + rank0 = self.fs.get_rank() + self.fs.rank_freeze(True, rank=0) + self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d") + self.fs.flush() + self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0") + time.sleep(5) # for conf to percolate + p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout) + self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first") + self.fs.rank_freeze(False, rank=0) + self.delete_mds_coredump(rank0['name']) + self.fs.mds_restart(rank0['name']) + self.fs.wait_for_daemons() + p.wait() + self.mount_a.run_shell_payload("stat a/ && find a/") + self.fs.flush() + + def test_dentry_first_precommit(self): + """ + That the MDS won't write a dentry with new damage to CDentry::first + to the directory object. + """ + + fscid = self.fs.id + self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .") + self.mount_a.umount() # allow immediate scatter write back + self.fs.flush() + # now just twiddle some inode metadata on a regular file + self.mount_a.mount_wait() + self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .") + self.mount_a.umount() # avoid journaling session related things + # okay, now cause the dentry to get damaged after loading from the journal + self.fs.fail() + self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0") + time.sleep(5) # for conf to percolate + self.fs.set_joinable() + self.fs.wait_for_daemons() + rank0 = self.fs.get_rank() + self.fs.rank_freeze(True, rank=0) + # so now we want to trigger commit but this will crash, so: + c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"] + p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout) + self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first") + self.fs.rank_freeze(False, rank=0) + self.delete_mds_coredump(rank0['name']) + self.fs.mds_restart(rank0['name']) + self.fs.wait_for_daemons() + try: + p.wait() + except CommandFailedError as e: + print(e) + else: + self.fail("flush journal should fail!") + self.mount_a.mount_wait() + self.mount_a.run_shell_payload("stat a/ && find a/") + self.fs.flush() -- 2.39.5