git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/tasks/cephfs: test damage to dentry's first is caught
authorPatrick Donnelly <pdonnell@redhat.com>
Wed, 18 Jan 2023 02:22:10 +0000 (21:22 -0500)
committerPatrick Donnelly <pdonnell@redhat.com>
Wed, 29 Mar 2023 17:02:56 +0000 (13:02 -0400)
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
qa/suites/fs/functional/tasks/damage.yaml
qa/tasks/cephfs/filesystem.py
qa/tasks/cephfs/test_damage.py

index 917c4b1a3d63e80bded1e7b9917adc5362b22c88..ff8b3a58a767ab5948ee661f05ff0ac680bb0799 100644 (file)
@@ -18,6 +18,7 @@ overrides:
       - Metadata damage detected
       - MDS_READ_ONLY
       - force file system read-only
+      - with standby daemon mds
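+      # expected when these tests crash rank 0 and it is replaced,
+      # e.g. "Replacing daemon mds.a as rank 0 with standby daemon mds.b"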
 tasks:
   - cephfs_test_runner:
       modules:
index d9ccfc51266b014234232258a4df600bc2096ee0..111a99fc4c87304df347b96c2176bc1933c958d5 100644 (file)
@@ -1648,6 +1648,9 @@ class Filesystem(MDSCluster):
     def get_scrub_status(self, rank=0):
         return self.run_scrub(["status"], rank)
 
+    def flush(self, rank=0):
+        return self.rank_tell(["flush", "journal"], rank=rank)
+
     def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30,
                                   timeout=300, reverse=False):
         # time out after "timeout" seconds and assume as done
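
The new helper is thin sugar over rank_tell. A minimal usage sketch, assuming the usual CephFSTestCase fixture where self.fs is the Filesystem under test:

    # equivalent to `ceph tell mds.<rank 0> flush journal`
    self.fs.flush()
    # any other rank in a multi-rank cluster works the same way
    self.fs.flush(rank=1)
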
index 4d019442976b6594577d20a50610651f8445a73d..d83187017e3ff63bafc694bb01a4f2133aeed83d 100644 (file)
@@ -3,6 +3,7 @@ import json
 import logging
 import errno
 import re
+import time
 from teuthology.contextutil import MaxWhileTries
 from teuthology.exceptions import CommandFailedError
 from teuthology.orchestra.run import wait
@@ -562,3 +563,99 @@ class TestDamage(CephFSTestCase):
             self.fs.mon_manager.raw_cluster_cmd(
                 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                 "damage", "rm", str(entry['id']))
+
+    def test_dentry_first_existing(self):
+        """
+        That the MDS won't abort when the dentry is already known to be damaged.
+        """
+
+        def verify_corrupt():
+            info = self.fs.read_cache("/a", 0)
+            log.debug('%s', info)
+            self.assertEqual(len(info), 1)
+            dirfrags = info[0]['dirfrags']
+            self.assertEqual(len(dirfrags), 1)
+            dentries = dirfrags[0]['dentries']
+            self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c'])
+            self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # the injected corrupt "first", (uint64_t)-10; SNAP_HEAD would be (uint64_t)-2
+
+        self.mount_a.run_shell_payload("mkdir -p a/b")
+        self.fs.flush()
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
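+        # the rename below trips the injection, leaving the new primary
+        # dentry a/c with a corrupt "first"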
+        self.mount_a.run_shell_payload("mv a/b a/c; sync .")
+        self.mount_a.umount()
+        verify_corrupt()
+        self.fs.fail()
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
+        self.fs.set_joinable()
+        status = self.fs.status()
+        self.fs.flush()
+        self.assertFalse(self.fs.status().hadfailover(status))
+        verify_corrupt()
+
+    def test_dentry_first_preflush(self):
+        """
+        That the MDS won't write a dentry with new damage to CDentry::first
+        to the journal.
+        """
+
+        rank0 = self.fs.get_rank()
+        self.fs.rank_freeze(True, rank=0)
+        self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d")
+        self.fs.flush()
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
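+        # rank 0 should abort before journaling the corrupt dentry; it stops
+        # sending beacons and is eventually marked laggy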
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
+        self.fs.rank_freeze(False, rank=0)
+        self.delete_mds_coredump(rank0['name'])
+        self.fs.mds_restart(rank0['name'])
+        self.fs.wait_for_daemons()
+        p.wait()
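+        # with the injection gone, the replayed rename and a full tree walk
+        # should now succeed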
+        self.mount_a.run_shell_payload("stat a/ && find a/")
+        self.fs.flush()
+
+    def test_dentry_first_precommit(self):
+        """
+        That the MDS won't write a dentry with new damage to CDentry::first
+        to the directory object.
+        """
+
+        fscid = self.fs.id
+        self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .")
+        self.mount_a.umount() # allow immediate scatter write back
+        self.fs.flush()
+        # now just twiddle some inode metadata on the directory a/b/d
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .")
+        self.mount_a.umount() # avoid journaling session related things
+        # okay, now cause the dentry to get damaged after loading from the journal
+        self.fs.fail()
+        self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        rank0 = self.fs.get_rank()
+        self.fs.rank_freeze(True, rank=0)
+        # now trigger a dirfrag commit; rank 0 will crash on the injected damage, so issue the tell asynchronously:
+        c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
+        p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
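+        # committing the dirfrag hits the injected damage; rank 0 aborts
+        # rather than persist the corrupt dentry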
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
+        self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
+        self.fs.rank_freeze(False, rank=0)
+        self.delete_mds_coredump(rank0['name'])
+        self.fs.mds_restart(rank0['name'])
+        self.fs.wait_for_daemons()
+        try:
+            p.wait()
+        except CommandFailedError as e:
+            log.debug('%s', e) # expected: the tell fails because rank 0 crashed mid-flush
+        else:
+            self.fail("flush journal should fail!")
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell_payload("stat a/ && find a/")
+        self.fs.flush()
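
A hypothetical follow-up check, not part of this commit, could reuse the rank_tell interface (used above for "damage rm") to assert that recovery left the damage table empty:

    # hypothetical: "damage ls" is the listing counterpart of "damage rm"
    damage = self.fs.rank_tell(["damage", "ls"], rank=0)
    self.assertEqual(damage, [])
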