xfs: test rebuilding the entire filesystem with online fsck

author Darrick J. Wong <djwong@kernel.org>

Fri, 30 Dec 2022 22:19:12 +0000 (14:19 -0800)

committer Zorro Lang <zlang@kernel.org>

Sat, 18 Feb 2023 06:30:36 +0000 (14:30 +0800)
author Darrick J. Wong <djwong@kernel.org>
Fri, 30 Dec 2022 22:19:12 +0000 (14:19 -0800)
committer Zorro Lang <zlang@kernel.org>
Sat, 18 Feb 2023 06:30:36 +0000 (14:30 +0800)
diff --git a/README b/README

index 4c4f22f853dee560ac07ce26b72746dda21b79dc..744317625fb18ea40daa4ce8e77dc22d22627ae4 100644 (file)
--- a/README
+++ b/README
@@ -191,6 +191,9 @@ Extra XFS specification:
     to check the filesystem. As of August 2021, xfs_repair finds all
     filesystem corruptions found by xfs_check, and more, which means that
     xfs_check is no longer run by default.
+ - Set TEST_XFS_SCRUB_REBUILD=1 to have _check_xfs_filesystem run xfs_scrub in
+   "force_repair" mode to rebuild the filesystem, and then run xfs_repair -n
+   to check the results of the rebuilding.
   - xfs_scrub, if present, will always check the test and scratch
     filesystems if they are still online at the end of the test. It is no
     longer necessary to set TEST_XFS_SCRUB.
diff --git a/common/fuzzy b/common/fuzzy

index 7670a093d76b6fb2f379d6b46a336db5cd2d0bec..cc775b62be57ab20a421d61361d000590ef43128 100644 (file)
--- a/common/fuzzy
+++ b/common/fuzzy
@@ -975,6 +975,7 @@ __scratch_xfs_stress_setup_force_rebuild() {
  # and wait for 30*TIME_FACTOR seconds to see if the filesystem goes down.
  # Same requirements and arguments as _scratch_xfs_stress_scrub.
  _scratch_xfs_stress_online_repair() {
+       touch "$RESULT_DIR/.skip_orebuild"      # marker file: _check_xfs_filesystem skips its online rebuild when present
         __scratch_xfs_stress_setup_force_rebuild
         XFS_SCRUB_FORCE_REPAIR=1 _scratch_xfs_stress_scrub "$@"
  }
diff --git a/common/rc b/common/rc

index 16ef36afd174ed99247912f172768f35335b8bf3..6852af79b40741459d9d206971ededb4f203a4cd 100644 (file)
--- a/common/rc
+++ b/common/rc
@@ -1691,7 +1691,7 @@ _require_scratch_nocheck()
              exit 1
          fi
      fi
-    rm -f ${RESULT_DIR}/require_scratch
+    rm -f ${RESULT_DIR}/require_scratch "$RESULT_DIR/.skip_orebuild"
  }
  
  # we need the scratch device and it needs to not be an lvm device
diff --git a/common/xfs b/common/xfs

index 97c049e2ca1cb4fcde6f048ef4a960cabfb03ba8..0d4855ea682c04adb1da3b788495f1a4e9c04f3f 100644 (file)
--- a/common/xfs
+++ b/common/xfs
@@ -682,6 +682,8 @@ _scratch_xfs_mdrestore()
  # run xfs_check and friends on a FS.
  _check_xfs_filesystem()
  {
+       local can_scrub=
+
         if [ $# -ne 3 ]; then
                 echo "Usage: _check_xfs_filesystem device <logdev>|none <rtdev>|none" 1>&2
                 exit 1
@@ -716,6 +718,8 @@ _check_xfs_filesystem()
         # Run online scrub if we can.
         mntpt="$(_is_dev_mounted $device)"
         if [ -n "$mntpt" ] && _supports_xfs_scrub "$mntpt" "$device"; then
+               can_scrub=1
+
                 # Tests can create a scenario in which a call to syncfs() issued
                 # at the end of the execution of the test script would return an
                 # error code. xfs_scrub internally calls syncfs() before
@@ -832,6 +836,79 @@ _check_xfs_filesystem()
                 _mount_or_remount_rw "$extra_mount_options" $device $mountpoint
         fi
  
+       # If desired, test the online metadata rebuilding behavior if the
+       # filesystem was mounted when this function was called.
+       if [ -n "$TEST_XFS_SCRUB_REBUILD" ] && [ -n "$can_scrub" ] && [ ! -e "$RESULT_DIR/.skip_orebuild" ]; then
+               orebuild_ok=1
+
+               # Walk the entire directory tree to load directory blocks into
+               # memory and populate the dentry cache, which can speed up the
+               # repairs considerably when the directory tree is very large.
+               find $mntpt &>/dev/null &
+
+               XFS_SCRUB_FORCE_REPAIR=1 "$XFS_SCRUB_PROG" -v -d $mntpt > $tmp.scrub 2>&1
+               if [ $? -ne 0 ]; then
+                       if grep -q 'No space left on device' $tmp.scrub; then
+                               # It's not an error if the fs does not have
+                               # enough space to complete a repair.  We will
+                               # check everything, though.
+                               echo "*** XFS_SCRUB_FORCE_REPAIR=1 xfs_scrub -v -d ran out of space ***" >> $seqres.full
+                               cat $tmp.scrub >> $seqres.full
+                               echo "*** end xfs_scrub output" >> $seqres.full
+                       else
+                               _log_err "_check_xfs_filesystem: filesystem on $device failed scrub orebuild"
+                               echo "*** XFS_SCRUB_FORCE_REPAIR=1 xfs_scrub -v -d output ***" >> $seqres.full
+                               cat $tmp.scrub >> $seqres.full
+                               echo "*** end xfs_scrub output" >> $seqres.full
+                               ok=0
+                               orebuild_ok=0
+                       fi
+               fi
+               rm -f $tmp.scrub
+
+               # Clear force_repair because xfs_scrub could have set it
+               $XFS_IO_PROG -x -c 'inject noerror' "$mntpt" >> $seqres.full
+
+               "$XFS_SCRUB_PROG" -v -d -n $mntpt > $tmp.scrub 2>&1
+               if [ $? -ne 0 ]; then
+                       _log_err "_check_xfs_filesystem: filesystem on $device failed scrub orebuild recheck"
+                       echo "*** xfs_scrub -v -d -n output ***" >> $seqres.full
+                       cat $tmp.scrub >> $seqres.full
+                       echo "*** end xfs_scrub output" >> $seqres.full
+                       ok=0
+                       orebuild_ok=0
+               fi
+               rm -f $tmp.scrub
+
+               mountpoint=`_umount_or_remount_ro $device`
+
+               $XFS_REPAIR_PROG -n $extra_options $extra_log_options $extra_rt_options $device >$tmp.repair 2>&1
+               if [ $? -ne 0 ]; then
+                       _log_err "_check_xfs_filesystem: filesystem on $device is inconsistent (orebuild-reverify)"
+                       echo "*** xfs_repair -n output ***"     >>$seqres.full
+                       cat $tmp.repair                         >>$seqres.full
+                       echo "*** end xfs_repair output"        >>$seqres.full
+
+                       ok=0
+                       orebuild_ok=0
+               fi
+               rm -f $tmp.repair
+
+               if [ $ok -eq 0 ]; then
+                       echo "*** mount output ***"             >>$seqres.full
+                       _mount                                  >>$seqres.full
+                       echo "*** end mount output"             >>$seqres.full
+               elif [ "$type" = "xfs" ]; then
+                       _mount_or_remount_rw "$extra_mount_options" $device $mountpoint
+               fi
+
+               if [ "$orebuild_ok" -ne 1 ] && [ "$DUMP_CORRUPT_FS" = "1" ]; then
+                       local flatdev="$(basename "$device")"
+                       _xfs_metadump "$seqres.$flatdev.orebuild.md" "$device" \
+                               "$logdev" compress >> $seqres.full
+               fi
+       fi
+
         if [ $ok -eq 0 ]; then
                 status=1
                 if [ "$iam" != "check" ]; then
author	Darrick J. Wong <djwong@kernel.org>
	Fri, 30 Dec 2022 22:19:12 +0000 (14:19 -0800)
committer	Zorro Lang <zlang@kernel.org>
	Sat, 18 Feb 2023 06:30:36 +0000 (14:30 +0800)
README		patch | blob | history
common/fuzzy		patch | blob | history
common/rc		patch | blob | history
common/xfs		patch | blob | history