From: Darrick J. Wong
Date: Tue, 10 Mar 2026 03:52:13 +0000 (-0700)
Subject: xfs: test xfs_healer can report file media errors
X-Git-Tag: v2026.03.20~8
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=eb1013c4f2698d8fa6ae32e9ce0a9e71e738a630;p=xfstests-dev.git

xfs: test xfs_healer can report file media errors

Make sure that xfs_healer can actually report media errors as found by the
kernel.

Signed-off-by: Darrick J. Wong
Reviewed-by: Zorro Lang
Signed-off-by: Zorro Lang
---

diff --git a/tests/xfs/660 b/tests/xfs/660
new file mode 100755
index 00000000..214dfe1a
--- /dev/null
+++ b/tests/xfs/660
@@ -0,0 +1,171 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 660
+#
+# Check that xfs_healer can report media errors.
+
+. ./common/preamble
+_begin_fstest auto quick scrub eio selfhealing
+
+_cleanup()
+{
+	cd /
+	rm -f $tmp.*
+	_dmerror_cleanup
+}
+
+. ./common/fuzzy
+. ./common/filter
+. ./common/dmerror
+. ./common/systemd
+
+_require_scratch
+_require_scrub
+_require_dm_target error
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_xfs_io_command verifymedia
+
+filter_healer() {
+	_filter_scratch | \
+	grep -E '(media failed|media error)' | \
+	sed \
+		-e 's/datadev/DEVICE/g' \
+		-e 's/rtdev/DEVICE/g' \
+		-e 's/ino [0-9]*/ino NUM/g' \
+		-e 's/gen 0x[0-9a-f]*/gen NUM/g' \
+		-e 's/pos [0-9]*/pos NUM/g' \
+		-e 's/len [0-9]*/len NUM/g' \
+		-e 's/0x[0-9a-f]*/NUM/g' \
+		-e 's|SCRATCH_MNT/a|VICTIM|g' \
+		-e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g'
+}
+
+filter_verify() {
+	sed -e 's/\([a-z]*dev\): verify error at offset \([0-9]*\) length \([0-9]*\)/DEVICE: verify error at offset XXX length XXX/g'
+}
+
+_scratch_mkfs >> $seqres.full
+
+# The dm-error map added by this test doesn't work on zoned devices because
+# table sizes need to be aligned to the zone size, and even for zoned on
+# conventional this test will get confused because of the internal RT device.
+#
+# That check requires a mounted file system, so do a dummy mount before setting
+# up DM.
+_scratch_mount
+_require_xfs_scratch_non_zoned
+_require_xfs_healer $SCRATCH_MNT
+_scratch_unmount
+
+_dmerror_init
+_dmerror_mount
+
+# Write a file with 4 file blocks worth of data, figure out the LBA to target
+victim=$SCRATCH_MNT/a
+file_blksz=$(_get_file_block_size $SCRATCH_MNT)
+$XFS_IO_PROG -f -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c "fsync" $victim >> $seqres.full
+unset errordev
+verifymediadev="-d"
+
+awk_len_prog='{print $6}'
+if _xfs_is_realtime_file $victim; then
+	if ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
+		awk_len_prog='{print $4}'
+	fi
+	errordev="RT"
+	verifymediadev="-r"
+fi
+bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
+echo "$errordev:$bmap_str" >> $seqres.full
+
+phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
+len="$(echo "$bmap_str" | $AWK_PROG "$awk_len_prog")"
+
+fs_blksz=$(_get_block_size $SCRATCH_MNT)
+echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
+kernel_sectors_per_fs_block=$((fs_blksz / 512))
+
+# Did we get at least 4 fs blocks worth of extent?
+min_len_sectors=$(( 4 * kernel_sectors_per_fs_block ))
+test "$len" -lt $min_len_sectors && \
+	_fail "could not format a long enough extent on an empty fs??"
+
+phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
+
+echo "$errordev:$phys:$len:$fs_blksz:$phys_start" >> $seqres.full
+echo "victim file:" >> $seqres.full
+od -tx1 -Ad -c $victim >> $seqres.full
+
+# Set the dmerror table so that all IO will pass through.
+_dmerror_reset_table
+
+cat >> $seqres.full << ENDL
+dmerror before:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+
+ENDL
+
+# All sector numbers that we feed to the kernel must be in units of 512b, but
+# they also must be aligned to the device's logical block size.
+logical_block_size=`$here/src/min_dio_alignment $SCRATCH_MNT $SCRATCH_DEV`
+kernel_sectors_per_device_lba=$((logical_block_size / 512))
+
+# Mark one of the device LBAs in the middle of the extent as bad.  Target
+# the second LBA of the third block of the four-block file extent that we
+# allocated earlier, but without overflowing into the fourth file block.
+bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) ))
+bad_len=$kernel_sectors_per_device_lba
+if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
+	bad_sector=$((bad_sector + kernel_sectors_per_device_lba))
+fi
+if (( (bad_sector % kernel_sectors_per_device_lba) != 0)); then
+	echo "bad_sector $bad_sector not congruent with device logical block size $logical_block_size"
+fi
+_dmerror_mark_range_bad $bad_sector $bad_len $errordev
+
+cat >> $seqres.full << ENDL
+dmerror after marking bad:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+
+ENDL
+
+_dmerror_load_error_table
+
+echo "Simulate media error"
+_scratch_invoke_xfs_healer "$tmp.healer"
+echo "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" >> $seqres.full
+$XFS_IO_PROG -x -c "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" $SCRATCH_MNT 2>&1 | filter_verify
+
+# Now mark the bad range good so that a retest shows no media failure.
+_dmerror_mark_range_good $bad_sector $bad_len $errordev
+_dmerror_load_error_table
+
+cat >> $seqres.full << ENDL
+dmerror after marking good:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+
+ENDL
+
+echo "No more media error"
+echo "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" >> $seqres.full
+$XFS_IO_PROG -x -c "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" $SCRATCH_MNT >> $seqres.full
+
+# Unmount filesystem to start fresh
+echo "Kill healer"
+_scratch_kill_xfs_healer _dmerror_unmount
+
+# Filesystems without rmap do not translate media errors to lost file ranges,
+# so fake the healer output line that the golden output expects.
+_xfs_has_feature "$SCRATCH_DEV" rmapbt || \
+	echo "VICTIM pos 0 len 0: media failed" >> $tmp.healer
+
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer
+
+# success, all done
+_exit 0
diff --git a/tests/xfs/660.out b/tests/xfs/660.out
new file mode 100755
index 00000000..c0556da3
--- /dev/null
+++ b/tests/xfs/660.out
@@ -0,0 +1,7 @@
+QA output created by 660
+Simulate media error
+DEVICE: verify error at offset XXX length XXX: Input/output error
+No more media error
+Kill healer
+SCRATCH_MNT DEVICE daddr NUM bbcount NUM: media error
+VICTIM pos NUM len NUM: media failed