2 # SPDX-License-Identifier: GPL-2.0+
3 # Copyright (c) 2017 Oracle. All Rights Reserved.
5 # Routines for fuzzing and scrubbing a filesystem.
# Modify various files after a fuzzing operation.
#
# Runs fsstress against a "data" directory under $SCRATCH_MNT and, when
# _xfs_has_feature reports the realtime feature, against an "rt" directory as
# well.  The operation count scales with TIME_FACTOR and the process count
# with LOAD_FACTOR.  Progress markers are written to stdout.
_scratch_fuzz_modify() {
	local nr_ops=$((TIME_FACTOR * 10000))
	local nr_procs=$((LOAD_FACTOR * 4))

	echo "+++ stressing filesystem"
	# Quote every path built from SCRATCH_MNT so that a mount point
	# containing whitespace is not split into several arguments.
	mkdir -p "$SCRATCH_MNT/data"
	_xfs_force_bdev data "$SCRATCH_MNT/data"
	$FSSTRESS_PROG -n "$nr_ops" -p "$nr_procs" -d "$SCRATCH_MNT/data"

	if _xfs_has_feature "$SCRATCH_MNT" realtime; then
		mkdir -p "$SCRATCH_MNT/rt"
		_xfs_force_bdev realtime "$SCRATCH_MNT/rt"
		$FSSTRESS_PROG -n "$nr_ops" -p "$nr_procs" -d "$SCRATCH_MNT/rt"
	else
		echo "+++ xfs realtime not configured"
	fi
}
# Try to access files after fuzzing.
#
# Recursively lists ${SCRATCH_MNT}/test.1/ and then cats every regular file
# found there that passes the find size filter.  All command output is
# discarded; only the progress markers are appended to $seqres.full.
_scratch_fuzz_test() {
	local victim_dir="${SCRATCH_MNT}/test.1/"

	echo "+++ ls -laR" >> $seqres.full
	ls -laR "$victim_dir" > /dev/null 2>&1

	echo "+++ cat files" >> $seqres.full
	{
		find "$victim_dir" -type f -size -1048576k -print0 | \
			xargs -0 cat
	} > /dev/null 2>&1
}
32 # Do we have an online scrub program?
36 test -x "$XFS_SCRUB_PROG" || _notrun "xfs_scrub not found"
39 _notrun "No online scrub program for ${FSTYP}."
44 # Scrub the scratch filesystem metadata (online)
48 $XFS_SCRUB_PROG -d -T -v "$@" $SCRATCH_MNT
51 _fail "No online scrub program for ${FSTYP}."
56 # Expand indexed keys (i.e. arrays) into a long format so that we can filter
57 # the array indices individually, and pass regular keys right through.
59 # For example, "u3.bmx[0-1] = [foo,bar]" is exploded into:
60 # u3.bmx[0] = [foo,bar]
61 # u3.bmx[1] = [foo,bar]
63 # Note that we restrict array indices to [0-9] to reduce fuzz runtime. The
64 # minimum and maximum array indices can be changed by setting the variables
65 # SCRATCH_XFS_{MIN,MAX}_ARRAY_IDX.
67 # Also filter padding fields.
68 __explode_xfs_db_fields() {
69 local min_idx="${SCRATCH_XFS_MIN_ARRAY_IDX}"
70 local max_idx="${SCRATCH_XFS_MAX_ARRAY_IDX}"
72 test -z "${min_idx}" && min_idx=0
73 test -z "${max_idx}" && max_idx=9
74 test "${max_idx}" = "none" && max_idx=99999
77 sed -e 's/^\([.a-zA-Z0-9_]*\)\[\([0-9]*\)-\([0-9]*\)\]\(.*\) = \(.*\)$/\1[%d]\4 \2 \3 = \5/g' \
78 -e 's/^\([.a-zA-Z0-9_]*\)\[\([0-9]*\)\]\(.*\) = \(.*\)$/\1[%d]\3 \2 \2 = \4/g' | \
79 while read name col1 col2 rest; do
80 if [[ "${name}" == *pad* ]]; then
84 if [ "${col1}" = "=" ]; then
85 echo "${name} ${col1} ${col2} ${rest}"
89 test "${min_idx}" -gt "${col1}" && col1="${min_idx}"
90 test "${max_idx}" -lt "${col2}" && col2="${max_idx}"
92 seq "${col1}" "${col2}" | while read idx; do
93 printf "${name} %s\n" "${idx}" "${rest}"
98 # Filter out metadata fields that are completely controlled by userspace
99 # or are arbitrary bit sequences. In other words, fields where the filesystem
100 # does no validation.
101 __filter_unvalidated_xfs_db_fields() {
106 -e '/^core.flushiter/d' \
107 -e '/^core.dmevmask/d' \
108 -e '/^core.dmstate/d' \
110 -e '/^core.prealloc/d' \
111 -e '/^core.immutable/d' \
112 -e '/^core.append/d' \
114 -e '/^core.noatime/d' \
115 -e '/^core.nodump/d' \
116 -e '/^core.nodefrag/d' \
118 -e '/^nvlist.*value/d' \
119 -e '/^entries.*root/d' \
120 -e '/^entries.*secure/d' \
121 -e '/^a.sfattr.list.*value/d' \
122 -e '/^a.sfattr.list.*root/d' \
123 -e '/^a.sfattr.list.*secure/d'
126 # Filter the xfs_db print command's field debug information
127 # into field name and type.
128 __filter_xfs_db_print_fields() {
130 if [ -z "${filter}" ] || [ "${filter}" = "nofilter" ]; then
133 __explode_xfs_db_fields | while read key equals value; do
134 fuzzkey="$(echo "${key}")"
135 if [ -z "${fuzzkey}" ]; then
137 elif [[ "${value}" == "["* ]]; then
138 echo "${value}" | sed -e 's/^.//g' -e 's/.$//g' -e 's/,/\n/g' | while read subfield; do
139 echo "${fuzzkey}.${subfield}"
144 done | grep -E "${filter}" | __filter_unvalidated_xfs_db_fields
147 # Dump the current contents of a metadata object.
148 # All arguments are xfs_db commands to locate the metadata.
149 _scratch_xfs_dump_metadata() {
152 cmds+=("-c" "${arg}")
154 _scratch_xfs_db "${cmds[@]}" -c print
# Decide from the output of the xfs_db "stack" command if the debugger's io
# cursor is pointed at a block that is an unstructured data format (blob).
#
# Reads the stack output on stdin.  Returns 0 if any line mentions an inode
# followed by a type of data, rtsummary or rtbitmap, and grep's nonzero status
# otherwise.  Nothing is printed.
__scratch_xfs_detect_blob_from_stack() {
	local blob_types='data|rtsummary|rtbitmap'

	grep -E -q "inode.*, type (${blob_types})"
}
163 # Navigate to some part of the filesystem and print the field info.
164 # The first argument is a grep filter for the fields
165 # The rest of the arguments are xfs_db commands to locate the metadata.
166 _scratch_xfs_list_metadata_fields() {
169 if [ -n "${SCRATCH_XFS_LIST_METADATA_FIELDS}" ]; then
170 echo "${SCRATCH_XFS_LIST_METADATA_FIELDS}" | tr '[ ,]' '[\n\n]'
176 cmds+=("-c" "${arg}")
179 # Does the path argument point towards something that is an
181 if _scratch_xfs_db "${cmds[@]}" -c stack 2>/dev/null | \
182 __scratch_xfs_detect_blob_from_stack; then
187 _scratch_xfs_db "${cmds[@]}" -c print | \
188 __filter_xfs_db_print_fields "${filter}"
191 # Fuzz a metadata field
192 # The first arg is the field name
193 # The second arg is the xfs_db fuzz verb
194 # The rest of the arguments are xfs_db commands to find the metadata.
195 _scratch_xfs_fuzz_metadata_field() {
200 if [[ "${key}" == *crc ]]; then
205 oldval="$(_scratch_xfs_get_metadata_field "${key}" "$@")"
209 cmds+=("-c" "${arg}")
212 _scratch_xfs_db -x "${cmds[@]}" -c "fuzz ${fuzz_arg} ${key} ${value}"
214 newval="$(_scratch_xfs_get_metadata_field "${key}" "$@" 2> /dev/null)"
215 if [ "${key}" != "random" ] || [ "${oldval}" != "${newval}" ]; then
219 if [ "${oldval}" = "${newval}" ]; then
220 echo "Field ${key} already set to ${newval}, skipping test."
226 # List the fuzzing verbs available for unstructured blobs
227 __scratch_xfs_list_blob_fuzz_verbs() {
238 # Fuzz a metadata blob
239 # The first arg is a blob fuzzing verb
240 # The rest of the arguments are xfs_db commands to find the metadata.
241 _scratch_xfs_fuzz_metadata_blob() {
244 local trashcmd=(blocktrash -z)
248 cmds+=("-c" "${arg}")
251 local bytecount=$(_scratch_xfs_db "${cmds[@]}" -c "stack" | grep 'byte.*length' | awk '{print $5}')
252 local bitmax=$((bytecount * 8))
254 case "${fuzzverb}" in
256 trashcmd+=(-0 -o 0 -x "${bitmax}" -y "${bitmax}");;
258 trashcmd+=(-1 -o 0 -x "${bitmax}" -y "${bitmax}");;
260 trashcmd+=(-2 -o 0 -x 1 -y 1);;
262 trashcmd+=(-2 -o $((bitmax / 2)) -x 1 -y 1);;
264 trashcmd+=(-2 -o "${bitmax}" -x 1 -y 1);;
266 trashcmd+=(-3 -o 0 -x "${bitmax}" -y "${bitmax}");;
268 echo "Unknown blob fuzz verb \"${fuzzverb}\"."
273 trashcmd="${trashcmd[@]}"
274 oldval="$(_scratch_xfs_get_metadata_field "" "$@")"
276 _scratch_xfs_db -x "${cmds[@]}" -c "${trashcmd}"
278 newval="$(_scratch_xfs_get_metadata_field "" "$@" 2> /dev/null)"
279 if [ "${fuzzverb}" != "random" ] || [ "${oldval}" != "${newval}" ]; then
283 if [ "${oldval}" = "${newval}" ]; then
284 echo "Blob already set to new value, skipping test."
# Try to forcibly unmount the scratch fs.
#
# _scratch_unmount is called again and again, with a short sleep after every
# successful call, until it finally reports failure.  Its stderr is dropped.
__scratch_xfs_fuzz_unmount()
{
	until ! _scratch_unmount 2>/dev/null; do
		sleep 0.2
	done
}
# Restore metadata to scratch device prior to field-fuzzing.
#
# Unmounts the scratch fs first, then restores ${POPULATE_METADUMP} onto
# ${SCRATCH_DEV}.  A failed restore is reported through _fail.
__scratch_xfs_fuzz_mdrestore()
{
	__scratch_xfs_fuzz_unmount

	if ! _xfs_mdrestore "${POPULATE_METADUMP}" "${SCRATCH_DEV}"; then
		_fail "${POPULATE_METADUMP}: Could not find metadump to restore?"
	fi
}
305 echo '========================================'
307 echo '========================================'
311 # Perform the online repair part of a fuzz test.
312 __scratch_xfs_fuzz_field_online() {
313 local fuzz_action="$1"
315 # Mount or else we can't do anything online
316 __fuzz_notify "+ Mount filesystem to try online repair"
317 _try_scratch_mount 2>&1
319 if [ $res -ne 0 ]; then
320 (>&2 echo "${fuzz_action}: mount failed ($res).")
324 # Make sure online scrub will catch whatever we fuzzed
325 __fuzz_notify "++ Detect fuzzed field (online)"
326 _scratch_scrub -n -a 1 -e continue 2>&1
329 (>&2 echo "${fuzz_action}: online scrub didn't fail.")
331 # Does the health status report reflect the corruption?
332 if [ $res -ne 0 ]; then
333 __fuzz_notify "++ Detect fuzzed field ill-health report"
334 _check_xfs_health $SCRATCH_MNT 2>&1
337 (>&2 echo "${fuzz_action}: online health check failed ($res).")
340 # Try fixing the filesystem online
341 __fuzz_notify "++ Try to repair filesystem (online)"
345 (>&2 echo "${fuzz_action}: online repair failed ($res).")
347 # Online scrub should pass now
348 __fuzz_notify "++ Make sure error is gone (online)"
349 _scratch_scrub -n -a 1 -e continue 2>&1
352 (>&2 echo "${fuzz_action}: online re-scrub failed ($res).")
354 __scratch_xfs_fuzz_unmount
356 # Offline scrub should pass now
357 __fuzz_notify "+ Make sure error is gone (offline)"
358 _scratch_xfs_repair -P -n 2>&1
361 (>&2 echo "${fuzz_action}: offline re-scrub failed ($res).")
366 # Perform the offline repair part of a fuzz test.
367 __scratch_xfs_fuzz_field_offline() {
368 local fuzz_action="$1"
370 # Make sure offline scrub will catch whatever we fuzzed
371 __fuzz_notify "+ Detect fuzzed field (offline)"
372 _scratch_xfs_repair -P -n 2>&1
375 (>&2 echo "${fuzz_action}: offline scrub didn't fail.")
377 # Make sure xfs_repair catches at least as many things as the old
379 if [ -n "${SCRATCH_XFS_FUZZ_CHECK}" ]; then
380 __fuzz_notify "+ Detect fuzzed field (xfs_check)"
381 _scratch_xfs_check 2>&1
383 if [ $res1 -ne 0 ] && [ $res -eq 0 ]; then
384 (>&2 echo "${fuzz_action}: xfs_repair passed but xfs_check failed ($res1).")
388 # Repair the filesystem offline
389 __fuzz_notify "+ Try to repair the filesystem (offline)"
390 _repair_scratch_fs -P 2>&1
393 (>&2 echo "${fuzz_action}: offline repair failed ($res).")
395 # See if repair finds a clean fs
396 __fuzz_notify "+ Make sure error is gone (offline)"
397 _scratch_xfs_repair -P -n 2>&1
400 (>&2 echo "${fuzz_action}: offline re-scrub failed ($res).")
405 # Perform the no-repair part of a fuzz test.
406 __scratch_xfs_fuzz_field_norepair() {
407 local fuzz_action="$1"
409 # Make sure offline scrub will catch whatever we fuzzed
410 __fuzz_notify "+ Detect fuzzed field (offline)"
411 _scratch_xfs_repair -P -n 2>&1
414 (>&2 echo "${fuzz_action}: offline scrub didn't fail.")
416 # Mount or else we can't do anything in norepair mode
417 __fuzz_notify "+ Mount filesystem to try online scan"
418 _try_scratch_mount 2>&1
420 if [ $res -ne 0 ]; then
421 (>&2 echo "${fuzz_action}: mount failed ($res).")
425 # Skip scrub and health check if scrub is not supported
426 if ! _supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV; then
427 __scratch_xfs_fuzz_unmount
431 # Make sure online scrub will catch whatever we fuzzed
432 __fuzz_notify "++ Detect fuzzed field (online)"
433 _scratch_scrub -n -a 1 -e continue 2>&1
436 (>&2 echo "${fuzz_action}: online scrub didn't fail.")
438 # Does the health status report reflect the corruption?
439 if [ $res -ne 0 ]; then
440 __fuzz_notify "++ Detect fuzzed field ill-health report"
441 _check_xfs_health $SCRATCH_MNT 2>&1
444 (>&2 echo "${fuzz_action}: online health check failed ($res).")
447 __scratch_xfs_fuzz_unmount
452 # Perform the online-then-offline repair part of a fuzz test.
453 __scratch_xfs_fuzz_field_both() {
454 local fuzz_action="$1"
456 # Make sure offline scrub will catch whatever we fuzzed
457 __fuzz_notify "+ Detect fuzzed field (offline)"
458 _scratch_xfs_repair -P -n 2>&1
461 (>&2 echo "${fuzz_action}: offline scrub didn't fail.")
463 # Mount or else we can't do anything in both repair mode
464 __fuzz_notify "+ Mount filesystem to try both repairs"
465 _try_scratch_mount 2>&1
467 if [ $res -ne 0 ]; then
468 (>&2 echo "${fuzz_action}: mount failed ($res).")
470 # Make sure online scrub will catch whatever we fuzzed
471 __fuzz_notify "++ Detect fuzzed field (online)"
472 _scratch_scrub -n -a 1 -e continue 2>&1
475 (>&2 echo "${fuzz_action}: online scrub didn't fail.")
477 # Does the health status report reflect the corruption?
478 if [ $res -ne 0 ]; then
479 __fuzz_notify "++ Detect fuzzed field ill-health report"
480 _check_xfs_health $SCRATCH_MNT 2>&1
483 (>&2 echo "${fuzz_action}: online health check failed ($res).")
486 # Try fixing the filesystem online
487 __fuzz_notify "++ Try to repair filesystem (online)"
491 (>&2 echo "${fuzz_action}: online repair failed ($res).")
493 __scratch_xfs_fuzz_unmount
496 # Repair the filesystem offline if online repair failed?
497 if [ $res -ne 0 ]; then
498 __fuzz_notify "+ Try to repair the filesystem (offline)"
499 _repair_scratch_fs -P 2>&1
502 (>&2 echo "${fuzz_action}: offline repair failed ($res).")
505 # See if repair finds a clean fs
506 __fuzz_notify "+ Make sure error is gone (offline)"
507 _scratch_xfs_repair -P -n 2>&1
510 (>&2 echo "${fuzz_action}: offline re-scrub failed ($res).")
512 # Mount so that we can see what scrub says after we've fixed the fs
513 __fuzz_notify "+ Re-mount filesystem to re-try online scan"
514 _try_scratch_mount 2>&1
516 if [ $res -ne 0 ]; then
517 (>&2 echo "${fuzz_action}: mount failed ($res).")
521 # Online scrub should pass now
522 __fuzz_notify "++ Make sure error is gone (online)"
523 _scratch_scrub -n -a 1 -e continue 2>&1
526 (>&2 echo "${fuzz_action}: online re-scrub failed ($res).")
528 __scratch_xfs_fuzz_unmount
533 # Assess the state of the filesystem after a repair strategy has been run by
534 # trying to make changes to it.
535 _scratch_xfs_fuzz_field_modifyfs() {
536 local fuzz_action="$1"
539 # Try to mount the filesystem so that we can make changes
540 __fuzz_notify "+ Mount filesystem to make changes"
541 _try_scratch_mount 2>&1
543 if [ $res -ne 0 ]; then
544 (>&2 echo "${fuzz_action}: pre-mod mount failed ($res).")
548 # Try modifying the filesystem again
549 __fuzz_notify "++ Try to write filesystem again"
550 _scratch_fuzz_modify 2>&1
552 # If we didn't repair anything, there's no point in checking further,
553 # the fs is still corrupt.
554 if [ "${repair}" = "none" ]; then
555 __scratch_xfs_fuzz_unmount
559 # Run an online check to make sure the fs is still ok, unless we
560 # are running the norepair strategy.
561 __fuzz_notify "+ Re-check the filesystem (online)"
562 _scratch_scrub -n -e continue 2>&1
565 (>&2 echo "${fuzz_action}: online post-mod scrub failed ($res).")
567 __scratch_xfs_fuzz_unmount
569 # Run an offline check to make sure the fs is still ok, unless we
570 # are running the norepair strategy.
571 __fuzz_notify "+ Re-check the filesystem (offline)"
572 _scratch_xfs_repair -P -n 2>&1
575 (>&2 echo "${fuzz_action}: offline post-mod scrub failed ($res).")
580 # Fuzz one field of some piece of metadata.
581 # First arg is the field name
582 # Second arg is the fuzz verb (ones, zeroes, random, add, sub...)
583 # Third arg is the repair mode (online, offline, both, none)
584 __scratch_xfs_fuzz_field_test() {
590 # Set the new field value
591 __fuzz_notify "+ Fuzz ${field} = ${fuzzverb}"
592 if [ "$field" = "<blob>" ]; then
593 _scratch_xfs_fuzz_metadata_blob ${fuzzverb} "$@"
595 _scratch_xfs_fuzz_metadata_field "${field}" ${fuzzverb} "$@"
598 test $res -ne 0 && return
600 # Try to catch the error with whatever repair strategy we picked.
601 # The fs should not be mounted before or after the strategy call.
602 local fuzz_action="${field} = ${fuzzverb}"
605 __scratch_xfs_fuzz_field_online "${fuzz_action}"
609 __scratch_xfs_fuzz_field_offline "${fuzz_action}"
613 __scratch_xfs_fuzz_field_norepair "${fuzz_action}"
617 __scratch_xfs_fuzz_field_both "${fuzz_action}"
621 (>&2 echo "unknown repair strategy ${repair}.")
625 test $res -eq 0 || return $res
627 # See what happens when we modify the fs
628 _scratch_xfs_fuzz_field_modifyfs "${fuzz_action}" "${repair}"
632 # Make sure we have all the pieces we need for field fuzzing
633 _require_scratch_xfs_fuzz_fields()
635 _require_scratch_nocheck
637 _require_populate_commands
638 _scratch_mkfs_xfs >/dev/null 2>&1
639 _require_xfs_db_command "fuzz"
# Sets the array SCRATCH_XFS_DIR_FUZZ_TYPES to the list of directory formats
# available for fuzzing.  Each list item must match one of the /S_IFDIR.FMT_*
# files created by the fs population code.  Users can override this by setting
# SCRATCH_XFS_LIST_FUZZ_DIRTYPE in the environment to a list separated by
# spaces, commas or newlines; empty items (such as the one between the comma
# and the space in "BLOCK, LEAF") are discarded.  BTREE is omitted here
# because that refers to the fork format and does not affect the directory
# structure at all.
_scratch_xfs_set_dir_fuzz_types() {
	if [ -n "${SCRATCH_XFS_LIST_FUZZ_DIRTYPE}" ]; then
		# Turn every separator into a newline, then drop the blank
		# lines that runs of separators leave behind so that no empty
		# array elements are created.
		mapfile -t SCRATCH_XFS_DIR_FUZZ_TYPES < \
			<(printf '%s\n' "${SCRATCH_XFS_LIST_FUZZ_DIRTYPE}" | \
			  tr ' ,' '\n\n' | sed -e '/^$/d')
		return
	fi

	SCRATCH_XFS_DIR_FUZZ_TYPES=(BLOCK LEAF LEAFN NODE)
}
# Sets the array SCRATCH_XFS_XATTR_FUZZ_TYPES to the list of xattr formats
# available for fuzzing.  Each list item must match one of the /ATTR.FMT_*
# files created by the fs population code.  Users can override this by setting
# SCRATCH_XFS_LIST_FUZZ_XATTRTYPE in the environment to a list separated by
# spaces, commas or newlines; empty items (such as the one between the comma
# and the space in "LEAF, NODE") are discarded.  BTREE is omitted here
# because that refers to the fork format and does not affect the extended
# attribute structure at all.
_scratch_xfs_set_xattr_fuzz_types() {
	if [ -n "${SCRATCH_XFS_LIST_FUZZ_XATTRTYPE}" ]; then
		# Turn every separator into a newline, then drop the blank
		# lines that runs of separators leave behind so that no empty
		# array elements are created.
		mapfile -t SCRATCH_XFS_XATTR_FUZZ_TYPES < \
			<(printf '%s\n' "${SCRATCH_XFS_LIST_FUZZ_XATTRTYPE}" | \
			  tr ' ,' '\n\n' | sed -e '/^$/d')
		return
	fi

	SCRATCH_XFS_XATTR_FUZZ_TYPES=(EXTENTS_REMOTE3K EXTENTS_REMOTE4K LEAF NODE)
}
674 # Grab the list of available fuzzing verbs
675 _scratch_xfs_list_fuzz_verbs() {
676 if [ -n "${SCRATCH_XFS_LIST_FUZZ_VERBS}" ]; then
677 echo "${SCRATCH_XFS_LIST_FUZZ_VERBS}" | tr '[ ,]' '[\n\n]'
683 cmds+=("-c" "${arg}")
685 test "${#cmds[@]}" -eq 0 && cmds=('-c' 'sb 0')
687 # Does the path argument point towards something that is an
689 if _scratch_xfs_db "${cmds[@]}" -c stack 2>/dev/null | \
690 __scratch_xfs_detect_blob_from_stack; then
691 __scratch_xfs_list_blob_fuzz_verbs
695 _scratch_xfs_db -x "${cmds[@]}" -c 'fuzz' | grep '^Fuzz commands:' | \
696 sed -e 's/[,.]//g' -e 's/Fuzz commands: //g' -e 's/ /\n/g' | \
700 # Fuzz some of the fields of some piece of metadata
701 # The first argument is a grep filter for the field names
702 # The second argument is the repair mode (online, offline, both, none)
703 # The rest of the arguments are xfs_db commands to locate the metadata.
705 # Users can specify the fuzz verbs via SCRATCH_XFS_LIST_FUZZ_VERBS
706 # They can specify the fields via SCRATCH_XFS_LIST_METADATA_FIELDS
707 _scratch_xfs_fuzz_metadata() {
712 fields="$(_scratch_xfs_list_metadata_fields "${filter}" "$@")"
713 verbs="$(_scratch_xfs_list_fuzz_verbs "$@")"
714 echo "Fields we propose to fuzz with the \"${repair}\" repair strategy: $@"
715 echo $(echo "${fields}")
716 echo "Verbs we propose to fuzz with:"
717 echo $(echo "${verbs}")
718 echo "Current metadata object state:"
719 _scratch_xfs_dump_metadata "$@"
721 # Always capture full core dumps from crashing tools
724 _xfs_skip_online_rebuild
725 _xfs_skip_offline_rebuild
727 echo "${fields}" | while read field; do
728 echo "${verbs}" | while read fuzzverb; do
729 __scratch_xfs_fuzz_mdrestore
730 __scratch_xfs_fuzz_field_test "${field}" "${fuzzverb}" "${repair}" "$@"
732 # Collect compressed coredumps in the test results
733 # directory if the sysadmin didn't override the default
735 for i in core core.*; do
736 test -f "$i" || continue
743 # Functions to race fsstress, fs freeze, and xfs metadata scrubbing against
744 # each other to shake out bugs in xfs online repair.
746 # Filter freeze and thaw loop output so that we don't tarnish the golden output
747 # if the kernel temporarily won't let us freeze.
748 __stress_freeze_filter_output() {
750 sed -e '/Device or resource busy/d' \
751 -e '/Invalid argument/d'
754 # Filter scrub output so that we don't tarnish the golden output if the fs is
755 # too busy to scrub. Note: Tests should _notrun if the scrub type is not
756 # supported. Callers can provide extra strings to filter out as function
758 __stress_scrub_filter_output() {
762 extra_args+=(-e "/${arg}/d")
766 sed -e '/Device or resource busy/d' \
767 -e '/Optimization possible/d' \
768 -e '/No space left on device/d' \
# Decide if the scratch filesystem is still alive.
#
# Returns the exit status of stat(1) on $SCRATCH_MNT; everything stat prints
# is discarded.
__stress_scrub_scratch_alive() {
	local mnt="$SCRATCH_MNT"

	# If we can't stat the scratch filesystem, there's a reasonably good
	# chance that the fs shut down, which is not good.
	stat "$mnt" > /dev/null 2>&1
}
# Decide if we want to keep running stress tests.  The first argument is the
# stop time, and second argument is the path to the sentinel file.
#
# Returns 0 only while the sentinel file exists, the current time reported by
# `date +%s` is still below the stop time, and __stress_scrub_scratch_alive
# succeeds.  Otherwise the status of the first failing check is returned.
__stress_scrub_running() {
	local stop_time="$1"
	local sentinel="$2"

	test -e "$sentinel" || return
	test "$(date +%s)" -lt "$stop_time" || return
	__stress_scrub_scratch_alive
}
785 # Run fs freeze and thaw in a tight loop.
786 __stress_scrub_freeze_loop() {
788 local runningfile="$2"
790 while __stress_scrub_running "$end" "$runningfile"; do
791 $XFS_IO_PROG -x -c 'freeze' -c 'thaw' $SCRATCH_MNT 2>&1 | \
792 __stress_freeze_filter_output
796 # Run individual xfs_io commands in a tight loop.
797 __stress_xfs_io_loop() {
799 local runningfile="$2"
804 xfs_io_args+=('-c' "$arg")
807 while __stress_scrub_running "$end" "$runningfile"; do
808 $XFS_IO_PROG -x "${xfs_io_args[@]}" "$SCRATCH_MNT" \
809 > /dev/null 2>> $seqres.full
813 # Run individual XFS online fsck commands in a tight loop with xfs_io.
814 __stress_one_scrub_loop() {
816 local runningfile="$2"
818 local scrub_startat="$4"
819 local start_agno="$5"
820 shift; shift; shift; shift; shift
821 local agcount="$(_xfs_mount_agcount $SCRATCH_MNT)"
825 if [ -n "$SCRUBSTRESS_USE_FORCE_REBUILD" ]; then
826 arg="$(echo "$arg" | sed -e 's/^repair/repair -R/g')"
828 if echo "$arg" | grep -q -w '%agno%'; then
829 # Substitute the AG number
830 for ((agno = start_agno; agno < agcount; agno++)); do
831 local ag_arg="$(echo "$arg" | sed -e "s|%agno%|$agno|g")"
832 xfs_io_args+=('-c' "$ag_arg")
835 xfs_io_args+=('-c' "$arg")
839 local extra_filters=()
841 "%file%"|"%datafile%"|"%attrfile%")
842 extra_filters+=('No such file or directory' 'No such device or address')
845 extra_filters+=('No such file or directory' 'Not a directory')
847 "%regfile%"|"%cowfile%")
848 extra_filters+=('No such file or directory')
852 local target_cmd=(echo "$scrub_tgt")
854 "%file%") target_cmd=($here/src/xfsfind -q "$SCRATCH_MNT");;
855 "%attrfile%") target_cmd=($here/src/xfsfind -qa "$SCRATCH_MNT");;
856 "%datafile%") target_cmd=($here/src/xfsfind -qb "$SCRATCH_MNT");;
857 "%dir%") target_cmd=($here/src/xfsfind -qd "$SCRATCH_MNT");;
858 "%regfile%") target_cmd=($here/src/xfsfind -qr "$SCRATCH_MNT");;
859 "%cowfile%") target_cmd=($here/src/xfsfind -qs "$SCRATCH_MNT");;
862 while __stress_scrub_running "$scrub_startat" "$runningfile"; do
866 while __stress_scrub_running "$end" "$runningfile"; do
867 readarray -t fnames < <("${target_cmd[@]}" 2>> $seqres.full)
868 for fname in "${fnames[@]}"; do
869 $XFS_IO_PROG -x "${xfs_io_args[@]}" "$fname" 2>&1 | \
870 __stress_scrub_filter_output "${extra_filters[@]}"
871 __stress_scrub_running "$end" "$runningfile" || break
876 # Run xfs_scrub online fsck in a tight loop.
877 __stress_xfs_scrub_loop() {
879 local runningfile="$2"
880 local scrub_startat="$3"
882 local sigint_ret="$(( $(kill -l SIGINT) + 128 ))"
883 local scrublog="$tmp.scrub"
885 while __stress_scrub_running "$scrub_startat" "$runningfile"; do
889 while __stress_scrub_running "$end" "$runningfile"; do
890 _scratch_scrub "$@" &> $scrublog
892 if [ "$res" -eq "$sigint_ret" ]; then
893 # Ignore SIGINT because the cleanup function sends
894 # that to terminate xfs_scrub
897 echo "xfs_scrub exits with $res at $(date)" >> $seqres.full
898 if [ "$res" -ge 128 ]; then
899 # Report scrub death due to fatal signals
900 echo "xfs_scrub died with SIG$(kill -l $res)"
901 cat $scrublog >> $seqres.full 2>/dev/null
902 elif [ "$((res & 0x1))" -gt 0 ]; then
903 # Report uncorrected filesystem errors
904 echo "xfs_scrub reports uncorrected errors:"
905 grep -E '(Repair unsuccessful;|Corruption:)' $scrublog
906 cat $scrublog >> $seqres.full 2>/dev/null
912 # Clean the scratch filesystem between rounds of fsstress if there is 2%
913 # available space or less because that isn't an interesting stress test.
915 # Returns 0 if we cleared anything, and 1 if we did nothing.
916 __stress_scrub_clean_scratch() {
917 local used_pct="$(_used $SCRATCH_DEV)"
919 test "$used_pct" -lt 98 && return 1
921 echo "Clearing scratch fs at $(date)" >> $seqres.full
922 rm -r -f $SCRATCH_MNT/p*
926 # Run fsx while we're testing online fsck.
927 __stress_scrub_fsx_loop() {
929 local runningfile="$2"
930 local remount_period="$3"
931 local stress_tgt="$4" # ignored
932 local focus=(-q -X) # quiet, validate file contents
934 # As of November 2022, 2 million fsx ops should be enough to keep
935 # any filesystem busy for a couple of hours.
937 focus+=(-o $((128000 * LOAD_FACTOR)) )
938 focus+=(-l $((600000 * LOAD_FACTOR)) )
940 local args="$FSX_AVOID ${focus[@]} ${SCRATCH_MNT}/fsx.$seq"
941 echo "Running $here/ltp/fsx $args" >> $seqres.full
943 if [ -n "$remount_period" ]; then
946 while __stress_scrub_running "$end" "$runningfile"; do
947 # Need to recheck running conditions if we cleared
949 test "$mode" = "rw" && __stress_scrub_clean_scratch && continue
951 timeout -s TERM "$remount_period" $here/ltp/fsx \
952 $args $rw_arg >> $seqres.full
954 echo "$mode fsx exits with $res at $(date)" >> $seqres.full
955 if [ "$res" -ne 0 ] && [ "$res" -ne 124 ]; then
956 # Stop if fsx returns error. Mask off
957 # the magic code 124 because that is how the
958 # timeout(1) program communicates that we ran
962 if [ "$mode" = "rw" ]; then
964 rw_arg="-t 0 -w 0 -FHzCIJBE0"
970 # Try remounting until we get the result we wanted
971 while ! _scratch_remount "$mode" &>/dev/null && \
972 __stress_scrub_running "$end" "$runningfile"; do
980 while __stress_scrub_running "$end" "$runningfile"; do
981 # Need to recheck running conditions if we cleared anything
982 __stress_scrub_clean_scratch && continue
983 $here/ltp/fsx $args >> $seqres.full
984 echo "fsx exits with $? at $(date)" >> $seqres.full
989 # Run fsstress while we're testing online fsck.
990 __stress_scrub_fsstress_loop() {
992 local runningfile="$2"
993 local remount_period="$3"
994 local stress_tgt="$4"
997 case "$stress_tgt" in
1001 # Create a directory tree very gradually
1002 for op in creat link mkdir; do
1003 focus+=('-f' "${op}=2")
1005 focus+=('-f' 'unlink=1' '-f' 'rmdir=1')
1007 # But do a lot of renames to cycle parent pointers
1008 for op in rename rnoreplace rexchange; do
1009 focus+=('-f' "${op}=40")
1015 # Create a directory tree rapidly
1016 for op in creat link mkdir mknod symlink; do
1017 focus+=('-f' "${op}=8")
1019 focus+=('-f' 'rmdir=2' '-f' 'unlink=8')
1021 # Rename half as often
1022 for op in rename rnoreplace rexchange; do
1023 focus+=('-f' "${op}=4")
1026 # Read and sync occasionally
1027 for op in getdents stat fsync; do
1028 focus+=('-f' "${op}=1")
1034 # Create a directory tree slowly
1035 for op in creat ; do
1036 focus+=('-f' "${op}=2")
1038 for op in unlink rmdir; do
1039 focus+=('-f' "${op}=1")
1042 # Create xattrs rapidly
1043 for op in attr_set setfattr; do
1044 focus+=('-f' "${op}=80")
1047 # Remove xattrs 1/4 as quickly
1048 for op in attr_remove removefattr; do
1049 focus+=('-f' "${op}=20")
1052 # Read and sync occasionally
1053 for op in listfattr getfattr fsync; do
1054 focus+=('-f' "${op}=10")
1058 # Only do things that cause filesystem writes
1067 # Only create, read, and delete symbolic links
1068 focus+=('-f' 'symlink=4')
1069 focus+=('-f' 'readlink=10')
1070 focus+=('-f' 'unlink=1')
1075 # Only create and delete special files
1076 focus+=('-f' 'mknod=4')
1077 focus+=('-f' 'getdents=100')
1078 focus+=('-f' 'unlink=1')
1081 echo "$stress_tgt: Unrecognized stress target, using defaults."
1085 # As of March 2022, 2 million fsstress ops should be enough to keep
1086 # any filesystem busy for a couple of hours.
1087 local args=$(_scale_fsstress_args -p 4 -d $SCRATCH_MNT -n 2000000 "${focus[@]}" $FSSTRESS_AVOID)
1088 echo "Running $FSSTRESS_PROG $args" >> $seqres.full
1090 if [ -n "$remount_period" ]; then
1093 while __stress_scrub_running "$end" "$runningfile"; do
1094 # Need to recheck running conditions if we cleared
1096 test "$mode" = "rw" && __stress_scrub_clean_scratch && continue
1098 timeout -s TERM "$remount_period" $FSSTRESS_PROG \
1099 $args $rw_arg >> $seqres.full
1101 echo "$mode fsstress exits with $res at $(date)" >> $seqres.full
1102 if [ "$res" -ne 0 ] && [ "$res" -ne 124 ]; then
1103 # Stop if fsstress returns error. Mask off
1104 # the magic code 124 because that is how the
1105 # timeout(1) program communicates that we ran
1109 if [ "$mode" = "rw" ]; then
1117 # Try remounting until we get the result we wanted
1118 while ! _scratch_remount "$mode" &>/dev/null && \
1119 __stress_scrub_running "$end" "$runningfile"; do
1123 rm -f "$runningfile"
1127 while __stress_scrub_running "$end" "$runningfile"; do
1128 # Need to recheck running conditions if we cleared anything
1129 __stress_scrub_clean_scratch && continue
1130 $FSSTRESS_PROG $args >> $seqres.full
1131 echo "fsstress exits with $? at $(date)" >> $seqres.full
1133 rm -f "$runningfile"
1136 # Make sure we have everything we need to run stress and scrub
1137 _require_xfs_stress_scrub() {
1138 _require_xfs_io_command "scrub"
1139 _require_test_program "xfsfind"
1140 _require_command "$KILLALL_PROG" killall
1142 command -v _filter_scratch &>/dev/null || \
1143 _notrun 'xfs scrub stress test requires common/filter'
# Make sure that we can force repairs either by error injection or passing
# FORCE_REBUILD via ioctl.
#
# Probes the scratch mount with the xfs_io "repair -R probe" command.  If the
# probe prints nothing we are done; if it prints anything at all, fall back
# to requiring the "force_repair" error injection knob.
__require_xfs_stress_force_rebuild() {
	local probe_msgs="$($XFS_IO_PROG -x -c 'repair -R probe' $SCRATCH_MNT 2>&1)"

	if [ -n "$probe_msgs" ]; then
		_require_xfs_io_error_injection "force_repair"
	fi
}
# Make sure we have everything we need to run stress and online repair.
#
# On top of the scrub stress requirements this needs the xfs_io "repair"
# command, the _require_xfs_io_error_injection helper (the _notrun message
# names common/inject as its source), and a way to force rebuilds.
_require_xfs_stress_online_repair() {
	_require_xfs_stress_scrub
	_require_xfs_io_command "repair"

	if ! command -v _require_xfs_io_error_injection &>/dev/null; then
		_notrun 'xfs repair stress test requires common/inject'
	fi

	__require_xfs_stress_force_rebuild
}
# Clean up after the loops in case they didn't do it themselves.
#
# Reads $runningfile, $seqres, $KILLALL_PROG, $XFS_IO_PROG and $SCRATCH_MNT
# from the calling scope; progress is logged to $seqres.full.  Clears
# __SCRUB_STRESS_FREEZE_PID and __SCRUB_STRESS_REMOUNT_LOOP once the
# corresponding cleanup step has been performed.
_scratch_xfs_stress_scrub_cleanup() {
	rm -f "$runningfile"
	echo "Cleaning up scrub stress run at $(date)" >> "$seqres.full"

	# Send SIGINT so that bash won't print a 'Terminated' message that
	# distorts the golden output.
	echo "Killing stressor processes at $(date)" >> "$seqres.full"
	$KILLALL_PROG -INT xfs_io fsstress fsx xfs_scrub >> "$seqres.full" 2>&1

	# Tests are not allowed to exit with the scratch fs frozen.  If we
	# started a fs freeze/thaw background loop, wait for that loop to exit
	# and then thaw the filesystem.  Cleanup for the freeze loop must be
	# performed prior to waiting for the other children to avoid triggering
	# a race condition that can hang fstests.
	#
	# If the xfs_io -c freeze process is asleep waiting for a write lock on
	# s_umount or sb_write when the killall signal is delivered, it will
	# not check for pending signals until after it has frozen the fs.  If
	# even one thread of the stress test processes (xfs_io, fsstress, etc.)
	# is waiting for read locks on sb_write when the killall signals are
	# delivered, they will block in the kernel until someone thaws the fs,
	# and the `wait' below will wait forever.
	#
	# Hence we issue the killall, wait for the freezer loop to exit, thaw
	# the filesystem, and wait for the rest of the children.
	if [ -n "$__SCRUB_STRESS_FREEZE_PID" ]; then
		echo "Waiting for fs freezer $__SCRUB_STRESS_FREEZE_PID to exit at $(date)" >> "$seqres.full"
		wait "$__SCRUB_STRESS_FREEZE_PID"

		echo "Thawing filesystem at $(date)" >> "$seqres.full"
		# XFS_IO_PROG is deliberately unquoted so that any arguments
		# embedded in it are split into separate words.
		$XFS_IO_PROG -x -c 'thaw' "$SCRATCH_MNT" >> "$seqres.full" 2>&1
		__SCRUB_STRESS_FREEZE_PID=""
	fi

	# Wait for the remaining children to exit.
	echo "Waiting for children to exit at $(date)" >> "$seqres.full"
	wait

	# Ensure the scratch fs is also writable before we exit.
	if [ -n "$__SCRUB_STRESS_REMOUNT_LOOP" ]; then
		echo "Remounting rw at $(date)" >> "$seqres.full"
		_scratch_remount rw >> "$seqres.full" 2>&1
		__SCRUB_STRESS_REMOUNT_LOOP=""
	fi

	echo "Cleanup finished at $(date)" >> "$seqres.full"
}
# Make sure the provided scrub/repair commands actually work on the scratch
# filesystem before we start running them in a loop.
#
# Arguments:
#   $1    scrub target: a path, or one of the %...% placeholders handled below
#   $2    AG number substituted for each %agno% in the commands
#   $3... xfs_io commands; each one is run once against the target
#
# Calls _notrun when the xfs_io output shows that xfs_io or the kernel does
# not support a command.
__stress_scrub_check_commands() {
	local scrub_tgt="$1"
	local start_agno="$2"
	shift; shift

	local arg cooked_arg testio
	local cooked_tgt="$scrub_tgt"

	# Placeholder targets are replaced with a concrete file to probe.
	case "$scrub_tgt" in
	"%file%"|"%dir%")
		cooked_tgt="$SCRATCH_MNT"
		;;
	"%regfile%"|"%datafile%")
		cooked_tgt="$SCRATCH_MNT/testfile"
		echo test > "$cooked_tgt"
		;;
	"%attrfile%")
		cooked_tgt="$SCRATCH_MNT/testfile"
		$XFS_IO_PROG -f -c 'pwrite -S 0x58 0 64k' "$cooked_tgt" &>/dev/null
		attr -s attrname "$cooked_tgt" < "$cooked_tgt" &>/dev/null
		;;
	"%cowfile%")
		cooked_tgt="$SCRATCH_MNT/testfile"
		$XFS_IO_PROG -f -c 'pwrite -S 0x58 0 128k' "$cooked_tgt" &>/dev/null
		_cp_reflink "$cooked_tgt" "$cooked_tgt.1"
		$XFS_IO_PROG -f -c 'pwrite -S 0x58 0 1' "$cooked_tgt.1" &>/dev/null
		;;
	esac

	for arg in "$@"; do
		cooked_arg="$arg"

		# With forced rebuilds enabled, a leading "repair" becomes
		# "repair -R".
		if [ -n "$SCRUBSTRESS_USE_FORCE_REBUILD" ] &&
		   [[ "$cooked_arg" == repair* ]]; then
			cooked_arg="repair -R${cooked_arg#repair}"
		fi

		# The pattern is quoted so that '%' is matched literally.
		cooked_arg=${cooked_arg//"%agno%"/$start_agno}

		# XFS_IO_PROG is deliberately unquoted so that any arguments
		# embedded in it are split into separate words.
		testio="$($XFS_IO_PROG -x -c "$cooked_arg" "$cooked_tgt" 2>&1)"

		# Matching with case keeps the output quoted, so it is never
		# subject to pathname expansion.
		case "$testio" in
		*"Unknown type"*)
			_notrun "xfs_io scrub subcommand support is missing"
			;;
		*"Inappropriate ioctl"*)
			_notrun "kernel scrub ioctl is missing"
			;;
		*"No such file or directory"*)
			_notrun "kernel does not know about: $arg"
			;;
		*"Operation not supported"*)
			_notrun "kernel does not support: $arg"
			;;
		esac
	done
}
1260 # Start scrub, freeze, and fsstress in background looping processes, and wait
1261 # for 30*TIME_FACTOR seconds to see if the filesystem goes down. Callers
1262 # must call _scratch_xfs_stress_scrub_cleanup from their cleanup functions.
1264 # Various options include:
1266 # -a For %agno% substitution, start with this AG instead of AG 0.
1267 # -f Run a freeze/thaw loop while we're doing other things. Defaults to
1268 # disabled, unless XFS_SCRUB_STRESS_FREEZE is set.
1269 # -i Pass this command to xfs_io to exercise something that is not scrub
1270 # in a separate loop. If zero -i options are specified, do not run.
1271 # Callers must check each of these commands (via _require_xfs_io_command)
1272 # before calling here.
1273 # -r Run fsstress for this amount of time, then remount the fs ro or rw.
1274 # The default is to run fsstress continuously with no remount, unless
1275 # XFS_SCRUB_STRESS_REMOUNT_PERIOD is set.
1276 # -s Pass this command to xfs_io to test scrub. If zero -s options are
1277 # specified, xfs_io will not be run.
1278 # -S Pass this option to xfs_scrub. If zero -S options are specified,
1279 # xfs_scrub will not be run. To select repair mode, pass '-k' or '-v'.
1280 # -t Run online scrub against this file; $SCRATCH_MNT is the default.
1281 # Special values are as follows:
1284 # %regfile% regular files
1286 # %datafile% regular files with data blocks
1287 # %attrfile% regular files with xattr blocks
1288 # %cowfile% regular files with shared blocks
1290 # File selection races with fsstress, so the selection is best-effort.
1291 # -w Delay the start of the scrub/repair loop by this number of seconds.
1292 # Defaults to no delay unless XFS_SCRUB_STRESS_DELAY is set. This value
1293 # will be clamped to ten seconds before the end time.
1294 # -x Focus on this type of fsstress operation. Possible values:
1296 # 'dir': Grow the directory trees as much as possible.
1297 # 'xattr': Grow extended attributes in a small tree.
1298 # 'default': Run fsstress with default arguments.
1299 # 'writeonly': Only perform fs updates, no reads.
1300 # 'symlink': Only create symbolic links.
1301 # 'mknod': Only create special files.
1302 # 'parent': Focus on updating parent pointers
1304 # The default is 'default' unless XFS_SCRUB_STRESS_TARGET is set.
1305 # -X Run this program to exercise the filesystem. Currently supported
1306 # options are 'fsx' and 'fsstress'. The default is 'fsstress'.
_scratch_xfs_stress_scrub() {
	local one_scrub_args=()
	local xfs_scrub_args=()
	local io_args=()
	local scrub_tgt="$SCRATCH_MNT"
	# Also read by _scratch_xfs_stress_scrub_cleanup when it is called from
	# this function.
	local runningfile="$tmp.fsstress"
	local freeze="${XFS_SCRUB_STRESS_FREEZE}"
	local scrub_delay="${XFS_SCRUB_STRESS_DELAY:--1}"
	local exerciser="fsstress"
	local remount_period="${XFS_SCRUB_STRESS_REMOUNT_PERIOD}"
	local stress_tgt="${XFS_SCRUB_STRESS_TARGET:-default}"
	local start_agno=0
	local c res start end scrub_startat
	# Keep the getopts state private to this function so that option
	# parsing always starts at the first argument and the caller's own
	# getopts state is left alone.
	local OPTARG
	local OPTIND=1

	__SCRUB_STRESS_FREEZE_PID=""
	__SCRUB_STRESS_REMOUNT_LOOP=""
	rm -f "$runningfile"
	touch "$runningfile"

	while getopts "a:fi:r:s:S:t:w:x:X:" c; do
		case "$c" in
		a) start_agno="$OPTARG";;
		f) freeze=yes;;
		i) io_args+=("$OPTARG");;
		r) remount_period="$OPTARG";;
		s) one_scrub_args+=("$OPTARG");;
		S) xfs_scrub_args+=("$OPTARG");;
		t) scrub_tgt="$OPTARG";;
		w) scrub_delay="$OPTARG";;
		x) stress_tgt="$OPTARG";;
		X) exerciser="$OPTARG";;
		*) return 1;;
		esac
	done

	__stress_scrub_check_commands "$scrub_tgt" "$start_agno" \
			"${one_scrub_args[@]}"

	# Each exerciser is implemented by a __stress_scrub_<name>_loop function.
	if ! command -v "__stress_scrub_${exerciser}_loop" &>/dev/null; then
		echo "${exerciser}: Unknown fs exercise program."
		return 1
	fi

	# Run xfs_scrub once up front; if it fails, log its output and skip
	# the test.
	if [ "${#xfs_scrub_args[@]}" -gt 0 ]; then
		_scratch_scrub "${xfs_scrub_args[@]}" &> "$tmp.scrub"
		res=$?
		if [ "$res" -ne 0 ]; then
			echo "xfs_scrub ${xfs_scrub_args[*]} failed, err $res" >> "$seqres.full"
			cat "$tmp.scrub" >> "$seqres.full"
			rm -f "$tmp.scrub"
			_notrun 'scrub not supported on scratch filesystem'
		fi
		rm -f "$tmp.scrub"
	fi

	_xfs_skip_online_rebuild
	_xfs_skip_offline_rebuild

	start="$(date +%s)"
	if [ -n "$SOAK_DURATION" ]; then
		end="$((start + SOAK_DURATION))"
	else
		end="$((start + (30 * TIME_FACTOR) ))"
	fi

	# Never start scrubbing later than ten seconds before the end time.
	scrub_startat="$((start + scrub_delay))"
	test "$scrub_startat" -gt "$((end - 10))" &&
		scrub_startat="$((end - 10))"

	echo "Loop started at $(date --date="@${start}")," \
		   "ending at $(date --date="@${end}")" >> "$seqres.full"

	# Tell the cleanup function that the fs may need to be remounted rw.
	if [ -n "$remount_period" ]; then
		__SCRUB_STRESS_REMOUNT_LOOP="1"
	fi

	"__stress_scrub_${exerciser}_loop" "$end" "$runningfile" \
			"$remount_period" "$stress_tgt" &

	if [ -n "$freeze" ]; then
		__stress_scrub_freeze_loop "$end" "$runningfile" &
		# The cleanup function waits for this pid before thawing.
		__SCRUB_STRESS_FREEZE_PID="$!"
	fi

	if [ "${#io_args[@]}" -gt 0 ]; then
		__stress_xfs_io_loop "$end" "$runningfile" \
				"${io_args[@]}" &
	fi

	if [ "${#one_scrub_args[@]}" -gt 0 ]; then
		__stress_one_scrub_loop "$end" "$runningfile" "$scrub_tgt" \
				"$scrub_startat" "$start_agno" \
				"${one_scrub_args[@]}" &
	fi

	if [ "${#xfs_scrub_args[@]}" -gt 0 ]; then
		__stress_xfs_scrub_loop "$end" "$runningfile" "$scrub_startat" \
				"${xfs_scrub_args[@]}" &
	fi

	# Wait until the designated end time or fsstress dies, then kill all of
	# our background processes.
	while __stress_scrub_running "$end" "$runningfile"; do
		sleep 1
	done

	_scratch_xfs_stress_scrub_cleanup

	# Warn the user if we think the scratch filesystem went down.
	__stress_scrub_scratch_alive || \
		echo "Did the scratch filesystem die?"

	echo "Loop finished at $(date)" >> "$seqres.full"
}
# Decide if we're going to force repairs either by error injection or passing
# FORCE_REBUILD via ioctl.
#
# Probes the scratch fs with "repair -R probe".  If that prints nothing,
# SCRUBSTRESS_USE_FORCE_REBUILD is set to 1 and nothing else is done;
# otherwise the force_repair error injection knob is turned on instead.
__scratch_xfs_stress_setup_force_rebuild() {
	local output

	# XFS_IO_PROG is deliberately unquoted so that any arguments embedded
	# in it are split into separate words.
	output="$($XFS_IO_PROG -x -c 'repair -R probe' "$SCRATCH_MNT" 2>&1)"

	if [ -z "$output" ]; then
		SCRUBSTRESS_USE_FORCE_REBUILD=1
		return
	fi

	$XFS_IO_PROG -x -c 'inject force_repair' "$SCRATCH_MNT"
}
# Start online repair, freeze, and fsstress in background looping processes,
# and wait for 30*TIME_FACTOR seconds to see if the filesystem goes down.
# Same requirements and arguments as _scratch_xfs_stress_scrub.
#
# All arguments are passed through unchanged; XFS_SCRUB_FORCE_REPAIR=1 is set
# only for the duration of the _scratch_xfs_stress_scrub call.
_scratch_xfs_stress_online_repair() {
	__scratch_xfs_stress_setup_force_rebuild
	XFS_SCRUB_FORCE_REPAIR=1 _scratch_xfs_stress_scrub "$@"
}