generic/019: don't dump cores when fio/fsstress hit io errors
[xfstests-dev.git] / tests / generic / 019
1 #! /bin/bash
2 # SPDX-License-Identifier: GPL-2.0
3
4 #
5 # FSQA Test No. generic/019
6 #
7 # Run fsstress and fio (dio/aio and mmap), simulate a disk failure, and
8 # check filesystem consistency at the end.
9 #
. ./common/preamble
_begin_fstest aio dangerous enospc rw stress

# fio job file; it is named under $tmp so the "rm -f $tmp.*" in _cleanup
# removes it.
fio_config=$tmp.fio

# Import common functions.
. ./common/filter

# Requirements: a scratch filesystem on a real block device, and the
# fail_make_request facility.
_supported_fs generic
_require_scratch
_require_block_device $SCRATCH_DEV
_require_fail_make_request

# sysfs directory of the scratch device; the make-it-fail knob written by
# start_fail_scratch_dev/stop_fail_scratch_dev lives directly under it.
SYSFS_BDEV=$(_sysfs_dev $SCRATCH_DEV)
23
# Enable the global fail_make_request knobs.
#
# Writes 100 to "probability", 9999999 to "times" and 0 to "verbose" in
# $DEBUGFS_MNT/fail_make_request.
#
# Globals: DEBUGFS_MNT (read)
# Outputs: one status line on stdout
allow_fail_make_request()
{
    local knob_dir=$DEBUGFS_MNT/fail_make_request

    echo "Allow global fail_make_request feature"
    echo 100 > "$knob_dir/probability"
    echo 9999999 > "$knob_dir/times"
    # Use $DEBUGFS_MNT here as well instead of a hard-coded
    # /sys/kernel/debug, so all three knobs are written to the same
    # debugfs instance.
    echo 0 > "$knob_dir/verbose"
}
31
# Turn the global fail_make_request feature back off by writing 0 to both
# the "probability" and the "times" knob under $DEBUGFS_MNT.
#
# Globals: DEBUGFS_MNT (read)
# Outputs: one status line on stdout
disallow_fail_make_request()
{
    local knob

    echo "Disallow global fail_make_request feature"
    for knob in probability times; do
        echo 0 > $DEBUGFS_MNT/fail_make_request/$knob
    done
}
38
# Switch the scratch device into failing mode by writing 1 to its
# make-it-fail knob; the equivalent command line is recorded in
# $seqres.full first.
#
# Globals: SYSFS_BDEV (read), seqres (read)
# Outputs: one status line on stdout
start_fail_scratch_dev()
{
    local knob=$SYSFS_BDEV/make-it-fail

    echo "Force SCRATCH_DEV device failure"
    printf ' echo 1 > %s\n' "$knob" >> $seqres.full
    echo 1 > $knob
}
45
# Return the scratch device to normal operation by writing 0 to its
# make-it-fail knob; the equivalent command line is recorded in
# $seqres.full first.
#
# Globals: SYSFS_BDEV (read), seqres (read)
# Outputs: one status line on stdout
stop_fail_scratch_dev()
{
    local knob=$SYSFS_BDEV/make-it-fail

    echo "Make SCRATCH_DEV device operable again"
    printf ' echo 0 > %s\n' "$knob" >> $seqres.full
    echo 0 > $knob
}
52
# Override the default cleanup function.
#
# Stops any workload still running in the background, switches the global
# fail_make_request feature off and removes the test's temporary files.
#
# Globals: fs_pid, fio_pid (read; set by _workout once the workloads have
#          been started), tmp (read)
_cleanup()
{
    # If the test aborts in the middle of _workout (for example through
    # _fail), fsstress and fio are still running in the background and
    # would keep using the scratch filesystem.  The variables are empty
    # when the workloads were never started, so only signal when at least
    # one PID is known.
    if [ -n "$fs_pid$fio_pid" ]; then
        kill $fs_pid $fio_pid &> /dev/null
    fi
    disallow_fail_make_request
    rm -f "$tmp".*
}
59
# Workload sizing, scaled by the harness-provided TIME_FACTOR/LOAD_FACTOR.
RUN_TIME=$(( 20 + 10 * $TIME_FACTOR ))
NUM_JOBS=$(( 4 * LOAD_FACTOR ))

# Size of the scratch device as reported by "blockdev --getsz", multiplied
# by 512 to get the per-file size handed to fio.
BLK_DEV_SIZE=$(blockdev --getsz $SCRATCH_DEV)
FILE_SIZE=$(( BLK_DEV_SIZE * 512 ))

# Don't fail the test just because fio or fsstress dump cores
ulimit -c 0
67
# Generate the fio job file: one direct-I/O libaio random-write job and one
# mmap random-write job, both time based and both running in $SCRATCH_MNT.
#
# The here-document delimiter is unquoted, so the shell expands $seq,
# ${SCRATCH_MNT}, ${FILE_SIZE}, ${LOAD_FACTOR}, ${NUM_JOBS} and ${RUN_TIME}
# while writing the file.  The JOB_NAME/JOB_ID/ITERATION_ID placeholders in
# the explanatory comment are not shell variables, so their '$' is escaped
# to keep them literal in the generated file instead of collapsing to "..".
#
# NOTE(review): iodepth and runtime are handed to fio as the literal text
# "128*<LOAD_FACTOR>" and "40+<RUN_TIME>".  fio's documentation describes
# arithmetic in option values as requiring surrounding parentheses; confirm
# that these are evaluated as intended.
cat >$fio_config <<EOF
###########
# $seq test's fio activity
# Filenames derived from jobsname and jobid like follows:
# \${JOB_NAME}.\${JOB_ID}.\${ITERATION_ID}
[global]
ioengine=libaio
bs=4k
directory=${SCRATCH_MNT}
filesize=${FILE_SIZE}
size=9999T
continue_on_error=write
ignore_error=EIO,ENOSPC:EIO
error_dump=0

[stress_dio_aio_activity]
create_on_open=1
fallocate=none
iodepth=128*${LOAD_FACTOR}
direct=1
buffered=0
numjobs=${NUM_JOBS}
rw=randwrite
runtime=40+${RUN_TIME}
time_based

[stress_mmap_activity]
ioengine=mmap
create_on_open=0
fallocate=1
fdatasync=40960
filesize=8M
size=9999T
numjobs=${NUM_JOBS}
rw=randwrite
runtime=40+${RUN_TIME}
time_based

EOF

# Hand the generated job file to the harness's fio requirement check.
_require_fio $fio_config
109
# Disable all sync operations to get higher load.  "-f setattr=1" is
# appended as well; _workout passes "-f setattr=0" ahead of these options.
FSSTRESS_AVOID+=" -ffsync=0 -fsync=0 -ffdatasync=0 -f setattr=1"
112
# Run fsstress and fio on the scratch filesystem, fail the device underneath
# them after $RUN_TIME seconds, then check that the filesystem can still be
# unmounted and can be mounted again once the device works.
#
# Globals: SCRATCH_MNT, FSSTRESS_AVOID, FSSTRESS_PROG, FIO_PROG, fio_config,
#          RUN_TIME, seqres (read); fs_pid, fio_pid (written)
# Returns: status of the final "run_check _scratch_unmount"
_workout()
{
        local stress_dir=$SCRATCH_MNT/fsstress.$$
        local stress_args

        stress_args=$(_scale_fsstress_args -p 1 -n999999999 -f setattr=0 $FSSTRESS_AVOID -d $stress_dir)

        printf '\nStart fsstress..\n\n'
        echo "fsstress $stress_args" >> $seqres.full
        $FSSTRESS_PROG $stress_args > /dev/null 2>&1 &
        fs_pid=$!

        echo "Start fio.."
        cat $fio_config >>  $seqres.full
        $FIO_PROG $fio_config >> $seqres.full 2>&1 &
        fio_pid=$!

        # Let the workloads run for a while, then force the device to fail.
        sleep $RUN_TIME
        start_fail_scratch_dev

        # Right after the device has entered the failed state the filesystem
        # may not know about it yet, so a buffered write(2) may still
        # succeed; integrity operations (sync, fsync, fdatasync, direct I/O)
        # must fail, though.
        if dd if=/dev/zero of=$SCRATCH_MNT/touch_failed_filesystem count=1 \
                        bs=4k conv=fsync >> $seqres.full 2>&1; then
                _fail "failed: still able to perform integrity fsync on $SCRATCH_MNT"
        fi

        # Only fsstress is signalled; fio is simply waited for.
        kill $fs_pid &> /dev/null
        wait $fs_pid
        wait $fio_pid

        # A broken filesystem is still expected to be unmountable.
        run_check _scratch_unmount
        # With the filesystem unmounted nothing can write to the block
        # device any more, so it is safe to bring it back to normal.
        stop_fail_scratch_dev

        # Check that the filesystem can recover its journal on mount(2):
        # after one mount/umount cycle all errors should be fixed.
        _scratch_mount
        run_check _scratch_unmount
}
153
# real QA test starts here

# Create and mount a fresh scratch filesystem, enable the global
# fail_make_request knobs, then run the workload.  The exit status of
# _workout is stored in $status before exiting.
if ! _scratch_mkfs >> $seqres.full 2>&1; then
    _fail "mkfs failed"
fi
_scratch_mount
allow_fail_make_request
_workout
status=$?
exit