#! /bin/bash
# SPDX-License-Identifier: GPL-2.0-or-later
# Copyright (c) 2021 Oracle.  All Rights Reserved.
#
# FS QA Test No. 155
#
# Populate a filesystem with all types of metadata, then run repair with the
# libxfs write failure trigger set to go off after a single write.  Check that
# the injected error trips, causing repair to abort; that needsrepair is set on
# the fs and the kernel won't mount it; and that a non-injecting repair run
# clears needsrepair and makes the filesystem mountable again.
#
# Repeat with the trip point set to successively higher numbers of writes until
# we hit ~200 writes or repair manages to run to completion without tripping.

# Load the fstests preamble and register this test.
# NOTE(review): "auto repair" is presumably the list of test groups this test
# belongs to -- confirm against common/preamble.
. ./common/preamble
_begin_fstest auto repair

# Import common functions.
. ./common/populate
. ./common/filter

# real QA test starts here
# Everything from here to the timeout setup gates the test on what it needs:
# an XFS scratch device, the populate tooling, the libxfs write-crash debug
# hook that the main loop arms, and timeout(1).
_supported_fs xfs
_require_scratch_nocheck
_require_scratch_xfs_crc		# needsrepair only exists for v5
_require_populate_commands
_require_libxfs_debug_flag LIBXFS_DEBUG_WRITE_CRASH
_require_command "$TIMEOUT_PROG" timeout

# Inject a 10 minute abortive timeout on the repair program so that deadlocks
# in the program do not cause fstests to hang indefinitely.
# NOTE(review): this relies on the repair helpers expanding $XFS_REPAIR_PROG
# unquoted so the prefix is word-split into a command -- confirm in common/xfs.
XFS_REPAIR_PROG="$TIMEOUT_PROG -s ABRT 10m $XFS_REPAIR_PROG"

# Populate the filesystem
# This is done once, before the loop; both stdout and stderr of the populate
# step go to the full log rather than the test output.
_scratch_populate_cached nofill >> $seqres.full 2>&1

# Walk the write-crash trip point upward.  Each pass picks a write budget,
# runs repair with the libxfs debug hook armed, and, if repair was killed,
# checks that NEEDSREPAIR was left set on the filesystem.
writes_limit=200			# upper bound on the base trip point (before jitter)
writes_step=$((13 / TIME_FACTOR))	# a larger TIME_FACTOR gives a smaller stride
if test $writes_step -lt 1; then
	# Always advance by at least one write per pass.
	writes_step=1
fi

base_writes=1
while (( base_writes < writes_limit )); do
	# Jitter the trip point by 0-6 writes so that successive runs do not
	# always abort at exactly the same place.
	allowed_writes=$(( base_writes + (RANDOM % 7) ))
	echo "Setting debug hook to crash after $allowed_writes writes." >> $seqres.full

	# Run repair with the debug hook armed; repair's stderr goes to the
	# full log.
	LIBXFS_DEBUG_WRITE_CRASH=ddev=$allowed_writes \
			_scratch_xfs_repair 2>> $seqres.full
	repair_rc=$?

	case "$repair_rc" in
	0)
		# Repair ran to completion without tripping the hook, so there
		# is nothing further to inject.  Finishing on the very first
		# pass is suspicious enough to report.
		if [ $base_writes -eq 1 ]; then
			echo "ran to completion on the first try?"
		fi
		break
		;;
	137)
		# 137 is 128 + 9, i.e. the process was killed by SIGKILL.
		# NOTE(review): presumably this is how the debug hook aborts
		# repair -- confirm against libxfs.
		;;
	*)
		# Any other status is an unexpected repair failure.
		echo "repair failed with $repair_rc??"
		break
		;;
	esac

	# Repair was killed mid-run.  NEEDSREPAIR is expected to be set now.
	# If it is not, only complain when a no-modify repair also reports
	# problems: a clean fs without the flag can legitimately happen if the
	# injected error hit right after the write that cleared NEEDSREPAIR.
	if ! _check_scratch_xfs_features NEEDSREPAIR &> /dev/null; then
		if ! _scratch_xfs_repair -n &>> $seqres.full; then
			echo "NEEDSREPAIR should be set on corrupt fs"
		fi
	fi

	base_writes=$(( base_writes + writes_step ))
done

# Final pass: if the last repair attempt left NEEDSREPAIR set, a plain repair
# run (no write-crash hook armed) must clear it.
echo "Checking filesystem one last time after $allowed_writes writes." >> $seqres.full
if _check_scratch_xfs_features NEEDSREPAIR &> /dev/null; then
	echo "Clearing NEEDSREPAIR" >> $seqres.full
	_scratch_xfs_repair 2>> $seqres.full

	# Re-check the flag after the full repair.  Only stdout of the check
	# is discarded here; its stderr is not redirected.
	if _check_scratch_xfs_features NEEDSREPAIR > /dev/null; then
		echo "Repair failed to clear NEEDSREPAIR on the $allowed_writes writes test"
	fi
fi

# success, all done
echo "Silence is golden."
status=0
exit