--- /dev/null
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2021 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FSQA Test No. 240
+#
+# Test a scenario where we do several partial writes into multiple preallocated
+# extents across two transactions and with several fsyncs in between. The goal
+# is to check that the fsyncs succeed. This scenario used to trigger an -EIO
+# failure on the last fsync and turn the filesystem to RO mode because of a
+# transaction abort.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1 # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+ _cleanup_flakey
+ cd /
+ rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs btrfs
+_require_scratch
+_require_dm_target flakey
+_require_xfs_io_command "falloc"
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+
+# Create our test file with 2 preallocated extents. Leave a 1M hole between them
+# to ensure that we get two file extent items that will never be merged into a
+# single one. The extents are contiguous on disk, which will later result in the
+# checksums for their data to be merged into a single checksum item in the csums
+# btree.
+#
+$XFS_IO_PROG -f \
+ -c "falloc 0 1M" \
+ -c "falloc 3M 3M" \
+ $SCRATCH_MNT/foobar
+
+# Now write to the second extent and leave only 1M of it as unwritten, which
+# corresponds to the file range [4M, 5M[.
+#
+# Then fsync the file to flush delalloc and to clear full sync flag from the
+# inode, so that a future fsync will use the fast code path.
+#
+# After the writeback triggered by the fsync we have 3 file extent items that
+# point to the second extent we previously allocated with fallocate():
+#
+# 1) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the file
+# range [3M, 4M[
+#
+# 2) One file extent item of type BTRFS_FILE_EXTENT_PREALLOC that covers the
+# file range [4M, 5M[
+#
+# 3) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the file
+# range [5M, 6M[
+#
+# All these file extent items have a generation of 6, which is the ID of the
+# transaction where they were created. The split of the original file extent
+# item is done at btrfs_mark_extent_written() when ordered extents complete for
+# the file ranges [3M, 4M[ and [5M, 6M[.
+#
+$XFS_IO_PROG -c "pwrite -S 0xab 3M 1M" \
+ -c "pwrite -S 0xef 5M 1M" \
+ -c "fsync" \
+ $SCRATCH_MNT/foobar | _filter_xfs_io
+
+# Commit the current transaction. This wipes out the log tree created by the
+# previous fsync.
+sync
+
+# Now write to the unwritten range of the second extent we allocated,
+# corresponding to the file range [4M, 5M[, and fsync the file, which triggers
+# the fast fsync code path.
+#
+# The fast fsync code path sees that there is a new extent map covering the file
+# range [4M, 5M[ and therefore it will log a checksum item covering the range
+# [1M, 2M[ of the second extent we allocated.
+#
+# Also, after the fsync finishes we no longer have the 3 file extent items that
+# pointed to 3 sections of the second extent we allocated. Instead we end up
+# with a single file extent item pointing to the whole extent, with a type of
+# BTRFS_FILE_EXTENT_REG and a generation of 7 (the current transaction ID). This
+# is due to the file extent item merging we do when completing ordered extents
+# into ranges that point to unwritten (preallocated) extents. This merging is
+# done at btrfs_mark_extent_written().
+#
+$XFS_IO_PROG -c "pwrite -S 0xcd 4M 1M" \
+ -c "fsync" \
+ $SCRATCH_MNT/foobar | _filter_xfs_io
+
+# Now do some write to our file outside the range of the second extent that we
+# allocated with fallocate() and truncate the file size from 6M down to 5M.
+#
+# The truncate operation sets the full sync runtime flag on the inode, forcing
+# the next fsync to use the slow code path. It also changes the length of the
+# second file extent item so that it represents the file range [3M, 5M[ and not
+# the range [3M, 6M[ anymore.
+#
+# Finally fsync the file. Since this is a fsync that triggers the slow code path,
+# it will remove all items associated to the inode from the log tree and then it
+# will scan for file extent items in the fs/subvolume tree that have a generation
+# matching the current transaction ID, which is 7. This means it will log 2 file
+# extent items:
+#
+# 1) One for the first extent we allocated, covering the file range [0, 1M[
+#
+# 2) Another for the first 2M of the second extent we allocated, covering the
+# file range [3M, 5M[
+#
+# When logging the first file extent item we log a single checksum item that has
+# all the checksums for the entire extent.
+#
+# When logging the second file extent item, we also lookup for the checksums that
+# are associated with the range [0, 2M[ of the second extent we allocated (file
+# range [3M, 5M[), and then we log them with btrfs_csum_file_blocks(). However
+# that results in ending up with a log that has two checksum items with ranges
+# that overlap:
+#
+# 1) One for the range [1M, 2M[ of the second extent we allocated, corresponding
+# to the file range [4M, 5M[, which we logged in the previous fsync that used
+# the fast code path;
+#
+# 2) One for the ranges [0, 1M[ and [0, 2M[ of the first and second extents,
+# respectively, corresponding to the files ranges [0, 1M[ and [3M, 5M[.
+# This one was added during this last fsync that uses the slow code path
+# and overlaps with the previous one logged by the previous fast fsync.
+#
+# This happens because when logging the checksums for the second extent, we
+# notice they start at an offset that matches the end of the checksums item that
+# we logged for the first extent, and because both extents are contiguous on
+# disk, btrfs_csum_file_blocks() decides to extend that existing checksums item
+# and append the checksums for the second extent to this item. The end result is
+# we end up with two checksum items in the log tree that have overlapping ranges,
+# as listed before, resulting in the fsync to fail with -EIO and aborting the
+# transaction, turning the filesystem into RO mode.
+#
+$XFS_IO_PROG -c "pwrite -S 0xff 0 1M" \
+ -c "truncate 5M" \
+ -c "fsync" \
+ $SCRATCH_MNT/foobar | _filter_xfs_io
+
+echo "File content before power failure:"
+od -A d -t x1 $SCRATCH_MNT/foobar
+
+# Simulate a power failure and mount again the filesystem. The file content
+# must be the same that we had before.
+_flakey_drop_and_remount
+
+echo "File content before after failure:"
+od -A d -t x1 $SCRATCH_MNT/foobar
+
+_unmount_flakey
+
+status=0
+exit