btrfs: add test for multiple fsync with adjacent preallocated extents

author Filipe Manana <fdmanana@suse.com>

Mon, 24 May 2021 10:37:21 +0000 (11:37 +0100)

committer Eryu Guan <guaneryu@gmail.com>

Sun, 30 May 2021 12:16:50 +0000 (20:16 +0800)
author Filipe Manana <fdmanana@suse.com>
Mon, 24 May 2021 10:37:21 +0000 (11:37 +0100)
committer Eryu Guan <guaneryu@gmail.com>
Sun, 30 May 2021 12:16:50 +0000 (20:16 +0800)
diff --git a/tests/btrfs/240 b/tests/btrfs/240

new file mode 100755 (executable)

index 0000000..c799848
--- /dev/null
+++ b/tests/btrfs/240
@@ -0,0 +1,173 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2021 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FSQA Test No. 240
+#
+# Test a scenario where we do several partial writes into multiple preallocated
+# extents across two transactions and with several fsyncs in between. The goal
+# is to check that the fsyncs succeed. This scenario used to trigger an -EIO
+# failure on the last fsync and turn the filesystem to RO mode because of a
+# transaction abort.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1       # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+       _cleanup_flakey
+       cd /
+       rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs btrfs
+_require_scratch
+_require_dm_target flakey
+_require_xfs_io_command "falloc"
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+
+# Create our test file with 2 preallocated extents. Leave a 1M hole between them
+# to ensure that we get two file extent items that will never be merged into a
+# single one. The extents are contiguous on disk, which will later result in the
+# checksums for their data to be merged into a single checksum item in the csums
+# btree.
+#
+$XFS_IO_PROG -f \
+            -c "falloc 0 1M" \
+            -c "falloc 3M 3M" \
+            $SCRATCH_MNT/foobar
+
+# Now write to the second extent and leave only 1M of it as unwritten, which
+# corresponds to the file range [4M, 5M[.
+#
+# Then fsync the file to flush delalloc and to clear full sync flag from the
+# inode, so that a future fsync will use the fast code path.
+#
+# After the writeback triggered by the fsync we have 3 file extent items that
+# point to the second extent we previously allocated with fallocate():
+#
+# 1) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the file
+#    range [3M, 4M[
+#
+# 2) One file extent item of type BTRFS_FILE_EXTENT_PREALLOC that covers the
+#    file range [4M, 5M[
+#
+# 3) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the file
+#    range [5M, 6M[
+#
+# All these file extent items have a generation of 6, which is the ID of the
+# transaction where they were created. The split of the original file extent
+# item is done at btrfs_mark_extent_written() when ordered extents complete for
+# the file ranges [3M, 4M[ and [5M, 6M[.
+#
+$XFS_IO_PROG -c "pwrite -S 0xab 3M 1M" \
+            -c "pwrite -S 0xef 5M 1M" \
+            -c "fsync" \
+            $SCRATCH_MNT/foobar | _filter_xfs_io
+
+# Commit the current transaction. This wipes out the log tree created by the
+# previous fsync.
+sync
+
+# Now write to the unwritten range of the second extent we allocated,
+# corresponding to the file range [4M, 5M[, and fsync the file, which triggers
+# the fast fsync code path.
+#
+# The fast fsync code path sees that there is a new extent map covering the file
+# range [4M, 5M[ and therefore it will log a checksum item covering the range
+# [1M, 2M[ of the second extent we allocated.
+#
+# Also, after the fsync finishes we no longer have the 3 file extent items that
+# pointed to 3 sections of the second extent we allocated. Instead we end up
+# with a single file extent item pointing to the whole extent, with a type of
+# BTRFS_FILE_EXTENT_REG and a generation of 7 (the current transaction ID). This
+# is due to the file extent item merging we do when completing ordered extents
+# into ranges that point to unwritten (preallocated) extents. This merging is
+# done at btrfs_mark_extent_written().
+#
+$XFS_IO_PROG -c "pwrite -S 0xcd 4M 1M" \
+            -c "fsync" \
+            $SCRATCH_MNT/foobar | _filter_xfs_io
+
+# Now do some write to our file outside the range of the second extent that we
+# allocated with fallocate() and truncate the file size from 6M down to 5M.
+#
+# The truncate operation sets the full sync runtime flag on the inode, forcing
+# the next fsync to use the slow code path. It also changes the length of the
+# second file extent item so that it represents the file range [3M, 5M[ and not
+# the range [3M, 6M[ anymore.
+#
+# Finally fsync the file. Since this is a fsync that triggers the slow code path,
+# it will remove all items associated to the inode from the log tree and then it
+# will scan for file extent items in the fs/subvolume tree that have a generation
+# matching the current transaction ID, which is 7. This means it will log 2 file
+# extent items:
+#
+# 1) One for the first extent we allocated, covering the file range [0, 1M[
+#
+# 2) Another for the first 2M of the second extent we allocated, covering the
+#    file range [3M, 5M[
+#
+# When logging the first file extent item we log a single checksum item that has
+# all the checksums for the entire extent.
+#
+# When logging the second file extent item, we also lookup for the checksums that
+# are associated with the range [0, 2M[ of the second extent we allocated (file
+# range [3M, 5M[), and then we log them with btrfs_csum_file_blocks(). However
+# that results in ending up with a log that has two checksum items with ranges
+# that overlap:
+#
+# 1) One for the range [1M, 2M[ of the second extent we allocated, corresponding
+#    to the file range [4M, 5M[, which we logged in the previous fsync that used
+#    the fast code path;
+#
+# 2) One for the ranges [0, 1M[ and [0, 2M[ of the first and second extents,
+#    respectively, corresponding to the files ranges [0, 1M[ and [3M, 5M[.
+#    This one was added during this last fsync that uses the slow code path
+#    and overlaps with the previous one logged by the previous fast fsync.
+#
+# This happens because when logging the checksums for the second extent, we
+# notice they start at an offset that matches the end of the checksums item that
+# we logged for the first extent, and because both extents are contiguous on
+# disk, btrfs_csum_file_blocks() decides to extend that existing checksums item
+# and append the checksums for the second extent to this item. The end result is
+# we end up with two checksum items in the log tree that have overlapping ranges,
+# as listed before, resulting in the fsync to fail with -EIO and aborting the
+# transaction, turning the filesystem into RO mode.
+#
+$XFS_IO_PROG -c "pwrite -S 0xff 0 1M" \
+            -c "truncate 5M" \
+            -c "fsync" \
+            $SCRATCH_MNT/foobar | _filter_xfs_io
+
+echo "File content before power failure:"
+od -A d -t x1 $SCRATCH_MNT/foobar
+
+# Simulate a power failure and mount again the filesystem. The file content
+# must be the same that we had before.
+_flakey_drop_and_remount
+
+echo "File content before after failure:"
+od -A d -t x1 $SCRATCH_MNT/foobar
+
+_unmount_flakey
+
+status=0
+exit
diff --git a/tests/btrfs/240.out b/tests/btrfs/240.out

new file mode 100644 (file)

index 0000000..ffd03ee
--- /dev/null
+++ b/tests/btrfs/240.out
@@ -0,0 +1,29 @@
+QA output created by 240
+wrote 1048576/1048576 bytes at offset 3145728
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1048576/1048576 bytes at offset 5242880
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1048576/1048576 bytes at offset 4194304
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1048576/1048576 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+File content before power failure:
+0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+*
+1048576 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+*
+3145728 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab
+*
+4194304 cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd
+*
+5242880
+File content before after failure:
+0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+*
+1048576 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+*
+3145728 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab
+*
+4194304 cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd
+*
+5242880
diff --git a/tests/btrfs/group b/tests/btrfs/group

index d7f2f3e554fcbed469ecddd76b41e3f0d7eab2e4..ad59ed3e78613a67ac3408d0f2ad17fdec3a3cb6 100644 (file)
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -242,3 +242,4 @@
  237 auto quick zone balance
  238 auto quick seed trim
  239 auto quick log
+240 auto quick prealloc log
author	Filipe Manana <fdmanana@suse.com>
	Mon, 24 May 2021 10:37:21 +0000 (11:37 +0100)
committer	Eryu Guan <guaneryu@gmail.com>
	Sun, 30 May 2021 12:16:50 +0000 (20:16 +0800)
tests/btrfs/240	[new file with mode: 0755]	patch \| blob
tests/btrfs/240.out	[new file with mode: 0644]	patch \| blob
tests/btrfs/group		patch \| blob \| history