git.apps.os.sepia.ceph.com Git - ceph-client.git/commitdiff
block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits
author: Zhang Yi <yi.zhang@huawei.com>
Thu, 19 Jun 2025 11:17:58 +0000 (19:17 +0800)
committer: Christian Brauner <brauner@kernel.org>
Mon, 23 Jun 2025 10:45:13 +0000 (12:45 +0200)
Currently, disks primarily implement the write zeroes command (aka
REQ_OP_WRITE_ZEROES) through two mechanisms: the first involves
physically writing zeros to the disk media (e.g., HDDs), while the
second performs an unmap operation on the logical blocks, effectively
putting them into a deallocated state (e.g., SSDs). The first method is
generally slow, while the second method is typically very fast.

For example, on certain NVMe SSDs that support NVME_NS_DEAC, submitting
REQ_OP_WRITE_ZEROES requests with the NVME_WZ_DEAC bit can accelerate
the write zeros operation by placing disk blocks into a deallocated
state, which opportunistically avoids writing zeroes to media while
still guaranteeing that subsequent reads from the specified block range
will return zeroed data. This is a best-effort optimization, not a
mandatory requirement; some devices may partially fall back to writing
physical zeroes due to factors such as misalignment or being asked to
clear a block range smaller than the device's internal allocation unit.
Therefore, the speed of this operation is not guaranteed.

It is difficult to determine whether a storage device supports the unmap
write zeroes operation. We cannot determine this by only querying
bdev_limits(bdev)->max_write_zeroes_sectors. Therefore, first, add a new
hardware queue limit parameter, max_hw_wzeroes_unmap_sectors, to
indicate whether a device supports this unmap write zeroes operation.
Then, add two new counterpart software queue limits,
max_wzeroes_unmap_sectors and max_user_wzeroes_unmap_sectors, which
allow users to disable this operation if the speed is very slow on some
special devices.

Finally, for the stacked device case, initialize
max_hw_wzeroes_unmap_sectors and max_user_wzeroes_unmap_sectors to
UINT_MAX. This operation should be enabled only when both the stacking
driver and all underlying devices enable it.

Thanks to Martin K. Petersen for optimizing the documentation of the
write_zeroes_unmap sysfs interface.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://lore.kernel.org/20250619111806.3546162-2-yi.zhang@huaweicloud.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Documentation/ABI/stable/sysfs-block
block/blk-settings.c
block/blk-sysfs.c
include/linux/blkdev.h

index 4ba771b56b3b5952cec4f1856c5ce5931a71d3bc..803f578dc023ad62e8db0ccbe4d2cb2699626c68 100644 (file)
@@ -778,6 +778,39 @@ Description:
                0, write zeroes is not supported by the device.
 
 
+What:          /sys/block/<disk>/queue/write_zeroes_unmap_max_hw_bytes
+Date:          January 2025
+Contact:       Zhang Yi <yi.zhang@huawei.com>
+Description:
+               [RO] This file indicates whether a device supports zeroing data
+               in a specified block range without incurring the cost of
+               physically writing zeroes to the media for each individual
+               block. If this parameter is set to write_zeroes_max_bytes, the
+               device implements a zeroing operation which opportunistically
+               avoids writing zeroes to media while still guaranteeing that
+               subsequent reads from the specified block range will return
+               zeroed data. This operation is a best-effort optimization; a
+               device may fall back to physically writing zeroes to the media
+               due to other factors such as misalignment or being asked to
+               clear a block range smaller than the device's internal
+               allocation unit. If this parameter is set to 0, the device may
+               have to write each logical block to the media during a zeroing
+               operation.
+
+
+What:          /sys/block/<disk>/queue/write_zeroes_unmap_max_bytes
+Date:          January 2025
+Contact:       Zhang Yi <yi.zhang@huawei.com>
+Description:
+               [RW] While write_zeroes_unmap_max_hw_bytes is the hardware limit
+               for the device, this setting is the software limit. Since the
+               unmap write zeroes operation is a best-effort optimization, some
+               devices may still physically write zeroes to media. So the
+               speed of this operation is not guaranteed. Writing a value of
+               '0' to this file disables this operation. Otherwise, this
+               parameter should be equal to write_zeroes_unmap_max_hw_bytes.
+
+
 What:          /sys/block/<disk>/queue/zone_append_max_bytes
 Date:          May 2020
 Contact:       linux-block@vger.kernel.org
index a000daafbfb489fa6a23c794af0f9bbe3513e1bb..b5c75f0ac3e98d259cd3144cbf0069cbd736f633 100644 (file)
@@ -50,6 +50,8 @@ void blk_set_stacking_limits(struct queue_limits *lim)
        lim->max_sectors = UINT_MAX;
        lim->max_dev_sectors = UINT_MAX;
        lim->max_write_zeroes_sectors = UINT_MAX;
+       lim->max_hw_wzeroes_unmap_sectors = UINT_MAX;
+       lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
        lim->max_hw_zone_append_sectors = UINT_MAX;
        lim->max_user_discard_sectors = UINT_MAX;
 }
@@ -333,6 +335,12 @@ int blk_validate_limits(struct queue_limits *lim)
        if (!lim->max_segments)
                lim->max_segments = BLK_MAX_SEGMENTS;
 
+       if (lim->max_hw_wzeroes_unmap_sectors &&
+           lim->max_hw_wzeroes_unmap_sectors != lim->max_write_zeroes_sectors)
+               return -EINVAL;
+       lim->max_wzeroes_unmap_sectors = min(lim->max_hw_wzeroes_unmap_sectors,
+                       lim->max_user_wzeroes_unmap_sectors);
+
        lim->max_discard_sectors =
                min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
 
@@ -418,10 +426,11 @@ int blk_set_default_limits(struct queue_limits *lim)
 {
        /*
         * Most defaults are set by capping the bounds in blk_validate_limits,
-        * but max_user_discard_sectors is special and needs an explicit
-        * initialization to the max value here.
+        * but these limits are special and need an explicit initialization to
+        * the max value here.
         */
        lim->max_user_discard_sectors = UINT_MAX;
+       lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
        return blk_validate_limits(lim);
 }
 
@@ -708,6 +717,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
        t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
        t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
                                        b->max_write_zeroes_sectors);
+       t->max_user_wzeroes_unmap_sectors =
+                       min(t->max_user_wzeroes_unmap_sectors,
+                           b->max_user_wzeroes_unmap_sectors);
+       t->max_hw_wzeroes_unmap_sectors =
+                       min(t->max_hw_wzeroes_unmap_sectors,
+                           b->max_hw_wzeroes_unmap_sectors);
+
        t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
                                        b->max_hw_zone_append_sectors);
 
index b2b9b89d6967ca9675a1648a6f0a3a253c1c2b5d..48c7ecbb531ff809c4c5fa3a5a49bab5380d98de 100644 (file)
@@ -161,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)      \
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
@@ -205,6 +207,24 @@ static int queue_max_discard_sectors_store(struct gendisk *disk,
        return 0;
 }
 
+static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk,
+               const char *page, size_t count, struct queue_limits *lim)
+{
+       unsigned long max_zeroes_bytes, max_hw_zeroes_bytes;
+       ssize_t ret;
+
+       ret = queue_var_store(&max_zeroes_bytes, page, count);
+       if (ret < 0)
+               return ret;
+
+       max_hw_zeroes_bytes = lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT;
+       if (max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes)
+               return -EINVAL;
+
+       lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT;
+       return 0;
+}
+
 static int
 queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
                struct queue_limits *lim)
@@ -514,6 +534,10 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
 
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors,
+               "write_zeroes_unmap_max_hw_bytes");
+QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors,
+               "write_zeroes_unmap_max_bytes");
 QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
 QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
@@ -662,6 +686,8 @@ static struct attribute *queue_attrs[] = {
        &queue_atomic_write_unit_min_entry.attr,
        &queue_atomic_write_unit_max_entry.attr,
        &queue_max_write_zeroes_sectors_entry.attr,
+       &queue_max_hw_wzeroes_unmap_sectors_entry.attr,
+       &queue_max_wzeroes_unmap_sectors_entry.attr,
        &queue_max_zone_append_sectors_entry.attr,
        &queue_zone_write_granularity_entry.attr,
        &queue_rotational_entry.attr,
index a59880c809c7bd92aff5a16020371cf1413a3e1b..1a5725c1f93d526645041da67cf3d25c2433d051 100644 (file)
@@ -383,6 +383,9 @@ struct queue_limits {
        unsigned int            max_user_discard_sectors;
        unsigned int            max_secure_erase_sectors;
        unsigned int            max_write_zeroes_sectors;
+       unsigned int            max_wzeroes_unmap_sectors;
+       unsigned int            max_hw_wzeroes_unmap_sectors;
+       unsigned int            max_user_wzeroes_unmap_sectors;
        unsigned int            max_hw_zone_append_sectors;
        unsigned int            max_zone_append_sectors;
        unsigned int            discard_granularity;
@@ -1042,6 +1045,7 @@ static inline void blk_queue_disable_secure_erase(struct request_queue *q)
 static inline void blk_queue_disable_write_zeroes(struct request_queue *q)
 {
        q->limits.max_write_zeroes_sectors = 0;
+       q->limits.max_wzeroes_unmap_sectors = 0;
 }
 
 /*
@@ -1378,6 +1382,12 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
        return bdev_limits(bdev)->max_write_zeroes_sectors;
 }
 
+static inline unsigned int
+bdev_write_zeroes_unmap_sectors(struct block_device *bdev)
+{
+       return bdev_limits(bdev)->max_wzeroes_unmap_sectors;
+}
+
 static inline bool bdev_nonrot(struct block_device *bdev)
 {
        return blk_queue_nonrot(bdev_get_queue(bdev));