From: Johannes Thumshirn Date: Thu, 5 Mar 2026 10:06:42 +0000 (+0100) Subject: btrfs: move reclaiming of a single block group into its own function X-Git-Tag: ceph-for-7.1-rc4~321^2~57 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b619185168555dc286adf8af634bea39b9cf0107;p=ceph-client.git btrfs: move reclaiming of a single block group into its own function The main work of reclaiming a single block-group in btrfs_reclaim_bgs_work() is done inside the loop iterating over all the block_groups in the fs_info->reclaim_bgs list. Factor out reclaim of a single block group from the loop to improve readability. No functional change intended. Reviewed-by: Damien Le Moal Reviewed-by: Boris Burkov Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b5c274715809..4df076bd93f5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1909,6 +1909,137 @@ static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 b return true; } +static int btrfs_reclaim_block_group(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_space_info *space_info = bg->space_info; + u64 used; + u64 reserved; + u64 old_total; + int ret = 0; + + /* Don't race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + return 0; + } + + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. + * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. + */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + return 0; + } + + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + return 0; + } + + spin_unlock(&bg->lock); + old_total = space_info->total_bytes; + spin_unlock(&space_info->lock); + + /* + * Get out fast, in case we're read-only or unmounting the + * filesystem. It is OK to drop block groups from the list even + * for the read-only case. As we did take the super write lock, + * "mount -o remount,ro" won't happen and read-only filesystem + * means it is forced read-only due to a fatal error. So, it + * never gets back to read-write to let us reclaim again. + */ + if (btrfs_need_cleaner_sleep(fs_info)) { + up_write(&space_info->groups_sem); + return 0; + } + + ret = inc_block_group_ro(bg, false); + up_write(&space_info->groups_sem); + if (ret < 0) + return ret; + + /* + * The amount of bytes reclaimed corresponds to the sum of the + * "used" and "reserved" counters. We have set the block group + * to RO above, which prevents reservations from happening but + * we may have existing reservations for which allocation has + * not yet been done - btrfs_update_block_group() was not yet + * called, which is where we will transfer a reserved extent's + * size from the "reserved" counter to the "used" counter - this + * happens when running delayed references. When we relocate the + * chunk below, relocation first flushes delalloc, waits for + * ordered extent completion (which is where we create delayed + * references for data extents) and commits the current + * transaction (which runs delayed references), and only after + * it does the actual work to move extents out of the block + * group. So the reported amount of reclaimed bytes is + * effectively the sum of the 'used' and 'reserved' counters. + */ + spin_lock(&bg->lock); + used = bg->used; + reserved = bg->reserved; + spin_unlock(&bg->lock); + + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start, false); + if (ret) { + btrfs_dec_block_group_ro(bg); + btrfs_err(fs_info, "error relocating chunk %llu", + bg->start); + used = 0; + reserved = 0; + spin_lock(&space_info->lock); + space_info->reclaim_errors++; + spin_unlock(&space_info->lock); + } + spin_lock(&space_info->lock); + space_info->reclaim_count++; + space_info->reclaim_bytes += used; + space_info->reclaim_bytes += reserved; + if (space_info->total_bytes < old_total) + btrfs_set_periodic_reclaim_ready(space_info, true); + spin_unlock(&space_info->lock); + + return ret; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1942,10 +2073,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { - u64 used; - u64 reserved; - u64 old_total; - int ret = 0; + int ret; bg = list_first_entry(&fs_info->reclaim_bgs, struct btrfs_block_group, @@ -1954,126 +2082,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) space_info = bg->space_info; spin_unlock(&fs_info->unused_bgs_lock); + ret = btrfs_reclaim_block_group(bg); - /* Don't race with allocators so take the groups_sem */ - down_write(&space_info->groups_sem); - - spin_lock(&space_info->lock); - spin_lock(&bg->lock); - if (bg->reserved || bg->pinned || bg->ro) { - /* - * We want to bail if we made new allocations or have - * outstanding allocations in this block group. We do - * the ro check in case balance is currently acting on - * this block group. - */ - spin_unlock(&bg->lock); - spin_unlock(&space_info->lock); - up_write(&space_info->groups_sem); - goto next; - } - if (bg->used == 0) { - /* - * It is possible that we trigger relocation on a block - * group as its extents are deleted and it first goes - * below the threshold, then shortly after goes empty. - * - * In this case, relocating it does delete it, but has - * some overhead in relocation specific metadata, looking - * for the non-existent extents and running some extra - * transactions, which we can avoid by using one of the - * other mechanisms for dealing with empty block groups. - */ - if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) - btrfs_mark_bg_unused(bg); - spin_unlock(&bg->lock); - spin_unlock(&space_info->lock); - up_write(&space_info->groups_sem); - goto next; - - } - /* - * The block group might no longer meet the reclaim condition by - * the time we get around to reclaiming it, so to avoid - * reclaiming overly full block_groups, skip reclaiming them. - * - * Since the decision making process also depends on the amount - * being freed, pass in a fake giant value to skip that extra - * check, which is more meaningful when adding to the list in - * the first place. - */ - if (!should_reclaim_block_group(bg, bg->length)) { - spin_unlock(&bg->lock); - spin_unlock(&space_info->lock); - up_write(&space_info->groups_sem); - goto next; - } - - spin_unlock(&bg->lock); - old_total = space_info->total_bytes; - spin_unlock(&space_info->lock); - - /* - * Get out fast, in case we're read-only or unmounting the - * filesystem. It is OK to drop block groups from the list even - * for the read-only case. As we did take the super write lock, - * "mount -o remount,ro" won't happen and read-only filesystem - * means it is forced read-only due to a fatal error. So, it - * never gets back to read-write to let us reclaim again. - */ - if (btrfs_need_cleaner_sleep(fs_info)) { - up_write(&space_info->groups_sem); - goto next; - } - - ret = inc_block_group_ro(bg, false); - up_write(&space_info->groups_sem); - if (ret < 0) - goto next; - - /* - * The amount of bytes reclaimed corresponds to the sum of the - * "used" and "reserved" counters. We have set the block group - * to RO above, which prevents reservations from happening but - * we may have existing reservations for which allocation has - * not yet been done - btrfs_update_block_group() was not yet - * called, which is where we will transfer a reserved extent's - * size from the "reserved" counter to the "used" counter - this - * happens when running delayed references. When we relocate the - * chunk below, relocation first flushes delalloc, waits for - * ordered extent completion (which is where we create delayed - * references for data extents) and commits the current - * transaction (which runs delayed references), and only after - * it does the actual work to move extents out of the block - * group. So the reported amount of reclaimed bytes is - * effectively the sum of the 'used' and 'reserved' counters. - */ - spin_lock(&bg->lock); - used = bg->used; - reserved = bg->reserved; - spin_unlock(&bg->lock); - - trace_btrfs_reclaim_block_group(bg); - ret = btrfs_relocate_chunk(fs_info, bg->start, false); - if (ret) { - btrfs_dec_block_group_ro(bg); - btrfs_err(fs_info, "error relocating chunk %llu", - bg->start); - used = 0; - reserved = 0; - spin_lock(&space_info->lock); - space_info->reclaim_errors++; - spin_unlock(&space_info->lock); - } - spin_lock(&space_info->lock); - space_info->reclaim_count++; - space_info->reclaim_bytes += used; - space_info->reclaim_bytes += reserved; - if (space_info->total_bytes < old_total) - btrfs_set_periodic_reclaim_ready(space_info, true); - spin_unlock(&space_info->lock); - -next: if (ret && !READ_ONCE(space_info->periodic_reclaim)) btrfs_link_bg_list(bg, &retry_list); btrfs_put_block_group(bg);