]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-client.git/commitdiff
mm: change dup_mmap() recovery
authorLiam R. Howlett <Liam.Howlett@oracle.com>
Wed, 21 Jan 2026 16:49:42 +0000 (11:49 -0500)
committerAndrew Morton <akpm@linux-foundation.org>
Thu, 12 Feb 2026 23:42:55 +0000 (15:42 -0800)
When dup_mmap() fails during vma duplication or setup, don't write the
XA_ZERO entry in the vma tree.  Instead, destroy the tree and free the
new resources, leaving an empty vma tree.

Using XA_ZERO introduced races where the vma could be found between
dup_mmap() dropping all locks and exit_mmap() taking the locks.  The race
can occur because the mm can be reached through the other trees via
successfully copied vmas and other methods such as the swapoff code.

XA_ZERO was marking the location at which to stop vma removal and
pagetable freeing.  The newly added arguments to unmap_vmas() and
free_pgtables() now serve this purpose.

Replacing the XA_ZERO entry use with the new argument list also means the
checks for xa_is_zero() are no longer necessary, so these are also removed.

Note that dup_mmap() now also cleans up in the case where ALL vmas were
successfully copied but dup_mmap() failed to completely set up some other
aspect of the duplication.

Link: https://lkml.kernel.org/r/20260121164946.2093480-8-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/memory.c
mm/mmap.c

index 98c407622deaba96d9f5fe44ac2e07811e1c2e7d..6033cf6c93de6c2590e90506eda361554464d2f1 100644 (file)
@@ -411,8 +411,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *next;
 
                next = mas_find(mas, vma_end - 1);
-               if (unlikely(xa_is_zero(next)))
-                       next = NULL;
 
                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
@@ -431,8 +429,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
                        vma = next;
                        next = mas_find(mas, vma_end - 1);
-                       if (unlikely(xa_is_zero(next)))
-                               next = NULL;
                        if (mm_wr_locked)
                                vma_start_write(vma);
                        unlink_anon_vmas(vma);
@@ -2186,7 +2182,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                unmap_single_vma(tlb, vma, start, end, &details);
                hugetlb_zap_end(vma, &details);
                vma = mas_find(mas, tree_end - 1);
-       } while (vma && likely(!xa_is_zero(vma)));
+       } while (vma);
        mmu_notifier_invalidate_range_end(&range);
 }
 
index 827a64cdcc681116761382cd134efc672021b223..48dae3d48e46f8840bd44aa891bafe6a5615762a 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1285,7 +1285,7 @@ void exit_mmap(struct mm_struct *mm)
        arch_exit_mmap(mm);
 
        vma = vma_next(&vmi);
-       if (!vma || unlikely(xa_is_zero(vma))) {
+       if (!vma) {
                /* Can happen if dup_mmap() received an OOM */
                mmap_read_unlock(mm);
                mmap_write_lock(mm);
@@ -1851,20 +1851,40 @@ loop_out:
                ksm_fork(mm, oldmm);
                khugepaged_fork(mm, oldmm);
        } else {
+               unsigned long end;
 
                /*
-                * The entire maple tree has already been duplicated. If the
-                * mmap duplication fails, mark the failure point with
-                * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
-                * stop releasing VMAs that have not been duplicated after this
-                * point.
+                * The entire maple tree has already been duplicated, but
+                * replacing the vmas failed at mpnt (which could be NULL if
+                * all were allocated but the last vma was not fully set up).
+                * Use the start address of the failure point to clean up the
+                * partially initialized tree.
                 */
-               if (mpnt) {
-                       mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
-                       mas_store(&vmi.mas, XA_ZERO_ENTRY);
-                       /* Avoid OOM iterating a broken tree */
-                       mm_flags_set(MMF_OOM_SKIP, mm);
+               if (!mm->map_count) {
+                       /* zero vmas were written to the new tree. */
+                       end = 0;
+               } else if (mpnt) {
+                       /* partial tree failure */
+                       end = mpnt->vm_start;
+               } else {
+                       /* All vmas were written to the new tree */
+                       end = ULONG_MAX;
                }
+
+               /* Hide mm from oom killer because the memory is being freed */
+               mm_flags_set(MMF_OOM_SKIP, mm);
+               if (end) {
+                       vma_iter_set(&vmi, 0);
+                       tmp = vma_next(&vmi);
+                       flush_cache_mm(mm);
+                       unmap_region(&vmi.mas, /* vma = */ tmp,
+                                    /* vma_start = */ 0, /* vma_end = */ end,
+                                    /* pg_end = */ end, /* prev = */ NULL,
+                                    /* next = */ NULL);
+                       charge = tear_down_vmas(mm, &vmi, tmp, end);
+                       vm_unacct_memory(charge);
+               }
+               __mt_destroy(&mm->mm_mt);
                /*
                 * The mm_struct is going to exit, but the locks will be dropped
                 * first.  Set the mm_struct as unstable is advisable as it is