xref: /openbmc/linux/mm/mremap.c (revision cf21eb6a)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	mm/mremap.c
4  *
5  *	(C) Copyright 1996 Linus Torvalds
6  *
7  *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
8  *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/mm_inline.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/ksm.h>
16 #include <linux/mman.h>
17 #include <linux/swap.h>
18 #include <linux/capability.h>
19 #include <linux/fs.h>
20 #include <linux/swapops.h>
21 #include <linux/highmem.h>
22 #include <linux/security.h>
23 #include <linux/syscalls.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/uaccess.h>
26 #include <linux/userfaultfd_k.h>
27 #include <linux/mempolicy.h>
28 
29 #include <asm/cacheflush.h>
30 #include <asm/tlb.h>
31 #include <asm/pgalloc.h>
32 
33 #include "internal.h"
34 
35 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
36 {
37 	pgd_t *pgd;
38 	p4d_t *p4d;
39 	pud_t *pud;
40 
41 	pgd = pgd_offset(mm, addr);
42 	if (pgd_none_or_clear_bad(pgd))
43 		return NULL;
44 
45 	p4d = p4d_offset(pgd, addr);
46 	if (p4d_none_or_clear_bad(p4d))
47 		return NULL;
48 
49 	pud = pud_offset(p4d, addr);
50 	if (pud_none_or_clear_bad(pud))
51 		return NULL;
52 
53 	return pud;
54 }
55 
56 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
57 {
58 	pud_t *pud;
59 	pmd_t *pmd;
60 
61 	pud = get_old_pud(mm, addr);
62 	if (!pud)
63 		return NULL;
64 
65 	pmd = pmd_offset(pud, addr);
66 	if (pmd_none(*pmd))
67 		return NULL;
68 
69 	return pmd;
70 }
71 
72 static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
73 			    unsigned long addr)
74 {
75 	pgd_t *pgd;
76 	p4d_t *p4d;
77 
78 	pgd = pgd_offset(mm, addr);
79 	p4d = p4d_alloc(mm, pgd, addr);
80 	if (!p4d)
81 		return NULL;
82 
83 	return pud_alloc(mm, p4d, addr);
84 }
85 
86 static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
87 			    unsigned long addr)
88 {
89 	pud_t *pud;
90 	pmd_t *pmd;
91 
92 	pud = alloc_new_pud(mm, vma, addr);
93 	if (!pud)
94 		return NULL;
95 
96 	pmd = pmd_alloc(mm, pud, addr);
97 	if (!pmd)
98 		return NULL;
99 
100 	VM_BUG_ON(pmd_trans_huge(*pmd));
101 
102 	return pmd;
103 }
104 
105 static void take_rmap_locks(struct vm_area_struct *vma)
106 {
107 	if (vma->vm_file)
108 		i_mmap_lock_write(vma->vm_file->f_mapping);
109 	if (vma->anon_vma)
110 		anon_vma_lock_write(vma->anon_vma);
111 }
112 
113 static void drop_rmap_locks(struct vm_area_struct *vma)
114 {
115 	if (vma->anon_vma)
116 		anon_vma_unlock_write(vma->anon_vma);
117 	if (vma->vm_file)
118 		i_mmap_unlock_write(vma->vm_file->f_mapping);
119 }
120 
121 static pte_t move_soft_dirty_pte(pte_t pte)
122 {
123 	/*
124 	 * Set soft dirty bit so we can notice
125 	 * in userspace the ptes were moved.
126 	 */
127 #ifdef CONFIG_MEM_SOFT_DIRTY
128 	if (pte_present(pte))
129 		pte = pte_mksoft_dirty(pte);
130 	else if (is_swap_pte(pte))
131 		pte = pte_swp_mksoft_dirty(pte);
132 #endif
133 	return pte;
134 }
135 
/*
 * move_ptes - move individual PTEs from one page table to another
 * @vma:		source VMA; its mm and rmap locks are the ones used here
 * @old_pmd:		PMD entry covering the source range
 * @old_addr:		start of the source range
 * @old_end:		end (exclusive) of the source range
 * @new_vma:		destination VMA; its vm_page_prot is handed to move_pte()
 * @new_pmd:		PMD entry covering the destination range
 * @new_addr:		start of the destination range
 * @need_rmap_locks:	take the rmap locks of @vma around the whole move
 *
 * Every non-none PTE in [@old_addr, @old_end) is cleared from the source
 * table and installed at the same offset from @new_addr in the destination
 * table, with the soft-dirty bit applied by move_soft_dirty_pte().
 *
 * Returns 0 on success, or -EAGAIN if either PTE table could not be mapped;
 * in that case no PTE has been moved.
 */
static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr, bool need_rmap_locks)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	/*
	 * Remember the length up front: old_addr is advanced by the loop, and
	 * the TLB flush below recovers the original start as old_end - len.
	 */
	unsigned long len = old_end - old_addr;
	int err = 0;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	if (!old_pte) {
		err = -EAGAIN;
		goto out;
	}
	/* Map the destination table without locking; its lock is taken below. */
	new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
	if (!new_pte) {
		pte_unmap_unlock(old_pte, old_ptl);
		err = -EAGAIN;
		goto out;
	}
	/* Source and destination may share one lock; only nest if they differ. */
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		/* Nothing mapped here, nothing to move. */
		if (pte_none(ptep_get(old_pte)))
			continue;

		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with page_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(pte))
			force_flush = true;
		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);
		set_pte_at(mm, new_addr, new_pte, pte);
	}

	arch_leave_lazy_mmu_mode();
	/* Flush the whole source range while both page table locks are held. */
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	/*
	 * The loop left both pointers one entry past the last one visited, so
	 * step back by one before handing them to the unmap helpers.
	 */
	pte_unmap(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
out:
	if (need_rmap_locks)
		drop_rmap_locks(vma);
	return err;
}
225 
226 #ifndef arch_supports_page_table_move
227 #define arch_supports_page_table_move arch_supports_page_table_move
228 static inline bool arch_supports_page_table_move(void)
229 {
230 	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
231 		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
232 }
233 #endif
234 
235 #ifdef CONFIG_HAVE_MOVE_PMD
236 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
237 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
238 {
239 	spinlock_t *old_ptl, *new_ptl;
240 	struct mm_struct *mm = vma->vm_mm;
241 	pmd_t pmd;
242 
243 	if (!arch_supports_page_table_move())
244 		return false;
245 	/*
246 	 * The destination pmd shouldn't be established, free_pgtables()
247 	 * should have released it.
248 	 *
249 	 * However, there's a case during execve() where we use mremap
250 	 * to move the initial stack, and in that case the target area
251 	 * may overlap the source area (always moving down).
252 	 *
253 	 * If everything is PMD-aligned, that works fine, as moving
254 	 * each pmd down will clear the source pmd. But if we first
255 	 * have a few 4kB-only pages that get moved down, and then
256 	 * hit the "now the rest is PMD-aligned, let's do everything
257 	 * one pmd at a time", we will still have the old (now empty
258 	 * of any 4kB pages, but still there) PMD in the page table
259 	 * tree.
260 	 *
261 	 * Warn on it once - because we really should try to figure
262 	 * out how to do this better - but then say "I won't move
263 	 * this pmd".
264 	 *
265 	 * One alternative might be to just unmap the target pmd at
266 	 * this point, and verify that it really is empty. We'll see.
267 	 */
268 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
269 		return false;
270 
271 	/*
272 	 * We don't have to worry about the ordering of src and dst
273 	 * ptlocks because exclusive mmap_lock prevents deadlock.
274 	 */
275 	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
276 	new_ptl = pmd_lockptr(mm, new_pmd);
277 	if (new_ptl != old_ptl)
278 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
279 
280 	/* Clear the pmd */
281 	pmd = *old_pmd;
282 	pmd_clear(old_pmd);
283 
284 	VM_BUG_ON(!pmd_none(*new_pmd));
285 
286 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
287 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
288 	if (new_ptl != old_ptl)
289 		spin_unlock(new_ptl);
290 	spin_unlock(old_ptl);
291 
292 	return true;
293 }
294 #else
295 static inline bool move_normal_pmd(struct vm_area_struct *vma,
296 		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
297 		pmd_t *new_pmd)
298 {
299 	return false;
300 }
301 #endif
302 
303 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
304 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
305 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
306 {
307 	spinlock_t *old_ptl, *new_ptl;
308 	struct mm_struct *mm = vma->vm_mm;
309 	pud_t pud;
310 
311 	if (!arch_supports_page_table_move())
312 		return false;
313 	/*
314 	 * The destination pud shouldn't be established, free_pgtables()
315 	 * should have released it.
316 	 */
317 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
318 		return false;
319 
320 	/*
321 	 * We don't have to worry about the ordering of src and dst
322 	 * ptlocks because exclusive mmap_lock prevents deadlock.
323 	 */
324 	old_ptl = pud_lock(vma->vm_mm, old_pud);
325 	new_ptl = pud_lockptr(mm, new_pud);
326 	if (new_ptl != old_ptl)
327 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
328 
329 	/* Clear the pud */
330 	pud = *old_pud;
331 	pud_clear(old_pud);
332 
333 	VM_BUG_ON(!pud_none(*new_pud));
334 
335 	pud_populate(mm, new_pud, pud_pgtable(pud));
336 	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
337 	if (new_ptl != old_ptl)
338 		spin_unlock(new_ptl);
339 	spin_unlock(old_ptl);
340 
341 	return true;
342 }
343 #else
344 static inline bool move_normal_pud(struct vm_area_struct *vma,
345 		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
346 		pud_t *new_pud)
347 {
348 	return false;
349 }
350 #endif
351 
352 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
353 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
354 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
355 {
356 	spinlock_t *old_ptl, *new_ptl;
357 	struct mm_struct *mm = vma->vm_mm;
358 	pud_t pud;
359 
360 	/*
361 	 * The destination pud shouldn't be established, free_pgtables()
362 	 * should have released it.
363 	 */
364 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
365 		return false;
366 
367 	/*
368 	 * We don't have to worry about the ordering of src and dst
369 	 * ptlocks because exclusive mmap_lock prevents deadlock.
370 	 */
371 	old_ptl = pud_lock(vma->vm_mm, old_pud);
372 	new_ptl = pud_lockptr(mm, new_pud);
373 	if (new_ptl != old_ptl)
374 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
375 
376 	/* Clear the pud */
377 	pud = *old_pud;
378 	pud_clear(old_pud);
379 
380 	VM_BUG_ON(!pud_none(*new_pud));
381 
382 	/* Set the new pud */
383 	/* mark soft_ditry when we add pud level soft dirty support */
384 	set_pud_at(mm, new_addr, new_pud, pud);
385 	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
386 	if (new_ptl != old_ptl)
387 		spin_unlock(new_ptl);
388 	spin_unlock(old_ptl);
389 
390 	return true;
391 }
392 #else
393 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
394 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
395 {
396 	WARN_ON_ONCE(1);
397 	return false;
398 
399 }
400 #endif
401 
/* Kind of page-table entry that move_pgt_entry() is asked to move. */
enum pgt_entry {
	NORMAL_PMD = 0,	/* handled by move_normal_pmd() */
	HPAGE_PMD  = 1,	/* handled by move_huge_pmd() */
	NORMAL_PUD = 2,	/* handled by move_normal_pud() */
	HPAGE_PUD  = 3,	/* handled by move_huge_pud() */
};
408 
409 /*
410  * Returns an extent of the corresponding size for the pgt_entry specified if
411  * valid. Else returns a smaller extent bounded by the end of the source and
412  * destination pgt_entry.
413  */
414 static __always_inline unsigned long get_extent(enum pgt_entry entry,
415 			unsigned long old_addr, unsigned long old_end,
416 			unsigned long new_addr)
417 {
418 	unsigned long next, extent, mask, size;
419 
420 	switch (entry) {
421 	case HPAGE_PMD:
422 	case NORMAL_PMD:
423 		mask = PMD_MASK;
424 		size = PMD_SIZE;
425 		break;
426 	case HPAGE_PUD:
427 	case NORMAL_PUD:
428 		mask = PUD_MASK;
429 		size = PUD_SIZE;
430 		break;
431 	default:
432 		BUILD_BUG();
433 		break;
434 	}
435 
436 	next = (old_addr + size) & mask;
437 	/* even if next overflowed, extent below will be ok */
438 	extent = next - old_addr;
439 	if (extent > old_end - old_addr)
440 		extent = old_end - old_addr;
441 	next = (new_addr + size) & mask;
442 	if (extent > next - new_addr)
443 		extent = next - new_addr;
444 	return extent;
445 }
446 
447 /*
448  * Attempts to speedup the move by moving entry at the level corresponding to
449  * pgt_entry. Returns true if the move was successful, else false.
450  */
451 static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
452 			unsigned long old_addr, unsigned long new_addr,
453 			void *old_entry, void *new_entry, bool need_rmap_locks)
454 {
455 	bool moved = false;
456 
457 	/* See comment in move_ptes() */
458 	if (need_rmap_locks)
459 		take_rmap_locks(vma);
460 
461 	switch (entry) {
462 	case NORMAL_PMD:
463 		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
464 					new_entry);
465 		break;
466 	case NORMAL_PUD:
467 		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
468 					new_entry);
469 		break;
470 	case HPAGE_PMD:
471 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
472 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
473 				      new_entry);
474 		break;
475 	case HPAGE_PUD:
476 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
477 			move_huge_pud(vma, old_addr, new_addr, old_entry,
478 				      new_entry);
479 		break;
480 
481 	default:
482 		WARN_ON_ONCE(1);
483 		break;
484 	}
485 
486 	if (need_rmap_locks)
487 		drop_rmap_locks(vma);
488 
489 	return moved;
490 }
491 
/*
 * move_page_tables - move the page-table entries of an address range
 * @vma:		VMA containing the source range
 * @old_addr:		start of the source range
 * @new_vma:		VMA containing the destination range
 * @new_addr:		start of the destination range
 * @len:		number of bytes to move
 * @need_rmap_locks:	passed down to move_pgt_entry()/move_ptes() for the
 *			huge-entry and PTE-level moves
 *
 * The range is processed in extents computed by get_extent().  For each
 * extent the cheapest applicable method is tried first: a PUD-level move,
 * then a PMD-level move, and finally a PTE-by-PTE move via move_ptes().
 * hugetlb VMAs are handed to move_hugetlb_page_tables() instead.
 *
 * Returns the number of bytes processed: @len if the loop ran to the end of
 * the range, less if a page-table allocation failed part way through, and 0
 * if @len is 0.
 */
unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks)
{
	unsigned long extent, old_end;
	struct mmu_notifier_range range;
	pmd_t *old_pmd, *new_pmd;
	pud_t *old_pud, *new_pud;

	if (!len)
		return 0;

	old_end = old_addr + len;

	if (is_vm_hugetlb_page(vma))
		return move_hugetlb_page_tables(vma, new_vma, old_addr,
						new_addr, len);

	flush_cache_range(vma, old_addr, old_end);
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
				old_addr, old_end);
	mmu_notifier_invalidate_range_start(&range);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		/*
		 * If extent is PUD-sized try to speed up the move by moving at the
		 * PUD level if possible.
		 */
		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);

		/* No usable source PUD: skip this extent. */
		old_pud = get_old_pud(vma->vm_mm, old_addr);
		if (!old_pud)
			continue;
		/* Allocation failure ends the move early; see the return value. */
		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
		if (!new_pud)
			break;
		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
			if (extent == HPAGE_PUD_SIZE) {
				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
					       old_pud, new_pud, need_rmap_locks);
				/* We ignore and continue on error? */
				continue;
			}
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {

			/* Note: the rmap locks are always requested for this move. */
			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
					   old_pud, new_pud, true))
				continue;
		}

		/* The PUD-level move was not possible: retry with a PMD-sized extent. */
		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
		if (!new_pmd)
			break;
again:
		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
		    pmd_devmap(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE &&
			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, need_rmap_locks))
				continue;
			/* Could not move it whole: split and fall through to the PTEs. */
			split_huge_pmd(vma, old_pmd, old_addr);
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
			   extent == PMD_SIZE) {
			/*
			 * If the extent is PMD-sized, try to speed the move by
			 * moving at the PMD level if possible.
			 */
			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, true))
				continue;
		}
		if (pmd_none(*old_pmd))
			continue;
		if (pte_alloc(new_vma->vm_mm, new_pmd))
			break;
		/*
		 * move_ptes() reports failure (-EAGAIN) when it could not map
		 * one of the PTE tables; re-examine the pmd and try again.
		 */
		if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
			      new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
			goto again;
	}

	mmu_notifier_invalidate_range_end(&range);

	/*
	 * old_addr has advanced by the amount processed, so this is
	 * len - (bytes not reached).
	 */
	return len + old_addr - old_end;	/* how much done */
}
582 
/*
 * move_vma - move (part of) a mapping to a new address
 * @vma:	VMA containing the source range
 * @old_addr:	start of the source range
 * @old_len:	length of the source range
 * @new_len:	length of the new mapping
 * @new_addr:	start address of the new mapping
 * @locked:	set to true when the mapping is VM_LOCKED and the old range
 *		was unmapped, so the caller can populate the new range
 * @flags:	mremap() flags; only MREMAP_DONTUNMAP is examined here
 * @uf:		userfaultfd context, prepared via mremap_userfaultfd_prep()
 *		on success
 * @uf_unmap:	list passed to do_vmi_munmap() for userfaultfd unmap events
 *
 * A new VMA is set up with copy_vma(), the page tables are moved over with
 * move_page_tables(), and then the old range is unmapped (unless
 * MREMAP_DONTUNMAP was requested).  If moving fails, the entries are moved
 * back and the new range is unmapped instead.
 *
 * Returns @new_addr on success, or a negative errno value (carried in the
 * unsigned long return) on failure.
 */
static unsigned long move_vma(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_len, unsigned long new_addr,
		bool *locked, unsigned long flags,
		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
	/* Bytes to charge: only the growth, unless the old range is kept. */
	long to_account = new_len - old_len;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	/* Snapshot of the flags; vma itself may be replaced further down. */
	unsigned long vm_flags = vma->vm_flags;
	unsigned long new_pgoff;
	unsigned long moved_len;
	/* Non-zero when a piece of the VMA remains before/after the old range. */
	unsigned long account_start = 0;
	unsigned long account_end = 0;
	unsigned long hiwater_vm;
	int err = 0;
	bool need_rmap_locks;
	struct vma_iterator vmi;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	/* With MREMAP_DONTUNMAP the old range stays, so charge the full new_len. */
	if (unlikely(flags & MREMAP_DONTUNMAP))
		to_account = new_len;

	/* Ask the VMA whether it may be split at each edge that is not already a VMA boundary. */
	if (vma->vm_ops && vma->vm_ops->may_split) {
		if (vma->vm_start != old_addr)
			err = vma->vm_ops->may_split(vma, old_addr);
		if (!err && vma->vm_end != old_addr + old_len)
			err = vma->vm_ops->may_split(vma, old_addr + old_len);
		if (err)
			return err;
	}

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped.  But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
						MADV_UNMERGEABLE, &vm_flags);
	if (err)
		return err;

	if (vm_flags & VM_ACCOUNT) {
		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
			return -ENOMEM;
	}

	vma_start_write(vma);
	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
	/* copy_vma() receives &vma and may update it. */
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma) {
		/* Give back the charge taken above. */
		if (vm_flags & VM_ACCOUNT)
			vm_unacct_memory(to_account >> PAGE_SHIFT);
		return -ENOMEM;
	}

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				     need_rmap_locks);
	if (moved_len < old_len) {
		err = -ENOMEM;
	} else if (vma->vm_ops && vma->vm_ops->mremap) {
		err = vma->vm_ops->mremap(new_vma);
	}

	if (unlikely(err)) {
		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
				 true);
		/*
		 * Swap roles: from here on "old" refers to the new area, which
		 * is the one that gets unmapped, and the error code becomes
		 * the return value.
		 */
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = err;
	} else {
		mremap_userfaultfd_prep(new_vma, uf);
	}

	if (is_vm_hugetlb_page(vma)) {
		clear_vma_resv_huge_pages(vma);
	}

	/* Conceal VM_ACCOUNT so old reservation is not undone */
	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
		vm_flags_clear(vma, VM_ACCOUNT);
		if (vma->vm_start < old_addr)
			account_start = vma->vm_start;
		if (vma->vm_end > old_addr + old_len)
			account_end = vma->vm_end;
	}

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;
	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

	/* Tell pfnmap has moved from this vma */
	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn_clear(vma);

	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
		vm_flags_clear(vma, VM_LOCKED_MASK);

		/*
		 * anon_vma links of the old vma is no longer needed after its page
		 * table has been moved.
		 */
		if (new_vma != vma && vma->vm_start == old_addr &&
			vma->vm_end == (old_addr + old_len))
			unlink_anon_vmas(vma);

		/* Because we won't unmap we don't need to touch locked_vm */
		return new_addr;
	}

	vma_iter_init(&vmi, mm, old_addr);
	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
			vm_acct_memory(old_len >> PAGE_SHIFT);
		/* The VMA was not split, so there are no leftover pieces to fix up. */
		account_start = account_end = 0;
	}

	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += new_len >> PAGE_SHIFT;
		*locked = true;
	}

	/* Undo the temporary bump of the high watermark described above. */
	mm->hiwater_vm = hiwater_vm;

	/* Restore VM_ACCOUNT if one or two pieces of vma left */
	if (account_start) {
		vma = vma_prev(&vmi);
		vm_flags_set(vma, VM_ACCOUNT);
	}

	if (account_end) {
		vma = vma_next(&vmi);
		vm_flags_set(vma, VM_ACCOUNT);
	}

	return new_addr;
}
745 
746 static struct vm_area_struct *vma_to_resize(unsigned long addr,
747 	unsigned long old_len, unsigned long new_len, unsigned long flags)
748 {
749 	struct mm_struct *mm = current->mm;
750 	struct vm_area_struct *vma;
751 	unsigned long pgoff;
752 
753 	vma = vma_lookup(mm, addr);
754 	if (!vma)
755 		return ERR_PTR(-EFAULT);
756 
757 	/*
758 	 * !old_len is a special case where an attempt is made to 'duplicate'
759 	 * a mapping.  This makes no sense for private mappings as it will
760 	 * instead create a fresh/new mapping unrelated to the original.  This
761 	 * is contrary to the basic idea of mremap which creates new mappings
762 	 * based on the original.  There are no known use cases for this
763 	 * behavior.  As a result, fail such attempts.
764 	 */
765 	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
766 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
767 		return ERR_PTR(-EINVAL);
768 	}
769 
770 	if ((flags & MREMAP_DONTUNMAP) &&
771 			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
772 		return ERR_PTR(-EINVAL);
773 
774 	/* We can't remap across vm area boundaries */
775 	if (old_len > vma->vm_end - addr)
776 		return ERR_PTR(-EFAULT);
777 
778 	if (new_len == old_len)
779 		return vma;
780 
781 	/* Need to be careful about a growing mapping */
782 	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
783 	pgoff += vma->vm_pgoff;
784 	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
785 		return ERR_PTR(-EINVAL);
786 
787 	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
788 		return ERR_PTR(-EFAULT);
789 
790 	if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
791 		return ERR_PTR(-EAGAIN);
792 
793 	if (!may_expand_vm(mm, vma->vm_flags,
794 				(new_len - old_len) >> PAGE_SHIFT))
795 		return ERR_PTR(-ENOMEM);
796 
797 	return vma;
798 }
799 
/*
 * mremap_to - remap a range to a (possibly caller-chosen) new address
 * @addr:		start of the existing range
 * @old_len:		length of the existing range
 * @new_addr:		requested destination; must be page aligned
 * @new_len:		length of the new mapping
 * @locked:		passed through to move_vma()
 * @flags:		mremap() flags; MREMAP_FIXED and MREMAP_DONTUNMAP are
 *			examined here
 * @uf:			userfaultfd context passed through to move_vma()
 * @uf_unmap_early:	list for the unmap of the destination range
 * @uf_unmap:		list for the unmap of the tail of a shrinking source
 *			range, and for move_vma()
 *
 * Used by the mremap() syscall when MREMAP_FIXED or MREMAP_DONTUNMAP is set.
 * With MREMAP_FIXED, anything mapped at the destination is unmapped first.
 * If the mapping shrinks, the excess tail of the source is unmapped before
 * the move.
 *
 * Returns the new address on success, or a negative errno value (carried in
 * the unsigned long return) on failure.
 */
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
		unsigned long new_addr, unsigned long new_len, bool *locked,
		unsigned long flags, struct vm_userfaultfd_ctx *uf,
		struct list_head *uf_unmap_early,
		struct list_head *uf_unmap)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long map_flags = 0;

	if (offset_in_page(new_addr))
		goto out;

	/* The destination range must fit below TASK_SIZE (written to avoid overflow). */
	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
		goto out;

	/* Ensure the old/new locations do not overlap */
	if (addr + old_len > new_addr && new_addr + new_len > addr)
		goto out;

	/*
	 * move_vma() need us to stay 4 maps below the threshold, otherwise
	 * it will bail out at the very beginning.
	 * That is a problem if we have already unmaped the regions here
	 * (new_addr, and old_addr), because userspace will not know the
	 * state of the vma's after it gets -ENOMEM.
	 * So, to avoid such scenario we can pre-compute if the whole
	 * operation has high chances to success map-wise.
	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
	 * split in 3 before unmapping it.
	 * That means 2 more maps (1 for each) to the ones we already hold.
	 * Check whether current map count plus 2 still leads us to 4 maps below
	 * the threshold, otherwise return -ENOMEM here to be more safe.
	 */
	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
		return -ENOMEM;

	/* MREMAP_FIXED: clear out whatever currently occupies the destination. */
	if (flags & MREMAP_FIXED) {
		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
		if (ret)
			goto out;
	}

	/* Shrinking: drop the tail of the source now, then move equal lengths. */
	if (old_len > new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
		if (ret)
			goto out;
		old_len = new_len;
	}

	vma = vma_to_resize(addr, old_len, new_len, flags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
	if (flags & MREMAP_DONTUNMAP &&
		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
		ret = -ENOMEM;
		goto out;
	}

	if (flags & MREMAP_FIXED)
		map_flags |= MAP_FIXED;

	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
	if (IS_ERR_VALUE(ret))
		goto out;

	/* We got a new mapping */
	/* Without MREMAP_FIXED, new_addr was only a hint: use the address found. */
	if (!(flags & MREMAP_FIXED))
		new_addr = ret;

	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
		       uf_unmap);

out:
	return ret;
}
886 
887 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
888 {
889 	unsigned long end = vma->vm_end + delta;
890 
891 	if (end < vma->vm_end) /* overflow */
892 		return 0;
893 	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
894 		return 0;
895 	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
896 			      0, MAP_FIXED) & ~PAGE_MASK)
897 		return 0;
898 	return 1;
899 }
900 
901 /*
902  * Expand (or shrink) an existing mapping, potentially moving it at the
903  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
904  *
905  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
906  * This option implies MREMAP_MAYMOVE.
907  */
908 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
909 		unsigned long, new_len, unsigned long, flags,
910 		unsigned long, new_addr)
911 {
912 	struct mm_struct *mm = current->mm;
913 	struct vm_area_struct *vma;
914 	unsigned long ret = -EINVAL;
915 	bool locked = false;
916 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
917 	LIST_HEAD(uf_unmap_early);
918 	LIST_HEAD(uf_unmap);
919 
920 	/*
921 	 * There is a deliberate asymmetry here: we strip the pointer tag
922 	 * from the old address but leave the new address alone. This is
923 	 * for consistency with mmap(), where we prevent the creation of
924 	 * aliasing mappings in userspace by leaving the tag bits of the
925 	 * mapping address intact. A non-zero tag will cause the subsequent
926 	 * range checks to reject the address as invalid.
927 	 *
928 	 * See Documentation/arch/arm64/tagged-address-abi.rst for more
929 	 * information.
930 	 */
931 	addr = untagged_addr(addr);
932 
933 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
934 		return ret;
935 
936 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
937 		return ret;
938 
939 	/*
940 	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
941 	 * in the process.
942 	 */
943 	if (flags & MREMAP_DONTUNMAP &&
944 			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
945 		return ret;
946 
947 
948 	if (offset_in_page(addr))
949 		return ret;
950 
951 	old_len = PAGE_ALIGN(old_len);
952 	new_len = PAGE_ALIGN(new_len);
953 
954 	/*
955 	 * We allow a zero old-len as a special case
956 	 * for DOS-emu "duplicate shm area" thing. But
957 	 * a zero new-len is nonsensical.
958 	 */
959 	if (!new_len)
960 		return ret;
961 
962 	if (mmap_write_lock_killable(current->mm))
963 		return -EINTR;
964 	vma = vma_lookup(mm, addr);
965 	if (!vma) {
966 		ret = -EFAULT;
967 		goto out;
968 	}
969 
970 	if (is_vm_hugetlb_page(vma)) {
971 		struct hstate *h __maybe_unused = hstate_vma(vma);
972 
973 		old_len = ALIGN(old_len, huge_page_size(h));
974 		new_len = ALIGN(new_len, huge_page_size(h));
975 
976 		/* addrs must be huge page aligned */
977 		if (addr & ~huge_page_mask(h))
978 			goto out;
979 		if (new_addr & ~huge_page_mask(h))
980 			goto out;
981 
982 		/*
983 		 * Don't allow remap expansion, because the underlying hugetlb
984 		 * reservation is not yet capable to handle split reservation.
985 		 */
986 		if (new_len > old_len)
987 			goto out;
988 	}
989 
990 	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
991 		ret = mremap_to(addr, old_len, new_addr, new_len,
992 				&locked, flags, &uf, &uf_unmap_early,
993 				&uf_unmap);
994 		goto out;
995 	}
996 
997 	/*
998 	 * Always allow a shrinking remap: that just unmaps
999 	 * the unnecessary pages..
1000 	 * do_vmi_munmap does all the needed commit accounting, and
1001 	 * unlocks the mmap_lock if so directed.
1002 	 */
1003 	if (old_len >= new_len) {
1004 		VMA_ITERATOR(vmi, mm, addr + new_len);
1005 
1006 		if (old_len == new_len) {
1007 			ret = addr;
1008 			goto out;
1009 		}
1010 
1011 		ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
1012 				    &uf_unmap, true);
1013 		if (ret)
1014 			goto out;
1015 
1016 		ret = addr;
1017 		goto out_unlocked;
1018 	}
1019 
1020 	/*
1021 	 * Ok, we need to grow..
1022 	 */
1023 	vma = vma_to_resize(addr, old_len, new_len, flags);
1024 	if (IS_ERR(vma)) {
1025 		ret = PTR_ERR(vma);
1026 		goto out;
1027 	}
1028 
1029 	/* old_len exactly to the end of the area..
1030 	 */
1031 	if (old_len == vma->vm_end - addr) {
1032 		/* can we just expand the current mapping? */
1033 		if (vma_expandable(vma, new_len - old_len)) {
1034 			long pages = (new_len - old_len) >> PAGE_SHIFT;
1035 			unsigned long extension_start = addr + old_len;
1036 			unsigned long extension_end = addr + new_len;
1037 			pgoff_t extension_pgoff = vma->vm_pgoff +
1038 				((extension_start - vma->vm_start) >> PAGE_SHIFT);
1039 			VMA_ITERATOR(vmi, mm, extension_start);
1040 
1041 			if (vma->vm_flags & VM_ACCOUNT) {
1042 				if (security_vm_enough_memory_mm(mm, pages)) {
1043 					ret = -ENOMEM;
1044 					goto out;
1045 				}
1046 			}
1047 
1048 			/*
1049 			 * Function vma_merge() is called on the extension we
1050 			 * are adding to the already existing vma, vma_merge()
1051 			 * will merge this extension with the already existing
1052 			 * vma (expand operation itself) and possibly also with
1053 			 * the next vma if it becomes adjacent to the expanded
1054 			 * vma and  otherwise compatible.
1055 			 */
1056 			vma = vma_merge(&vmi, mm, vma, extension_start,
1057 				extension_end, vma->vm_flags, vma->anon_vma,
1058 				vma->vm_file, extension_pgoff, vma_policy(vma),
1059 				vma->vm_userfaultfd_ctx, anon_vma_name(vma));
1060 			if (!vma) {
1061 				vm_unacct_memory(pages);
1062 				ret = -ENOMEM;
1063 				goto out;
1064 			}
1065 
1066 			vm_stat_account(mm, vma->vm_flags, pages);
1067 			if (vma->vm_flags & VM_LOCKED) {
1068 				mm->locked_vm += pages;
1069 				locked = true;
1070 				new_addr = addr;
1071 			}
1072 			ret = addr;
1073 			goto out;
1074 		}
1075 	}
1076 
1077 	/*
1078 	 * We weren't able to just expand or shrink the area,
1079 	 * we need to create a new one and move it..
1080 	 */
1081 	ret = -ENOMEM;
1082 	if (flags & MREMAP_MAYMOVE) {
1083 		unsigned long map_flags = 0;
1084 		if (vma->vm_flags & VM_MAYSHARE)
1085 			map_flags |= MAP_SHARED;
1086 
1087 		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1088 					vma->vm_pgoff +
1089 					((addr - vma->vm_start) >> PAGE_SHIFT),
1090 					map_flags);
1091 		if (IS_ERR_VALUE(new_addr)) {
1092 			ret = new_addr;
1093 			goto out;
1094 		}
1095 
1096 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
1097 			       &locked, flags, &uf, &uf_unmap);
1098 	}
1099 out:
1100 	if (offset_in_page(ret))
1101 		locked = false;
1102 	mmap_write_unlock(current->mm);
1103 	if (locked && new_len > old_len)
1104 		mm_populate(new_addr + old_len, new_len - old_len);
1105 out_unlocked:
1106 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
1107 	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1108 	userfaultfd_unmap_complete(mm, &uf_unmap);
1109 	return ret;
1110 }
1111