xref: /openbmc/linux/mm/mremap.c (revision 248ed9e227e6cf59acb1aaf3aa30d530a0232c1a)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	mm/mremap.c
4  *
5  *	(C) Copyright 1996 Linus Torvalds
6  *
7  *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
8  *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/mm_inline.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/ksm.h>
16 #include <linux/mman.h>
17 #include <linux/swap.h>
18 #include <linux/capability.h>
19 #include <linux/fs.h>
20 #include <linux/swapops.h>
21 #include <linux/highmem.h>
22 #include <linux/security.h>
23 #include <linux/syscalls.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/uaccess.h>
26 #include <linux/userfaultfd_k.h>
27 #include <linux/mempolicy.h>
28 
29 #include <asm/cacheflush.h>
30 #include <asm/tlb.h>
31 #include <asm/pgalloc.h>
32 
33 #include "internal.h"
34 
35 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
36 {
37 	pgd_t *pgd;
38 	p4d_t *p4d;
39 	pud_t *pud;
40 
41 	pgd = pgd_offset(mm, addr);
42 	if (pgd_none_or_clear_bad(pgd))
43 		return NULL;
44 
45 	p4d = p4d_offset(pgd, addr);
46 	if (p4d_none_or_clear_bad(p4d))
47 		return NULL;
48 
49 	pud = pud_offset(p4d, addr);
50 	if (pud_none_or_clear_bad(pud))
51 		return NULL;
52 
53 	return pud;
54 }
55 
56 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
57 {
58 	pud_t *pud;
59 	pmd_t *pmd;
60 
61 	pud = get_old_pud(mm, addr);
62 	if (!pud)
63 		return NULL;
64 
65 	pmd = pmd_offset(pud, addr);
66 	if (pmd_none(*pmd))
67 		return NULL;
68 
69 	return pmd;
70 }
71 
72 static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
73 			    unsigned long addr)
74 {
75 	pgd_t *pgd;
76 	p4d_t *p4d;
77 
78 	pgd = pgd_offset(mm, addr);
79 	p4d = p4d_alloc(mm, pgd, addr);
80 	if (!p4d)
81 		return NULL;
82 
83 	return pud_alloc(mm, p4d, addr);
84 }
85 
86 static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
87 			    unsigned long addr)
88 {
89 	pud_t *pud;
90 	pmd_t *pmd;
91 
92 	pud = alloc_new_pud(mm, vma, addr);
93 	if (!pud)
94 		return NULL;
95 
96 	pmd = pmd_alloc(mm, pud, addr);
97 	if (!pmd)
98 		return NULL;
99 
100 	VM_BUG_ON(pmd_trans_huge(*pmd));
101 
102 	return pmd;
103 }
104 
105 static void take_rmap_locks(struct vm_area_struct *vma)
106 {
107 	if (vma->vm_file)
108 		i_mmap_lock_write(vma->vm_file->f_mapping);
109 	if (vma->anon_vma)
110 		anon_vma_lock_write(vma->anon_vma);
111 }
112 
113 static void drop_rmap_locks(struct vm_area_struct *vma)
114 {
115 	if (vma->anon_vma)
116 		anon_vma_unlock_write(vma->anon_vma);
117 	if (vma->vm_file)
118 		i_mmap_unlock_write(vma->vm_file->f_mapping);
119 }
120 
121 static pte_t move_soft_dirty_pte(pte_t pte)
122 {
123 	/*
124 	 * Set soft dirty bit so we can notice
125 	 * in userspace the ptes were moved.
126 	 */
127 #ifdef CONFIG_MEM_SOFT_DIRTY
128 	if (pte_present(pte))
129 		pte = pte_mksoft_dirty(pte);
130 	else if (is_swap_pte(pte))
131 		pte = pte_swp_mksoft_dirty(pte);
132 #endif
133 	return pte;
134 }
135 
136 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
137 		unsigned long old_addr, unsigned long old_end,
138 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
139 		unsigned long new_addr, bool need_rmap_locks)
140 {
141 	struct mm_struct *mm = vma->vm_mm;
142 	pte_t *old_pte, *new_pte, pte;
143 	spinlock_t *old_ptl, *new_ptl;
144 	bool force_flush = false;
145 	unsigned long len = old_end - old_addr;
146 
147 	/*
148 	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
149 	 * locks to ensure that rmap will always observe either the old or the
150 	 * new ptes. This is the easiest way to avoid races with
151 	 * truncate_pagecache(), page migration, etc...
152 	 *
153 	 * When need_rmap_locks is false, we use other ways to avoid
154 	 * such races:
155 	 *
156 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
157 	 *   which rmap call sites look for using vma_is_temporary_stack().
158 	 *
159 	 * - During mremap(), new_vma is often known to be placed after vma
160 	 *   in rmap traversal order. This ensures rmap will always observe
161 	 *   either the old pte, or the new pte, or both (the page table locks
162 	 *   serialize access to individual ptes, but only rmap traversal
163 	 *   order guarantees that we won't miss both the old and new ptes).
164 	 */
165 	if (need_rmap_locks)
166 		take_rmap_locks(vma);
167 
168 	/*
169 	 * We don't have to worry about the ordering of src and dst
170 	 * pte locks because exclusive mmap_lock prevents deadlock.
171 	 */
172 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
173 	new_pte = pte_offset_map(new_pmd, new_addr);
174 	new_ptl = pte_lockptr(mm, new_pmd);
175 	if (new_ptl != old_ptl)
176 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
177 	flush_tlb_batched_pending(vma->vm_mm);
178 	arch_enter_lazy_mmu_mode();
179 
180 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
181 				   new_pte++, new_addr += PAGE_SIZE) {
182 		if (pte_none(*old_pte))
183 			continue;
184 
185 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
186 		/*
187 		 * If we are remapping a valid PTE, make sure
188 		 * to flush TLB before we drop the PTL for the
189 		 * PTE.
190 		 *
191 		 * NOTE! Both old and new PTL matter: the old one
192 		 * for racing with page_mkclean(), the new one to
193 		 * make sure the physical page stays valid until
194 		 * the TLB entry for the old mapping has been
195 		 * flushed.
196 		 */
197 		if (pte_present(pte))
198 			force_flush = true;
199 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
200 		pte = move_soft_dirty_pte(pte);
201 		set_pte_at(mm, new_addr, new_pte, pte);
202 	}
203 
204 	arch_leave_lazy_mmu_mode();
205 	if (force_flush)
206 		flush_tlb_range(vma, old_end - len, old_end);
207 	if (new_ptl != old_ptl)
208 		spin_unlock(new_ptl);
209 	pte_unmap(new_pte - 1);
210 	pte_unmap_unlock(old_pte - 1, old_ptl);
211 	if (need_rmap_locks)
212 		drop_rmap_locks(vma);
213 }
214 
215 #ifndef arch_supports_page_table_move
216 #define arch_supports_page_table_move arch_supports_page_table_move
217 static inline bool arch_supports_page_table_move(void)
218 {
219 	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
220 		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
221 }
222 #endif
223 
224 #ifdef CONFIG_HAVE_MOVE_PMD
225 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
226 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
227 {
228 	spinlock_t *old_ptl, *new_ptl;
229 	struct mm_struct *mm = vma->vm_mm;
230 	pmd_t pmd;
231 
232 	if (!arch_supports_page_table_move())
233 		return false;
234 	/*
235 	 * The destination pmd shouldn't be established, free_pgtables()
236 	 * should have released it.
237 	 *
238 	 * However, there's a case during execve() where we use mremap
239 	 * to move the initial stack, and in that case the target area
240 	 * may overlap the source area (always moving down).
241 	 *
242 	 * If everything is PMD-aligned, that works fine, as moving
243 	 * each pmd down will clear the source pmd. But if we first
244 	 * have a few 4kB-only pages that get moved down, and then
245 	 * hit the "now the rest is PMD-aligned, let's do everything
246 	 * one pmd at a time", we will still have the old (now empty
247 	 * of any 4kB pages, but still there) PMD in the page table
248 	 * tree.
249 	 *
250 	 * Warn on it once - because we really should try to figure
251 	 * out how to do this better - but then say "I won't move
252 	 * this pmd".
253 	 *
254 	 * One alternative might be to just unmap the target pmd at
255 	 * this point, and verify that it really is empty. We'll see.
256 	 */
257 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
258 		return false;
259 
260 	/*
261 	 * We don't have to worry about the ordering of src and dst
262 	 * ptlocks because exclusive mmap_lock prevents deadlock.
263 	 */
264 	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
265 	new_ptl = pmd_lockptr(mm, new_pmd);
266 	if (new_ptl != old_ptl)
267 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
268 
269 	/* Clear the pmd */
270 	pmd = *old_pmd;
271 	pmd_clear(old_pmd);
272 
273 	VM_BUG_ON(!pmd_none(*new_pmd));
274 
275 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
276 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
277 	if (new_ptl != old_ptl)
278 		spin_unlock(new_ptl);
279 	spin_unlock(old_ptl);
280 
281 	return true;
282 }
283 #else
284 static inline bool move_normal_pmd(struct vm_area_struct *vma,
285 		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
286 		pmd_t *new_pmd)
287 {
288 	return false;
289 }
290 #endif
291 
292 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
293 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
294 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
295 {
296 	spinlock_t *old_ptl, *new_ptl;
297 	struct mm_struct *mm = vma->vm_mm;
298 	pud_t pud;
299 
300 	if (!arch_supports_page_table_move())
301 		return false;
302 	/*
303 	 * The destination pud shouldn't be established, free_pgtables()
304 	 * should have released it.
305 	 */
306 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
307 		return false;
308 
309 	/*
310 	 * We don't have to worry about the ordering of src and dst
311 	 * ptlocks because exclusive mmap_lock prevents deadlock.
312 	 */
313 	old_ptl = pud_lock(vma->vm_mm, old_pud);
314 	new_ptl = pud_lockptr(mm, new_pud);
315 	if (new_ptl != old_ptl)
316 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
317 
318 	/* Clear the pud */
319 	pud = *old_pud;
320 	pud_clear(old_pud);
321 
322 	VM_BUG_ON(!pud_none(*new_pud));
323 
324 	pud_populate(mm, new_pud, pud_pgtable(pud));
325 	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
326 	if (new_ptl != old_ptl)
327 		spin_unlock(new_ptl);
328 	spin_unlock(old_ptl);
329 
330 	return true;
331 }
332 #else
333 static inline bool move_normal_pud(struct vm_area_struct *vma,
334 		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
335 		pud_t *new_pud)
336 {
337 	return false;
338 }
339 #endif
340 
341 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
342 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
343 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
344 {
345 	spinlock_t *old_ptl, *new_ptl;
346 	struct mm_struct *mm = vma->vm_mm;
347 	pud_t pud;
348 
349 	/*
350 	 * The destination pud shouldn't be established, free_pgtables()
351 	 * should have released it.
352 	 */
353 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
354 		return false;
355 
356 	/*
357 	 * We don't have to worry about the ordering of src and dst
358 	 * ptlocks because exclusive mmap_lock prevents deadlock.
359 	 */
360 	old_ptl = pud_lock(vma->vm_mm, old_pud);
361 	new_ptl = pud_lockptr(mm, new_pud);
362 	if (new_ptl != old_ptl)
363 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
364 
365 	/* Clear the pud */
366 	pud = *old_pud;
367 	pud_clear(old_pud);
368 
369 	VM_BUG_ON(!pud_none(*new_pud));
370 
371 	/* Set the new pud */
372 	/* mark soft_ditry when we add pud level soft dirty support */
373 	set_pud_at(mm, new_addr, new_pud, pud);
374 	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
375 	if (new_ptl != old_ptl)
376 		spin_unlock(new_ptl);
377 	spin_unlock(old_ptl);
378 
379 	return true;
380 }
381 #else
382 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
383 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
384 {
385 	WARN_ON_ONCE(1);
386 	return false;
387 
388 }
389 #endif
390 
/*
 * Kind of page table entry that get_extent() sizes for and that
 * move_pgt_entry() dispatches on.
 */
enum pgt_entry {
	NORMAL_PMD = 0,	/* pmd pointing to a pte table: move_normal_pmd() */
	HPAGE_PMD  = 1,	/* huge pmd: move_huge_pmd() */
	NORMAL_PUD = 2,	/* pud pointing to a pmd table: move_normal_pud() */
	HPAGE_PUD  = 3,	/* huge pud: move_huge_pud() */
};
397 
398 /*
399  * Returns an extent of the corresponding size for the pgt_entry specified if
400  * valid. Else returns a smaller extent bounded by the end of the source and
401  * destination pgt_entry.
402  */
403 static __always_inline unsigned long get_extent(enum pgt_entry entry,
404 			unsigned long old_addr, unsigned long old_end,
405 			unsigned long new_addr)
406 {
407 	unsigned long next, extent, mask, size;
408 
409 	switch (entry) {
410 	case HPAGE_PMD:
411 	case NORMAL_PMD:
412 		mask = PMD_MASK;
413 		size = PMD_SIZE;
414 		break;
415 	case HPAGE_PUD:
416 	case NORMAL_PUD:
417 		mask = PUD_MASK;
418 		size = PUD_SIZE;
419 		break;
420 	default:
421 		BUILD_BUG();
422 		break;
423 	}
424 
425 	next = (old_addr + size) & mask;
426 	/* even if next overflowed, extent below will be ok */
427 	extent = next - old_addr;
428 	if (extent > old_end - old_addr)
429 		extent = old_end - old_addr;
430 	next = (new_addr + size) & mask;
431 	if (extent > next - new_addr)
432 		extent = next - new_addr;
433 	return extent;
434 }
435 
436 /*
437  * Attempts to speedup the move by moving entry at the level corresponding to
438  * pgt_entry. Returns true if the move was successful, else false.
439  */
440 static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
441 			unsigned long old_addr, unsigned long new_addr,
442 			void *old_entry, void *new_entry, bool need_rmap_locks)
443 {
444 	bool moved = false;
445 
446 	/* See comment in move_ptes() */
447 	if (need_rmap_locks)
448 		take_rmap_locks(vma);
449 
450 	switch (entry) {
451 	case NORMAL_PMD:
452 		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
453 					new_entry);
454 		break;
455 	case NORMAL_PUD:
456 		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
457 					new_entry);
458 		break;
459 	case HPAGE_PMD:
460 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
461 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
462 				      new_entry);
463 		break;
464 	case HPAGE_PUD:
465 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
466 			move_huge_pud(vma, old_addr, new_addr, old_entry,
467 				      new_entry);
468 		break;
469 
470 	default:
471 		WARN_ON_ONCE(1);
472 		break;
473 	}
474 
475 	if (need_rmap_locks)
476 		drop_rmap_locks(vma);
477 
478 	return moved;
479 }
480 
481 unsigned long move_page_tables(struct vm_area_struct *vma,
482 		unsigned long old_addr, struct vm_area_struct *new_vma,
483 		unsigned long new_addr, unsigned long len,
484 		bool need_rmap_locks)
485 {
486 	unsigned long extent, old_end;
487 	struct mmu_notifier_range range;
488 	pmd_t *old_pmd, *new_pmd;
489 	pud_t *old_pud, *new_pud;
490 
491 	if (!len)
492 		return 0;
493 
494 	old_end = old_addr + len;
495 
496 	if (is_vm_hugetlb_page(vma))
497 		return move_hugetlb_page_tables(vma, new_vma, old_addr,
498 						new_addr, len);
499 
500 	flush_cache_range(vma, old_addr, old_end);
501 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
502 				old_addr, old_end);
503 	mmu_notifier_invalidate_range_start(&range);
504 
505 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
506 		cond_resched();
507 		/*
508 		 * If extent is PUD-sized try to speed up the move by moving at the
509 		 * PUD level if possible.
510 		 */
511 		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
512 
513 		old_pud = get_old_pud(vma->vm_mm, old_addr);
514 		if (!old_pud)
515 			continue;
516 		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
517 		if (!new_pud)
518 			break;
519 		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
520 			if (extent == HPAGE_PUD_SIZE) {
521 				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
522 					       old_pud, new_pud, need_rmap_locks);
523 				/* We ignore and continue on error? */
524 				continue;
525 			}
526 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
527 
528 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
529 					   old_pud, new_pud, true))
530 				continue;
531 		}
532 
533 		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
534 		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
535 		if (!old_pmd)
536 			continue;
537 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
538 		if (!new_pmd)
539 			break;
540 		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
541 		    pmd_devmap(*old_pmd)) {
542 			if (extent == HPAGE_PMD_SIZE &&
543 			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
544 					   old_pmd, new_pmd, need_rmap_locks))
545 				continue;
546 			split_huge_pmd(vma, old_pmd, old_addr);
547 			if (pmd_trans_unstable(old_pmd))
548 				continue;
549 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
550 			   extent == PMD_SIZE) {
551 			/*
552 			 * If the extent is PMD-sized, try to speed the move by
553 			 * moving at the PMD level if possible.
554 			 */
555 			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
556 					   old_pmd, new_pmd, true))
557 				continue;
558 		}
559 
560 		if (pte_alloc(new_vma->vm_mm, new_pmd))
561 			break;
562 		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
563 			  new_pmd, new_addr, need_rmap_locks);
564 	}
565 
566 	mmu_notifier_invalidate_range_end(&range);
567 
568 	return len + old_addr - old_end;	/* how much done */
569 }
570 
571 static unsigned long move_vma(struct vm_area_struct *vma,
572 		unsigned long old_addr, unsigned long old_len,
573 		unsigned long new_len, unsigned long new_addr,
574 		bool *locked, unsigned long flags,
575 		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
576 {
577 	long to_account = new_len - old_len;
578 	struct mm_struct *mm = vma->vm_mm;
579 	struct vm_area_struct *new_vma;
580 	unsigned long vm_flags = vma->vm_flags;
581 	unsigned long new_pgoff;
582 	unsigned long moved_len;
583 	unsigned long account_start = 0;
584 	unsigned long account_end = 0;
585 	unsigned long hiwater_vm;
586 	int err = 0;
587 	bool need_rmap_locks;
588 	struct vma_iterator vmi;
589 
590 	/*
591 	 * We'd prefer to avoid failure later on in do_munmap:
592 	 * which may split one vma into three before unmapping.
593 	 */
594 	if (mm->map_count >= sysctl_max_map_count - 3)
595 		return -ENOMEM;
596 
597 	if (unlikely(flags & MREMAP_DONTUNMAP))
598 		to_account = new_len;
599 
600 	if (vma->vm_ops && vma->vm_ops->may_split) {
601 		if (vma->vm_start != old_addr)
602 			err = vma->vm_ops->may_split(vma, old_addr);
603 		if (!err && vma->vm_end != old_addr + old_len)
604 			err = vma->vm_ops->may_split(vma, old_addr + old_len);
605 		if (err)
606 			return err;
607 	}
608 
609 	/*
610 	 * Advise KSM to break any KSM pages in the area to be moved:
611 	 * it would be confusing if they were to turn up at the new
612 	 * location, where they happen to coincide with different KSM
613 	 * pages recently unmapped.  But leave vma->vm_flags as it was,
614 	 * so KSM can come around to merge on vma and new_vma afterwards.
615 	 */
616 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
617 						MADV_UNMERGEABLE, &vm_flags);
618 	if (err)
619 		return err;
620 
621 	if (vm_flags & VM_ACCOUNT) {
622 		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
623 			return -ENOMEM;
624 	}
625 
626 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
627 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
628 			   &need_rmap_locks);
629 	if (!new_vma) {
630 		if (vm_flags & VM_ACCOUNT)
631 			vm_unacct_memory(to_account >> PAGE_SHIFT);
632 		return -ENOMEM;
633 	}
634 
635 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
636 				     need_rmap_locks);
637 	if (moved_len < old_len) {
638 		err = -ENOMEM;
639 	} else if (vma->vm_ops && vma->vm_ops->mremap) {
640 		err = vma->vm_ops->mremap(new_vma);
641 	}
642 
643 	if (unlikely(err)) {
644 		/*
645 		 * On error, move entries back from new area to old,
646 		 * which will succeed since page tables still there,
647 		 * and then proceed to unmap new area instead of old.
648 		 */
649 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
650 				 true);
651 		vma = new_vma;
652 		old_len = new_len;
653 		old_addr = new_addr;
654 		new_addr = err;
655 	} else {
656 		mremap_userfaultfd_prep(new_vma, uf);
657 	}
658 
659 	if (is_vm_hugetlb_page(vma)) {
660 		clear_vma_resv_huge_pages(vma);
661 	}
662 
663 	/* Conceal VM_ACCOUNT so old reservation is not undone */
664 	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
665 		vm_flags_clear(vma, VM_ACCOUNT);
666 		if (vma->vm_start < old_addr)
667 			account_start = vma->vm_start;
668 		if (vma->vm_end > old_addr + old_len)
669 			account_end = vma->vm_end;
670 	}
671 
672 	/*
673 	 * If we failed to move page tables we still do total_vm increment
674 	 * since do_munmap() will decrement it by old_len == new_len.
675 	 *
676 	 * Since total_vm is about to be raised artificially high for a
677 	 * moment, we need to restore high watermark afterwards: if stats
678 	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
679 	 * If this were a serious issue, we'd add a flag to do_munmap().
680 	 */
681 	hiwater_vm = mm->hiwater_vm;
682 	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
683 
684 	/* Tell pfnmap has moved from this vma */
685 	if (unlikely(vma->vm_flags & VM_PFNMAP))
686 		untrack_pfn_moved(vma);
687 
688 	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
689 		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
690 		vm_flags_clear(vma, VM_LOCKED_MASK);
691 
692 		/*
693 		 * anon_vma links of the old vma is no longer needed after its page
694 		 * table has been moved.
695 		 */
696 		if (new_vma != vma && vma->vm_start == old_addr &&
697 			vma->vm_end == (old_addr + old_len))
698 			unlink_anon_vmas(vma);
699 
700 		/* Because we won't unmap we don't need to touch locked_vm */
701 		return new_addr;
702 	}
703 
704 	vma_iter_init(&vmi, mm, old_addr);
705 	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
706 		/* OOM: unable to split vma, just get accounts right */
707 		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
708 			vm_acct_memory(old_len >> PAGE_SHIFT);
709 		account_start = account_end = 0;
710 	}
711 
712 	if (vm_flags & VM_LOCKED) {
713 		mm->locked_vm += new_len >> PAGE_SHIFT;
714 		*locked = true;
715 	}
716 
717 	mm->hiwater_vm = hiwater_vm;
718 
719 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
720 	if (account_start) {
721 		vma = vma_prev(&vmi);
722 		vm_flags_set(vma, VM_ACCOUNT);
723 	}
724 
725 	if (account_end) {
726 		vma = vma_next(&vmi);
727 		vm_flags_set(vma, VM_ACCOUNT);
728 	}
729 
730 	return new_addr;
731 }
732 
733 static struct vm_area_struct *vma_to_resize(unsigned long addr,
734 	unsigned long old_len, unsigned long new_len, unsigned long flags)
735 {
736 	struct mm_struct *mm = current->mm;
737 	struct vm_area_struct *vma;
738 	unsigned long pgoff;
739 
740 	vma = vma_lookup(mm, addr);
741 	if (!vma)
742 		return ERR_PTR(-EFAULT);
743 
744 	/*
745 	 * !old_len is a special case where an attempt is made to 'duplicate'
746 	 * a mapping.  This makes no sense for private mappings as it will
747 	 * instead create a fresh/new mapping unrelated to the original.  This
748 	 * is contrary to the basic idea of mremap which creates new mappings
749 	 * based on the original.  There are no known use cases for this
750 	 * behavior.  As a result, fail such attempts.
751 	 */
752 	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
753 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
754 		return ERR_PTR(-EINVAL);
755 	}
756 
757 	if ((flags & MREMAP_DONTUNMAP) &&
758 			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
759 		return ERR_PTR(-EINVAL);
760 
761 	/* We can't remap across vm area boundaries */
762 	if (old_len > vma->vm_end - addr)
763 		return ERR_PTR(-EFAULT);
764 
765 	if (new_len == old_len)
766 		return vma;
767 
768 	/* Need to be careful about a growing mapping */
769 	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
770 	pgoff += vma->vm_pgoff;
771 	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
772 		return ERR_PTR(-EINVAL);
773 
774 	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
775 		return ERR_PTR(-EFAULT);
776 
777 	if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
778 		return ERR_PTR(-EAGAIN);
779 
780 	if (!may_expand_vm(mm, vma->vm_flags,
781 				(new_len - old_len) >> PAGE_SHIFT))
782 		return ERR_PTR(-ENOMEM);
783 
784 	return vma;
785 }
786 
787 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
788 		unsigned long new_addr, unsigned long new_len, bool *locked,
789 		unsigned long flags, struct vm_userfaultfd_ctx *uf,
790 		struct list_head *uf_unmap_early,
791 		struct list_head *uf_unmap)
792 {
793 	struct mm_struct *mm = current->mm;
794 	struct vm_area_struct *vma;
795 	unsigned long ret = -EINVAL;
796 	unsigned long map_flags = 0;
797 
798 	if (offset_in_page(new_addr))
799 		goto out;
800 
801 	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
802 		goto out;
803 
804 	/* Ensure the old/new locations do not overlap */
805 	if (addr + old_len > new_addr && new_addr + new_len > addr)
806 		goto out;
807 
808 	/*
809 	 * move_vma() need us to stay 4 maps below the threshold, otherwise
810 	 * it will bail out at the very beginning.
811 	 * That is a problem if we have already unmaped the regions here
812 	 * (new_addr, and old_addr), because userspace will not know the
813 	 * state of the vma's after it gets -ENOMEM.
814 	 * So, to avoid such scenario we can pre-compute if the whole
815 	 * operation has high chances to success map-wise.
816 	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
817 	 * split in 3 before unmapping it.
818 	 * That means 2 more maps (1 for each) to the ones we already hold.
819 	 * Check whether current map count plus 2 still leads us to 4 maps below
820 	 * the threshold, otherwise return -ENOMEM here to be more safe.
821 	 */
822 	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
823 		return -ENOMEM;
824 
825 	if (flags & MREMAP_FIXED) {
826 		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
827 		if (ret)
828 			goto out;
829 	}
830 
831 	if (old_len > new_len) {
832 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
833 		if (ret)
834 			goto out;
835 		old_len = new_len;
836 	}
837 
838 	vma = vma_to_resize(addr, old_len, new_len, flags);
839 	if (IS_ERR(vma)) {
840 		ret = PTR_ERR(vma);
841 		goto out;
842 	}
843 
844 	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
845 	if (flags & MREMAP_DONTUNMAP &&
846 		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
847 		ret = -ENOMEM;
848 		goto out;
849 	}
850 
851 	if (flags & MREMAP_FIXED)
852 		map_flags |= MAP_FIXED;
853 
854 	if (vma->vm_flags & VM_MAYSHARE)
855 		map_flags |= MAP_SHARED;
856 
857 	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
858 				((addr - vma->vm_start) >> PAGE_SHIFT),
859 				map_flags);
860 	if (IS_ERR_VALUE(ret))
861 		goto out;
862 
863 	/* We got a new mapping */
864 	if (!(flags & MREMAP_FIXED))
865 		new_addr = ret;
866 
867 	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
868 		       uf_unmap);
869 
870 out:
871 	return ret;
872 }
873 
874 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
875 {
876 	unsigned long end = vma->vm_end + delta;
877 
878 	if (end < vma->vm_end) /* overflow */
879 		return 0;
880 	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
881 		return 0;
882 	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
883 			      0, MAP_FIXED) & ~PAGE_MASK)
884 		return 0;
885 	return 1;
886 }
887 
888 /*
889  * Expand (or shrink) an existing mapping, potentially moving it at the
890  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
891  *
892  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
893  * This option implies MREMAP_MAYMOVE.
894  */
895 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
896 		unsigned long, new_len, unsigned long, flags,
897 		unsigned long, new_addr)
898 {
899 	struct mm_struct *mm = current->mm;
900 	struct vm_area_struct *vma;
901 	unsigned long ret = -EINVAL;
902 	bool locked = false;
903 	bool downgraded = false;
904 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
905 	LIST_HEAD(uf_unmap_early);
906 	LIST_HEAD(uf_unmap);
907 
908 	/*
909 	 * There is a deliberate asymmetry here: we strip the pointer tag
910 	 * from the old address but leave the new address alone. This is
911 	 * for consistency with mmap(), where we prevent the creation of
912 	 * aliasing mappings in userspace by leaving the tag bits of the
913 	 * mapping address intact. A non-zero tag will cause the subsequent
914 	 * range checks to reject the address as invalid.
915 	 *
916 	 * See Documentation/arm64/tagged-address-abi.rst for more information.
917 	 */
918 	addr = untagged_addr(addr);
919 
920 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
921 		return ret;
922 
923 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
924 		return ret;
925 
926 	/*
927 	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
928 	 * in the process.
929 	 */
930 	if (flags & MREMAP_DONTUNMAP &&
931 			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
932 		return ret;
933 
934 
935 	if (offset_in_page(addr))
936 		return ret;
937 
938 	old_len = PAGE_ALIGN(old_len);
939 	new_len = PAGE_ALIGN(new_len);
940 
941 	/*
942 	 * We allow a zero old-len as a special case
943 	 * for DOS-emu "duplicate shm area" thing. But
944 	 * a zero new-len is nonsensical.
945 	 */
946 	if (!new_len)
947 		return ret;
948 
949 	if (mmap_write_lock_killable(current->mm))
950 		return -EINTR;
951 	vma = vma_lookup(mm, addr);
952 	if (!vma) {
953 		ret = -EFAULT;
954 		goto out;
955 	}
956 
957 	if (is_vm_hugetlb_page(vma)) {
958 		struct hstate *h __maybe_unused = hstate_vma(vma);
959 
960 		old_len = ALIGN(old_len, huge_page_size(h));
961 		new_len = ALIGN(new_len, huge_page_size(h));
962 
963 		/* addrs must be huge page aligned */
964 		if (addr & ~huge_page_mask(h))
965 			goto out;
966 		if (new_addr & ~huge_page_mask(h))
967 			goto out;
968 
969 		/*
970 		 * Don't allow remap expansion, because the underlying hugetlb
971 		 * reservation is not yet capable to handle split reservation.
972 		 */
973 		if (new_len > old_len)
974 			goto out;
975 	}
976 
977 	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
978 		ret = mremap_to(addr, old_len, new_addr, new_len,
979 				&locked, flags, &uf, &uf_unmap_early,
980 				&uf_unmap);
981 		goto out;
982 	}
983 
984 	/*
985 	 * Always allow a shrinking remap: that just unmaps
986 	 * the unnecessary pages..
987 	 * do_vmi_munmap does all the needed commit accounting, and
988 	 * downgrades mmap_lock to read if so directed.
989 	 */
990 	if (old_len >= new_len) {
991 		int retval;
992 		VMA_ITERATOR(vmi, mm, addr + new_len);
993 
994 		retval = do_vmi_munmap(&vmi, mm, addr + new_len,
995 				       old_len - new_len, &uf_unmap, true);
996 		/* Returning 1 indicates mmap_lock is downgraded to read. */
997 		if (retval == 1) {
998 			downgraded = true;
999 		} else if (retval < 0 && old_len != new_len) {
1000 			ret = retval;
1001 			goto out;
1002 		}
1003 
1004 		ret = addr;
1005 		goto out;
1006 	}
1007 
1008 	/*
1009 	 * Ok, we need to grow..
1010 	 */
1011 	vma = vma_to_resize(addr, old_len, new_len, flags);
1012 	if (IS_ERR(vma)) {
1013 		ret = PTR_ERR(vma);
1014 		goto out;
1015 	}
1016 
1017 	/* old_len exactly to the end of the area..
1018 	 */
1019 	if (old_len == vma->vm_end - addr) {
1020 		/* can we just expand the current mapping? */
1021 		if (vma_expandable(vma, new_len - old_len)) {
1022 			long pages = (new_len - old_len) >> PAGE_SHIFT;
1023 			unsigned long extension_start = addr + old_len;
1024 			unsigned long extension_end = addr + new_len;
1025 			pgoff_t extension_pgoff = vma->vm_pgoff +
1026 				((extension_start - vma->vm_start) >> PAGE_SHIFT);
1027 			VMA_ITERATOR(vmi, mm, extension_start);
1028 
1029 			if (vma->vm_flags & VM_ACCOUNT) {
1030 				if (security_vm_enough_memory_mm(mm, pages)) {
1031 					ret = -ENOMEM;
1032 					goto out;
1033 				}
1034 			}
1035 
1036 			/*
1037 			 * Function vma_merge() is called on the extension we
1038 			 * are adding to the already existing vma, vma_merge()
1039 			 * will merge this extension with the already existing
1040 			 * vma (expand operation itself) and possibly also with
1041 			 * the next vma if it becomes adjacent to the expanded
1042 			 * vma and  otherwise compatible.
1043 			 *
1044 			 * However, vma_merge() can currently fail due to
1045 			 * is_mergeable_vma() check for vm_ops->close (see the
1046 			 * comment there). Yet this should not prevent vma
1047 			 * expanding, so perform a simple expand for such vma.
1048 			 * Ideally the check for close op should be only done
1049 			 * when a vma would be actually removed due to a merge.
1050 			 */
1051 			if (!vma->vm_ops || !vma->vm_ops->close) {
1052 				vma = vma_merge(&vmi, mm, vma, extension_start,
1053 					extension_end, vma->vm_flags, vma->anon_vma,
1054 					vma->vm_file, extension_pgoff, vma_policy(vma),
1055 					vma->vm_userfaultfd_ctx, anon_vma_name(vma));
1056 			} else if (vma_expand(&vmi, vma, vma->vm_start,
1057 					addr + new_len, vma->vm_pgoff, NULL)) {
1058 				vma = NULL;
1059 			}
1060 			if (!vma) {
1061 				vm_unacct_memory(pages);
1062 				ret = -ENOMEM;
1063 				goto out;
1064 			}
1065 
1066 			vm_stat_account(mm, vma->vm_flags, pages);
1067 			if (vma->vm_flags & VM_LOCKED) {
1068 				mm->locked_vm += pages;
1069 				locked = true;
1070 				new_addr = addr;
1071 			}
1072 			ret = addr;
1073 			goto out;
1074 		}
1075 	}
1076 
1077 	/*
1078 	 * We weren't able to just expand or shrink the area,
1079 	 * we need to create a new one and move it..
1080 	 */
1081 	ret = -ENOMEM;
1082 	if (flags & MREMAP_MAYMOVE) {
1083 		unsigned long map_flags = 0;
1084 		if (vma->vm_flags & VM_MAYSHARE)
1085 			map_flags |= MAP_SHARED;
1086 
1087 		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1088 					vma->vm_pgoff +
1089 					((addr - vma->vm_start) >> PAGE_SHIFT),
1090 					map_flags);
1091 		if (IS_ERR_VALUE(new_addr)) {
1092 			ret = new_addr;
1093 			goto out;
1094 		}
1095 
1096 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
1097 			       &locked, flags, &uf, &uf_unmap);
1098 	}
1099 out:
1100 	if (offset_in_page(ret))
1101 		locked = false;
1102 	if (downgraded)
1103 		mmap_read_unlock(current->mm);
1104 	else
1105 		mmap_write_unlock(current->mm);
1106 	if (locked && new_len > old_len)
1107 		mm_populate(new_addr + old_len, new_len - old_len);
1108 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
1109 	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1110 	userfaultfd_unmap_complete(mm, &uf_unmap);
1111 	return ret;
1112 }
1113