xref: /openbmc/linux/mm/mremap.c (revision 83b975b5)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	mm/mremap.c
4  *
5  *	(C) Copyright 1996 Linus Torvalds
6  *
7  *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
8  *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/mm_inline.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/ksm.h>
16 #include <linux/mman.h>
17 #include <linux/swap.h>
18 #include <linux/capability.h>
19 #include <linux/fs.h>
20 #include <linux/swapops.h>
21 #include <linux/highmem.h>
22 #include <linux/security.h>
23 #include <linux/syscalls.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/uaccess.h>
26 #include <linux/userfaultfd_k.h>
27 #include <linux/mempolicy.h>
28 
29 #include <asm/cacheflush.h>
30 #include <asm/tlb.h>
31 #include <asm/pgalloc.h>
32 
33 #include "internal.h"
34 
35 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
36 {
37 	pgd_t *pgd;
38 	p4d_t *p4d;
39 	pud_t *pud;
40 
41 	pgd = pgd_offset(mm, addr);
42 	if (pgd_none_or_clear_bad(pgd))
43 		return NULL;
44 
45 	p4d = p4d_offset(pgd, addr);
46 	if (p4d_none_or_clear_bad(p4d))
47 		return NULL;
48 
49 	pud = pud_offset(p4d, addr);
50 	if (pud_none_or_clear_bad(pud))
51 		return NULL;
52 
53 	return pud;
54 }
55 
56 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
57 {
58 	pud_t *pud;
59 	pmd_t *pmd;
60 
61 	pud = get_old_pud(mm, addr);
62 	if (!pud)
63 		return NULL;
64 
65 	pmd = pmd_offset(pud, addr);
66 	if (pmd_none(*pmd))
67 		return NULL;
68 
69 	return pmd;
70 }
71 
72 static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
73 			    unsigned long addr)
74 {
75 	pgd_t *pgd;
76 	p4d_t *p4d;
77 
78 	pgd = pgd_offset(mm, addr);
79 	p4d = p4d_alloc(mm, pgd, addr);
80 	if (!p4d)
81 		return NULL;
82 
83 	return pud_alloc(mm, p4d, addr);
84 }
85 
86 static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
87 			    unsigned long addr)
88 {
89 	pud_t *pud;
90 	pmd_t *pmd;
91 
92 	pud = alloc_new_pud(mm, vma, addr);
93 	if (!pud)
94 		return NULL;
95 
96 	pmd = pmd_alloc(mm, pud, addr);
97 	if (!pmd)
98 		return NULL;
99 
100 	VM_BUG_ON(pmd_trans_huge(*pmd));
101 
102 	return pmd;
103 }
104 
105 static void take_rmap_locks(struct vm_area_struct *vma)
106 {
107 	if (vma->vm_file)
108 		i_mmap_lock_write(vma->vm_file->f_mapping);
109 	if (vma->anon_vma)
110 		anon_vma_lock_write(vma->anon_vma);
111 }
112 
113 static void drop_rmap_locks(struct vm_area_struct *vma)
114 {
115 	if (vma->anon_vma)
116 		anon_vma_unlock_write(vma->anon_vma);
117 	if (vma->vm_file)
118 		i_mmap_unlock_write(vma->vm_file->f_mapping);
119 }
120 
121 static pte_t move_soft_dirty_pte(pte_t pte)
122 {
123 	/*
124 	 * Set soft dirty bit so we can notice
125 	 * in userspace the ptes were moved.
126 	 */
127 #ifdef CONFIG_MEM_SOFT_DIRTY
128 	if (pte_present(pte))
129 		pte = pte_mksoft_dirty(pte);
130 	else if (is_swap_pte(pte))
131 		pte = pte_swp_mksoft_dirty(pte);
132 #endif
133 	return pte;
134 }
135 
136 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
137 		unsigned long old_addr, unsigned long old_end,
138 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
139 		unsigned long new_addr, bool need_rmap_locks)
140 {
141 	struct mm_struct *mm = vma->vm_mm;
142 	pte_t *old_pte, *new_pte, pte;
143 	spinlock_t *old_ptl, *new_ptl;
144 	bool force_flush = false;
145 	unsigned long len = old_end - old_addr;
146 
147 	/*
148 	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
149 	 * locks to ensure that rmap will always observe either the old or the
150 	 * new ptes. This is the easiest way to avoid races with
151 	 * truncate_pagecache(), page migration, etc...
152 	 *
153 	 * When need_rmap_locks is false, we use other ways to avoid
154 	 * such races:
155 	 *
156 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
157 	 *   which rmap call sites look for using vma_is_temporary_stack().
158 	 *
159 	 * - During mremap(), new_vma is often known to be placed after vma
160 	 *   in rmap traversal order. This ensures rmap will always observe
161 	 *   either the old pte, or the new pte, or both (the page table locks
162 	 *   serialize access to individual ptes, but only rmap traversal
163 	 *   order guarantees that we won't miss both the old and new ptes).
164 	 */
165 	if (need_rmap_locks)
166 		take_rmap_locks(vma);
167 
168 	/*
169 	 * We don't have to worry about the ordering of src and dst
170 	 * pte locks because exclusive mmap_lock prevents deadlock.
171 	 */
172 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
173 	new_pte = pte_offset_map(new_pmd, new_addr);
174 	new_ptl = pte_lockptr(mm, new_pmd);
175 	if (new_ptl != old_ptl)
176 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
177 	flush_tlb_batched_pending(vma->vm_mm);
178 	arch_enter_lazy_mmu_mode();
179 
180 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
181 				   new_pte++, new_addr += PAGE_SIZE) {
182 		if (pte_none(*old_pte))
183 			continue;
184 
185 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
186 		/*
187 		 * If we are remapping a valid PTE, make sure
188 		 * to flush TLB before we drop the PTL for the
189 		 * PTE.
190 		 *
191 		 * NOTE! Both old and new PTL matter: the old one
192 		 * for racing with page_mkclean(), the new one to
193 		 * make sure the physical page stays valid until
194 		 * the TLB entry for the old mapping has been
195 		 * flushed.
196 		 */
197 		if (pte_present(pte))
198 			force_flush = true;
199 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
200 		pte = move_soft_dirty_pte(pte);
201 		set_pte_at(mm, new_addr, new_pte, pte);
202 	}
203 
204 	arch_leave_lazy_mmu_mode();
205 	if (force_flush)
206 		flush_tlb_range(vma, old_end - len, old_end);
207 	if (new_ptl != old_ptl)
208 		spin_unlock(new_ptl);
209 	pte_unmap(new_pte - 1);
210 	pte_unmap_unlock(old_pte - 1, old_ptl);
211 	if (need_rmap_locks)
212 		drop_rmap_locks(vma);
213 }
214 
215 #ifndef arch_supports_page_table_move
216 #define arch_supports_page_table_move arch_supports_page_table_move
217 static inline bool arch_supports_page_table_move(void)
218 {
219 	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
220 		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
221 }
222 #endif
223 
224 #ifdef CONFIG_HAVE_MOVE_PMD
225 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
226 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
227 {
228 	spinlock_t *old_ptl, *new_ptl;
229 	struct mm_struct *mm = vma->vm_mm;
230 	pmd_t pmd;
231 
232 	if (!arch_supports_page_table_move())
233 		return false;
234 	/*
235 	 * The destination pmd shouldn't be established, free_pgtables()
236 	 * should have released it.
237 	 *
238 	 * However, there's a case during execve() where we use mremap
239 	 * to move the initial stack, and in that case the target area
240 	 * may overlap the source area (always moving down).
241 	 *
242 	 * If everything is PMD-aligned, that works fine, as moving
243 	 * each pmd down will clear the source pmd. But if we first
244 	 * have a few 4kB-only pages that get moved down, and then
245 	 * hit the "now the rest is PMD-aligned, let's do everything
246 	 * one pmd at a time", we will still have the old (now empty
247 	 * of any 4kB pages, but still there) PMD in the page table
248 	 * tree.
249 	 *
250 	 * Warn on it once - because we really should try to figure
251 	 * out how to do this better - but then say "I won't move
252 	 * this pmd".
253 	 *
254 	 * One alternative might be to just unmap the target pmd at
255 	 * this point, and verify that it really is empty. We'll see.
256 	 */
257 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
258 		return false;
259 
260 	/*
261 	 * We don't have to worry about the ordering of src and dst
262 	 * ptlocks because exclusive mmap_lock prevents deadlock.
263 	 */
264 	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
265 	new_ptl = pmd_lockptr(mm, new_pmd);
266 	if (new_ptl != old_ptl)
267 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
268 
269 	/* Clear the pmd */
270 	pmd = *old_pmd;
271 	pmd_clear(old_pmd);
272 
273 	VM_BUG_ON(!pmd_none(*new_pmd));
274 
275 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
276 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
277 	if (new_ptl != old_ptl)
278 		spin_unlock(new_ptl);
279 	spin_unlock(old_ptl);
280 
281 	return true;
282 }
283 #else
284 static inline bool move_normal_pmd(struct vm_area_struct *vma,
285 		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
286 		pmd_t *new_pmd)
287 {
288 	return false;
289 }
290 #endif
291 
292 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
293 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
294 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
295 {
296 	spinlock_t *old_ptl, *new_ptl;
297 	struct mm_struct *mm = vma->vm_mm;
298 	pud_t pud;
299 
300 	if (!arch_supports_page_table_move())
301 		return false;
302 	/*
303 	 * The destination pud shouldn't be established, free_pgtables()
304 	 * should have released it.
305 	 */
306 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
307 		return false;
308 
309 	/*
310 	 * We don't have to worry about the ordering of src and dst
311 	 * ptlocks because exclusive mmap_lock prevents deadlock.
312 	 */
313 	old_ptl = pud_lock(vma->vm_mm, old_pud);
314 	new_ptl = pud_lockptr(mm, new_pud);
315 	if (new_ptl != old_ptl)
316 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
317 
318 	/* Clear the pud */
319 	pud = *old_pud;
320 	pud_clear(old_pud);
321 
322 	VM_BUG_ON(!pud_none(*new_pud));
323 
324 	pud_populate(mm, new_pud, pud_pgtable(pud));
325 	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
326 	if (new_ptl != old_ptl)
327 		spin_unlock(new_ptl);
328 	spin_unlock(old_ptl);
329 
330 	return true;
331 }
332 #else
333 static inline bool move_normal_pud(struct vm_area_struct *vma,
334 		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
335 		pud_t *new_pud)
336 {
337 	return false;
338 }
339 #endif
340 
341 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
342 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
343 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
344 {
345 	spinlock_t *old_ptl, *new_ptl;
346 	struct mm_struct *mm = vma->vm_mm;
347 	pud_t pud;
348 
349 	/*
350 	 * The destination pud shouldn't be established, free_pgtables()
351 	 * should have released it.
352 	 */
353 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
354 		return false;
355 
356 	/*
357 	 * We don't have to worry about the ordering of src and dst
358 	 * ptlocks because exclusive mmap_lock prevents deadlock.
359 	 */
360 	old_ptl = pud_lock(vma->vm_mm, old_pud);
361 	new_ptl = pud_lockptr(mm, new_pud);
362 	if (new_ptl != old_ptl)
363 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
364 
365 	/* Clear the pud */
366 	pud = *old_pud;
367 	pud_clear(old_pud);
368 
369 	VM_BUG_ON(!pud_none(*new_pud));
370 
371 	/* Set the new pud */
372 	/* mark soft_ditry when we add pud level soft dirty support */
373 	set_pud_at(mm, new_addr, new_pud, pud);
374 	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
375 	if (new_ptl != old_ptl)
376 		spin_unlock(new_ptl);
377 	spin_unlock(old_ptl);
378 
379 	return true;
380 }
381 #else
382 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
383 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
384 {
385 	WARN_ON_ONCE(1);
386 	return false;
387 
388 }
389 #endif
390 
/*
 * Kind of page table entry a single move step operates on; selects the
 * helper dispatched by move_pgt_entry() and the size used by get_extent().
 */
enum pgt_entry {
	NORMAL_PMD,	/* handled by move_normal_pmd() */
	HPAGE_PMD,	/* handled by move_huge_pmd() */
	NORMAL_PUD,	/* handled by move_normal_pud() */
	HPAGE_PUD,	/* handled by move_huge_pud() */
};
397 
398 /*
399  * Returns an extent of the corresponding size for the pgt_entry specified if
400  * valid. Else returns a smaller extent bounded by the end of the source and
401  * destination pgt_entry.
402  */
403 static __always_inline unsigned long get_extent(enum pgt_entry entry,
404 			unsigned long old_addr, unsigned long old_end,
405 			unsigned long new_addr)
406 {
407 	unsigned long next, extent, mask, size;
408 
409 	switch (entry) {
410 	case HPAGE_PMD:
411 	case NORMAL_PMD:
412 		mask = PMD_MASK;
413 		size = PMD_SIZE;
414 		break;
415 	case HPAGE_PUD:
416 	case NORMAL_PUD:
417 		mask = PUD_MASK;
418 		size = PUD_SIZE;
419 		break;
420 	default:
421 		BUILD_BUG();
422 		break;
423 	}
424 
425 	next = (old_addr + size) & mask;
426 	/* even if next overflowed, extent below will be ok */
427 	extent = next - old_addr;
428 	if (extent > old_end - old_addr)
429 		extent = old_end - old_addr;
430 	next = (new_addr + size) & mask;
431 	if (extent > next - new_addr)
432 		extent = next - new_addr;
433 	return extent;
434 }
435 
436 /*
437  * Attempts to speedup the move by moving entry at the level corresponding to
438  * pgt_entry. Returns true if the move was successful, else false.
439  */
440 static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
441 			unsigned long old_addr, unsigned long new_addr,
442 			void *old_entry, void *new_entry, bool need_rmap_locks)
443 {
444 	bool moved = false;
445 
446 	/* See comment in move_ptes() */
447 	if (need_rmap_locks)
448 		take_rmap_locks(vma);
449 
450 	switch (entry) {
451 	case NORMAL_PMD:
452 		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
453 					new_entry);
454 		break;
455 	case NORMAL_PUD:
456 		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
457 					new_entry);
458 		break;
459 	case HPAGE_PMD:
460 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
461 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
462 				      new_entry);
463 		break;
464 	case HPAGE_PUD:
465 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
466 			move_huge_pud(vma, old_addr, new_addr, old_entry,
467 				      new_entry);
468 		break;
469 
470 	default:
471 		WARN_ON_ONCE(1);
472 		break;
473 	}
474 
475 	if (need_rmap_locks)
476 		drop_rmap_locks(vma);
477 
478 	return moved;
479 }
480 
481 unsigned long move_page_tables(struct vm_area_struct *vma,
482 		unsigned long old_addr, struct vm_area_struct *new_vma,
483 		unsigned long new_addr, unsigned long len,
484 		bool need_rmap_locks)
485 {
486 	unsigned long extent, old_end;
487 	struct mmu_notifier_range range;
488 	pmd_t *old_pmd, *new_pmd;
489 	pud_t *old_pud, *new_pud;
490 
491 	if (!len)
492 		return 0;
493 
494 	old_end = old_addr + len;
495 
496 	if (is_vm_hugetlb_page(vma))
497 		return move_hugetlb_page_tables(vma, new_vma, old_addr,
498 						new_addr, len);
499 
500 	flush_cache_range(vma, old_addr, old_end);
501 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
502 				old_addr, old_end);
503 	mmu_notifier_invalidate_range_start(&range);
504 
505 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
506 		cond_resched();
507 		/*
508 		 * If extent is PUD-sized try to speed up the move by moving at the
509 		 * PUD level if possible.
510 		 */
511 		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
512 
513 		old_pud = get_old_pud(vma->vm_mm, old_addr);
514 		if (!old_pud)
515 			continue;
516 		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
517 		if (!new_pud)
518 			break;
519 		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
520 			if (extent == HPAGE_PUD_SIZE) {
521 				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
522 					       old_pud, new_pud, need_rmap_locks);
523 				/* We ignore and continue on error? */
524 				continue;
525 			}
526 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
527 
528 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
529 					   old_pud, new_pud, true))
530 				continue;
531 		}
532 
533 		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
534 		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
535 		if (!old_pmd)
536 			continue;
537 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
538 		if (!new_pmd)
539 			break;
540 		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
541 		    pmd_devmap(*old_pmd)) {
542 			if (extent == HPAGE_PMD_SIZE &&
543 			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
544 					   old_pmd, new_pmd, need_rmap_locks))
545 				continue;
546 			split_huge_pmd(vma, old_pmd, old_addr);
547 			if (pmd_trans_unstable(old_pmd))
548 				continue;
549 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
550 			   extent == PMD_SIZE) {
551 			/*
552 			 * If the extent is PMD-sized, try to speed the move by
553 			 * moving at the PMD level if possible.
554 			 */
555 			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
556 					   old_pmd, new_pmd, true))
557 				continue;
558 		}
559 
560 		if (pte_alloc(new_vma->vm_mm, new_pmd))
561 			break;
562 		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
563 			  new_pmd, new_addr, need_rmap_locks);
564 	}
565 
566 	mmu_notifier_invalidate_range_end(&range);
567 
568 	return len + old_addr - old_end;	/* how much done */
569 }
570 
571 static unsigned long move_vma(struct vm_area_struct *vma,
572 		unsigned long old_addr, unsigned long old_len,
573 		unsigned long new_len, unsigned long new_addr,
574 		bool *locked, unsigned long flags,
575 		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
576 {
577 	long to_account = new_len - old_len;
578 	struct mm_struct *mm = vma->vm_mm;
579 	struct vm_area_struct *new_vma;
580 	unsigned long vm_flags = vma->vm_flags;
581 	unsigned long new_pgoff;
582 	unsigned long moved_len;
583 	unsigned long excess = 0;
584 	unsigned long hiwater_vm;
585 	int split = 0;
586 	int err = 0;
587 	bool need_rmap_locks;
588 
589 	/*
590 	 * We'd prefer to avoid failure later on in do_munmap:
591 	 * which may split one vma into three before unmapping.
592 	 */
593 	if (mm->map_count >= sysctl_max_map_count - 3)
594 		return -ENOMEM;
595 
596 	if (unlikely(flags & MREMAP_DONTUNMAP))
597 		to_account = new_len;
598 
599 	if (vma->vm_ops && vma->vm_ops->may_split) {
600 		if (vma->vm_start != old_addr)
601 			err = vma->vm_ops->may_split(vma, old_addr);
602 		if (!err && vma->vm_end != old_addr + old_len)
603 			err = vma->vm_ops->may_split(vma, old_addr + old_len);
604 		if (err)
605 			return err;
606 	}
607 
608 	/*
609 	 * Advise KSM to break any KSM pages in the area to be moved:
610 	 * it would be confusing if they were to turn up at the new
611 	 * location, where they happen to coincide with different KSM
612 	 * pages recently unmapped.  But leave vma->vm_flags as it was,
613 	 * so KSM can come around to merge on vma and new_vma afterwards.
614 	 */
615 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
616 						MADV_UNMERGEABLE, &vm_flags);
617 	if (err)
618 		return err;
619 
620 	if (vm_flags & VM_ACCOUNT) {
621 		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
622 			return -ENOMEM;
623 	}
624 
625 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
626 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
627 			   &need_rmap_locks);
628 	if (!new_vma) {
629 		if (vm_flags & VM_ACCOUNT)
630 			vm_unacct_memory(to_account >> PAGE_SHIFT);
631 		return -ENOMEM;
632 	}
633 
634 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
635 				     need_rmap_locks);
636 	if (moved_len < old_len) {
637 		err = -ENOMEM;
638 	} else if (vma->vm_ops && vma->vm_ops->mremap) {
639 		err = vma->vm_ops->mremap(new_vma);
640 	}
641 
642 	if (unlikely(err)) {
643 		/*
644 		 * On error, move entries back from new area to old,
645 		 * which will succeed since page tables still there,
646 		 * and then proceed to unmap new area instead of old.
647 		 */
648 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
649 				 true);
650 		vma = new_vma;
651 		old_len = new_len;
652 		old_addr = new_addr;
653 		new_addr = err;
654 	} else {
655 		mremap_userfaultfd_prep(new_vma, uf);
656 	}
657 
658 	if (is_vm_hugetlb_page(vma)) {
659 		clear_vma_resv_huge_pages(vma);
660 	}
661 
662 	/* Conceal VM_ACCOUNT so old reservation is not undone */
663 	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
664 		vma->vm_flags &= ~VM_ACCOUNT;
665 		excess = vma->vm_end - vma->vm_start - old_len;
666 		if (old_addr > vma->vm_start &&
667 		    old_addr + old_len < vma->vm_end)
668 			split = 1;
669 	}
670 
671 	/*
672 	 * If we failed to move page tables we still do total_vm increment
673 	 * since do_munmap() will decrement it by old_len == new_len.
674 	 *
675 	 * Since total_vm is about to be raised artificially high for a
676 	 * moment, we need to restore high watermark afterwards: if stats
677 	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
678 	 * If this were a serious issue, we'd add a flag to do_munmap().
679 	 */
680 	hiwater_vm = mm->hiwater_vm;
681 	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
682 
683 	/* Tell pfnmap has moved from this vma */
684 	if (unlikely(vma->vm_flags & VM_PFNMAP))
685 		untrack_pfn_moved(vma);
686 
687 	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
688 		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
689 		vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
690 
691 		/*
692 		 * anon_vma links of the old vma is no longer needed after its page
693 		 * table has been moved.
694 		 */
695 		if (new_vma != vma && vma->vm_start == old_addr &&
696 			vma->vm_end == (old_addr + old_len))
697 			unlink_anon_vmas(vma);
698 
699 		/* Because we won't unmap we don't need to touch locked_vm */
700 		return new_addr;
701 	}
702 
703 	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
704 		/* OOM: unable to split vma, just get accounts right */
705 		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
706 			vm_acct_memory(old_len >> PAGE_SHIFT);
707 		excess = 0;
708 	}
709 
710 	if (vm_flags & VM_LOCKED) {
711 		mm->locked_vm += new_len >> PAGE_SHIFT;
712 		*locked = true;
713 	}
714 
715 	mm->hiwater_vm = hiwater_vm;
716 
717 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
718 	if (excess) {
719 		vma->vm_flags |= VM_ACCOUNT;
720 		if (split)
721 			find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
722 	}
723 
724 	return new_addr;
725 }
726 
727 static struct vm_area_struct *vma_to_resize(unsigned long addr,
728 	unsigned long old_len, unsigned long new_len, unsigned long flags)
729 {
730 	struct mm_struct *mm = current->mm;
731 	struct vm_area_struct *vma;
732 	unsigned long pgoff;
733 
734 	vma = vma_lookup(mm, addr);
735 	if (!vma)
736 		return ERR_PTR(-EFAULT);
737 
738 	/*
739 	 * !old_len is a special case where an attempt is made to 'duplicate'
740 	 * a mapping.  This makes no sense for private mappings as it will
741 	 * instead create a fresh/new mapping unrelated to the original.  This
742 	 * is contrary to the basic idea of mremap which creates new mappings
743 	 * based on the original.  There are no known use cases for this
744 	 * behavior.  As a result, fail such attempts.
745 	 */
746 	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
747 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
748 		return ERR_PTR(-EINVAL);
749 	}
750 
751 	if ((flags & MREMAP_DONTUNMAP) &&
752 			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
753 		return ERR_PTR(-EINVAL);
754 
755 	/* We can't remap across vm area boundaries */
756 	if (old_len > vma->vm_end - addr)
757 		return ERR_PTR(-EFAULT);
758 
759 	if (new_len == old_len)
760 		return vma;
761 
762 	/* Need to be careful about a growing mapping */
763 	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
764 	pgoff += vma->vm_pgoff;
765 	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
766 		return ERR_PTR(-EINVAL);
767 
768 	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
769 		return ERR_PTR(-EFAULT);
770 
771 	if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
772 		return ERR_PTR(-EAGAIN);
773 
774 	if (!may_expand_vm(mm, vma->vm_flags,
775 				(new_len - old_len) >> PAGE_SHIFT))
776 		return ERR_PTR(-ENOMEM);
777 
778 	return vma;
779 }
780 
781 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
782 		unsigned long new_addr, unsigned long new_len, bool *locked,
783 		unsigned long flags, struct vm_userfaultfd_ctx *uf,
784 		struct list_head *uf_unmap_early,
785 		struct list_head *uf_unmap)
786 {
787 	struct mm_struct *mm = current->mm;
788 	struct vm_area_struct *vma;
789 	unsigned long ret = -EINVAL;
790 	unsigned long map_flags = 0;
791 
792 	if (offset_in_page(new_addr))
793 		goto out;
794 
795 	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
796 		goto out;
797 
798 	/* Ensure the old/new locations do not overlap */
799 	if (addr + old_len > new_addr && new_addr + new_len > addr)
800 		goto out;
801 
802 	/*
803 	 * move_vma() need us to stay 4 maps below the threshold, otherwise
804 	 * it will bail out at the very beginning.
805 	 * That is a problem if we have already unmaped the regions here
806 	 * (new_addr, and old_addr), because userspace will not know the
807 	 * state of the vma's after it gets -ENOMEM.
808 	 * So, to avoid such scenario we can pre-compute if the whole
809 	 * operation has high chances to success map-wise.
810 	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
811 	 * split in 3 before unmapping it.
812 	 * That means 2 more maps (1 for each) to the ones we already hold.
813 	 * Check whether current map count plus 2 still leads us to 4 maps below
814 	 * the threshold, otherwise return -ENOMEM here to be more safe.
815 	 */
816 	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
817 		return -ENOMEM;
818 
819 	if (flags & MREMAP_FIXED) {
820 		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
821 		if (ret)
822 			goto out;
823 	}
824 
825 	if (old_len > new_len) {
826 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
827 		if (ret)
828 			goto out;
829 		old_len = new_len;
830 	}
831 
832 	vma = vma_to_resize(addr, old_len, new_len, flags);
833 	if (IS_ERR(vma)) {
834 		ret = PTR_ERR(vma);
835 		goto out;
836 	}
837 
838 	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
839 	if (flags & MREMAP_DONTUNMAP &&
840 		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
841 		ret = -ENOMEM;
842 		goto out;
843 	}
844 
845 	if (flags & MREMAP_FIXED)
846 		map_flags |= MAP_FIXED;
847 
848 	if (vma->vm_flags & VM_MAYSHARE)
849 		map_flags |= MAP_SHARED;
850 
851 	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
852 				((addr - vma->vm_start) >> PAGE_SHIFT),
853 				map_flags);
854 	if (IS_ERR_VALUE(ret))
855 		goto out;
856 
857 	/* We got a new mapping */
858 	if (!(flags & MREMAP_FIXED))
859 		new_addr = ret;
860 
861 	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
862 		       uf_unmap);
863 
864 out:
865 	return ret;
866 }
867 
868 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
869 {
870 	unsigned long end = vma->vm_end + delta;
871 
872 	if (end < vma->vm_end) /* overflow */
873 		return 0;
874 	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
875 		return 0;
876 	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
877 			      0, MAP_FIXED) & ~PAGE_MASK)
878 		return 0;
879 	return 1;
880 }
881 
882 /*
883  * Expand (or shrink) an existing mapping, potentially moving it at the
884  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
885  *
886  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
887  * This option implies MREMAP_MAYMOVE.
888  */
889 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
890 		unsigned long, new_len, unsigned long, flags,
891 		unsigned long, new_addr)
892 {
893 	struct mm_struct *mm = current->mm;
894 	struct vm_area_struct *vma;
895 	unsigned long ret = -EINVAL;
896 	bool locked = false;
897 	bool downgraded = false;
898 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
899 	LIST_HEAD(uf_unmap_early);
900 	LIST_HEAD(uf_unmap);
901 
902 	/*
903 	 * There is a deliberate asymmetry here: we strip the pointer tag
904 	 * from the old address but leave the new address alone. This is
905 	 * for consistency with mmap(), where we prevent the creation of
906 	 * aliasing mappings in userspace by leaving the tag bits of the
907 	 * mapping address intact. A non-zero tag will cause the subsequent
908 	 * range checks to reject the address as invalid.
909 	 *
910 	 * See Documentation/arm64/tagged-address-abi.rst for more information.
911 	 */
912 	addr = untagged_addr(addr);
913 
914 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
915 		return ret;
916 
917 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
918 		return ret;
919 
920 	/*
921 	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
922 	 * in the process.
923 	 */
924 	if (flags & MREMAP_DONTUNMAP &&
925 			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
926 		return ret;
927 
928 
929 	if (offset_in_page(addr))
930 		return ret;
931 
932 	old_len = PAGE_ALIGN(old_len);
933 	new_len = PAGE_ALIGN(new_len);
934 
935 	/*
936 	 * We allow a zero old-len as a special case
937 	 * for DOS-emu "duplicate shm area" thing. But
938 	 * a zero new-len is nonsensical.
939 	 */
940 	if (!new_len)
941 		return ret;
942 
943 	if (mmap_write_lock_killable(current->mm))
944 		return -EINTR;
945 	vma = vma_lookup(mm, addr);
946 	if (!vma) {
947 		ret = -EFAULT;
948 		goto out;
949 	}
950 
951 	if (is_vm_hugetlb_page(vma)) {
952 		struct hstate *h __maybe_unused = hstate_vma(vma);
953 
954 		old_len = ALIGN(old_len, huge_page_size(h));
955 		new_len = ALIGN(new_len, huge_page_size(h));
956 
957 		/* addrs must be huge page aligned */
958 		if (addr & ~huge_page_mask(h))
959 			goto out;
960 		if (new_addr & ~huge_page_mask(h))
961 			goto out;
962 
963 		/*
964 		 * Don't allow remap expansion, because the underlying hugetlb
965 		 * reservation is not yet capable to handle split reservation.
966 		 */
967 		if (new_len > old_len)
968 			goto out;
969 	}
970 
971 	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
972 		ret = mremap_to(addr, old_len, new_addr, new_len,
973 				&locked, flags, &uf, &uf_unmap_early,
974 				&uf_unmap);
975 		goto out;
976 	}
977 
978 	/*
979 	 * Always allow a shrinking remap: that just unmaps
980 	 * the unnecessary pages..
981 	 * do_mas_munmap does all the needed commit accounting, and
982 	 * downgrades mmap_lock to read if so directed.
983 	 */
984 	if (old_len >= new_len) {
985 		int retval;
986 		MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len);
987 
988 		retval = do_mas_munmap(&mas, mm, addr + new_len,
989 				       old_len - new_len, &uf_unmap, true);
990 		/* Returning 1 indicates mmap_lock is downgraded to read. */
991 		if (retval == 1) {
992 			downgraded = true;
993 		} else if (retval < 0 && old_len != new_len) {
994 			ret = retval;
995 			goto out;
996 		}
997 
998 		ret = addr;
999 		goto out;
1000 	}
1001 
1002 	/*
1003 	 * Ok, we need to grow..
1004 	 */
1005 	vma = vma_to_resize(addr, old_len, new_len, flags);
1006 	if (IS_ERR(vma)) {
1007 		ret = PTR_ERR(vma);
1008 		goto out;
1009 	}
1010 
1011 	/* old_len exactly to the end of the area..
1012 	 */
1013 	if (old_len == vma->vm_end - addr) {
1014 		/* can we just expand the current mapping? */
1015 		if (vma_expandable(vma, new_len - old_len)) {
1016 			long pages = (new_len - old_len) >> PAGE_SHIFT;
1017 			unsigned long extension_start = addr + old_len;
1018 			unsigned long extension_end = addr + new_len;
1019 			pgoff_t extension_pgoff = vma->vm_pgoff + (old_len >> PAGE_SHIFT);
1020 
1021 			if (vma->vm_flags & VM_ACCOUNT) {
1022 				if (security_vm_enough_memory_mm(mm, pages)) {
1023 					ret = -ENOMEM;
1024 					goto out;
1025 				}
1026 			}
1027 
1028 			/*
1029 			 * Function vma_merge() is called on the extension we are adding to
1030 			 * the already existing vma, vma_merge() will merge this extension with
1031 			 * the already existing vma (expand operation itself) and possibly also
1032 			 * with the next vma if it becomes adjacent to the expanded vma and
1033 			 * otherwise compatible.
1034 			 */
1035 			vma = vma_merge(mm, vma, extension_start, extension_end,
1036 					vma->vm_flags, vma->anon_vma, vma->vm_file,
1037 					extension_pgoff, vma_policy(vma),
1038 					vma->vm_userfaultfd_ctx, anon_vma_name(vma));
1039 			if (!vma) {
1040 				vm_unacct_memory(pages);
1041 				ret = -ENOMEM;
1042 				goto out;
1043 			}
1044 
1045 			vm_stat_account(mm, vma->vm_flags, pages);
1046 			if (vma->vm_flags & VM_LOCKED) {
1047 				mm->locked_vm += pages;
1048 				locked = true;
1049 				new_addr = addr;
1050 			}
1051 			ret = addr;
1052 			goto out;
1053 		}
1054 	}
1055 
1056 	/*
1057 	 * We weren't able to just expand or shrink the area,
1058 	 * we need to create a new one and move it..
1059 	 */
1060 	ret = -ENOMEM;
1061 	if (flags & MREMAP_MAYMOVE) {
1062 		unsigned long map_flags = 0;
1063 		if (vma->vm_flags & VM_MAYSHARE)
1064 			map_flags |= MAP_SHARED;
1065 
1066 		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1067 					vma->vm_pgoff +
1068 					((addr - vma->vm_start) >> PAGE_SHIFT),
1069 					map_flags);
1070 		if (IS_ERR_VALUE(new_addr)) {
1071 			ret = new_addr;
1072 			goto out;
1073 		}
1074 
1075 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
1076 			       &locked, flags, &uf, &uf_unmap);
1077 	}
1078 out:
1079 	if (offset_in_page(ret))
1080 		locked = false;
1081 	if (downgraded)
1082 		mmap_read_unlock(current->mm);
1083 	else
1084 		mmap_write_unlock(current->mm);
1085 	if (locked && new_len > old_len)
1086 		mm_populate(new_addr + old_len, new_len - old_len);
1087 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
1088 	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1089 	userfaultfd_unmap_complete(mm, &uf_unmap);
1090 	return ret;
1091 }
1092