xref: /openbmc/linux/mm/madvise.c (revision b625fe69)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	linux/mm/madvise.c
4  *
5  * Copyright (C) 1999  Linus Torvalds
6  * Copyright (C) 2002  Christoph Hellwig
7  */
8 
9 #include <linux/mman.h>
10 #include <linux/pagemap.h>
11 #include <linux/syscalls.h>
12 #include <linux/mempolicy.h>
13 #include <linux/page-isolation.h>
14 #include <linux/page_idle.h>
15 #include <linux/userfaultfd_k.h>
16 #include <linux/hugetlb.h>
17 #include <linux/falloc.h>
18 #include <linux/fadvise.h>
19 #include <linux/sched.h>
20 #include <linux/sched/mm.h>
21 #include <linux/mm_inline.h>
22 #include <linux/string.h>
23 #include <linux/uio.h>
24 #include <linux/ksm.h>
25 #include <linux/fs.h>
26 #include <linux/file.h>
27 #include <linux/blkdev.h>
28 #include <linux/backing-dev.h>
29 #include <linux/pagewalk.h>
30 #include <linux/swap.h>
31 #include <linux/swapops.h>
32 #include <linux/shmem_fs.h>
33 #include <linux/mmu_notifier.h>
34 
35 #include <asm/tlb.h>
36 
37 #include "internal.h"
38 
39 struct madvise_walk_private {
40 	struct mmu_gather *tlb;
41 	bool pageout;
42 };
43 
44 /*
45  * Any behaviour which results in changes to the vma->vm_flags needs to
46  * take mmap_lock for writing. Others, which simply traverse vmas, need
47  * only take it for reading.
48  */
49 static int madvise_need_mmap_write(int behavior)
50 {
51 	switch (behavior) {
52 	case MADV_REMOVE:
53 	case MADV_WILLNEED:
54 	case MADV_DONTNEED:
55 	case MADV_COLD:
56 	case MADV_PAGEOUT:
57 	case MADV_FREE:
58 	case MADV_POPULATE_READ:
59 	case MADV_POPULATE_WRITE:
60 		return 0;
61 	default:
62 		/* be safe, default to 1. list exceptions explicitly */
63 		return 1;
64 	}
65 }
66 
67 #ifdef CONFIG_ANON_VMA_NAME
68 static struct anon_vma_name *anon_vma_name_alloc(const char *name)
69 {
70 	struct anon_vma_name *anon_name;
71 	size_t count;
72 
73 	/* Add 1 for NUL terminator at the end of the anon_name->name */
74 	count = strlen(name) + 1;
75 	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
76 	if (anon_name) {
77 		kref_init(&anon_name->kref);
78 		memcpy(anon_name->name, name, count);
79 	}
80 
81 	return anon_name;
82 }
83 
84 static void vma_anon_name_free(struct kref *kref)
85 {
86 	struct anon_vma_name *anon_name =
87 			container_of(kref, struct anon_vma_name, kref);
88 	kfree(anon_name);
89 }
90 
91 static inline bool has_vma_anon_name(struct vm_area_struct *vma)
92 {
93 	return !vma->vm_file && vma->anon_name;
94 }
95 
96 const char *vma_anon_name(struct vm_area_struct *vma)
97 {
98 	if (!has_vma_anon_name(vma))
99 		return NULL;
100 
101 	mmap_assert_locked(vma->vm_mm);
102 
103 	return vma->anon_name->name;
104 }
105 
106 void dup_vma_anon_name(struct vm_area_struct *orig_vma,
107 		       struct vm_area_struct *new_vma)
108 {
109 	if (!has_vma_anon_name(orig_vma))
110 		return;
111 
112 	kref_get(&orig_vma->anon_name->kref);
113 	new_vma->anon_name = orig_vma->anon_name;
114 }
115 
116 void free_vma_anon_name(struct vm_area_struct *vma)
117 {
118 	struct anon_vma_name *anon_name;
119 
120 	if (!has_vma_anon_name(vma))
121 		return;
122 
123 	anon_name = vma->anon_name;
124 	vma->anon_name = NULL;
125 	kref_put(&anon_name->kref, vma_anon_name_free);
126 }
127 
128 /* mmap_lock should be write-locked */
129 static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
130 {
131 	const char *anon_name;
132 
133 	if (!name) {
134 		free_vma_anon_name(vma);
135 		return 0;
136 	}
137 
138 	anon_name = vma_anon_name(vma);
139 	if (anon_name) {
140 		/* Same name, nothing to do here */
141 		if (!strcmp(name, anon_name))
142 			return 0;
143 
144 		free_vma_anon_name(vma);
145 	}
146 	vma->anon_name = anon_vma_name_alloc(name);
147 	if (!vma->anon_name)
148 		return -ENOMEM;
149 
150 	return 0;
151 }
152 #else /* CONFIG_ANON_VMA_NAME */
153 static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
154 {
155 	if (name)
156 		return -EINVAL;
157 
158 	return 0;
159 }
160 #endif /* CONFIG_ANON_VMA_NAME */
161 /*
162  * Update the vm_flags on a region of a vma, splitting it or merging it
163  * as necessary.  Must be called with mmap_lock held for writing.
164  */
165 static int madvise_update_vma(struct vm_area_struct *vma,
166 			      struct vm_area_struct **prev, unsigned long start,
167 			      unsigned long end, unsigned long new_flags,
168 			      const char *name)
169 {
170 	struct mm_struct *mm = vma->vm_mm;
171 	int error;
172 	pgoff_t pgoff;
173 
174 	if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
175 		*prev = vma;
176 		return 0;
177 	}
178 
179 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
180 	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
181 			  vma->vm_file, pgoff, vma_policy(vma),
182 			  vma->vm_userfaultfd_ctx, name);
183 	if (*prev) {
184 		vma = *prev;
185 		goto success;
186 	}
187 
188 	*prev = vma;
189 
190 	if (start != vma->vm_start) {
191 		if (unlikely(mm->map_count >= sysctl_max_map_count))
192 			return -ENOMEM;
193 		error = __split_vma(mm, vma, start, 1);
194 		if (error)
195 			return error;
196 	}
197 
198 	if (end != vma->vm_end) {
199 		if (unlikely(mm->map_count >= sysctl_max_map_count))
200 			return -ENOMEM;
201 		error = __split_vma(mm, vma, end, 0);
202 		if (error)
203 			return error;
204 	}
205 
206 success:
207 	/*
208 	 * vm_flags is protected by the mmap_lock held in write mode.
209 	 */
210 	vma->vm_flags = new_flags;
211 	if (!vma->vm_file) {
212 		error = replace_vma_anon_name(vma, name);
213 		if (error)
214 			return error;
215 	}
216 
217 	return 0;
218 }
219 
220 #ifdef CONFIG_SWAP
221 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
222 	unsigned long end, struct mm_walk *walk)
223 {
224 	pte_t *orig_pte;
225 	struct vm_area_struct *vma = walk->private;
226 	unsigned long index;
227 
228 	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
229 		return 0;
230 
231 	for (index = start; index != end; index += PAGE_SIZE) {
232 		pte_t pte;
233 		swp_entry_t entry;
234 		struct page *page;
235 		spinlock_t *ptl;
236 
237 		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
238 		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
239 		pte_unmap_unlock(orig_pte, ptl);
240 
241 		if (pte_present(pte) || pte_none(pte))
242 			continue;
243 		entry = pte_to_swp_entry(pte);
244 		if (unlikely(non_swap_entry(entry)))
245 			continue;
246 
247 		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
248 							vma, index, false);
249 		if (page)
250 			put_page(page);
251 	}
252 
253 	return 0;
254 }
255 
256 static const struct mm_walk_ops swapin_walk_ops = {
257 	.pmd_entry		= swapin_walk_pmd_entry,
258 };
259 
260 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
261 		unsigned long start, unsigned long end,
262 		struct address_space *mapping)
263 {
264 	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
265 	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
266 	struct page *page;
267 
268 	rcu_read_lock();
269 	xas_for_each(&xas, page, end_index) {
270 		swp_entry_t swap;
271 
272 		if (!xa_is_value(page))
273 			continue;
274 		xas_pause(&xas);
275 		rcu_read_unlock();
276 
277 		swap = radix_to_swp_entry(page);
278 		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
279 							NULL, 0, false);
280 		if (page)
281 			put_page(page);
282 
283 		rcu_read_lock();
284 	}
285 	rcu_read_unlock();
286 
287 	lru_add_drain();	/* Push any new pages onto the LRU now */
288 }
289 #endif		/* CONFIG_SWAP */
290 
291 /*
292  * Schedule all required I/O operations.  Do not wait for completion.
293  */
294 static long madvise_willneed(struct vm_area_struct *vma,
295 			     struct vm_area_struct **prev,
296 			     unsigned long start, unsigned long end)
297 {
298 	struct mm_struct *mm = vma->vm_mm;
299 	struct file *file = vma->vm_file;
300 	loff_t offset;
301 
302 	*prev = vma;
303 #ifdef CONFIG_SWAP
304 	if (!file) {
305 		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
306 		lru_add_drain(); /* Push any new pages onto the LRU now */
307 		return 0;
308 	}
309 
310 	if (shmem_mapping(file->f_mapping)) {
311 		force_shm_swapin_readahead(vma, start, end,
312 					file->f_mapping);
313 		return 0;
314 	}
315 #else
316 	if (!file)
317 		return -EBADF;
318 #endif
319 
320 	if (IS_DAX(file_inode(file))) {
321 		/* no bad return value, but ignore advice */
322 		return 0;
323 	}
324 
325 	/*
326 	 * Filesystem's fadvise may need to take various locks.  We need to
327 	 * explicitly grab a reference because the vma (and hence the
328 	 * vma's reference to the file) can go away as soon as we drop
329 	 * mmap_lock.
330 	 */
331 	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
332 	get_file(file);
333 	offset = (loff_t)(start - vma->vm_start)
334 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
335 	mmap_read_unlock(mm);
336 	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
337 	fput(file);
338 	mmap_read_lock(mm);
339 	return 0;
340 }
341 
342 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
343 				unsigned long addr, unsigned long end,
344 				struct mm_walk *walk)
345 {
346 	struct madvise_walk_private *private = walk->private;
347 	struct mmu_gather *tlb = private->tlb;
348 	bool pageout = private->pageout;
349 	struct mm_struct *mm = tlb->mm;
350 	struct vm_area_struct *vma = walk->vma;
351 	pte_t *orig_pte, *pte, ptent;
352 	spinlock_t *ptl;
353 	struct page *page = NULL;
354 	LIST_HEAD(page_list);
355 
356 	if (fatal_signal_pending(current))
357 		return -EINTR;
358 
359 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
360 	if (pmd_trans_huge(*pmd)) {
361 		pmd_t orig_pmd;
362 		unsigned long next = pmd_addr_end(addr, end);
363 
364 		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
365 		ptl = pmd_trans_huge_lock(pmd, vma);
366 		if (!ptl)
367 			return 0;
368 
369 		orig_pmd = *pmd;
370 		if (is_huge_zero_pmd(orig_pmd))
371 			goto huge_unlock;
372 
373 		if (unlikely(!pmd_present(orig_pmd))) {
374 			VM_BUG_ON(thp_migration_supported() &&
375 					!is_pmd_migration_entry(orig_pmd));
376 			goto huge_unlock;
377 		}
378 
379 		page = pmd_page(orig_pmd);
380 
381 		/* Do not interfere with other mappings of this page */
382 		if (page_mapcount(page) != 1)
383 			goto huge_unlock;
384 
385 		if (next - addr != HPAGE_PMD_SIZE) {
386 			int err;
387 
388 			get_page(page);
389 			spin_unlock(ptl);
390 			lock_page(page);
391 			err = split_huge_page(page);
392 			unlock_page(page);
393 			put_page(page);
394 			if (!err)
395 				goto regular_page;
396 			return 0;
397 		}
398 
399 		if (pmd_young(orig_pmd)) {
400 			pmdp_invalidate(vma, addr, pmd);
401 			orig_pmd = pmd_mkold(orig_pmd);
402 
403 			set_pmd_at(mm, addr, pmd, orig_pmd);
404 			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
405 		}
406 
407 		ClearPageReferenced(page);
408 		test_and_clear_page_young(page);
409 		if (pageout) {
410 			if (!isolate_lru_page(page)) {
411 				if (PageUnevictable(page))
412 					putback_lru_page(page);
413 				else
414 					list_add(&page->lru, &page_list);
415 			}
416 		} else
417 			deactivate_page(page);
418 huge_unlock:
419 		spin_unlock(ptl);
420 		if (pageout)
421 			reclaim_pages(&page_list);
422 		return 0;
423 	}
424 
425 regular_page:
426 	if (pmd_trans_unstable(pmd))
427 		return 0;
428 #endif
429 	tlb_change_page_size(tlb, PAGE_SIZE);
430 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
431 	flush_tlb_batched_pending(mm);
432 	arch_enter_lazy_mmu_mode();
433 	for (; addr < end; pte++, addr += PAGE_SIZE) {
434 		ptent = *pte;
435 
436 		if (pte_none(ptent))
437 			continue;
438 
439 		if (!pte_present(ptent))
440 			continue;
441 
442 		page = vm_normal_page(vma, addr, ptent);
443 		if (!page)
444 			continue;
445 
446 		/*
447 		 * Creating a THP page is expensive so split it only if we
448 		 * are sure it's worth it.  Split only if we are the sole owner.
449 		 */
450 		if (PageTransCompound(page)) {
451 			if (page_mapcount(page) != 1)
452 				break;
453 			get_page(page);
454 			if (!trylock_page(page)) {
455 				put_page(page);
456 				break;
457 			}
458 			pte_unmap_unlock(orig_pte, ptl);
459 			if (split_huge_page(page)) {
460 				unlock_page(page);
461 				put_page(page);
462 				pte_offset_map_lock(mm, pmd, addr, &ptl);
463 				break;
464 			}
465 			unlock_page(page);
466 			put_page(page);
467 			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
468 			pte--;
469 			addr -= PAGE_SIZE;
470 			continue;
471 		}
472 
473 		/* Do not interfere with other mappings of this page */
474 		if (page_mapcount(page) != 1)
475 			continue;
476 
477 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
478 
479 		if (pte_young(ptent)) {
480 			ptent = ptep_get_and_clear_full(mm, addr, pte,
481 							tlb->fullmm);
482 			ptent = pte_mkold(ptent);
483 			set_pte_at(mm, addr, pte, ptent);
484 			tlb_remove_tlb_entry(tlb, pte, addr);
485 		}
486 
487 		/*
488 		 * We are deactivating a page to accelerate its reclaim.
489 		 * The VM cannot reclaim the page unless we clear PG_young.
490 		 * As a side effect, this confuses idle-page tracking,
491 		 * which will miss the recent reference history.
492 		 */
493 		ClearPageReferenced(page);
494 		test_and_clear_page_young(page);
495 		if (pageout) {
496 			if (!isolate_lru_page(page)) {
497 				if (PageUnevictable(page))
498 					putback_lru_page(page);
499 				else
500 					list_add(&page->lru, &page_list);
501 			}
502 		} else
503 			deactivate_page(page);
504 	}
505 
506 	arch_leave_lazy_mmu_mode();
507 	pte_unmap_unlock(orig_pte, ptl);
508 	if (pageout)
509 		reclaim_pages(&page_list);
510 	cond_resched();
511 
512 	return 0;
513 }
514 
515 static const struct mm_walk_ops cold_walk_ops = {
516 	.pmd_entry = madvise_cold_or_pageout_pte_range,
517 };
518 
519 static void madvise_cold_page_range(struct mmu_gather *tlb,
520 			     struct vm_area_struct *vma,
521 			     unsigned long addr, unsigned long end)
522 {
523 	struct madvise_walk_private walk_private = {
524 		.pageout = false,
525 		.tlb = tlb,
526 	};
527 
528 	tlb_start_vma(tlb, vma);
529 	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
530 	tlb_end_vma(tlb, vma);
531 }
532 
533 static long madvise_cold(struct vm_area_struct *vma,
534 			struct vm_area_struct **prev,
535 			unsigned long start_addr, unsigned long end_addr)
536 {
537 	struct mm_struct *mm = vma->vm_mm;
538 	struct mmu_gather tlb;
539 
540 	*prev = vma;
541 	if (!can_madv_lru_vma(vma))
542 		return -EINVAL;
543 
544 	lru_add_drain();
545 	tlb_gather_mmu(&tlb, mm);
546 	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
547 	tlb_finish_mmu(&tlb);
548 
549 	return 0;
550 }
551 
552 static void madvise_pageout_page_range(struct mmu_gather *tlb,
553 			     struct vm_area_struct *vma,
554 			     unsigned long addr, unsigned long end)
555 {
556 	struct madvise_walk_private walk_private = {
557 		.pageout = true,
558 		.tlb = tlb,
559 	};
560 
561 	tlb_start_vma(tlb, vma);
562 	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
563 	tlb_end_vma(tlb, vma);
564 }
565 
566 static inline bool can_do_pageout(struct vm_area_struct *vma)
567 {
568 	if (vma_is_anonymous(vma))
569 		return true;
570 	if (!vma->vm_file)
571 		return false;
572 	/*
573 	 * Page out pagecache only for non-anonymous mappings that correspond
574 	 * to files the calling process could open for writing;
575 	 * otherwise we'd be including shared non-exclusive mappings, which
576 	 * opens a side channel.
577 	 */
578 	return inode_owner_or_capable(&init_user_ns,
579 				      file_inode(vma->vm_file)) ||
580 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
581 }
582 
583 static long madvise_pageout(struct vm_area_struct *vma,
584 			struct vm_area_struct **prev,
585 			unsigned long start_addr, unsigned long end_addr)
586 {
587 	struct mm_struct *mm = vma->vm_mm;
588 	struct mmu_gather tlb;
589 
590 	*prev = vma;
591 	if (!can_madv_lru_vma(vma))
592 		return -EINVAL;
593 
594 	if (!can_do_pageout(vma))
595 		return 0;
596 
597 	lru_add_drain();
598 	tlb_gather_mmu(&tlb, mm);
599 	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
600 	tlb_finish_mmu(&tlb);
601 
602 	return 0;
603 }
604 
605 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
606 				unsigned long end, struct mm_walk *walk)
607 
608 {
609 	struct mmu_gather *tlb = walk->private;
610 	struct mm_struct *mm = tlb->mm;
611 	struct vm_area_struct *vma = walk->vma;
612 	spinlock_t *ptl;
613 	pte_t *orig_pte, *pte, ptent;
614 	struct page *page;
615 	int nr_swap = 0;
616 	unsigned long next;
617 
618 	next = pmd_addr_end(addr, end);
619 	if (pmd_trans_huge(*pmd))
620 		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
621 			goto next;
622 
623 	if (pmd_trans_unstable(pmd))
624 		return 0;
625 
626 	tlb_change_page_size(tlb, PAGE_SIZE);
627 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
628 	flush_tlb_batched_pending(mm);
629 	arch_enter_lazy_mmu_mode();
630 	for (; addr != end; pte++, addr += PAGE_SIZE) {
631 		ptent = *pte;
632 
633 		if (pte_none(ptent))
634 			continue;
635 		/*
636 		 * If the pte holds a swap entry, just clear the page table
637 		 * entry: swapping the page back in would be more expensive
638 		 * than a fresh page allocation plus zeroing.
639 		 */
640 		if (!pte_present(ptent)) {
641 			swp_entry_t entry;
642 
643 			entry = pte_to_swp_entry(ptent);
644 			if (non_swap_entry(entry))
645 				continue;
646 			nr_swap--;
647 			free_swap_and_cache(entry);
648 			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
649 			continue;
650 		}
651 
652 		page = vm_normal_page(vma, addr, ptent);
653 		if (!page)
654 			continue;
655 
656 		/*
657 		 * If the pmd isn't transhuge but the page is a THP
658 		 * owned only by this process, split it and
659 		 * deactivate all of its pages.
660 		 */
661 		if (PageTransCompound(page)) {
662 			if (page_mapcount(page) != 1)
663 				goto out;
664 			get_page(page);
665 			if (!trylock_page(page)) {
666 				put_page(page);
667 				goto out;
668 			}
669 			pte_unmap_unlock(orig_pte, ptl);
670 			if (split_huge_page(page)) {
671 				unlock_page(page);
672 				put_page(page);
673 				pte_offset_map_lock(mm, pmd, addr, &ptl);
674 				goto out;
675 			}
676 			unlock_page(page);
677 			put_page(page);
678 			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
679 			pte--;
680 			addr -= PAGE_SIZE;
681 			continue;
682 		}
683 
684 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
685 
686 		if (PageSwapCache(page) || PageDirty(page)) {
687 			if (!trylock_page(page))
688 				continue;
689 			/*
690 			 * If the page is shared with others, we must not
691 			 * clear its PG_dirty flag.
692 			 */
693 			if (page_mapcount(page) != 1) {
694 				unlock_page(page);
695 				continue;
696 			}
697 
698 			if (PageSwapCache(page) && !try_to_free_swap(page)) {
699 				unlock_page(page);
700 				continue;
701 			}
702 
703 			ClearPageDirty(page);
704 			unlock_page(page);
705 		}
706 
707 		if (pte_young(ptent) || pte_dirty(ptent)) {
708 			/*
709 			 * Some architectures (e.g. PPC) don't update the TLB
710 			 * with set_pte_at() and tlb_remove_tlb_entry(), so for
711 			 * portability, re-install the pte as old and clean
712 			 * after clearing it.
713 			 */
714 			ptent = ptep_get_and_clear_full(mm, addr, pte,
715 							tlb->fullmm);
716 
717 			ptent = pte_mkold(ptent);
718 			ptent = pte_mkclean(ptent);
719 			set_pte_at(mm, addr, pte, ptent);
720 			tlb_remove_tlb_entry(tlb, pte, addr);
721 		}
722 		mark_page_lazyfree(page);
723 	}
724 out:
725 	if (nr_swap) {
726 		if (current->mm == mm)
727 			sync_mm_rss(mm);
728 
729 		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
730 	}
731 	arch_leave_lazy_mmu_mode();
732 	pte_unmap_unlock(orig_pte, ptl);
733 	cond_resched();
734 next:
735 	return 0;
736 }
737 
738 static const struct mm_walk_ops madvise_free_walk_ops = {
739 	.pmd_entry		= madvise_free_pte_range,
740 };
741 
742 static int madvise_free_single_vma(struct vm_area_struct *vma,
743 			unsigned long start_addr, unsigned long end_addr)
744 {
745 	struct mm_struct *mm = vma->vm_mm;
746 	struct mmu_notifier_range range;
747 	struct mmu_gather tlb;
748 
749 	/* MADV_FREE works only for anonymous VMAs at the moment */
750 	if (!vma_is_anonymous(vma))
751 		return -EINVAL;
752 
753 	range.start = max(vma->vm_start, start_addr);
754 	if (range.start >= vma->vm_end)
755 		return -EINVAL;
756 	range.end = min(vma->vm_end, end_addr);
757 	if (range.end <= vma->vm_start)
758 		return -EINVAL;
759 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
760 				range.start, range.end);
761 
762 	lru_add_drain();
763 	tlb_gather_mmu(&tlb, mm);
764 	update_hiwater_rss(mm);
765 
766 	mmu_notifier_invalidate_range_start(&range);
767 	tlb_start_vma(&tlb, vma);
768 	walk_page_range(vma->vm_mm, range.start, range.end,
769 			&madvise_free_walk_ops, &tlb);
770 	tlb_end_vma(&tlb, vma);
771 	mmu_notifier_invalidate_range_end(&range);
772 	tlb_finish_mmu(&tlb);
773 
774 	return 0;
775 }
776 
777 /*
778  * Application no longer needs these pages.  If the pages are dirty,
779  * it's OK to just throw them away.  The app will be more careful about
780  * data it wants to keep.  Be sure to free swap resources too.  The
781  * zap_page_range call sets things up for shrink_active_list to actually free
782  * these pages later if no one else has touched them in the meantime,
783  * although we could add these pages to a global reuse list for
784  * shrink_active_list to pick up before reclaiming other pages.
785  *
786  * NB: This interface discards data rather than pushes it out to swap,
787  * as some implementations do.  This has performance implications for
788  * applications like large transactional databases which want to discard
789  * pages in anonymous maps after committing to backing store the data
790  * that was kept in them.  There is no reason to write this data out to
791  * the swap area if the application is discarding it.
792  *
793  * An interface that causes the system to free clean pages and flush
794  * dirty pages is already available as msync(MS_INVALIDATE).
795  */
796 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
797 					unsigned long start, unsigned long end)
798 {
799 	zap_page_range(vma, start, end - start);
800 	return 0;
801 }
802 
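/*
 * Illustrative userspace sketch of the MADV_DONTNEED semantics described
 * above (a minimal example under stated assumptions: a private anonymous
 * mapping, glibc headers, error handling kept to a minimum).  After the
 * call the old contents are gone and the range reads back as zero-filled
 * pages.
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 16 * 4096;
 *		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *		memset(p, 0xaa, len);			// dirty the range
 *		if (madvise(p, len, MADV_DONTNEED))	// discard it
 *			perror("madvise");
 *		printf("%d\n", p[0]);			// prints 0
 *		munmap(p, len);
 *		return 0;
 *	}
 */
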
803 static long madvise_dontneed_free(struct vm_area_struct *vma,
804 				  struct vm_area_struct **prev,
805 				  unsigned long start, unsigned long end,
806 				  int behavior)
807 {
808 	struct mm_struct *mm = vma->vm_mm;
809 
810 	*prev = vma;
811 	if (!can_madv_lru_vma(vma))
812 		return -EINVAL;
813 
814 	if (!userfaultfd_remove(vma, start, end)) {
815 		*prev = NULL; /* mmap_lock has been dropped, prev is stale */
816 
817 		mmap_read_lock(mm);
818 		vma = find_vma(mm, start);
819 		if (!vma)
820 			return -ENOMEM;
821 		if (start < vma->vm_start) {
822 			/*
823 			 * This "vma" under revalidation is the one
824 			 * with the lowest vma->vm_start where start
825 			 * is also < vma->vm_end. If start <
826 			 * vma->vm_start it means a hole materialized
827 			 * in the user address space within the
828 			 * virtual range passed to MADV_DONTNEED
829 			 * or MADV_FREE.
830 			 */
831 			return -ENOMEM;
832 		}
833 		if (!can_madv_lru_vma(vma))
834 			return -EINVAL;
835 		if (end > vma->vm_end) {
836 			/*
837 			 * Don't fail if end > vma->vm_end. If the old
838 			 * vma was split while the mmap_lock was
839 			 * released, the concurrent operation does
840 			 * not make the result of madvise()
841 			 * undefined. There may be an
842 			 * adjacent next vma that we'll walk
843 			 * next. userfaultfd_remove() will generate an
844 			 * UFFD_EVENT_REMOVE repetition on the
845 			 * end-vma->vm_end range, but the manager can
846 			 * handle a repetition fine.
847 			 */
848 			end = vma->vm_end;
849 		}
850 		VM_WARN_ON(start >= end);
851 	}
852 
853 	if (behavior == MADV_DONTNEED)
854 		return madvise_dontneed_single_vma(vma, start, end);
855 	else if (behavior == MADV_FREE)
856 		return madvise_free_single_vma(vma, start, end);
857 	else
858 		return -EINVAL;
859 }
860 
861 static long madvise_populate(struct vm_area_struct *vma,
862 			     struct vm_area_struct **prev,
863 			     unsigned long start, unsigned long end,
864 			     int behavior)
865 {
866 	const bool write = behavior == MADV_POPULATE_WRITE;
867 	struct mm_struct *mm = vma->vm_mm;
868 	unsigned long tmp_end;
869 	int locked = 1;
870 	long pages;
871 
872 	*prev = vma;
873 
874 	while (start < end) {
875 		/*
876 		 * We might have temporarily dropped the lock. For example,
877 		 * our VMA might have been split.
878 		 */
879 		if (!vma || start >= vma->vm_end) {
880 			vma = find_vma(mm, start);
881 			if (!vma || start < vma->vm_start)
882 				return -ENOMEM;
883 		}
884 
885 		tmp_end = min_t(unsigned long, end, vma->vm_end);
886 		/* Populate (prefault) page tables readable/writable. */
887 		pages = faultin_vma_page_range(vma, start, tmp_end, write,
888 					       &locked);
889 		if (!locked) {
890 			mmap_read_lock(mm);
891 			locked = 1;
892 			*prev = NULL;
893 			vma = NULL;
894 		}
895 		if (pages < 0) {
896 			switch (pages) {
897 			case -EINTR:
898 				return -EINTR;
899 			case -EINVAL: /* Incompatible mappings / permissions. */
900 				return -EINVAL;
901 			case -EHWPOISON:
902 				return -EHWPOISON;
903 			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
904 				return -EFAULT;
905 			default:
906 				pr_warn_once("%s: unhandled return value: %ld\n",
907 					     __func__, pages);
908 				fallthrough;
909 			case -ENOMEM:
910 				return -ENOMEM;
911 			}
912 		}
913 		start += pages * PAGE_SIZE;
914 	}
915 	return 0;
916 }
917 
918 /*
919  * Application wants to free up the pages and associated backing store.
920  * This is effectively punching a hole into the middle of a file.
921  */
922 static long madvise_remove(struct vm_area_struct *vma,
923 				struct vm_area_struct **prev,
924 				unsigned long start, unsigned long end)
925 {
926 	loff_t offset;
927 	int error;
928 	struct file *f;
929 	struct mm_struct *mm = vma->vm_mm;
930 
931 	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
932 
933 	if (vma->vm_flags & VM_LOCKED)
934 		return -EINVAL;
935 
936 	f = vma->vm_file;
937 
938 	if (!f || !f->f_mapping || !f->f_mapping->host) {
939 		return -EINVAL;
940 	}
941 
942 	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
943 		return -EACCES;
944 
945 	offset = (loff_t)(start - vma->vm_start)
946 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
947 
948 	/*
949 	 * Filesystem's fallocate may need to take i_rwsem.  We need to
950 	 * explicitly grab a reference because the vma (and hence the
951 	 * vma's reference to the file) can go away as soon as we drop
952 	 * mmap_lock.
953 	 */
954 	get_file(f);
955 	if (userfaultfd_remove(vma, start, end)) {
956 		/* mmap_lock was not released by userfaultfd_remove() */
957 		mmap_read_unlock(mm);
958 	}
959 	error = vfs_fallocate(f,
960 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
961 				offset, end - start);
962 	fput(f);
963 	mmap_read_lock(mm);
964 	return error;
965 }
966 
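/*
 * Illustrative userspace sketch of the hole-punching behaviour implemented
 * above (a minimal example under stated assumptions: "fd" is a file on a
 * filesystem that supports hole punching such as tmpfs or ext4, and "off"
 * and "chunk" are page-aligned):
 *
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	// Drop the pages and the backing blocks for one chunk; roughly
 *	// equivalent to fallocate(fd, FALLOC_FL_PUNCH_HOLE |
 *	//                         FALLOC_FL_KEEP_SIZE, off, chunk).
 *	madvise(p + off, chunk, MADV_REMOVE);
 */
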
967 /*
968  * Apply an madvise behavior to a region of a vma.  madvise_update_vma
969  * will handle splitting a vm area into separate areas, each area with its own
970  * behavior.
971  */
972 static int madvise_vma_behavior(struct vm_area_struct *vma,
973 				struct vm_area_struct **prev,
974 				unsigned long start, unsigned long end,
975 				unsigned long behavior)
976 {
977 	int error;
978 	unsigned long new_flags = vma->vm_flags;
979 
980 	switch (behavior) {
981 	case MADV_REMOVE:
982 		return madvise_remove(vma, prev, start, end);
983 	case MADV_WILLNEED:
984 		return madvise_willneed(vma, prev, start, end);
985 	case MADV_COLD:
986 		return madvise_cold(vma, prev, start, end);
987 	case MADV_PAGEOUT:
988 		return madvise_pageout(vma, prev, start, end);
989 	case MADV_FREE:
990 	case MADV_DONTNEED:
991 		return madvise_dontneed_free(vma, prev, start, end, behavior);
992 	case MADV_POPULATE_READ:
993 	case MADV_POPULATE_WRITE:
994 		return madvise_populate(vma, prev, start, end, behavior);
995 	case MADV_NORMAL:
996 		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
997 		break;
998 	case MADV_SEQUENTIAL:
999 		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
1000 		break;
1001 	case MADV_RANDOM:
1002 		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
1003 		break;
1004 	case MADV_DONTFORK:
1005 		new_flags |= VM_DONTCOPY;
1006 		break;
1007 	case MADV_DOFORK:
1008 		if (vma->vm_flags & VM_IO)
1009 			return -EINVAL;
1010 		new_flags &= ~VM_DONTCOPY;
1011 		break;
1012 	case MADV_WIPEONFORK:
1013 		/* MADV_WIPEONFORK is only supported on anonymous memory. */
1014 		if (vma->vm_file || vma->vm_flags & VM_SHARED)
1015 			return -EINVAL;
1016 		new_flags |= VM_WIPEONFORK;
1017 		break;
1018 	case MADV_KEEPONFORK:
1019 		new_flags &= ~VM_WIPEONFORK;
1020 		break;
1021 	case MADV_DONTDUMP:
1022 		new_flags |= VM_DONTDUMP;
1023 		break;
1024 	case MADV_DODUMP:
1025 		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
1026 			return -EINVAL;
1027 		new_flags &= ~VM_DONTDUMP;
1028 		break;
1029 	case MADV_MERGEABLE:
1030 	case MADV_UNMERGEABLE:
1031 		error = ksm_madvise(vma, start, end, behavior, &new_flags);
1032 		if (error)
1033 			goto out;
1034 		break;
1035 	case MADV_HUGEPAGE:
1036 	case MADV_NOHUGEPAGE:
1037 		error = hugepage_madvise(vma, &new_flags, behavior);
1038 		if (error)
1039 			goto out;
1040 		break;
1041 	}
1042 
1043 	error = madvise_update_vma(vma, prev, start, end, new_flags,
1044 				   vma_anon_name(vma));
1045 
1046 out:
1047 	/*
1048 	 * madvise() returns EAGAIN if kernel resources, such as
1049 	 * slab, are temporarily unavailable.
1050 	 */
1051 	if (error == -ENOMEM)
1052 		error = -EAGAIN;
1053 	return error;
1054 }
1055 
1056 #ifdef CONFIG_MEMORY_FAILURE
1057 /*
1058  * Error injection support for memory error handling.
1059  */
1060 static int madvise_inject_error(int behavior,
1061 		unsigned long start, unsigned long end)
1062 {
1063 	unsigned long size;
1064 
1065 	if (!capable(CAP_SYS_ADMIN))
1066 		return -EPERM;
1067 
1068 
1069 	for (; start < end; start += size) {
1070 		unsigned long pfn;
1071 		struct page *page;
1072 		int ret;
1073 
1074 		ret = get_user_pages_fast(start, 1, 0, &page);
1075 		if (ret != 1)
1076 			return ret;
1077 		pfn = page_to_pfn(page);
1078 
1079 		/*
1080 		 * When soft offlining hugepages, after migrating the page
1081 		 * we dissolve it, therefore in the second loop "page" will
1082 		 * we dissolve it, so on later iterations "page" will
1083 		 */
1084 		size = page_size(compound_head(page));
1085 
1086 		if (behavior == MADV_SOFT_OFFLINE) {
1087 			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
1088 				 pfn, start);
1089 			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
1090 		} else {
1091 			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
1092 				 pfn, start);
1093 			ret = memory_failure(pfn, MF_COUNT_INCREASED);
1094 		}
1095 
1096 		if (ret)
1097 			return ret;
1098 	}
1099 
1100 	return 0;
1101 }
1102 #endif
1103 
1104 static bool
1105 madvise_behavior_valid(int behavior)
1106 {
1107 	switch (behavior) {
1108 	case MADV_DOFORK:
1109 	case MADV_DONTFORK:
1110 	case MADV_NORMAL:
1111 	case MADV_SEQUENTIAL:
1112 	case MADV_RANDOM:
1113 	case MADV_REMOVE:
1114 	case MADV_WILLNEED:
1115 	case MADV_DONTNEED:
1116 	case MADV_FREE:
1117 	case MADV_COLD:
1118 	case MADV_PAGEOUT:
1119 	case MADV_POPULATE_READ:
1120 	case MADV_POPULATE_WRITE:
1121 #ifdef CONFIG_KSM
1122 	case MADV_MERGEABLE:
1123 	case MADV_UNMERGEABLE:
1124 #endif
1125 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1126 	case MADV_HUGEPAGE:
1127 	case MADV_NOHUGEPAGE:
1128 #endif
1129 	case MADV_DONTDUMP:
1130 	case MADV_DODUMP:
1131 	case MADV_WIPEONFORK:
1132 	case MADV_KEEPONFORK:
1133 #ifdef CONFIG_MEMORY_FAILURE
1134 	case MADV_SOFT_OFFLINE:
1135 	case MADV_HWPOISON:
1136 #endif
1137 		return true;
1138 
1139 	default:
1140 		return false;
1141 	}
1142 }
1143 
1144 static bool
1145 process_madvise_behavior_valid(int behavior)
1146 {
1147 	switch (behavior) {
1148 	case MADV_COLD:
1149 	case MADV_PAGEOUT:
1150 	case MADV_WILLNEED:
1151 		return true;
1152 	default:
1153 		return false;
1154 	}
1155 }
1156 
1157 /*
1158  * Walk the vmas in range [start,end), and call the visit function on each one.
1159  * The visit function will get start and end parameters that cover the overlap
1160  * between the current vma and the original range.  Any unmapped regions in the
1161  * original range will result in this function returning -ENOMEM while still
1162  * calling the visit function on all of the existing vmas in the range.
1163  * Must be called with the mmap_lock held for reading or writing.
1164  */
1165 static
1166 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
1167 		      unsigned long end, unsigned long arg,
1168 		      int (*visit)(struct vm_area_struct *vma,
1169 				   struct vm_area_struct **prev, unsigned long start,
1170 				   unsigned long end, unsigned long arg))
1171 {
1172 	struct vm_area_struct *vma;
1173 	struct vm_area_struct *prev;
1174 	unsigned long tmp;
1175 	int unmapped_error = 0;
1176 
1177 	/*
1178 	 * If the interval [start,end) covers some unmapped address
1179 	 * ranges, just ignore them, but return -ENOMEM at the end.
1180 	 * - different from the way of handling in mlock etc.
1181 	 * This differs from how mlock etc. handle unmapped ranges.
1182 	vma = find_vma_prev(mm, start, &prev);
1183 	if (vma && start > vma->vm_start)
1184 		prev = vma;
1185 
1186 	for (;;) {
1187 		int error;
1188 
1189 		/* Still start < end. */
1190 		if (!vma)
1191 			return -ENOMEM;
1192 
1193 		/* Here start < (end|vma->vm_end). */
1194 		if (start < vma->vm_start) {
1195 			unmapped_error = -ENOMEM;
1196 			start = vma->vm_start;
1197 			if (start >= end)
1198 				break;
1199 		}
1200 
1201 		/* Here vma->vm_start <= start < (end|vma->vm_end) */
1202 		tmp = vma->vm_end;
1203 		if (end < tmp)
1204 			tmp = end;
1205 
1206 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1207 		error = visit(vma, &prev, start, tmp, arg);
1208 		if (error)
1209 			return error;
1210 		start = tmp;
1211 		if (prev && start < prev->vm_end)
1212 			start = prev->vm_end;
1213 		if (start >= end)
1214 			break;
1215 		if (prev)
1216 			vma = prev->vm_next;
1217 		else	/* madvise_remove dropped mmap_lock */
1218 			vma = find_vma(mm, start);
1219 	}
1220 
1221 	return unmapped_error;
1222 }
1223 
1224 #ifdef CONFIG_ANON_VMA_NAME
1225 static int madvise_vma_anon_name(struct vm_area_struct *vma,
1226 				 struct vm_area_struct **prev,
1227 				 unsigned long start, unsigned long end,
1228 				 unsigned long name)
1229 {
1230 	int error;
1231 
1232 	/* Only anonymous mappings can be named */
1233 	if (vma->vm_file)
1234 		return -EBADF;
1235 
1236 	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
1237 				   (const char *)name);
1238 
1239 	/*
1240 	 * madvise() returns EAGAIN if kernel resources, such as
1241 	 * slab, are temporarily unavailable.
1242 	 */
1243 	if (error == -ENOMEM)
1244 		error = -EAGAIN;
1245 	return error;
1246 }
1247 
1248 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
1249 			  unsigned long len_in, const char *name)
1250 {
1251 	unsigned long end;
1252 	unsigned long len;
1253 
1254 	if (start & ~PAGE_MASK)
1255 		return -EINVAL;
1256 	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
1257 
1258 	/* Check to see whether len was rounded up from small -ve to zero */
1259 	if (len_in && !len)
1260 		return -EINVAL;
1261 
1262 	end = start + len;
1263 	if (end < start)
1264 		return -EINVAL;
1265 
1266 	if (end == start)
1267 		return 0;
1268 
1269 	return madvise_walk_vmas(mm, start, end, (unsigned long)name,
1270 				 madvise_vma_anon_name);
1271 }
1272 #endif /* CONFIG_ANON_VMA_NAME */
1273 /*
1274  * The madvise(2) system call.
1275  *
1276  * Applications can use madvise() to advise the kernel how it should
1277  * handle paging I/O in this VM area.  The idea is to help the kernel
1278  * use appropriate read-ahead and caching techniques.  The information
1279  * provided is advisory only, and can be safely disregarded by the
1280  * kernel without affecting the correct operation of the application.
1281  *
1282  * behavior values:
1283  *  MADV_NORMAL - the default behavior is to read clusters.  This
1284  *		results in some read-ahead and read-behind.
1285  *  MADV_RANDOM - the system should read the minimum amount of data
1286  *		on any access, since it is unlikely that the
1287  *		application will need more than what it asks for.
1288  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
1289  *		once, so they can be aggressively read ahead, and
1290  *		can be freed soon after they are accessed.
1291  *  MADV_WILLNEED - the application is notifying the system to read
1292  *		some pages ahead.
1293  *  MADV_DONTNEED - the application is finished with the given range,
1294  *		so the kernel can free resources associated with it.
1295  *  MADV_FREE - the application marks pages in the given range as lazy free,
1296  *		where actual purges are postponed until memory pressure happens.
1297  *  MADV_REMOVE - the application wants to free up the given range of
1298  *		pages and associated backing store.
1299  *  MADV_DONTFORK - omit this area from child's address space when forking:
1300  *		typically, to avoid COWing pages pinned by get_user_pages().
1301  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1302  *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
1303  *              range after a fork.
1304  *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1305  *  MADV_HWPOISON - trigger memory error handler as if the given memory range
1306  *		were corrupted by unrecoverable hardware memory failure.
1307  *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1308  *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1309  *		this area with pages of identical content from other such areas.
1310  *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
1311  *  MADV_HUGEPAGE - the application wants to back the given range by transparent
1312  *		huge pages in the future. Existing pages might be coalesced and
1313  *		new pages might be allocated as THP.
1314  *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1315  *		transparent huge pages so the existing pages will not be
1316  *		coalesced into THP and new pages will not be allocated as THP.
1317  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
1318  *		from being included in its core dump.
1319  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1320  *  MADV_COLD - the application is not expected to use this memory soon,
1321  *		deactivate pages in this range so that they can be reclaimed
1322  *		easily if memory pressure happens.
1323  *  MADV_PAGEOUT - the application is not expected to use this memory soon,
1324  *		page out the pages in this range immediately.
1325  *  MADV_POPULATE_READ - populate (prefault) page tables readable by
1326  *		triggering read faults if required
1327  *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
1328  *		triggering write faults if required
1329  *
1330  * return values:
1331  *  zero    - success
1332  *  -EINVAL - start + len < 0, start is not page-aligned,
1333  *		"behavior" is not a valid value, or application
1334  *		is attempting to release locked or shared pages,
1335  *		or the specified address range includes a file, Huge TLB,
1336  *		MAP_SHARED or VM_PFNMAP range.
1337  *  -ENOMEM - addresses in the specified range are not currently
1338  *		mapped, or are outside the AS of the process.
1339  *  -EIO    - an I/O error occurred while paging in data.
1340  *  -EBADF  - map exists, but area maps something that isn't a file.
1341  *  -EAGAIN - a kernel resource was temporarily unavailable.
1342  */
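/*
 * Illustrative userspace sketch of the call pattern documented above (a
 * minimal example under stated assumptions: glibc headers and a regular,
 * non-empty file at "path").  Map a file read-only and hint that it will
 * be scanned sequentially and is wanted soon, so the kernel can tune and
 * start readahead.
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	static void *map_for_scan(const char *path, size_t *lenp)
 *	{
 *		struct stat st;
 *		void *p;
 *		int fd = open(path, O_RDONLY);
 *
 *		if (fd < 0)
 *			return NULL;
 *		if (fstat(fd, &st) < 0) {
 *			close(fd);
 *			return NULL;
 *		}
 *		p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
 *		close(fd);		// the mapping keeps the file pinned
 *		if (p == MAP_FAILED)
 *			return NULL;
 *		madvise(p, st.st_size, MADV_SEQUENTIAL);  // tune readahead
 *		madvise(p, st.st_size, MADV_WILLNEED);    // schedule I/O now
 *		*lenp = st.st_size;
 *		return p;
 *	}
 */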
1343 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
1344 {
1345 	unsigned long end;
1346 	int error;
1347 	int write;
1348 	size_t len;
1349 	struct blk_plug plug;
1350 
1351 	start = untagged_addr(start);
1352 
1353 	if (!madvise_behavior_valid(behavior))
1354 		return -EINVAL;
1355 
1356 	if (!PAGE_ALIGNED(start))
1357 		return -EINVAL;
1358 	len = PAGE_ALIGN(len_in);
1359 
1360 	/* Check to see whether len was rounded up from small -ve to zero */
1361 	if (len_in && !len)
1362 		return -EINVAL;
1363 
1364 	end = start + len;
1365 	if (end < start)
1366 		return -EINVAL;
1367 
1368 	if (end == start)
1369 		return 0;
1370 
1371 #ifdef CONFIG_MEMORY_FAILURE
1372 	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
1373 		return madvise_inject_error(behavior, start, start + len_in);
1374 #endif
1375 
1376 	write = madvise_need_mmap_write(behavior);
1377 	if (write) {
1378 		if (mmap_write_lock_killable(mm))
1379 			return -EINTR;
1380 	} else {
1381 		mmap_read_lock(mm);
1382 	}
1383 
1384 	blk_start_plug(&plug);
1385 	error = madvise_walk_vmas(mm, start, end, behavior,
1386 			madvise_vma_behavior);
1387 	blk_finish_plug(&plug);
1388 	if (write)
1389 		mmap_write_unlock(mm);
1390 	else
1391 		mmap_read_unlock(mm);
1392 
1393 	return error;
1394 }
1395 
1396 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1397 {
1398 	return do_madvise(current->mm, start, len_in, behavior);
1399 }
1400 
1401 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
1402 		size_t, vlen, int, behavior, unsigned int, flags)
1403 {
1404 	ssize_t ret;
1405 	struct iovec iovstack[UIO_FASTIOV], iovec;
1406 	struct iovec *iov = iovstack;
1407 	struct iov_iter iter;
1408 	struct task_struct *task;
1409 	struct mm_struct *mm;
1410 	size_t total_len;
1411 	unsigned int f_flags;
1412 
1413 	if (flags != 0) {
1414 		ret = -EINVAL;
1415 		goto out;
1416 	}
1417 
1418 	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1419 	if (ret < 0)
1420 		goto out;
1421 
1422 	task = pidfd_get_task(pidfd, &f_flags);
1423 	if (IS_ERR(task)) {
1424 		ret = PTR_ERR(task);
1425 		goto free_iov;
1426 	}
1427 
1428 	if (!process_madvise_behavior_valid(behavior)) {
1429 		ret = -EINVAL;
1430 		goto release_task;
1431 	}
1432 
1433 	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
1434 	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
1435 	if (IS_ERR_OR_NULL(mm)) {
1436 		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
1437 		goto release_task;
1438 	}
1439 
1440 	/*
1441 	 * Require CAP_SYS_NICE for influencing process performance. Note that
1442 	 * only non-destructive hints are currently supported.
1443 	 */
1444 	if (!capable(CAP_SYS_NICE)) {
1445 		ret = -EPERM;
1446 		goto release_mm;
1447 	}
1448 
1449 	total_len = iov_iter_count(&iter);
1450 
1451 	while (iov_iter_count(&iter)) {
1452 		iovec = iov_iter_iovec(&iter);
1453 		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
1454 					iovec.iov_len, behavior);
1455 		if (ret < 0)
1456 			break;
1457 		iov_iter_advance(&iter, iovec.iov_len);
1458 	}
1459 
1460 	if (ret == 0)
1461 		ret = total_len - iov_iter_count(&iter);
1462 
1463 release_mm:
1464 	mmput(mm);
1465 release_task:
1466 	put_task_struct(task);
1467 free_iov:
1468 	kfree(iov);
1469 out:
1470 	return ret;
1471 }
1472
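/*
 * Illustrative userspace sketch of process_madvise(2) (a minimal example
 * under stated assumptions: SYS_process_madvise is provided by the libc
 * headers, otherwise __NR_process_madvise can be used; the caller needs
 * CAP_SYS_NICE plus PTRACE_MODE_READ on the target, as checked above).
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	// Ask the kernel to page out one range of another process's memory.
 *	static long pageout_remote(int pidfd, void *remote_addr, size_t len)
 *	{
 *		struct iovec iov = {
 *			.iov_base = remote_addr,
 *			.iov_len  = len,
 *		};
 *
 *		return syscall(SYS_process_madvise, pidfd, &iov, 1,
 *			       MADV_PAGEOUT, 0);
 *	}
 *
 * The pidfd would typically come from pidfd_open(2), and only the
 * non-destructive hints accepted by process_madvise_behavior_valid()
 * above are allowed.
 */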