// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

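/*
 * Walk-private state shared by the MADV_COLD/MADV_PAGEOUT page table
 * walkers: the TLB gather being used and whether folios should be
 * reclaimed immediately (pageout) or merely deactivated.
 */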
struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_COLLAPSE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags on a region of a vma, splitting it or merging it as
 * necessary.  Must be called with mmap_lock held for writing; the caller
 * should ensure anon_name stability by raising its refcount even when
 * anon_name belongs to a valid vma, because this function might free that
 * vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	pgoff_t pgoff;
	VMA_ITERATOR(vmi, mm, start);

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(&vmi, mm, *prev, start, end, new_flags,
			  vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, anon_name);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(&vmi, vma, start, 1);
		if (error)
			return error;
	}

	if (end != vma->vm_end) {
		error = split_vma(&vmi, vma, end, 0);
		if (error)
			return error;
	}

success:
	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vm_flags_reset(vma, new_flags);
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
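/*
 * Walk the ptes of an anonymous VMA and start asynchronous swap-in
 * (read_swap_cache_async()) for every swap entry found, so MADV_WILLNEED
 * can prefault the data without waiting for the I/O here.
 */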
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;

		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, addr, &splug);
		if (page)
			put_page(page);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
	.walk_lock = PGWALK_RDLOCK,
};
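/*
 * Prefault the swapped-out portion of a shmem mapping for MADV_WILLNEED:
 * scan the mapping's xarray for swap entries in [start, end) and kick off
 * asynchronous swap-in for each of them.
 */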
static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct page *page;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		if (!xa_is_value(page))
			continue;
		entry = radix_to_swp_entry(page);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		xas_pause(&xas);
		rcu_read_unlock();

		page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					     vma, addr, &splug);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}
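/*
 * Return true if the calling process may page out the file pages of this
 * VMA, i.e. it is the owner of (or capable over) the backing inode, or it
 * has write permission to the file.
 */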
static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * Page out pagecache only for non-anonymous mappings that correspond
	 * to files the calling process could (if it tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

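/*
 * Page table walker shared by MADV_COLD and MADV_PAGEOUT: age the mapped
 * folios (clear the referenced/young bits) and either deactivate them
 * (cold) or isolate them for immediate reclaim (pageout).
 */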
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;

	if (fatal_signal_pending(current))
		return -EINTR;

	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pfn_folio(pmd_pfn(orig_pmd));

		/* Do not interfere with other mappings of this folio */
		if (folio_estimated_sharers(folio) != 1)
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it.  Split it if we are the only owner.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_estimated_sharers(folio) != 1)
				break;
			if (pageout_anon_only_filter && !folio_test_anon(folio))
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/*
		 * Do not interfere with other mappings of this folio, and
		 * skip non-LRU folios.
		 */
		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a folio to accelerate its reclaim.
		 * The VM can't reclaim the folio unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

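/*
 * MADV_COLD: deactivate the pages in the range so that they are more
 * likely to be reclaimed under memory pressure; nothing is freed or
 * written back here.
 */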
static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

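/*
 * MADV_PAGEOUT: reclaim the pages in the range right away instead of
 * waiting for memory pressure to do it.
 */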
static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages that can be paged out even if this process is neither
	 * the owner of nor write-capable on the file.  So private file
	 * mappings are additionally allowed to page out dirty anon pages.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

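/*
 * Page table walker for MADV_FREE: drop swap entries outright, clear the
 * dirty and young bits of mapped anonymous folios and mark them lazyfree,
 * so reclaim can discard them unless they are written to again.
 */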
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has a swp_entry, just clear the page table
		 * entry to prevent swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				nr_swap--;
				free_swap_and_cache(entry);
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If pmd isn't transhuge but the folio is large and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_estimated_sharers(folio) != 1)
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If folio is shared with others, we mustn't clear
			 * the folio's dirty flag.
			 */
			if (folio_mapcount(folio) != 1) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so for
			 * portability, remap the pte as old|clean after
			 * clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

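/*
 * Apply MADV_FREE to the intersection of one (anonymous) VMA and the
 * requested range, under an mmu_notifier invalidation and a TLB gather.
 */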
static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range_single(vma, start, end - start, NULL);
	return 0;
}

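/*
 * Check whether MADV_DONTNEED/MADV_FREE is allowed on this VMA and, for
 * hugetlb mappings, trim *end down to a huge page boundary.
 */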
static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	/*
	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
	 * boundaries, and may be unaware that this VMA uses huge pages.
	 * Avoid unexpected data loss by rounding down the number of
	 * huge pages freed.
	 */
	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

	return true;
}

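/*
 * Common entry point for MADV_DONTNEED, MADV_DONTNEED_LOCKED and
 * MADV_FREE.  userfaultfd_remove() may temporarily drop mmap_lock, in
 * which case the lock is re-taken and the VMA is looked up again.
 */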
static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	if (start == end)
		return 0;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = vma_lookup(mm, start);
		if (!vma)
			return -ENOMEM;
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released, the concurrent operation need not
			 * cause madvise() to have an undefined
			 * result. There may be an adjacent next vma
			 * that we'll walk next. userfaultfd_remove()
			 * will generate an UFFD_EVENT_REMOVE
			 * repetition on the end-vma->vm_end range,
			 * but the manager can handle a repetition
			 * fine.
			 */
			end = vma->vm_end;
		}
		/*
		 * If the memory region between start and end was
		 * originally backed by 4kB pages and then remapped to
		 * be backed by hugepages while mmap_lock was dropped,
		 * the adjustment for hugetlb vma above may have rounded
		 * end down to the start address.
		 */
		if (start == end)
			return 0;
		VM_WARN_ON(start > end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

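/*
 * MADV_POPULATE_READ/MADV_POPULATE_WRITE: prefault the page tables for the
 * range, retrying with mmap_lock re-taken whenever the fault handler had
 * to drop it.
 */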
static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_page_range(mm, start, end, write, &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM: /* No VMA or out of memory. */
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return madvise_populate(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case MADV_COLLAPSE:
		return madvise_collapse(vma, prev, start, end);
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

11119893e49dSAndi Kleen #ifdef CONFIG_MEMORY_FAILURE
11129893e49dSAndi Kleen /*
11139893e49dSAndi Kleen * Error injection support for memory error handling.
11149893e49dSAndi Kleen */
111597167a76SAnshuman Khandual static int madvise_inject_error(int behavior,
111697167a76SAnshuman Khandual unsigned long start, unsigned long end)
11179893e49dSAndi Kleen {
1118d3cd257cSYunfeng Ye unsigned long size;
111997167a76SAnshuman Khandual
11209893e49dSAndi Kleen if (!capable(CAP_SYS_ADMIN))
11219893e49dSAndi Kleen return -EPERM;
112297167a76SAnshuman Khandual
112319bfbe22SAlexandru Moise
1124d3cd257cSYunfeng Ye for (; start < end; start += size) {
112523e7b5c2SDan Williams unsigned long pfn;
1126dc7560b4SOscar Salvador struct page *page;
1127325c4ef5SAndrew Morton int ret;
1128325c4ef5SAndrew Morton
112997167a76SAnshuman Khandual ret = get_user_pages_fast(start, 1, 0, &page);
11309893e49dSAndi Kleen if (ret != 1)
11319893e49dSAndi Kleen return ret;
113223e7b5c2SDan Williams pfn = page_to_pfn(page);
1133325c4ef5SAndrew Morton
113419bfbe22SAlexandru Moise /*
113519bfbe22SAlexandru Moise * When soft offlining hugepages, after migrating the page
113619bfbe22SAlexandru Moise * we dissolve it, therefore in the second loop "page" will
1137d3cd257cSYunfeng Ye * no longer be a compound page.
113819bfbe22SAlexandru Moise */
1139d3cd257cSYunfeng Ye size = page_size(compound_head(page));
114019bfbe22SAlexandru Moise
114197167a76SAnshuman Khandual if (behavior == MADV_SOFT_OFFLINE) {
114297167a76SAnshuman Khandual pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
114323e7b5c2SDan Williams pfn, start);
1144feec24a6SNaoya Horiguchi ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
1145dc7560b4SOscar Salvador } else {
114623e7b5c2SDan Williams pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
114723e7b5c2SDan Williams pfn, start);
114867f22ba7Szhenwei pi ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
1149d1fe111fSluofei if (ret == -EOPNOTSUPP)
1150d1fe111fSluofei ret = 0;
1151dc7560b4SOscar Salvador }
1152dc7560b4SOscar Salvador
115323a003bfSNaoya Horiguchi if (ret)
115423a003bfSNaoya Horiguchi return ret;
11559893e49dSAndi Kleen }
1156c461ad6aSMel Gorman
1157325c4ef5SAndrew Morton return 0;
11589893e49dSAndi Kleen }
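/*
 * Illustrative userspace usage (not part of this file): exercising the
 * memory-failure path on a single page requires CAP_SYS_ADMIN, e.g.
 *
 *	madvise(addr, getpagesize(), MADV_HWPOISON);
 */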
11599893e49dSAndi Kleen #endif
11609893e49dSAndi Kleen
11611ecef9edSNicholas Krause static bool
116275927af8SNick Piggin madvise_behavior_valid(int behavior)
116375927af8SNick Piggin {
116475927af8SNick Piggin switch (behavior) {
116575927af8SNick Piggin case MADV_DOFORK:
116675927af8SNick Piggin case MADV_DONTFORK:
116775927af8SNick Piggin case MADV_NORMAL:
116875927af8SNick Piggin case MADV_SEQUENTIAL:
116975927af8SNick Piggin case MADV_RANDOM:
117075927af8SNick Piggin case MADV_REMOVE:
117175927af8SNick Piggin case MADV_WILLNEED:
117275927af8SNick Piggin case MADV_DONTNEED:
11739457056aSJohannes Weiner case MADV_DONTNEED_LOCKED:
1174854e9ed0SMinchan Kim case MADV_FREE:
11759c276cc6SMinchan Kim case MADV_COLD:
11761a4e58ccSMinchan Kim case MADV_PAGEOUT:
11774ca9b385SDavid Hildenbrand case MADV_POPULATE_READ:
11784ca9b385SDavid Hildenbrand case MADV_POPULATE_WRITE:
1179f8af4da3SHugh Dickins #ifdef CONFIG_KSM
1180f8af4da3SHugh Dickins case MADV_MERGEABLE:
1181f8af4da3SHugh Dickins case MADV_UNMERGEABLE:
1182f8af4da3SHugh Dickins #endif
11830af4e98bSAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE
11840af4e98bSAndrea Arcangeli case MADV_HUGEPAGE:
1185a664b2d8SAndrea Arcangeli case MADV_NOHUGEPAGE:
11867d8faaf1SZach O'Keefe case MADV_COLLAPSE:
11870af4e98bSAndrea Arcangeli #endif
1188accb61feSJason Baron case MADV_DONTDUMP:
1189accb61feSJason Baron case MADV_DODUMP:
1190d2cd9edeSRik van Riel case MADV_WIPEONFORK:
1191d2cd9edeSRik van Riel case MADV_KEEPONFORK:
11925e451be7SAnshuman Khandual #ifdef CONFIG_MEMORY_FAILURE
11935e451be7SAnshuman Khandual case MADV_SOFT_OFFLINE:
11945e451be7SAnshuman Khandual case MADV_HWPOISON:
11955e451be7SAnshuman Khandual #endif
11961ecef9edSNicholas Krause return true;
119775927af8SNick Piggin
119875927af8SNick Piggin default:
11991ecef9edSNicholas Krause return false;
120075927af8SNick Piggin }
120175927af8SNick Piggin }
12023866ea90SHugh Dickins
1203876b4a18SZach O'Keefe static bool process_madvise_behavior_valid(int behavior)
1204ecb8ac8bSMinchan Kim {
1205ecb8ac8bSMinchan Kim switch (behavior) {
1206ecb8ac8bSMinchan Kim case MADV_COLD:
1207ecb8ac8bSMinchan Kim case MADV_PAGEOUT:
1208d5fffc5aSzhangkui case MADV_WILLNEED:
1209876b4a18SZach O'Keefe case MADV_COLLAPSE:
1210ecb8ac8bSMinchan Kim return true;
1211ecb8ac8bSMinchan Kim default:
1212ecb8ac8bSMinchan Kim return false;
1213ecb8ac8bSMinchan Kim }
1214ecb8ac8bSMinchan Kim }
1215ecb8ac8bSMinchan Kim
12161da177e4SLinus Torvalds /*
1217ac1e9accSColin Cross * Walk the vmas in range [start,end), and call the visit function on each one.
1218ac1e9accSColin Cross * The visit function will get start and end parameters that cover the overlap
1219ac1e9accSColin Cross * between the current vma and the original range. Any unmapped regions in the
1220ac1e9accSColin Cross * original range will result in this function returning -ENOMEM while still
1221ac1e9accSColin Cross * calling the visit function on all of the existing vmas in the range.
1222ac1e9accSColin Cross * Must be called with the mmap_lock held for reading or writing.
1223ac1e9accSColin Cross */
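/*
 * For example, if [start,end) spans vma A, an unmapped hole, and vma B,
 * visit() is called on the portions overlapping A and B, and -ENOMEM is
 * returned because of the hole between them.
 */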
1224ac1e9accSColin Cross static
1225ac1e9accSColin Cross int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
1226ac1e9accSColin Cross unsigned long end, unsigned long arg,
1227ac1e9accSColin Cross int (*visit)(struct vm_area_struct *vma,
1228ac1e9accSColin Cross struct vm_area_struct **prev, unsigned long start,
1229ac1e9accSColin Cross unsigned long end, unsigned long arg))
1230ac1e9accSColin Cross {
1231ac1e9accSColin Cross struct vm_area_struct *vma;
1232ac1e9accSColin Cross struct vm_area_struct *prev;
1233ac1e9accSColin Cross unsigned long tmp;
1234ac1e9accSColin Cross int unmapped_error = 0;
1235ac1e9accSColin Cross
1236ac1e9accSColin Cross /*
1237ac1e9accSColin Cross * If the interval [start,end) covers some unmapped address
1238ac1e9accSColin Cross * ranges, just ignore them, but return -ENOMEM at the end.
1239ac1e9accSColin Cross * This differs from how mlock etc. handle unmapped ranges.
1240ac1e9accSColin Cross */
1241ac1e9accSColin Cross vma = find_vma_prev(mm, start, &prev);
1242ac1e9accSColin Cross if (vma && start > vma->vm_start)
1243ac1e9accSColin Cross prev = vma;
1244ac1e9accSColin Cross
1245ac1e9accSColin Cross for (;;) {
1246ac1e9accSColin Cross int error;
1247ac1e9accSColin Cross
1248ac1e9accSColin Cross /* Still start < end. */
1249ac1e9accSColin Cross if (!vma)
1250ac1e9accSColin Cross return -ENOMEM;
1251ac1e9accSColin Cross
1252ac1e9accSColin Cross /* Here start < (end|vma->vm_end). */
1253ac1e9accSColin Cross if (start < vma->vm_start) {
1254ac1e9accSColin Cross unmapped_error = -ENOMEM;
1255ac1e9accSColin Cross start = vma->vm_start;
1256ac1e9accSColin Cross if (start >= end)
1257ac1e9accSColin Cross break;
1258ac1e9accSColin Cross }
1259ac1e9accSColin Cross
1260ac1e9accSColin Cross /* Here vma->vm_start <= start < (end|vma->vm_end) */
1261ac1e9accSColin Cross tmp = vma->vm_end;
1262ac1e9accSColin Cross if (end < tmp)
1263ac1e9accSColin Cross tmp = end;
1264ac1e9accSColin Cross
1265ac1e9accSColin Cross /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1266ac1e9accSColin Cross error = visit(vma, &prev, start, tmp, arg);
1267ac1e9accSColin Cross if (error)
1268ac1e9accSColin Cross return error;
1269ac1e9accSColin Cross start = tmp;
1270ac1e9accSColin Cross if (prev && start < prev->vm_end)
1271ac1e9accSColin Cross start = prev->vm_end;
1272ac1e9accSColin Cross if (start >= end)
1273ac1e9accSColin Cross break;
1274ac1e9accSColin Cross if (prev)
127535474818SLiam R. Howlett vma = find_vma(mm, prev->vm_end);
1276ac1e9accSColin Cross else /* madvise_remove dropped mmap_lock */
1277ac1e9accSColin Cross vma = find_vma(mm, start);
1278ac1e9accSColin Cross }
1279ac1e9accSColin Cross
1280ac1e9accSColin Cross return unmapped_error;
1281ac1e9accSColin Cross }
1282ac1e9accSColin Cross
12839a10064fSColin Cross #ifdef CONFIG_ANON_VMA_NAME
12849a10064fSColin Cross static int madvise_vma_anon_name(struct vm_area_struct *vma,
12859a10064fSColin Cross struct vm_area_struct **prev,
12869a10064fSColin Cross unsigned long start, unsigned long end,
12875c26f6acSSuren Baghdasaryan unsigned long anon_name)
12889a10064fSColin Cross {
12899a10064fSColin Cross int error;
12909a10064fSColin Cross
12919a10064fSColin Cross /* Only anonymous mappings can be named */
1292d09e8ca6SPasha Tatashin if (vma->vm_file && !vma_is_anon_shmem(vma))
12939a10064fSColin Cross return -EBADF;
12949a10064fSColin Cross
12959a10064fSColin Cross error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
12965c26f6acSSuren Baghdasaryan (struct anon_vma_name *)anon_name);
12979a10064fSColin Cross
12989a10064fSColin Cross /*
12999a10064fSColin Cross * madvise() returns EAGAIN if kernel resources, such as
13009a10064fSColin Cross * slab, are temporarily unavailable.
13019a10064fSColin Cross */
13029a10064fSColin Cross if (error == -ENOMEM)
13039a10064fSColin Cross error = -EAGAIN;
13049a10064fSColin Cross return error;
13059a10064fSColin Cross }
13069a10064fSColin Cross
13079a10064fSColin Cross int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
13085c26f6acSSuren Baghdasaryan unsigned long len_in, struct anon_vma_name *anon_name)
13099a10064fSColin Cross {
13109a10064fSColin Cross unsigned long end;
13119a10064fSColin Cross unsigned long len;
13129a10064fSColin Cross
13139a10064fSColin Cross if (start & ~PAGE_MASK)
13149a10064fSColin Cross return -EINVAL;
13159a10064fSColin Cross len = (len_in + ~PAGE_MASK) & PAGE_MASK;
13169a10064fSColin Cross
13179a10064fSColin Cross /* Check to see whether len was rounded up from small -ve to zero */
13189a10064fSColin Cross if (len_in && !len)
13199a10064fSColin Cross return -EINVAL;
13209a10064fSColin Cross
13219a10064fSColin Cross end = start + len;
13229a10064fSColin Cross if (end < start)
13239a10064fSColin Cross return -EINVAL;
13249a10064fSColin Cross
13259a10064fSColin Cross if (end == start)
13269a10064fSColin Cross return 0;
13279a10064fSColin Cross
13285c26f6acSSuren Baghdasaryan return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
13299a10064fSColin Cross madvise_vma_anon_name);
13309a10064fSColin Cross }
13319a10064fSColin Cross #endif /* CONFIG_ANON_VMA_NAME */
1332ac1e9accSColin Cross /*
13331da177e4SLinus Torvalds * The madvise(2) system call.
13341da177e4SLinus Torvalds *
13351da177e4SLinus Torvalds * Applications can use madvise() to advise the kernel how it should
13361da177e4SLinus Torvalds * handle paging I/O in this VM area. The idea is to help the kernel
13371da177e4SLinus Torvalds * use appropriate read-ahead and caching techniques. The information
13381da177e4SLinus Torvalds * provided is advisory only, and can be safely disregarded by the
13391da177e4SLinus Torvalds * kernel without affecting the correct operation of the application.
13401da177e4SLinus Torvalds *
13411da177e4SLinus Torvalds * behavior values:
13421da177e4SLinus Torvalds * MADV_NORMAL - the default behavior is to read clusters. This
13431da177e4SLinus Torvalds * results in some read-ahead and read-behind.
13441da177e4SLinus Torvalds * MADV_RANDOM - the system should read the minimum amount of data
13451da177e4SLinus Torvalds * on any access, since it is unlikely that the appli-
13461da177e4SLinus Torvalds * cation will need more than what it asks for.
13471da177e4SLinus Torvalds * MADV_SEQUENTIAL - pages in the given range will probably be accessed
13481da177e4SLinus Torvalds * once, so they can be aggressively read ahead, and
13491da177e4SLinus Torvalds * can be freed soon after they are accessed.
13501da177e4SLinus Torvalds * MADV_WILLNEED - the application is notifying the system to read
13511da177e4SLinus Torvalds * some pages ahead.
13521da177e4SLinus Torvalds * MADV_DONTNEED - the application is finished with the given range,
13531da177e4SLinus Torvalds * so the kernel can free resources associated with it.
1354d7206a70SNaoya Horiguchi * MADV_FREE - the application marks pages in the given range as lazy free,
1355d7206a70SNaoya Horiguchi * where actual purges are postponed until memory pressure happens.
1356f6b3ec23SBadari Pulavarty * MADV_REMOVE - the application wants to free up the given range of
1357f6b3ec23SBadari Pulavarty * pages and associated backing store.
13583866ea90SHugh Dickins * MADV_DONTFORK - omit this area from child's address space when forking:
13593866ea90SHugh Dickins * typically, to avoid COWing pages pinned by get_user_pages().
13603866ea90SHugh Dickins * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1361c02c3009SYang Shi * MADV_WIPEONFORK - present the child process with zero-filled memory in this
1362c02c3009SYang Shi * range after a fork.
1363c02c3009SYang Shi * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1364d7206a70SNaoya Horiguchi * MADV_HWPOISON - trigger memory error handler as if the given memory range
1365d7206a70SNaoya Horiguchi * were corrupted by unrecoverable hardware memory failure.
1366d7206a70SNaoya Horiguchi * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1367f8af4da3SHugh Dickins * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1368f8af4da3SHugh Dickins * this area with pages of identical content from other such areas.
1369f8af4da3SHugh Dickins * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1370d7206a70SNaoya Horiguchi * MADV_HUGEPAGE - the application wants to back the given range by transparent
1371d7206a70SNaoya Horiguchi * huge pages in the future. Existing pages might be coalesced and
1372d7206a70SNaoya Horiguchi * new pages might be allocated as THP.
1373d7206a70SNaoya Horiguchi * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1374d7206a70SNaoya Horiguchi * transparent huge pages so the existing pages will not be
1375d7206a70SNaoya Horiguchi * coalesced into THP and new pages will not be allocated as THP.
13767d8faaf1SZach O'Keefe * MADV_COLLAPSE - synchronously coalesce pages into new THP.
1377d7206a70SNaoya Horiguchi * MADV_DONTDUMP - the application wants to prevent pages in the given range
1378d7206a70SNaoya Horiguchi * from being included in its core dump.
1379d7206a70SNaoya Horiguchi * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1380ecb8ac8bSMinchan Kim * MADV_COLD - the application is not expected to use this memory soon,
1381ecb8ac8bSMinchan Kim * deactivate pages in this range so that they can be reclaimed
1382f0953a1bSIngo Molnar * easily if memory pressure happens.
1383ecb8ac8bSMinchan Kim * MADV_PAGEOUT - the application is not expected to use this memory soon,
1384ecb8ac8bSMinchan Kim * page out the pages in this range immediately.
13854ca9b385SDavid Hildenbrand * MADV_POPULATE_READ - populate (prefault) page tables readable by
13864ca9b385SDavid Hildenbrand * triggering read faults if required
13874ca9b385SDavid Hildenbrand * MADV_POPULATE_WRITE - populate (prefault) page tables writable by
13884ca9b385SDavid Hildenbrand * triggering write faults if required
13891da177e4SLinus Torvalds *
13901da177e4SLinus Torvalds * return values:
13911da177e4SLinus Torvalds * zero - success
13921da177e4SLinus Torvalds * -EINVAL - start + len < 0, start is not page-aligned,
13931da177e4SLinus Torvalds * "behavior" is not a valid value, or application
1394c02c3009SYang Shi * is attempting to release locked or shared pages,
1395c02c3009SYang Shi * or the specified address range includes file, Huge TLB,
1396c02c3009SYang Shi * MAP_SHARED or VM_PFNMAP range.
13971da177e4SLinus Torvalds * -ENOMEM - addresses in the specified range are not currently
13981da177e4SLinus Torvalds * mapped, or are outside the AS of the process.
13991da177e4SLinus Torvalds * -EIO - an I/O error occurred while paging in data.
14001da177e4SLinus Torvalds * -EBADF - map exists, but area maps something that isn't a file.
14011da177e4SLinus Torvalds * -EAGAIN - a kernel resource was temporarily unavailable.
14021da177e4SLinus Torvalds */
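/*
 * Illustrative userspace usage (not part of this file), assuming an
 * anonymous private mapping of @len bytes at @buf:
 *
 *	madvise(buf, len, MADV_SEQUENTIAL);	- expect mostly sequential reads
 *	...
 *	madvise(buf, len, MADV_DONTNEED);	- the contents may now be discarded
 */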
14030726b01eSMinchan Kim int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
14041da177e4SLinus Torvalds {
1405ac1e9accSColin Cross unsigned long end;
1406ac1e9accSColin Cross int error;
1407f7977793SJason Baron int write;
14081da177e4SLinus Torvalds size_t len;
14091998cc04SShaohua Li struct blk_plug plug;
14101da177e4SLinus Torvalds
141175927af8SNick Piggin if (!madvise_behavior_valid(behavior))
1412ac1e9accSColin Cross return -EINVAL;
141375927af8SNick Piggin
1414df6c6500SWei Yang if (!PAGE_ALIGNED(start))
1415ac1e9accSColin Cross return -EINVAL;
1416df6c6500SWei Yang len = PAGE_ALIGN(len_in);
141784d96d89SRasmus Villemoes
141884d96d89SRasmus Villemoes /* Check to see whether len was rounded up from small -ve to zero */
141984d96d89SRasmus Villemoes if (len_in && !len)
1420ac1e9accSColin Cross return -EINVAL;
142184d96d89SRasmus Villemoes
142284d96d89SRasmus Villemoes end = start + len;
142384d96d89SRasmus Villemoes if (end < start)
1424ac1e9accSColin Cross return -EINVAL;
142584d96d89SRasmus Villemoes
142684d96d89SRasmus Villemoes if (end == start)
1427ac1e9accSColin Cross return 0;
142884d96d89SRasmus Villemoes
14295e451be7SAnshuman Khandual #ifdef CONFIG_MEMORY_FAILURE
14305e451be7SAnshuman Khandual if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
14315e451be7SAnshuman Khandual return madvise_inject_error(behavior, start, start + len_in);
14325e451be7SAnshuman Khandual #endif
14335e451be7SAnshuman Khandual
1434f7977793SJason Baron write = madvise_need_mmap_write(behavior);
1435dc0ef0dfSMichal Hocko if (write) {
14360726b01eSMinchan Kim if (mmap_write_lock_killable(mm))
1437dc0ef0dfSMichal Hocko return -EINTR;
1438dc0ef0dfSMichal Hocko } else {
14390726b01eSMinchan Kim mmap_read_lock(mm);
1440dc0ef0dfSMichal Hocko }
14411da177e4SLinus Torvalds
1442428e106aSKirill A. Shutemov start = untagged_addr_remote(mm, start);
1443428e106aSKirill A. Shutemov end = start + len;
1444428e106aSKirill A. Shutemov
14451998cc04SShaohua Li blk_start_plug(&plug);
1446ac1e9accSColin Cross error = madvise_walk_vmas(mm, start, end, behavior,
1447ac1e9accSColin Cross madvise_vma_behavior);
144884d96d89SRasmus Villemoes blk_finish_plug(&plug);
1449f7977793SJason Baron if (write)
14500726b01eSMinchan Kim mmap_write_unlock(mm);
14510a27a14aSNick Piggin else
14520726b01eSMinchan Kim mmap_read_unlock(mm);
14530a27a14aSNick Piggin
14541da177e4SLinus Torvalds return error;
14551da177e4SLinus Torvalds }
1456db08ca25SJens Axboe
1457db08ca25SJens Axboe SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1458db08ca25SJens Axboe {
14590726b01eSMinchan Kim return do_madvise(current->mm, start, len_in, behavior);
1460db08ca25SJens Axboe }
1461ecb8ac8bSMinchan Kim
1462ecb8ac8bSMinchan Kim SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
1463ecb8ac8bSMinchan Kim size_t, vlen, int, behavior, unsigned int, flags)
1464ecb8ac8bSMinchan Kim {
1465ecb8ac8bSMinchan Kim ssize_t ret;
146695e49cf8SJens Axboe struct iovec iovstack[UIO_FASTIOV];
1467ecb8ac8bSMinchan Kim struct iovec *iov = iovstack;
1468ecb8ac8bSMinchan Kim struct iov_iter iter;
1469ecb8ac8bSMinchan Kim struct task_struct *task;
1470ecb8ac8bSMinchan Kim struct mm_struct *mm;
1471ecb8ac8bSMinchan Kim size_t total_len;
1472ecb8ac8bSMinchan Kim unsigned int f_flags;
1473ecb8ac8bSMinchan Kim
1474ecb8ac8bSMinchan Kim if (flags != 0) {
1475ecb8ac8bSMinchan Kim ret = -EINVAL;
1476ecb8ac8bSMinchan Kim goto out;
1477ecb8ac8bSMinchan Kim }
1478ecb8ac8bSMinchan Kim
1479de4eda9dSAl Viro ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1480ecb8ac8bSMinchan Kim if (ret < 0)
1481ecb8ac8bSMinchan Kim goto out;
1482ecb8ac8bSMinchan Kim
1483ee9955d6SChristian Brauner task = pidfd_get_task(pidfd, &f_flags);
1484ee9955d6SChristian Brauner if (IS_ERR(task)) {
1485ee9955d6SChristian Brauner ret = PTR_ERR(task);
1486ecb8ac8bSMinchan Kim goto free_iov;
1487ecb8ac8bSMinchan Kim }
1488ecb8ac8bSMinchan Kim
1489a68a0262SMinchan Kim if (!process_madvise_behavior_valid(behavior)) {
1490ecb8ac8bSMinchan Kim ret = -EINVAL;
1491ecb8ac8bSMinchan Kim goto release_task;
1492ecb8ac8bSMinchan Kim }
1493ecb8ac8bSMinchan Kim
149496cfe2c0SSuren Baghdasaryan /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
149596cfe2c0SSuren Baghdasaryan mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
1496ecb8ac8bSMinchan Kim if (IS_ERR_OR_NULL(mm)) {
1497ecb8ac8bSMinchan Kim ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
1498ecb8ac8bSMinchan Kim goto release_task;
1499ecb8ac8bSMinchan Kim }
1500ecb8ac8bSMinchan Kim
150196cfe2c0SSuren Baghdasaryan /*
150296cfe2c0SSuren Baghdasaryan * Require CAP_SYS_NICE for influencing process performance. Note that
150396cfe2c0SSuren Baghdasaryan * only non-destructive hints are currently supported.
150496cfe2c0SSuren Baghdasaryan */
150596cfe2c0SSuren Baghdasaryan if (!capable(CAP_SYS_NICE)) {
150696cfe2c0SSuren Baghdasaryan ret = -EPERM;
150796cfe2c0SSuren Baghdasaryan goto release_mm;
150896cfe2c0SSuren Baghdasaryan }
150996cfe2c0SSuren Baghdasaryan
1510ecb8ac8bSMinchan Kim total_len = iov_iter_count(&iter);
1511ecb8ac8bSMinchan Kim
1512ecb8ac8bSMinchan Kim while (iov_iter_count(&iter)) {
151395e49cf8SJens Axboe ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
151495e49cf8SJens Axboe iter_iov_len(&iter), behavior);
1515e6b0a7b3SCharan Teja Kalla if (ret < 0)
1516ecb8ac8bSMinchan Kim break;
151795e49cf8SJens Axboe iov_iter_advance(&iter, iter_iov_len(&iter));
1518ecb8ac8bSMinchan Kim }
1519ecb8ac8bSMinchan Kim
15205bd009c7SCharan Teja Kalla ret = (total_len - iov_iter_count(&iter)) ? : ret;
1521ecb8ac8bSMinchan Kim
152296cfe2c0SSuren Baghdasaryan release_mm:
1523ecb8ac8bSMinchan Kim mmput(mm);
1524ecb8ac8bSMinchan Kim release_task:
1525ecb8ac8bSMinchan Kim put_task_struct(task);
1526ecb8ac8bSMinchan Kim free_iov:
1527ecb8ac8bSMinchan Kim kfree(iov);
1528ecb8ac8bSMinchan Kim out:
1529ecb8ac8bSMinchan Kim return ret;
1530ecb8ac8bSMinchan Kim }
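/*
 * Illustrative userspace usage (not part of this file), assuming the caller
 * has CAP_SYS_NICE and PTRACE_MODE_READ access to the target process:
 *
 *	struct iovec iov = { .iov_base = addr, .iov_len = len };
 *	int pidfd = syscall(SYS_pidfd_open, pid, 0);
 *
 *	syscall(SYS_process_madvise, pidfd, &iov, 1, MADV_COLD, 0);
 */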
1531