1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds * linux/mm/mincore.c
41da177e4SLinus Torvalds *
52f77d107SLinus Torvalds * Copyright (C) 1994-2006 Linus Torvalds
61da177e4SLinus Torvalds */
71da177e4SLinus Torvalds
81da177e4SLinus Torvalds /*
91da177e4SLinus Torvalds * The mincore() system call.
101da177e4SLinus Torvalds */
111da177e4SLinus Torvalds #include <linux/pagemap.h>
125a0e3ad6STejun Heo #include <linux/gfp.h>
13a520110eSChristoph Hellwig #include <linux/pagewalk.h>
141da177e4SLinus Torvalds #include <linux/mman.h>
151da177e4SLinus Torvalds #include <linux/syscalls.h>
1642da9cbdSNick Piggin #include <linux/swap.h>
1742da9cbdSNick Piggin #include <linux/swapops.h>
183a4f8a0bSHugh Dickins #include <linux/shmem_fs.h>
194f16fc10SNaoya Horiguchi #include <linux/hugetlb.h>
2065fddcfcSMike Rapoport #include <linux/pgtable.h>
211da177e4SLinus Torvalds
227c0f6ba6SLinus Torvalds #include <linux/uaccess.h>
23014bb1deSNeilBrown #include "swap.h"
241da177e4SLinus Torvalds
/*
 * hugetlb_entry callback for the mincore page walk: report residency for
 * the small-page-sized slots covered by one huge PTE.
 *
 * Writes one byte per PAGE_SIZE step into the caller's vector (passed via
 * walk->private) and advances walk->private past what was filled in.
 * Always returns 0 so the walk continues.
 */
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	unsigned char present;
	unsigned char *vec = walk->private;

	/*
	 * Hugepages under user process are always in RAM and never
	 * swapped out, but theoretically it needs to be checked.
	 */
	present = pte && !huge_pte_none_mostly(huge_ptep_get(pte));
	for (; addr != end; vec++, addr += PAGE_SIZE)
		*vec = present;
	/* Hand the advanced cursor back so the walk resumes after this range. */
	walk->private = vec;
#else
	/* This callback must never be reached without hugetlb support. */
	BUG();
#endif
	return 0;
}
45f4884010SJohannes Weiner
4630bac164SLinus Torvalds /*
4730bac164SLinus Torvalds * Later we can get more picky about what "in core" means precisely.
4830bac164SLinus Torvalds * For now, simply check to see if the page is in the page cache,
4930bac164SLinus Torvalds * and is up to date; i.e. that no page-in operation would be required
5030bac164SLinus Torvalds * at this time if an application were to map and access this page.
5130bac164SLinus Torvalds */
mincore_page(struct address_space * mapping,pgoff_t index)5261ef1865SMatthew Wilcox (Oracle) static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
5330bac164SLinus Torvalds {
5430bac164SLinus Torvalds unsigned char present = 0;
55524984ffSMatthew Wilcox (Oracle) struct folio *folio;
5630bac164SLinus Torvalds
5730bac164SLinus Torvalds /*
5830bac164SLinus Torvalds * When tmpfs swaps out a page from a file, any process mapping that
5930bac164SLinus Torvalds * file will not get a swp_entry_t in its pte, but rather it is like
6030bac164SLinus Torvalds * any other file mapping (ie. marked !present and faulted in with
6130bac164SLinus Torvalds * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
6230bac164SLinus Torvalds */
63524984ffSMatthew Wilcox (Oracle) folio = filemap_get_incore_folio(mapping, index);
6466dabbb6SChristoph Hellwig if (!IS_ERR(folio)) {
65524984ffSMatthew Wilcox (Oracle) present = folio_test_uptodate(folio);
66524984ffSMatthew Wilcox (Oracle) folio_put(folio);
6730bac164SLinus Torvalds }
6830bac164SLinus Torvalds
6930bac164SLinus Torvalds return present;
7030bac164SLinus Torvalds }
7130bac164SLinus Torvalds
__mincore_unmapped_range(unsigned long addr,unsigned long end,struct vm_area_struct * vma,unsigned char * vec)7230bac164SLinus Torvalds static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
7330bac164SLinus Torvalds struct vm_area_struct *vma, unsigned char *vec)
7430bac164SLinus Torvalds {
7530bac164SLinus Torvalds unsigned long nr = (end - addr) >> PAGE_SHIFT;
7630bac164SLinus Torvalds int i;
7730bac164SLinus Torvalds
7830bac164SLinus Torvalds if (vma->vm_file) {
7930bac164SLinus Torvalds pgoff_t pgoff;
8030bac164SLinus Torvalds
8130bac164SLinus Torvalds pgoff = linear_page_index(vma, addr);
8230bac164SLinus Torvalds for (i = 0; i < nr; i++, pgoff++)
8330bac164SLinus Torvalds vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
8430bac164SLinus Torvalds } else {
8530bac164SLinus Torvalds for (i = 0; i < nr; i++)
8630bac164SLinus Torvalds vec[i] = 0;
8730bac164SLinus Torvalds }
8830bac164SLinus Torvalds return nr;
8930bac164SLinus Torvalds }
9030bac164SLinus Torvalds
/*
 * pte_hole callback for the mincore page walk: delegate to
 * __mincore_unmapped_range() and advance the output cursor in
 * walk->private by the number of entries it filled in.
 */
static int mincore_unmapped_range(unsigned long addr, unsigned long end,
				  __always_unused int depth,
				  struct mm_walk *walk)
{
	unsigned char *vec = walk->private;
	int filled = __mincore_unmapped_range(addr, end, walk->vma, vec);

	walk->private = vec + filled;
	return 0;
}
991da177e4SLinus Torvalds
/*
 * pmd_entry callback for the mincore page walk: record residency for each
 * page in [addr, end) into the byte vector at walk->private, then advance
 * walk->private past the range. Always returns 0 (walk continues).
 */
static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			struct mm_walk *walk)
{
	spinlock_t *ptl;
	struct vm_area_struct *vma = walk->vma;
	pte_t *ptep;
	unsigned char *vec = walk->private;
	int nr = (end - addr) >> PAGE_SHIFT;

	/* A mapped transparent huge page covers the whole range: all resident. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		memset(vec, 1, nr);
		spin_unlock(ptl);
		goto out;
	}

	ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!ptep) {
		/* Page table vanished under us; ask the walker to retry. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; ptep++, addr += PAGE_SIZE) {
		pte_t pte = ptep_get(ptep);

		/* We need to do cache lookup too for pte markers */
		if (pte_none_mostly(pte))
			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
						 vma, vec);
		else if (pte_present(pte))
			*vec = 1;
		else { /* pte is a swap entry */
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (non_swap_entry(entry)) {
				/*
				 * migration or hwpoison entries are always
				 * uptodate
				 */
				*vec = 1;
			} else {
#ifdef CONFIG_SWAP
				/* Swapped out: check the swap cache instead. */
				*vec = mincore_page(swap_address_space(entry),
						    swp_offset(entry));
#else
				/* Swap entry without CONFIG_SWAP should be impossible. */
				WARN_ON(1);
				*vec = 1;
#endif
			}
		}
		vec++;
	}
	/* The loop left ptep one past the last entry; unlock via the last one. */
	pte_unmap_unlock(ptep - 1, ptl);
out:
	walk->private += nr;
	cond_resched();
	return 0;
}
157e48293fdSJohannes Weiner
can_do_mincore(struct vm_area_struct * vma)158134fca90SJiri Kosina static inline bool can_do_mincore(struct vm_area_struct *vma)
159134fca90SJiri Kosina {
160134fca90SJiri Kosina if (vma_is_anonymous(vma))
161134fca90SJiri Kosina return true;
162134fca90SJiri Kosina if (!vma->vm_file)
163134fca90SJiri Kosina return false;
164134fca90SJiri Kosina /*
165134fca90SJiri Kosina * Reveal pagecache information only for non-anonymous mappings that
166134fca90SJiri Kosina * correspond to the files the calling process could (if tried) open
167134fca90SJiri Kosina * for writing; otherwise we'd be including shared non-exclusive
168134fca90SJiri Kosina * mappings, which opens a side channel.
169134fca90SJiri Kosina */
17001beba79SChristian Brauner return inode_owner_or_capable(&nop_mnt_idmap,
17121cb47beSChristian Brauner file_inode(vma->vm_file)) ||
17202f92b38SChristian Brauner file_permission(vma->vm_file, MAY_WRITE) == 0;
173134fca90SJiri Kosina }
174134fca90SJiri Kosina
/* Page-table walk callbacks used by do_mincore(); walked under mmap read lock. */
static const struct mm_walk_ops mincore_walk_ops = {
	.pmd_entry = mincore_pte_range,
	.pte_hole = mincore_unmapped_range,
	.hugetlb_entry = mincore_hugetlb,
	.walk_lock = PGWALK_RDLOCK,
};
1817b86ac33SChristoph Hellwig
182f4884010SJohannes Weiner /*
183f4884010SJohannes Weiner * Do a chunk of "sys_mincore()". We've already checked
184f4884010SJohannes Weiner * all the arguments, we hold the mmap semaphore: we should
185f4884010SJohannes Weiner * just return the amount of info we're asked for.
186f4884010SJohannes Weiner */
do_mincore(unsigned long addr,unsigned long pages,unsigned char * vec)187f4884010SJohannes Weiner static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
188f4884010SJohannes Weiner {
189f4884010SJohannes Weiner struct vm_area_struct *vma;
19025ef0e50SJohannes Weiner unsigned long end;
1911e25a271SNaoya Horiguchi int err;
192f4884010SJohannes Weiner
19397955f69SDeming Wang vma = vma_lookup(current->mm, addr);
19497955f69SDeming Wang if (!vma)
195f4884010SJohannes Weiner return -ENOMEM;
19625ef0e50SJohannes Weiner end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
197134fca90SJiri Kosina if (!can_do_mincore(vma)) {
198134fca90SJiri Kosina unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
199134fca90SJiri Kosina memset(vec, 1, pages);
200134fca90SJiri Kosina return pages;
201134fca90SJiri Kosina }
2027b86ac33SChristoph Hellwig err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
2031e25a271SNaoya Horiguchi if (err < 0)
2041e25a271SNaoya Horiguchi return err;
20525ef0e50SJohannes Weiner return (end - addr) >> PAGE_SHIFT;
2061da177e4SLinus Torvalds }
2071da177e4SLinus Torvalds
/*
 * The mincore(2) system call.
 *
 * mincore() returns the memory residency status of the pages in the
 * current process's address space specified by [addr, addr + len).
 * The status is returned in a vector of bytes. The least significant
 * bit of each byte is 1 if the referenced page is in memory, otherwise
 * it is zero.
 *
 * Because the status of a page can change after mincore() checks it
 * but before it returns to the application, the returned vector may
 * contain stale information. Only locked pages are guaranteed to
 * remain in memory.
 *
 * return values:
 * zero - success
 * -EFAULT - vec points to an illegal address
 * -EINVAL - addr is not a multiple of PAGE_SIZE
 * -ENOMEM - Addresses in the range [addr, addr + len] are
 * invalid for the address space of this process, or
 * specify one or more pages which are not currently
 * mapped
 * -EAGAIN - A kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
		unsigned char __user *, vec)
{
	long retval;
	unsigned long pages;
	unsigned char *tmp;

	/* Strip any arch tag bits (e.g. ARM64 TBI) before validating. */
	start = untagged_addr(start);

	/* Check the start address: needs to be page-aligned.. */
	if (start & ~PAGE_MASK)
		return -EINVAL;

	/* ..and we need to be passed a valid user-space range */
	if (!access_ok((void __user *) start, len))
		return -ENOMEM;

	/* This also avoids any overflows on PAGE_ALIGN */
	pages = len >> PAGE_SHIFT;
	pages += (offset_in_page(len)) != 0;	/* round partial page up */

	if (!access_ok(vec, pages))
		return -EFAULT;

	/* Bounce buffer: fill one page of status bytes, then copy to user. */
	tmp = (void *) __get_free_page(GFP_USER);
	if (!tmp)
		return -EAGAIN;

	retval = 0;
	while (pages) {
		/*
		 * Do at most PAGE_SIZE entries per iteration, due to
		 * the temporary buffer size.
		 */
		mmap_read_lock(current->mm);
		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
		mmap_read_unlock(current->mm);

		if (retval <= 0)
			break;
		if (copy_to_user(vec, tmp, retval)) {
			retval = -EFAULT;
			break;
		}
		/* Advance past the chunk do_mincore() handled. */
		pages -= retval;
		vec += retval;
		start += retval << PAGE_SHIFT;
		retval = 0;
	}
	free_page((unsigned long) tmp);
	return retval;
}
284