// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mincore.c
 *
 * Copyright (C) 1994-2006  Linus Torvalds
 */

/*
 * The mincore() system call.
 */
#include <linux/pagemap.h>
#include <linux/gfp.h>
#include <linux/pagewalk.h>
#include <linux/mman.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>

#include <linux/uaccess.h>
#include "swap.h"

static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	unsigned char present;
	unsigned char *vec = walk->private;

	/*
	 * Huge pages mapped into a user process are always resident in
	 * RAM and never swapped out, but in principle this still needs
	 * to be checked.
	 */
	present = pte && !huge_pte_none_mostly(huge_ptep_get(pte));
	for (; addr != end; vec++, addr += PAGE_SIZE)
		*vec = present;
	walk->private = vec;
#else
	BUG();
#endif
	return 0;
}

/*
 * Later we can get more picky about what "in core" means precisely.
 * For now, simply check to see if the page is in the page cache,
 * and is up to date; i.e. that no page-in operation would be required
 * at this time if an application were to map and access this page.
 */
static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
{
	unsigned char present = 0;
	struct folio *folio;

	/*
	 * When tmpfs swaps out a page from a file, any process mapping that
	 * file will not get a swp_entry_t in its pte, but rather it is like
	 * any other file mapping (ie. marked !present and faulted in with
	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
	 */
	folio = filemap_get_incore_folio(mapping, index);
	if (!IS_ERR(folio)) {
		present = folio_test_uptodate(folio);
		folio_put(folio);
	}

	return present;
}

/*
 * Fill the residency vector for a range with no page table entries.
 * File-backed holes may still be resident in the page cache, so look
 * them up there; anonymous holes are never resident.
 */
static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
				struct vm_area_struct *vma, unsigned char *vec)
{
	unsigned long nr = (end - addr) >> PAGE_SHIFT;
	int i;

	if (vma->vm_file) {
		pgoff_t pgoff;

		pgoff = linear_page_index(vma, addr);
		for (i = 0; i < nr; i++, pgoff++)
			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
	} else {
		for (i = 0; i < nr; i++)
			vec[i] = 0;
	}
	return nr;
}

/*
 * ->pte_hole callback: report the unmapped range and advance the
 * output vector by the number of pages handled.
 */
static int mincore_unmapped_range(unsigned long addr, unsigned long end,
				   __always_unused int depth,
				   struct mm_walk *walk)
{
	walk->private += __mincore_unmapped_range(addr, end,
						  walk->vma, walk->private);
	return 0;
}

/*
 * ->pmd_entry callback: fill the residency vector for one pmd's worth
 * of ptes.
 */
static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			struct mm_walk *walk)
{
	spinlock_t *ptl;
	struct vm_area_struct *vma = walk->vma;
	pte_t *ptep;
	unsigned char *vec = walk->private;
	int nr = (end - addr) >> PAGE_SHIFT;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/* A mapped transparent huge page is entirely resident. */
		memset(vec, 1, nr);
		spin_unlock(ptl);
		goto out;
	}

	ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!ptep) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; ptep++, addr += PAGE_SIZE) {
		pte_t pte = ptep_get(ptep);

		/* We need to do cache lookup too for pte markers */
		if (pte_none_mostly(pte))
			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
						 vma, vec);
		else if (pte_present(pte))
			*vec = 1;
		else { /* pte is a swap entry */
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (non_swap_entry(entry)) {
				/*
				 * migration or hwpoison entries are always
				 * uptodate
				 */
				*vec = 1;
			} else {
#ifdef CONFIG_SWAP
				*vec = mincore_page(swap_address_space(entry),
						    swp_offset(entry));
#else
				WARN_ON(1);
				*vec = 1;
#endif
			}
		}
		vec++;
	}
	pte_unmap_unlock(ptep - 1, ptl);
out:
	walk->private += nr;
	cond_resched();
	return 0;
}

static inline bool can_do_mincore(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * Reveal pagecache information only for non-anonymous mappings that
	 * correspond to the files the calling process could (if it tried)
	 * open for writing; otherwise we'd be including shared non-exclusive
	 * mappings, which opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}
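
/*
 * Illustrative userspace sketch (not part of this file): when
 * can_do_mincore() fails, do_mincore() below reports every page as
 * resident instead of returning an error, so an unprivileged caller
 * cannot probe pagecache residency of files it could not write to.
 * The file name below is hypothetical:
 *
 *	int fd = open("/etc/hosts", O_RDONLY);	// not writable by us
 *	void *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
 *	unsigned char v;
 *	mincore(p, 4096, &v);
 *	// (v & 1) reads as 1 whether or not the page is actually cached
 */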

static const struct mm_walk_ops mincore_walk_ops = {
	.pmd_entry		= mincore_pte_range,
	.pte_hole		= mincore_unmapped_range,
	.hugetlb_entry		= mincore_hugetlb,
	.walk_lock		= PGWALK_RDLOCK,
};

/*
 * Do a chunk of "sys_mincore()". We've already checked
 * all the arguments, we hold the mmap lock: we should
 * just return the amount of info we're asked for.
 */
static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
{
	struct vm_area_struct *vma;
	unsigned long end;
	int err;

	vma = vma_lookup(current->mm, addr);
	if (!vma)
		return -ENOMEM;
	end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
	if (!can_do_mincore(vma)) {
		unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);

		/* Pretend everything is resident; see can_do_mincore(). */
		memset(vec, 1, pages);
		return pages;
	}
	err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
	if (err < 0)
		return err;
	return (end - addr) >> PAGE_SHIFT;
}

/*
 * The mincore(2) system call.
 *
 * mincore() returns the memory residency status of the pages in the
 * current process's address space specified by [addr, addr + len).
 * The status is returned in a vector of bytes.  The least significant
 * bit of each byte is 1 if the referenced page is in memory, otherwise
 * it is zero.
 *
 * Because the status of a page can change after mincore() checks it
 * but before it returns to the application, the returned vector may
 * contain stale information.  Only locked pages are guaranteed to
 * remain in memory.
 *
 * return values:
 *  zero    - success
 *  -EFAULT - vec points to an illegal address
 *  -EINVAL - addr is not a multiple of PAGE_SIZE
 *  -ENOMEM - Addresses in the range [addr, addr + len] are
 *		invalid for the address space of this process, or
 *		specify one or more pages which are not currently
 *		mapped
 *  -EAGAIN - A kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
		unsigned char __user *, vec)
{
	long retval;
	unsigned long pages;
	unsigned char *tmp;

	start = untagged_addr(start);

	/* Check the start address: needs to be page-aligned.. */
	if (start & ~PAGE_MASK)
		return -EINVAL;

	/* ..and we need to be passed a valid user-space range */
	if (!access_ok((void __user *) start, len))
		return -ENOMEM;

	/* This also avoids any overflows on PAGE_ALIGN */
	pages = len >> PAGE_SHIFT;
	pages += (offset_in_page(len)) != 0;

	if (!access_ok(vec, pages))
		return -EFAULT;

	tmp = (void *) __get_free_page(GFP_USER);
	if (!tmp)
		return -EAGAIN;

	retval = 0;
	while (pages) {
		/*
		 * Do at most PAGE_SIZE entries per iteration, due to
		 * the temporary buffer size.
		 */
		mmap_read_lock(current->mm);
		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
		mmap_read_unlock(current->mm);

		if (retval <= 0)
			break;
		if (copy_to_user(vec, tmp, retval)) {
			retval = -EFAULT;
			break;
		}
		pages -= retval;
		vec += retval;
		start += retval << PAGE_SHIFT;
		retval = 0;
	}
	free_page((unsigned long) tmp);
	return retval;
}
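
/*
 * Example usage from userspace (an illustrative sketch, not part of
 * this file, hence compiled out): map anonymous memory, fault in half
 * of the pages, and ask mincore() which ones are resident.  Untouched
 * pages of a private anonymous mapping report as not resident because
 * they fall through mincore_unmapped_range() above.
 */
#if 0
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t npages = 8;
	unsigned char vec[8];
	size_t i;

	char *buf = mmap(NULL, npages * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* Fault in the first half of the mapping. */
	for (i = 0; i < npages / 2; i++)
		buf[i * page] = 1;

	/* The least significant bit of each vec byte reports residency. */
	if (mincore(buf, npages * page, vec) != 0)
		return 1;
	for (i = 0; i < npages; i++)
		printf("page %zu: %sresident\n", i, (vec[i] & 1) ? "" : "not ");

	munmap(buf, npages * page);
	return 0;
}
#endif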