xref: /openbmc/linux/arch/x86/mm/pgtable.c (revision ba2929159000dc7015cc01cdf7bb72542e19952a)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
24f76cd38SJeremy Fitzhardinge #include <linux/mm.h>
35a0e3ad6STejun Heo #include <linux/gfp.h>
4e3e28812SJoerg Roedel #include <linux/hugetlb.h>
54f76cd38SJeremy Fitzhardinge #include <asm/pgalloc.h>
64f76cd38SJeremy Fitzhardinge #include <asm/tlb.h>
7a1d5a869SIngo Molnar #include <asm/fixmap.h>
86b637835SToshi Kani #include <asm/mtrr.h>
94f76cd38SJeremy Fitzhardinge 
1094d49eb3SKirill A. Shutemov #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
1194d49eb3SKirill A. Shutemov phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
1294d49eb3SKirill A. Shutemov EXPORT_SYMBOL(physical_mask);
1394d49eb3SKirill A. Shutemov #endif
1494d49eb3SKirill A. Shutemov 
1514315592SIan Campbell #ifdef CONFIG_HIGHPTE
165fba4af4SMike Rapoport #define PGTABLE_HIGHMEM __GFP_HIGHMEM
1714315592SIan Campbell #else
185fba4af4SMike Rapoport #define PGTABLE_HIGHMEM 0
1914315592SIan Campbell #endif
2014315592SIan Campbell 
2169de6c1aSThomas Gleixner #ifndef CONFIG_PARAVIRT
2269de6c1aSThomas Gleixner static inline
2369de6c1aSThomas Gleixner void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
2469de6c1aSThomas Gleixner {
2569de6c1aSThomas Gleixner 	tlb_remove_page(tlb, table);
2669de6c1aSThomas Gleixner }
2769de6c1aSThomas Gleixner #endif
2869de6c1aSThomas Gleixner 
295fba4af4SMike Rapoport gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
304f76cd38SJeremy Fitzhardinge 
314cf58924SJoel Fernandes (Google) pgtable_t pte_alloc_one(struct mm_struct *mm)
324f76cd38SJeremy Fitzhardinge {
335fba4af4SMike Rapoport 	return __pte_alloc_one(mm, __userpte_alloc_gfp);
344f76cd38SJeremy Fitzhardinge }
354f76cd38SJeremy Fitzhardinge 
3614315592SIan Campbell static int __init setup_userpte(char *arg)
3714315592SIan Campbell {
3814315592SIan Campbell 	if (!arg)
3914315592SIan Campbell 		return -EINVAL;
4014315592SIan Campbell 
4114315592SIan Campbell 	/*
4214315592SIan Campbell 	 * "userpte=nohigh" disables allocation of user pagetables in
4314315592SIan Campbell 	 * high memory.
4414315592SIan Campbell 	 */
4514315592SIan Campbell 	if (strcmp(arg, "nohigh") == 0)
4614315592SIan Campbell 		__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
4714315592SIan Campbell 	else
4814315592SIan Campbell 		return -EINVAL;
4914315592SIan Campbell 	return 0;
5014315592SIan Campbell }
5114315592SIan Campbell early_param("userpte", setup_userpte);
5214315592SIan Campbell 
539e1b32caSBenjamin Herrenschmidt void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
54397f687aSJeremy Fitzhardinge {
55f92c494fSVishal Moola (Oracle) 	pagetable_pte_dtor(page_ptdesc(pte));
566944a9c8SJeremy Fitzhardinge 	paravirt_release_pte(page_to_pfn(pte));
5748a8b97cSPeter Zijlstra 	paravirt_tlb_remove_table(tlb, pte);
58397f687aSJeremy Fitzhardinge }
59397f687aSJeremy Fitzhardinge 
6098233368SKirill A. Shutemov #if CONFIG_PGTABLE_LEVELS > 2
619e1b32caSBenjamin Herrenschmidt void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
62170fdff7SJeremy Fitzhardinge {
63f92c494fSVishal Moola (Oracle) 	struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
646944a9c8SJeremy Fitzhardinge 	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
651de14c3cSDave Hansen 	/*
661de14c3cSDave Hansen 	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
671de14c3cSDave Hansen 	 * entries need a full cr3 reload to flush.
681de14c3cSDave Hansen 	 */
691de14c3cSDave Hansen #ifdef CONFIG_X86_PAE
701de14c3cSDave Hansen 	tlb->need_flush_all = 1;
711de14c3cSDave Hansen #endif
72f92c494fSVishal Moola (Oracle) 	pagetable_pmd_dtor(ptdesc);
73f92c494fSVishal Moola (Oracle) 	paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
74170fdff7SJeremy Fitzhardinge }
755a5f8f42SJeremy Fitzhardinge 
7698233368SKirill A. Shutemov #if CONFIG_PGTABLE_LEVELS > 3
779e1b32caSBenjamin Herrenschmidt void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
785a5f8f42SJeremy Fitzhardinge {
792761fa09SJeremy Fitzhardinge 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
8048a8b97cSPeter Zijlstra 	paravirt_tlb_remove_table(tlb, virt_to_page(pud));
815a5f8f42SJeremy Fitzhardinge }
82b8504058SKirill A. Shutemov 
83b8504058SKirill A. Shutemov #if CONFIG_PGTABLE_LEVELS > 4
84b8504058SKirill A. Shutemov void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
85b8504058SKirill A. Shutemov {
86b8504058SKirill A. Shutemov 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
8748a8b97cSPeter Zijlstra 	paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
88b8504058SKirill A. Shutemov }
89b8504058SKirill A. Shutemov #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
9098233368SKirill A. Shutemov #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
9198233368SKirill A. Shutemov #endif	/* CONFIG_PGTABLE_LEVELS > 2 */
92170fdff7SJeremy Fitzhardinge 
934f76cd38SJeremy Fitzhardinge static inline void pgd_list_add(pgd_t *pgd)
944f76cd38SJeremy Fitzhardinge {
95f92c494fSVishal Moola (Oracle) 	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
964f76cd38SJeremy Fitzhardinge 
97f92c494fSVishal Moola (Oracle) 	list_add(&ptdesc->pt_list, &pgd_list);
984f76cd38SJeremy Fitzhardinge }
994f76cd38SJeremy Fitzhardinge 
1004f76cd38SJeremy Fitzhardinge static inline void pgd_list_del(pgd_t *pgd)
1014f76cd38SJeremy Fitzhardinge {
102f92c494fSVishal Moola (Oracle) 	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
1034f76cd38SJeremy Fitzhardinge 
104f92c494fSVishal Moola (Oracle) 	list_del(&ptdesc->pt_list);
1054f76cd38SJeremy Fitzhardinge }
1064f76cd38SJeremy Fitzhardinge 
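/*
 * UNSHARED_PTRS_PER_PGD is the number of pgd entries private to each
 * pagetable: only the user portion (below KERNEL_PGD_BOUNDARY) when the
 * kernel pmd is shared, all of PTRS_PER_PGD otherwise.
 * MAX_UNSHARED_PTRS_PER_PGD is its compile-time upper bound, used further
 * down to size on-stack arrays.
 */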
1074f76cd38SJeremy Fitzhardinge #define UNSHARED_PTRS_PER_PGD				\
10868db065cSJeremy Fitzhardinge 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
109184d47f0SKees Cook #define MAX_UNSHARED_PTRS_PER_PGD			\
110184d47f0SKees Cook 	max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
1114f76cd38SJeremy Fitzhardinge 
112617d34d9SJeremy Fitzhardinge 
113617d34d9SJeremy Fitzhardinge static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
114617d34d9SJeremy Fitzhardinge {
115f92c494fSVishal Moola (Oracle) 	virt_to_ptdesc(pgd)->pt_mm = mm;
116617d34d9SJeremy Fitzhardinge }
117617d34d9SJeremy Fitzhardinge 
118617d34d9SJeremy Fitzhardinge struct mm_struct *pgd_page_get_mm(struct page *page)
119617d34d9SJeremy Fitzhardinge {
120f92c494fSVishal Moola (Oracle) 	return page_ptdesc(page)->pt_mm;
121617d34d9SJeremy Fitzhardinge }
122617d34d9SJeremy Fitzhardinge 
123617d34d9SJeremy Fitzhardinge static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
1244f76cd38SJeremy Fitzhardinge {
1254f76cd38SJeremy Fitzhardinge 	/* If the pgd points to a shared pagetable level (either the
1264f76cd38SJeremy Fitzhardinge 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
1274f76cd38SJeremy Fitzhardinge 	   references from swapper_pg_dir. */
12898233368SKirill A. Shutemov 	if (CONFIG_PGTABLE_LEVELS == 2 ||
12998233368SKirill A. Shutemov 	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
130b8504058SKirill A. Shutemov 	    CONFIG_PGTABLE_LEVELS >= 4) {
13168db065cSJeremy Fitzhardinge 		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
13268db065cSJeremy Fitzhardinge 				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
1334f76cd38SJeremy Fitzhardinge 				KERNEL_PGD_PTRS);
1344f76cd38SJeremy Fitzhardinge 	}
1354f76cd38SJeremy Fitzhardinge 
1364f76cd38SJeremy Fitzhardinge 	/* list required to sync kernel mapping updates */
137617d34d9SJeremy Fitzhardinge 	if (!SHARED_KERNEL_PMD) {
138617d34d9SJeremy Fitzhardinge 		pgd_set_mm(pgd, mm);
1394f76cd38SJeremy Fitzhardinge 		pgd_list_add(pgd);
1404f76cd38SJeremy Fitzhardinge 	}
141617d34d9SJeremy Fitzhardinge }
1424f76cd38SJeremy Fitzhardinge 
14317b74627SJan Beulich static void pgd_dtor(pgd_t *pgd)
1444f76cd38SJeremy Fitzhardinge {
1454f76cd38SJeremy Fitzhardinge 	if (SHARED_KERNEL_PMD)
1464f76cd38SJeremy Fitzhardinge 		return;
1474f76cd38SJeremy Fitzhardinge 
148a79e53d8SAndrea Arcangeli 	spin_lock(&pgd_lock);
1494f76cd38SJeremy Fitzhardinge 	pgd_list_del(pgd);
150a79e53d8SAndrea Arcangeli 	spin_unlock(&pgd_lock);
1514f76cd38SJeremy Fitzhardinge }
1524f76cd38SJeremy Fitzhardinge 
15385958b46SJeremy Fitzhardinge /*
15485958b46SJeremy Fitzhardinge  * List of all pgd's needed for non-PAE so it can invalidate entries
15585958b46SJeremy Fitzhardinge  * in both cached and uncached pgd's; not needed for PAE since the
15685958b46SJeremy Fitzhardinge  * kernel pmd is shared. If PAE were not to share the pmd a similar
15785958b46SJeremy Fitzhardinge  * tactic would be needed. This is essentially codepath-based locking
15885958b46SJeremy Fitzhardinge  * against pageattr.c; it is the unique case in which a valid change
15985958b46SJeremy Fitzhardinge  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
16085958b46SJeremy Fitzhardinge  * vmalloc faults work because attached pagetables are never freed.
1616d49e352SNadia Yvette Chambers  * -- nyc
16285958b46SJeremy Fitzhardinge  */
16385958b46SJeremy Fitzhardinge 
1644f76cd38SJeremy Fitzhardinge #ifdef CONFIG_X86_PAE
1654f76cd38SJeremy Fitzhardinge /*
1664f76cd38SJeremy Fitzhardinge  * In PAE mode, we need to do a cr3 reload (=tlb flush) when
1674f76cd38SJeremy Fitzhardinge  * updating the top-level pagetable entries to guarantee the
1684f76cd38SJeremy Fitzhardinge  * processor notices the update.  Since this is expensive, and
1694f76cd38SJeremy Fitzhardinge  * all 4 top-level entries are used almost immediately in a
1704f76cd38SJeremy Fitzhardinge  * new process's life, we just pre-populate them here.
1714f76cd38SJeremy Fitzhardinge  *
1724f76cd38SJeremy Fitzhardinge  * Also, if we're in a paravirt environment where the kernel pmd is
1734f76cd38SJeremy Fitzhardinge  * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
1744f76cd38SJeremy Fitzhardinge  * and initialize the kernel pmds here.
1754f76cd38SJeremy Fitzhardinge  */
176d8d5900eSJeremy Fitzhardinge #define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
177184d47f0SKees Cook #define MAX_PREALLOCATED_PMDS	MAX_UNSHARED_PTRS_PER_PGD
1781ec1fe73SIngo Molnar 
179f59dbe9cSJoerg Roedel /*
180f59dbe9cSJoerg Roedel  * We allocate separate PMDs for the kernel part of the user page-table
181f59dbe9cSJoerg Roedel  * when PTI is enabled. We need them to map the per-process LDT into the
182f59dbe9cSJoerg Roedel  * user-space page-table.
183f59dbe9cSJoerg Roedel  */
18428e3ace7SBorislav Petkov #define PREALLOCATED_USER_PMDS	 (boot_cpu_has(X86_FEATURE_PTI) ? \
185f59dbe9cSJoerg Roedel 					KERNEL_PGD_PTRS : 0)
186184d47f0SKees Cook #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
187f59dbe9cSJoerg Roedel 
1881ec1fe73SIngo Molnar void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
1891ec1fe73SIngo Molnar {
1906944a9c8SJeremy Fitzhardinge 	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
1911ec1fe73SIngo Molnar 
1921ec1fe73SIngo Molnar 	/* Note: almost everything apart from _PAGE_PRESENT is
1931ec1fe73SIngo Molnar 	   reserved at the pmd (PDPT) level. */
1941ec1fe73SIngo Molnar 	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
1951ec1fe73SIngo Molnar 
1961ec1fe73SIngo Molnar 	/*
1971ec1fe73SIngo Molnar 	 * According to Intel App note "TLBs, Paging-Structure Caches,
1981ec1fe73SIngo Molnar 	 * and Their Invalidation", April 2007, document 317080-001,
1991ec1fe73SIngo Molnar 	 * section 8.1: in PAE mode we explicitly have to flush the
2001ec1fe73SIngo Molnar 	 * TLB via cr3 if the top-level pgd is changed...
2011ec1fe73SIngo Molnar 	 */
2024981d01eSShaohua Li 	flush_tlb_mm(mm);
2031ec1fe73SIngo Molnar }
2044f76cd38SJeremy Fitzhardinge #else  /* !CONFIG_X86_PAE */
205d8d5900eSJeremy Fitzhardinge 
2064f76cd38SJeremy Fitzhardinge /* No need to prepopulate any pagetable entries in non-PAE modes. */
207d8d5900eSJeremy Fitzhardinge #define PREALLOCATED_PMDS	0
208184d47f0SKees Cook #define MAX_PREALLOCATED_PMDS	0
209f59dbe9cSJoerg Roedel #define PREALLOCATED_USER_PMDS	 0
210184d47f0SKees Cook #define MAX_PREALLOCATED_USER_PMDS 0
211d8d5900eSJeremy Fitzhardinge #endif	/* CONFIG_X86_PAE */
212d8d5900eSJeremy Fitzhardinge 
213f59dbe9cSJoerg Roedel static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
2144f76cd38SJeremy Fitzhardinge {
215d8d5900eSJeremy Fitzhardinge 	int i;
216f92c494fSVishal Moola (Oracle) 	struct ptdesc *ptdesc;
217d8d5900eSJeremy Fitzhardinge 
218f59dbe9cSJoerg Roedel 	for (i = 0; i < count; i++)
21909ef4939SKirill A. Shutemov 		if (pmds[i]) {
220f92c494fSVishal Moola (Oracle) 			ptdesc = virt_to_ptdesc(pmds[i]);
221f92c494fSVishal Moola (Oracle) 
222f92c494fSVishal Moola (Oracle) 			pagetable_pmd_dtor(ptdesc);
223f92c494fSVishal Moola (Oracle) 			pagetable_free(ptdesc);
224dc6c9a35SKirill A. Shutemov 			mm_dec_nr_pmds(mm);
2254f76cd38SJeremy Fitzhardinge 		}
22609ef4939SKirill A. Shutemov }
2274f76cd38SJeremy Fitzhardinge 
228f59dbe9cSJoerg Roedel static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
2294f76cd38SJeremy Fitzhardinge {
230d8d5900eSJeremy Fitzhardinge 	int i;
231d8d5900eSJeremy Fitzhardinge 	bool failed = false;
2325fba4af4SMike Rapoport 	gfp_t gfp = GFP_PGTABLE_USER;
2333e79ec7dSVladimir Davydov 
2343e79ec7dSVladimir Davydov 	if (mm == &init_mm)
2353e79ec7dSVladimir Davydov 		gfp &= ~__GFP_ACCOUNT;
236f92c494fSVishal Moola (Oracle) 	gfp &= ~__GFP_HIGHMEM;
237d8d5900eSJeremy Fitzhardinge 
238f59dbe9cSJoerg Roedel 	for (i = 0; i < count; i++) {
239f92c494fSVishal Moola (Oracle) 		pmd_t *pmd = NULL;
240f92c494fSVishal Moola (Oracle) 		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);
241f92c494fSVishal Moola (Oracle) 
242f92c494fSVishal Moola (Oracle) 		if (!ptdesc)
243d8d5900eSJeremy Fitzhardinge 			failed = true;
244f92c494fSVishal Moola (Oracle) 		if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
245f92c494fSVishal Moola (Oracle) 			pagetable_free(ptdesc);
246f92c494fSVishal Moola (Oracle) 			ptdesc = NULL;
24709ef4939SKirill A. Shutemov 			failed = true;
24809ef4939SKirill A. Shutemov 		}
249f92c494fSVishal Moola (Oracle) 		if (ptdesc) {
250dc6c9a35SKirill A. Shutemov 			mm_inc_nr_pmds(mm);
251f92c494fSVishal Moola (Oracle) 			pmd = ptdesc_address(ptdesc);
252f92c494fSVishal Moola (Oracle) 		}
253f92c494fSVishal Moola (Oracle) 
254d8d5900eSJeremy Fitzhardinge 		pmds[i] = pmd;
2554f76cd38SJeremy Fitzhardinge 	}
256d8d5900eSJeremy Fitzhardinge 
257d8d5900eSJeremy Fitzhardinge 	if (failed) {
258f59dbe9cSJoerg Roedel 		free_pmds(mm, pmds, count);
259d8d5900eSJeremy Fitzhardinge 		return -ENOMEM;
260d8d5900eSJeremy Fitzhardinge 	}
261d8d5900eSJeremy Fitzhardinge 
262d8d5900eSJeremy Fitzhardinge 	return 0;
263d8d5900eSJeremy Fitzhardinge }
264d8d5900eSJeremy Fitzhardinge 
265d8d5900eSJeremy Fitzhardinge /*
266d8d5900eSJeremy Fitzhardinge  * Mop up any pmd pages which may still be attached to the pgd.
267d8d5900eSJeremy Fitzhardinge  * Normally they will be freed by munmap/exit_mmap, but any pmd we
268d8d5900eSJeremy Fitzhardinge  * preallocate which never got a corresponding vma will need to be
269d8d5900eSJeremy Fitzhardinge  * freed manually.
270d8d5900eSJeremy Fitzhardinge  */
271f59dbe9cSJoerg Roedel static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
272d8d5900eSJeremy Fitzhardinge {
273f59dbe9cSJoerg Roedel 	pgd_t pgd = *pgdp;
274d8d5900eSJeremy Fitzhardinge 
275d8d5900eSJeremy Fitzhardinge 	if (pgd_val(pgd) != 0) {
276d8d5900eSJeremy Fitzhardinge 		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
277d8d5900eSJeremy Fitzhardinge 
2789bc4f28aSNadav Amit 		pgd_clear(pgdp);
279d8d5900eSJeremy Fitzhardinge 
280d8d5900eSJeremy Fitzhardinge 		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
281d8d5900eSJeremy Fitzhardinge 		pmd_free(mm, pmd);
282dc6c9a35SKirill A. Shutemov 		mm_dec_nr_pmds(mm);
283d8d5900eSJeremy Fitzhardinge 	}
284d8d5900eSJeremy Fitzhardinge }
285f59dbe9cSJoerg Roedel 
286f59dbe9cSJoerg Roedel static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
287f59dbe9cSJoerg Roedel {
288f59dbe9cSJoerg Roedel 	int i;
289f59dbe9cSJoerg Roedel 
290f59dbe9cSJoerg Roedel 	for (i = 0; i < PREALLOCATED_PMDS; i++)
291f59dbe9cSJoerg Roedel 		mop_up_one_pmd(mm, &pgdp[i]);
292f59dbe9cSJoerg Roedel 
293f59dbe9cSJoerg Roedel #ifdef CONFIG_PAGE_TABLE_ISOLATION
294f59dbe9cSJoerg Roedel 
29528e3ace7SBorislav Petkov 	if (!boot_cpu_has(X86_FEATURE_PTI))
296f59dbe9cSJoerg Roedel 		return;
297f59dbe9cSJoerg Roedel 
298f59dbe9cSJoerg Roedel 	pgdp = kernel_to_user_pgdp(pgdp);
299f59dbe9cSJoerg Roedel 
300f59dbe9cSJoerg Roedel 	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
301f59dbe9cSJoerg Roedel 		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
302f59dbe9cSJoerg Roedel #endif
303d8d5900eSJeremy Fitzhardinge }
304d8d5900eSJeremy Fitzhardinge 
305d8d5900eSJeremy Fitzhardinge static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
306d8d5900eSJeremy Fitzhardinge {
307e0c4f675SKirill A. Shutemov 	p4d_t *p4d;
308d8d5900eSJeremy Fitzhardinge 	pud_t *pud;
309d8d5900eSJeremy Fitzhardinge 	int i;
310d8d5900eSJeremy Fitzhardinge 
311e0c4f675SKirill A. Shutemov 	p4d = p4d_offset(pgd, 0);
312e0c4f675SKirill A. Shutemov 	pud = pud_offset(p4d, 0);
313d8d5900eSJeremy Fitzhardinge 
31473b44ff4SWanpeng Li 	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
315d8d5900eSJeremy Fitzhardinge 		pmd_t *pmd = pmds[i];
316d8d5900eSJeremy Fitzhardinge 
317d8d5900eSJeremy Fitzhardinge 		if (i >= KERNEL_PGD_BOUNDARY)
318d8d5900eSJeremy Fitzhardinge 			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
319d8d5900eSJeremy Fitzhardinge 			       sizeof(pmd_t) * PTRS_PER_PMD);
320d8d5900eSJeremy Fitzhardinge 
321d8d5900eSJeremy Fitzhardinge 		pud_populate(mm, pud, pmd);
322d8d5900eSJeremy Fitzhardinge 	}
323d8d5900eSJeremy Fitzhardinge }
3244f76cd38SJeremy Fitzhardinge 
325f59dbe9cSJoerg Roedel #ifdef CONFIG_PAGE_TABLE_ISOLATION
326f59dbe9cSJoerg Roedel static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
327f59dbe9cSJoerg Roedel 				     pgd_t *k_pgd, pmd_t *pmds[])
328f59dbe9cSJoerg Roedel {
329f59dbe9cSJoerg Roedel 	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
330f59dbe9cSJoerg Roedel 	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
331f59dbe9cSJoerg Roedel 	p4d_t *u_p4d;
332f59dbe9cSJoerg Roedel 	pud_t *u_pud;
333f59dbe9cSJoerg Roedel 	int i;
334f59dbe9cSJoerg Roedel 
335f59dbe9cSJoerg Roedel 	u_p4d = p4d_offset(u_pgd, 0);
336f59dbe9cSJoerg Roedel 	u_pud = pud_offset(u_p4d, 0);
337f59dbe9cSJoerg Roedel 
338f59dbe9cSJoerg Roedel 	s_pgd += KERNEL_PGD_BOUNDARY;
339f59dbe9cSJoerg Roedel 	u_pud += KERNEL_PGD_BOUNDARY;
340f59dbe9cSJoerg Roedel 
341f59dbe9cSJoerg Roedel 	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
342f59dbe9cSJoerg Roedel 		pmd_t *pmd = pmds[i];
343f59dbe9cSJoerg Roedel 
344f59dbe9cSJoerg Roedel 		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
345f59dbe9cSJoerg Roedel 		       sizeof(pmd_t) * PTRS_PER_PMD);
346f59dbe9cSJoerg Roedel 
347f59dbe9cSJoerg Roedel 		pud_populate(mm, u_pud, pmd);
348f59dbe9cSJoerg Roedel 	}
349f59dbe9cSJoerg Roedel 
350f59dbe9cSJoerg Roedel }
351f59dbe9cSJoerg Roedel #else
352f59dbe9cSJoerg Roedel static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
353f59dbe9cSJoerg Roedel 				     pgd_t *k_pgd, pmd_t *pmds[])
354f59dbe9cSJoerg Roedel {
355f59dbe9cSJoerg Roedel }
356f59dbe9cSJoerg Roedel #endif
3571db491f7SFenghua Yu /*
3581db491f7SFenghua Yu  * Xen paravirt assumes that the pgd table occupies one whole page, and a
3591db491f7SFenghua Yu  * 64-bit kernel makes the same assumption.
3601db491f7SFenghua Yu  *
3611db491f7SFenghua Yu  * But a kernel using PAE paging that is not running as a Xen domain only
3621db491f7SFenghua Yu  * needs to allocate 32 bytes for the pgd instead of one page.
3631db491f7SFenghua Yu  */
3641db491f7SFenghua Yu #ifdef CONFIG_X86_PAE
3651db491f7SFenghua Yu 
3661db491f7SFenghua Yu #include <linux/slab.h>
3671db491f7SFenghua Yu 
3681db491f7SFenghua Yu #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
3691db491f7SFenghua Yu #define PGD_ALIGN	32
3701db491f7SFenghua Yu 
3711db491f7SFenghua Yu static struct kmem_cache *pgd_cache;
3721db491f7SFenghua Yu 
373782de70cSMike Rapoport void __init pgtable_cache_init(void)
3741db491f7SFenghua Yu {
3751db491f7SFenghua Yu 	/*
3761db491f7SFenghua Yu 	 * When a PAE kernel is running as a Xen domain, it does not use a
3771db491f7SFenghua Yu 	 * shared kernel pmd, and that requires a whole page for the pgd.
3781db491f7SFenghua Yu 	 */
3791db491f7SFenghua Yu 	if (!SHARED_KERNEL_PMD)
380caa84136SNadav Amit 		return;
3811db491f7SFenghua Yu 
3821db491f7SFenghua Yu 	/*
3831db491f7SFenghua Yu 	 * When a PAE kernel is not running as a Xen domain, it uses a
3841db491f7SFenghua Yu 	 * shared kernel pmd.  A shared kernel pmd does not require a whole
3851db491f7SFenghua Yu 	 * page for the pgd; 32 bytes are enough.  During boot we create a
3861db491f7SFenghua Yu 	 * 32-byte slab cache for pgd table allocations.
3871db491f7SFenghua Yu 	 */
3881db491f7SFenghua Yu 	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
3891db491f7SFenghua Yu 				      SLAB_PANIC, NULL);
3901db491f7SFenghua Yu }
3911db491f7SFenghua Yu 
3921db491f7SFenghua Yu static inline pgd_t *_pgd_alloc(void)
3931db491f7SFenghua Yu {
3941db491f7SFenghua Yu 	/*
3951db491f7SFenghua Yu 	 * If SHARED_KERNEL_PMD is not set, the PAE kernel is running as a
3961db491f7SFenghua Yu 	 * Xen domain and we allocate one page for the pgd.
3971db491f7SFenghua Yu 	 */
3981db491f7SFenghua Yu 	if (!SHARED_KERNEL_PMD)
3995fba4af4SMike Rapoport 		return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
400e3238fafSJoerg Roedel 						 PGD_ALLOCATION_ORDER);
4011db491f7SFenghua Yu 
4021db491f7SFenghua Yu 	/*
4031db491f7SFenghua Yu 	 * Otherwise the PAE kernel is not running as a Xen domain and we
4041db491f7SFenghua Yu 	 * can allocate the pgd from the 32-byte slab cache to save memory.
4051db491f7SFenghua Yu 	 */
4065fba4af4SMike Rapoport 	return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
4071db491f7SFenghua Yu }
4081db491f7SFenghua Yu 
4091db491f7SFenghua Yu static inline void _pgd_free(pgd_t *pgd)
4101db491f7SFenghua Yu {
4111db491f7SFenghua Yu 	if (!SHARED_KERNEL_PMD)
412e3238fafSJoerg Roedel 		free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
4131db491f7SFenghua Yu 	else
4141db491f7SFenghua Yu 		kmem_cache_free(pgd_cache, pgd);
4151db491f7SFenghua Yu }
4161db491f7SFenghua Yu #else
417d9e9a641SDave Hansen 
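/*
 * PGD_ALLOCATION_ORDER is 1 when page table isolation is enabled, so the
 * kernel and user page-table roots come from one order-1 allocation (two
 * adjacent pages); otherwise a single page is allocated.
 */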
4181db491f7SFenghua Yu static inline pgd_t *_pgd_alloc(void)
4191db491f7SFenghua Yu {
4205fba4af4SMike Rapoport 	return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
4215fba4af4SMike Rapoport 					 PGD_ALLOCATION_ORDER);
4221db491f7SFenghua Yu }
4231db491f7SFenghua Yu 
4241db491f7SFenghua Yu static inline void _pgd_free(pgd_t *pgd)
4251db491f7SFenghua Yu {
426d9e9a641SDave Hansen 	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
4271db491f7SFenghua Yu }
4281db491f7SFenghua Yu #endif /* CONFIG_X86_PAE */
4291db491f7SFenghua Yu 
4304f76cd38SJeremy Fitzhardinge pgd_t *pgd_alloc(struct mm_struct *mm)
4314f76cd38SJeremy Fitzhardinge {
432d8d5900eSJeremy Fitzhardinge 	pgd_t *pgd;
433184d47f0SKees Cook 	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
434184d47f0SKees Cook 	pmd_t *pmds[MAX_PREALLOCATED_PMDS];
4354f76cd38SJeremy Fitzhardinge 
4361db491f7SFenghua Yu 	pgd = _pgd_alloc();
437d8d5900eSJeremy Fitzhardinge 
438d8d5900eSJeremy Fitzhardinge 	if (pgd == NULL)
439d8d5900eSJeremy Fitzhardinge 		goto out;
440d8d5900eSJeremy Fitzhardinge 
4414f76cd38SJeremy Fitzhardinge 	mm->pgd = pgd;
4424f76cd38SJeremy Fitzhardinge 
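	/*
	 * The sizeof() != 0 checks let the compiler drop the preallocation
	 * calls entirely when MAX_PREALLOCATED_PMDS or
	 * MAX_PREALLOCATED_USER_PMDS is 0 and the array is zero-sized.
	 */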
44325226df4SGustavo A. R. Silva 	if (sizeof(pmds) != 0 &&
44425226df4SGustavo A. R. Silva 			preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
445d8d5900eSJeremy Fitzhardinge 		goto out_free_pgd;
446d8d5900eSJeremy Fitzhardinge 
44725226df4SGustavo A. R. Silva 	if (sizeof(u_pmds) != 0 &&
44825226df4SGustavo A. R. Silva 			preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
449d8d5900eSJeremy Fitzhardinge 		goto out_free_pmds;
450d8d5900eSJeremy Fitzhardinge 
451f59dbe9cSJoerg Roedel 	if (paravirt_pgd_alloc(mm) != 0)
452f59dbe9cSJoerg Roedel 		goto out_free_user_pmds;
453f59dbe9cSJoerg Roedel 
454d8d5900eSJeremy Fitzhardinge 	/*
455d8d5900eSJeremy Fitzhardinge 	 * Make sure that pre-populating the pmds is atomic with
456d8d5900eSJeremy Fitzhardinge 	 * respect to anything walking the pgd_list, so that they
457d8d5900eSJeremy Fitzhardinge 	 * never see a partially populated pgd.
458d8d5900eSJeremy Fitzhardinge 	 */
459a79e53d8SAndrea Arcangeli 	spin_lock(&pgd_lock);
460d8d5900eSJeremy Fitzhardinge 
461617d34d9SJeremy Fitzhardinge 	pgd_ctor(mm, pgd);
46225226df4SGustavo A. R. Silva 	if (sizeof(pmds) != 0)
463d8d5900eSJeremy Fitzhardinge 		pgd_prepopulate_pmd(mm, pgd, pmds);
46425226df4SGustavo A. R. Silva 
46525226df4SGustavo A. R. Silva 	if (sizeof(u_pmds) != 0)
466f59dbe9cSJoerg Roedel 		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
467d8d5900eSJeremy Fitzhardinge 
468a79e53d8SAndrea Arcangeli 	spin_unlock(&pgd_lock);
4694f76cd38SJeremy Fitzhardinge 
4704f76cd38SJeremy Fitzhardinge 	return pgd;
471d8d5900eSJeremy Fitzhardinge 
472f59dbe9cSJoerg Roedel out_free_user_pmds:
47325226df4SGustavo A. R. Silva 	if (sizeof(u_pmds) != 0)
474f59dbe9cSJoerg Roedel 		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
475d8d5900eSJeremy Fitzhardinge out_free_pmds:
47625226df4SGustavo A. R. Silva 	if (sizeof(pmds) != 0)
477f59dbe9cSJoerg Roedel 		free_pmds(mm, pmds, PREALLOCATED_PMDS);
478d8d5900eSJeremy Fitzhardinge out_free_pgd:
4791db491f7SFenghua Yu 	_pgd_free(pgd);
480d8d5900eSJeremy Fitzhardinge out:
481d8d5900eSJeremy Fitzhardinge 	return NULL;
4824f76cd38SJeremy Fitzhardinge }
4834f76cd38SJeremy Fitzhardinge 
4844f76cd38SJeremy Fitzhardinge void pgd_free(struct mm_struct *mm, pgd_t *pgd)
4854f76cd38SJeremy Fitzhardinge {
4864f76cd38SJeremy Fitzhardinge 	pgd_mop_up_pmds(mm, pgd);
4874f76cd38SJeremy Fitzhardinge 	pgd_dtor(pgd);
488eba0045fSJeremy Fitzhardinge 	paravirt_pgd_free(mm, pgd);
4891db491f7SFenghua Yu 	_pgd_free(pgd);
4904f76cd38SJeremy Fitzhardinge }
491ee5aa8d3SJeremy Fitzhardinge 
4920f9a921cSRik van Riel /*
4930f9a921cSRik van Riel  * Used to set accessed or dirty bits in the page table entries
4940f9a921cSRik van Riel  * on other architectures. On x86, the accessed and dirty bits
4950f9a921cSRik van Riel  * are tracked by hardware. However, do_wp_page calls this function
4960f9a921cSRik van Riel  * to also make the pte writeable at the same time the dirty bit is
4970f9a921cSRik van Riel  * set. In that case we do actually need to write the PTE.
4980f9a921cSRik van Riel  */
499ee5aa8d3SJeremy Fitzhardinge int ptep_set_access_flags(struct vm_area_struct *vma,
500ee5aa8d3SJeremy Fitzhardinge 			  unsigned long address, pte_t *ptep,
501ee5aa8d3SJeremy Fitzhardinge 			  pte_t entry, int dirty)
502ee5aa8d3SJeremy Fitzhardinge {
503ee5aa8d3SJeremy Fitzhardinge 	int changed = !pte_same(*ptep, entry);
504ee5aa8d3SJeremy Fitzhardinge 
50587930019SJuergen Gross 	if (changed && dirty)
5069bc4f28aSNadav Amit 		set_pte(ptep, entry);
507ee5aa8d3SJeremy Fitzhardinge 
508ee5aa8d3SJeremy Fitzhardinge 	return changed;
509ee5aa8d3SJeremy Fitzhardinge }
510f9fbf1a3SJeremy Fitzhardinge 
511db3eb96fSAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE
512db3eb96fSAndrea Arcangeli int pmdp_set_access_flags(struct vm_area_struct *vma,
513db3eb96fSAndrea Arcangeli 			  unsigned long address, pmd_t *pmdp,
514db3eb96fSAndrea Arcangeli 			  pmd_t entry, int dirty)
515db3eb96fSAndrea Arcangeli {
516db3eb96fSAndrea Arcangeli 	int changed = !pmd_same(*pmdp, entry);
517db3eb96fSAndrea Arcangeli 
518db3eb96fSAndrea Arcangeli 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
519db3eb96fSAndrea Arcangeli 
520db3eb96fSAndrea Arcangeli 	if (changed && dirty) {
5219bc4f28aSNadav Amit 		set_pmd(pmdp, entry);
5225e4bf1a5SIngo Molnar 		/*
5235e4bf1a5SIngo Molnar 		 * We had a write-protection fault here and changed the pmd
5245e4bf1a5SIngo Molnar 		 * to be more permissive. No need to flush the TLB for that,
5255e4bf1a5SIngo Molnar 		 * #PF is architecturally guaranteed to do that and in the
5265e4bf1a5SIngo Molnar 		 * worst-case we'll generate a spurious fault.
5275e4bf1a5SIngo Molnar 		 */
528db3eb96fSAndrea Arcangeli 	}
529db3eb96fSAndrea Arcangeli 
530db3eb96fSAndrea Arcangeli 	return changed;
531db3eb96fSAndrea Arcangeli }
532a00cc7d9SMatthew Wilcox 
533a00cc7d9SMatthew Wilcox int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
534a00cc7d9SMatthew Wilcox 			  pud_t *pudp, pud_t entry, int dirty)
535a00cc7d9SMatthew Wilcox {
536a00cc7d9SMatthew Wilcox 	int changed = !pud_same(*pudp, entry);
537a00cc7d9SMatthew Wilcox 
538a00cc7d9SMatthew Wilcox 	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
539a00cc7d9SMatthew Wilcox 
540a00cc7d9SMatthew Wilcox 	if (changed && dirty) {
5419bc4f28aSNadav Amit 		set_pud(pudp, entry);
542a00cc7d9SMatthew Wilcox 		/*
543a00cc7d9SMatthew Wilcox 		 * We had a write-protection fault here and changed the pud
545a00cc7d9SMatthew Wilcox 		 * to be more permissive. No need to flush the TLB for that,
545a00cc7d9SMatthew Wilcox 		 * #PF is architecturally guaranteed to do that and in the
546a00cc7d9SMatthew Wilcox 		 * worst-case we'll generate a spurious fault.
547a00cc7d9SMatthew Wilcox 		 */
548a00cc7d9SMatthew Wilcox 	}
549a00cc7d9SMatthew Wilcox 
550a00cc7d9SMatthew Wilcox 	return changed;
551a00cc7d9SMatthew Wilcox }
552db3eb96fSAndrea Arcangeli #endif
553db3eb96fSAndrea Arcangeli 
554f9fbf1a3SJeremy Fitzhardinge int ptep_test_and_clear_young(struct vm_area_struct *vma,
555f9fbf1a3SJeremy Fitzhardinge 			      unsigned long addr, pte_t *ptep)
556f9fbf1a3SJeremy Fitzhardinge {
557f9fbf1a3SJeremy Fitzhardinge 	int ret = 0;
558f9fbf1a3SJeremy Fitzhardinge 
559f9fbf1a3SJeremy Fitzhardinge 	if (pte_young(*ptep))
560f9fbf1a3SJeremy Fitzhardinge 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
56148e23957SThomas Gleixner 					 (unsigned long *) &ptep->pte);
562f9fbf1a3SJeremy Fitzhardinge 
563f9fbf1a3SJeremy Fitzhardinge 	return ret;
564f9fbf1a3SJeremy Fitzhardinge }
565c20311e1SJeremy Fitzhardinge 
566eed9a328SYu Zhao #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
567db3eb96fSAndrea Arcangeli int pmdp_test_and_clear_young(struct vm_area_struct *vma,
568db3eb96fSAndrea Arcangeli 			      unsigned long addr, pmd_t *pmdp)
569db3eb96fSAndrea Arcangeli {
570db3eb96fSAndrea Arcangeli 	int ret = 0;
571db3eb96fSAndrea Arcangeli 
572db3eb96fSAndrea Arcangeli 	if (pmd_young(*pmdp))
573db3eb96fSAndrea Arcangeli 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
574f2d6bfe9SJohannes Weiner 					 (unsigned long *)pmdp);
575db3eb96fSAndrea Arcangeli 
576db3eb96fSAndrea Arcangeli 	return ret;
577db3eb96fSAndrea Arcangeli }
578eed9a328SYu Zhao #endif
579eed9a328SYu Zhao 
580eed9a328SYu Zhao #ifdef CONFIG_TRANSPARENT_HUGEPAGE
581a00cc7d9SMatthew Wilcox int pudp_test_and_clear_young(struct vm_area_struct *vma,
582a00cc7d9SMatthew Wilcox 			      unsigned long addr, pud_t *pudp)
583a00cc7d9SMatthew Wilcox {
584a00cc7d9SMatthew Wilcox 	int ret = 0;
585a00cc7d9SMatthew Wilcox 
586a00cc7d9SMatthew Wilcox 	if (pud_young(*pudp))
587a00cc7d9SMatthew Wilcox 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
588a00cc7d9SMatthew Wilcox 					 (unsigned long *)pudp);
589a00cc7d9SMatthew Wilcox 
590a00cc7d9SMatthew Wilcox 	return ret;
591a00cc7d9SMatthew Wilcox }
592db3eb96fSAndrea Arcangeli #endif
593db3eb96fSAndrea Arcangeli 
594c20311e1SJeremy Fitzhardinge int ptep_clear_flush_young(struct vm_area_struct *vma,
595c20311e1SJeremy Fitzhardinge 			   unsigned long address, pte_t *ptep)
596c20311e1SJeremy Fitzhardinge {
597b13b1d2dSShaohua Li 	/*
598b13b1d2dSShaohua Li 	 * On x86 CPUs, clearing the accessed bit without a TLB flush
599b13b1d2dSShaohua Li 	 * doesn't cause data corruption. [ It could cause incorrect
600b13b1d2dSShaohua Li 	 * page aging and the (mistaken) reclaim of hot pages, but the
601b13b1d2dSShaohua Li 	 * chance of that should be relatively low. ]
602b13b1d2dSShaohua Li 	 *
603b13b1d2dSShaohua Li 	 * So as a performance optimization don't flush the TLB when
604b13b1d2dSShaohua Li 	 * clearing the accessed bit, it will eventually be flushed by
605b13b1d2dSShaohua Li 	 * a context switch or a VM operation anyway. [ In the rare
606b13b1d2dSShaohua Li 	 * event of it not getting flushed for a long time the delay
607b13b1d2dSShaohua Li 	 * shouldn't really matter because there's no real memory
608b13b1d2dSShaohua Li 	 * pressure for swapout to react to. ]
609b13b1d2dSShaohua Li 	 */
610b13b1d2dSShaohua Li 	return ptep_test_and_clear_young(vma, address, ptep);
611c20311e1SJeremy Fitzhardinge }
6127c7e6e07SJeremy Fitzhardinge 
613db3eb96fSAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE
614db3eb96fSAndrea Arcangeli int pmdp_clear_flush_young(struct vm_area_struct *vma,
615db3eb96fSAndrea Arcangeli 			   unsigned long address, pmd_t *pmdp)
616db3eb96fSAndrea Arcangeli {
617db3eb96fSAndrea Arcangeli 	int young;
618db3eb96fSAndrea Arcangeli 
619db3eb96fSAndrea Arcangeli 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
620db3eb96fSAndrea Arcangeli 
621db3eb96fSAndrea Arcangeli 	young = pmdp_test_and_clear_young(vma, address, pmdp);
622db3eb96fSAndrea Arcangeli 	if (young)
623db3eb96fSAndrea Arcangeli 		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
624db3eb96fSAndrea Arcangeli 
625db3eb96fSAndrea Arcangeli 	return young;
626db3eb96fSAndrea Arcangeli }
6274f831457SNadav Amit 
6284f831457SNadav Amit pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
6294f831457SNadav Amit 			 pmd_t *pmdp)
6304f831457SNadav Amit {
631*be0ce3f6SRyan Roberts 	VM_WARN_ON_ONCE(!pmd_present(*pmdp));
632*be0ce3f6SRyan Roberts 
6334f831457SNadav Amit 	/*
6344f831457SNadav Amit 	 * No flush is necessary. Once an invalid PTE is established, the PTE's
6354f831457SNadav Amit 	 * access and dirty bits cannot be updated.
6364f831457SNadav Amit 	 */
6374f831457SNadav Amit 	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
6384f831457SNadav Amit }
639db3eb96fSAndrea Arcangeli #endif
640db3eb96fSAndrea Arcangeli 
641fd862ddeSGustavo F. Padovan /**
642fd862ddeSGustavo F. Padovan  * reserve_top_address - reserves a hole in the top of kernel address space
643fd862ddeSGustavo F. Padovan  * @reserve: size of hole to reserve
644fd862ddeSGustavo F. Padovan  *
645fd862ddeSGustavo F. Padovan  * Can be used to relocate the fixmap area and poke a hole in the top
646fd862ddeSGustavo F. Padovan  * of kernel address space to make room for a hypervisor.
647fd862ddeSGustavo F. Padovan  */
648fd862ddeSGustavo F. Padovan void __init reserve_top_address(unsigned long reserve)
649fd862ddeSGustavo F. Padovan {
650fd862ddeSGustavo F. Padovan #ifdef CONFIG_X86_32
651fd862ddeSGustavo F. Padovan 	BUG_ON(fixmaps_set > 0);
65273159fdcSAndy Lutomirski 	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
65373159fdcSAndy Lutomirski 	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
65473159fdcSAndy Lutomirski 	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
655fd862ddeSGustavo F. Padovan #endif
656fd862ddeSGustavo F. Padovan }
657fd862ddeSGustavo F. Padovan 
6587c7e6e07SJeremy Fitzhardinge int fixmaps_set;
6597c7e6e07SJeremy Fitzhardinge 
660aeaaa59cSJeremy Fitzhardinge void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
6617c7e6e07SJeremy Fitzhardinge {
6627c7e6e07SJeremy Fitzhardinge 	unsigned long address = __fix_to_virt(idx);
6637c7e6e07SJeremy Fitzhardinge 
66405ab1d8aSFeng Tang #ifdef CONFIG_X86_64
66505ab1d8aSFeng Tang        /*
66605ab1d8aSFeng Tang 	* Ensure that the static initial page tables are covering the
66705ab1d8aSFeng Tang 	* Ensure that the static initial page tables cover the
66805ab1d8aSFeng Tang 	*/
66905ab1d8aSFeng Tang 	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
67005ab1d8aSFeng Tang 		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
67105ab1d8aSFeng Tang #endif
67205ab1d8aSFeng Tang 
6737c7e6e07SJeremy Fitzhardinge 	if (idx >= __end_of_fixed_addresses) {
6747c7e6e07SJeremy Fitzhardinge 		BUG();
6757c7e6e07SJeremy Fitzhardinge 		return;
6767c7e6e07SJeremy Fitzhardinge 	}
677aeaaa59cSJeremy Fitzhardinge 	set_pte_vaddr(address, pte);
6787c7e6e07SJeremy Fitzhardinge 	fixmaps_set++;
6797c7e6e07SJeremy Fitzhardinge }
680aeaaa59cSJeremy Fitzhardinge 
681f53e2cd0SSami Tolvanen void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
682f53e2cd0SSami Tolvanen 		       phys_addr_t phys, pgprot_t flags)
683aeaaa59cSJeremy Fitzhardinge {
684fb43d6cbSDave Hansen 	/* Sanitize 'prot' against any unsupported bits: */
685fb43d6cbSDave Hansen 	pgprot_val(flags) &= __default_kernel_pte_mask;
686fb43d6cbSDave Hansen 
687aeaaa59cSJeremy Fitzhardinge 	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
688aeaaa59cSJeremy Fitzhardinge }
6896b637835SToshi Kani 
6906b637835SToshi Kani #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
691b8504058SKirill A. Shutemov #ifdef CONFIG_X86_5LEVEL
692b8504058SKirill A. Shutemov /**
693b8504058SKirill A. Shutemov  * p4d_set_huge - setup kernel P4D mapping
694b8504058SKirill A. Shutemov  *
695b8504058SKirill A. Shutemov  * No 512GB pages yet -- always return 0
696b8504058SKirill A. Shutemov  */
697b8504058SKirill A. Shutemov int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
698b8504058SKirill A. Shutemov {
699b8504058SKirill A. Shutemov 	return 0;
700b8504058SKirill A. Shutemov }
701b8504058SKirill A. Shutemov 
702b8504058SKirill A. Shutemov /**
703b8504058SKirill A. Shutemov  * p4d_clear_huge - clear kernel P4D mapping when it is set
704b8504058SKirill A. Shutemov  *
705b8504058SKirill A. Shutemov  * No 512GB pages yet -- always return 0
706b8504058SKirill A. Shutemov  */
707c8db8c26SLi kunyu void p4d_clear_huge(p4d_t *p4d)
708b8504058SKirill A. Shutemov {
709b8504058SKirill A. Shutemov }
710b8504058SKirill A. Shutemov #endif
711b8504058SKirill A. Shutemov 
7123d3ca416SToshi Kani /**
7133d3ca416SToshi Kani  * pud_set_huge - setup kernel PUD mapping
7143d3ca416SToshi Kani  *
715b73522e0SToshi Kani  * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
71612f0dd8dSJuergen Gross  * function sets up a huge page only if the complete range has the same MTRR
71712f0dd8dSJuergen Gross  * caching mode.
718b73522e0SToshi Kani  *
719b73522e0SToshi Kani  * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
720b73522e0SToshi Kani  * page mapping attempt fails.
7213d3ca416SToshi Kani  *
7223d3ca416SToshi Kani  * Returns 1 on success and 0 on failure.
7233d3ca416SToshi Kani  */
7246b637835SToshi Kani int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
7256b637835SToshi Kani {
72612f0dd8dSJuergen Gross 	u8 uniform;
7276b637835SToshi Kani 
72812f0dd8dSJuergen Gross 	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
72912f0dd8dSJuergen Gross 	if (!uniform)
7306b637835SToshi Kani 		return 0;
7316b637835SToshi Kani 
732e3e28812SJoerg Roedel 	/* Bail out if we are on a populated non-leaf entry: */
733e3e28812SJoerg Roedel 	if (pud_present(*pud) && !pud_huge(*pud))
734e3e28812SJoerg Roedel 		return 0;
735e3e28812SJoerg Roedel 
7366b637835SToshi Kani 	set_pte((pte_t *)pud, pfn_pte(
7376b637835SToshi Kani 		(u64)addr >> PAGE_SHIFT,
738d0735693SChristoph Hellwig 		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
7396b637835SToshi Kani 
7406b637835SToshi Kani 	return 1;
7416b637835SToshi Kani }
7426b637835SToshi Kani 
7433d3ca416SToshi Kani /**
7443d3ca416SToshi Kani  * pmd_set_huge - setup kernel PMD mapping
7453d3ca416SToshi Kani  *
746b73522e0SToshi Kani  * See text over pud_set_huge() above.
7473d3ca416SToshi Kani  *
7483d3ca416SToshi Kani  * Returns 1 on success and 0 on failure.
7493d3ca416SToshi Kani  */
7506b637835SToshi Kani int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
7516b637835SToshi Kani {
75212f0dd8dSJuergen Gross 	u8 uniform;
7536b637835SToshi Kani 
75412f0dd8dSJuergen Gross 	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
75512f0dd8dSJuergen Gross 	if (!uniform) {
756b73522e0SToshi Kani 		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
757b73522e0SToshi Kani 			     __func__, addr, addr + PMD_SIZE);
7586b637835SToshi Kani 		return 0;
759b73522e0SToshi Kani 	}
7606b637835SToshi Kani 
761e3e28812SJoerg Roedel 	/* Bail out if we are on a populated non-leaf entry: */
762e3e28812SJoerg Roedel 	if (pmd_present(*pmd) && !pmd_huge(*pmd))
763e3e28812SJoerg Roedel 		return 0;
764e3e28812SJoerg Roedel 
7656b637835SToshi Kani 	set_pte((pte_t *)pmd, pfn_pte(
7666b637835SToshi Kani 		(u64)addr >> PAGE_SHIFT,
767d0735693SChristoph Hellwig 		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
7686b637835SToshi Kani 
7696b637835SToshi Kani 	return 1;
7706b637835SToshi Kani }
7716b637835SToshi Kani 
7723d3ca416SToshi Kani /**
773d8a71905SJonathan Marek  * pud_clear_huge - clear kernel PUD mapping when it is set
774d8a71905SJonathan Marek  *
775d8a71905SJonathan Marek  * Returns 1 on success and 0 on failure (no PUD map is found).
776d8a71905SJonathan Marek  */
777d8a71905SJonathan Marek int pud_clear_huge(pud_t *pud)
778d8a71905SJonathan Marek {
779907835e6SPeter Xu 	if (pud_leaf(*pud)) {
780d8a71905SJonathan Marek 		pud_clear(pud);
781d8a71905SJonathan Marek 		return 1;
782d8a71905SJonathan Marek 	}
783d8a71905SJonathan Marek 
784d8a71905SJonathan Marek 	return 0;
785d8a71905SJonathan Marek }
786d8a71905SJonathan Marek 
787d8a71905SJonathan Marek /**
7883d3ca416SToshi Kani  * pmd_clear_huge - clear kernel PMD mapping when it is set
7893d3ca416SToshi Kani  *
7903d3ca416SToshi Kani  * Returns 1 on success and 0 on failure (no PMD map is found).
7913d3ca416SToshi Kani  */
7926b637835SToshi Kani int pmd_clear_huge(pmd_t *pmd)
7936b637835SToshi Kani {
7946b637835SToshi Kani 	if (pmd_large(*pmd)) {
7956b637835SToshi Kani 		pmd_clear(pmd);
7966b637835SToshi Kani 		return 1;
7976b637835SToshi Kani 	}
7986b637835SToshi Kani 
7996b637835SToshi Kani 	return 0;
8006b637835SToshi Kani }
801b6bdb751SToshi Kani 
802f967db0bSToshi Kani #ifdef CONFIG_X86_64
803b6bdb751SToshi Kani /**
804b6bdb751SToshi Kani  * pud_free_pmd_page - Clear pud entry and free pmd page.
805b6bdb751SToshi Kani  * @pud: Pointer to a PUD.
806785a19f9SChintan Pandya  * @addr: Virtual address associated with pud.
807b6bdb751SToshi Kani  *
8085e0fb5dfSToshi Kani  * Context: The pud range has been unmapped and TLB purged.
809b6bdb751SToshi Kani  * Return: 1 if clearing the entry succeeded. 0 otherwise.
8105e0fb5dfSToshi Kani  *
8115e0fb5dfSToshi Kani  * NOTE: Callers must allow a single page allocation.
812b6bdb751SToshi Kani  */
813785a19f9SChintan Pandya int pud_free_pmd_page(pud_t *pud, unsigned long addr)
814b6bdb751SToshi Kani {
8155e0fb5dfSToshi Kani 	pmd_t *pmd, *pmd_sv;
8165e0fb5dfSToshi Kani 	pte_t *pte;
81728ee90feSToshi Kani 	int i;
81828ee90feSToshi Kani 
8199cf6fa24SAneesh Kumar K.V 	pmd = pud_pgtable(*pud);
8205e0fb5dfSToshi Kani 	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
8215e0fb5dfSToshi Kani 	if (!pmd_sv)
82228ee90feSToshi Kani 		return 0;
82328ee90feSToshi Kani 
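	/*
	 * Save the pmd entries and clear them first, so that the pte pages
	 * they point to are freed only after the paging-structure caches
	 * have been flushed below.
	 */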
8245e0fb5dfSToshi Kani 	for (i = 0; i < PTRS_PER_PMD; i++) {
8255e0fb5dfSToshi Kani 		pmd_sv[i] = pmd[i];
8265e0fb5dfSToshi Kani 		if (!pmd_none(pmd[i]))
8275e0fb5dfSToshi Kani 			pmd_clear(&pmd[i]);
8285e0fb5dfSToshi Kani 	}
8295e0fb5dfSToshi Kani 
83028ee90feSToshi Kani 	pud_clear(pud);
8315e0fb5dfSToshi Kani 
8325e0fb5dfSToshi Kani 	/* INVLPG to clear all paging-structure caches */
8335e0fb5dfSToshi Kani 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
8345e0fb5dfSToshi Kani 
8355e0fb5dfSToshi Kani 	for (i = 0; i < PTRS_PER_PMD; i++) {
8365e0fb5dfSToshi Kani 		if (!pmd_none(pmd_sv[i])) {
8375e0fb5dfSToshi Kani 			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
8385e0fb5dfSToshi Kani 			free_page((unsigned long)pte);
8395e0fb5dfSToshi Kani 		}
8405e0fb5dfSToshi Kani 	}
8415e0fb5dfSToshi Kani 
8425e0fb5dfSToshi Kani 	free_page((unsigned long)pmd_sv);
843d1c5246eSDan Williams 
844f92c494fSVishal Moola (Oracle) 	pagetable_pmd_dtor(virt_to_ptdesc(pmd));
84528ee90feSToshi Kani 	free_page((unsigned long)pmd);
84628ee90feSToshi Kani 
84728ee90feSToshi Kani 	return 1;
848b6bdb751SToshi Kani }
849b6bdb751SToshi Kani 
850b6bdb751SToshi Kani /**
851b6bdb751SToshi Kani  * pmd_free_pte_page - Clear pmd entry and free pte page.
852b6bdb751SToshi Kani  * @pmd: Pointer to a PMD.
853785a19f9SChintan Pandya  * @addr: Virtual address associated with pmd.
854b6bdb751SToshi Kani  *
8555e0fb5dfSToshi Kani  * Context: The pmd range has been unmapped and TLB purged.
856b6bdb751SToshi Kani  * Return: 1 if clearing the entry succeeded. 0 otherwise.
857b6bdb751SToshi Kani  */
858785a19f9SChintan Pandya int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
859b6bdb751SToshi Kani {
86028ee90feSToshi Kani 	pte_t *pte;
86128ee90feSToshi Kani 
86228ee90feSToshi Kani 	pte = (pte_t *)pmd_page_vaddr(*pmd);
86328ee90feSToshi Kani 	pmd_clear(pmd);
8645e0fb5dfSToshi Kani 
8655e0fb5dfSToshi Kani 	/* INVLPG to clear all paging-structure caches */
8665e0fb5dfSToshi Kani 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
8675e0fb5dfSToshi Kani 
86828ee90feSToshi Kani 	free_page((unsigned long)pte);
86928ee90feSToshi Kani 
87028ee90feSToshi Kani 	return 1;
871b6bdb751SToshi Kani }
872f967db0bSToshi Kani 
873f967db0bSToshi Kani #else /* !CONFIG_X86_64 */
874f967db0bSToshi Kani 
875f967db0bSToshi Kani /*
876f967db0bSToshi Kani  * Disable free page handling on x86-PAE. This ensures that ioremap()
877f967db0bSToshi Kani  * does not update sync'd pmd entries. See vmalloc_sync_one().
878f967db0bSToshi Kani  */
879785a19f9SChintan Pandya int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
880f967db0bSToshi Kani {
881f967db0bSToshi Kani 	return pmd_none(*pmd);
882f967db0bSToshi Kani }
883f967db0bSToshi Kani 
884f967db0bSToshi Kani #endif /* CONFIG_X86_64 */
8856b637835SToshi Kani #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
886bb3aadf7SRick Edgecombe 
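/*
 * x86 overrides the generic pte_mkwrite()/pmd_mkwrite() so that mappings
 * in a shadow stack VMA become shadow stack entries (Write=0, Dirty=1),
 * while mappings in ordinary VMAs become normally writable with the
 * saved-dirty bit cleared.
 */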
887bb3aadf7SRick Edgecombe pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
888bb3aadf7SRick Edgecombe {
889b497e52dSRick Edgecombe 	if (vma->vm_flags & VM_SHADOW_STACK)
890b497e52dSRick Edgecombe 		return pte_mkwrite_shstk(pte);
891b497e52dSRick Edgecombe 
892bb3aadf7SRick Edgecombe 	pte = pte_mkwrite_novma(pte);
893bb3aadf7SRick Edgecombe 
894bb3aadf7SRick Edgecombe 	return pte_clear_saveddirty(pte);
895bb3aadf7SRick Edgecombe }
896bb3aadf7SRick Edgecombe 
897bb3aadf7SRick Edgecombe pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
898bb3aadf7SRick Edgecombe {
899b497e52dSRick Edgecombe 	if (vma->vm_flags & VM_SHADOW_STACK)
900b497e52dSRick Edgecombe 		return pmd_mkwrite_shstk(pmd);
901b497e52dSRick Edgecombe 
902bb3aadf7SRick Edgecombe 	pmd = pmd_mkwrite_novma(pmd);
903bb3aadf7SRick Edgecombe 
904bb3aadf7SRick Edgecombe 	return pmd_clear_saveddirty(pmd);
905bb3aadf7SRick Edgecombe }
906e5136e87SRick Edgecombe 
907e5136e87SRick Edgecombe void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
908e5136e87SRick Edgecombe {
909e5136e87SRick Edgecombe 	/*
910e5136e87SRick Edgecombe 	 * Hardware that predates shadow stack can (rarely) set Dirty=1
911e5136e87SRick Edgecombe 	 * on a Write=0 PTE. So the condition below only
912e5136e87SRick Edgecombe 	 * indicates a software bug when shadow stack is
913e5136e87SRick Edgecombe 	 * supported by the HW; that check is handled in
914e5136e87SRick Edgecombe 	 * pte_shstk().
915e5136e87SRick Edgecombe 	 */
916e5136e87SRick Edgecombe 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
917e5136e87SRick Edgecombe 			pte_shstk(pte));
918e5136e87SRick Edgecombe }
919e5136e87SRick Edgecombe 
920e5136e87SRick Edgecombe void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
921e5136e87SRick Edgecombe {
922e5136e87SRick Edgecombe 	/* See note in arch_check_zapped_pte() */
923e5136e87SRick Edgecombe 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
924e5136e87SRick Edgecombe 			pmd_shstk(pmd));
925e5136e87SRick Edgecombe }
926