// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

#ifdef CONFIG_HIGHPTE
#define PGTABLE_HIGHMEM __GFP_HIGHMEM
#else
#define PGTABLE_HIGHMEM 0
#endif

#ifndef CONFIG_PARAVIRT
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	tlb_remove_page(tlb, table);
}
#endif

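/*
 * GFP mask used for user PTE pages.  With CONFIG_HIGHPTE this allows
 * allocations from highmem; "userpte=nohigh" (parsed below) clears
 * __GFP_HIGHMEM again at boot time.
 */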
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;

pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	return __pte_alloc_one(mm, __userpte_alloc_gfp);
}

static int __init setup_userpte(char *arg)
{
	if (!arg)
		return -EINVAL;

	/*
	 * "userpte=nohigh" disables allocation of user pagetables in
	 * high memory.
	 */
	if (strcmp(arg, "nohigh") == 0)
		__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
	else
		return -EINVAL;
	return 0;
}
early_param("userpte", setup_userpte);

void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	pagetable_pte_dtor(page_ptdesc(pte));
	paravirt_release_pte(page_to_pfn(pte));
	paravirt_tlb_remove_table(tlb, pte);
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
	/*
	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
	 * entries need a full cr3 reload to flush.
	 */
#ifdef CONFIG_X86_PAE
	tlb->need_flush_all = 1;
#endif
	pagetable_pmd_dtor(ptdesc);
	paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
	paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
	paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
#endif	/* CONFIG_PGTABLE_LEVELS > 2 */

static inline void pgd_list_add(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_del(&ptdesc->pt_list);
}

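/*
 * Number of pgd entries that are private to each pagetable: when the
 * kernel pmd is shared (PAE), only the user portion below
 * KERNEL_PGD_BOUNDARY needs per-pgd backing; otherwise every entry does.
 */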
#define UNSHARED_PTRS_PER_PGD				\
	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD			\
	max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)


static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
	virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
	return page_ptdesc(page)->pt_mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* If the pgd points to a shared pagetable level (either the
	   ptes in non-PAE, or shared PMD in PAE), then just copy the
	   references from swapper_pg_dir. */
	if (CONFIG_PGTABLE_LEVELS == 2 ||
	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
	    CONFIG_PGTABLE_LEVELS >= 4) {
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
				KERNEL_PGD_PTRS);
	}

	/* list required to sync kernel mapping updates */
	if (!SHARED_KERNEL_PMD) {
		pgd_set_mm(pgd, mm);
		pgd_list_add(pgd);
	}
}

static void pgd_dtor(pgd_t *pgd)
{
	if (SHARED_KERNEL_PMD)
		return;

	spin_lock(&pgd_lock);
	pgd_list_del(pgd);
	spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS	MAX_UNSHARED_PTRS_PER_PGD

/*
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS	(boot_cpu_has(X86_FEATURE_PTI) ? \
					KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

	/* Note: almost everything apart from _PAGE_PRESENT is
	   reserved at the pmd (PDPT) level. */
	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 */
	flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS	0
#define MAX_PREALLOCATED_PMDS	0
#define PREALLOCATED_USER_PMDS	0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif	/* CONFIG_X86_PAE */

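/*
 * Free the first @count entries of @pmds (skipping any that failed to
 * allocate) and drop the mm's pmd accounting for each one.
 */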
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	struct ptdesc *ptdesc;

	for (i = 0; i < count; i++)
		if (pmds[i]) {
			ptdesc = virt_to_ptdesc(pmds[i]);

			pagetable_pmd_dtor(ptdesc);
			pagetable_free(ptdesc);
			mm_dec_nr_pmds(mm);
		}
}

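/*
 * Preallocate @count pmd pages for a new pgd.  On any allocation failure
 * everything allocated so far is freed again and -ENOMEM is returned;
 * on success the pages are handed back in @pmds[].
 */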
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	bool failed = false;
	gfp_t gfp = GFP_PGTABLE_USER;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	gfp &= ~__GFP_HIGHMEM;

	for (i = 0; i < count; i++) {
		pmd_t *pmd = NULL;
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		if (!ptdesc)
			failed = true;
		if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
			pagetable_free(ptdesc);
			ptdesc = NULL;
			failed = true;
		}
		if (ptdesc) {
			mm_inc_nr_pmds(mm);
			pmd = ptdesc_address(ptdesc);
		}

		pmds[i] = pmd;
	}

	if (failed) {
		free_pmds(mm, pmds, count);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
	pgd_t pgd = *pgdp;

	if (pgd_val(pgd) != 0) {
		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

		pgd_clear(pgdp);

		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
		pmd_free(mm, pmd);
		mm_dec_nr_pmds(mm);
	}
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
	int i;

	for (i = 0; i < PREALLOCATED_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_PAGE_TABLE_ISOLATION

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pgdp = kernel_to_user_pgdp(pgdp);

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

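/*
 * Attach the preallocated pmds to a new pgd.  Kernel-space entries
 * (index >= KERNEL_PGD_BOUNDARY) start out as a copy of the
 * corresponding swapper_pg_dir pmd so kernel mappings stay in sync.
 */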
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
	p4d_t *p4d;
	pud_t *pud;
	int i;

	p4d = p4d_offset(pgd, 0);
	pud = pud_offset(p4d, 0);

	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
		pmd_t *pmd = pmds[i];

		if (i >= KERNEL_PGD_BOUNDARY)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}
}

#ifdef CONFIG_PAGE_TABLE_ISOLATION
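/*
 * With PTI, the kernel half of the user page-table gets its own pmds
 * (see PREALLOCATED_USER_PMDS above); populate them from the user
 * counterpart of swapper_pg_dir.
 */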
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	p4d_t *u_p4d;
	pud_t *u_pud;
	int i;

	u_p4d = p4d_offset(u_pgd, 0);
	u_pud = pud_offset(u_p4d, 0);

	s_pgd += KERNEL_PGD_BOUNDARY;
	u_pud += KERNEL_PGD_BOUNDARY;

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
		pmd_t *pmd = pmds[i];

		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
		       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, u_pud, pmd);
	}

}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
 * Xen paravirt assumes that the pgd table fits in a single page, and the
 * 64-bit kernel makes the same assumption.
 *
 * But a kernel with PAE paging that is not running as a Xen domain only
 * needs to allocate 32 bytes for the pgd instead of one page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

#define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN	32

static struct kmem_cache *pgd_cache;

void __init pgtable_cache_init(void)
{
	/*
	 * When a PAE kernel is running as a Xen domain, it does not use
	 * a shared kernel pmd, and that requires a whole page for the pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return;

	/*
	 * When a PAE kernel is not running as a Xen domain, it uses a
	 * shared kernel pmd, which does not require a whole page for the
	 * pgd: 32 bytes are enough.  During boot we create a 32-byte slab
	 * for pgd table allocation.
	 */
	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
				      SLAB_PANIC, NULL);
}

static inline pgd_t *_pgd_alloc(void)
{
	/*
	 * Without SHARED_KERNEL_PMD the PAE kernel is running as a Xen
	 * domain and we allocate a whole page for the pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
						 PGD_ALLOCATION_ORDER);

	/*
	 * Otherwise the PAE kernel is not running as a Xen domain and we
	 * can allocate the 32-byte pgd from the slab to save memory.
	 */
	return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}

static inline void _pgd_free(pgd_t *pgd)
{
	if (!SHARED_KERNEL_PMD)
		free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
	else
		kmem_cache_free(pgd_cache, pgd);
}
#else

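/*
 * PGD_ALLOCATION_ORDER (defined in <asm/pgalloc.h>) is 1 when page-table
 * isolation is enabled, so the kernel and user PGDs are allocated together
 * as one order-1 block; otherwise a single page is enough.
 */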
static inline pgd_t *_pgd_alloc(void)
{
	return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
					 PGD_ALLOCATION_ORDER);
}

static inline void _pgd_free(pgd_t *pgd)
{
	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */

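/*
 * Allocate and set up a new pgd: allocate the pgd itself, preallocate
 * the pmds it needs (PAE and/or PTI), and then, under pgd_lock, clone
 * the kernel mappings and attach the pmds so that walkers of pgd_list
 * never see a half-initialized pgd.
 */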
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;
	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
	pmd_t *pmds[MAX_PREALLOCATED_PMDS];

	pgd = _pgd_alloc();

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;

	if (sizeof(pmds) != 0 &&
	    preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
		goto out_free_pgd;

	if (sizeof(u_pmds) != 0 &&
	    preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
		goto out_free_pmds;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_user_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);
	if (sizeof(pmds) != 0)
		pgd_prepopulate_pmd(mm, pgd, pmds);

	if (sizeof(u_pmds) != 0)
		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_user_pmds:
	if (sizeof(u_pmds) != 0)
		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
	if (sizeof(pmds) != 0)
		free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
	_pgd_free(pgd);
out:
	return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	paravirt_pgd_free(mm, pgd);
	_pgd_free(pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);

	if (changed && dirty)
		set_pte(ptep, entry);

	return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (changed && dirty) {
		set_pmd(pmdp, entry);
		/*
		 * We had a write-protection fault here and changed the pmd
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pud_t *pudp, pud_t entry, int dirty)
{
	int changed = !pud_same(*pudp, entry);

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);

	if (changed && dirty) {
		set_pud(pudp, entry);
		/*
		 * We had a write-protection fault here and changed the pud
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}
#endif

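/*
 * Atomically clear the Accessed bit of a pte and report whether it was
 * set.  No TLB flush is done here; see the comment in
 * ptep_clear_flush_young() below for why that is safe on x86.
 */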
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
	int ret = 0;

	if (pte_young(*ptep))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *) &ptep->pte);

	return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmdp)
{
	int ret = 0;

	if (pmd_young(*pmdp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pmdp);

	return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pud_t *pudp)
{
	int ret = 0;

	if (pud_young(*pudp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pudp);

	return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	/*
	 * On x86 CPUs, clearing the accessed bit without a TLB flush
	 * doesn't cause data corruption. [ It could cause incorrect
	 * page aging and the (mistaken) reclaim of hot pages, but the
	 * chance of that should be relatively low. ]
	 *
	 * So as a performance optimization don't flush the TLB when
	 * clearing the accessed bit, it will eventually be flushed by
	 * a context switch or a VM operation anyway. [ In the rare
	 * event of it not getting flushed for a long time the delay
	 * shouldn't really matter because there's no real memory
	 * pressure for swapout to react to. ]
	 */
	return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

	return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));

	/*
	 * No flush is necessary. Once an invalid PTE is established, the PTE's
	 * access and dirty bits cannot be updated.
	 */
	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
	BUG_ON(fixmaps_set > 0);
	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

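/*
 * Install @pte at the fixmap virtual address for slot @idx.  This is the
 * native backend behind set_fixmap().
 */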
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
	unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
	/*
	 * Ensure that the static initial page tables are covering the
	 * fixmap completely.
	 */
	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_vaddr(address, pte);
	fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
		       phys_addr_t phys, pgprot_t flags)
{
	/* Sanitize 'prot' against any unsupported bits: */
	pgprot_val(flags) &= __default_kernel_pte_mask;

	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 *
 * No 512GB pages yet -- nothing to do
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - setup kernel PUD mapping
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
	if (!uniform)
		return 0;

	/* Bail out if we are on a populated non-leaf entry: */
	if (pud_present(*pud) && !pud_huge(*pud))
		return 0;

	set_pte((pte_t *)pud, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
	if (!uniform) {
		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
			     __func__, addr, addr + PMD_SIZE);
		return 0;
	}

	/* Bail out if we are on a populated non-leaf entry: */
	if (pmd_present(*pmd) && !pmd_huge(*pmd))
		return 0;

	set_pte((pte_t *)pmd, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

/**
 * pmd_clear_huge - clear kernel PMD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_large(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd, *pmd_sv;
	pte_t *pte;
	int i;

	pmd = pud_pgtable(*pud);
	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
	if (!pmd_sv)
		return 0;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd_sv[i] = pmd[i];
		if (!pmd_none(pmd[i]))
			pmd_clear(&pmd[i]);
	}

	pud_clear(pud);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd_sv[i])) {
			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
			free_page((unsigned long)pte);
		}
	}

	free_page((unsigned long)pmd_sv);

	pagetable_pmd_dtor(virt_to_ptdesc(pmd));
	free_page((unsigned long)pmd);

	return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	free_page((unsigned long)pte);

	return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

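/*
 * Shadow-stack PTEs are encoded as Write=0,Dirty=1, so how a mapping is
 * made writable depends on the VMA: shadow-stack VMAs get the shstk
 * encoding, everything else gets a normal writable entry with the
 * SavedDirty bit cleared.
 */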
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	pte = pte_mkwrite_novma(pte);

	return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pmd_mkwrite_shstk(pmd);

	pmd = pmd_mkwrite_novma(pmd);

	return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
	/*
	 * Hardware before shadow stack can (rarely) set Dirty=1
	 * on a Write=0 PTE. So the below condition
	 * only indicates a software bug when shadow stack is
	 * supported by the HW. This checking is covered in
	 * pte_shstk().
	 */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pmd_shstk(pmd));
}