1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/mm.h> 3 #include <linux/gfp.h> 4 #include <linux/hugetlb.h> 5 #include <asm/pgalloc.h> 6 #include <asm/pgtable.h> 7 #include <asm/tlb.h> 8 #include <asm/fixmap.h> 9 #include <asm/mtrr.h> 10 11 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK 12 phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; 13 EXPORT_SYMBOL(physical_mask); 14 #endif 15 16 #define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) 17 18 #ifdef CONFIG_HIGHPTE 19 #define PGALLOC_USER_GFP __GFP_HIGHMEM 20 #else 21 #define PGALLOC_USER_GFP 0 22 #endif 23 24 gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; 25 26 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 27 { 28 return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); 29 } 30 31 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) 32 { 33 struct page *pte; 34 35 pte = alloc_pages(__userpte_alloc_gfp, 0); 36 if (!pte) 37 return NULL; 38 if (!pgtable_page_ctor(pte)) { 39 __free_page(pte); 40 return NULL; 41 } 42 return pte; 43 } 44 45 static int __init setup_userpte(char *arg) 46 { 47 if (!arg) 48 return -EINVAL; 49 50 /* 51 * "userpte=nohigh" disables allocation of user pagetables in 52 * high memory. 53 */ 54 if (strcmp(arg, "nohigh") == 0) 55 __userpte_alloc_gfp &= ~__GFP_HIGHMEM; 56 else 57 return -EINVAL; 58 return 0; 59 } 60 early_param("userpte", setup_userpte); 61 62 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 63 { 64 pgtable_page_dtor(pte); 65 paravirt_release_pte(page_to_pfn(pte)); 66 paravirt_tlb_remove_table(tlb, pte); 67 } 68 69 #if CONFIG_PGTABLE_LEVELS > 2 70 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 71 { 72 struct page *page = virt_to_page(pmd); 73 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); 74 /* 75 * NOTE! For PAE, any changes to the top page-directory-pointer-table 76 * entries need a full cr3 reload to flush. 77 */ 78 #ifdef CONFIG_X86_PAE 79 tlb->need_flush_all = 1; 80 #endif 81 pgtable_pmd_page_dtor(page); 82 paravirt_tlb_remove_table(tlb, page); 83 } 84 85 #if CONFIG_PGTABLE_LEVELS > 3 86 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 87 { 88 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 89 paravirt_tlb_remove_table(tlb, virt_to_page(pud)); 90 } 91 92 #if CONFIG_PGTABLE_LEVELS > 4 93 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) 94 { 95 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); 96 paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); 97 } 98 #endif /* CONFIG_PGTABLE_LEVELS > 4 */ 99 #endif /* CONFIG_PGTABLE_LEVELS > 3 */ 100 #endif /* CONFIG_PGTABLE_LEVELS > 2 */ 101 102 static inline void pgd_list_add(pgd_t *pgd) 103 { 104 struct page *page = virt_to_page(pgd); 105 106 list_add(&page->lru, &pgd_list); 107 } 108 109 static inline void pgd_list_del(pgd_t *pgd) 110 { 111 struct page *page = virt_to_page(pgd); 112 113 list_del(&page->lru); 114 } 115 116 #define UNSHARED_PTRS_PER_PGD \ 117 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 118 119 120 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) 121 { 122 virt_to_page(pgd)->pt_mm = mm; 123 } 124 125 struct mm_struct *pgd_page_get_mm(struct page *page) 126 { 127 return page->pt_mm; 128 } 129 130 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) 131 { 132 /* If the pgd points to a shared pagetable level (either the 133 ptes in non-PAE, or shared PMD in PAE), then just copy the 134 references from swapper_pg_dir. */ 135 if (CONFIG_PGTABLE_LEVELS == 2 || 136 (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || 137 CONFIG_PGTABLE_LEVELS >= 4) { 138 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 139 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 140 KERNEL_PGD_PTRS); 141 } 142 143 /* list required to sync kernel mapping updates */ 144 if (!SHARED_KERNEL_PMD) { 145 pgd_set_mm(pgd, mm); 146 pgd_list_add(pgd); 147 } 148 } 149 150 static void pgd_dtor(pgd_t *pgd) 151 { 152 if (SHARED_KERNEL_PMD) 153 return; 154 155 spin_lock(&pgd_lock); 156 pgd_list_del(pgd); 157 spin_unlock(&pgd_lock); 158 } 159 160 /* 161 * List of all pgd's needed for non-PAE so it can invalidate entries 162 * in both cached and uncached pgd's; not needed for PAE since the 163 * kernel pmd is shared. If PAE were not to share the pmd a similar 164 * tactic would be needed. This is essentially codepath-based locking 165 * against pageattr.c; it is the unique case in which a valid change 166 * of kernel pagetables can't be lazily synchronized by vmalloc faults. 167 * vmalloc faults work because attached pagetables are never freed. 168 * -- nyc 169 */ 170 171 #ifdef CONFIG_X86_PAE 172 /* 173 * In PAE mode, we need to do a cr3 reload (=tlb flush) when 174 * updating the top-level pagetable entries to guarantee the 175 * processor notices the update. Since this is expensive, and 176 * all 4 top-level entries are used almost immediately in a 177 * new process's life, we just pre-populate them here. 178 * 179 * Also, if we're in a paravirt environment where the kernel pmd is 180 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate 181 * and initialize the kernel pmds here. 182 */ 183 #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD 184 185 /* 186 * We allocate separate PMDs for the kernel part of the user page-table 187 * when PTI is enabled. We need them to map the per-process LDT into the 188 * user-space page-table. 189 */ 190 #define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ 191 KERNEL_PGD_PTRS : 0) 192 193 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 194 { 195 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 196 197 /* Note: almost everything apart from _PAGE_PRESENT is 198 reserved at the pmd (PDPT) level. */ 199 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); 200 201 /* 202 * According to Intel App note "TLBs, Paging-Structure Caches, 203 * and Their Invalidation", April 2007, document 317080-001, 204 * section 8.1: in PAE mode we explicitly have to flush the 205 * TLB via cr3 if the top-level pgd is changed... 206 */ 207 flush_tlb_mm(mm); 208 } 209 #else /* !CONFIG_X86_PAE */ 210 211 /* No need to prepopulate any pagetable entries in non-PAE modes. */ 212 #define PREALLOCATED_PMDS 0 213 #define PREALLOCATED_USER_PMDS 0 214 #endif /* CONFIG_X86_PAE */ 215 216 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) 217 { 218 int i; 219 220 for (i = 0; i < count; i++) 221 if (pmds[i]) { 222 pgtable_pmd_page_dtor(virt_to_page(pmds[i])); 223 free_page((unsigned long)pmds[i]); 224 mm_dec_nr_pmds(mm); 225 } 226 } 227 228 static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) 229 { 230 int i; 231 bool failed = false; 232 gfp_t gfp = PGALLOC_GFP; 233 234 if (mm == &init_mm) 235 gfp &= ~__GFP_ACCOUNT; 236 237 for (i = 0; i < count; i++) { 238 pmd_t *pmd = (pmd_t *)__get_free_page(gfp); 239 if (!pmd) 240 failed = true; 241 if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { 242 free_page((unsigned long)pmd); 243 pmd = NULL; 244 failed = true; 245 } 246 if (pmd) 247 mm_inc_nr_pmds(mm); 248 pmds[i] = pmd; 249 } 250 251 if (failed) { 252 free_pmds(mm, pmds, count); 253 return -ENOMEM; 254 } 255 256 return 0; 257 } 258 259 /* 260 * Mop up any pmd pages which may still be attached to the pgd. 261 * Normally they will be freed by munmap/exit_mmap, but any pmd we 262 * preallocate which never got a corresponding vma will need to be 263 * freed manually. 264 */ 265 static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) 266 { 267 pgd_t pgd = *pgdp; 268 269 if (pgd_val(pgd) != 0) { 270 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); 271 272 *pgdp = native_make_pgd(0); 273 274 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); 275 pmd_free(mm, pmd); 276 mm_dec_nr_pmds(mm); 277 } 278 } 279 280 static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) 281 { 282 int i; 283 284 for (i = 0; i < PREALLOCATED_PMDS; i++) 285 mop_up_one_pmd(mm, &pgdp[i]); 286 287 #ifdef CONFIG_PAGE_TABLE_ISOLATION 288 289 if (!static_cpu_has(X86_FEATURE_PTI)) 290 return; 291 292 pgdp = kernel_to_user_pgdp(pgdp); 293 294 for (i = 0; i < PREALLOCATED_USER_PMDS; i++) 295 mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); 296 #endif 297 } 298 299 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) 300 { 301 p4d_t *p4d; 302 pud_t *pud; 303 int i; 304 305 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ 306 return; 307 308 p4d = p4d_offset(pgd, 0); 309 pud = pud_offset(p4d, 0); 310 311 for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { 312 pmd_t *pmd = pmds[i]; 313 314 if (i >= KERNEL_PGD_BOUNDARY) 315 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 316 sizeof(pmd_t) * PTRS_PER_PMD); 317 318 pud_populate(mm, pud, pmd); 319 } 320 } 321 322 #ifdef CONFIG_PAGE_TABLE_ISOLATION 323 static void pgd_prepopulate_user_pmd(struct mm_struct *mm, 324 pgd_t *k_pgd, pmd_t *pmds[]) 325 { 326 pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); 327 pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); 328 p4d_t *u_p4d; 329 pud_t *u_pud; 330 int i; 331 332 u_p4d = p4d_offset(u_pgd, 0); 333 u_pud = pud_offset(u_p4d, 0); 334 335 s_pgd += KERNEL_PGD_BOUNDARY; 336 u_pud += KERNEL_PGD_BOUNDARY; 337 338 for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { 339 pmd_t *pmd = pmds[i]; 340 341 memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), 342 sizeof(pmd_t) * PTRS_PER_PMD); 343 344 pud_populate(mm, u_pud, pmd); 345 } 346 347 } 348 #else 349 static void pgd_prepopulate_user_pmd(struct mm_struct *mm, 350 pgd_t *k_pgd, pmd_t *pmds[]) 351 { 352 } 353 #endif 354 /* 355 * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also 356 * assumes that pgd should be in one page. 357 * 358 * But kernel with PAE paging that is not running as a Xen domain 359 * only needs to allocate 32 bytes for pgd instead of one page. 360 */ 361 #ifdef CONFIG_X86_PAE 362 363 #include <linux/slab.h> 364 365 #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) 366 #define PGD_ALIGN 32 367 368 static struct kmem_cache *pgd_cache; 369 370 static int __init pgd_cache_init(void) 371 { 372 /* 373 * When PAE kernel is running as a Xen domain, it does not use 374 * shared kernel pmd. And this requires a whole page for pgd. 375 */ 376 if (!SHARED_KERNEL_PMD) 377 return 0; 378 379 /* 380 * when PAE kernel is not running as a Xen domain, it uses 381 * shared kernel pmd. Shared kernel pmd does not require a whole 382 * page for pgd. We are able to just allocate a 32-byte for pgd. 383 * During boot time, we create a 32-byte slab for pgd table allocation. 384 */ 385 pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, 386 SLAB_PANIC, NULL); 387 return 0; 388 } 389 core_initcall(pgd_cache_init); 390 391 static inline pgd_t *_pgd_alloc(void) 392 { 393 /* 394 * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. 395 * We allocate one page for pgd. 396 */ 397 if (!SHARED_KERNEL_PMD) 398 return (pgd_t *)__get_free_pages(PGALLOC_GFP, 399 PGD_ALLOCATION_ORDER); 400 401 /* 402 * Now PAE kernel is not running as a Xen domain. We can allocate 403 * a 32-byte slab for pgd to save memory space. 404 */ 405 return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); 406 } 407 408 static inline void _pgd_free(pgd_t *pgd) 409 { 410 if (!SHARED_KERNEL_PMD) 411 free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); 412 else 413 kmem_cache_free(pgd_cache, pgd); 414 } 415 #else 416 417 static inline pgd_t *_pgd_alloc(void) 418 { 419 return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); 420 } 421 422 static inline void _pgd_free(pgd_t *pgd) 423 { 424 free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); 425 } 426 #endif /* CONFIG_X86_PAE */ 427 428 pgd_t *pgd_alloc(struct mm_struct *mm) 429 { 430 pgd_t *pgd; 431 pmd_t *u_pmds[PREALLOCATED_USER_PMDS]; 432 pmd_t *pmds[PREALLOCATED_PMDS]; 433 434 pgd = _pgd_alloc(); 435 436 if (pgd == NULL) 437 goto out; 438 439 mm->pgd = pgd; 440 441 if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) 442 goto out_free_pgd; 443 444 if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) 445 goto out_free_pmds; 446 447 if (paravirt_pgd_alloc(mm) != 0) 448 goto out_free_user_pmds; 449 450 /* 451 * Make sure that pre-populating the pmds is atomic with 452 * respect to anything walking the pgd_list, so that they 453 * never see a partially populated pgd. 454 */ 455 spin_lock(&pgd_lock); 456 457 pgd_ctor(mm, pgd); 458 pgd_prepopulate_pmd(mm, pgd, pmds); 459 pgd_prepopulate_user_pmd(mm, pgd, u_pmds); 460 461 spin_unlock(&pgd_lock); 462 463 return pgd; 464 465 out_free_user_pmds: 466 free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); 467 out_free_pmds: 468 free_pmds(mm, pmds, PREALLOCATED_PMDS); 469 out_free_pgd: 470 _pgd_free(pgd); 471 out: 472 return NULL; 473 } 474 475 void pgd_free(struct mm_struct *mm, pgd_t *pgd) 476 { 477 pgd_mop_up_pmds(mm, pgd); 478 pgd_dtor(pgd); 479 paravirt_pgd_free(mm, pgd); 480 _pgd_free(pgd); 481 } 482 483 /* 484 * Used to set accessed or dirty bits in the page table entries 485 * on other architectures. On x86, the accessed and dirty bits 486 * are tracked by hardware. However, do_wp_page calls this function 487 * to also make the pte writeable at the same time the dirty bit is 488 * set. In that case we do actually need to write the PTE. 489 */ 490 int ptep_set_access_flags(struct vm_area_struct *vma, 491 unsigned long address, pte_t *ptep, 492 pte_t entry, int dirty) 493 { 494 int changed = !pte_same(*ptep, entry); 495 496 if (changed && dirty) 497 *ptep = entry; 498 499 return changed; 500 } 501 502 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 503 int pmdp_set_access_flags(struct vm_area_struct *vma, 504 unsigned long address, pmd_t *pmdp, 505 pmd_t entry, int dirty) 506 { 507 int changed = !pmd_same(*pmdp, entry); 508 509 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 510 511 if (changed && dirty) { 512 *pmdp = entry; 513 /* 514 * We had a write-protection fault here and changed the pmd 515 * to to more permissive. No need to flush the TLB for that, 516 * #PF is architecturally guaranteed to do that and in the 517 * worst-case we'll generate a spurious fault. 518 */ 519 } 520 521 return changed; 522 } 523 524 int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, 525 pud_t *pudp, pud_t entry, int dirty) 526 { 527 int changed = !pud_same(*pudp, entry); 528 529 VM_BUG_ON(address & ~HPAGE_PUD_MASK); 530 531 if (changed && dirty) { 532 *pudp = entry; 533 /* 534 * We had a write-protection fault here and changed the pud 535 * to to more permissive. No need to flush the TLB for that, 536 * #PF is architecturally guaranteed to do that and in the 537 * worst-case we'll generate a spurious fault. 538 */ 539 } 540 541 return changed; 542 } 543 #endif 544 545 int ptep_test_and_clear_young(struct vm_area_struct *vma, 546 unsigned long addr, pte_t *ptep) 547 { 548 int ret = 0; 549 550 if (pte_young(*ptep)) 551 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 552 (unsigned long *) &ptep->pte); 553 554 return ret; 555 } 556 557 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 558 int pmdp_test_and_clear_young(struct vm_area_struct *vma, 559 unsigned long addr, pmd_t *pmdp) 560 { 561 int ret = 0; 562 563 if (pmd_young(*pmdp)) 564 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 565 (unsigned long *)pmdp); 566 567 return ret; 568 } 569 int pudp_test_and_clear_young(struct vm_area_struct *vma, 570 unsigned long addr, pud_t *pudp) 571 { 572 int ret = 0; 573 574 if (pud_young(*pudp)) 575 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 576 (unsigned long *)pudp); 577 578 return ret; 579 } 580 #endif 581 582 int ptep_clear_flush_young(struct vm_area_struct *vma, 583 unsigned long address, pte_t *ptep) 584 { 585 /* 586 * On x86 CPUs, clearing the accessed bit without a TLB flush 587 * doesn't cause data corruption. [ It could cause incorrect 588 * page aging and the (mistaken) reclaim of hot pages, but the 589 * chance of that should be relatively low. ] 590 * 591 * So as a performance optimization don't flush the TLB when 592 * clearing the accessed bit, it will eventually be flushed by 593 * a context switch or a VM operation anyway. [ In the rare 594 * event of it not getting flushed for a long time the delay 595 * shouldn't really matter because there's no real memory 596 * pressure for swapout to react to. ] 597 */ 598 return ptep_test_and_clear_young(vma, address, ptep); 599 } 600 601 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 602 int pmdp_clear_flush_young(struct vm_area_struct *vma, 603 unsigned long address, pmd_t *pmdp) 604 { 605 int young; 606 607 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 608 609 young = pmdp_test_and_clear_young(vma, address, pmdp); 610 if (young) 611 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 612 613 return young; 614 } 615 #endif 616 617 /** 618 * reserve_top_address - reserves a hole in the top of kernel address space 619 * @reserve - size of hole to reserve 620 * 621 * Can be used to relocate the fixmap area and poke a hole in the top 622 * of kernel address space to make room for a hypervisor. 623 */ 624 void __init reserve_top_address(unsigned long reserve) 625 { 626 #ifdef CONFIG_X86_32 627 BUG_ON(fixmaps_set > 0); 628 __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; 629 printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", 630 -reserve, __FIXADDR_TOP + PAGE_SIZE); 631 #endif 632 } 633 634 int fixmaps_set; 635 636 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) 637 { 638 unsigned long address = __fix_to_virt(idx); 639 640 if (idx >= __end_of_fixed_addresses) { 641 BUG(); 642 return; 643 } 644 set_pte_vaddr(address, pte); 645 fixmaps_set++; 646 } 647 648 void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, 649 pgprot_t flags) 650 { 651 /* Sanitize 'prot' against any unsupported bits: */ 652 pgprot_val(flags) &= __default_kernel_pte_mask; 653 654 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); 655 } 656 657 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP 658 #ifdef CONFIG_X86_5LEVEL 659 /** 660 * p4d_set_huge - setup kernel P4D mapping 661 * 662 * No 512GB pages yet -- always return 0 663 */ 664 int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) 665 { 666 return 0; 667 } 668 669 /** 670 * p4d_clear_huge - clear kernel P4D mapping when it is set 671 * 672 * No 512GB pages yet -- always return 0 673 */ 674 int p4d_clear_huge(p4d_t *p4d) 675 { 676 return 0; 677 } 678 #endif 679 680 /** 681 * pud_set_huge - setup kernel PUD mapping 682 * 683 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this 684 * function sets up a huge page only if any of the following conditions are met: 685 * 686 * - MTRRs are disabled, or 687 * 688 * - MTRRs are enabled and the range is completely covered by a single MTRR, or 689 * 690 * - MTRRs are enabled and the corresponding MTRR memory type is WB, which 691 * has no effect on the requested PAT memory type. 692 * 693 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger 694 * page mapping attempt fails. 695 * 696 * Returns 1 on success and 0 on failure. 697 */ 698 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) 699 { 700 u8 mtrr, uniform; 701 702 mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); 703 if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && 704 (mtrr != MTRR_TYPE_WRBACK)) 705 return 0; 706 707 /* Bail out if we are we on a populated non-leaf entry: */ 708 if (pud_present(*pud) && !pud_huge(*pud)) 709 return 0; 710 711 prot = pgprot_4k_2_large(prot); 712 713 set_pte((pte_t *)pud, pfn_pte( 714 (u64)addr >> PAGE_SHIFT, 715 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 716 717 return 1; 718 } 719 720 /** 721 * pmd_set_huge - setup kernel PMD mapping 722 * 723 * See text over pud_set_huge() above. 724 * 725 * Returns 1 on success and 0 on failure. 726 */ 727 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) 728 { 729 u8 mtrr, uniform; 730 731 mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); 732 if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && 733 (mtrr != MTRR_TYPE_WRBACK)) { 734 pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", 735 __func__, addr, addr + PMD_SIZE); 736 return 0; 737 } 738 739 /* Bail out if we are we on a populated non-leaf entry: */ 740 if (pmd_present(*pmd) && !pmd_huge(*pmd)) 741 return 0; 742 743 prot = pgprot_4k_2_large(prot); 744 745 set_pte((pte_t *)pmd, pfn_pte( 746 (u64)addr >> PAGE_SHIFT, 747 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 748 749 return 1; 750 } 751 752 /** 753 * pud_clear_huge - clear kernel PUD mapping when it is set 754 * 755 * Returns 1 on success and 0 on failure (no PUD map is found). 756 */ 757 int pud_clear_huge(pud_t *pud) 758 { 759 if (pud_large(*pud)) { 760 pud_clear(pud); 761 return 1; 762 } 763 764 return 0; 765 } 766 767 /** 768 * pmd_clear_huge - clear kernel PMD mapping when it is set 769 * 770 * Returns 1 on success and 0 on failure (no PMD map is found). 771 */ 772 int pmd_clear_huge(pmd_t *pmd) 773 { 774 if (pmd_large(*pmd)) { 775 pmd_clear(pmd); 776 return 1; 777 } 778 779 return 0; 780 } 781 782 #ifdef CONFIG_X86_64 783 /** 784 * pud_free_pmd_page - Clear pud entry and free pmd page. 785 * @pud: Pointer to a PUD. 786 * @addr: Virtual address associated with pud. 787 * 788 * Context: The pud range has been unmapped and TLB purged. 789 * Return: 1 if clearing the entry succeeded. 0 otherwise. 790 * 791 * NOTE: Callers must allow a single page allocation. 792 */ 793 int pud_free_pmd_page(pud_t *pud, unsigned long addr) 794 { 795 pmd_t *pmd, *pmd_sv; 796 pte_t *pte; 797 int i; 798 799 if (pud_none(*pud)) 800 return 1; 801 802 pmd = (pmd_t *)pud_page_vaddr(*pud); 803 pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); 804 if (!pmd_sv) 805 return 0; 806 807 for (i = 0; i < PTRS_PER_PMD; i++) { 808 pmd_sv[i] = pmd[i]; 809 if (!pmd_none(pmd[i])) 810 pmd_clear(&pmd[i]); 811 } 812 813 pud_clear(pud); 814 815 /* INVLPG to clear all paging-structure caches */ 816 flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); 817 818 for (i = 0; i < PTRS_PER_PMD; i++) { 819 if (!pmd_none(pmd_sv[i])) { 820 pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); 821 free_page((unsigned long)pte); 822 } 823 } 824 825 free_page((unsigned long)pmd_sv); 826 free_page((unsigned long)pmd); 827 828 return 1; 829 } 830 831 /** 832 * pmd_free_pte_page - Clear pmd entry and free pte page. 833 * @pmd: Pointer to a PMD. 834 * @addr: Virtual address associated with pmd. 835 * 836 * Context: The pmd range has been unmapped and TLB purged. 837 * Return: 1 if clearing the entry succeeded. 0 otherwise. 838 */ 839 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) 840 { 841 pte_t *pte; 842 843 if (pmd_none(*pmd)) 844 return 1; 845 846 pte = (pte_t *)pmd_page_vaddr(*pmd); 847 pmd_clear(pmd); 848 849 /* INVLPG to clear all paging-structure caches */ 850 flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); 851 852 free_page((unsigned long)pte); 853 854 return 1; 855 } 856 857 #else /* !CONFIG_X86_64 */ 858 859 int pud_free_pmd_page(pud_t *pud, unsigned long addr) 860 { 861 return pud_none(*pud); 862 } 863 864 /* 865 * Disable free page handling on x86-PAE. This assures that ioremap() 866 * does not update sync'd pmd entries. See vmalloc_sync_one(). 867 */ 868 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) 869 { 870 return pmd_none(*pmd); 871 } 872 873 #endif /* CONFIG_X86_64 */ 874 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ 875