1 #include <linux/mm.h> 2 #include <linux/gfp.h> 3 #include <asm/pgalloc.h> 4 #include <asm/pgtable.h> 5 #include <asm/tlb.h> 6 #include <asm/fixmap.h> 7 #include <asm/mtrr.h> 8 9 #define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) 10 11 #ifdef CONFIG_HIGHPTE 12 #define PGALLOC_USER_GFP __GFP_HIGHMEM 13 #else 14 #define PGALLOC_USER_GFP 0 15 #endif 16 17 gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; 18 19 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 20 { 21 return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); 22 } 23 24 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) 25 { 26 struct page *pte; 27 28 pte = alloc_pages(__userpte_alloc_gfp, 0); 29 if (!pte) 30 return NULL; 31 if (!pgtable_page_ctor(pte)) { 32 __free_page(pte); 33 return NULL; 34 } 35 return pte; 36 } 37 38 static int __init setup_userpte(char *arg) 39 { 40 if (!arg) 41 return -EINVAL; 42 43 /* 44 * "userpte=nohigh" disables allocation of user pagetables in 45 * high memory. 46 */ 47 if (strcmp(arg, "nohigh") == 0) 48 __userpte_alloc_gfp &= ~__GFP_HIGHMEM; 49 else 50 return -EINVAL; 51 return 0; 52 } 53 early_param("userpte", setup_userpte); 54 55 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 56 { 57 pgtable_page_dtor(pte); 58 paravirt_release_pte(page_to_pfn(pte)); 59 tlb_remove_table(tlb, pte); 60 } 61 62 #if CONFIG_PGTABLE_LEVELS > 2 63 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 64 { 65 struct page *page = virt_to_page(pmd); 66 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); 67 /* 68 * NOTE! For PAE, any changes to the top page-directory-pointer-table 69 * entries need a full cr3 reload to flush. 70 */ 71 #ifdef CONFIG_X86_PAE 72 tlb->need_flush_all = 1; 73 #endif 74 pgtable_pmd_page_dtor(page); 75 tlb_remove_table(tlb, page); 76 } 77 78 #if CONFIG_PGTABLE_LEVELS > 3 79 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 80 { 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 82 tlb_remove_table(tlb, virt_to_page(pud)); 83 } 84 85 #if CONFIG_PGTABLE_LEVELS > 4 86 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) 87 { 88 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); 89 tlb_remove_table(tlb, virt_to_page(p4d)); 90 } 91 #endif /* CONFIG_PGTABLE_LEVELS > 4 */ 92 #endif /* CONFIG_PGTABLE_LEVELS > 3 */ 93 #endif /* CONFIG_PGTABLE_LEVELS > 2 */ 94 95 static inline void pgd_list_add(pgd_t *pgd) 96 { 97 struct page *page = virt_to_page(pgd); 98 99 list_add(&page->lru, &pgd_list); 100 } 101 102 static inline void pgd_list_del(pgd_t *pgd) 103 { 104 struct page *page = virt_to_page(pgd); 105 106 list_del(&page->lru); 107 } 108 109 #define UNSHARED_PTRS_PER_PGD \ 110 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 111 112 113 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) 114 { 115 BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); 116 virt_to_page(pgd)->index = (pgoff_t)mm; 117 } 118 119 struct mm_struct *pgd_page_get_mm(struct page *page) 120 { 121 return (struct mm_struct *)page->index; 122 } 123 124 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) 125 { 126 /* If the pgd points to a shared pagetable level (either the 127 ptes in non-PAE, or shared PMD in PAE), then just copy the 128 references from swapper_pg_dir. */ 129 if (CONFIG_PGTABLE_LEVELS == 2 || 130 (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || 131 CONFIG_PGTABLE_LEVELS >= 4) { 132 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 133 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 134 KERNEL_PGD_PTRS); 135 } 136 137 /* list required to sync kernel mapping updates */ 138 if (!SHARED_KERNEL_PMD) { 139 pgd_set_mm(pgd, mm); 140 pgd_list_add(pgd); 141 } 142 } 143 144 static void pgd_dtor(pgd_t *pgd) 145 { 146 if (SHARED_KERNEL_PMD) 147 return; 148 149 spin_lock(&pgd_lock); 150 pgd_list_del(pgd); 151 spin_unlock(&pgd_lock); 152 } 153 154 /* 155 * List of all pgd's needed for non-PAE so it can invalidate entries 156 * in both cached and uncached pgd's; not needed for PAE since the 157 * kernel pmd is shared. If PAE were not to share the pmd a similar 158 * tactic would be needed. This is essentially codepath-based locking 159 * against pageattr.c; it is the unique case in which a valid change 160 * of kernel pagetables can't be lazily synchronized by vmalloc faults. 161 * vmalloc faults work because attached pagetables are never freed. 162 * -- nyc 163 */ 164 165 #ifdef CONFIG_X86_PAE 166 /* 167 * In PAE mode, we need to do a cr3 reload (=tlb flush) when 168 * updating the top-level pagetable entries to guarantee the 169 * processor notices the update. Since this is expensive, and 170 * all 4 top-level entries are used almost immediately in a 171 * new process's life, we just pre-populate them here. 172 * 173 * Also, if we're in a paravirt environment where the kernel pmd is 174 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate 175 * and initialize the kernel pmds here. 176 */ 177 #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD 178 179 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 180 { 181 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 182 183 /* Note: almost everything apart from _PAGE_PRESENT is 184 reserved at the pmd (PDPT) level. */ 185 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); 186 187 /* 188 * According to Intel App note "TLBs, Paging-Structure Caches, 189 * and Their Invalidation", April 2007, document 317080-001, 190 * section 8.1: in PAE mode we explicitly have to flush the 191 * TLB via cr3 if the top-level pgd is changed... 192 */ 193 flush_tlb_mm(mm); 194 } 195 #else /* !CONFIG_X86_PAE */ 196 197 /* No need to prepopulate any pagetable entries in non-PAE modes. */ 198 #define PREALLOCATED_PMDS 0 199 200 #endif /* CONFIG_X86_PAE */ 201 202 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) 203 { 204 int i; 205 206 for(i = 0; i < PREALLOCATED_PMDS; i++) 207 if (pmds[i]) { 208 pgtable_pmd_page_dtor(virt_to_page(pmds[i])); 209 free_page((unsigned long)pmds[i]); 210 mm_dec_nr_pmds(mm); 211 } 212 } 213 214 static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) 215 { 216 int i; 217 bool failed = false; 218 gfp_t gfp = PGALLOC_GFP; 219 220 if (mm == &init_mm) 221 gfp &= ~__GFP_ACCOUNT; 222 223 for(i = 0; i < PREALLOCATED_PMDS; i++) { 224 pmd_t *pmd = (pmd_t *)__get_free_page(gfp); 225 if (!pmd) 226 failed = true; 227 if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { 228 free_page((unsigned long)pmd); 229 pmd = NULL; 230 failed = true; 231 } 232 if (pmd) 233 mm_inc_nr_pmds(mm); 234 pmds[i] = pmd; 235 } 236 237 if (failed) { 238 free_pmds(mm, pmds); 239 return -ENOMEM; 240 } 241 242 return 0; 243 } 244 245 /* 246 * Mop up any pmd pages which may still be attached to the pgd. 247 * Normally they will be freed by munmap/exit_mmap, but any pmd we 248 * preallocate which never got a corresponding vma will need to be 249 * freed manually. 250 */ 251 static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) 252 { 253 int i; 254 255 for(i = 0; i < PREALLOCATED_PMDS; i++) { 256 pgd_t pgd = pgdp[i]; 257 258 if (pgd_val(pgd) != 0) { 259 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); 260 261 pgdp[i] = native_make_pgd(0); 262 263 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); 264 pmd_free(mm, pmd); 265 mm_dec_nr_pmds(mm); 266 } 267 } 268 } 269 270 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) 271 { 272 p4d_t *p4d; 273 pud_t *pud; 274 int i; 275 276 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ 277 return; 278 279 p4d = p4d_offset(pgd, 0); 280 pud = pud_offset(p4d, 0); 281 282 for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { 283 pmd_t *pmd = pmds[i]; 284 285 if (i >= KERNEL_PGD_BOUNDARY) 286 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 287 sizeof(pmd_t) * PTRS_PER_PMD); 288 289 pud_populate(mm, pud, pmd); 290 } 291 } 292 293 /* 294 * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also 295 * assumes that pgd should be in one page. 296 * 297 * But kernel with PAE paging that is not running as a Xen domain 298 * only needs to allocate 32 bytes for pgd instead of one page. 299 */ 300 #ifdef CONFIG_X86_PAE 301 302 #include <linux/slab.h> 303 304 #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) 305 #define PGD_ALIGN 32 306 307 static struct kmem_cache *pgd_cache; 308 309 static int __init pgd_cache_init(void) 310 { 311 /* 312 * When PAE kernel is running as a Xen domain, it does not use 313 * shared kernel pmd. And this requires a whole page for pgd. 314 */ 315 if (!SHARED_KERNEL_PMD) 316 return 0; 317 318 /* 319 * when PAE kernel is not running as a Xen domain, it uses 320 * shared kernel pmd. Shared kernel pmd does not require a whole 321 * page for pgd. We are able to just allocate a 32-byte for pgd. 322 * During boot time, we create a 32-byte slab for pgd table allocation. 323 */ 324 pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, 325 SLAB_PANIC, NULL); 326 if (!pgd_cache) 327 return -ENOMEM; 328 329 return 0; 330 } 331 core_initcall(pgd_cache_init); 332 333 static inline pgd_t *_pgd_alloc(void) 334 { 335 /* 336 * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. 337 * We allocate one page for pgd. 338 */ 339 if (!SHARED_KERNEL_PMD) 340 return (pgd_t *)__get_free_page(PGALLOC_GFP); 341 342 /* 343 * Now PAE kernel is not running as a Xen domain. We can allocate 344 * a 32-byte slab for pgd to save memory space. 345 */ 346 return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); 347 } 348 349 static inline void _pgd_free(pgd_t *pgd) 350 { 351 if (!SHARED_KERNEL_PMD) 352 free_page((unsigned long)pgd); 353 else 354 kmem_cache_free(pgd_cache, pgd); 355 } 356 #else 357 static inline pgd_t *_pgd_alloc(void) 358 { 359 return (pgd_t *)__get_free_page(PGALLOC_GFP); 360 } 361 362 static inline void _pgd_free(pgd_t *pgd) 363 { 364 free_page((unsigned long)pgd); 365 } 366 #endif /* CONFIG_X86_PAE */ 367 368 pgd_t *pgd_alloc(struct mm_struct *mm) 369 { 370 pgd_t *pgd; 371 pmd_t *pmds[PREALLOCATED_PMDS]; 372 373 pgd = _pgd_alloc(); 374 375 if (pgd == NULL) 376 goto out; 377 378 mm->pgd = pgd; 379 380 if (preallocate_pmds(mm, pmds) != 0) 381 goto out_free_pgd; 382 383 if (paravirt_pgd_alloc(mm) != 0) 384 goto out_free_pmds; 385 386 /* 387 * Make sure that pre-populating the pmds is atomic with 388 * respect to anything walking the pgd_list, so that they 389 * never see a partially populated pgd. 390 */ 391 spin_lock(&pgd_lock); 392 393 pgd_ctor(mm, pgd); 394 pgd_prepopulate_pmd(mm, pgd, pmds); 395 396 spin_unlock(&pgd_lock); 397 398 return pgd; 399 400 out_free_pmds: 401 free_pmds(mm, pmds); 402 out_free_pgd: 403 _pgd_free(pgd); 404 out: 405 return NULL; 406 } 407 408 void pgd_free(struct mm_struct *mm, pgd_t *pgd) 409 { 410 pgd_mop_up_pmds(mm, pgd); 411 pgd_dtor(pgd); 412 paravirt_pgd_free(mm, pgd); 413 _pgd_free(pgd); 414 } 415 416 /* 417 * Used to set accessed or dirty bits in the page table entries 418 * on other architectures. On x86, the accessed and dirty bits 419 * are tracked by hardware. However, do_wp_page calls this function 420 * to also make the pte writeable at the same time the dirty bit is 421 * set. In that case we do actually need to write the PTE. 422 */ 423 int ptep_set_access_flags(struct vm_area_struct *vma, 424 unsigned long address, pte_t *ptep, 425 pte_t entry, int dirty) 426 { 427 int changed = !pte_same(*ptep, entry); 428 429 if (changed && dirty) 430 *ptep = entry; 431 432 return changed; 433 } 434 435 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 436 int pmdp_set_access_flags(struct vm_area_struct *vma, 437 unsigned long address, pmd_t *pmdp, 438 pmd_t entry, int dirty) 439 { 440 int changed = !pmd_same(*pmdp, entry); 441 442 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 443 444 if (changed && dirty) { 445 *pmdp = entry; 446 /* 447 * We had a write-protection fault here and changed the pmd 448 * to to more permissive. No need to flush the TLB for that, 449 * #PF is architecturally guaranteed to do that and in the 450 * worst-case we'll generate a spurious fault. 451 */ 452 } 453 454 return changed; 455 } 456 457 int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, 458 pud_t *pudp, pud_t entry, int dirty) 459 { 460 int changed = !pud_same(*pudp, entry); 461 462 VM_BUG_ON(address & ~HPAGE_PUD_MASK); 463 464 if (changed && dirty) { 465 *pudp = entry; 466 /* 467 * We had a write-protection fault here and changed the pud 468 * to to more permissive. No need to flush the TLB for that, 469 * #PF is architecturally guaranteed to do that and in the 470 * worst-case we'll generate a spurious fault. 471 */ 472 } 473 474 return changed; 475 } 476 #endif 477 478 int ptep_test_and_clear_young(struct vm_area_struct *vma, 479 unsigned long addr, pte_t *ptep) 480 { 481 int ret = 0; 482 483 if (pte_young(*ptep)) 484 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 485 (unsigned long *) &ptep->pte); 486 487 return ret; 488 } 489 490 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 491 int pmdp_test_and_clear_young(struct vm_area_struct *vma, 492 unsigned long addr, pmd_t *pmdp) 493 { 494 int ret = 0; 495 496 if (pmd_young(*pmdp)) 497 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 498 (unsigned long *)pmdp); 499 500 return ret; 501 } 502 int pudp_test_and_clear_young(struct vm_area_struct *vma, 503 unsigned long addr, pud_t *pudp) 504 { 505 int ret = 0; 506 507 if (pud_young(*pudp)) 508 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 509 (unsigned long *)pudp); 510 511 return ret; 512 } 513 #endif 514 515 int ptep_clear_flush_young(struct vm_area_struct *vma, 516 unsigned long address, pte_t *ptep) 517 { 518 /* 519 * On x86 CPUs, clearing the accessed bit without a TLB flush 520 * doesn't cause data corruption. [ It could cause incorrect 521 * page aging and the (mistaken) reclaim of hot pages, but the 522 * chance of that should be relatively low. ] 523 * 524 * So as a performance optimization don't flush the TLB when 525 * clearing the accessed bit, it will eventually be flushed by 526 * a context switch or a VM operation anyway. [ In the rare 527 * event of it not getting flushed for a long time the delay 528 * shouldn't really matter because there's no real memory 529 * pressure for swapout to react to. ] 530 */ 531 return ptep_test_and_clear_young(vma, address, ptep); 532 } 533 534 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 535 int pmdp_clear_flush_young(struct vm_area_struct *vma, 536 unsigned long address, pmd_t *pmdp) 537 { 538 int young; 539 540 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 541 542 young = pmdp_test_and_clear_young(vma, address, pmdp); 543 if (young) 544 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 545 546 return young; 547 } 548 #endif 549 550 /** 551 * reserve_top_address - reserves a hole in the top of kernel address space 552 * @reserve - size of hole to reserve 553 * 554 * Can be used to relocate the fixmap area and poke a hole in the top 555 * of kernel address space to make room for a hypervisor. 556 */ 557 void __init reserve_top_address(unsigned long reserve) 558 { 559 #ifdef CONFIG_X86_32 560 BUG_ON(fixmaps_set > 0); 561 __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; 562 printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", 563 -reserve, __FIXADDR_TOP + PAGE_SIZE); 564 #endif 565 } 566 567 int fixmaps_set; 568 569 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) 570 { 571 unsigned long address = __fix_to_virt(idx); 572 573 if (idx >= __end_of_fixed_addresses) { 574 BUG(); 575 return; 576 } 577 set_pte_vaddr(address, pte); 578 fixmaps_set++; 579 } 580 581 void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, 582 pgprot_t flags) 583 { 584 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); 585 } 586 587 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP 588 #ifdef CONFIG_X86_5LEVEL 589 /** 590 * p4d_set_huge - setup kernel P4D mapping 591 * 592 * No 512GB pages yet -- always return 0 593 */ 594 int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) 595 { 596 return 0; 597 } 598 599 /** 600 * p4d_clear_huge - clear kernel P4D mapping when it is set 601 * 602 * No 512GB pages yet -- always return 0 603 */ 604 int p4d_clear_huge(p4d_t *p4d) 605 { 606 return 0; 607 } 608 #endif 609 610 /** 611 * pud_set_huge - setup kernel PUD mapping 612 * 613 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this 614 * function sets up a huge page only if any of the following conditions are met: 615 * 616 * - MTRRs are disabled, or 617 * 618 * - MTRRs are enabled and the range is completely covered by a single MTRR, or 619 * 620 * - MTRRs are enabled and the corresponding MTRR memory type is WB, which 621 * has no effect on the requested PAT memory type. 622 * 623 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger 624 * page mapping attempt fails. 625 * 626 * Returns 1 on success and 0 on failure. 627 */ 628 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) 629 { 630 u8 mtrr, uniform; 631 632 mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); 633 if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && 634 (mtrr != MTRR_TYPE_WRBACK)) 635 return 0; 636 637 prot = pgprot_4k_2_large(prot); 638 639 set_pte((pte_t *)pud, pfn_pte( 640 (u64)addr >> PAGE_SHIFT, 641 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 642 643 return 1; 644 } 645 646 /** 647 * pmd_set_huge - setup kernel PMD mapping 648 * 649 * See text over pud_set_huge() above. 650 * 651 * Returns 1 on success and 0 on failure. 652 */ 653 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) 654 { 655 u8 mtrr, uniform; 656 657 mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); 658 if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && 659 (mtrr != MTRR_TYPE_WRBACK)) { 660 pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", 661 __func__, addr, addr + PMD_SIZE); 662 return 0; 663 } 664 665 prot = pgprot_4k_2_large(prot); 666 667 set_pte((pte_t *)pmd, pfn_pte( 668 (u64)addr >> PAGE_SHIFT, 669 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 670 671 return 1; 672 } 673 674 /** 675 * pud_clear_huge - clear kernel PUD mapping when it is set 676 * 677 * Returns 1 on success and 0 on failure (no PUD map is found). 678 */ 679 int pud_clear_huge(pud_t *pud) 680 { 681 if (pud_large(*pud)) { 682 pud_clear(pud); 683 return 1; 684 } 685 686 return 0; 687 } 688 689 /** 690 * pmd_clear_huge - clear kernel PMD mapping when it is set 691 * 692 * Returns 1 on success and 0 on failure (no PMD map is found). 693 */ 694 int pmd_clear_huge(pmd_t *pmd) 695 { 696 if (pmd_large(*pmd)) { 697 pmd_clear(pmd); 698 return 1; 699 } 700 701 return 0; 702 } 703 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ 704