/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>


#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

bool hugetlb_disabled = false;

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}
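/*
 * Allocate a hugepte table for @hpdp and make the relevant directory
 * entries point at it.  When the huge page is larger than the range
 * covered by a single directory entry (pshift >= pdshift), several
 * consecutive entries are filled with pointers to the same table.
 */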
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	} else {
		kmemleak_ignore(new);
	}
	spin_unlock(ptl);
	return 0;
}

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;
	spinlock_t *ptl;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT) {
		/*
		 * We need to use hugepd table
		 */
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else {
				ptl = pmd_lockptr(mm, pm);
				hpdp = (hugepd_t *)pm;
			}
		}
	}
#else
	if (pshift >= PGDIR_SHIFT) {
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= PUD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			ptl = pmd_lockptr(mm, pm);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
						  pdshift, pshift, ptl))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is setup.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif
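/*
 * On pseries (hash MMU under an LPAR), gigantic pages are collected during
 * the early device-tree scan and handed out from the list built above;
 * everything else goes through the generic bootmem allocator.
 */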
int __init alloc_bootmem_huge_page(struct hstate *h)
{

#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif
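/*
 * Free the hugepte table referenced by *hpdp and clear the (possibly
 * multiple, on FSL) directory entries pointing at it, but only if the
 * region being torn down covers the whole range mapped by those entries
 * and stays within the floor/ceiling limits.
 */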
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte,
				 get_hugepd_cache_index(pdshift - shift));
}
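/*
 * The hugetlb_free_*_range() walkers below mirror the generic
 * free_pmd/pud_range() helpers, except that a directory entry may turn out
 * to be a hugepd pointer instead of a normal lower-level table, in which
 * case it is handed to free_hugepd_range() above.
 */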
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should already
			 * find it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}
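/*
 * Called from the generic follow_page path for hugepd-mapped areas: look up
 * the hugepte for @address under mm->page_table_lock and return the subpage
 * of the huge page that covers it, waiting for a migration entry to be
 * resolved if necessary.
 */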
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	/*
	 * hugepage directory entries are protected by mm->page_table_lock
	 * Use this instead of huge_pte_lockptr
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
							pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	/* With radix we don't use slice, so derive it from vma */
	if (!radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

		return 1UL << mmu_psize_to_shift(psize);
	}
#endif
	return vma_kernel_pagesize(vma);
}
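/*
 * The FSL Book3E check below only accepts power-of-4 page sizes, i.e.
 * powers of two whose log2 is even (4M = 2^22 qualifies, 8M = 2^23 does not).
 */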
static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M and 1G
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
			return -EINVAL;
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified(%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
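/*
 * Register an hstate for every huge page size the MMU supports, create the
 * caches that back the hugepd tables, and pick the default huge page size
 * (HPAGE_SHIFT) for this platform.
 */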
static int __init hugetlbpage_init(void)
{
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PGDIR_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * if we have pdshift and shift value same, we don't
		 * use pgt cache for hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);

		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size.  Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);

void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it.  This function needs to be called with interrupts disabled.  We
 * use this variant when we have MSR[EE] = 0 but the
 * paca->irq_soft_mask = IRQS_ENABLED
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value.  This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap.  The returned pte_t * is still not
	 * stable, so the caller must check it for those conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}
			/*
			 * The pmd_large() check below handles the swap pmd
			 * pte; we need both checks because they are config
			 * dependent.
			 */
			if (pmd_huge(pmd) || pmd_large(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
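/*
 * Lockless get_user_pages_fast() helper for a single hugepte: take
 * references on the subpages in [addr, end) and back everything out if the
 * pte changed underneath us.
 */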
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}