/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>

#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence we can ignore THP and the
	 * irq-disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for the comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use a hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

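/*
 * Gigantic (16G) pages on pseries are reserved by firmware at boot and
 * advertised through the device tree; the helpers below stash their
 * physical addresses until the hugetlb boot-time allocator can claim them.
 */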
#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build a list of addresses of gigantic pages.  This function is used in
 * early boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif

int __init alloc_bootmem_huge_page(struct hstate *h)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

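/*
 * Clear the hugepd entry (or the group of entries that alias the same
 * hugepte table) and free the table itself, but only if the unmapped
 * region, clamped to [floor, ceiling), fully covers it.  The bounds
 * checks mirror those of the generic free_pgd_range().
 */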
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should have
			 * already found it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to the same
		 * hugepte table.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to the same
			 * hugepte table.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means we can't use the optimization from
	 * the normal free_pgd_range() of checking whether we're
	 * actually covering a large enough range to have to do
	 * anything at the top level of the walk instead of at the
	 * bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same hugepte table.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

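/*
 * Helpers for the lockless get_user_pages() fast path.  gup_huge_pd()
 * walks every hugepte backing a hugepd entry and hands each one to
 * gup_hugepte() below; it returns 1 only if references were taken on
 * all pages in the range.
 */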
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
							pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	/* With radix we don't use slices, so derive it from the vma */
	if (!radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

		return 1UL << mmu_psize_to_shift(psize);
	}
#endif
	return vma_kernel_pagesize(vma);
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

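/*
 * Register a huge page size with the generic hugetlb code, after checking
 * that the hardware and this platform's page table layout can support it.
 */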
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/*
	 * Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits.
	 */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by the linux page table layout.
	 * For now these are:
	 *   Radix: 2M and 1G (1G is not available on POWER9 DD1)
	 *   Hash:  16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if the huge page size has already been set up */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified (%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
	int psize;

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * If pdshift and shift have the same value, we don't
		 * use the pgtable cache for the hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits
			 * in the pte have size information encoded in them,
			 * so align them to allow this.
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache for hugeptes\n",
				      __func__);
		}
#endif
	}

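	/*
	 * HPAGE_SHIFT determines HPAGE_SIZE, the kernel's default huge
	 * page size used when none is specified on the command line.
	 */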
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/*
	 * Set the default large page size.  Currently, we pick 16M or 1M
	 * depending on what is available.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);

void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it.  This function needs to be called with interrupts disabled.  We
 * use this variant when we have MSR[EE] = 0 but paca->irq_soft_mask is
 * IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value.  This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap.  The returned pte_t * is still not
	 * stable, so the caller must re-check it for those conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an RCU free and we are running
		 * with interrupts disabled here.
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none,
			 * because it marks the pmd none and does an hpte
			 * invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);

/*
 * Lockless fast-GUP helper for a single hugepte: snapshot the pte, take
 * speculative references on the backing pages, then recheck that the pte
 * did not change under us and back out if it did.
 */
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}