1 /* 2 * PPC Huge TLB Page Support for Kernel. 3 * 4 * Copyright (C) 2003 David Gibson, IBM Corporation. 5 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor 6 * 7 * Based on the IA-32 version: 8 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 9 */ 10 11 #include <linux/mm.h> 12 #include <linux/io.h> 13 #include <linux/slab.h> 14 #include <linux/hugetlb.h> 15 #include <linux/export.h> 16 #include <linux/of_fdt.h> 17 #include <linux/memblock.h> 18 #include <linux/moduleparam.h> 19 #include <linux/swap.h> 20 #include <linux/swapops.h> 21 #include <linux/kmemleak.h> 22 #include <asm/pgalloc.h> 23 #include <asm/tlb.h> 24 #include <asm/setup.h> 25 #include <asm/hugetlb.h> 26 #include <asm/pte-walk.h> 27 28 bool hugetlb_disabled = false; 29 30 #define hugepd_none(hpd) (hpd_val(hpd) == 0) 31 32 #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \ 33 __builtin_ffs(sizeof(void *))) 34 35 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) 36 { 37 /* 38 * Only called for hugetlbfs pages, hence can ignore THP and the 39 * irq disabled walk. 40 */ 41 return __find_linux_pte(mm->pgd, addr, NULL, NULL); 42 } 43 44 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 45 unsigned long address, unsigned int pdshift, 46 unsigned int pshift, spinlock_t *ptl) 47 { 48 struct kmem_cache *cachep; 49 pte_t *new; 50 int i; 51 int num_hugepd; 52 53 if (pshift >= pdshift) { 54 cachep = PGT_CACHE(PTE_T_ORDER); 55 num_hugepd = 1 << (pshift - pdshift); 56 } else { 57 cachep = PGT_CACHE(pdshift - pshift); 58 num_hugepd = 1; 59 } 60 61 if (!cachep) { 62 WARN_ONCE(1, "No page table cache created for hugetlb tables"); 63 return -ENOMEM; 64 } 65 66 new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); 67 68 BUG_ON(pshift > HUGEPD_SHIFT_MASK); 69 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); 70 71 if (!new) 72 return -ENOMEM; 73 74 /* 75 * Make sure other cpus find the hugepd set only after a 76 * properly initialized page table is visible to them. 77 * For more details look for comment in __pte_alloc(). 78 */ 79 smp_wmb(); 80 81 spin_lock(ptl); 82 /* 83 * We have multiple higher-level entries that point to the same 84 * actual pte location. Fill in each as we go and backtrack on error. 85 * We need all of these so the DTLB pgtable walk code can find the 86 * right higher-level entry without knowing if it's a hugepage or not. 87 */ 88 for (i = 0; i < num_hugepd; i++, hpdp++) { 89 if (unlikely(!hugepd_none(*hpdp))) 90 break; 91 hugepd_populate(hpdp, new, pshift); 92 } 93 /* If we bailed from the for loop early, an error occurred, clean up */ 94 if (i < num_hugepd) { 95 for (i = i - 1 ; i >= 0; i--, hpdp--) 96 *hpdp = __hugepd(0); 97 kmem_cache_free(cachep, new); 98 } else { 99 kmemleak_ignore(new); 100 } 101 spin_unlock(ptl); 102 return 0; 103 } 104 105 /* 106 * At this point we do the placement change only for BOOK3S 64. This would 107 * possibly work on other subarchs. 108 */ 109 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 110 unsigned long addr, unsigned long sz) 111 { 112 pgd_t *pg; 113 p4d_t *p4; 114 pud_t *pu; 115 pmd_t *pm; 116 hugepd_t *hpdp = NULL; 117 unsigned pshift = __ffs(sz); 118 unsigned pdshift = PGDIR_SHIFT; 119 spinlock_t *ptl; 120 121 addr &= ~(sz-1); 122 pg = pgd_offset(mm, addr); 123 p4 = p4d_offset(pg, addr); 124 125 #ifdef CONFIG_PPC_BOOK3S_64 126 if (pshift == PGDIR_SHIFT) 127 /* 16GB huge page */ 128 return (pte_t *) p4; 129 else if (pshift > PUD_SHIFT) { 130 /* 131 * We need to use hugepd table 132 */ 133 ptl = &mm->page_table_lock; 134 hpdp = (hugepd_t *)p4; 135 } else { 136 pdshift = PUD_SHIFT; 137 pu = pud_alloc(mm, p4, addr); 138 if (!pu) 139 return NULL; 140 if (pshift == PUD_SHIFT) 141 return (pte_t *)pu; 142 else if (pshift > PMD_SHIFT) { 143 ptl = pud_lockptr(mm, pu); 144 hpdp = (hugepd_t *)pu; 145 } else { 146 pdshift = PMD_SHIFT; 147 pm = pmd_alloc(mm, pu, addr); 148 if (!pm) 149 return NULL; 150 if (pshift == PMD_SHIFT) 151 /* 16MB hugepage */ 152 return (pte_t *)pm; 153 else { 154 ptl = pmd_lockptr(mm, pm); 155 hpdp = (hugepd_t *)pm; 156 } 157 } 158 } 159 #else 160 if (pshift >= PGDIR_SHIFT) { 161 ptl = &mm->page_table_lock; 162 hpdp = (hugepd_t *)p4; 163 } else { 164 pdshift = PUD_SHIFT; 165 pu = pud_alloc(mm, p4, addr); 166 if (!pu) 167 return NULL; 168 if (pshift >= PUD_SHIFT) { 169 ptl = pud_lockptr(mm, pu); 170 hpdp = (hugepd_t *)pu; 171 } else { 172 pdshift = PMD_SHIFT; 173 pm = pmd_alloc(mm, pu, addr); 174 if (!pm) 175 return NULL; 176 ptl = pmd_lockptr(mm, pm); 177 hpdp = (hugepd_t *)pm; 178 } 179 } 180 #endif 181 if (!hpdp) 182 return NULL; 183 184 if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT) 185 return pte_alloc_map(mm, (pmd_t *)hpdp, addr); 186 187 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); 188 189 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, 190 pdshift, pshift, ptl)) 191 return NULL; 192 193 return hugepte_offset(*hpdp, addr, pdshift); 194 } 195 196 #ifdef CONFIG_PPC_BOOK3S_64 197 /* 198 * Tracks gpages after the device tree is scanned and before the 199 * huge_boot_pages list is ready on pseries. 200 */ 201 #define MAX_NUMBER_GPAGES 1024 202 __initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; 203 __initdata static unsigned nr_gpages; 204 205 /* 206 * Build list of addresses of gigantic pages. This function is used in early 207 * boot before the buddy allocator is setup. 208 */ 209 void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) 210 { 211 if (!addr) 212 return; 213 while (number_of_pages > 0) { 214 gpage_freearray[nr_gpages] = addr; 215 nr_gpages++; 216 number_of_pages--; 217 addr += page_size; 218 } 219 } 220 221 static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) 222 { 223 struct huge_bootmem_page *m; 224 if (nr_gpages == 0) 225 return 0; 226 m = phys_to_virt(gpage_freearray[--nr_gpages]); 227 gpage_freearray[nr_gpages] = 0; 228 list_add(&m->list, &huge_boot_pages); 229 m->hstate = hstate; 230 return 1; 231 } 232 233 bool __init hugetlb_node_alloc_supported(void) 234 { 235 return false; 236 } 237 #endif 238 239 240 int __init alloc_bootmem_huge_page(struct hstate *h, int nid) 241 { 242 243 #ifdef CONFIG_PPC_BOOK3S_64 244 if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) 245 return pseries_alloc_bootmem_huge_page(h); 246 #endif 247 return __alloc_bootmem_huge_page(h, nid); 248 } 249 250 #ifndef CONFIG_PPC_BOOK3S_64 251 #define HUGEPD_FREELIST_SIZE \ 252 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) 253 254 struct hugepd_freelist { 255 struct rcu_head rcu; 256 unsigned int index; 257 void *ptes[]; 258 }; 259 260 static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); 261 262 static void hugepd_free_rcu_callback(struct rcu_head *head) 263 { 264 struct hugepd_freelist *batch = 265 container_of(head, struct hugepd_freelist, rcu); 266 unsigned int i; 267 268 for (i = 0; i < batch->index; i++) 269 kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]); 270 271 free_page((unsigned long)batch); 272 } 273 274 static void hugepd_free(struct mmu_gather *tlb, void *hugepte) 275 { 276 struct hugepd_freelist **batchp; 277 278 batchp = &get_cpu_var(hugepd_freelist_cur); 279 280 if (atomic_read(&tlb->mm->mm_users) < 2 || 281 mm_is_thread_local(tlb->mm)) { 282 kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte); 283 put_cpu_var(hugepd_freelist_cur); 284 return; 285 } 286 287 if (*batchp == NULL) { 288 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); 289 (*batchp)->index = 0; 290 } 291 292 (*batchp)->ptes[(*batchp)->index++] = hugepte; 293 if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { 294 call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback); 295 *batchp = NULL; 296 } 297 put_cpu_var(hugepd_freelist_cur); 298 } 299 #else 300 static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {} 301 #endif 302 303 /* Return true when the entry to be freed maps more than the area being freed */ 304 static bool range_is_outside_limits(unsigned long start, unsigned long end, 305 unsigned long floor, unsigned long ceiling, 306 unsigned long mask) 307 { 308 if ((start & mask) < floor) 309 return true; 310 if (ceiling) { 311 ceiling &= mask; 312 if (!ceiling) 313 return true; 314 } 315 return end - 1 > ceiling - 1; 316 } 317 318 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, 319 unsigned long start, unsigned long end, 320 unsigned long floor, unsigned long ceiling) 321 { 322 pte_t *hugepte = hugepd_page(*hpdp); 323 int i; 324 325 unsigned long pdmask = ~((1UL << pdshift) - 1); 326 unsigned int num_hugepd = 1; 327 unsigned int shift = hugepd_shift(*hpdp); 328 329 /* Note: On fsl the hpdp may be the first of several */ 330 if (shift > pdshift) 331 num_hugepd = 1 << (shift - pdshift); 332 333 if (range_is_outside_limits(start, end, floor, ceiling, pdmask)) 334 return; 335 336 for (i = 0; i < num_hugepd; i++, hpdp++) 337 *hpdp = __hugepd(0); 338 339 if (shift >= pdshift) 340 hugepd_free(tlb, hugepte); 341 else 342 pgtable_free_tlb(tlb, hugepte, 343 get_hugepd_cache_index(pdshift - shift)); 344 } 345 346 static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 347 unsigned long addr, unsigned long end, 348 unsigned long floor, unsigned long ceiling) 349 { 350 pgtable_t token = pmd_pgtable(*pmd); 351 352 if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK)) 353 return; 354 355 pmd_clear(pmd); 356 pte_free_tlb(tlb, token, addr); 357 mm_dec_nr_ptes(tlb->mm); 358 } 359 360 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 361 unsigned long addr, unsigned long end, 362 unsigned long floor, unsigned long ceiling) 363 { 364 pmd_t *pmd; 365 unsigned long next; 366 unsigned long start; 367 368 start = addr; 369 do { 370 unsigned long more; 371 372 pmd = pmd_offset(pud, addr); 373 next = pmd_addr_end(addr, end); 374 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) { 375 if (pmd_none_or_clear_bad(pmd)) 376 continue; 377 378 /* 379 * if it is not hugepd pointer, we should already find 380 * it cleared. 381 */ 382 WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx)); 383 384 hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling); 385 386 continue; 387 } 388 /* 389 * Increment next by the size of the huge mapping since 390 * there may be more than one entry at this level for a 391 * single hugepage, but all of them point to 392 * the same kmem cache that holds the hugepte. 393 */ 394 more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); 395 if (more > next) 396 next = more; 397 398 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, 399 addr, next, floor, ceiling); 400 } while (addr = next, addr != end); 401 402 if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK)) 403 return; 404 405 pmd = pmd_offset(pud, start & PUD_MASK); 406 pud_clear(pud); 407 pmd_free_tlb(tlb, pmd, start & PUD_MASK); 408 mm_dec_nr_pmds(tlb->mm); 409 } 410 411 static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, 412 unsigned long addr, unsigned long end, 413 unsigned long floor, unsigned long ceiling) 414 { 415 pud_t *pud; 416 unsigned long next; 417 unsigned long start; 418 419 start = addr; 420 do { 421 pud = pud_offset(p4d, addr); 422 next = pud_addr_end(addr, end); 423 if (!is_hugepd(__hugepd(pud_val(*pud)))) { 424 if (pud_none_or_clear_bad(pud)) 425 continue; 426 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 427 ceiling); 428 } else { 429 unsigned long more; 430 /* 431 * Increment next by the size of the huge mapping since 432 * there may be more than one entry at this level for a 433 * single hugepage, but all of them point to 434 * the same kmem cache that holds the hugepte. 435 */ 436 more = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); 437 if (more > next) 438 next = more; 439 440 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, 441 addr, next, floor, ceiling); 442 } 443 } while (addr = next, addr != end); 444 445 if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK)) 446 return; 447 448 pud = pud_offset(p4d, start & PGDIR_MASK); 449 p4d_clear(p4d); 450 pud_free_tlb(tlb, pud, start & PGDIR_MASK); 451 mm_dec_nr_puds(tlb->mm); 452 } 453 454 /* 455 * This function frees user-level page tables of a process. 456 */ 457 void hugetlb_free_pgd_range(struct mmu_gather *tlb, 458 unsigned long addr, unsigned long end, 459 unsigned long floor, unsigned long ceiling) 460 { 461 pgd_t *pgd; 462 p4d_t *p4d; 463 unsigned long next; 464 465 /* 466 * Because there are a number of different possible pagetable 467 * layouts for hugepage ranges, we limit knowledge of how 468 * things should be laid out to the allocation path 469 * (huge_pte_alloc(), above). Everything else works out the 470 * structure as it goes from information in the hugepd 471 * pointers. That means that we can't here use the 472 * optimization used in the normal page free_pgd_range(), of 473 * checking whether we're actually covering a large enough 474 * range to have to do anything at the top level of the walk 475 * instead of at the bottom. 476 * 477 * To make sense of this, you should probably go read the big 478 * block comment at the top of the normal free_pgd_range(), 479 * too. 480 */ 481 482 do { 483 next = pgd_addr_end(addr, end); 484 pgd = pgd_offset(tlb->mm, addr); 485 p4d = p4d_offset(pgd, addr); 486 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { 487 if (p4d_none_or_clear_bad(p4d)) 488 continue; 489 hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling); 490 } else { 491 unsigned long more; 492 /* 493 * Increment next by the size of the huge mapping since 494 * there may be more than one entry at the pgd level 495 * for a single hugepage, but all of them point to the 496 * same kmem cache that holds the hugepte. 497 */ 498 more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); 499 if (more > next) 500 next = more; 501 502 free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT, 503 addr, next, floor, ceiling); 504 } 505 } while (addr = next, addr != end); 506 } 507 508 struct page *follow_huge_pd(struct vm_area_struct *vma, 509 unsigned long address, hugepd_t hpd, 510 int flags, int pdshift) 511 { 512 pte_t *ptep; 513 spinlock_t *ptl; 514 struct page *page = NULL; 515 unsigned long mask; 516 int shift = hugepd_shift(hpd); 517 struct mm_struct *mm = vma->vm_mm; 518 519 retry: 520 /* 521 * hugepage directory entries are protected by mm->page_table_lock 522 * Use this instead of huge_pte_lockptr 523 */ 524 ptl = &mm->page_table_lock; 525 spin_lock(ptl); 526 527 ptep = hugepte_offset(hpd, address, pdshift); 528 if (pte_present(*ptep)) { 529 mask = (1UL << shift) - 1; 530 page = pte_page(*ptep); 531 page += ((address & mask) >> PAGE_SHIFT); 532 if (flags & FOLL_GET) 533 get_page(page); 534 } else { 535 if (is_hugetlb_entry_migration(*ptep)) { 536 spin_unlock(ptl); 537 __migration_entry_wait(mm, ptep, ptl); 538 goto retry; 539 } 540 } 541 spin_unlock(ptl); 542 return page; 543 } 544 545 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 546 static inline int file_to_psize(struct file *file) 547 { 548 struct hstate *hstate = hstate_file(file); 549 return shift_to_mmu_psize(huge_page_shift(hstate)); 550 } 551 552 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 553 unsigned long len, unsigned long pgoff, 554 unsigned long flags) 555 { 556 #ifdef CONFIG_PPC_RADIX_MMU 557 if (radix_enabled()) 558 return radix__hugetlb_get_unmapped_area(file, addr, len, 559 pgoff, flags); 560 #endif 561 #ifdef CONFIG_PPC_MM_SLICES 562 return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); 563 #endif 564 BUG(); 565 } 566 #endif 567 568 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 569 { 570 /* With radix we don't use slice, so derive it from vma*/ 571 if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { 572 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); 573 574 return 1UL << mmu_psize_to_shift(psize); 575 } 576 return vma_kernel_pagesize(vma); 577 } 578 579 bool __init arch_hugetlb_valid_size(unsigned long size) 580 { 581 int shift = __ffs(size); 582 int mmu_psize; 583 584 /* Check that it is a page size supported by the hardware and 585 * that it fits within pagetable and slice limits. */ 586 if (size <= PAGE_SIZE || !is_power_of_2(size)) 587 return false; 588 589 mmu_psize = check_and_get_huge_psize(shift); 590 if (mmu_psize < 0) 591 return false; 592 593 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); 594 595 return true; 596 } 597 598 static int __init add_huge_page_size(unsigned long long size) 599 { 600 int shift = __ffs(size); 601 602 if (!arch_hugetlb_valid_size((unsigned long)size)) 603 return -EINVAL; 604 605 hugetlb_add_hstate(shift - PAGE_SHIFT); 606 return 0; 607 } 608 609 static int __init hugetlbpage_init(void) 610 { 611 bool configured = false; 612 int psize; 613 614 if (hugetlb_disabled) { 615 pr_info("HugeTLB support is disabled!\n"); 616 return 0; 617 } 618 619 if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() && 620 !mmu_has_feature(MMU_FTR_16M_PAGE)) 621 return -ENODEV; 622 623 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 624 unsigned shift; 625 unsigned pdshift; 626 627 if (!mmu_psize_defs[psize].shift) 628 continue; 629 630 shift = mmu_psize_to_shift(psize); 631 632 #ifdef CONFIG_PPC_BOOK3S_64 633 if (shift > PGDIR_SHIFT) 634 continue; 635 else if (shift > PUD_SHIFT) 636 pdshift = PGDIR_SHIFT; 637 else if (shift > PMD_SHIFT) 638 pdshift = PUD_SHIFT; 639 else 640 pdshift = PMD_SHIFT; 641 #else 642 if (shift < PUD_SHIFT) 643 pdshift = PMD_SHIFT; 644 else if (shift < PGDIR_SHIFT) 645 pdshift = PUD_SHIFT; 646 else 647 pdshift = PGDIR_SHIFT; 648 #endif 649 650 if (add_huge_page_size(1ULL << shift) < 0) 651 continue; 652 /* 653 * if we have pdshift and shift value same, we don't 654 * use pgt cache for hugepd. 655 */ 656 if (pdshift > shift) { 657 if (!IS_ENABLED(CONFIG_PPC_8xx)) 658 pgtable_cache_add(pdshift - shift); 659 } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || 660 IS_ENABLED(CONFIG_PPC_8xx)) { 661 pgtable_cache_add(PTE_T_ORDER); 662 } 663 664 configured = true; 665 } 666 667 if (!configured) 668 pr_info("Failed to initialize. Disabling HugeTLB"); 669 670 return 0; 671 } 672 673 arch_initcall(hugetlbpage_init); 674 675 void __init gigantic_hugetlb_cma_reserve(void) 676 { 677 unsigned long order = 0; 678 679 if (radix_enabled()) 680 order = PUD_SHIFT - PAGE_SHIFT; 681 else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift) 682 /* 683 * For pseries we do use ibm,expected#pages for reserving 16G pages. 684 */ 685 order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; 686 687 if (order) { 688 VM_WARN_ON(order < MAX_ORDER); 689 hugetlb_cma_reserve(order); 690 } 691 } 692