1 /* 2 * PPC Huge TLB Page Support for Kernel. 3 * 4 * Copyright (C) 2003 David Gibson, IBM Corporation. 5 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor 6 * 7 * Based on the IA-32 version: 8 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 9 */ 10 11 #include <linux/mm.h> 12 #include <linux/io.h> 13 #include <linux/slab.h> 14 #include <linux/hugetlb.h> 15 #include <linux/export.h> 16 #include <linux/of_fdt.h> 17 #include <linux/memblock.h> 18 #include <linux/moduleparam.h> 19 #include <linux/swap.h> 20 #include <linux/swapops.h> 21 #include <linux/kmemleak.h> 22 #include <asm/pgalloc.h> 23 #include <asm/tlb.h> 24 #include <asm/setup.h> 25 #include <asm/hugetlb.h> 26 #include <asm/pte-walk.h> 27 28 bool hugetlb_disabled = false; 29 30 #define hugepd_none(hpd) (hpd_val(hpd) == 0) 31 32 #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \ 33 __builtin_ffs(sizeof(void *))) 34 35 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) 36 { 37 /* 38 * Only called for hugetlbfs pages, hence can ignore THP and the 39 * irq disabled walk. 40 */ 41 return __find_linux_pte(mm->pgd, addr, NULL, NULL); 42 } 43 44 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 45 unsigned long address, unsigned int pdshift, 46 unsigned int pshift, spinlock_t *ptl) 47 { 48 struct kmem_cache *cachep; 49 pte_t *new; 50 int i; 51 int num_hugepd; 52 53 if (pshift >= pdshift) { 54 cachep = PGT_CACHE(PTE_T_ORDER); 55 num_hugepd = 1 << (pshift - pdshift); 56 } else { 57 cachep = PGT_CACHE(pdshift - pshift); 58 num_hugepd = 1; 59 } 60 61 if (!cachep) { 62 WARN_ONCE(1, "No page table cache created for hugetlb tables"); 63 return -ENOMEM; 64 } 65 66 new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); 67 68 BUG_ON(pshift > HUGEPD_SHIFT_MASK); 69 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); 70 71 if (!new) 72 return -ENOMEM; 73 74 /* 75 * Make sure other cpus find the hugepd set only after a 76 * properly initialized page table is visible to them. 77 * For more details look for comment in __pte_alloc(). 78 */ 79 smp_wmb(); 80 81 spin_lock(ptl); 82 /* 83 * We have multiple higher-level entries that point to the same 84 * actual pte location. Fill in each as we go and backtrack on error. 85 * We need all of these so the DTLB pgtable walk code can find the 86 * right higher-level entry without knowing if it's a hugepage or not. 87 */ 88 for (i = 0; i < num_hugepd; i++, hpdp++) { 89 if (unlikely(!hugepd_none(*hpdp))) 90 break; 91 hugepd_populate(hpdp, new, pshift); 92 } 93 /* If we bailed from the for loop early, an error occurred, clean up */ 94 if (i < num_hugepd) { 95 for (i = i - 1 ; i >= 0; i--, hpdp--) 96 *hpdp = __hugepd(0); 97 kmem_cache_free(cachep, new); 98 } else { 99 kmemleak_ignore(new); 100 } 101 spin_unlock(ptl); 102 return 0; 103 } 104 105 /* 106 * At this point we do the placement change only for BOOK3S 64. This would 107 * possibly work on other subarchs. 108 */ 109 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 110 unsigned long addr, unsigned long sz) 111 { 112 pgd_t *pg; 113 p4d_t *p4; 114 pud_t *pu; 115 pmd_t *pm; 116 hugepd_t *hpdp = NULL; 117 unsigned pshift = __ffs(sz); 118 unsigned pdshift = PGDIR_SHIFT; 119 spinlock_t *ptl; 120 121 addr &= ~(sz-1); 122 pg = pgd_offset(mm, addr); 123 p4 = p4d_offset(pg, addr); 124 125 #ifdef CONFIG_PPC_BOOK3S_64 126 if (pshift == PGDIR_SHIFT) 127 /* 16GB huge page */ 128 return (pte_t *) p4; 129 else if (pshift > PUD_SHIFT) { 130 /* 131 * We need to use hugepd table 132 */ 133 ptl = &mm->page_table_lock; 134 hpdp = (hugepd_t *)p4; 135 } else { 136 pdshift = PUD_SHIFT; 137 pu = pud_alloc(mm, p4, addr); 138 if (!pu) 139 return NULL; 140 if (pshift == PUD_SHIFT) 141 return (pte_t *)pu; 142 else if (pshift > PMD_SHIFT) { 143 ptl = pud_lockptr(mm, pu); 144 hpdp = (hugepd_t *)pu; 145 } else { 146 pdshift = PMD_SHIFT; 147 pm = pmd_alloc(mm, pu, addr); 148 if (!pm) 149 return NULL; 150 if (pshift == PMD_SHIFT) 151 /* 16MB hugepage */ 152 return (pte_t *)pm; 153 else { 154 ptl = pmd_lockptr(mm, pm); 155 hpdp = (hugepd_t *)pm; 156 } 157 } 158 } 159 #else 160 if (pshift >= PGDIR_SHIFT) { 161 ptl = &mm->page_table_lock; 162 hpdp = (hugepd_t *)p4; 163 } else { 164 pdshift = PUD_SHIFT; 165 pu = pud_alloc(mm, p4, addr); 166 if (!pu) 167 return NULL; 168 if (pshift >= PUD_SHIFT) { 169 ptl = pud_lockptr(mm, pu); 170 hpdp = (hugepd_t *)pu; 171 } else { 172 pdshift = PMD_SHIFT; 173 pm = pmd_alloc(mm, pu, addr); 174 if (!pm) 175 return NULL; 176 ptl = pmd_lockptr(mm, pm); 177 hpdp = (hugepd_t *)pm; 178 } 179 } 180 #endif 181 if (!hpdp) 182 return NULL; 183 184 if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT) 185 return pte_alloc_map(mm, (pmd_t *)hpdp, addr); 186 187 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); 188 189 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, 190 pdshift, pshift, ptl)) 191 return NULL; 192 193 return hugepte_offset(*hpdp, addr, pdshift); 194 } 195 196 #ifdef CONFIG_PPC_BOOK3S_64 197 /* 198 * Tracks gpages after the device tree is scanned and before the 199 * huge_boot_pages list is ready on pseries. 200 */ 201 #define MAX_NUMBER_GPAGES 1024 202 __initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; 203 __initdata static unsigned nr_gpages; 204 205 /* 206 * Build list of addresses of gigantic pages. This function is used in early 207 * boot before the buddy allocator is setup. 208 */ 209 void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) 210 { 211 if (!addr) 212 return; 213 while (number_of_pages > 0) { 214 gpage_freearray[nr_gpages] = addr; 215 nr_gpages++; 216 number_of_pages--; 217 addr += page_size; 218 } 219 } 220 221 static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) 222 { 223 struct huge_bootmem_page *m; 224 if (nr_gpages == 0) 225 return 0; 226 m = phys_to_virt(gpage_freearray[--nr_gpages]); 227 gpage_freearray[nr_gpages] = 0; 228 list_add(&m->list, &huge_boot_pages); 229 m->hstate = hstate; 230 return 1; 231 } 232 #endif 233 234 235 int __init alloc_bootmem_huge_page(struct hstate *h) 236 { 237 238 #ifdef CONFIG_PPC_BOOK3S_64 239 if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) 240 return pseries_alloc_bootmem_huge_page(h); 241 #endif 242 return __alloc_bootmem_huge_page(h); 243 } 244 245 #ifndef CONFIG_PPC_BOOK3S_64 246 #define HUGEPD_FREELIST_SIZE \ 247 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) 248 249 struct hugepd_freelist { 250 struct rcu_head rcu; 251 unsigned int index; 252 void *ptes[]; 253 }; 254 255 static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); 256 257 static void hugepd_free_rcu_callback(struct rcu_head *head) 258 { 259 struct hugepd_freelist *batch = 260 container_of(head, struct hugepd_freelist, rcu); 261 unsigned int i; 262 263 for (i = 0; i < batch->index; i++) 264 kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]); 265 266 free_page((unsigned long)batch); 267 } 268 269 static void hugepd_free(struct mmu_gather *tlb, void *hugepte) 270 { 271 struct hugepd_freelist **batchp; 272 273 batchp = &get_cpu_var(hugepd_freelist_cur); 274 275 if (atomic_read(&tlb->mm->mm_users) < 2 || 276 mm_is_thread_local(tlb->mm)) { 277 kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte); 278 put_cpu_var(hugepd_freelist_cur); 279 return; 280 } 281 282 if (*batchp == NULL) { 283 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); 284 (*batchp)->index = 0; 285 } 286 287 (*batchp)->ptes[(*batchp)->index++] = hugepte; 288 if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { 289 call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback); 290 *batchp = NULL; 291 } 292 put_cpu_var(hugepd_freelist_cur); 293 } 294 #else 295 static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {} 296 #endif 297 298 /* Return true when the entry to be freed maps more than the area being freed */ 299 static bool range_is_outside_limits(unsigned long start, unsigned long end, 300 unsigned long floor, unsigned long ceiling, 301 unsigned long mask) 302 { 303 if ((start & mask) < floor) 304 return true; 305 if (ceiling) { 306 ceiling &= mask; 307 if (!ceiling) 308 return true; 309 } 310 return end - 1 > ceiling - 1; 311 } 312 313 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, 314 unsigned long start, unsigned long end, 315 unsigned long floor, unsigned long ceiling) 316 { 317 pte_t *hugepte = hugepd_page(*hpdp); 318 int i; 319 320 unsigned long pdmask = ~((1UL << pdshift) - 1); 321 unsigned int num_hugepd = 1; 322 unsigned int shift = hugepd_shift(*hpdp); 323 324 /* Note: On fsl the hpdp may be the first of several */ 325 if (shift > pdshift) 326 num_hugepd = 1 << (shift - pdshift); 327 328 if (range_is_outside_limits(start, end, floor, ceiling, pdmask)) 329 return; 330 331 for (i = 0; i < num_hugepd; i++, hpdp++) 332 *hpdp = __hugepd(0); 333 334 if (shift >= pdshift) 335 hugepd_free(tlb, hugepte); 336 else 337 pgtable_free_tlb(tlb, hugepte, 338 get_hugepd_cache_index(pdshift - shift)); 339 } 340 341 static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 342 unsigned long addr, unsigned long end, 343 unsigned long floor, unsigned long ceiling) 344 { 345 pgtable_t token = pmd_pgtable(*pmd); 346 347 if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK)) 348 return; 349 350 pmd_clear(pmd); 351 pte_free_tlb(tlb, token, addr); 352 mm_dec_nr_ptes(tlb->mm); 353 } 354 355 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 356 unsigned long addr, unsigned long end, 357 unsigned long floor, unsigned long ceiling) 358 { 359 pmd_t *pmd; 360 unsigned long next; 361 unsigned long start; 362 363 start = addr; 364 do { 365 unsigned long more; 366 367 pmd = pmd_offset(pud, addr); 368 next = pmd_addr_end(addr, end); 369 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) { 370 if (pmd_none_or_clear_bad(pmd)) 371 continue; 372 373 /* 374 * if it is not hugepd pointer, we should already find 375 * it cleared. 376 */ 377 WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx)); 378 379 hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling); 380 381 continue; 382 } 383 /* 384 * Increment next by the size of the huge mapping since 385 * there may be more than one entry at this level for a 386 * single hugepage, but all of them point to 387 * the same kmem cache that holds the hugepte. 388 */ 389 more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); 390 if (more > next) 391 next = more; 392 393 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, 394 addr, next, floor, ceiling); 395 } while (addr = next, addr != end); 396 397 if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK)) 398 return; 399 400 pmd = pmd_offset(pud, start & PUD_MASK); 401 pud_clear(pud); 402 pmd_free_tlb(tlb, pmd, start & PUD_MASK); 403 mm_dec_nr_pmds(tlb->mm); 404 } 405 406 static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, 407 unsigned long addr, unsigned long end, 408 unsigned long floor, unsigned long ceiling) 409 { 410 pud_t *pud; 411 unsigned long next; 412 unsigned long start; 413 414 start = addr; 415 do { 416 pud = pud_offset(p4d, addr); 417 next = pud_addr_end(addr, end); 418 if (!is_hugepd(__hugepd(pud_val(*pud)))) { 419 if (pud_none_or_clear_bad(pud)) 420 continue; 421 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 422 ceiling); 423 } else { 424 unsigned long more; 425 /* 426 * Increment next by the size of the huge mapping since 427 * there may be more than one entry at this level for a 428 * single hugepage, but all of them point to 429 * the same kmem cache that holds the hugepte. 430 */ 431 more = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); 432 if (more > next) 433 next = more; 434 435 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, 436 addr, next, floor, ceiling); 437 } 438 } while (addr = next, addr != end); 439 440 if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK)) 441 return; 442 443 pud = pud_offset(p4d, start & PGDIR_MASK); 444 p4d_clear(p4d); 445 pud_free_tlb(tlb, pud, start & PGDIR_MASK); 446 mm_dec_nr_puds(tlb->mm); 447 } 448 449 /* 450 * This function frees user-level page tables of a process. 451 */ 452 void hugetlb_free_pgd_range(struct mmu_gather *tlb, 453 unsigned long addr, unsigned long end, 454 unsigned long floor, unsigned long ceiling) 455 { 456 pgd_t *pgd; 457 p4d_t *p4d; 458 unsigned long next; 459 460 /* 461 * Because there are a number of different possible pagetable 462 * layouts for hugepage ranges, we limit knowledge of how 463 * things should be laid out to the allocation path 464 * (huge_pte_alloc(), above). Everything else works out the 465 * structure as it goes from information in the hugepd 466 * pointers. That means that we can't here use the 467 * optimization used in the normal page free_pgd_range(), of 468 * checking whether we're actually covering a large enough 469 * range to have to do anything at the top level of the walk 470 * instead of at the bottom. 471 * 472 * To make sense of this, you should probably go read the big 473 * block comment at the top of the normal free_pgd_range(), 474 * too. 475 */ 476 477 do { 478 next = pgd_addr_end(addr, end); 479 pgd = pgd_offset(tlb->mm, addr); 480 p4d = p4d_offset(pgd, addr); 481 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { 482 if (p4d_none_or_clear_bad(p4d)) 483 continue; 484 hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling); 485 } else { 486 unsigned long more; 487 /* 488 * Increment next by the size of the huge mapping since 489 * there may be more than one entry at the pgd level 490 * for a single hugepage, but all of them point to the 491 * same kmem cache that holds the hugepte. 492 */ 493 more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); 494 if (more > next) 495 next = more; 496 497 free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT, 498 addr, next, floor, ceiling); 499 } 500 } while (addr = next, addr != end); 501 } 502 503 struct page *follow_huge_pd(struct vm_area_struct *vma, 504 unsigned long address, hugepd_t hpd, 505 int flags, int pdshift) 506 { 507 pte_t *ptep; 508 spinlock_t *ptl; 509 struct page *page = NULL; 510 unsigned long mask; 511 int shift = hugepd_shift(hpd); 512 struct mm_struct *mm = vma->vm_mm; 513 514 retry: 515 /* 516 * hugepage directory entries are protected by mm->page_table_lock 517 * Use this instead of huge_pte_lockptr 518 */ 519 ptl = &mm->page_table_lock; 520 spin_lock(ptl); 521 522 ptep = hugepte_offset(hpd, address, pdshift); 523 if (pte_present(*ptep)) { 524 mask = (1UL << shift) - 1; 525 page = pte_page(*ptep); 526 page += ((address & mask) >> PAGE_SHIFT); 527 if (flags & FOLL_GET) 528 get_page(page); 529 } else { 530 if (is_hugetlb_entry_migration(*ptep)) { 531 spin_unlock(ptl); 532 __migration_entry_wait(mm, ptep, ptl); 533 goto retry; 534 } 535 } 536 spin_unlock(ptl); 537 return page; 538 } 539 540 #ifdef CONFIG_PPC_MM_SLICES 541 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 542 unsigned long len, unsigned long pgoff, 543 unsigned long flags) 544 { 545 struct hstate *hstate = hstate_file(file); 546 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); 547 548 #ifdef CONFIG_PPC_RADIX_MMU 549 if (radix_enabled()) 550 return radix__hugetlb_get_unmapped_area(file, addr, len, 551 pgoff, flags); 552 #endif 553 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); 554 } 555 #endif 556 557 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 558 { 559 /* With radix we don't use slice, so derive it from vma*/ 560 if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { 561 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); 562 563 return 1UL << mmu_psize_to_shift(psize); 564 } 565 return vma_kernel_pagesize(vma); 566 } 567 568 bool __init arch_hugetlb_valid_size(unsigned long size) 569 { 570 int shift = __ffs(size); 571 int mmu_psize; 572 573 /* Check that it is a page size supported by the hardware and 574 * that it fits within pagetable and slice limits. */ 575 if (size <= PAGE_SIZE || !is_power_of_2(size)) 576 return false; 577 578 mmu_psize = check_and_get_huge_psize(shift); 579 if (mmu_psize < 0) 580 return false; 581 582 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); 583 584 return true; 585 } 586 587 static int __init add_huge_page_size(unsigned long long size) 588 { 589 int shift = __ffs(size); 590 591 if (!arch_hugetlb_valid_size((unsigned long)size)) 592 return -EINVAL; 593 594 hugetlb_add_hstate(shift - PAGE_SHIFT); 595 return 0; 596 } 597 598 static int __init hugetlbpage_init(void) 599 { 600 bool configured = false; 601 int psize; 602 603 if (hugetlb_disabled) { 604 pr_info("HugeTLB support is disabled!\n"); 605 return 0; 606 } 607 608 if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() && 609 !mmu_has_feature(MMU_FTR_16M_PAGE)) 610 return -ENODEV; 611 612 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 613 unsigned shift; 614 unsigned pdshift; 615 616 if (!mmu_psize_defs[psize].shift) 617 continue; 618 619 shift = mmu_psize_to_shift(psize); 620 621 #ifdef CONFIG_PPC_BOOK3S_64 622 if (shift > PGDIR_SHIFT) 623 continue; 624 else if (shift > PUD_SHIFT) 625 pdshift = PGDIR_SHIFT; 626 else if (shift > PMD_SHIFT) 627 pdshift = PUD_SHIFT; 628 else 629 pdshift = PMD_SHIFT; 630 #else 631 if (shift < PUD_SHIFT) 632 pdshift = PMD_SHIFT; 633 else if (shift < PGDIR_SHIFT) 634 pdshift = PUD_SHIFT; 635 else 636 pdshift = PGDIR_SHIFT; 637 #endif 638 639 if (add_huge_page_size(1ULL << shift) < 0) 640 continue; 641 /* 642 * if we have pdshift and shift value same, we don't 643 * use pgt cache for hugepd. 644 */ 645 if (pdshift > shift) { 646 if (!IS_ENABLED(CONFIG_PPC_8xx)) 647 pgtable_cache_add(pdshift - shift); 648 } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || 649 IS_ENABLED(CONFIG_PPC_8xx)) { 650 pgtable_cache_add(PTE_T_ORDER); 651 } 652 653 configured = true; 654 } 655 656 if (configured) { 657 if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) 658 hugetlbpage_init_default(); 659 } else 660 pr_info("Failed to initialize. Disabling HugeTLB"); 661 662 return 0; 663 } 664 665 arch_initcall(hugetlbpage_init); 666 667 void __init gigantic_hugetlb_cma_reserve(void) 668 { 669 unsigned long order = 0; 670 671 if (radix_enabled()) 672 order = PUD_SHIFT - PAGE_SHIFT; 673 else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift) 674 /* 675 * For pseries we do use ibm,expected#pages for reserving 16G pages. 676 */ 677 order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; 678 679 if (order) { 680 VM_WARN_ON(order < MAX_ORDER); 681 hugetlb_cma_reserve(order); 682 } 683 } 684