/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>

#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  On non-Freescale implementations, this is
 * just used to track 16G pages and so is a single array.  FSL-based
 * implementations may have more than one gpage size, so we need multiple
 * arrays.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define MAX_NUMBER_GPAGES	128
struct psize_gpages {
	u64 gpage_list[MAX_NUMBER_GPAGES];
	unsigned int nr_gpages;
};
static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
#else
#define MAX_NUMBER_GPAGES	1024
static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#endif

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/* Only called for hugetlbfs pages, hence can ignore THP */
	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
}

static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}
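
/*
 * Worked example for the num_hugepd logic above (illustrative values,
 * assuming the usual 8xx layout with 4k base pages and PGDIR_SHIFT == 22):
 * an 8M huge page (pshift == 23) is wider than a single directory entry,
 * so the pshift >= pdshift branch computes num_hugepd = 1 << (23 - 22) = 2
 * and the loop points two consecutive directory entries at the same
 * hugepte table.  For a 512k page (pshift == 19), pshift < pdshift, so a
 * single entry is filled and the table comes from the PGT_CACHE instead.
 */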

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
	int i;

	if (addr == 0)
		return;

	gpage_freearray[idx].nr_gpages = number_of_pages;

	for (i = 0; i < number_of_pages; i++) {
		gpage_freearray[idx].gpage_list[i] = addr;
		addr += page_size;
	}
}

/*
 * Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
	int nr_gpages = gpage_freearray[idx].nr_gpages;

	if (nr_gpages == 0)
		return 0;

#ifdef CONFIG_HIGHMEM
	/*
	 * If gpages can be in highmem we can't use the trick of storing the
	 * data structure in the page; allocate space for this
	 */
	m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
#else
	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
#endif

	list_add(&m->list, &huge_boot_pages);
	gpage_freearray[idx].nr_gpages = nr_gpages;
	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
	m->hstate = hstate;

	return 1;
}

/*
 * Scan the command line hugepagesz= options for gigantic pages; store those in
 * a list that we use to allocate the memory once all options are parsed.
 */
unsigned long gpage_npages[MMU_PAGE_COUNT];

static int __init do_gpage_early_setup(char *param, char *val,
				       const char *unused, void *arg)
{
	static phys_addr_t size;
	unsigned long npages;

	/*
	 * The hugepagesz and hugepages cmdline options are interleaved.  We
	 * use the size variable to keep track of whether or not this was done
	 * properly and skip over instances where it is incorrect.  Other
	 * command-line parsing code will issue warnings, so we don't need to.
	 */
	if ((strcmp(param, "default_hugepagesz") == 0) ||
	    (strcmp(param, "hugepagesz") == 0)) {
		size = memparse(val, NULL);
	} else if (strcmp(param, "hugepages") == 0) {
		if (size != 0) {
			if (sscanf(val, "%lu", &npages) <= 0)
				npages = 0;
			if (npages > MAX_NUMBER_GPAGES) {
				pr_warn("MMU: %lu pages requested for page "
#ifdef CONFIG_PHYS_ADDR_T_64BIT
					"size %llu KB, limiting to "
#else
					"size %u KB, limiting to "
#endif
					__stringify(MAX_NUMBER_GPAGES) "\n",
					npages, size / 1024);
				npages = MAX_NUMBER_GPAGES;
			}
			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
			size = 0;
		}
	}
	return 0;
}
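
/*
 * Illustrative example of the interleaving handled above (values are only
 * an example): a command line such as "hugepagesz=4G hugepages=16" reaches
 * do_gpage_early_setup() as two consecutive parameters.  The first latches
 * 'size' to 4G; the second then stores
 * gpage_npages[shift_to_mmu_psize(32)] = 16 and clears 'size', so a
 * "hugepages=" option without a preceding size is quietly skipped.
 */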

/*
 * This function allocates physical space for pages that are larger than the
 * buddy allocator can handle.  We want to allocate these in highmem because
 * the amount of lowmem is limited.  This means that this function MUST be
 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
 * allocator to grab highmem.
 */
void __init reserve_hugetlb_gpages(void)
{
	static __initdata char cmdline[COMMAND_LINE_SIZE];
	phys_addr_t size, base;
	int i;

	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
		   NULL, &do_gpage_early_setup);

	/*
	 * Walk gpage list in reverse, allocating larger page sizes first.
	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
	 * When we reach the point in the list where pages are no longer
	 * considered gpages, we're done.
	 */
	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
			continue;
		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
			break;

		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
		base = memblock_alloc_base(size * gpage_npages[i], size,
					   MEMBLOCK_ALLOC_ANYWHERE);
		add_gpage(base, size, gpage_npages[i]);
	}
}

#else /* !PPC_FSL_BOOK3E */

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    cpumask_equal(mm_cpumask(tlb->mm),
			  cpumask_of(smp_processor_id()))) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif
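
/*
 * Note on the batching above: hugepd_free() defers the actual
 * kmem_cache_free() to an RCU callback because other CPUs may still be
 * walking these tables locklessly (e.g. the fast GUP path at the bottom of
 * this file).  Only when the mm has a single user, or is known to be live
 * on this CPU alone, is the hugepte table freed immediately.
 */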

static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should have
			 * already found it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}
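
/*
 * Added note: in the helpers above, 'floor' and 'ceiling' bound the region
 * within which page table pages may be torn down, following the same
 * convention as the generic free_pgd_range(); the early returns keep any
 * directory page that still covers addresses outside [floor, ceiling).
 */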

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't use the optimization
	 * from the normal free_pgd_range() here, of checking whether
	 * we're actually covering a large enough range to have to do
	 * anything at the top level of the walk instead of at the
	 * bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}
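
/*
 * Worked example for the sub-page arithmetic in follow_huge_pd() above
 * (illustrative values): for a 16M huge page, shift is 24 and mask is
 * 0xffffff, so an address 0x123000 bytes into the page gives
 * page += 0x123000 >> PAGE_SHIFT, i.e. the 4k sub-page (with 4k base
 * pages) that actually backs the requested address.
 */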

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}
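
/*
 * Worked example (illustrative values): with sz == 16M, addr == 0x10123000
 * and end == 0x30000000, __boundary becomes 0x11000000 (addr rounded up to
 * the next 16M boundary), which is below end and is returned.  Once the
 * rounded boundary would move past 'end', 'end' itself is returned, so the
 * gup loop below never walks beyond the requested range.
 */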

int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
							pgoff, flags);
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
	/* With radix we don't use slices, so derive the size from the vma */
	if (!radix_enabled())
		return 1UL << mmu_psize_to_shift(psize);
#endif
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by the Linux page table layout.  For now we have
	 *   Radix: 2M
	 *   Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been set up */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}
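
/*
 * Worked example (illustrative values): "hugepagesz=16M" gives size == 16M,
 * so shift == __ffs(size) == 24 and shift_to_mmu_psize(24) == MMU_PAGE_16M,
 * which the hash check above accepts; the hstate is then registered with
 * hugetlb_add_hstate(24 - PAGE_SHIFT).
 */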

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified (%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
	int psize;

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
		/*
		 * If pdshift and shift are the same, we don't use the pgt
		 * cache for the hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache for hugeptes\n",
				      __func__);
		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size.  Currently, we pick 16M or 1M
	 * depending on what is available.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);

void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it.  This function needs to be called with interrupts disabled.  We
 * use this variant when we have MSR[EE] = 0 but the paca->soft_enabled = 1.
 */
pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
				   bool *is_thp, unsigned *shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value.  This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or page unmap.  The returned pte_t * is still not
	 * stable, so the caller must re-check for those conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (shift)
		*shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);

int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);
	mask = _PAGE_PRESENT | _PAGE_READ;

	/*
	 * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined
	 * as 0 and _PAGE_RO has to be set when a page is not writable
	 */
	if (write)
		mask |= _PAGE_WRITE;
	else
		mask |= _PAGE_RO;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}
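
/*
 * Added note: gup_hugepte() above follows the usual lockless GUP pattern:
 * snapshot the pte with READ_ONCE(), take references speculatively with
 * page_cache_add_speculative(), then re-read the pte; if it changed in the
 * meantime the references are dropped and 0 is returned so the caller can
 * fall back to the slow path.
 */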