/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
#define MAX_NUMBER_GPAGES	1024

/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready. */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;

/* Array of valid huge page sizes - a non-zero value (hugepte_shift) is
 * stored for each huge page size that is valid.
 */
unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */

#define hugepte_shift			mmu_huge_psizes
#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])

#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
					 + hugepte_shift[psize])
#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))

/* Subtract one from array size because we don't need a cache for 4K since
 * it is not a huge page size */
#define huge_pgtable_cache(psize)	(pgtable_cache[HUGEPTE_CACHE_NUM \
						       + psize-1])
#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])

static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
	"unused_4K", "hugepte_cache_64K", "unused_64K_AP",
	"hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
};

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early.
 */
#define HUGEPD_OK	0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)
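/*
 * Convert between a huge page shift and the corresponding MMU page size
 * index.  shift_to_mmu_psize() returns -1 for a shift it does not know
 * about; mmu_psize_to_shift() BUG()s on a page size with no defined shift.
 */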
static inline int shift_to_mmu_psize(unsigned int shift)
{
	switch (shift) {
#ifndef CONFIG_PPC_64K_PAGES
	case PAGE_SHIFT_64K:
		return MMU_PAGE_64K;
#endif
	case PAGE_SHIFT_16M:
		return MMU_PAGE_16M;
	case PAGE_SHIFT_16G:
		return MMU_PAGE_16G;
	}
	return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	if (mmu_psize_defs[mmu_psize].shift)
		return mmu_psize_defs[mmu_psize].shift;
	BUG();
}

static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!(hpd.pd & HUGEPD_OK));
	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
				    struct hstate *hstate)
{
	unsigned int shift = huge_page_shift(hstate);
	int psize = shift_to_mmu_psize(shift);
	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}

static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int psize)
{
	pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
				       GFP_KERNEL|__GFP_REPEAT);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		kmem_cache_free(huge_pgtable_cache(psize), new);
	else
		hpdp->pd = (unsigned long)new | HUGEPD_OK;
	spin_unlock(&mm->page_table_lock);
	return 0;
}
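/*
 * The hugepd for a mapping may hang off the PGD, PUD or PMD entry,
 * depending on how the huge page shift compares with PUD_SHIFT and
 * PMD_SHIFT.  These helpers walk only the levels that actually exist
 * for a given hstate and otherwise hand back the higher-level entry
 * unchanged.
 */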
static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
{
	if (huge_page_shift(hstate) < PUD_SHIFT)
		return pud_offset(pgd, addr);
	else
		return (pud_t *) pgd;
}

static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
			 struct hstate *hstate)
{
	if (huge_page_shift(hstate) < PUD_SHIFT)
		return pud_alloc(mm, pgd, addr);
	else
		return (pud_t *) pgd;
}

static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
{
	if (huge_page_shift(hstate) < PMD_SHIFT)
		return pmd_offset(pud, addr);
	else
		return (pmd_t *) pud;
}

static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
			 struct hstate *hstate)
{
	if (huge_page_shift(hstate) < PMD_SHIFT)
		return pmd_alloc(mm, pud, addr);
	else
		return (pmd_t *) pud;
}

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
	       unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;

	unsigned int psize;
	unsigned int shift;
	unsigned long sz;
	struct hstate *hstate;
	psize = get_slice_psize(mm, addr);
	shift = mmu_psize_to_shift(psize);
	sz = ((1UL) << shift);
	hstate = size_to_hstate(sz);

	addr &= hstate->mask;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = hpud_offset(pg, addr, hstate);
		if (!pud_none(*pu)) {
			pm = hpmd_offset(pu, addr, hstate);
			if (!pmd_none(*pm))
				return hugepte_offset((hugepd_t *)pm, addr,
						      hstate);
		}
	}

	return NULL;
}

pte_t *huge_pte_alloc(struct mm_struct *mm,
		      unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	struct hstate *hstate;
	unsigned int psize;
	hstate = size_to_hstate(sz);

	psize = get_slice_psize(mm, addr);
	BUG_ON(!mmu_huge_psizes[psize]);

	addr &= hstate->mask;

	pg = pgd_offset(mm, addr);
	pu = hpud_alloc(mm, pg, addr, hstate);

	if (pu) {
		pm = hpmd_alloc(mm, pu, addr, hstate);
		if (pm)
			hpdp = (hugepd_t *)pm;
	}

	if (!hpdp)
		return NULL;

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
		return NULL;

	return hugepte_offset(hpdp, addr, hstate);
}

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}

static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
			       unsigned int psize)
{
	pte_t *hugepte = hugepd_page(*hpdp);

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
						 HUGEPTE_CACHE_NUM+psize-1,
						 PGF_CACHENUM_MASK));
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling,
				   unsigned int psize)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}
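/*
 * Free the hugepte tables reachable from this PGD entry.  If the huge
 * page shift is below PMD_SHIFT the hugepds hang off PMD entries and we
 * descend a level; otherwise each PUD entry is itself a hugepd and its
 * hugepte table is freed directly.
 */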
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;
	unsigned int shift;
	unsigned int psize = get_slice_psize(tlb->mm, addr);
	shift = mmu_psize_to_shift(psize);

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (shift < PMD_SHIFT) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling, psize);
		} else {
			if (pud_none(*pud))
				continue;
			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
		}
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}
/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * Comments below are taken from the normal free_pgd_range().  They
	 * apply here too.  The tests against HUGEPD_MASK below are
	 * essential, because we *don't* test for this at the bottom
	 * level.  Without them we'll attempt to free a hugepte table
	 * when we unmap just part of it, even if there are other
	 * active mappings using it.
	 *
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing HUGEPD* at this top level?  Because
	 * often there will be no work to do at all, and we'd prefer
	 * not to go all the way down to the bottom just to discover
	 * that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we
	 * must be careful to reject "the opposite 0" before it
	 * confuses the subsequent tests.  But what about where end is
	 * brought down by HUGEPD_SIZE below?  No, end can't go down to
	 * 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */
	unsigned int psize = get_slice_psize(tlb->mm, addr);

	addr &= HUGEPD_MASK(psize);
	if (addr < floor) {
		addr += HUGEPD_SIZE(psize);
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= HUGEPD_MASK(psize);
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= HUGEPD_SIZE(psize);
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset(tlb->mm, addr);
	do {
		psize = get_slice_psize(tlb->mm, addr);
		BUG_ON(!mmu_huge_psizes[psize]);
		next = pgd_addr_end(addr, end);
		if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next,
					       floor, ceiling);
		} else {
			if (pgd_none(*pgd))
				continue;
			free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
		}
	} while (pgd++, addr = next, addr != end);
}
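/*
 * Install a huge page Linux PTE.  Any existing, present PTE is cleared
 * with pte_update() first so that a stale hash PTE for the old mapping
 * gets flushed, and the new value is stored with its hash-slot bits
 * (_PAGE_HPTEFLAGS) cleared.
 */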
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_need_flush (huge / !huge).  Might not be
		 * necessary anymore if we make hpte_need_flush() get the
		 * page size from the slices.
		 */
		unsigned int psize = get_slice_psize(mm, addr);
		unsigned int shift = mmu_psize_to_shift(psize);
		unsigned long sz = ((1UL) << shift);
		struct hstate *hstate = size_to_hstate(sz);
		pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
	return __pte(old);
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned int mmu_psize = get_slice_psize(mm, address);

	/* Verify it is a huge page, else bail. */
	if (!mmu_huge_psizes[mmu_psize])
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page) {
		unsigned int shift = mmu_psize_to_shift(mmu_psize);
		unsigned long sz = ((1UL) << shift);
		page += (address % sz) / PAGE_SIZE;
	}

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}

/*
 * Called by hash_huge_page() to do lazy icache flushing
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap,
						  unsigned long sz)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			for (i = 0; i < (sz / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}
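/*
 * Insert or update the hash table entry backing a huge page Linux PTE.
 * Called from the hash fault path once the slice containing 'ea' is
 * known to use a huge page size.  Returns 0 on success and 1 to hand
 * the fault on to do_page_fault().
 */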
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa, sz;
	long slot;
	int err = 1;
	int ssize = user_segment_size(ea);
	unsigned int mmu_psize;
	int shift;
	mmu_psize = get_slice_psize(mm, ea);

	if (!mmu_huge_psizes[mmu_psize])
		goto out;
	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = hpt_va(ea, vsid, ssize);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE.  There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE.  The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
					  old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	shift = mmu_psize_to_shift(mmu_psize);
	sz = ((1UL) << shift);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has huge pages but lacks no-execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap, sz);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, shift, ssize);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
					 ssize, local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, shift, ssize);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot information in the new PTE */
#ifdef CONFIG_PPC_64K_PAGES
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
		/* Add in WIMG bits */
		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
				      _PAGE_COHERENT | _PAGE_GUARDED));

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_psize, ssize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_psize, ssize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP) & ~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

 out:
	return err;
}
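/*
 * Record a huge page size as usable.  Besides registering the hstate,
 * this stores in hugepte_shift[psize] the number of huge-PTE index bits
 * covered by a hugepd at its page table level, i.e. HUGEPD_SHIFT(psize)
 * minus the huge page shift.
 */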
void set_huge_psize(int psize)
{
	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable limits. */
	if (mmu_psize_defs[psize].shift &&
	    mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
	    (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
	     mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
	     mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
		/* Return if huge page size has already been set up or is the
		 * same as the base page size. */
		if (mmu_huge_psizes[psize] ||
		    mmu_psize_defs[psize].shift == PAGE_SHIFT)
			return;
		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);

		switch (mmu_psize_defs[psize].shift) {
		case PAGE_SHIFT_64K:
			/* We only allow 64k hpages with 4k base page,
			 * which was checked above, and always put them
			 * at the PMD */
			hugepte_shift[psize] = PMD_SHIFT;
			break;
		case PAGE_SHIFT_16M:
			/* 16M pages can be at two different levels
			 * of page tables based on base page size */
			if (PAGE_SHIFT == PAGE_SHIFT_64K)
				hugepte_shift[psize] = PMD_SHIFT;
			else /* 4k base page */
				hugepte_shift[psize] = PUD_SHIFT;
			break;
		case PAGE_SHIFT_16G:
			/* 16G pages are always at PGD level */
			hugepte_shift[psize] = PGDIR_SHIFT;
			break;
		}
		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
	} else
		hugepte_shift[psize] = 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;
	int mmu_psize;
	int shift;

	size = memparse(str, &str);

	shift = __ffs(size);
	mmu_psize = shift_to_mmu_psize(shift);
	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
		set_huge_psize(mmu_psize);
	else
		printk(KERN_WARNING "Invalid huge page size specified (%llu)\n",
		       size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

static int __init hugetlbpage_init(void)
{
	unsigned int psize;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -ENODEV;

	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
	 * sizes changes.
	 */
	set_huge_psize(MMU_PAGE_16M);
	set_huge_psize(MMU_PAGE_16G);

	/* Temporarily disable support for 64K huge pages when 64K SPU local
	 * store support is enabled as the current implementation conflicts.
	 */
#ifndef CONFIG_SPU_FS_64K_LS
	set_huge_psize(MMU_PAGE_64K);
#endif

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		if (mmu_huge_psizes[psize]) {
			huge_pgtable_cache(psize) = kmem_cache_create(
						HUGEPTE_CACHE_NAME(psize),
						HUGEPTE_TABLE_SIZE(psize),
						HUGEPTE_TABLE_SIZE(psize),
						0,
						NULL);
			if (!huge_pgtable_cache(psize))
				panic("hugetlbpage_init(): could not create %s\n",
				      HUGEPTE_CACHE_NAME(psize));
		}
	}

	return 0;
}

module_init(hugetlbpage_init);