/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

#define NUM_LOW_AREAS		(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS		(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
#define MAX_NUMBER_GPAGES	1024

/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready. */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;

/* Array of valid huge page sizes - a non-zero value (hugepte_shift) is
 * stored for the huge page sizes that are valid.
 */
unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */

#define hugepte_shift			mmu_huge_psizes
#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])

#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
					 + hugepte_shift[psize])
#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
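/*
 * Worked example (a sketch; the numbers follow from the definitions above
 * and from set_huge_psize() below): with a 4K base page, 16M hugepages keep
 * their hugepte table at the PUD level, so hugepte_shift[MMU_PAGE_16M] is
 * PUD_SHIFT - PAGE_SHIFT_16M.  HUGEPD_SHIFT() then works out to PUD_SHIFT,
 * i.e. one hugepte table covers exactly the range a normal PUD entry would,
 * holding PTRS_PER_HUGEPTE() consecutive 16M ptes.
 */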
/* Subtract one from array size because we don't need a cache for 4K since
 * it is not a huge page size */
#define HUGE_PGTABLE_INDEX(psize)	(HUGEPTE_CACHE_NUM + psize - 1)
#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])

static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
	"unused_4K", "hugepte_cache_64K", "unused_64K_AP",
	"hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
};

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early.
 */
#define HUGEPD_OK	0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)

static inline int shift_to_mmu_psize(unsigned int shift)
{
	switch (shift) {
#ifndef CONFIG_PPC_64K_PAGES
	case PAGE_SHIFT_64K:
		return MMU_PAGE_64K;
#endif
	case PAGE_SHIFT_16M:
		return MMU_PAGE_16M;
	case PAGE_SHIFT_16G:
		return MMU_PAGE_16G;
	}
	return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	if (mmu_psize_defs[mmu_psize].shift)
		return mmu_psize_defs[mmu_psize].shift;
	BUG();
}

static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!(hpd.pd & HUGEPD_OK));
	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
				    struct hstate *hstate)
{
	unsigned int shift = huge_page_shift(hstate);
	int psize = shift_to_mmu_psize(shift);
	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}

static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int psize)
{
	pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
				       GFP_KERNEL|__GFP_REPEAT);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
	else
		hpdp->pd = (unsigned long)new | HUGEPD_OK;
	spin_unlock(&mm->page_table_lock);
	return 0;
}


static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
{
	if (huge_page_shift(hstate) < PUD_SHIFT)
		return pud_offset(pgd, addr);
	else
		return (pud_t *) pgd;
}

static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
			 struct hstate *hstate)
{
	if (huge_page_shift(hstate) < PUD_SHIFT)
		return pud_alloc(mm, pgd, addr);
	else
		return (pud_t *) pgd;
}

static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
{
	if (huge_page_shift(hstate) < PMD_SHIFT)
		return pmd_offset(pud, addr);
	else
		return (pmd_t *) pud;
}

static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
			 struct hstate *hstate)
{
	if (huge_page_shift(hstate) < PMD_SHIFT)
		return pmd_alloc(mm, pud, addr);
	else
		return (pmd_t *) pud;
}
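/*
 * Note on levels (see set_huge_psize() below): 64K hugepages (only allowed
 * with a 4K base page) and 16M hugepages on a 64K base page hang their
 * hugepte table off a PMD entry; 16M hugepages on a 4K base page use a PUD
 * entry; 16G pages always sit at the PGD.  The helpers above simply skip
 * whichever levels a given hugepage size folds away.
 */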
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
	       unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;

	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}


/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;

	unsigned int psize;
	unsigned int shift;
	unsigned long sz;
	struct hstate *hstate;

	psize = get_slice_psize(mm, addr);
	shift = mmu_psize_to_shift(psize);
	sz = ((1UL) << shift);
	hstate = size_to_hstate(sz);

	addr &= hstate->mask;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = hpud_offset(pg, addr, hstate);
		if (!pud_none(*pu)) {
			pm = hpmd_offset(pu, addr, hstate);
			if (!pmd_none(*pm))
				return hugepte_offset((hugepd_t *)pm, addr,
						      hstate);
		}
	}

	return NULL;
}

pte_t *huge_pte_alloc(struct mm_struct *mm,
		      unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	struct hstate *hstate;
	unsigned int psize;

	hstate = size_to_hstate(sz);

	psize = get_slice_psize(mm, addr);
	BUG_ON(!mmu_huge_psizes[psize]);

	addr &= hstate->mask;

	pg = pgd_offset(mm, addr);
	pu = hpud_alloc(mm, pg, addr, hstate);

	if (pu) {
		pm = hpmd_alloc(mm, pu, addr, hstate);
		if (pm)
			hpdp = (hugepd_t *)pm;
	}

	if (!hpdp)
		return NULL;

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
		return NULL;

	return hugepte_offset(hpdp, addr, hstate);
}

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}
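/*
 * Illustrative call sequence (a sketch based on the generic hugetlb fault
 * path, not code from this file): on a fault in a hugepage VMA the generic
 * code does roughly
 *
 *	ptep = huge_pte_alloc(mm, address, huge_page_size(hstate));
 *	...
 *	set_huge_pte_at(mm, address, ptep, new_pte);
 *
 * so huge_pte_alloc() must build the hugepte table on demand, while
 * huge_pte_offset() is the lookup-only variant used by hash_huge_page()
 * and follow_huge_addr() below.
 */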
static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
			       unsigned int psize)
{
	pte_t *hugepte = hugepd_page(*hpdp);

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
						 HUGEPTE_CACHE_NUM+psize-1,
						 PGF_CACHENUM_MASK));
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling,
				   unsigned int psize)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;
	unsigned int shift;
	unsigned int psize = get_slice_psize(tlb->mm, addr);

	shift = mmu_psize_to_shift(psize);

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (shift < PMD_SHIFT) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling, psize);
		} else {
			if (pud_none(*pud))
				continue;
			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
		}
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}
/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * Comments below are taken from the normal free_pgd_range().  They
	 * apply here too.  The tests against HUGEPD_MASK below are
	 * essential, because we *don't* test for this at the bottom
	 * level.  Without them we'll attempt to free a hugepte table
	 * when we unmap just part of it, even if there are other
	 * active mappings using it.
	 *
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing HUGEPD* at this top level?  Because
	 * often there will be no work to do at all, and we'd prefer
	 * not to go all the way down to the bottom just to discover
	 * that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we
	 * must be careful to reject "the opposite 0" before it
	 * confuses the subsequent tests.  But what about where end is
	 * brought down by HUGEPD_SIZE below?  no, end can't go down to
	 * 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */
	unsigned int psize = get_slice_psize(tlb->mm, addr);

	addr &= HUGEPD_MASK(psize);
	if (addr < floor) {
		addr += HUGEPD_SIZE(psize);
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= HUGEPD_MASK(psize);
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= HUGEPD_SIZE(psize);
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset(tlb->mm, addr);
	do {
		psize = get_slice_psize(tlb->mm, addr);
		BUG_ON(!mmu_huge_psizes[psize]);
		next = pgd_addr_end(addr, end);
		if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next,
					       floor, ceiling);
		} else {
			if (pgd_none(*pgd))
				continue;
			free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
		}
	} while (pgd++, addr = next, addr != end);
}

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_need_flush (huge / !huge).  Might not be
		 * necessary anymore if we make hpte_need_flush() get the
		 * page size from the slices.
		 */
		unsigned int psize = get_slice_psize(mm, addr);
		unsigned int shift = mmu_psize_to_shift(psize);
		unsigned long sz = ((1UL) << shift);
		struct hstate *hstate = size_to_hstate(sz);

		pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
	return __pte(old);
}
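/*
 * Note (an assumption about pte_update()'s signature, which lives outside
 * this file): the ~0UL clear mask wipes every PTE bit and returns the old
 * value, and the trailing 1 is the "huge" flag, so hpte_need_flush() knows
 * to flush a hugepage-sized HPTE rather than a base-page one -- which is
 * exactly why set_huge_pte_at() open-codes pte_clear() above.
 */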
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned int mmu_psize = get_slice_psize(mm, address);

	/* Verify it is a huge page else bail. */
	if (!mmu_huge_psizes[mmu_psize])
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page) {
		unsigned int shift = mmu_psize_to_shift(mmu_psize);
		unsigned long sz = ((1UL) << shift);
		page += (address % sz) / PAGE_SIZE;
	}

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}


unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	if (!mmu_huge_psizes[mmu_psize])
		return -EINVAL;
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

	return 1UL << mmu_psize_to_shift(psize);
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap,
						  unsigned long sz)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			for (i = 0; i < (sz / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}
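/*
 * Background note (general powerpc convention rather than anything defined
 * in this file): PG_arch_1 records that a page's icache is coherent with
 * its dcache, and trap 0x400 is the instruction storage (execute) fault.
 * So the helper above only pays for the dcache/icache flush when the access
 * is actually an instruction fetch; for data accesses it just marks the
 * HPTE no-execute (HPTE_R_N) and defers the flush.
 */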
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa, sz;
	long slot;
	int err = 1;
	int ssize = user_segment_size(ea);
	unsigned int mmu_psize;
	int shift;

	mmu_psize = get_slice_psize(mm, ea);

	if (!mmu_huge_psizes[mmu_psize])
		goto out;
	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = hpt_va(ea, vsid, ssize);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
					  old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	shift = mmu_psize_to_shift(mmu_psize);
	sz = ((1UL) << shift);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has hugepages but lacks no-execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap, sz);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, shift, ssize);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
					 ssize, local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, shift, ssize);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot information in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
		/* Add in WIMG bits */
		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
				      _PAGE_COHERENT | _PAGE_GUARDED));

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_psize, ssize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_psize, ssize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP) & ~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

out:
	return err;
}
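/*
 * Design note: the chosen HPTE slot is folded back into the Linux PTE
 * (_PAGE_F_SECOND records primary vs. secondary hash, _PAGE_F_GIX the
 * index within the group), which is what lets the "case 2" path above
 * find and update the HPTE later without re-searching the hash table.
 */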
static void __init set_huge_psize(int psize)
{
	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable limits. */
	if (mmu_psize_defs[psize].shift &&
	    mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
	    (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
	     mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
	     mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
		/* Return if huge page size has already been set up or is the
		 * same as the base page size. */
		if (mmu_huge_psizes[psize] ||
		    mmu_psize_defs[psize].shift == PAGE_SHIFT)
			return;
		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);

		switch (mmu_psize_defs[psize].shift) {
		case PAGE_SHIFT_64K:
			/* We only allow 64k hpages with 4k base page,
			 * which was checked above, and always put them
			 * at the PMD */
			hugepte_shift[psize] = PMD_SHIFT;
			break;
		case PAGE_SHIFT_16M:
			/* 16M pages can be at two different levels
			 * of page tables based on base page size */
			if (PAGE_SHIFT == PAGE_SHIFT_64K)
				hugepte_shift[psize] = PMD_SHIFT;
			else /* 4k base page */
				hugepte_shift[psize] = PUD_SHIFT;
			break;
		case PAGE_SHIFT_16G:
			/* 16G pages are always at PGD level */
			hugepte_shift[psize] = PGDIR_SHIFT;
			break;
		}
		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
	} else
		hugepte_shift[psize] = 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;
	int mmu_psize;
	int shift;

	size = memparse(str, &str);

	shift = __ffs(size);
	mmu_psize = shift_to_mmu_psize(shift);
	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
		set_huge_psize(mmu_psize);
	else
		printk(KERN_WARNING "Invalid huge page size specified (%llu)\n",
		       size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

static int __init hugetlbpage_init(void)
{
	unsigned int psize;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -ENODEV;

	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
	 * sizes changes.
	 */
	set_huge_psize(MMU_PAGE_16M);
	set_huge_psize(MMU_PAGE_16G);

	/* Temporarily disable support for 64K huge pages when 64K SPU local
	 * store support is enabled as the current implementation conflicts.
	 */
#ifndef CONFIG_SPU_FS_64K_LS
	set_huge_psize(MMU_PAGE_64K);
#endif

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		if (mmu_huge_psizes[psize]) {
			pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
				kmem_cache_create(HUGEPTE_CACHE_NAME(psize),
						  HUGEPTE_TABLE_SIZE(psize),
						  HUGEPTE_TABLE_SIZE(psize),
						  0, NULL);
			if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
				panic("hugetlbpage_init(): could not create %s\n",
				      HUGEPTE_CACHE_NAME(psize));
		}
	}

	return 0;
}

module_init(hugetlbpage_init);
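/*
 * Usage sketch (illustrative, relying only on the __setup hook above):
 * booting with "hugepagesz=16M" makes hugepage_setup_sz() run memparse()
 * on the argument, map shift 24 to MMU_PAGE_16M via shift_to_mmu_psize(),
 * and register the size through set_huge_psize(), just as
 * hugetlbpage_init() does for the default sizes.
 */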