/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

#define PAGE_SHIFT_64K  16
#define PAGE_SHIFT_16M  24
#define PAGE_SHIFT_16G  34

#define NUM_LOW_AREAS   (0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS  (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
#define MAX_NUMBER_GPAGES       1024

/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready. */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;

/* Array of valid huge page sizes - a non-zero value (hugepte_shift) is
 * stored for the huge page sizes that are valid.
 */
unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */

#define hugepte_shift                   mmu_huge_psizes
#define PTRS_PER_HUGEPTE(psize)         (1 << hugepte_shift[psize])
#define HUGEPTE_TABLE_SIZE(psize)       (sizeof(pte_t) << hugepte_shift[psize])

#define HUGEPD_SHIFT(psize)             (mmu_psize_to_shift(psize) \
                                         + hugepte_shift[psize])
#define HUGEPD_SIZE(psize)              (1UL << HUGEPD_SHIFT(psize))
#define HUGEPD_MASK(psize)              (~(HUGEPD_SIZE(psize)-1))

/* Subtract one from array size because we don't need a cache for 4K since
 * it is not a huge page size */
#define huge_pgtable_cache(psize)       (pgtable_cache[HUGEPTE_CACHE_NUM \
                                                        + psize-1])
#define HUGEPTE_CACHE_NAME(psize)       (huge_pgtable_cache_name[psize])

static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
        "unused_4K", "hugepte_cache_64K", "unused_64K_AP",
        "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
};

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early.
 */
#define HUGEPD_OK       0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)        ((hpd).pd == 0)
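/*
 * Worked example (illustrative, assuming a 4K base page size with 16M huge
 * pages): set_huge_psize() below sets hugepte_shift[MMU_PAGE_16M] to
 * PUD_SHIFT - PAGE_SHIFT_16M, so HUGEPD_SHIFT() evaluates to PUD_SHIFT and
 * one hugepte table covers exactly the range mapped by a PUD entry.  That
 * PUD entry is then treated as a hugepd_t: a pointer to the hugepte table
 * with the HUGEPD_OK bit set, indexed by hugepte_offset() below.
 */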
static inline int shift_to_mmu_psize(unsigned int shift)
{
        switch (shift) {
#ifndef CONFIG_PPC_64K_PAGES
        case PAGE_SHIFT_64K:
                return MMU_PAGE_64K;
#endif
        case PAGE_SHIFT_16M:
                return MMU_PAGE_16M;
        case PAGE_SHIFT_16G:
                return MMU_PAGE_16G;
        }
        return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
        if (mmu_psize_defs[mmu_psize].shift)
                return mmu_psize_defs[mmu_psize].shift;
        BUG();
}

static inline pte_t *hugepd_page(hugepd_t hpd)
{
        BUG_ON(!(hpd.pd & HUGEPD_OK));
        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
                                    struct hstate *hstate)
{
        unsigned int shift = huge_page_shift(hstate);
        int psize = shift_to_mmu_psize(shift);
        unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
        pte_t *dir = hugepd_page(*hpdp);

        return dir + idx;
}

static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address, unsigned int psize)
{
        pte_t *new = kmem_cache_alloc(huge_pgtable_cache(psize),
                                      GFP_KERNEL|__GFP_REPEAT);

        if (!new)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!hugepd_none(*hpdp))
                kmem_cache_free(huge_pgtable_cache(psize), new);
        else
                hpdp->pd = (unsigned long)new | HUGEPD_OK;
        spin_unlock(&mm->page_table_lock);
        return 0;
}

/* Base page size affects how we walk hugetlb page tables */
#ifdef CONFIG_PPC_64K_PAGES
#define hpmd_offset(pud, addr, h)       pmd_offset(pud, addr)
#define hpmd_alloc(mm, pud, addr, h)    pmd_alloc(mm, pud, addr)
#else
static inline
pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
{
        if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
                return pmd_offset(pud, addr);
        else
                return (pmd_t *) pud;
}
static inline
pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
                  struct hstate *hstate)
{
        if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
                return pmd_alloc(mm, pud, addr);
        else
                return (pmd_t *) pud;
}
#endif

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
               unsigned long number_of_pages)
{
        if (!addr)
                return;
        while (number_of_pages > 0) {
                gpage_freearray[nr_gpages] = addr;
                nr_gpages++;
                number_of_pages--;
                addr += page_size;
        }
}
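/*
 * Illustrative use (caller and values assumed): early boot code that finds
 * a reserved block of two 16G pages at physical address 'addr' would record
 * them with
 *      add_gpage(addr, 1UL << PAGE_SHIFT_16G, 2);
 * so that alloc_bootmem_huge_page() below can hand them to the generic
 * hugetlb boot-time allocator one at a time.
 */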
/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
        struct huge_bootmem_page *m;
        if (nr_gpages == 0)
                return 0;
        m = phys_to_virt(gpage_freearray[--nr_gpages]);
        gpage_freearray[nr_gpages] = 0;
        list_add(&m->list, &huge_boot_pages);
        m->hstate = hstate;
        return 1;
}


/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;

        unsigned int psize;
        unsigned int shift;
        unsigned long sz;
        struct hstate *hstate;
        psize = get_slice_psize(mm, addr);
        shift = mmu_psize_to_shift(psize);
        sz = ((1UL) << shift);
        hstate = size_to_hstate(sz);

        addr &= hstate->mask;

        pg = pgd_offset(mm, addr);
        if (!pgd_none(*pg)) {
                pu = pud_offset(pg, addr);
                if (!pud_none(*pu)) {
                        pm = hpmd_offset(pu, addr, hstate);
                        if (!pmd_none(*pm))
                                return hugepte_offset((hugepd_t *)pm, addr,
                                                      hstate);
                }
        }

        return NULL;
}

pte_t *huge_pte_alloc(struct mm_struct *mm,
                      unsigned long addr, unsigned long sz)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        struct hstate *hstate;
        unsigned int psize;
        hstate = size_to_hstate(sz);

        psize = get_slice_psize(mm, addr);
        BUG_ON(!mmu_huge_psizes[psize]);

        addr &= hstate->mask;

        pg = pgd_offset(mm, addr);
        pu = pud_alloc(mm, pg, addr);

        if (pu) {
                pm = hpmd_alloc(mm, pu, addr, hstate);
                if (pm)
                        hpdp = (hugepd_t *)pm;
        }

        if (!hpdp)
                return NULL;

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
                return NULL;

        return hugepte_offset(hpdp, addr, hstate);
}

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        return 0;
}
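/*
 * Tear-down helpers.  These mirror the generic free_pmd_range()/
 * free_pud_range() walk, except that an entry at this level may really be
 * a hugepd pointing at a hugepte table, which free_hugepte_range() hands
 * back to its kmem cache.  The free is deferred through the mmu_gather
 * (pgtable_free_tlb()) rather than done immediately, so the table is not
 * released before the associated TLB/hash flush.
 */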
static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
                               unsigned int psize)
{
        pte_t *hugepte = hugepd_page(*hpdp);

        hpdp->pd = 0;
        tlb->need_flush = 1;
        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
                                                 HUGEPTE_CACHE_NUM+psize-1,
                                                 PGF_CACHENUM_MASK));
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling,
                                   unsigned int psize)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
        } while (pmd++, addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;
        unsigned int shift;
        unsigned int psize = get_slice_psize(tlb->mm, addr);
        shift = mmu_psize_to_shift(psize);

        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
                if (pud_none_or_clear_bad(pud))
                        continue;
                hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling,
                                       psize);
#else
                if (shift == PAGE_SHIFT_64K) {
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
                                               ceiling, psize);
                } else {
                        if (pud_none(*pud))
                                continue;
                        free_hugepte_range(tlb, (hugepd_t *)pud, psize);
                }
#endif
        } while (pud++, addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long start;

        /*
         * Comments below taken from the normal free_pgd_range().  They
         * apply here too.  The tests against HUGEPD_MASK below are
         * essential, because we *don't* test for this at the bottom
         * level.  Without them we'll attempt to free a hugepte table
         * when we unmap just part of it, even if there are other
         * active mappings using it.
         *
         * The next few lines have given us lots of grief...
         *
         * Why are we testing HUGEPD* at this top level?  Because
         * often there will be no work to do at all, and we'd prefer
         * not to go all the way down to the bottom just to discover
         * that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we
         * must be careful to reject "the opposite 0" before it
         * confuses the subsequent tests.  But what about where end is
         * brought down by HUGEPD_SIZE below?  No, end can't go down to
         * 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */
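        /*
         * Worked example (numbers are illustrative only): pretend
         * HUGEPD_SIZE is 0x40000000 and we unmap a single huge page at
         * [0x40000000, 0x41000000) while a neighbouring hugetlb VMA still
         * starts at 0x41000000, so ceiling == 0x41000000 and floor is at
         * or below 0x40000000.  ceiling rounds down to 0x40000000, so
         * "end - 1 > ceiling - 1" pulls end back by HUGEPD_SIZE, and then
         * "addr > end - 1" makes us return without freeing the hugepte
         * table the neighbour is still using.
         */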
        unsigned int psize = get_slice_psize(tlb->mm, addr);

        addr &= HUGEPD_MASK(psize);
        if (addr < floor) {
                addr += HUGEPD_SIZE(psize);
                if (!addr)
                        return;
        }
        if (ceiling) {
                ceiling &= HUGEPD_MASK(psize);
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                end -= HUGEPD_SIZE(psize);
        if (addr > end - 1)
                return;

        start = addr;
        pgd = pgd_offset(tlb->mm, addr);
        do {
                psize = get_slice_psize(tlb->mm, addr);
                BUG_ON(!mmu_huge_psizes[psize]);
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
{
        if (pte_present(*ptep)) {
                /* We open-code pte_clear because we need to pass the right
                 * argument to hpte_need_flush (huge / !huge).  Might not be
                 * necessary anymore if we make hpte_need_flush() get the
                 * page size from the slices
                 */
                unsigned int psize = get_slice_psize(mm, addr);
                unsigned int shift = mmu_psize_to_shift(psize);
                unsigned long sz = ((1UL) << shift);
                struct hstate *hstate = size_to_hstate(sz);
                pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
        }
        *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}
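/*
 * Clear a huge PTE and return its previous value.  The final "1" argument
 * to pte_update() flags this as a huge mapping so that any hash PTE flush
 * is done with the huge page size rather than the base page size.
 */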
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
        return __pte(old);
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;
        unsigned int mmu_psize = get_slice_psize(mm, address);

        /* Verify it is a huge page else bail. */
        if (!mmu_huge_psizes[mmu_psize])
                return ERR_PTR(-EINVAL);

        ptep = huge_pte_offset(mm, address);
        page = pte_page(*ptep);
        if (page) {
                unsigned int shift = mmu_psize_to_shift(mmu_psize);
                unsigned long sz = ((1UL) << shift);
                page += (address % sz) / PAGE_SIZE;
        }

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

int pud_huge(pud_t pud)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}


unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        struct hstate *hstate = hstate_file(file);
        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
                                                  pte_t pte, int trap,
                                                  unsigned long sz)
{
        struct page *page;
        int i;

        if (!pfn_valid(pte_pfn(pte)))
                return rflags;

        page = pte_page(pte);

        /* page is dirty */
        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
                        for (i = 0; i < (sz / PAGE_SIZE); i++)
                                __flush_dcache_icache(page_address(page+i));
                        set_bit(PG_arch_1, &page->flags);
                } else {
                        rflags |= HPTE_R_N;
                }
        }
        return rflags;
}
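/*
 * Hash-fault handler for huge pages, called from the low-level hash miss
 * path (hash_page()) when the faulting slice uses a huge page size.  It
 * follows the same pattern as __hash_page_*(): find the Linux PTE, check
 * access, atomically mark it BUSY and ACCESSED, then insert or update the
 * corresponding hardware hash table entry.
 */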
int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local,
                   unsigned long trap)
{
        pte_t *ptep;
        unsigned long old_pte, new_pte;
        unsigned long va, rflags, pa, sz;
        long slot;
        int err = 1;
        int ssize = user_segment_size(ea);
        unsigned int mmu_psize;
        int shift;
        mmu_psize = get_slice_psize(mm, ea);

        if (!mmu_huge_psizes[mmu_psize])
                goto out;
        ptep = huge_pte_offset(mm, ea);

        /* Search the Linux page table for a match with va */
        va = hpt_va(ea, vsid, ssize);

        /*
         * If no pte found or not present, send the problem up to
         * do_page_fault
         */
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;

        /*
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
         */
        if (unlikely(access & ~pte_val(*ptep)))
                goto out;
        /*
         * At this point, we have a pte (old_pte) which can be used to build
         * or update an HPTE.  There are 2 cases:
         *
         * 1. There is a valid (present) pte with no associated HPTE (this is
         *      the most common case)
         * 2. There is a valid (present) pte with an associated HPTE.  The
         *      current values of the pp bits in the HPTE prevent access
         *      because we are doing software DIRTY bit management and the
         *      page is currently not DIRTY.
         */


        do {
                old_pte = pte_val(*ptep);
                if (old_pte & _PAGE_BUSY)
                        goto out;
                new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
        } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
                                          old_pte, new_pte));

        rflags = 0x2 | (!(new_pte & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
        rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
        shift = mmu_psize_to_shift(mmu_psize);
        sz = ((1UL) << shift);
        if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
                /* No CPU has hugepages but lacks no-execute, so we
                 * don't need to worry about that case */
                rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
                                                       trap, sz);

        /* Check if pte already has an hpte (case 2) */
        if (unlikely(old_pte & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;

                hash = hpt_hash(va, shift, ssize);
                if (old_pte & _PAGE_F_SECOND)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += (old_pte & _PAGE_F_GIX) >> 12;

                if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
                                         ssize, local) == -1)
                        old_pte &= ~_PAGE_HPTEFLAGS;
        }

        if (likely(!(old_pte & _PAGE_HASHPTE))) {
                unsigned long hash = hpt_hash(va, shift, ssize);
                unsigned long hpte_group;

                pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;

                /* clear HPTE slot information in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
                /* Add in WIMG bits */
                rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
                                      _PAGE_COHERENT | _PAGE_GUARDED));

                /* Insert into the hash table, primary slot */
                slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
                                          mmu_psize, ssize);

                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL;
                        slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
                                                  HPTE_V_SECONDARY,
                                                  mmu_psize, ssize);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) *
                                                      HPTES_PER_GROUP) & ~0x7UL;

                                ppc_md.hpte_remove(hpte_group);
                                goto repeat;
                        }
                }

                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");

                new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
        }

        /*
         * No need to use ldarx/stdcx here
         */
        *ptep = __pte(new_pte & ~_PAGE_BUSY);

        err = 0;

 out:
        return err;
}
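/*
 * Setup.  set_huge_psize() validates and enables one huge page size; it is
 * called for 16M/64K/16G from hugetlbpage_init() below and from the
 * "hugepagesz=" command line option (hugepage_setup_sz()).  For example,
 * "hugepagesz=16M" parses to a size whose __ffs() is 24, selecting
 * MMU_PAGE_16M.
 */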
void set_huge_psize(int psize)
{
        /* Check that it is a page size supported by the hardware and
         * that it fits within pagetable limits. */
        if (mmu_psize_defs[psize].shift &&
            mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
            (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
             mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
             mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
                /* Return if huge page size has already been set up or is the
                 * same as the base page size. */
                if (mmu_huge_psizes[psize] ||
                    mmu_psize_defs[psize].shift == PAGE_SHIFT)
                        return;
                hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);

                switch (mmu_psize_defs[psize].shift) {
                case PAGE_SHIFT_64K:
                        /* We only allow 64k hpages with 4k base page,
                         * which was checked above, and always put them
                         * at the PMD */
                        hugepte_shift[psize] = PMD_SHIFT;
                        break;
                case PAGE_SHIFT_16M:
                        /* 16M pages can be at two different levels
                         * of page tables based on base page size */
                        if (PAGE_SHIFT == PAGE_SHIFT_64K)
                                hugepte_shift[psize] = PMD_SHIFT;
                        else /* 4k base page */
                                hugepte_shift[psize] = PUD_SHIFT;
                        break;
                case PAGE_SHIFT_16G:
                        /* 16G pages are always at PGD level */
                        hugepte_shift[psize] = PGDIR_SHIFT;
                        break;
                }
                hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
        } else
                hugepte_shift[psize] = 0;
}

static int __init hugepage_setup_sz(char *str)
{
        unsigned long long size;
        int mmu_psize;
        int shift;

        size = memparse(str, &str);

        shift = __ffs(size);
        mmu_psize = shift_to_mmu_psize(shift);
        if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
                set_huge_psize(mmu_psize);
        else
                printk(KERN_WARNING "Invalid huge page size specified(%llu)\n",
                       size);

        return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

static void zero_ctor(struct kmem_cache *cache, void *addr)
{
        memset(addr, 0, kmem_cache_size(cache));
}

static int __init hugetlbpage_init(void)
{
        unsigned int psize;

        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -ENODEV;
        /* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
         * and adjust PTE_NONCACHE_NUM if the number of supported huge page
         * sizes changes.
         */
        set_huge_psize(MMU_PAGE_16M);
        set_huge_psize(MMU_PAGE_64K);
        set_huge_psize(MMU_PAGE_16G);

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                if (mmu_huge_psizes[psize]) {
                        huge_pgtable_cache(psize) = kmem_cache_create(
                                                HUGEPTE_CACHE_NAME(psize),
                                                HUGEPTE_TABLE_SIZE(psize),
                                                HUGEPTE_TABLE_SIZE(psize),
                                                0,
                                                zero_ctor);
                        if (!huge_pgtable_cache(psize))
                                panic("hugetlbpage_init(): could not create %s\n",
                                      HUGEPTE_CACHE_NAME(psize));
                }
        }

        return 0;
}

module_init(hugetlbpage_init);