1 /* 2 * PPC64 (POWER4) Huge TLB Page Support for Kernel. 3 * 4 * Copyright (C) 2003 David Gibson, IBM Corporation. 5 * 6 * Based on the IA-32 version: 7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 8 */ 9 10 #include <linux/mm.h> 11 #include <linux/io.h> 12 #include <linux/slab.h> 13 #include <linux/hugetlb.h> 14 #include <asm/pgtable.h> 15 #include <asm/pgalloc.h> 16 #include <asm/tlb.h> 17 18 #define PAGE_SHIFT_64K 16 19 #define PAGE_SHIFT_16M 24 20 #define PAGE_SHIFT_16G 34 21 22 #define MAX_NUMBER_GPAGES 1024 23 24 /* Tracks the 16G pages after the device tree is scanned and before the 25 * huge_boot_pages list is ready. */ 26 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; 27 static unsigned nr_gpages; 28 29 /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() 30 * will choke on pointers to hugepte tables, which is handy for 31 * catching screwups early. */ 32 33 static inline int shift_to_mmu_psize(unsigned int shift) 34 { 35 int psize; 36 37 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) 38 if (mmu_psize_defs[psize].shift == shift) 39 return psize; 40 return -1; 41 } 42 43 static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) 44 { 45 if (mmu_psize_defs[mmu_psize].shift) 46 return mmu_psize_defs[mmu_psize].shift; 47 BUG(); 48 } 49 50 #define hugepd_none(hpd) ((hpd).pd == 0) 51 52 static inline pte_t *hugepd_page(hugepd_t hpd) 53 { 54 BUG_ON(!hugepd_ok(hpd)); 55 return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000); 56 } 57 58 static inline unsigned int hugepd_shift(hugepd_t hpd) 59 { 60 return hpd.pd & HUGEPD_SHIFT_MASK; 61 } 62 63 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) 64 { 65 unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); 66 pte_t *dir = hugepd_page(*hpdp); 67 68 return dir + idx; 69 } 70 71 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) 72 { 73 pgd_t *pg; 74 pud_t *pu; 75 pmd_t *pm; 76 hugepd_t *hpdp = NULL; 77 unsigned pdshift = PGDIR_SHIFT; 78 79 if (shift) 80 *shift = 0; 81 82 pg = pgdir + pgd_index(ea); 83 if (is_hugepd(pg)) { 84 hpdp = (hugepd_t *)pg; 85 } else if (!pgd_none(*pg)) { 86 pdshift = PUD_SHIFT; 87 pu = pud_offset(pg, ea); 88 if (is_hugepd(pu)) 89 hpdp = (hugepd_t *)pu; 90 else if (!pud_none(*pu)) { 91 pdshift = PMD_SHIFT; 92 pm = pmd_offset(pu, ea); 93 if (is_hugepd(pm)) 94 hpdp = (hugepd_t *)pm; 95 else if (!pmd_none(*pm)) { 96 return pte_offset_map(pm, ea); 97 } 98 } 99 } 100 101 if (!hpdp) 102 return NULL; 103 104 if (shift) 105 *shift = hugepd_shift(*hpdp); 106 return hugepte_offset(hpdp, ea, pdshift); 107 } 108 109 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 110 { 111 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); 112 } 113 114 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 115 unsigned long address, unsigned pdshift, unsigned pshift) 116 { 117 pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift), 118 GFP_KERNEL|__GFP_REPEAT); 119 120 BUG_ON(pshift > HUGEPD_SHIFT_MASK); 121 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); 122 123 if (! new) 124 return -ENOMEM; 125 126 spin_lock(&mm->page_table_lock); 127 if (!hugepd_none(*hpdp)) 128 kmem_cache_free(PGT_CACHE(pdshift - pshift), new); 129 else 130 hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift; 131 spin_unlock(&mm->page_table_lock); 132 return 0; 133 } 134 135 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) 136 { 137 pgd_t *pg; 138 pud_t *pu; 139 pmd_t *pm; 140 hugepd_t *hpdp = NULL; 141 unsigned pshift = __ffs(sz); 142 unsigned pdshift = PGDIR_SHIFT; 143 144 addr &= ~(sz-1); 145 146 pg = pgd_offset(mm, addr); 147 if (pshift >= PUD_SHIFT) { 148 hpdp = (hugepd_t *)pg; 149 } else { 150 pdshift = PUD_SHIFT; 151 pu = pud_alloc(mm, pg, addr); 152 if (pshift >= PMD_SHIFT) { 153 hpdp = (hugepd_t *)pu; 154 } else { 155 pdshift = PMD_SHIFT; 156 pm = pmd_alloc(mm, pu, addr); 157 hpdp = (hugepd_t *)pm; 158 } 159 } 160 161 if (!hpdp) 162 return NULL; 163 164 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); 165 166 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) 167 return NULL; 168 169 return hugepte_offset(hpdp, addr, pdshift); 170 } 171 172 /* Build list of addresses of gigantic pages. This function is used in early 173 * boot before the buddy or bootmem allocator is setup. 174 */ 175 void add_gpage(unsigned long addr, unsigned long page_size, 176 unsigned long number_of_pages) 177 { 178 if (!addr) 179 return; 180 while (number_of_pages > 0) { 181 gpage_freearray[nr_gpages] = addr; 182 nr_gpages++; 183 number_of_pages--; 184 addr += page_size; 185 } 186 } 187 188 /* Moves the gigantic page addresses from the temporary list to the 189 * huge_boot_pages list. 190 */ 191 int alloc_bootmem_huge_page(struct hstate *hstate) 192 { 193 struct huge_bootmem_page *m; 194 if (nr_gpages == 0) 195 return 0; 196 m = phys_to_virt(gpage_freearray[--nr_gpages]); 197 gpage_freearray[nr_gpages] = 0; 198 list_add(&m->list, &huge_boot_pages); 199 m->hstate = hstate; 200 return 1; 201 } 202 203 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 204 { 205 return 0; 206 } 207 208 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, 209 unsigned long start, unsigned long end, 210 unsigned long floor, unsigned long ceiling) 211 { 212 pte_t *hugepte = hugepd_page(*hpdp); 213 unsigned shift = hugepd_shift(*hpdp); 214 unsigned long pdmask = ~((1UL << pdshift) - 1); 215 216 start &= pdmask; 217 if (start < floor) 218 return; 219 if (ceiling) { 220 ceiling &= pdmask; 221 if (! ceiling) 222 return; 223 } 224 if (end - 1 > ceiling - 1) 225 return; 226 227 hpdp->pd = 0; 228 tlb->need_flush = 1; 229 pgtable_free_tlb(tlb, hugepte, pdshift - shift); 230 } 231 232 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 233 unsigned long addr, unsigned long end, 234 unsigned long floor, unsigned long ceiling) 235 { 236 pmd_t *pmd; 237 unsigned long next; 238 unsigned long start; 239 240 start = addr; 241 pmd = pmd_offset(pud, addr); 242 do { 243 next = pmd_addr_end(addr, end); 244 if (pmd_none(*pmd)) 245 continue; 246 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, 247 addr, next, floor, ceiling); 248 } while (pmd++, addr = next, addr != end); 249 250 start &= PUD_MASK; 251 if (start < floor) 252 return; 253 if (ceiling) { 254 ceiling &= PUD_MASK; 255 if (!ceiling) 256 return; 257 } 258 if (end - 1 > ceiling - 1) 259 return; 260 261 pmd = pmd_offset(pud, start); 262 pud_clear(pud); 263 pmd_free_tlb(tlb, pmd, start); 264 } 265 266 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 267 unsigned long addr, unsigned long end, 268 unsigned long floor, unsigned long ceiling) 269 { 270 pud_t *pud; 271 unsigned long next; 272 unsigned long start; 273 274 start = addr; 275 pud = pud_offset(pgd, addr); 276 do { 277 next = pud_addr_end(addr, end); 278 if (!is_hugepd(pud)) { 279 if (pud_none_or_clear_bad(pud)) 280 continue; 281 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 282 ceiling); 283 } else { 284 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, 285 addr, next, floor, ceiling); 286 } 287 } while (pud++, addr = next, addr != end); 288 289 start &= PGDIR_MASK; 290 if (start < floor) 291 return; 292 if (ceiling) { 293 ceiling &= PGDIR_MASK; 294 if (!ceiling) 295 return; 296 } 297 if (end - 1 > ceiling - 1) 298 return; 299 300 pud = pud_offset(pgd, start); 301 pgd_clear(pgd); 302 pud_free_tlb(tlb, pud, start); 303 } 304 305 /* 306 * This function frees user-level page tables of a process. 307 * 308 * Must be called with pagetable lock held. 309 */ 310 void hugetlb_free_pgd_range(struct mmu_gather *tlb, 311 unsigned long addr, unsigned long end, 312 unsigned long floor, unsigned long ceiling) 313 { 314 pgd_t *pgd; 315 unsigned long next; 316 317 /* 318 * Because there are a number of different possible pagetable 319 * layouts for hugepage ranges, we limit knowledge of how 320 * things should be laid out to the allocation path 321 * (huge_pte_alloc(), above). Everything else works out the 322 * structure as it goes from information in the hugepd 323 * pointers. That means that we can't here use the 324 * optimization used in the normal page free_pgd_range(), of 325 * checking whether we're actually covering a large enough 326 * range to have to do anything at the top level of the walk 327 * instead of at the bottom. 328 * 329 * To make sense of this, you should probably go read the big 330 * block comment at the top of the normal free_pgd_range(), 331 * too. 332 */ 333 334 pgd = pgd_offset(tlb->mm, addr); 335 do { 336 next = pgd_addr_end(addr, end); 337 if (!is_hugepd(pgd)) { 338 if (pgd_none_or_clear_bad(pgd)) 339 continue; 340 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); 341 } else { 342 free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, 343 addr, next, floor, ceiling); 344 } 345 } while (pgd++, addr = next, addr != end); 346 } 347 348 struct page * 349 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 350 { 351 pte_t *ptep; 352 struct page *page; 353 unsigned shift; 354 unsigned long mask; 355 356 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); 357 358 /* Verify it is a huge page else bail. */ 359 if (!ptep || !shift) 360 return ERR_PTR(-EINVAL); 361 362 mask = (1UL << shift) - 1; 363 page = pte_page(*ptep); 364 if (page) 365 page += (address & mask) / PAGE_SIZE; 366 367 return page; 368 } 369 370 int pmd_huge(pmd_t pmd) 371 { 372 return 0; 373 } 374 375 int pud_huge(pud_t pud) 376 { 377 return 0; 378 } 379 380 struct page * 381 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 382 pmd_t *pmd, int write) 383 { 384 BUG(); 385 return NULL; 386 } 387 388 static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, 389 unsigned long end, int write, struct page **pages, int *nr) 390 { 391 unsigned long mask; 392 unsigned long pte_end; 393 struct page *head, *page; 394 pte_t pte; 395 int refs; 396 397 pte_end = (addr + sz) & ~(sz-1); 398 if (pte_end < end) 399 end = pte_end; 400 401 pte = *ptep; 402 mask = _PAGE_PRESENT | _PAGE_USER; 403 if (write) 404 mask |= _PAGE_RW; 405 406 if ((pte_val(pte) & mask) != mask) 407 return 0; 408 409 /* hugepages are never "special" */ 410 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 411 412 refs = 0; 413 head = pte_page(pte); 414 415 page = head + ((addr & (sz-1)) >> PAGE_SHIFT); 416 do { 417 VM_BUG_ON(compound_head(page) != head); 418 pages[*nr] = page; 419 (*nr)++; 420 page++; 421 refs++; 422 } while (addr += PAGE_SIZE, addr != end); 423 424 if (!page_cache_add_speculative(head, refs)) { 425 *nr -= refs; 426 return 0; 427 } 428 429 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 430 /* Could be optimized better */ 431 while (*nr) { 432 put_page(page); 433 (*nr)--; 434 } 435 } 436 437 return 1; 438 } 439 440 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 441 unsigned long sz) 442 { 443 unsigned long __boundary = (addr + sz) & ~(sz-1); 444 return (__boundary - 1 < end - 1) ? __boundary : end; 445 } 446 447 int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, 448 unsigned long addr, unsigned long end, 449 int write, struct page **pages, int *nr) 450 { 451 pte_t *ptep; 452 unsigned long sz = 1UL << hugepd_shift(*hugepd); 453 unsigned long next; 454 455 ptep = hugepte_offset(hugepd, addr, pdshift); 456 do { 457 next = hugepte_addr_end(addr, end, sz); 458 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) 459 return 0; 460 } while (ptep++, addr = next, addr != end); 461 462 return 1; 463 } 464 465 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 466 unsigned long len, unsigned long pgoff, 467 unsigned long flags) 468 { 469 struct hstate *hstate = hstate_file(file); 470 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); 471 472 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); 473 } 474 475 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 476 { 477 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); 478 479 return 1UL << mmu_psize_to_shift(psize); 480 } 481 482 static int __init add_huge_page_size(unsigned long long size) 483 { 484 int shift = __ffs(size); 485 int mmu_psize; 486 487 /* Check that it is a page size supported by the hardware and 488 * that it fits within pagetable and slice limits. */ 489 if (!is_power_of_2(size) 490 || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) 491 return -EINVAL; 492 493 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) 494 return -EINVAL; 495 496 #ifdef CONFIG_SPU_FS_64K_LS 497 /* Disable support for 64K huge pages when 64K SPU local store 498 * support is enabled as the current implementation conflicts. 499 */ 500 if (shift == PAGE_SHIFT_64K) 501 return -EINVAL; 502 #endif /* CONFIG_SPU_FS_64K_LS */ 503 504 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); 505 506 /* Return if huge page size has already been setup */ 507 if (size_to_hstate(size)) 508 return 0; 509 510 hugetlb_add_hstate(shift - PAGE_SHIFT); 511 512 return 0; 513 } 514 515 static int __init hugepage_setup_sz(char *str) 516 { 517 unsigned long long size; 518 519 size = memparse(str, &str); 520 521 if (add_huge_page_size(size) != 0) 522 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); 523 524 return 1; 525 } 526 __setup("hugepagesz=", hugepage_setup_sz); 527 528 static int __init hugetlbpage_init(void) 529 { 530 int psize; 531 532 if (!cpu_has_feature(CPU_FTR_16M_PAGE)) 533 return -ENODEV; 534 535 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 536 unsigned shift; 537 unsigned pdshift; 538 539 if (!mmu_psize_defs[psize].shift) 540 continue; 541 542 shift = mmu_psize_to_shift(psize); 543 544 if (add_huge_page_size(1ULL << shift) < 0) 545 continue; 546 547 if (shift < PMD_SHIFT) 548 pdshift = PMD_SHIFT; 549 else if (shift < PUD_SHIFT) 550 pdshift = PUD_SHIFT; 551 else 552 pdshift = PGDIR_SHIFT; 553 554 pgtable_cache_add(pdshift - shift, NULL); 555 if (!PGT_CACHE(pdshift - shift)) 556 panic("hugetlbpage_init(): could not create " 557 "pgtable cache for %d bit pagesize\n", shift); 558 } 559 560 /* Set default large page size. Currently, we pick 16M or 1M 561 * depending on what is available 562 */ 563 if (mmu_psize_defs[MMU_PAGE_16M].shift) 564 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; 565 else if (mmu_psize_defs[MMU_PAGE_1M].shift) 566 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; 567 568 return 0; 569 } 570 571 module_init(hugetlbpage_init); 572 573 void flush_dcache_icache_hugepage(struct page *page) 574 { 575 int i; 576 577 BUG_ON(!PageCompound(page)); 578 579 for (i = 0; i < (1UL << compound_order(page)); i++) 580 __flush_dcache_icache(page_address(page+i)); 581 } 582