1 /* 2 * Generic hugetlb support. 3 * (C) William Irwin, April 2004 4 */ 5 #include <linux/gfp.h> 6 #include <linux/list.h> 7 #include <linux/init.h> 8 #include <linux/module.h> 9 #include <linux/mm.h> 10 #include <linux/sysctl.h> 11 #include <linux/highmem.h> 12 #include <linux/nodemask.h> 13 #include <linux/pagemap.h> 14 #include <linux/mempolicy.h> 15 #include <linux/cpuset.h> 16 #include <linux/mutex.h> 17 18 #include <asm/page.h> 19 #include <asm/pgtable.h> 20 21 #include <linux/hugetlb.h> 22 #include "internal.h" 23 24 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 25 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; 26 static unsigned long surplus_huge_pages; 27 unsigned long max_huge_pages; 28 static struct list_head hugepage_freelists[MAX_NUMNODES]; 29 static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 30 static unsigned int free_huge_pages_node[MAX_NUMNODES]; 31 static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; 32 static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 33 unsigned long hugepages_treat_as_movable; 34 int hugetlb_dynamic_pool; 35 static int hugetlb_next_nid; 36 37 /* 38 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 39 */ 40 static DEFINE_SPINLOCK(hugetlb_lock); 41 42 static void clear_huge_page(struct page *page, unsigned long addr) 43 { 44 int i; 45 46 might_sleep(); 47 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 48 cond_resched(); 49 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 50 } 51 } 52 53 static void copy_huge_page(struct page *dst, struct page *src, 54 unsigned long addr, struct vm_area_struct *vma) 55 { 56 int i; 57 58 might_sleep(); 59 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 60 cond_resched(); 61 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 62 } 63 } 64 65 static void enqueue_huge_page(struct page *page) 66 { 67 int nid = page_to_nid(page); 68 list_add(&page->lru, &hugepage_freelists[nid]); 69 free_huge_pages++; 70 free_huge_pages_node[nid]++; 71 } 72 73 static struct page *dequeue_huge_page(struct vm_area_struct *vma, 74 unsigned long address) 75 { 76 int nid; 77 struct page *page = NULL; 78 struct mempolicy *mpol; 79 struct zonelist *zonelist = huge_zonelist(vma, address, 80 htlb_alloc_mask, &mpol); 81 struct zone **z; 82 83 for (z = zonelist->zones; *z; z++) { 84 nid = zone_to_nid(*z); 85 if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && 86 !list_empty(&hugepage_freelists[nid])) { 87 page = list_entry(hugepage_freelists[nid].next, 88 struct page, lru); 89 list_del(&page->lru); 90 free_huge_pages--; 91 free_huge_pages_node[nid]--; 92 if (vma && vma->vm_flags & VM_MAYSHARE) 93 resv_huge_pages--; 94 break; 95 } 96 } 97 mpol_free(mpol); /* unref if mpol !NULL */ 98 return page; 99 } 100 101 static void update_and_free_page(struct page *page) 102 { 103 int i; 104 nr_huge_pages--; 105 nr_huge_pages_node[page_to_nid(page)]--; 106 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 107 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 108 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 109 1 << PG_private | 1<< PG_writeback); 110 } 111 set_compound_page_dtor(page, NULL); 112 set_page_refcounted(page); 113 __free_pages(page, HUGETLB_PAGE_ORDER); 114 } 115 116 static void free_huge_page(struct page *page) 117 { 118 int nid = page_to_nid(page); 119 120 BUG_ON(page_count(page)); 121 INIT_LIST_HEAD(&page->lru); 122 123 spin_lock(&hugetlb_lock); 124 if (surplus_huge_pages_node[nid]) { 125 update_and_free_page(page); 126 surplus_huge_pages--; 127 surplus_huge_pages_node[nid]--; 128 } else { 129 enqueue_huge_page(page); 130 } 131 spin_unlock(&hugetlb_lock); 132 } 133 134 /* 135 * Increment or decrement surplus_huge_pages. Keep node-specific counters 136 * balanced by operating on them in a round-robin fashion. 137 * Returns 1 if an adjustment was made. 138 */ 139 static int adjust_pool_surplus(int delta) 140 { 141 static int prev_nid; 142 int nid = prev_nid; 143 int ret = 0; 144 145 VM_BUG_ON(delta != -1 && delta != 1); 146 do { 147 nid = next_node(nid, node_online_map); 148 if (nid == MAX_NUMNODES) 149 nid = first_node(node_online_map); 150 151 /* To shrink on this node, there must be a surplus page */ 152 if (delta < 0 && !surplus_huge_pages_node[nid]) 153 continue; 154 /* Surplus cannot exceed the total number of pages */ 155 if (delta > 0 && surplus_huge_pages_node[nid] >= 156 nr_huge_pages_node[nid]) 157 continue; 158 159 surplus_huge_pages += delta; 160 surplus_huge_pages_node[nid] += delta; 161 ret = 1; 162 break; 163 } while (nid != prev_nid); 164 165 prev_nid = nid; 166 return ret; 167 } 168 169 static struct page *alloc_fresh_huge_page_node(int nid) 170 { 171 struct page *page; 172 173 page = alloc_pages_node(nid, 174 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, 175 HUGETLB_PAGE_ORDER); 176 if (page) { 177 set_compound_page_dtor(page, free_huge_page); 178 spin_lock(&hugetlb_lock); 179 nr_huge_pages++; 180 nr_huge_pages_node[nid]++; 181 spin_unlock(&hugetlb_lock); 182 put_page(page); /* free it into the hugepage allocator */ 183 } 184 185 return page; 186 } 187 188 static int alloc_fresh_huge_page(void) 189 { 190 struct page *page; 191 int start_nid; 192 int next_nid; 193 int ret = 0; 194 195 start_nid = hugetlb_next_nid; 196 197 do { 198 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 199 if (page) 200 ret = 1; 201 /* 202 * Use a helper variable to find the next node and then 203 * copy it back to hugetlb_next_nid afterwards: 204 * otherwise there's a window in which a racer might 205 * pass invalid nid MAX_NUMNODES to alloc_pages_node. 206 * But we don't need to use a spin_lock here: it really 207 * doesn't matter if occasionally a racer chooses the 208 * same nid as we do. Move nid forward in the mask even 209 * if we just successfully allocated a hugepage so that 210 * the next caller gets hugepages on the next node. 211 */ 212 next_nid = next_node(hugetlb_next_nid, node_online_map); 213 if (next_nid == MAX_NUMNODES) 214 next_nid = first_node(node_online_map); 215 hugetlb_next_nid = next_nid; 216 } while (!page && hugetlb_next_nid != start_nid); 217 218 return ret; 219 } 220 221 static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 222 unsigned long address) 223 { 224 struct page *page; 225 226 /* Check if the dynamic pool is enabled */ 227 if (!hugetlb_dynamic_pool) 228 return NULL; 229 230 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, 231 HUGETLB_PAGE_ORDER); 232 if (page) { 233 set_compound_page_dtor(page, free_huge_page); 234 spin_lock(&hugetlb_lock); 235 nr_huge_pages++; 236 nr_huge_pages_node[page_to_nid(page)]++; 237 surplus_huge_pages++; 238 surplus_huge_pages_node[page_to_nid(page)]++; 239 spin_unlock(&hugetlb_lock); 240 } 241 242 return page; 243 } 244 245 /* 246 * Increase the hugetlb pool such that it can accomodate a reservation 247 * of size 'delta'. 248 */ 249 static int gather_surplus_pages(int delta) 250 { 251 struct list_head surplus_list; 252 struct page *page, *tmp; 253 int ret, i; 254 int needed, allocated; 255 256 needed = (resv_huge_pages + delta) - free_huge_pages; 257 if (needed <= 0) 258 return 0; 259 260 allocated = 0; 261 INIT_LIST_HEAD(&surplus_list); 262 263 ret = -ENOMEM; 264 retry: 265 spin_unlock(&hugetlb_lock); 266 for (i = 0; i < needed; i++) { 267 page = alloc_buddy_huge_page(NULL, 0); 268 if (!page) { 269 /* 270 * We were not able to allocate enough pages to 271 * satisfy the entire reservation so we free what 272 * we've allocated so far. 273 */ 274 spin_lock(&hugetlb_lock); 275 needed = 0; 276 goto free; 277 } 278 279 list_add(&page->lru, &surplus_list); 280 } 281 allocated += needed; 282 283 /* 284 * After retaking hugetlb_lock, we need to recalculate 'needed' 285 * because either resv_huge_pages or free_huge_pages may have changed. 286 */ 287 spin_lock(&hugetlb_lock); 288 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 289 if (needed > 0) 290 goto retry; 291 292 /* 293 * The surplus_list now contains _at_least_ the number of extra pages 294 * needed to accomodate the reservation. Add the appropriate number 295 * of pages to the hugetlb pool and free the extras back to the buddy 296 * allocator. 297 */ 298 needed += allocated; 299 ret = 0; 300 free: 301 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 302 list_del(&page->lru); 303 if ((--needed) >= 0) 304 enqueue_huge_page(page); 305 else { 306 /* 307 * Decrement the refcount and free the page using its 308 * destructor. This must be done with hugetlb_lock 309 * unlocked which is safe because free_huge_page takes 310 * hugetlb_lock before deciding how to free the page. 311 */ 312 spin_unlock(&hugetlb_lock); 313 put_page(page); 314 spin_lock(&hugetlb_lock); 315 } 316 } 317 318 return ret; 319 } 320 321 /* 322 * When releasing a hugetlb pool reservation, any surplus pages that were 323 * allocated to satisfy the reservation must be explicitly freed if they were 324 * never used. 325 */ 326 void return_unused_surplus_pages(unsigned long unused_resv_pages) 327 { 328 static int nid = -1; 329 struct page *page; 330 unsigned long nr_pages; 331 332 nr_pages = min(unused_resv_pages, surplus_huge_pages); 333 334 while (nr_pages) { 335 nid = next_node(nid, node_online_map); 336 if (nid == MAX_NUMNODES) 337 nid = first_node(node_online_map); 338 339 if (!surplus_huge_pages_node[nid]) 340 continue; 341 342 if (!list_empty(&hugepage_freelists[nid])) { 343 page = list_entry(hugepage_freelists[nid].next, 344 struct page, lru); 345 list_del(&page->lru); 346 update_and_free_page(page); 347 free_huge_pages--; 348 free_huge_pages_node[nid]--; 349 surplus_huge_pages--; 350 surplus_huge_pages_node[nid]--; 351 nr_pages--; 352 } 353 } 354 } 355 356 static struct page *alloc_huge_page(struct vm_area_struct *vma, 357 unsigned long addr) 358 { 359 struct page *page = NULL; 360 int use_reserved_page = vma->vm_flags & VM_MAYSHARE; 361 362 spin_lock(&hugetlb_lock); 363 if (!use_reserved_page && (free_huge_pages <= resv_huge_pages)) 364 goto fail; 365 366 page = dequeue_huge_page(vma, addr); 367 if (!page) 368 goto fail; 369 370 spin_unlock(&hugetlb_lock); 371 set_page_refcounted(page); 372 return page; 373 374 fail: 375 spin_unlock(&hugetlb_lock); 376 377 /* 378 * Private mappings do not use reserved huge pages so the allocation 379 * may have failed due to an undersized hugetlb pool. Try to grab a 380 * surplus huge page from the buddy allocator. 381 */ 382 if (!use_reserved_page) 383 page = alloc_buddy_huge_page(vma, addr); 384 385 return page; 386 } 387 388 static int __init hugetlb_init(void) 389 { 390 unsigned long i; 391 392 if (HPAGE_SHIFT == 0) 393 return 0; 394 395 for (i = 0; i < MAX_NUMNODES; ++i) 396 INIT_LIST_HEAD(&hugepage_freelists[i]); 397 398 hugetlb_next_nid = first_node(node_online_map); 399 400 for (i = 0; i < max_huge_pages; ++i) { 401 if (!alloc_fresh_huge_page()) 402 break; 403 } 404 max_huge_pages = free_huge_pages = nr_huge_pages = i; 405 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); 406 return 0; 407 } 408 module_init(hugetlb_init); 409 410 static int __init hugetlb_setup(char *s) 411 { 412 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 413 max_huge_pages = 0; 414 return 1; 415 } 416 __setup("hugepages=", hugetlb_setup); 417 418 static unsigned int cpuset_mems_nr(unsigned int *array) 419 { 420 int node; 421 unsigned int nr = 0; 422 423 for_each_node_mask(node, cpuset_current_mems_allowed) 424 nr += array[node]; 425 426 return nr; 427 } 428 429 #ifdef CONFIG_SYSCTL 430 #ifdef CONFIG_HIGHMEM 431 static void try_to_free_low(unsigned long count) 432 { 433 int i; 434 435 for (i = 0; i < MAX_NUMNODES; ++i) { 436 struct page *page, *next; 437 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 438 if (count >= nr_huge_pages) 439 return; 440 if (PageHighMem(page)) 441 continue; 442 list_del(&page->lru); 443 update_and_free_page(page); 444 free_huge_pages--; 445 free_huge_pages_node[page_to_nid(page)]--; 446 } 447 } 448 } 449 #else 450 static inline void try_to_free_low(unsigned long count) 451 { 452 } 453 #endif 454 455 #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 456 static unsigned long set_max_huge_pages(unsigned long count) 457 { 458 unsigned long min_count, ret; 459 460 /* 461 * Increase the pool size 462 * First take pages out of surplus state. Then make up the 463 * remaining difference by allocating fresh huge pages. 464 */ 465 spin_lock(&hugetlb_lock); 466 while (surplus_huge_pages && count > persistent_huge_pages) { 467 if (!adjust_pool_surplus(-1)) 468 break; 469 } 470 471 while (count > persistent_huge_pages) { 472 int ret; 473 /* 474 * If this allocation races such that we no longer need the 475 * page, free_huge_page will handle it by freeing the page 476 * and reducing the surplus. 477 */ 478 spin_unlock(&hugetlb_lock); 479 ret = alloc_fresh_huge_page(); 480 spin_lock(&hugetlb_lock); 481 if (!ret) 482 goto out; 483 484 } 485 486 /* 487 * Decrease the pool size 488 * First return free pages to the buddy allocator (being careful 489 * to keep enough around to satisfy reservations). Then place 490 * pages into surplus state as needed so the pool will shrink 491 * to the desired size as pages become free. 492 */ 493 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 494 min_count = max(count, min_count); 495 try_to_free_low(min_count); 496 while (min_count < persistent_huge_pages) { 497 struct page *page = dequeue_huge_page(NULL, 0); 498 if (!page) 499 break; 500 update_and_free_page(page); 501 } 502 while (count < persistent_huge_pages) { 503 if (!adjust_pool_surplus(1)) 504 break; 505 } 506 out: 507 ret = persistent_huge_pages; 508 spin_unlock(&hugetlb_lock); 509 return ret; 510 } 511 512 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 513 struct file *file, void __user *buffer, 514 size_t *length, loff_t *ppos) 515 { 516 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 517 max_huge_pages = set_max_huge_pages(max_huge_pages); 518 return 0; 519 } 520 521 int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 522 struct file *file, void __user *buffer, 523 size_t *length, loff_t *ppos) 524 { 525 proc_dointvec(table, write, file, buffer, length, ppos); 526 if (hugepages_treat_as_movable) 527 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 528 else 529 htlb_alloc_mask = GFP_HIGHUSER; 530 return 0; 531 } 532 533 #endif /* CONFIG_SYSCTL */ 534 535 int hugetlb_report_meminfo(char *buf) 536 { 537 return sprintf(buf, 538 "HugePages_Total: %5lu\n" 539 "HugePages_Free: %5lu\n" 540 "HugePages_Rsvd: %5lu\n" 541 "HugePages_Surp: %5lu\n" 542 "Hugepagesize: %5lu kB\n", 543 nr_huge_pages, 544 free_huge_pages, 545 resv_huge_pages, 546 surplus_huge_pages, 547 HPAGE_SIZE/1024); 548 } 549 550 int hugetlb_report_node_meminfo(int nid, char *buf) 551 { 552 return sprintf(buf, 553 "Node %d HugePages_Total: %5u\n" 554 "Node %d HugePages_Free: %5u\n", 555 nid, nr_huge_pages_node[nid], 556 nid, free_huge_pages_node[nid]); 557 } 558 559 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 560 unsigned long hugetlb_total_pages(void) 561 { 562 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 563 } 564 565 /* 566 * We cannot handle pagefaults against hugetlb pages at all. They cause 567 * handle_mm_fault() to try to instantiate regular-sized pages in the 568 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 569 * this far. 570 */ 571 static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 572 { 573 BUG(); 574 return 0; 575 } 576 577 struct vm_operations_struct hugetlb_vm_ops = { 578 .fault = hugetlb_vm_op_fault, 579 }; 580 581 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 582 int writable) 583 { 584 pte_t entry; 585 586 if (writable) { 587 entry = 588 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 589 } else { 590 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 591 } 592 entry = pte_mkyoung(entry); 593 entry = pte_mkhuge(entry); 594 595 return entry; 596 } 597 598 static void set_huge_ptep_writable(struct vm_area_struct *vma, 599 unsigned long address, pte_t *ptep) 600 { 601 pte_t entry; 602 603 entry = pte_mkwrite(pte_mkdirty(*ptep)); 604 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { 605 update_mmu_cache(vma, address, entry); 606 } 607 } 608 609 610 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 611 struct vm_area_struct *vma) 612 { 613 pte_t *src_pte, *dst_pte, entry; 614 struct page *ptepage; 615 unsigned long addr; 616 int cow; 617 618 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 619 620 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 621 src_pte = huge_pte_offset(src, addr); 622 if (!src_pte) 623 continue; 624 dst_pte = huge_pte_alloc(dst, addr); 625 if (!dst_pte) 626 goto nomem; 627 spin_lock(&dst->page_table_lock); 628 spin_lock(&src->page_table_lock); 629 if (!pte_none(*src_pte)) { 630 if (cow) 631 ptep_set_wrprotect(src, addr, src_pte); 632 entry = *src_pte; 633 ptepage = pte_page(entry); 634 get_page(ptepage); 635 set_huge_pte_at(dst, addr, dst_pte, entry); 636 } 637 spin_unlock(&src->page_table_lock); 638 spin_unlock(&dst->page_table_lock); 639 } 640 return 0; 641 642 nomem: 643 return -ENOMEM; 644 } 645 646 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 647 unsigned long end) 648 { 649 struct mm_struct *mm = vma->vm_mm; 650 unsigned long address; 651 pte_t *ptep; 652 pte_t pte; 653 struct page *page; 654 struct page *tmp; 655 /* 656 * A page gathering list, protected by per file i_mmap_lock. The 657 * lock is used to avoid list corruption from multiple unmapping 658 * of the same page since we are using page->lru. 659 */ 660 LIST_HEAD(page_list); 661 662 WARN_ON(!is_vm_hugetlb_page(vma)); 663 BUG_ON(start & ~HPAGE_MASK); 664 BUG_ON(end & ~HPAGE_MASK); 665 666 spin_lock(&mm->page_table_lock); 667 for (address = start; address < end; address += HPAGE_SIZE) { 668 ptep = huge_pte_offset(mm, address); 669 if (!ptep) 670 continue; 671 672 if (huge_pmd_unshare(mm, &address, ptep)) 673 continue; 674 675 pte = huge_ptep_get_and_clear(mm, address, ptep); 676 if (pte_none(pte)) 677 continue; 678 679 page = pte_page(pte); 680 if (pte_dirty(pte)) 681 set_page_dirty(page); 682 list_add(&page->lru, &page_list); 683 } 684 spin_unlock(&mm->page_table_lock); 685 flush_tlb_range(vma, start, end); 686 list_for_each_entry_safe(page, tmp, &page_list, lru) { 687 list_del(&page->lru); 688 put_page(page); 689 } 690 } 691 692 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 693 unsigned long end) 694 { 695 /* 696 * It is undesirable to test vma->vm_file as it should be non-null 697 * for valid hugetlb area. However, vm_file will be NULL in the error 698 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, 699 * do_mmap_pgoff() nullifies vma->vm_file before calling this function 700 * to clean up. Since no pte has actually been setup, it is safe to 701 * do nothing in this case. 702 */ 703 if (vma->vm_file) { 704 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 705 __unmap_hugepage_range(vma, start, end); 706 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 707 } 708 } 709 710 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 711 unsigned long address, pte_t *ptep, pte_t pte) 712 { 713 struct page *old_page, *new_page; 714 int avoidcopy; 715 716 old_page = pte_page(pte); 717 718 /* If no-one else is actually using this page, avoid the copy 719 * and just make the page writable */ 720 avoidcopy = (page_count(old_page) == 1); 721 if (avoidcopy) { 722 set_huge_ptep_writable(vma, address, ptep); 723 return 0; 724 } 725 726 page_cache_get(old_page); 727 new_page = alloc_huge_page(vma, address); 728 729 if (!new_page) { 730 page_cache_release(old_page); 731 return VM_FAULT_OOM; 732 } 733 734 spin_unlock(&mm->page_table_lock); 735 copy_huge_page(new_page, old_page, address, vma); 736 spin_lock(&mm->page_table_lock); 737 738 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 739 if (likely(pte_same(*ptep, pte))) { 740 /* Break COW */ 741 set_huge_pte_at(mm, address, ptep, 742 make_huge_pte(vma, new_page, 1)); 743 /* Make the old page be freed below */ 744 new_page = old_page; 745 } 746 page_cache_release(new_page); 747 page_cache_release(old_page); 748 return 0; 749 } 750 751 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 752 unsigned long address, pte_t *ptep, int write_access) 753 { 754 int ret = VM_FAULT_SIGBUS; 755 unsigned long idx; 756 unsigned long size; 757 struct page *page; 758 struct address_space *mapping; 759 pte_t new_pte; 760 761 mapping = vma->vm_file->f_mapping; 762 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 763 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); 764 765 /* 766 * Use page lock to guard against racing truncation 767 * before we get page_table_lock. 768 */ 769 retry: 770 page = find_lock_page(mapping, idx); 771 if (!page) { 772 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 773 if (idx >= size) 774 goto out; 775 if (hugetlb_get_quota(mapping)) 776 goto out; 777 page = alloc_huge_page(vma, address); 778 if (!page) { 779 hugetlb_put_quota(mapping); 780 ret = VM_FAULT_OOM; 781 goto out; 782 } 783 clear_huge_page(page, address); 784 785 if (vma->vm_flags & VM_SHARED) { 786 int err; 787 788 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 789 if (err) { 790 put_page(page); 791 hugetlb_put_quota(mapping); 792 if (err == -EEXIST) 793 goto retry; 794 goto out; 795 } 796 } else 797 lock_page(page); 798 } 799 800 spin_lock(&mm->page_table_lock); 801 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 802 if (idx >= size) 803 goto backout; 804 805 ret = 0; 806 if (!pte_none(*ptep)) 807 goto backout; 808 809 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 810 && (vma->vm_flags & VM_SHARED))); 811 set_huge_pte_at(mm, address, ptep, new_pte); 812 813 if (write_access && !(vma->vm_flags & VM_SHARED)) { 814 /* Optimization, do the COW without a second fault */ 815 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 816 } 817 818 spin_unlock(&mm->page_table_lock); 819 unlock_page(page); 820 out: 821 return ret; 822 823 backout: 824 spin_unlock(&mm->page_table_lock); 825 hugetlb_put_quota(mapping); 826 unlock_page(page); 827 put_page(page); 828 goto out; 829 } 830 831 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 832 unsigned long address, int write_access) 833 { 834 pte_t *ptep; 835 pte_t entry; 836 int ret; 837 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 838 839 ptep = huge_pte_alloc(mm, address); 840 if (!ptep) 841 return VM_FAULT_OOM; 842 843 /* 844 * Serialize hugepage allocation and instantiation, so that we don't 845 * get spurious allocation failures if two CPUs race to instantiate 846 * the same page in the page cache. 847 */ 848 mutex_lock(&hugetlb_instantiation_mutex); 849 entry = *ptep; 850 if (pte_none(entry)) { 851 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 852 mutex_unlock(&hugetlb_instantiation_mutex); 853 return ret; 854 } 855 856 ret = 0; 857 858 spin_lock(&mm->page_table_lock); 859 /* Check for a racing update before calling hugetlb_cow */ 860 if (likely(pte_same(entry, *ptep))) 861 if (write_access && !pte_write(entry)) 862 ret = hugetlb_cow(mm, vma, address, ptep, entry); 863 spin_unlock(&mm->page_table_lock); 864 mutex_unlock(&hugetlb_instantiation_mutex); 865 866 return ret; 867 } 868 869 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 870 struct page **pages, struct vm_area_struct **vmas, 871 unsigned long *position, int *length, int i) 872 { 873 unsigned long pfn_offset; 874 unsigned long vaddr = *position; 875 int remainder = *length; 876 877 spin_lock(&mm->page_table_lock); 878 while (vaddr < vma->vm_end && remainder) { 879 pte_t *pte; 880 struct page *page; 881 882 /* 883 * Some archs (sparc64, sh*) have multiple pte_ts to 884 * each hugepage. We have to make * sure we get the 885 * first, for the page indexing below to work. 886 */ 887 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 888 889 if (!pte || pte_none(*pte)) { 890 int ret; 891 892 spin_unlock(&mm->page_table_lock); 893 ret = hugetlb_fault(mm, vma, vaddr, 0); 894 spin_lock(&mm->page_table_lock); 895 if (!(ret & VM_FAULT_ERROR)) 896 continue; 897 898 remainder = 0; 899 if (!i) 900 i = -EFAULT; 901 break; 902 } 903 904 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 905 page = pte_page(*pte); 906 same_page: 907 if (pages) { 908 get_page(page); 909 pages[i] = page + pfn_offset; 910 } 911 912 if (vmas) 913 vmas[i] = vma; 914 915 vaddr += PAGE_SIZE; 916 ++pfn_offset; 917 --remainder; 918 ++i; 919 if (vaddr < vma->vm_end && remainder && 920 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 921 /* 922 * We use pfn_offset to avoid touching the pageframes 923 * of this compound page. 924 */ 925 goto same_page; 926 } 927 } 928 spin_unlock(&mm->page_table_lock); 929 *length = remainder; 930 *position = vaddr; 931 932 return i; 933 } 934 935 void hugetlb_change_protection(struct vm_area_struct *vma, 936 unsigned long address, unsigned long end, pgprot_t newprot) 937 { 938 struct mm_struct *mm = vma->vm_mm; 939 unsigned long start = address; 940 pte_t *ptep; 941 pte_t pte; 942 943 BUG_ON(address >= end); 944 flush_cache_range(vma, address, end); 945 946 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 947 spin_lock(&mm->page_table_lock); 948 for (; address < end; address += HPAGE_SIZE) { 949 ptep = huge_pte_offset(mm, address); 950 if (!ptep) 951 continue; 952 if (huge_pmd_unshare(mm, &address, ptep)) 953 continue; 954 if (!pte_none(*ptep)) { 955 pte = huge_ptep_get_and_clear(mm, address, ptep); 956 pte = pte_mkhuge(pte_modify(pte, newprot)); 957 set_huge_pte_at(mm, address, ptep, pte); 958 } 959 } 960 spin_unlock(&mm->page_table_lock); 961 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 962 963 flush_tlb_range(vma, start, end); 964 } 965 966 struct file_region { 967 struct list_head link; 968 long from; 969 long to; 970 }; 971 972 static long region_add(struct list_head *head, long f, long t) 973 { 974 struct file_region *rg, *nrg, *trg; 975 976 /* Locate the region we are either in or before. */ 977 list_for_each_entry(rg, head, link) 978 if (f <= rg->to) 979 break; 980 981 /* Round our left edge to the current segment if it encloses us. */ 982 if (f > rg->from) 983 f = rg->from; 984 985 /* Check for and consume any regions we now overlap with. */ 986 nrg = rg; 987 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 988 if (&rg->link == head) 989 break; 990 if (rg->from > t) 991 break; 992 993 /* If this area reaches higher then extend our area to 994 * include it completely. If this is not the first area 995 * which we intend to reuse, free it. */ 996 if (rg->to > t) 997 t = rg->to; 998 if (rg != nrg) { 999 list_del(&rg->link); 1000 kfree(rg); 1001 } 1002 } 1003 nrg->from = f; 1004 nrg->to = t; 1005 return 0; 1006 } 1007 1008 static long region_chg(struct list_head *head, long f, long t) 1009 { 1010 struct file_region *rg, *nrg; 1011 long chg = 0; 1012 1013 /* Locate the region we are before or in. */ 1014 list_for_each_entry(rg, head, link) 1015 if (f <= rg->to) 1016 break; 1017 1018 /* If we are below the current region then a new region is required. 1019 * Subtle, allocate a new region at the position but make it zero 1020 * size such that we can guarentee to record the reservation. */ 1021 if (&rg->link == head || t < rg->from) { 1022 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 1023 if (!nrg) 1024 return -ENOMEM; 1025 nrg->from = f; 1026 nrg->to = f; 1027 INIT_LIST_HEAD(&nrg->link); 1028 list_add(&nrg->link, rg->link.prev); 1029 1030 return t - f; 1031 } 1032 1033 /* Round our left edge to the current segment if it encloses us. */ 1034 if (f > rg->from) 1035 f = rg->from; 1036 chg = t - f; 1037 1038 /* Check for and consume any regions we now overlap with. */ 1039 list_for_each_entry(rg, rg->link.prev, link) { 1040 if (&rg->link == head) 1041 break; 1042 if (rg->from > t) 1043 return chg; 1044 1045 /* We overlap with this area, if it extends futher than 1046 * us then we must extend ourselves. Account for its 1047 * existing reservation. */ 1048 if (rg->to > t) { 1049 chg += rg->to - t; 1050 t = rg->to; 1051 } 1052 chg -= rg->to - rg->from; 1053 } 1054 return chg; 1055 } 1056 1057 static long region_truncate(struct list_head *head, long end) 1058 { 1059 struct file_region *rg, *trg; 1060 long chg = 0; 1061 1062 /* Locate the region we are either in or before. */ 1063 list_for_each_entry(rg, head, link) 1064 if (end <= rg->to) 1065 break; 1066 if (&rg->link == head) 1067 return 0; 1068 1069 /* If we are in the middle of a region then adjust it. */ 1070 if (end > rg->from) { 1071 chg = rg->to - end; 1072 rg->to = end; 1073 rg = list_entry(rg->link.next, typeof(*rg), link); 1074 } 1075 1076 /* Drop any remaining regions. */ 1077 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 1078 if (&rg->link == head) 1079 break; 1080 chg += rg->to - rg->from; 1081 list_del(&rg->link); 1082 kfree(rg); 1083 } 1084 return chg; 1085 } 1086 1087 static int hugetlb_acct_memory(long delta) 1088 { 1089 int ret = -ENOMEM; 1090 1091 spin_lock(&hugetlb_lock); 1092 /* 1093 * When cpuset is configured, it breaks the strict hugetlb page 1094 * reservation as the accounting is done on a global variable. Such 1095 * reservation is completely rubbish in the presence of cpuset because 1096 * the reservation is not checked against page availability for the 1097 * current cpuset. Application can still potentially OOM'ed by kernel 1098 * with lack of free htlb page in cpuset that the task is in. 1099 * Attempt to enforce strict accounting with cpuset is almost 1100 * impossible (or too ugly) because cpuset is too fluid that 1101 * task or memory node can be dynamically moved between cpusets. 1102 * 1103 * The change of semantics for shared hugetlb mapping with cpuset is 1104 * undesirable. However, in order to preserve some of the semantics, 1105 * we fall back to check against current free page availability as 1106 * a best attempt and hopefully to minimize the impact of changing 1107 * semantics that cpuset has. 1108 */ 1109 if (delta > 0) { 1110 if (gather_surplus_pages(delta) < 0) 1111 goto out; 1112 1113 if (delta > cpuset_mems_nr(free_huge_pages_node)) 1114 goto out; 1115 } 1116 1117 ret = 0; 1118 resv_huge_pages += delta; 1119 if (delta < 0) 1120 return_unused_surplus_pages((unsigned long) -delta); 1121 1122 out: 1123 spin_unlock(&hugetlb_lock); 1124 return ret; 1125 } 1126 1127 int hugetlb_reserve_pages(struct inode *inode, long from, long to) 1128 { 1129 long ret, chg; 1130 1131 chg = region_chg(&inode->i_mapping->private_list, from, to); 1132 if (chg < 0) 1133 return chg; 1134 1135 ret = hugetlb_acct_memory(chg); 1136 if (ret < 0) 1137 return ret; 1138 region_add(&inode->i_mapping->private_list, from, to); 1139 return 0; 1140 } 1141 1142 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 1143 { 1144 long chg = region_truncate(&inode->i_mapping->private_list, offset); 1145 hugetlb_acct_memory(freed - chg); 1146 } 1147