1 /* 2 * Generic hugetlb support. 3 * (C) William Irwin, April 2004 4 */ 5 #include <linux/gfp.h> 6 #include <linux/list.h> 7 #include <linux/init.h> 8 #include <linux/module.h> 9 #include <linux/mm.h> 10 #include <linux/sysctl.h> 11 #include <linux/highmem.h> 12 #include <linux/nodemask.h> 13 #include <linux/pagemap.h> 14 #include <linux/mempolicy.h> 15 #include <linux/cpuset.h> 16 #include <linux/mutex.h> 17 18 #include <asm/page.h> 19 #include <asm/pgtable.h> 20 21 #include <linux/hugetlb.h> 22 #include "internal.h" 23 24 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 25 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; 26 static unsigned long surplus_huge_pages; 27 unsigned long max_huge_pages; 28 static struct list_head hugepage_freelists[MAX_NUMNODES]; 29 static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 30 static unsigned int free_huge_pages_node[MAX_NUMNODES]; 31 static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; 32 static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 33 unsigned long hugepages_treat_as_movable; 34 unsigned long nr_overcommit_huge_pages; 35 static int hugetlb_next_nid; 36 37 /* 38 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 39 */ 40 static DEFINE_SPINLOCK(hugetlb_lock); 41 42 static void clear_huge_page(struct page *page, unsigned long addr) 43 { 44 int i; 45 46 might_sleep(); 47 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 48 cond_resched(); 49 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 50 } 51 } 52 53 static void copy_huge_page(struct page *dst, struct page *src, 54 unsigned long addr, struct vm_area_struct *vma) 55 { 56 int i; 57 58 might_sleep(); 59 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 60 cond_resched(); 61 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 62 } 63 } 64 65 static void enqueue_huge_page(struct page *page) 66 { 67 int nid = page_to_nid(page); 68 list_add(&page->lru, &hugepage_freelists[nid]); 69 free_huge_pages++; 70 free_huge_pages_node[nid]++; 71 } 72 73 static struct page *dequeue_huge_page(struct vm_area_struct *vma, 74 unsigned long address) 75 { 76 int nid; 77 struct page *page = NULL; 78 struct mempolicy *mpol; 79 struct zonelist *zonelist = huge_zonelist(vma, address, 80 htlb_alloc_mask, &mpol); 81 struct zone **z; 82 83 for (z = zonelist->zones; *z; z++) { 84 nid = zone_to_nid(*z); 85 if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && 86 !list_empty(&hugepage_freelists[nid])) { 87 page = list_entry(hugepage_freelists[nid].next, 88 struct page, lru); 89 list_del(&page->lru); 90 free_huge_pages--; 91 free_huge_pages_node[nid]--; 92 if (vma && vma->vm_flags & VM_MAYSHARE) 93 resv_huge_pages--; 94 break; 95 } 96 } 97 mpol_free(mpol); /* unref if mpol !NULL */ 98 return page; 99 } 100 101 static void update_and_free_page(struct page *page) 102 { 103 int i; 104 nr_huge_pages--; 105 nr_huge_pages_node[page_to_nid(page)]--; 106 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 107 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 108 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 109 1 << PG_private | 1<< PG_writeback); 110 } 111 set_compound_page_dtor(page, NULL); 112 set_page_refcounted(page); 113 __free_pages(page, HUGETLB_PAGE_ORDER); 114 } 115 116 static void free_huge_page(struct page *page) 117 { 118 int nid = page_to_nid(page); 119 struct address_space *mapping; 120 121 mapping = (struct address_space *) page_private(page); 122 BUG_ON(page_count(page)); 123 INIT_LIST_HEAD(&page->lru); 124 125 spin_lock(&hugetlb_lock); 126 if (surplus_huge_pages_node[nid]) { 127 update_and_free_page(page); 128 surplus_huge_pages--; 129 surplus_huge_pages_node[nid]--; 130 } else { 131 enqueue_huge_page(page); 132 } 133 spin_unlock(&hugetlb_lock); 134 if (mapping) 135 hugetlb_put_quota(mapping, 1); 136 set_page_private(page, 0); 137 } 138 139 /* 140 * Increment or decrement surplus_huge_pages. Keep node-specific counters 141 * balanced by operating on them in a round-robin fashion. 142 * Returns 1 if an adjustment was made. 143 */ 144 static int adjust_pool_surplus(int delta) 145 { 146 static int prev_nid; 147 int nid = prev_nid; 148 int ret = 0; 149 150 VM_BUG_ON(delta != -1 && delta != 1); 151 do { 152 nid = next_node(nid, node_online_map); 153 if (nid == MAX_NUMNODES) 154 nid = first_node(node_online_map); 155 156 /* To shrink on this node, there must be a surplus page */ 157 if (delta < 0 && !surplus_huge_pages_node[nid]) 158 continue; 159 /* Surplus cannot exceed the total number of pages */ 160 if (delta > 0 && surplus_huge_pages_node[nid] >= 161 nr_huge_pages_node[nid]) 162 continue; 163 164 surplus_huge_pages += delta; 165 surplus_huge_pages_node[nid] += delta; 166 ret = 1; 167 break; 168 } while (nid != prev_nid); 169 170 prev_nid = nid; 171 return ret; 172 } 173 174 static struct page *alloc_fresh_huge_page_node(int nid) 175 { 176 struct page *page; 177 178 page = alloc_pages_node(nid, 179 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, 180 HUGETLB_PAGE_ORDER); 181 if (page) { 182 set_compound_page_dtor(page, free_huge_page); 183 spin_lock(&hugetlb_lock); 184 nr_huge_pages++; 185 nr_huge_pages_node[nid]++; 186 spin_unlock(&hugetlb_lock); 187 put_page(page); /* free it into the hugepage allocator */ 188 } 189 190 return page; 191 } 192 193 static int alloc_fresh_huge_page(void) 194 { 195 struct page *page; 196 int start_nid; 197 int next_nid; 198 int ret = 0; 199 200 start_nid = hugetlb_next_nid; 201 202 do { 203 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 204 if (page) 205 ret = 1; 206 /* 207 * Use a helper variable to find the next node and then 208 * copy it back to hugetlb_next_nid afterwards: 209 * otherwise there's a window in which a racer might 210 * pass invalid nid MAX_NUMNODES to alloc_pages_node. 211 * But we don't need to use a spin_lock here: it really 212 * doesn't matter if occasionally a racer chooses the 213 * same nid as we do. Move nid forward in the mask even 214 * if we just successfully allocated a hugepage so that 215 * the next caller gets hugepages on the next node. 216 */ 217 next_nid = next_node(hugetlb_next_nid, node_online_map); 218 if (next_nid == MAX_NUMNODES) 219 next_nid = first_node(node_online_map); 220 hugetlb_next_nid = next_nid; 221 } while (!page && hugetlb_next_nid != start_nid); 222 223 return ret; 224 } 225 226 static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 227 unsigned long address) 228 { 229 struct page *page; 230 unsigned int nid; 231 232 /* 233 * Assume we will successfully allocate the surplus page to 234 * prevent racing processes from causing the surplus to exceed 235 * overcommit 236 * 237 * This however introduces a different race, where a process B 238 * tries to grow the static hugepage pool while alloc_pages() is 239 * called by process A. B will only examine the per-node 240 * counters in determining if surplus huge pages can be 241 * converted to normal huge pages in adjust_pool_surplus(). A 242 * won't be able to increment the per-node counter, until the 243 * lock is dropped by B, but B doesn't drop hugetlb_lock until 244 * no more huge pages can be converted from surplus to normal 245 * state (and doesn't try to convert again). Thus, we have a 246 * case where a surplus huge page exists, the pool is grown, and 247 * the surplus huge page still exists after, even though it 248 * should just have been converted to a normal huge page. This 249 * does not leak memory, though, as the hugepage will be freed 250 * once it is out of use. It also does not allow the counters to 251 * go out of whack in adjust_pool_surplus() as we don't modify 252 * the node values until we've gotten the hugepage and only the 253 * per-node value is checked there. 254 */ 255 spin_lock(&hugetlb_lock); 256 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 257 spin_unlock(&hugetlb_lock); 258 return NULL; 259 } else { 260 nr_huge_pages++; 261 surplus_huge_pages++; 262 } 263 spin_unlock(&hugetlb_lock); 264 265 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, 266 HUGETLB_PAGE_ORDER); 267 268 spin_lock(&hugetlb_lock); 269 if (page) { 270 nid = page_to_nid(page); 271 set_compound_page_dtor(page, free_huge_page); 272 /* 273 * We incremented the global counters already 274 */ 275 nr_huge_pages_node[nid]++; 276 surplus_huge_pages_node[nid]++; 277 } else { 278 nr_huge_pages--; 279 surplus_huge_pages--; 280 } 281 spin_unlock(&hugetlb_lock); 282 283 return page; 284 } 285 286 /* 287 * Increase the hugetlb pool such that it can accomodate a reservation 288 * of size 'delta'. 289 */ 290 static int gather_surplus_pages(int delta) 291 { 292 struct list_head surplus_list; 293 struct page *page, *tmp; 294 int ret, i; 295 int needed, allocated; 296 297 needed = (resv_huge_pages + delta) - free_huge_pages; 298 if (needed <= 0) 299 return 0; 300 301 allocated = 0; 302 INIT_LIST_HEAD(&surplus_list); 303 304 ret = -ENOMEM; 305 retry: 306 spin_unlock(&hugetlb_lock); 307 for (i = 0; i < needed; i++) { 308 page = alloc_buddy_huge_page(NULL, 0); 309 if (!page) { 310 /* 311 * We were not able to allocate enough pages to 312 * satisfy the entire reservation so we free what 313 * we've allocated so far. 314 */ 315 spin_lock(&hugetlb_lock); 316 needed = 0; 317 goto free; 318 } 319 320 list_add(&page->lru, &surplus_list); 321 } 322 allocated += needed; 323 324 /* 325 * After retaking hugetlb_lock, we need to recalculate 'needed' 326 * because either resv_huge_pages or free_huge_pages may have changed. 327 */ 328 spin_lock(&hugetlb_lock); 329 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 330 if (needed > 0) 331 goto retry; 332 333 /* 334 * The surplus_list now contains _at_least_ the number of extra pages 335 * needed to accomodate the reservation. Add the appropriate number 336 * of pages to the hugetlb pool and free the extras back to the buddy 337 * allocator. 338 */ 339 needed += allocated; 340 ret = 0; 341 free: 342 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 343 list_del(&page->lru); 344 if ((--needed) >= 0) 345 enqueue_huge_page(page); 346 else { 347 /* 348 * Decrement the refcount and free the page using its 349 * destructor. This must be done with hugetlb_lock 350 * unlocked which is safe because free_huge_page takes 351 * hugetlb_lock before deciding how to free the page. 352 */ 353 spin_unlock(&hugetlb_lock); 354 put_page(page); 355 spin_lock(&hugetlb_lock); 356 } 357 } 358 359 return ret; 360 } 361 362 /* 363 * When releasing a hugetlb pool reservation, any surplus pages that were 364 * allocated to satisfy the reservation must be explicitly freed if they were 365 * never used. 366 */ 367 static void return_unused_surplus_pages(unsigned long unused_resv_pages) 368 { 369 static int nid = -1; 370 struct page *page; 371 unsigned long nr_pages; 372 373 nr_pages = min(unused_resv_pages, surplus_huge_pages); 374 375 while (nr_pages) { 376 nid = next_node(nid, node_online_map); 377 if (nid == MAX_NUMNODES) 378 nid = first_node(node_online_map); 379 380 if (!surplus_huge_pages_node[nid]) 381 continue; 382 383 if (!list_empty(&hugepage_freelists[nid])) { 384 page = list_entry(hugepage_freelists[nid].next, 385 struct page, lru); 386 list_del(&page->lru); 387 update_and_free_page(page); 388 free_huge_pages--; 389 free_huge_pages_node[nid]--; 390 surplus_huge_pages--; 391 surplus_huge_pages_node[nid]--; 392 nr_pages--; 393 } 394 } 395 } 396 397 398 static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 399 unsigned long addr) 400 { 401 struct page *page; 402 403 spin_lock(&hugetlb_lock); 404 page = dequeue_huge_page(vma, addr); 405 spin_unlock(&hugetlb_lock); 406 return page ? page : ERR_PTR(-VM_FAULT_OOM); 407 } 408 409 static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 410 unsigned long addr) 411 { 412 struct page *page = NULL; 413 414 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 415 return ERR_PTR(-VM_FAULT_SIGBUS); 416 417 spin_lock(&hugetlb_lock); 418 if (free_huge_pages > resv_huge_pages) 419 page = dequeue_huge_page(vma, addr); 420 spin_unlock(&hugetlb_lock); 421 if (!page) { 422 page = alloc_buddy_huge_page(vma, addr); 423 if (!page) { 424 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 425 return ERR_PTR(-VM_FAULT_OOM); 426 } 427 } 428 return page; 429 } 430 431 static struct page *alloc_huge_page(struct vm_area_struct *vma, 432 unsigned long addr) 433 { 434 struct page *page; 435 struct address_space *mapping = vma->vm_file->f_mapping; 436 437 if (vma->vm_flags & VM_MAYSHARE) 438 page = alloc_huge_page_shared(vma, addr); 439 else 440 page = alloc_huge_page_private(vma, addr); 441 442 if (!IS_ERR(page)) { 443 set_page_refcounted(page); 444 set_page_private(page, (unsigned long) mapping); 445 } 446 return page; 447 } 448 449 static int __init hugetlb_init(void) 450 { 451 unsigned long i; 452 453 if (HPAGE_SHIFT == 0) 454 return 0; 455 456 for (i = 0; i < MAX_NUMNODES; ++i) 457 INIT_LIST_HEAD(&hugepage_freelists[i]); 458 459 hugetlb_next_nid = first_node(node_online_map); 460 461 for (i = 0; i < max_huge_pages; ++i) { 462 if (!alloc_fresh_huge_page()) 463 break; 464 } 465 max_huge_pages = free_huge_pages = nr_huge_pages = i; 466 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); 467 return 0; 468 } 469 module_init(hugetlb_init); 470 471 static int __init hugetlb_setup(char *s) 472 { 473 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 474 max_huge_pages = 0; 475 return 1; 476 } 477 __setup("hugepages=", hugetlb_setup); 478 479 static unsigned int cpuset_mems_nr(unsigned int *array) 480 { 481 int node; 482 unsigned int nr = 0; 483 484 for_each_node_mask(node, cpuset_current_mems_allowed) 485 nr += array[node]; 486 487 return nr; 488 } 489 490 #ifdef CONFIG_SYSCTL 491 #ifdef CONFIG_HIGHMEM 492 static void try_to_free_low(unsigned long count) 493 { 494 int i; 495 496 for (i = 0; i < MAX_NUMNODES; ++i) { 497 struct page *page, *next; 498 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 499 if (count >= nr_huge_pages) 500 return; 501 if (PageHighMem(page)) 502 continue; 503 list_del(&page->lru); 504 update_and_free_page(page); 505 free_huge_pages--; 506 free_huge_pages_node[page_to_nid(page)]--; 507 } 508 } 509 } 510 #else 511 static inline void try_to_free_low(unsigned long count) 512 { 513 } 514 #endif 515 516 #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 517 static unsigned long set_max_huge_pages(unsigned long count) 518 { 519 unsigned long min_count, ret; 520 521 /* 522 * Increase the pool size 523 * First take pages out of surplus state. Then make up the 524 * remaining difference by allocating fresh huge pages. 525 * 526 * We might race with alloc_buddy_huge_page() here and be unable 527 * to convert a surplus huge page to a normal huge page. That is 528 * not critical, though, it just means the overall size of the 529 * pool might be one hugepage larger than it needs to be, but 530 * within all the constraints specified by the sysctls. 531 */ 532 spin_lock(&hugetlb_lock); 533 while (surplus_huge_pages && count > persistent_huge_pages) { 534 if (!adjust_pool_surplus(-1)) 535 break; 536 } 537 538 while (count > persistent_huge_pages) { 539 int ret; 540 /* 541 * If this allocation races such that we no longer need the 542 * page, free_huge_page will handle it by freeing the page 543 * and reducing the surplus. 544 */ 545 spin_unlock(&hugetlb_lock); 546 ret = alloc_fresh_huge_page(); 547 spin_lock(&hugetlb_lock); 548 if (!ret) 549 goto out; 550 551 } 552 553 /* 554 * Decrease the pool size 555 * First return free pages to the buddy allocator (being careful 556 * to keep enough around to satisfy reservations). Then place 557 * pages into surplus state as needed so the pool will shrink 558 * to the desired size as pages become free. 559 * 560 * By placing pages into the surplus state independent of the 561 * overcommit value, we are allowing the surplus pool size to 562 * exceed overcommit. There are few sane options here. Since 563 * alloc_buddy_huge_page() is checking the global counter, 564 * though, we'll note that we're not allowed to exceed surplus 565 * and won't grow the pool anywhere else. Not until one of the 566 * sysctls are changed, or the surplus pages go out of use. 567 */ 568 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 569 min_count = max(count, min_count); 570 try_to_free_low(min_count); 571 while (min_count < persistent_huge_pages) { 572 struct page *page = dequeue_huge_page(NULL, 0); 573 if (!page) 574 break; 575 update_and_free_page(page); 576 } 577 while (count < persistent_huge_pages) { 578 if (!adjust_pool_surplus(1)) 579 break; 580 } 581 out: 582 ret = persistent_huge_pages; 583 spin_unlock(&hugetlb_lock); 584 return ret; 585 } 586 587 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 588 struct file *file, void __user *buffer, 589 size_t *length, loff_t *ppos) 590 { 591 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 592 max_huge_pages = set_max_huge_pages(max_huge_pages); 593 return 0; 594 } 595 596 int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 597 struct file *file, void __user *buffer, 598 size_t *length, loff_t *ppos) 599 { 600 proc_dointvec(table, write, file, buffer, length, ppos); 601 if (hugepages_treat_as_movable) 602 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 603 else 604 htlb_alloc_mask = GFP_HIGHUSER; 605 return 0; 606 } 607 608 #endif /* CONFIG_SYSCTL */ 609 610 int hugetlb_report_meminfo(char *buf) 611 { 612 return sprintf(buf, 613 "HugePages_Total: %5lu\n" 614 "HugePages_Free: %5lu\n" 615 "HugePages_Rsvd: %5lu\n" 616 "HugePages_Surp: %5lu\n" 617 "Hugepagesize: %5lu kB\n", 618 nr_huge_pages, 619 free_huge_pages, 620 resv_huge_pages, 621 surplus_huge_pages, 622 HPAGE_SIZE/1024); 623 } 624 625 int hugetlb_report_node_meminfo(int nid, char *buf) 626 { 627 return sprintf(buf, 628 "Node %d HugePages_Total: %5u\n" 629 "Node %d HugePages_Free: %5u\n", 630 nid, nr_huge_pages_node[nid], 631 nid, free_huge_pages_node[nid]); 632 } 633 634 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 635 unsigned long hugetlb_total_pages(void) 636 { 637 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 638 } 639 640 /* 641 * We cannot handle pagefaults against hugetlb pages at all. They cause 642 * handle_mm_fault() to try to instantiate regular-sized pages in the 643 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 644 * this far. 645 */ 646 static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 647 { 648 BUG(); 649 return 0; 650 } 651 652 struct vm_operations_struct hugetlb_vm_ops = { 653 .fault = hugetlb_vm_op_fault, 654 }; 655 656 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 657 int writable) 658 { 659 pte_t entry; 660 661 if (writable) { 662 entry = 663 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 664 } else { 665 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 666 } 667 entry = pte_mkyoung(entry); 668 entry = pte_mkhuge(entry); 669 670 return entry; 671 } 672 673 static void set_huge_ptep_writable(struct vm_area_struct *vma, 674 unsigned long address, pte_t *ptep) 675 { 676 pte_t entry; 677 678 entry = pte_mkwrite(pte_mkdirty(*ptep)); 679 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { 680 update_mmu_cache(vma, address, entry); 681 } 682 } 683 684 685 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 686 struct vm_area_struct *vma) 687 { 688 pte_t *src_pte, *dst_pte, entry; 689 struct page *ptepage; 690 unsigned long addr; 691 int cow; 692 693 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 694 695 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 696 src_pte = huge_pte_offset(src, addr); 697 if (!src_pte) 698 continue; 699 dst_pte = huge_pte_alloc(dst, addr); 700 if (!dst_pte) 701 goto nomem; 702 703 /* If the pagetables are shared don't copy or take references */ 704 if (dst_pte == src_pte) 705 continue; 706 707 spin_lock(&dst->page_table_lock); 708 spin_lock(&src->page_table_lock); 709 if (!pte_none(*src_pte)) { 710 if (cow) 711 ptep_set_wrprotect(src, addr, src_pte); 712 entry = *src_pte; 713 ptepage = pte_page(entry); 714 get_page(ptepage); 715 set_huge_pte_at(dst, addr, dst_pte, entry); 716 } 717 spin_unlock(&src->page_table_lock); 718 spin_unlock(&dst->page_table_lock); 719 } 720 return 0; 721 722 nomem: 723 return -ENOMEM; 724 } 725 726 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 727 unsigned long end) 728 { 729 struct mm_struct *mm = vma->vm_mm; 730 unsigned long address; 731 pte_t *ptep; 732 pte_t pte; 733 struct page *page; 734 struct page *tmp; 735 /* 736 * A page gathering list, protected by per file i_mmap_lock. The 737 * lock is used to avoid list corruption from multiple unmapping 738 * of the same page since we are using page->lru. 739 */ 740 LIST_HEAD(page_list); 741 742 WARN_ON(!is_vm_hugetlb_page(vma)); 743 BUG_ON(start & ~HPAGE_MASK); 744 BUG_ON(end & ~HPAGE_MASK); 745 746 spin_lock(&mm->page_table_lock); 747 for (address = start; address < end; address += HPAGE_SIZE) { 748 ptep = huge_pte_offset(mm, address); 749 if (!ptep) 750 continue; 751 752 if (huge_pmd_unshare(mm, &address, ptep)) 753 continue; 754 755 pte = huge_ptep_get_and_clear(mm, address, ptep); 756 if (pte_none(pte)) 757 continue; 758 759 page = pte_page(pte); 760 if (pte_dirty(pte)) 761 set_page_dirty(page); 762 list_add(&page->lru, &page_list); 763 } 764 spin_unlock(&mm->page_table_lock); 765 flush_tlb_range(vma, start, end); 766 list_for_each_entry_safe(page, tmp, &page_list, lru) { 767 list_del(&page->lru); 768 put_page(page); 769 } 770 } 771 772 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 773 unsigned long end) 774 { 775 /* 776 * It is undesirable to test vma->vm_file as it should be non-null 777 * for valid hugetlb area. However, vm_file will be NULL in the error 778 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, 779 * do_mmap_pgoff() nullifies vma->vm_file before calling this function 780 * to clean up. Since no pte has actually been setup, it is safe to 781 * do nothing in this case. 782 */ 783 if (vma->vm_file) { 784 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 785 __unmap_hugepage_range(vma, start, end); 786 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 787 } 788 } 789 790 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 791 unsigned long address, pte_t *ptep, pte_t pte) 792 { 793 struct page *old_page, *new_page; 794 int avoidcopy; 795 796 old_page = pte_page(pte); 797 798 /* If no-one else is actually using this page, avoid the copy 799 * and just make the page writable */ 800 avoidcopy = (page_count(old_page) == 1); 801 if (avoidcopy) { 802 set_huge_ptep_writable(vma, address, ptep); 803 return 0; 804 } 805 806 page_cache_get(old_page); 807 new_page = alloc_huge_page(vma, address); 808 809 if (IS_ERR(new_page)) { 810 page_cache_release(old_page); 811 return -PTR_ERR(new_page); 812 } 813 814 spin_unlock(&mm->page_table_lock); 815 copy_huge_page(new_page, old_page, address, vma); 816 spin_lock(&mm->page_table_lock); 817 818 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 819 if (likely(pte_same(*ptep, pte))) { 820 /* Break COW */ 821 set_huge_pte_at(mm, address, ptep, 822 make_huge_pte(vma, new_page, 1)); 823 /* Make the old page be freed below */ 824 new_page = old_page; 825 } 826 page_cache_release(new_page); 827 page_cache_release(old_page); 828 return 0; 829 } 830 831 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 832 unsigned long address, pte_t *ptep, int write_access) 833 { 834 int ret = VM_FAULT_SIGBUS; 835 unsigned long idx; 836 unsigned long size; 837 struct page *page; 838 struct address_space *mapping; 839 pte_t new_pte; 840 841 mapping = vma->vm_file->f_mapping; 842 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 843 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); 844 845 /* 846 * Use page lock to guard against racing truncation 847 * before we get page_table_lock. 848 */ 849 retry: 850 page = find_lock_page(mapping, idx); 851 if (!page) { 852 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 853 if (idx >= size) 854 goto out; 855 page = alloc_huge_page(vma, address); 856 if (IS_ERR(page)) { 857 ret = -PTR_ERR(page); 858 goto out; 859 } 860 clear_huge_page(page, address); 861 862 if (vma->vm_flags & VM_SHARED) { 863 int err; 864 struct inode *inode = mapping->host; 865 866 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 867 if (err) { 868 put_page(page); 869 if (err == -EEXIST) 870 goto retry; 871 goto out; 872 } 873 874 spin_lock(&inode->i_lock); 875 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 876 spin_unlock(&inode->i_lock); 877 } else 878 lock_page(page); 879 } 880 881 spin_lock(&mm->page_table_lock); 882 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 883 if (idx >= size) 884 goto backout; 885 886 ret = 0; 887 if (!pte_none(*ptep)) 888 goto backout; 889 890 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 891 && (vma->vm_flags & VM_SHARED))); 892 set_huge_pte_at(mm, address, ptep, new_pte); 893 894 if (write_access && !(vma->vm_flags & VM_SHARED)) { 895 /* Optimization, do the COW without a second fault */ 896 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 897 } 898 899 spin_unlock(&mm->page_table_lock); 900 unlock_page(page); 901 out: 902 return ret; 903 904 backout: 905 spin_unlock(&mm->page_table_lock); 906 unlock_page(page); 907 put_page(page); 908 goto out; 909 } 910 911 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 912 unsigned long address, int write_access) 913 { 914 pte_t *ptep; 915 pte_t entry; 916 int ret; 917 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 918 919 ptep = huge_pte_alloc(mm, address); 920 if (!ptep) 921 return VM_FAULT_OOM; 922 923 /* 924 * Serialize hugepage allocation and instantiation, so that we don't 925 * get spurious allocation failures if two CPUs race to instantiate 926 * the same page in the page cache. 927 */ 928 mutex_lock(&hugetlb_instantiation_mutex); 929 entry = *ptep; 930 if (pte_none(entry)) { 931 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 932 mutex_unlock(&hugetlb_instantiation_mutex); 933 return ret; 934 } 935 936 ret = 0; 937 938 spin_lock(&mm->page_table_lock); 939 /* Check for a racing update before calling hugetlb_cow */ 940 if (likely(pte_same(entry, *ptep))) 941 if (write_access && !pte_write(entry)) 942 ret = hugetlb_cow(mm, vma, address, ptep, entry); 943 spin_unlock(&mm->page_table_lock); 944 mutex_unlock(&hugetlb_instantiation_mutex); 945 946 return ret; 947 } 948 949 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 950 struct page **pages, struct vm_area_struct **vmas, 951 unsigned long *position, int *length, int i, 952 int write) 953 { 954 unsigned long pfn_offset; 955 unsigned long vaddr = *position; 956 int remainder = *length; 957 958 spin_lock(&mm->page_table_lock); 959 while (vaddr < vma->vm_end && remainder) { 960 pte_t *pte; 961 struct page *page; 962 963 /* 964 * Some archs (sparc64, sh*) have multiple pte_ts to 965 * each hugepage. We have to make * sure we get the 966 * first, for the page indexing below to work. 967 */ 968 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 969 970 if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) { 971 int ret; 972 973 spin_unlock(&mm->page_table_lock); 974 ret = hugetlb_fault(mm, vma, vaddr, write); 975 spin_lock(&mm->page_table_lock); 976 if (!(ret & VM_FAULT_ERROR)) 977 continue; 978 979 remainder = 0; 980 if (!i) 981 i = -EFAULT; 982 break; 983 } 984 985 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 986 page = pte_page(*pte); 987 same_page: 988 if (pages) { 989 get_page(page); 990 pages[i] = page + pfn_offset; 991 } 992 993 if (vmas) 994 vmas[i] = vma; 995 996 vaddr += PAGE_SIZE; 997 ++pfn_offset; 998 --remainder; 999 ++i; 1000 if (vaddr < vma->vm_end && remainder && 1001 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 1002 /* 1003 * We use pfn_offset to avoid touching the pageframes 1004 * of this compound page. 1005 */ 1006 goto same_page; 1007 } 1008 } 1009 spin_unlock(&mm->page_table_lock); 1010 *length = remainder; 1011 *position = vaddr; 1012 1013 return i; 1014 } 1015 1016 void hugetlb_change_protection(struct vm_area_struct *vma, 1017 unsigned long address, unsigned long end, pgprot_t newprot) 1018 { 1019 struct mm_struct *mm = vma->vm_mm; 1020 unsigned long start = address; 1021 pte_t *ptep; 1022 pte_t pte; 1023 1024 BUG_ON(address >= end); 1025 flush_cache_range(vma, address, end); 1026 1027 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1028 spin_lock(&mm->page_table_lock); 1029 for (; address < end; address += HPAGE_SIZE) { 1030 ptep = huge_pte_offset(mm, address); 1031 if (!ptep) 1032 continue; 1033 if (huge_pmd_unshare(mm, &address, ptep)) 1034 continue; 1035 if (!pte_none(*ptep)) { 1036 pte = huge_ptep_get_and_clear(mm, address, ptep); 1037 pte = pte_mkhuge(pte_modify(pte, newprot)); 1038 set_huge_pte_at(mm, address, ptep, pte); 1039 } 1040 } 1041 spin_unlock(&mm->page_table_lock); 1042 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1043 1044 flush_tlb_range(vma, start, end); 1045 } 1046 1047 struct file_region { 1048 struct list_head link; 1049 long from; 1050 long to; 1051 }; 1052 1053 static long region_add(struct list_head *head, long f, long t) 1054 { 1055 struct file_region *rg, *nrg, *trg; 1056 1057 /* Locate the region we are either in or before. */ 1058 list_for_each_entry(rg, head, link) 1059 if (f <= rg->to) 1060 break; 1061 1062 /* Round our left edge to the current segment if it encloses us. */ 1063 if (f > rg->from) 1064 f = rg->from; 1065 1066 /* Check for and consume any regions we now overlap with. */ 1067 nrg = rg; 1068 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 1069 if (&rg->link == head) 1070 break; 1071 if (rg->from > t) 1072 break; 1073 1074 /* If this area reaches higher then extend our area to 1075 * include it completely. If this is not the first area 1076 * which we intend to reuse, free it. */ 1077 if (rg->to > t) 1078 t = rg->to; 1079 if (rg != nrg) { 1080 list_del(&rg->link); 1081 kfree(rg); 1082 } 1083 } 1084 nrg->from = f; 1085 nrg->to = t; 1086 return 0; 1087 } 1088 1089 static long region_chg(struct list_head *head, long f, long t) 1090 { 1091 struct file_region *rg, *nrg; 1092 long chg = 0; 1093 1094 /* Locate the region we are before or in. */ 1095 list_for_each_entry(rg, head, link) 1096 if (f <= rg->to) 1097 break; 1098 1099 /* If we are below the current region then a new region is required. 1100 * Subtle, allocate a new region at the position but make it zero 1101 * size such that we can guarantee to record the reservation. */ 1102 if (&rg->link == head || t < rg->from) { 1103 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 1104 if (!nrg) 1105 return -ENOMEM; 1106 nrg->from = f; 1107 nrg->to = f; 1108 INIT_LIST_HEAD(&nrg->link); 1109 list_add(&nrg->link, rg->link.prev); 1110 1111 return t - f; 1112 } 1113 1114 /* Round our left edge to the current segment if it encloses us. */ 1115 if (f > rg->from) 1116 f = rg->from; 1117 chg = t - f; 1118 1119 /* Check for and consume any regions we now overlap with. */ 1120 list_for_each_entry(rg, rg->link.prev, link) { 1121 if (&rg->link == head) 1122 break; 1123 if (rg->from > t) 1124 return chg; 1125 1126 /* We overlap with this area, if it extends futher than 1127 * us then we must extend ourselves. Account for its 1128 * existing reservation. */ 1129 if (rg->to > t) { 1130 chg += rg->to - t; 1131 t = rg->to; 1132 } 1133 chg -= rg->to - rg->from; 1134 } 1135 return chg; 1136 } 1137 1138 static long region_truncate(struct list_head *head, long end) 1139 { 1140 struct file_region *rg, *trg; 1141 long chg = 0; 1142 1143 /* Locate the region we are either in or before. */ 1144 list_for_each_entry(rg, head, link) 1145 if (end <= rg->to) 1146 break; 1147 if (&rg->link == head) 1148 return 0; 1149 1150 /* If we are in the middle of a region then adjust it. */ 1151 if (end > rg->from) { 1152 chg = rg->to - end; 1153 rg->to = end; 1154 rg = list_entry(rg->link.next, typeof(*rg), link); 1155 } 1156 1157 /* Drop any remaining regions. */ 1158 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 1159 if (&rg->link == head) 1160 break; 1161 chg += rg->to - rg->from; 1162 list_del(&rg->link); 1163 kfree(rg); 1164 } 1165 return chg; 1166 } 1167 1168 static int hugetlb_acct_memory(long delta) 1169 { 1170 int ret = -ENOMEM; 1171 1172 spin_lock(&hugetlb_lock); 1173 /* 1174 * When cpuset is configured, it breaks the strict hugetlb page 1175 * reservation as the accounting is done on a global variable. Such 1176 * reservation is completely rubbish in the presence of cpuset because 1177 * the reservation is not checked against page availability for the 1178 * current cpuset. Application can still potentially OOM'ed by kernel 1179 * with lack of free htlb page in cpuset that the task is in. 1180 * Attempt to enforce strict accounting with cpuset is almost 1181 * impossible (or too ugly) because cpuset is too fluid that 1182 * task or memory node can be dynamically moved between cpusets. 1183 * 1184 * The change of semantics for shared hugetlb mapping with cpuset is 1185 * undesirable. However, in order to preserve some of the semantics, 1186 * we fall back to check against current free page availability as 1187 * a best attempt and hopefully to minimize the impact of changing 1188 * semantics that cpuset has. 1189 */ 1190 if (delta > 0) { 1191 if (gather_surplus_pages(delta) < 0) 1192 goto out; 1193 1194 if (delta > cpuset_mems_nr(free_huge_pages_node)) 1195 goto out; 1196 } 1197 1198 ret = 0; 1199 resv_huge_pages += delta; 1200 if (delta < 0) 1201 return_unused_surplus_pages((unsigned long) -delta); 1202 1203 out: 1204 spin_unlock(&hugetlb_lock); 1205 return ret; 1206 } 1207 1208 int hugetlb_reserve_pages(struct inode *inode, long from, long to) 1209 { 1210 long ret, chg; 1211 1212 chg = region_chg(&inode->i_mapping->private_list, from, to); 1213 if (chg < 0) 1214 return chg; 1215 1216 if (hugetlb_get_quota(inode->i_mapping, chg)) 1217 return -ENOSPC; 1218 ret = hugetlb_acct_memory(chg); 1219 if (ret < 0) { 1220 hugetlb_put_quota(inode->i_mapping, chg); 1221 return ret; 1222 } 1223 region_add(&inode->i_mapping->private_list, from, to); 1224 return 0; 1225 } 1226 1227 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 1228 { 1229 long chg = region_truncate(&inode->i_mapping->private_list, offset); 1230 1231 spin_lock(&inode->i_lock); 1232 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 1233 spin_unlock(&inode->i_lock); 1234 1235 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 1236 hugetlb_acct_memory(-(chg - freed)); 1237 } 1238