/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(void)
{
	int nid;
	struct page *page = NULL;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

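/*
 * Worked example of the counters above (illustrative numbers only):
 * with free_huge_pages == 4 and resv_huge_pages == 3, a fault on a
 * shared (VM_MAYSHARE) mapping consumes a reservation made earlier via
 * hugetlb_reserve_pages(), so dequeue_huge_page_vma() drops
 * resv_huge_pages to 2.  A private mapping must not eat into the
 * reserve, so alloc_huge_page_private() below only dequeues while
 * free_huge_pages > resv_huge_pages, i.e. at most one page here.
 */
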
static void update_and_free_page(struct page *page)
{
	int i;

	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

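/*
 * Note on the put_page() in alloc_fresh_huge_page_node(): because the
 * compound destructor has been set to free_huge_page(), dropping the
 * buddy allocator's reference routes the fresh page through
 * free_huge_page() and onto hugepage_freelists via enqueue_huge_page(),
 * rather than back to the buddy allocator.
 */
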
static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	return ret;
}

static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
	}
	spin_unlock(&hugetlb_lock);

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0) {
		resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	resv_huge_pages += delta;
	ret = 0;
free:
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		list_del(&page->lru);
		if ((--needed) >= 0)
			enqueue_huge_page(page);
		else {
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			spin_unlock(&hugetlb_lock);
			free_huge_page(page);
			spin_lock(&hugetlb_lock);
		}
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (remaining_iterations-- && nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
			remaining_iterations = num_online_nodes();
		}
	}
}

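/*
 * Worked example of the surplus machinery (illustrative numbers only):
 * with free_huge_pages == 1, resv_huge_pages == 0 and a new reservation
 * of delta == 3, gather_surplus_pages() needs two extra pages, allocates
 * them with alloc_buddy_huge_page() (raising surplus_huge_pages to 2),
 * enqueues them and commits resv_huge_pages = 3.  If the reservation is
 * later released unused, return_unused_surplus_pages() hands the free
 * surplus pages back to the buddy allocator; a surplus page still in use
 * is instead returned later through free_huge_page(), which frees it to
 * the buddy allocator while its node still shows a surplus.
 */
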
static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page ? page : ERR_PTR(-VM_FAULT_OOM);
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
		return ERR_PTR(-VM_FAULT_SIGBUS);

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page) {
		page = alloc_buddy_huge_page(vma, addr);
		if (!page) {
			hugetlb_put_quota(vma->vm_file->f_mapping, 1);
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}
	return page;
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);

	if (!IS_ERR(page)) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) mapping);
	}
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls are changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	spin_lock(&hugetlb_lock);
	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
	spin_unlock(&hugetlb_lock);
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n"
		"Node %d HugePages_Surp:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid],
		nid, surplus_huge_pages_node[nid]);
}

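/*
 * Example of the block emitted by hugetlb_report_meminfo() into
 * /proc/meminfo (illustrative values, assuming a 2048 kB huge page):
 *
 *	HugePages_Total:    10
 *	HugePages_Free:      8
 *	HugePages_Rsvd:      2
 *	HugePages_Surp:      0
 *	Hugepagesize:     2048 kB
 */
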
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

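/*
 * Sketch of the fork()-time COW setup in copy_hugetlb_page_range(): for
 * a private writable mapping (cow != 0) the parent's PTE is
 * write-protected before being copied into the child, so the first
 * write from either process takes a hugetlb fault and goes through
 * hugetlb_cow() below instead of scribbling on the shared page.
 */
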
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);
		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);
		__SetPageUptodate(page);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

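/*
 * Locking order on the fault path above: hugetlb_instantiation_mutex is
 * taken first, then the page lock inside hugetlb_no_page() (via
 * find_lock_page()/lock_page()), then mm->page_table_lock.
 * hugetlb_cow() drops page_table_lock around the page copy and looks the
 * PTE up again afterwards, so a concurrent update is re-checked with
 * pte_same() before the COW is committed.
 */
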
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

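/*
 * Worked example of the region map (offsets as passed to
 * hugetlb_reserve_pages() below): starting from an empty list,
 * region_chg(head, 0, 4) returns 4 and leaves a zero-sized placeholder;
 * region_add(head, 0, 4) turns it into the single region [0, 4).
 * A later region_chg(head, 2, 6) overlaps that region and returns 2,
 * the number of pages not yet reserved, and region_add(head, 2, 6)
 * merges everything into one region [0, 6).
 */
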
static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. An application can still be OOM-killed by the
	 * kernel for lack of free hugetlb pages in the cpuset the task is in.
	 * Attempting to enforce strict accounting with cpuset is almost
	 * impossible (or too ugly) because cpusets are too fluid: tasks and
	 * memory nodes can be moved between cpusets dynamically.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
			return_unused_surplus_pages(delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0) {
		hugetlb_put_quota(inode->i_mapping, chg);
		return ret;
	}
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}
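
/*
 * End-to-end sketch of how the pieces above fit together for a shared
 * mapping (the hugetlbfs callers are assumed here, not shown in this
 * file):
 *
 *	mmap of a hugetlbfs file
 *	  -> hugetlb_reserve_pages(): region_chg(), hugetlb_acct_memory()
 *	     (gather_surplus_pages() if the free pool is short), region_add()
 *	first touch of a huge page
 *	  -> hugetlb_fault() -> hugetlb_no_page() -> alloc_huge_page():
 *	     dequeue_huge_page_vma() hands out a page and consumes one
 *	     reservation (resv_huge_pages--)
 *	truncate of the file
 *	  -> hugetlb_unreserve_pages(): region_truncate(), then
 *	     hugetlb_acct_memory() with the outstanding count negated
 */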