1 /* 2 * Generic hugetlb support. 3 * (C) Nadia Yvette Chambers, April 2004 4 */ 5 #include <linux/list.h> 6 #include <linux/init.h> 7 #include <linux/mm.h> 8 #include <linux/seq_file.h> 9 #include <linux/sysctl.h> 10 #include <linux/highmem.h> 11 #include <linux/mmu_notifier.h> 12 #include <linux/nodemask.h> 13 #include <linux/pagemap.h> 14 #include <linux/mempolicy.h> 15 #include <linux/compiler.h> 16 #include <linux/cpuset.h> 17 #include <linux/mutex.h> 18 #include <linux/bootmem.h> 19 #include <linux/sysfs.h> 20 #include <linux/slab.h> 21 #include <linux/sched/signal.h> 22 #include <linux/rmap.h> 23 #include <linux/swap.h> 24 #include <linux/swapops.h> 25 #include <linux/page-isolation.h> 26 #include <linux/jhash.h> 27 28 #include <asm/page.h> 29 #include <asm/pgtable.h> 30 #include <asm/tlb.h> 31 32 #include <linux/io.h> 33 #include <linux/hugetlb.h> 34 #include <linux/hugetlb_cgroup.h> 35 #include <linux/node.h> 36 #include <linux/userfaultfd_k.h> 37 #include "internal.h" 38 39 int hugepages_treat_as_movable; 40 41 int hugetlb_max_hstate __read_mostly; 42 unsigned int default_hstate_idx; 43 struct hstate hstates[HUGE_MAX_HSTATE]; 44 /* 45 * Minimum page order among possible hugepage sizes, set to a proper value 46 * at boot time. 47 */ 48 static unsigned int minimum_order __read_mostly = UINT_MAX; 49 50 __initdata LIST_HEAD(huge_boot_pages); 51 52 /* for command line parsing */ 53 static struct hstate * __initdata parsed_hstate; 54 static unsigned long __initdata default_hstate_max_huge_pages; 55 static unsigned long __initdata default_hstate_size; 56 static bool __initdata parsed_valid_hugepagesz = true; 57 58 /* 59 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, 60 * free_huge_pages, and surplus_huge_pages. 61 */ 62 DEFINE_SPINLOCK(hugetlb_lock); 63 64 /* 65 * Serializes faults on the same logical page. This is used to 66 * prevent spurious OOMs when the hugepage pool is fully utilized. 67 */ 68 static int num_fault_mutexes; 69 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; 70 71 /* Forward declaration */ 72 static int hugetlb_acct_memory(struct hstate *h, long delta); 73 74 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 75 { 76 bool free = (spool->count == 0) && (spool->used_hpages == 0); 77 78 spin_unlock(&spool->lock); 79 80 /* If no pages are used, and no other handles to the subpool 81 * remain, give up any reservations mased on minimum size and 82 * free the subpool */ 83 if (free) { 84 if (spool->min_hpages != -1) 85 hugetlb_acct_memory(spool->hstate, 86 -spool->min_hpages); 87 kfree(spool); 88 } 89 } 90 91 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, 92 long min_hpages) 93 { 94 struct hugepage_subpool *spool; 95 96 spool = kzalloc(sizeof(*spool), GFP_KERNEL); 97 if (!spool) 98 return NULL; 99 100 spin_lock_init(&spool->lock); 101 spool->count = 1; 102 spool->max_hpages = max_hpages; 103 spool->hstate = h; 104 spool->min_hpages = min_hpages; 105 106 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { 107 kfree(spool); 108 return NULL; 109 } 110 spool->rsv_hpages = min_hpages; 111 112 return spool; 113 } 114 115 void hugepage_put_subpool(struct hugepage_subpool *spool) 116 { 117 spin_lock(&spool->lock); 118 BUG_ON(!spool->count); 119 spool->count--; 120 unlock_or_release_subpool(spool); 121 } 122 123 /* 124 * Subpool accounting for allocating and reserving pages. 125 * Return -ENOMEM if there are not enough resources to satisfy the 126 * the request. Otherwise, return the number of pages by which the 127 * global pools must be adjusted (upward). The returned value may 128 * only be different than the passed value (delta) in the case where 129 * a subpool minimum size must be manitained. 130 */ 131 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, 132 long delta) 133 { 134 long ret = delta; 135 136 if (!spool) 137 return ret; 138 139 spin_lock(&spool->lock); 140 141 if (spool->max_hpages != -1) { /* maximum size accounting */ 142 if ((spool->used_hpages + delta) <= spool->max_hpages) 143 spool->used_hpages += delta; 144 else { 145 ret = -ENOMEM; 146 goto unlock_ret; 147 } 148 } 149 150 /* minimum size accounting */ 151 if (spool->min_hpages != -1 && spool->rsv_hpages) { 152 if (delta > spool->rsv_hpages) { 153 /* 154 * Asking for more reserves than those already taken on 155 * behalf of subpool. Return difference. 156 */ 157 ret = delta - spool->rsv_hpages; 158 spool->rsv_hpages = 0; 159 } else { 160 ret = 0; /* reserves already accounted for */ 161 spool->rsv_hpages -= delta; 162 } 163 } 164 165 unlock_ret: 166 spin_unlock(&spool->lock); 167 return ret; 168 } 169 170 /* 171 * Subpool accounting for freeing and unreserving pages. 172 * Return the number of global page reservations that must be dropped. 173 * The return value may only be different than the passed value (delta) 174 * in the case where a subpool minimum size must be maintained. 175 */ 176 static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, 177 long delta) 178 { 179 long ret = delta; 180 181 if (!spool) 182 return delta; 183 184 spin_lock(&spool->lock); 185 186 if (spool->max_hpages != -1) /* maximum size accounting */ 187 spool->used_hpages -= delta; 188 189 /* minimum size accounting */ 190 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { 191 if (spool->rsv_hpages + delta <= spool->min_hpages) 192 ret = 0; 193 else 194 ret = spool->rsv_hpages + delta - spool->min_hpages; 195 196 spool->rsv_hpages += delta; 197 if (spool->rsv_hpages > spool->min_hpages) 198 spool->rsv_hpages = spool->min_hpages; 199 } 200 201 /* 202 * If hugetlbfs_put_super couldn't free spool due to an outstanding 203 * quota reference, free it now. 204 */ 205 unlock_or_release_subpool(spool); 206 207 return ret; 208 } 209 210 static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 211 { 212 return HUGETLBFS_SB(inode->i_sb)->spool; 213 } 214 215 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 216 { 217 return subpool_inode(file_inode(vma->vm_file)); 218 } 219 220 /* 221 * Region tracking -- allows tracking of reservations and instantiated pages 222 * across the pages in a mapping. 223 * 224 * The region data structures are embedded into a resv_map and protected 225 * by a resv_map's lock. The set of regions within the resv_map represent 226 * reservations for huge pages, or huge pages that have already been 227 * instantiated within the map. The from and to elements are huge page 228 * indicies into the associated mapping. from indicates the starting index 229 * of the region. to represents the first index past the end of the region. 230 * 231 * For example, a file region structure with from == 0 and to == 4 represents 232 * four huge pages in a mapping. It is important to note that the to element 233 * represents the first element past the end of the region. This is used in 234 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. 235 * 236 * Interval notation of the form [from, to) will be used to indicate that 237 * the endpoint from is inclusive and to is exclusive. 238 */ 239 struct file_region { 240 struct list_head link; 241 long from; 242 long to; 243 }; 244 245 /* 246 * Add the huge page range represented by [f, t) to the reserve 247 * map. In the normal case, existing regions will be expanded 248 * to accommodate the specified range. Sufficient regions should 249 * exist for expansion due to the previous call to region_chg 250 * with the same range. However, it is possible that region_del 251 * could have been called after region_chg and modifed the map 252 * in such a way that no region exists to be expanded. In this 253 * case, pull a region descriptor from the cache associated with 254 * the map and use that for the new range. 255 * 256 * Return the number of new huge pages added to the map. This 257 * number is greater than or equal to zero. 258 */ 259 static long region_add(struct resv_map *resv, long f, long t) 260 { 261 struct list_head *head = &resv->regions; 262 struct file_region *rg, *nrg, *trg; 263 long add = 0; 264 265 spin_lock(&resv->lock); 266 /* Locate the region we are either in or before. */ 267 list_for_each_entry(rg, head, link) 268 if (f <= rg->to) 269 break; 270 271 /* 272 * If no region exists which can be expanded to include the 273 * specified range, the list must have been modified by an 274 * interleving call to region_del(). Pull a region descriptor 275 * from the cache and use it for this range. 276 */ 277 if (&rg->link == head || t < rg->from) { 278 VM_BUG_ON(resv->region_cache_count <= 0); 279 280 resv->region_cache_count--; 281 nrg = list_first_entry(&resv->region_cache, struct file_region, 282 link); 283 list_del(&nrg->link); 284 285 nrg->from = f; 286 nrg->to = t; 287 list_add(&nrg->link, rg->link.prev); 288 289 add += t - f; 290 goto out_locked; 291 } 292 293 /* Round our left edge to the current segment if it encloses us. */ 294 if (f > rg->from) 295 f = rg->from; 296 297 /* Check for and consume any regions we now overlap with. */ 298 nrg = rg; 299 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 300 if (&rg->link == head) 301 break; 302 if (rg->from > t) 303 break; 304 305 /* If this area reaches higher then extend our area to 306 * include it completely. If this is not the first area 307 * which we intend to reuse, free it. */ 308 if (rg->to > t) 309 t = rg->to; 310 if (rg != nrg) { 311 /* Decrement return value by the deleted range. 312 * Another range will span this area so that by 313 * end of routine add will be >= zero 314 */ 315 add -= (rg->to - rg->from); 316 list_del(&rg->link); 317 kfree(rg); 318 } 319 } 320 321 add += (nrg->from - f); /* Added to beginning of region */ 322 nrg->from = f; 323 add += t - nrg->to; /* Added to end of region */ 324 nrg->to = t; 325 326 out_locked: 327 resv->adds_in_progress--; 328 spin_unlock(&resv->lock); 329 VM_BUG_ON(add < 0); 330 return add; 331 } 332 333 /* 334 * Examine the existing reserve map and determine how many 335 * huge pages in the specified range [f, t) are NOT currently 336 * represented. This routine is called before a subsequent 337 * call to region_add that will actually modify the reserve 338 * map to add the specified range [f, t). region_chg does 339 * not change the number of huge pages represented by the 340 * map. However, if the existing regions in the map can not 341 * be expanded to represent the new range, a new file_region 342 * structure is added to the map as a placeholder. This is 343 * so that the subsequent region_add call will have all the 344 * regions it needs and will not fail. 345 * 346 * Upon entry, region_chg will also examine the cache of region descriptors 347 * associated with the map. If there are not enough descriptors cached, one 348 * will be allocated for the in progress add operation. 349 * 350 * Returns the number of huge pages that need to be added to the existing 351 * reservation map for the range [f, t). This number is greater or equal to 352 * zero. -ENOMEM is returned if a new file_region structure or cache entry 353 * is needed and can not be allocated. 354 */ 355 static long region_chg(struct resv_map *resv, long f, long t) 356 { 357 struct list_head *head = &resv->regions; 358 struct file_region *rg, *nrg = NULL; 359 long chg = 0; 360 361 retry: 362 spin_lock(&resv->lock); 363 retry_locked: 364 resv->adds_in_progress++; 365 366 /* 367 * Check for sufficient descriptors in the cache to accommodate 368 * the number of in progress add operations. 369 */ 370 if (resv->adds_in_progress > resv->region_cache_count) { 371 struct file_region *trg; 372 373 VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1); 374 /* Must drop lock to allocate a new descriptor. */ 375 resv->adds_in_progress--; 376 spin_unlock(&resv->lock); 377 378 trg = kmalloc(sizeof(*trg), GFP_KERNEL); 379 if (!trg) { 380 kfree(nrg); 381 return -ENOMEM; 382 } 383 384 spin_lock(&resv->lock); 385 list_add(&trg->link, &resv->region_cache); 386 resv->region_cache_count++; 387 goto retry_locked; 388 } 389 390 /* Locate the region we are before or in. */ 391 list_for_each_entry(rg, head, link) 392 if (f <= rg->to) 393 break; 394 395 /* If we are below the current region then a new region is required. 396 * Subtle, allocate a new region at the position but make it zero 397 * size such that we can guarantee to record the reservation. */ 398 if (&rg->link == head || t < rg->from) { 399 if (!nrg) { 400 resv->adds_in_progress--; 401 spin_unlock(&resv->lock); 402 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 403 if (!nrg) 404 return -ENOMEM; 405 406 nrg->from = f; 407 nrg->to = f; 408 INIT_LIST_HEAD(&nrg->link); 409 goto retry; 410 } 411 412 list_add(&nrg->link, rg->link.prev); 413 chg = t - f; 414 goto out_nrg; 415 } 416 417 /* Round our left edge to the current segment if it encloses us. */ 418 if (f > rg->from) 419 f = rg->from; 420 chg = t - f; 421 422 /* Check for and consume any regions we now overlap with. */ 423 list_for_each_entry(rg, rg->link.prev, link) { 424 if (&rg->link == head) 425 break; 426 if (rg->from > t) 427 goto out; 428 429 /* We overlap with this area, if it extends further than 430 * us then we must extend ourselves. Account for its 431 * existing reservation. */ 432 if (rg->to > t) { 433 chg += rg->to - t; 434 t = rg->to; 435 } 436 chg -= rg->to - rg->from; 437 } 438 439 out: 440 spin_unlock(&resv->lock); 441 /* We already know we raced and no longer need the new region */ 442 kfree(nrg); 443 return chg; 444 out_nrg: 445 spin_unlock(&resv->lock); 446 return chg; 447 } 448 449 /* 450 * Abort the in progress add operation. The adds_in_progress field 451 * of the resv_map keeps track of the operations in progress between 452 * calls to region_chg and region_add. Operations are sometimes 453 * aborted after the call to region_chg. In such cases, region_abort 454 * is called to decrement the adds_in_progress counter. 455 * 456 * NOTE: The range arguments [f, t) are not needed or used in this 457 * routine. They are kept to make reading the calling code easier as 458 * arguments will match the associated region_chg call. 459 */ 460 static void region_abort(struct resv_map *resv, long f, long t) 461 { 462 spin_lock(&resv->lock); 463 VM_BUG_ON(!resv->region_cache_count); 464 resv->adds_in_progress--; 465 spin_unlock(&resv->lock); 466 } 467 468 /* 469 * Delete the specified range [f, t) from the reserve map. If the 470 * t parameter is LONG_MAX, this indicates that ALL regions after f 471 * should be deleted. Locate the regions which intersect [f, t) 472 * and either trim, delete or split the existing regions. 473 * 474 * Returns the number of huge pages deleted from the reserve map. 475 * In the normal case, the return value is zero or more. In the 476 * case where a region must be split, a new region descriptor must 477 * be allocated. If the allocation fails, -ENOMEM will be returned. 478 * NOTE: If the parameter t == LONG_MAX, then we will never split 479 * a region and possibly return -ENOMEM. Callers specifying 480 * t == LONG_MAX do not need to check for -ENOMEM error. 481 */ 482 static long region_del(struct resv_map *resv, long f, long t) 483 { 484 struct list_head *head = &resv->regions; 485 struct file_region *rg, *trg; 486 struct file_region *nrg = NULL; 487 long del = 0; 488 489 retry: 490 spin_lock(&resv->lock); 491 list_for_each_entry_safe(rg, trg, head, link) { 492 /* 493 * Skip regions before the range to be deleted. file_region 494 * ranges are normally of the form [from, to). However, there 495 * may be a "placeholder" entry in the map which is of the form 496 * (from, to) with from == to. Check for placeholder entries 497 * at the beginning of the range to be deleted. 498 */ 499 if (rg->to <= f && (rg->to != rg->from || rg->to != f)) 500 continue; 501 502 if (rg->from >= t) 503 break; 504 505 if (f > rg->from && t < rg->to) { /* Must split region */ 506 /* 507 * Check for an entry in the cache before dropping 508 * lock and attempting allocation. 509 */ 510 if (!nrg && 511 resv->region_cache_count > resv->adds_in_progress) { 512 nrg = list_first_entry(&resv->region_cache, 513 struct file_region, 514 link); 515 list_del(&nrg->link); 516 resv->region_cache_count--; 517 } 518 519 if (!nrg) { 520 spin_unlock(&resv->lock); 521 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 522 if (!nrg) 523 return -ENOMEM; 524 goto retry; 525 } 526 527 del += t - f; 528 529 /* New entry for end of split region */ 530 nrg->from = t; 531 nrg->to = rg->to; 532 INIT_LIST_HEAD(&nrg->link); 533 534 /* Original entry is trimmed */ 535 rg->to = f; 536 537 list_add(&nrg->link, &rg->link); 538 nrg = NULL; 539 break; 540 } 541 542 if (f <= rg->from && t >= rg->to) { /* Remove entire region */ 543 del += rg->to - rg->from; 544 list_del(&rg->link); 545 kfree(rg); 546 continue; 547 } 548 549 if (f <= rg->from) { /* Trim beginning of region */ 550 del += t - rg->from; 551 rg->from = t; 552 } else { /* Trim end of region */ 553 del += rg->to - f; 554 rg->to = f; 555 } 556 } 557 558 spin_unlock(&resv->lock); 559 kfree(nrg); 560 return del; 561 } 562 563 /* 564 * A rare out of memory error was encountered which prevented removal of 565 * the reserve map region for a page. The huge page itself was free'ed 566 * and removed from the page cache. This routine will adjust the subpool 567 * usage count, and the global reserve count if needed. By incrementing 568 * these counts, the reserve map entry which could not be deleted will 569 * appear as a "reserved" entry instead of simply dangling with incorrect 570 * counts. 571 */ 572 void hugetlb_fix_reserve_counts(struct inode *inode) 573 { 574 struct hugepage_subpool *spool = subpool_inode(inode); 575 long rsv_adjust; 576 577 rsv_adjust = hugepage_subpool_get_pages(spool, 1); 578 if (rsv_adjust) { 579 struct hstate *h = hstate_inode(inode); 580 581 hugetlb_acct_memory(h, 1); 582 } 583 } 584 585 /* 586 * Count and return the number of huge pages in the reserve map 587 * that intersect with the range [f, t). 588 */ 589 static long region_count(struct resv_map *resv, long f, long t) 590 { 591 struct list_head *head = &resv->regions; 592 struct file_region *rg; 593 long chg = 0; 594 595 spin_lock(&resv->lock); 596 /* Locate each segment we overlap with, and count that overlap. */ 597 list_for_each_entry(rg, head, link) { 598 long seg_from; 599 long seg_to; 600 601 if (rg->to <= f) 602 continue; 603 if (rg->from >= t) 604 break; 605 606 seg_from = max(rg->from, f); 607 seg_to = min(rg->to, t); 608 609 chg += seg_to - seg_from; 610 } 611 spin_unlock(&resv->lock); 612 613 return chg; 614 } 615 616 /* 617 * Convert the address within this vma to the page offset within 618 * the mapping, in pagecache page units; huge pages here. 619 */ 620 static pgoff_t vma_hugecache_offset(struct hstate *h, 621 struct vm_area_struct *vma, unsigned long address) 622 { 623 return ((address - vma->vm_start) >> huge_page_shift(h)) + 624 (vma->vm_pgoff >> huge_page_order(h)); 625 } 626 627 pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 628 unsigned long address) 629 { 630 return vma_hugecache_offset(hstate_vma(vma), vma, address); 631 } 632 EXPORT_SYMBOL_GPL(linear_hugepage_index); 633 634 /* 635 * Return the size of the pages allocated when backing a VMA. In the majority 636 * cases this will be same size as used by the page table entries. 637 */ 638 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 639 { 640 struct hstate *hstate; 641 642 if (!is_vm_hugetlb_page(vma)) 643 return PAGE_SIZE; 644 645 hstate = hstate_vma(vma); 646 647 return 1UL << huge_page_shift(hstate); 648 } 649 EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 650 651 /* 652 * Return the page size being used by the MMU to back a VMA. In the majority 653 * of cases, the page size used by the kernel matches the MMU size. On 654 * architectures where it differs, an architecture-specific version of this 655 * function is required. 656 */ 657 #ifndef vma_mmu_pagesize 658 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 659 { 660 return vma_kernel_pagesize(vma); 661 } 662 #endif 663 664 /* 665 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 666 * bits of the reservation map pointer, which are always clear due to 667 * alignment. 668 */ 669 #define HPAGE_RESV_OWNER (1UL << 0) 670 #define HPAGE_RESV_UNMAPPED (1UL << 1) 671 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) 672 673 /* 674 * These helpers are used to track how many pages are reserved for 675 * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 676 * is guaranteed to have their future faults succeed. 677 * 678 * With the exception of reset_vma_resv_huge_pages() which is called at fork(), 679 * the reserve counters are updated with the hugetlb_lock held. It is safe 680 * to reset the VMA at fork() time as it is not in use yet and there is no 681 * chance of the global counters getting corrupted as a result of the values. 682 * 683 * The private mapping reservation is represented in a subtly different 684 * manner to a shared mapping. A shared mapping has a region map associated 685 * with the underlying file, this region map represents the backing file 686 * pages which have ever had a reservation assigned which this persists even 687 * after the page is instantiated. A private mapping has a region map 688 * associated with the original mmap which is attached to all VMAs which 689 * reference it, this region map represents those offsets which have consumed 690 * reservation ie. where pages have been instantiated. 691 */ 692 static unsigned long get_vma_private_data(struct vm_area_struct *vma) 693 { 694 return (unsigned long)vma->vm_private_data; 695 } 696 697 static void set_vma_private_data(struct vm_area_struct *vma, 698 unsigned long value) 699 { 700 vma->vm_private_data = (void *)value; 701 } 702 703 struct resv_map *resv_map_alloc(void) 704 { 705 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 706 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); 707 708 if (!resv_map || !rg) { 709 kfree(resv_map); 710 kfree(rg); 711 return NULL; 712 } 713 714 kref_init(&resv_map->refs); 715 spin_lock_init(&resv_map->lock); 716 INIT_LIST_HEAD(&resv_map->regions); 717 718 resv_map->adds_in_progress = 0; 719 720 INIT_LIST_HEAD(&resv_map->region_cache); 721 list_add(&rg->link, &resv_map->region_cache); 722 resv_map->region_cache_count = 1; 723 724 return resv_map; 725 } 726 727 void resv_map_release(struct kref *ref) 728 { 729 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 730 struct list_head *head = &resv_map->region_cache; 731 struct file_region *rg, *trg; 732 733 /* Clear out any active regions before we release the map. */ 734 region_del(resv_map, 0, LONG_MAX); 735 736 /* ... and any entries left in the cache */ 737 list_for_each_entry_safe(rg, trg, head, link) { 738 list_del(&rg->link); 739 kfree(rg); 740 } 741 742 VM_BUG_ON(resv_map->adds_in_progress); 743 744 kfree(resv_map); 745 } 746 747 static inline struct resv_map *inode_resv_map(struct inode *inode) 748 { 749 return inode->i_mapping->private_data; 750 } 751 752 static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 753 { 754 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 755 if (vma->vm_flags & VM_MAYSHARE) { 756 struct address_space *mapping = vma->vm_file->f_mapping; 757 struct inode *inode = mapping->host; 758 759 return inode_resv_map(inode); 760 761 } else { 762 return (struct resv_map *)(get_vma_private_data(vma) & 763 ~HPAGE_RESV_MASK); 764 } 765 } 766 767 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 768 { 769 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 770 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 771 772 set_vma_private_data(vma, (get_vma_private_data(vma) & 773 HPAGE_RESV_MASK) | (unsigned long)map); 774 } 775 776 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 777 { 778 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 779 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 780 781 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 782 } 783 784 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 785 { 786 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 787 788 return (get_vma_private_data(vma) & flag) != 0; 789 } 790 791 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 792 void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 793 { 794 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 795 if (!(vma->vm_flags & VM_MAYSHARE)) 796 vma->vm_private_data = (void *)0; 797 } 798 799 /* Returns true if the VMA has associated reserve pages */ 800 static bool vma_has_reserves(struct vm_area_struct *vma, long chg) 801 { 802 if (vma->vm_flags & VM_NORESERVE) { 803 /* 804 * This address is already reserved by other process(chg == 0), 805 * so, we should decrement reserved count. Without decrementing, 806 * reserve count remains after releasing inode, because this 807 * allocated page will go into page cache and is regarded as 808 * coming from reserved pool in releasing step. Currently, we 809 * don't have any other solution to deal with this situation 810 * properly, so add work-around here. 811 */ 812 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 813 return true; 814 else 815 return false; 816 } 817 818 /* Shared mappings always use reserves */ 819 if (vma->vm_flags & VM_MAYSHARE) { 820 /* 821 * We know VM_NORESERVE is not set. Therefore, there SHOULD 822 * be a region map for all pages. The only situation where 823 * there is no region map is if a hole was punched via 824 * fallocate. In this case, there really are no reverves to 825 * use. This situation is indicated if chg != 0. 826 */ 827 if (chg) 828 return false; 829 else 830 return true; 831 } 832 833 /* 834 * Only the process that called mmap() has reserves for 835 * private mappings. 836 */ 837 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 838 /* 839 * Like the shared case above, a hole punch or truncate 840 * could have been performed on the private mapping. 841 * Examine the value of chg to determine if reserves 842 * actually exist or were previously consumed. 843 * Very Subtle - The value of chg comes from a previous 844 * call to vma_needs_reserves(). The reserve map for 845 * private mappings has different (opposite) semantics 846 * than that of shared mappings. vma_needs_reserves() 847 * has already taken this difference in semantics into 848 * account. Therefore, the meaning of chg is the same 849 * as in the shared case above. Code could easily be 850 * combined, but keeping it separate draws attention to 851 * subtle differences. 852 */ 853 if (chg) 854 return false; 855 else 856 return true; 857 } 858 859 return false; 860 } 861 862 static void enqueue_huge_page(struct hstate *h, struct page *page) 863 { 864 int nid = page_to_nid(page); 865 list_move(&page->lru, &h->hugepage_freelists[nid]); 866 h->free_huge_pages++; 867 h->free_huge_pages_node[nid]++; 868 } 869 870 static struct page *dequeue_huge_page_node(struct hstate *h, int nid) 871 { 872 struct page *page; 873 874 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) 875 if (!is_migrate_isolate_page(page)) 876 break; 877 /* 878 * if 'non-isolated free hugepage' not found on the list, 879 * the allocation fails. 880 */ 881 if (&h->hugepage_freelists[nid] == &page->lru) 882 return NULL; 883 list_move(&page->lru, &h->hugepage_activelist); 884 set_page_refcounted(page); 885 h->free_huge_pages--; 886 h->free_huge_pages_node[nid]--; 887 return page; 888 } 889 890 /* Movability of hugepages depends on migration support. */ 891 static inline gfp_t htlb_alloc_mask(struct hstate *h) 892 { 893 if (hugepages_treat_as_movable || hugepage_migration_supported(h)) 894 return GFP_HIGHUSER_MOVABLE; 895 else 896 return GFP_HIGHUSER; 897 } 898 899 static struct page *dequeue_huge_page_vma(struct hstate *h, 900 struct vm_area_struct *vma, 901 unsigned long address, int avoid_reserve, 902 long chg) 903 { 904 struct page *page = NULL; 905 struct mempolicy *mpol; 906 nodemask_t *nodemask; 907 struct zonelist *zonelist; 908 struct zone *zone; 909 struct zoneref *z; 910 unsigned int cpuset_mems_cookie; 911 912 /* 913 * A child process with MAP_PRIVATE mappings created by their parent 914 * have no page reserves. This check ensures that reservations are 915 * not "stolen". The child may still get SIGKILLed 916 */ 917 if (!vma_has_reserves(vma, chg) && 918 h->free_huge_pages - h->resv_huge_pages == 0) 919 goto err; 920 921 /* If reserves cannot be used, ensure enough pages are in the pool */ 922 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 923 goto err; 924 925 retry_cpuset: 926 cpuset_mems_cookie = read_mems_allowed_begin(); 927 zonelist = huge_zonelist(vma, address, 928 htlb_alloc_mask(h), &mpol, &nodemask); 929 930 for_each_zone_zonelist_nodemask(zone, z, zonelist, 931 MAX_NR_ZONES - 1, nodemask) { 932 if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { 933 page = dequeue_huge_page_node(h, zone_to_nid(zone)); 934 if (page) { 935 if (avoid_reserve) 936 break; 937 if (!vma_has_reserves(vma, chg)) 938 break; 939 940 SetPagePrivate(page); 941 h->resv_huge_pages--; 942 break; 943 } 944 } 945 } 946 947 mpol_cond_put(mpol); 948 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 949 goto retry_cpuset; 950 return page; 951 952 err: 953 return NULL; 954 } 955 956 /* 957 * common helper functions for hstate_next_node_to_{alloc|free}. 958 * We may have allocated or freed a huge page based on a different 959 * nodes_allowed previously, so h->next_node_to_{alloc|free} might 960 * be outside of *nodes_allowed. Ensure that we use an allowed 961 * node for alloc or free. 962 */ 963 static int next_node_allowed(int nid, nodemask_t *nodes_allowed) 964 { 965 nid = next_node_in(nid, *nodes_allowed); 966 VM_BUG_ON(nid >= MAX_NUMNODES); 967 968 return nid; 969 } 970 971 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 972 { 973 if (!node_isset(nid, *nodes_allowed)) 974 nid = next_node_allowed(nid, nodes_allowed); 975 return nid; 976 } 977 978 /* 979 * returns the previously saved node ["this node"] from which to 980 * allocate a persistent huge page for the pool and advance the 981 * next node from which to allocate, handling wrap at end of node 982 * mask. 983 */ 984 static int hstate_next_node_to_alloc(struct hstate *h, 985 nodemask_t *nodes_allowed) 986 { 987 int nid; 988 989 VM_BUG_ON(!nodes_allowed); 990 991 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 992 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 993 994 return nid; 995 } 996 997 /* 998 * helper for free_pool_huge_page() - return the previously saved 999 * node ["this node"] from which to free a huge page. Advance the 1000 * next node id whether or not we find a free huge page to free so 1001 * that the next attempt to free addresses the next node. 1002 */ 1003 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 1004 { 1005 int nid; 1006 1007 VM_BUG_ON(!nodes_allowed); 1008 1009 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 1010 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 1011 1012 return nid; 1013 } 1014 1015 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 1016 for (nr_nodes = nodes_weight(*mask); \ 1017 nr_nodes > 0 && \ 1018 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 1019 nr_nodes--) 1020 1021 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 1022 for (nr_nodes = nodes_weight(*mask); \ 1023 nr_nodes > 0 && \ 1024 ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 1025 nr_nodes--) 1026 1027 #if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ 1028 ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ 1029 defined(CONFIG_CMA)) 1030 static void destroy_compound_gigantic_page(struct page *page, 1031 unsigned int order) 1032 { 1033 int i; 1034 int nr_pages = 1 << order; 1035 struct page *p = page + 1; 1036 1037 atomic_set(compound_mapcount_ptr(page), 0); 1038 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1039 clear_compound_head(p); 1040 set_page_refcounted(p); 1041 } 1042 1043 set_compound_order(page, 0); 1044 __ClearPageHead(page); 1045 } 1046 1047 static void free_gigantic_page(struct page *page, unsigned int order) 1048 { 1049 free_contig_range(page_to_pfn(page), 1 << order); 1050 } 1051 1052 static int __alloc_gigantic_page(unsigned long start_pfn, 1053 unsigned long nr_pages) 1054 { 1055 unsigned long end_pfn = start_pfn + nr_pages; 1056 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 1057 GFP_KERNEL); 1058 } 1059 1060 static bool pfn_range_valid_gigantic(struct zone *z, 1061 unsigned long start_pfn, unsigned long nr_pages) 1062 { 1063 unsigned long i, end_pfn = start_pfn + nr_pages; 1064 struct page *page; 1065 1066 for (i = start_pfn; i < end_pfn; i++) { 1067 if (!pfn_valid(i)) 1068 return false; 1069 1070 page = pfn_to_page(i); 1071 1072 if (page_zone(page) != z) 1073 return false; 1074 1075 if (PageReserved(page)) 1076 return false; 1077 1078 if (page_count(page) > 0) 1079 return false; 1080 1081 if (PageHuge(page)) 1082 return false; 1083 } 1084 1085 return true; 1086 } 1087 1088 static bool zone_spans_last_pfn(const struct zone *zone, 1089 unsigned long start_pfn, unsigned long nr_pages) 1090 { 1091 unsigned long last_pfn = start_pfn + nr_pages - 1; 1092 return zone_spans_pfn(zone, last_pfn); 1093 } 1094 1095 static struct page *alloc_gigantic_page(int nid, unsigned int order) 1096 { 1097 unsigned long nr_pages = 1 << order; 1098 unsigned long ret, pfn, flags; 1099 struct zone *z; 1100 1101 z = NODE_DATA(nid)->node_zones; 1102 for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { 1103 spin_lock_irqsave(&z->lock, flags); 1104 1105 pfn = ALIGN(z->zone_start_pfn, nr_pages); 1106 while (zone_spans_last_pfn(z, pfn, nr_pages)) { 1107 if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { 1108 /* 1109 * We release the zone lock here because 1110 * alloc_contig_range() will also lock the zone 1111 * at some point. If there's an allocation 1112 * spinning on this lock, it may win the race 1113 * and cause alloc_contig_range() to fail... 1114 */ 1115 spin_unlock_irqrestore(&z->lock, flags); 1116 ret = __alloc_gigantic_page(pfn, nr_pages); 1117 if (!ret) 1118 return pfn_to_page(pfn); 1119 spin_lock_irqsave(&z->lock, flags); 1120 } 1121 pfn += nr_pages; 1122 } 1123 1124 spin_unlock_irqrestore(&z->lock, flags); 1125 } 1126 1127 return NULL; 1128 } 1129 1130 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); 1131 static void prep_compound_gigantic_page(struct page *page, unsigned int order); 1132 1133 static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) 1134 { 1135 struct page *page; 1136 1137 page = alloc_gigantic_page(nid, huge_page_order(h)); 1138 if (page) { 1139 prep_compound_gigantic_page(page, huge_page_order(h)); 1140 prep_new_huge_page(h, page, nid); 1141 } 1142 1143 return page; 1144 } 1145 1146 static int alloc_fresh_gigantic_page(struct hstate *h, 1147 nodemask_t *nodes_allowed) 1148 { 1149 struct page *page = NULL; 1150 int nr_nodes, node; 1151 1152 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1153 page = alloc_fresh_gigantic_page_node(h, node); 1154 if (page) 1155 return 1; 1156 } 1157 1158 return 0; 1159 } 1160 1161 static inline bool gigantic_page_supported(void) { return true; } 1162 #else 1163 static inline bool gigantic_page_supported(void) { return false; } 1164 static inline void free_gigantic_page(struct page *page, unsigned int order) { } 1165 static inline void destroy_compound_gigantic_page(struct page *page, 1166 unsigned int order) { } 1167 static inline int alloc_fresh_gigantic_page(struct hstate *h, 1168 nodemask_t *nodes_allowed) { return 0; } 1169 #endif 1170 1171 static void update_and_free_page(struct hstate *h, struct page *page) 1172 { 1173 int i; 1174 1175 if (hstate_is_gigantic(h) && !gigantic_page_supported()) 1176 return; 1177 1178 h->nr_huge_pages--; 1179 h->nr_huge_pages_node[page_to_nid(page)]--; 1180 for (i = 0; i < pages_per_huge_page(h); i++) { 1181 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1182 1 << PG_referenced | 1 << PG_dirty | 1183 1 << PG_active | 1 << PG_private | 1184 1 << PG_writeback); 1185 } 1186 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); 1187 set_compound_page_dtor(page, NULL_COMPOUND_DTOR); 1188 set_page_refcounted(page); 1189 if (hstate_is_gigantic(h)) { 1190 destroy_compound_gigantic_page(page, huge_page_order(h)); 1191 free_gigantic_page(page, huge_page_order(h)); 1192 } else { 1193 __free_pages(page, huge_page_order(h)); 1194 } 1195 } 1196 1197 struct hstate *size_to_hstate(unsigned long size) 1198 { 1199 struct hstate *h; 1200 1201 for_each_hstate(h) { 1202 if (huge_page_size(h) == size) 1203 return h; 1204 } 1205 return NULL; 1206 } 1207 1208 /* 1209 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked 1210 * to hstate->hugepage_activelist.) 1211 * 1212 * This function can be called for tail pages, but never returns true for them. 1213 */ 1214 bool page_huge_active(struct page *page) 1215 { 1216 VM_BUG_ON_PAGE(!PageHuge(page), page); 1217 return PageHead(page) && PagePrivate(&page[1]); 1218 } 1219 1220 /* never called for tail page */ 1221 static void set_page_huge_active(struct page *page) 1222 { 1223 VM_BUG_ON_PAGE(!PageHeadHuge(page), page); 1224 SetPagePrivate(&page[1]); 1225 } 1226 1227 static void clear_page_huge_active(struct page *page) 1228 { 1229 VM_BUG_ON_PAGE(!PageHeadHuge(page), page); 1230 ClearPagePrivate(&page[1]); 1231 } 1232 1233 void free_huge_page(struct page *page) 1234 { 1235 /* 1236 * Can't pass hstate in here because it is called from the 1237 * compound page destructor. 1238 */ 1239 struct hstate *h = page_hstate(page); 1240 int nid = page_to_nid(page); 1241 struct hugepage_subpool *spool = 1242 (struct hugepage_subpool *)page_private(page); 1243 bool restore_reserve; 1244 1245 set_page_private(page, 0); 1246 page->mapping = NULL; 1247 VM_BUG_ON_PAGE(page_count(page), page); 1248 VM_BUG_ON_PAGE(page_mapcount(page), page); 1249 restore_reserve = PagePrivate(page); 1250 ClearPagePrivate(page); 1251 1252 /* 1253 * A return code of zero implies that the subpool will be under its 1254 * minimum size if the reservation is not restored after page is free. 1255 * Therefore, force restore_reserve operation. 1256 */ 1257 if (hugepage_subpool_put_pages(spool, 1) == 0) 1258 restore_reserve = true; 1259 1260 spin_lock(&hugetlb_lock); 1261 clear_page_huge_active(page); 1262 hugetlb_cgroup_uncharge_page(hstate_index(h), 1263 pages_per_huge_page(h), page); 1264 if (restore_reserve) 1265 h->resv_huge_pages++; 1266 1267 if (h->surplus_huge_pages_node[nid]) { 1268 /* remove the page from active list */ 1269 list_del(&page->lru); 1270 update_and_free_page(h, page); 1271 h->surplus_huge_pages--; 1272 h->surplus_huge_pages_node[nid]--; 1273 } else { 1274 arch_clear_hugepage_flags(page); 1275 enqueue_huge_page(h, page); 1276 } 1277 spin_unlock(&hugetlb_lock); 1278 } 1279 1280 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 1281 { 1282 INIT_LIST_HEAD(&page->lru); 1283 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); 1284 spin_lock(&hugetlb_lock); 1285 set_hugetlb_cgroup(page, NULL); 1286 h->nr_huge_pages++; 1287 h->nr_huge_pages_node[nid]++; 1288 spin_unlock(&hugetlb_lock); 1289 put_page(page); /* free it into the hugepage allocator */ 1290 } 1291 1292 static void prep_compound_gigantic_page(struct page *page, unsigned int order) 1293 { 1294 int i; 1295 int nr_pages = 1 << order; 1296 struct page *p = page + 1; 1297 1298 /* we rely on prep_new_huge_page to set the destructor */ 1299 set_compound_order(page, order); 1300 __ClearPageReserved(page); 1301 __SetPageHead(page); 1302 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1303 /* 1304 * For gigantic hugepages allocated through bootmem at 1305 * boot, it's safer to be consistent with the not-gigantic 1306 * hugepages and clear the PG_reserved bit from all tail pages 1307 * too. Otherwse drivers using get_user_pages() to access tail 1308 * pages may get the reference counting wrong if they see 1309 * PG_reserved set on a tail page (despite the head page not 1310 * having PG_reserved set). Enforcing this consistency between 1311 * head and tail pages allows drivers to optimize away a check 1312 * on the head page when they need know if put_page() is needed 1313 * after get_user_pages(). 1314 */ 1315 __ClearPageReserved(p); 1316 set_page_count(p, 0); 1317 set_compound_head(p, page); 1318 } 1319 atomic_set(compound_mapcount_ptr(page), -1); 1320 } 1321 1322 /* 1323 * PageHuge() only returns true for hugetlbfs pages, but not for normal or 1324 * transparent huge pages. See the PageTransHuge() documentation for more 1325 * details. 1326 */ 1327 int PageHuge(struct page *page) 1328 { 1329 if (!PageCompound(page)) 1330 return 0; 1331 1332 page = compound_head(page); 1333 return page[1].compound_dtor == HUGETLB_PAGE_DTOR; 1334 } 1335 EXPORT_SYMBOL_GPL(PageHuge); 1336 1337 /* 1338 * PageHeadHuge() only returns true for hugetlbfs head page, but not for 1339 * normal or transparent huge pages. 1340 */ 1341 int PageHeadHuge(struct page *page_head) 1342 { 1343 if (!PageHead(page_head)) 1344 return 0; 1345 1346 return get_compound_page_dtor(page_head) == free_huge_page; 1347 } 1348 1349 pgoff_t __basepage_index(struct page *page) 1350 { 1351 struct page *page_head = compound_head(page); 1352 pgoff_t index = page_index(page_head); 1353 unsigned long compound_idx; 1354 1355 if (!PageHuge(page_head)) 1356 return page_index(page); 1357 1358 if (compound_order(page_head) >= MAX_ORDER) 1359 compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 1360 else 1361 compound_idx = page - page_head; 1362 1363 return (index << compound_order(page_head)) + compound_idx; 1364 } 1365 1366 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 1367 { 1368 struct page *page; 1369 1370 page = __alloc_pages_node(nid, 1371 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 1372 __GFP_REPEAT|__GFP_NOWARN, 1373 huge_page_order(h)); 1374 if (page) { 1375 prep_new_huge_page(h, page, nid); 1376 } 1377 1378 return page; 1379 } 1380 1381 static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 1382 { 1383 struct page *page; 1384 int nr_nodes, node; 1385 int ret = 0; 1386 1387 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1388 page = alloc_fresh_huge_page_node(h, node); 1389 if (page) { 1390 ret = 1; 1391 break; 1392 } 1393 } 1394 1395 if (ret) 1396 count_vm_event(HTLB_BUDDY_PGALLOC); 1397 else 1398 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1399 1400 return ret; 1401 } 1402 1403 /* 1404 * Free huge page from pool from next node to free. 1405 * Attempt to keep persistent huge pages more or less 1406 * balanced over allowed nodes. 1407 * Called with hugetlb_lock locked. 1408 */ 1409 static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 1410 bool acct_surplus) 1411 { 1412 int nr_nodes, node; 1413 int ret = 0; 1414 1415 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 1416 /* 1417 * If we're returning unused surplus pages, only examine 1418 * nodes with surplus pages. 1419 */ 1420 if ((!acct_surplus || h->surplus_huge_pages_node[node]) && 1421 !list_empty(&h->hugepage_freelists[node])) { 1422 struct page *page = 1423 list_entry(h->hugepage_freelists[node].next, 1424 struct page, lru); 1425 list_del(&page->lru); 1426 h->free_huge_pages--; 1427 h->free_huge_pages_node[node]--; 1428 if (acct_surplus) { 1429 h->surplus_huge_pages--; 1430 h->surplus_huge_pages_node[node]--; 1431 } 1432 update_and_free_page(h, page); 1433 ret = 1; 1434 break; 1435 } 1436 } 1437 1438 return ret; 1439 } 1440 1441 /* 1442 * Dissolve a given free hugepage into free buddy pages. This function does 1443 * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the 1444 * number of free hugepages would be reduced below the number of reserved 1445 * hugepages. 1446 */ 1447 static int dissolve_free_huge_page(struct page *page) 1448 { 1449 int rc = 0; 1450 1451 spin_lock(&hugetlb_lock); 1452 if (PageHuge(page) && !page_count(page)) { 1453 struct page *head = compound_head(page); 1454 struct hstate *h = page_hstate(head); 1455 int nid = page_to_nid(head); 1456 if (h->free_huge_pages - h->resv_huge_pages == 0) { 1457 rc = -EBUSY; 1458 goto out; 1459 } 1460 list_del(&head->lru); 1461 h->free_huge_pages--; 1462 h->free_huge_pages_node[nid]--; 1463 h->max_huge_pages--; 1464 update_and_free_page(h, head); 1465 } 1466 out: 1467 spin_unlock(&hugetlb_lock); 1468 return rc; 1469 } 1470 1471 /* 1472 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to 1473 * make specified memory blocks removable from the system. 1474 * Note that this will dissolve a free gigantic hugepage completely, if any 1475 * part of it lies within the given range. 1476 * Also note that if dissolve_free_huge_page() returns with an error, all 1477 * free hugepages that were dissolved before that error are lost. 1478 */ 1479 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 1480 { 1481 unsigned long pfn; 1482 struct page *page; 1483 int rc = 0; 1484 1485 if (!hugepages_supported()) 1486 return rc; 1487 1488 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { 1489 page = pfn_to_page(pfn); 1490 if (PageHuge(page) && !page_count(page)) { 1491 rc = dissolve_free_huge_page(page); 1492 if (rc) 1493 break; 1494 } 1495 } 1496 1497 return rc; 1498 } 1499 1500 /* 1501 * There are 3 ways this can get called: 1502 * 1. With vma+addr: we use the VMA's memory policy 1503 * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge 1504 * page from any node, and let the buddy allocator itself figure 1505 * it out. 1506 * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page 1507 * strictly from 'nid' 1508 */ 1509 static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, 1510 struct vm_area_struct *vma, unsigned long addr, int nid) 1511 { 1512 int order = huge_page_order(h); 1513 gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; 1514 unsigned int cpuset_mems_cookie; 1515 1516 /* 1517 * We need a VMA to get a memory policy. If we do not 1518 * have one, we use the 'nid' argument. 1519 * 1520 * The mempolicy stuff below has some non-inlined bits 1521 * and calls ->vm_ops. That makes it hard to optimize at 1522 * compile-time, even when NUMA is off and it does 1523 * nothing. This helps the compiler optimize it out. 1524 */ 1525 if (!IS_ENABLED(CONFIG_NUMA) || !vma) { 1526 /* 1527 * If a specific node is requested, make sure to 1528 * get memory from there, but only when a node 1529 * is explicitly specified. 1530 */ 1531 if (nid != NUMA_NO_NODE) 1532 gfp |= __GFP_THISNODE; 1533 /* 1534 * Make sure to call something that can handle 1535 * nid=NUMA_NO_NODE 1536 */ 1537 return alloc_pages_node(nid, gfp, order); 1538 } 1539 1540 /* 1541 * OK, so we have a VMA. Fetch the mempolicy and try to 1542 * allocate a huge page with it. We will only reach this 1543 * when CONFIG_NUMA=y. 1544 */ 1545 do { 1546 struct page *page; 1547 struct mempolicy *mpol; 1548 struct zonelist *zl; 1549 nodemask_t *nodemask; 1550 1551 cpuset_mems_cookie = read_mems_allowed_begin(); 1552 zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); 1553 mpol_cond_put(mpol); 1554 page = __alloc_pages_nodemask(gfp, order, zl, nodemask); 1555 if (page) 1556 return page; 1557 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 1558 1559 return NULL; 1560 } 1561 1562 /* 1563 * There are two ways to allocate a huge page: 1564 * 1. When you have a VMA and an address (like a fault) 1565 * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) 1566 * 1567 * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in 1568 * this case which signifies that the allocation should be done with 1569 * respect for the VMA's memory policy. 1570 * 1571 * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This 1572 * implies that memory policies will not be taken in to account. 1573 */ 1574 static struct page *__alloc_buddy_huge_page(struct hstate *h, 1575 struct vm_area_struct *vma, unsigned long addr, int nid) 1576 { 1577 struct page *page; 1578 unsigned int r_nid; 1579 1580 if (hstate_is_gigantic(h)) 1581 return NULL; 1582 1583 /* 1584 * Make sure that anyone specifying 'nid' is not also specifying a VMA. 1585 * This makes sure the caller is picking _one_ of the modes with which 1586 * we can call this function, not both. 1587 */ 1588 if (vma || (addr != -1)) { 1589 VM_WARN_ON_ONCE(addr == -1); 1590 VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); 1591 } 1592 /* 1593 * Assume we will successfully allocate the surplus page to 1594 * prevent racing processes from causing the surplus to exceed 1595 * overcommit 1596 * 1597 * This however introduces a different race, where a process B 1598 * tries to grow the static hugepage pool while alloc_pages() is 1599 * called by process A. B will only examine the per-node 1600 * counters in determining if surplus huge pages can be 1601 * converted to normal huge pages in adjust_pool_surplus(). A 1602 * won't be able to increment the per-node counter, until the 1603 * lock is dropped by B, but B doesn't drop hugetlb_lock until 1604 * no more huge pages can be converted from surplus to normal 1605 * state (and doesn't try to convert again). Thus, we have a 1606 * case where a surplus huge page exists, the pool is grown, and 1607 * the surplus huge page still exists after, even though it 1608 * should just have been converted to a normal huge page. This 1609 * does not leak memory, though, as the hugepage will be freed 1610 * once it is out of use. It also does not allow the counters to 1611 * go out of whack in adjust_pool_surplus() as we don't modify 1612 * the node values until we've gotten the hugepage and only the 1613 * per-node value is checked there. 1614 */ 1615 spin_lock(&hugetlb_lock); 1616 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 1617 spin_unlock(&hugetlb_lock); 1618 return NULL; 1619 } else { 1620 h->nr_huge_pages++; 1621 h->surplus_huge_pages++; 1622 } 1623 spin_unlock(&hugetlb_lock); 1624 1625 page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); 1626 1627 spin_lock(&hugetlb_lock); 1628 if (page) { 1629 INIT_LIST_HEAD(&page->lru); 1630 r_nid = page_to_nid(page); 1631 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); 1632 set_hugetlb_cgroup(page, NULL); 1633 /* 1634 * We incremented the global counters already 1635 */ 1636 h->nr_huge_pages_node[r_nid]++; 1637 h->surplus_huge_pages_node[r_nid]++; 1638 __count_vm_event(HTLB_BUDDY_PGALLOC); 1639 } else { 1640 h->nr_huge_pages--; 1641 h->surplus_huge_pages--; 1642 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1643 } 1644 spin_unlock(&hugetlb_lock); 1645 1646 return page; 1647 } 1648 1649 /* 1650 * Allocate a huge page from 'nid'. Note, 'nid' may be 1651 * NUMA_NO_NODE, which means that it may be allocated 1652 * anywhere. 1653 */ 1654 static 1655 struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) 1656 { 1657 unsigned long addr = -1; 1658 1659 return __alloc_buddy_huge_page(h, NULL, addr, nid); 1660 } 1661 1662 /* 1663 * Use the VMA's mpolicy to allocate a huge page from the buddy. 1664 */ 1665 static 1666 struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, 1667 struct vm_area_struct *vma, unsigned long addr) 1668 { 1669 return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); 1670 } 1671 1672 /* 1673 * This allocation function is useful in the context where vma is irrelevant. 1674 * E.g. soft-offlining uses this function because it only cares physical 1675 * address of error page. 1676 */ 1677 struct page *alloc_huge_page_node(struct hstate *h, int nid) 1678 { 1679 struct page *page = NULL; 1680 1681 spin_lock(&hugetlb_lock); 1682 if (h->free_huge_pages - h->resv_huge_pages > 0) 1683 page = dequeue_huge_page_node(h, nid); 1684 spin_unlock(&hugetlb_lock); 1685 1686 if (!page) 1687 page = __alloc_buddy_huge_page_no_mpol(h, nid); 1688 1689 return page; 1690 } 1691 1692 /* 1693 * Increase the hugetlb pool such that it can accommodate a reservation 1694 * of size 'delta'. 1695 */ 1696 static int gather_surplus_pages(struct hstate *h, int delta) 1697 { 1698 struct list_head surplus_list; 1699 struct page *page, *tmp; 1700 int ret, i; 1701 int needed, allocated; 1702 bool alloc_ok = true; 1703 1704 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 1705 if (needed <= 0) { 1706 h->resv_huge_pages += delta; 1707 return 0; 1708 } 1709 1710 allocated = 0; 1711 INIT_LIST_HEAD(&surplus_list); 1712 1713 ret = -ENOMEM; 1714 retry: 1715 spin_unlock(&hugetlb_lock); 1716 for (i = 0; i < needed; i++) { 1717 page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); 1718 if (!page) { 1719 alloc_ok = false; 1720 break; 1721 } 1722 list_add(&page->lru, &surplus_list); 1723 } 1724 allocated += i; 1725 1726 /* 1727 * After retaking hugetlb_lock, we need to recalculate 'needed' 1728 * because either resv_huge_pages or free_huge_pages may have changed. 1729 */ 1730 spin_lock(&hugetlb_lock); 1731 needed = (h->resv_huge_pages + delta) - 1732 (h->free_huge_pages + allocated); 1733 if (needed > 0) { 1734 if (alloc_ok) 1735 goto retry; 1736 /* 1737 * We were not able to allocate enough pages to 1738 * satisfy the entire reservation so we free what 1739 * we've allocated so far. 1740 */ 1741 goto free; 1742 } 1743 /* 1744 * The surplus_list now contains _at_least_ the number of extra pages 1745 * needed to accommodate the reservation. Add the appropriate number 1746 * of pages to the hugetlb pool and free the extras back to the buddy 1747 * allocator. Commit the entire reservation here to prevent another 1748 * process from stealing the pages as they are added to the pool but 1749 * before they are reserved. 1750 */ 1751 needed += allocated; 1752 h->resv_huge_pages += delta; 1753 ret = 0; 1754 1755 /* Free the needed pages to the hugetlb pool */ 1756 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1757 if ((--needed) < 0) 1758 break; 1759 /* 1760 * This page is now managed by the hugetlb allocator and has 1761 * no users -- drop the buddy allocator's reference. 1762 */ 1763 put_page_testzero(page); 1764 VM_BUG_ON_PAGE(page_count(page), page); 1765 enqueue_huge_page(h, page); 1766 } 1767 free: 1768 spin_unlock(&hugetlb_lock); 1769 1770 /* Free unnecessary surplus pages to the buddy allocator */ 1771 list_for_each_entry_safe(page, tmp, &surplus_list, lru) 1772 put_page(page); 1773 spin_lock(&hugetlb_lock); 1774 1775 return ret; 1776 } 1777 1778 /* 1779 * This routine has two main purposes: 1780 * 1) Decrement the reservation count (resv_huge_pages) by the value passed 1781 * in unused_resv_pages. This corresponds to the prior adjustments made 1782 * to the associated reservation map. 1783 * 2) Free any unused surplus pages that may have been allocated to satisfy 1784 * the reservation. As many as unused_resv_pages may be freed. 1785 * 1786 * Called with hugetlb_lock held. However, the lock could be dropped (and 1787 * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, 1788 * we must make sure nobody else can claim pages we are in the process of 1789 * freeing. Do this by ensuring resv_huge_page always is greater than the 1790 * number of huge pages we plan to free when dropping the lock. 1791 */ 1792 static void return_unused_surplus_pages(struct hstate *h, 1793 unsigned long unused_resv_pages) 1794 { 1795 unsigned long nr_pages; 1796 1797 /* Cannot return gigantic pages currently */ 1798 if (hstate_is_gigantic(h)) 1799 goto out; 1800 1801 /* 1802 * Part (or even all) of the reservation could have been backed 1803 * by pre-allocated pages. Only free surplus pages. 1804 */ 1805 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 1806 1807 /* 1808 * We want to release as many surplus pages as possible, spread 1809 * evenly across all nodes with memory. Iterate across these nodes 1810 * until we can no longer free unreserved surplus pages. This occurs 1811 * when the nodes with surplus pages have no free pages. 1812 * free_pool_huge_page() will balance the the freed pages across the 1813 * on-line nodes with memory and will handle the hstate accounting. 1814 * 1815 * Note that we decrement resv_huge_pages as we free the pages. If 1816 * we drop the lock, resv_huge_pages will still be sufficiently large 1817 * to cover subsequent pages we may free. 1818 */ 1819 while (nr_pages--) { 1820 h->resv_huge_pages--; 1821 unused_resv_pages--; 1822 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) 1823 goto out; 1824 cond_resched_lock(&hugetlb_lock); 1825 } 1826 1827 out: 1828 /* Fully uncommit the reservation */ 1829 h->resv_huge_pages -= unused_resv_pages; 1830 } 1831 1832 1833 /* 1834 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation 1835 * are used by the huge page allocation routines to manage reservations. 1836 * 1837 * vma_needs_reservation is called to determine if the huge page at addr 1838 * within the vma has an associated reservation. If a reservation is 1839 * needed, the value 1 is returned. The caller is then responsible for 1840 * managing the global reservation and subpool usage counts. After 1841 * the huge page has been allocated, vma_commit_reservation is called 1842 * to add the page to the reservation map. If the page allocation fails, 1843 * the reservation must be ended instead of committed. vma_end_reservation 1844 * is called in such cases. 1845 * 1846 * In the normal case, vma_commit_reservation returns the same value 1847 * as the preceding vma_needs_reservation call. The only time this 1848 * is not the case is if a reserve map was changed between calls. It 1849 * is the responsibility of the caller to notice the difference and 1850 * take appropriate action. 1851 * 1852 * vma_add_reservation is used in error paths where a reservation must 1853 * be restored when a newly allocated huge page must be freed. It is 1854 * to be called after calling vma_needs_reservation to determine if a 1855 * reservation exists. 1856 */ 1857 enum vma_resv_mode { 1858 VMA_NEEDS_RESV, 1859 VMA_COMMIT_RESV, 1860 VMA_END_RESV, 1861 VMA_ADD_RESV, 1862 }; 1863 static long __vma_reservation_common(struct hstate *h, 1864 struct vm_area_struct *vma, unsigned long addr, 1865 enum vma_resv_mode mode) 1866 { 1867 struct resv_map *resv; 1868 pgoff_t idx; 1869 long ret; 1870 1871 resv = vma_resv_map(vma); 1872 if (!resv) 1873 return 1; 1874 1875 idx = vma_hugecache_offset(h, vma, addr); 1876 switch (mode) { 1877 case VMA_NEEDS_RESV: 1878 ret = region_chg(resv, idx, idx + 1); 1879 break; 1880 case VMA_COMMIT_RESV: 1881 ret = region_add(resv, idx, idx + 1); 1882 break; 1883 case VMA_END_RESV: 1884 region_abort(resv, idx, idx + 1); 1885 ret = 0; 1886 break; 1887 case VMA_ADD_RESV: 1888 if (vma->vm_flags & VM_MAYSHARE) 1889 ret = region_add(resv, idx, idx + 1); 1890 else { 1891 region_abort(resv, idx, idx + 1); 1892 ret = region_del(resv, idx, idx + 1); 1893 } 1894 break; 1895 default: 1896 BUG(); 1897 } 1898 1899 if (vma->vm_flags & VM_MAYSHARE) 1900 return ret; 1901 else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { 1902 /* 1903 * In most cases, reserves always exist for private mappings. 1904 * However, a file associated with mapping could have been 1905 * hole punched or truncated after reserves were consumed. 1906 * As subsequent fault on such a range will not use reserves. 1907 * Subtle - The reserve map for private mappings has the 1908 * opposite meaning than that of shared mappings. If NO 1909 * entry is in the reserve map, it means a reservation exists. 1910 * If an entry exists in the reserve map, it means the 1911 * reservation has already been consumed. As a result, the 1912 * return value of this routine is the opposite of the 1913 * value returned from reserve map manipulation routines above. 1914 */ 1915 if (ret) 1916 return 0; 1917 else 1918 return 1; 1919 } 1920 else 1921 return ret < 0 ? ret : 0; 1922 } 1923 1924 static long vma_needs_reservation(struct hstate *h, 1925 struct vm_area_struct *vma, unsigned long addr) 1926 { 1927 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 1928 } 1929 1930 static long vma_commit_reservation(struct hstate *h, 1931 struct vm_area_struct *vma, unsigned long addr) 1932 { 1933 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 1934 } 1935 1936 static void vma_end_reservation(struct hstate *h, 1937 struct vm_area_struct *vma, unsigned long addr) 1938 { 1939 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 1940 } 1941 1942 static long vma_add_reservation(struct hstate *h, 1943 struct vm_area_struct *vma, unsigned long addr) 1944 { 1945 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 1946 } 1947 1948 /* 1949 * This routine is called to restore a reservation on error paths. In the 1950 * specific error paths, a huge page was allocated (via alloc_huge_page) 1951 * and is about to be freed. If a reservation for the page existed, 1952 * alloc_huge_page would have consumed the reservation and set PagePrivate 1953 * in the newly allocated page. When the page is freed via free_huge_page, 1954 * the global reservation count will be incremented if PagePrivate is set. 1955 * However, free_huge_page can not adjust the reserve map. Adjust the 1956 * reserve map here to be consistent with global reserve count adjustments 1957 * to be made by free_huge_page. 1958 */ 1959 static void restore_reserve_on_error(struct hstate *h, 1960 struct vm_area_struct *vma, unsigned long address, 1961 struct page *page) 1962 { 1963 if (unlikely(PagePrivate(page))) { 1964 long rc = vma_needs_reservation(h, vma, address); 1965 1966 if (unlikely(rc < 0)) { 1967 /* 1968 * Rare out of memory condition in reserve map 1969 * manipulation. Clear PagePrivate so that 1970 * global reserve count will not be incremented 1971 * by free_huge_page. This will make it appear 1972 * as though the reservation for this page was 1973 * consumed. This may prevent the task from 1974 * faulting in the page at a later time. This 1975 * is better than inconsistent global huge page 1976 * accounting of reserve counts. 1977 */ 1978 ClearPagePrivate(page); 1979 } else if (rc) { 1980 rc = vma_add_reservation(h, vma, address); 1981 if (unlikely(rc < 0)) 1982 /* 1983 * See above comment about rare out of 1984 * memory condition. 1985 */ 1986 ClearPagePrivate(page); 1987 } else 1988 vma_end_reservation(h, vma, address); 1989 } 1990 } 1991 1992 struct page *alloc_huge_page(struct vm_area_struct *vma, 1993 unsigned long addr, int avoid_reserve) 1994 { 1995 struct hugepage_subpool *spool = subpool_vma(vma); 1996 struct hstate *h = hstate_vma(vma); 1997 struct page *page; 1998 long map_chg, map_commit; 1999 long gbl_chg; 2000 int ret, idx; 2001 struct hugetlb_cgroup *h_cg; 2002 2003 idx = hstate_index(h); 2004 /* 2005 * Examine the region/reserve map to determine if the process 2006 * has a reservation for the page to be allocated. A return 2007 * code of zero indicates a reservation exists (no change). 2008 */ 2009 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 2010 if (map_chg < 0) 2011 return ERR_PTR(-ENOMEM); 2012 2013 /* 2014 * Processes that did not create the mapping will have no 2015 * reserves as indicated by the region/reserve map. Check 2016 * that the allocation will not exceed the subpool limit. 2017 * Allocations for MAP_NORESERVE mappings also need to be 2018 * checked against any subpool limit. 2019 */ 2020 if (map_chg || avoid_reserve) { 2021 gbl_chg = hugepage_subpool_get_pages(spool, 1); 2022 if (gbl_chg < 0) { 2023 vma_end_reservation(h, vma, addr); 2024 return ERR_PTR(-ENOSPC); 2025 } 2026 2027 /* 2028 * Even though there was no reservation in the region/reserve 2029 * map, there could be reservations associated with the 2030 * subpool that can be used. This would be indicated if the 2031 * return value of hugepage_subpool_get_pages() is zero. 2032 * However, if avoid_reserve is specified we still avoid even 2033 * the subpool reservations. 2034 */ 2035 if (avoid_reserve) 2036 gbl_chg = 1; 2037 } 2038 2039 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 2040 if (ret) 2041 goto out_subpool_put; 2042 2043 spin_lock(&hugetlb_lock); 2044 /* 2045 * glb_chg is passed to indicate whether or not a page must be taken 2046 * from the global free pool (global change). gbl_chg == 0 indicates 2047 * a reservation exists for the allocation. 2048 */ 2049 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); 2050 if (!page) { 2051 spin_unlock(&hugetlb_lock); 2052 page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); 2053 if (!page) 2054 goto out_uncharge_cgroup; 2055 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 2056 SetPagePrivate(page); 2057 h->resv_huge_pages--; 2058 } 2059 spin_lock(&hugetlb_lock); 2060 list_move(&page->lru, &h->hugepage_activelist); 2061 /* Fall through */ 2062 } 2063 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 2064 spin_unlock(&hugetlb_lock); 2065 2066 set_page_private(page, (unsigned long)spool); 2067 2068 map_commit = vma_commit_reservation(h, vma, addr); 2069 if (unlikely(map_chg > map_commit)) { 2070 /* 2071 * The page was added to the reservation map between 2072 * vma_needs_reservation and vma_commit_reservation. 2073 * This indicates a race with hugetlb_reserve_pages. 2074 * Adjust for the subpool count incremented above AND 2075 * in hugetlb_reserve_pages for the same page. Also, 2076 * the reservation count added in hugetlb_reserve_pages 2077 * no longer applies. 2078 */ 2079 long rsv_adjust; 2080 2081 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 2082 hugetlb_acct_memory(h, -rsv_adjust); 2083 } 2084 return page; 2085 2086 out_uncharge_cgroup: 2087 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 2088 out_subpool_put: 2089 if (map_chg || avoid_reserve) 2090 hugepage_subpool_put_pages(spool, 1); 2091 vma_end_reservation(h, vma, addr); 2092 return ERR_PTR(-ENOSPC); 2093 } 2094 2095 /* 2096 * alloc_huge_page()'s wrapper which simply returns the page if allocation 2097 * succeeds, otherwise NULL. This function is called from new_vma_page(), 2098 * where no ERR_VALUE is expected to be returned. 2099 */ 2100 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, 2101 unsigned long addr, int avoid_reserve) 2102 { 2103 struct page *page = alloc_huge_page(vma, addr, avoid_reserve); 2104 if (IS_ERR(page)) 2105 page = NULL; 2106 return page; 2107 } 2108 2109 int __weak alloc_bootmem_huge_page(struct hstate *h) 2110 { 2111 struct huge_bootmem_page *m; 2112 int nr_nodes, node; 2113 2114 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 2115 void *addr; 2116 2117 addr = memblock_virt_alloc_try_nid_nopanic( 2118 huge_page_size(h), huge_page_size(h), 2119 0, BOOTMEM_ALLOC_ACCESSIBLE, node); 2120 if (addr) { 2121 /* 2122 * Use the beginning of the huge page to store the 2123 * huge_bootmem_page struct (until gather_bootmem 2124 * puts them into the mem_map). 2125 */ 2126 m = addr; 2127 goto found; 2128 } 2129 } 2130 return 0; 2131 2132 found: 2133 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); 2134 /* Put them into a private list first because mem_map is not up yet */ 2135 list_add(&m->list, &huge_boot_pages); 2136 m->hstate = h; 2137 return 1; 2138 } 2139 2140 static void __init prep_compound_huge_page(struct page *page, 2141 unsigned int order) 2142 { 2143 if (unlikely(order > (MAX_ORDER - 1))) 2144 prep_compound_gigantic_page(page, order); 2145 else 2146 prep_compound_page(page, order); 2147 } 2148 2149 /* Put bootmem huge pages into the standard lists after mem_map is up */ 2150 static void __init gather_bootmem_prealloc(void) 2151 { 2152 struct huge_bootmem_page *m; 2153 2154 list_for_each_entry(m, &huge_boot_pages, list) { 2155 struct hstate *h = m->hstate; 2156 struct page *page; 2157 2158 #ifdef CONFIG_HIGHMEM 2159 page = pfn_to_page(m->phys >> PAGE_SHIFT); 2160 memblock_free_late(__pa(m), 2161 sizeof(struct huge_bootmem_page)); 2162 #else 2163 page = virt_to_page(m); 2164 #endif 2165 WARN_ON(page_count(page) != 1); 2166 prep_compound_huge_page(page, h->order); 2167 WARN_ON(PageReserved(page)); 2168 prep_new_huge_page(h, page, page_to_nid(page)); 2169 /* 2170 * If we had gigantic hugepages allocated at boot time, we need 2171 * to restore the 'stolen' pages to totalram_pages in order to 2172 * fix confusing memory reports from free(1) and another 2173 * side-effects, like CommitLimit going negative. 2174 */ 2175 if (hstate_is_gigantic(h)) 2176 adjust_managed_page_count(page, 1 << h->order); 2177 } 2178 } 2179 2180 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 2181 { 2182 unsigned long i; 2183 2184 for (i = 0; i < h->max_huge_pages; ++i) { 2185 if (hstate_is_gigantic(h)) { 2186 if (!alloc_bootmem_huge_page(h)) 2187 break; 2188 } else if (!alloc_fresh_huge_page(h, 2189 &node_states[N_MEMORY])) 2190 break; 2191 } 2192 h->max_huge_pages = i; 2193 } 2194 2195 static void __init hugetlb_init_hstates(void) 2196 { 2197 struct hstate *h; 2198 2199 for_each_hstate(h) { 2200 if (minimum_order > huge_page_order(h)) 2201 minimum_order = huge_page_order(h); 2202 2203 /* oversize hugepages were init'ed in early boot */ 2204 if (!hstate_is_gigantic(h)) 2205 hugetlb_hstate_alloc_pages(h); 2206 } 2207 VM_BUG_ON(minimum_order == UINT_MAX); 2208 } 2209 2210 static char * __init memfmt(char *buf, unsigned long n) 2211 { 2212 if (n >= (1UL << 30)) 2213 sprintf(buf, "%lu GB", n >> 30); 2214 else if (n >= (1UL << 20)) 2215 sprintf(buf, "%lu MB", n >> 20); 2216 else 2217 sprintf(buf, "%lu KB", n >> 10); 2218 return buf; 2219 } 2220 2221 static void __init report_hugepages(void) 2222 { 2223 struct hstate *h; 2224 2225 for_each_hstate(h) { 2226 char buf[32]; 2227 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", 2228 memfmt(buf, huge_page_size(h)), 2229 h->free_huge_pages); 2230 } 2231 } 2232 2233 #ifdef CONFIG_HIGHMEM 2234 static void try_to_free_low(struct hstate *h, unsigned long count, 2235 nodemask_t *nodes_allowed) 2236 { 2237 int i; 2238 2239 if (hstate_is_gigantic(h)) 2240 return; 2241 2242 for_each_node_mask(i, *nodes_allowed) { 2243 struct page *page, *next; 2244 struct list_head *freel = &h->hugepage_freelists[i]; 2245 list_for_each_entry_safe(page, next, freel, lru) { 2246 if (count >= h->nr_huge_pages) 2247 return; 2248 if (PageHighMem(page)) 2249 continue; 2250 list_del(&page->lru); 2251 update_and_free_page(h, page); 2252 h->free_huge_pages--; 2253 h->free_huge_pages_node[page_to_nid(page)]--; 2254 } 2255 } 2256 } 2257 #else 2258 static inline void try_to_free_low(struct hstate *h, unsigned long count, 2259 nodemask_t *nodes_allowed) 2260 { 2261 } 2262 #endif 2263 2264 /* 2265 * Increment or decrement surplus_huge_pages. Keep node-specific counters 2266 * balanced by operating on them in a round-robin fashion. 2267 * Returns 1 if an adjustment was made. 2268 */ 2269 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 2270 int delta) 2271 { 2272 int nr_nodes, node; 2273 2274 VM_BUG_ON(delta != -1 && delta != 1); 2275 2276 if (delta < 0) { 2277 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 2278 if (h->surplus_huge_pages_node[node]) 2279 goto found; 2280 } 2281 } else { 2282 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 2283 if (h->surplus_huge_pages_node[node] < 2284 h->nr_huge_pages_node[node]) 2285 goto found; 2286 } 2287 } 2288 return 0; 2289 2290 found: 2291 h->surplus_huge_pages += delta; 2292 h->surplus_huge_pages_node[node] += delta; 2293 return 1; 2294 } 2295 2296 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 2297 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, 2298 nodemask_t *nodes_allowed) 2299 { 2300 unsigned long min_count, ret; 2301 2302 if (hstate_is_gigantic(h) && !gigantic_page_supported()) 2303 return h->max_huge_pages; 2304 2305 /* 2306 * Increase the pool size 2307 * First take pages out of surplus state. Then make up the 2308 * remaining difference by allocating fresh huge pages. 2309 * 2310 * We might race with __alloc_buddy_huge_page() here and be unable 2311 * to convert a surplus huge page to a normal huge page. That is 2312 * not critical, though, it just means the overall size of the 2313 * pool might be one hugepage larger than it needs to be, but 2314 * within all the constraints specified by the sysctls. 2315 */ 2316 spin_lock(&hugetlb_lock); 2317 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 2318 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 2319 break; 2320 } 2321 2322 while (count > persistent_huge_pages(h)) { 2323 /* 2324 * If this allocation races such that we no longer need the 2325 * page, free_huge_page will handle it by freeing the page 2326 * and reducing the surplus. 2327 */ 2328 spin_unlock(&hugetlb_lock); 2329 2330 /* yield cpu to avoid soft lockup */ 2331 cond_resched(); 2332 2333 if (hstate_is_gigantic(h)) 2334 ret = alloc_fresh_gigantic_page(h, nodes_allowed); 2335 else 2336 ret = alloc_fresh_huge_page(h, nodes_allowed); 2337 spin_lock(&hugetlb_lock); 2338 if (!ret) 2339 goto out; 2340 2341 /* Bail for signals. Probably ctrl-c from user */ 2342 if (signal_pending(current)) 2343 goto out; 2344 } 2345 2346 /* 2347 * Decrease the pool size 2348 * First return free pages to the buddy allocator (being careful 2349 * to keep enough around to satisfy reservations). Then place 2350 * pages into surplus state as needed so the pool will shrink 2351 * to the desired size as pages become free. 2352 * 2353 * By placing pages into the surplus state independent of the 2354 * overcommit value, we are allowing the surplus pool size to 2355 * exceed overcommit. There are few sane options here. Since 2356 * __alloc_buddy_huge_page() is checking the global counter, 2357 * though, we'll note that we're not allowed to exceed surplus 2358 * and won't grow the pool anywhere else. Not until one of the 2359 * sysctls are changed, or the surplus pages go out of use. 2360 */ 2361 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 2362 min_count = max(count, min_count); 2363 try_to_free_low(h, min_count, nodes_allowed); 2364 while (min_count < persistent_huge_pages(h)) { 2365 if (!free_pool_huge_page(h, nodes_allowed, 0)) 2366 break; 2367 cond_resched_lock(&hugetlb_lock); 2368 } 2369 while (count < persistent_huge_pages(h)) { 2370 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 2371 break; 2372 } 2373 out: 2374 ret = persistent_huge_pages(h); 2375 spin_unlock(&hugetlb_lock); 2376 return ret; 2377 } 2378 2379 #define HSTATE_ATTR_RO(_name) \ 2380 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 2381 2382 #define HSTATE_ATTR(_name) \ 2383 static struct kobj_attribute _name##_attr = \ 2384 __ATTR(_name, 0644, _name##_show, _name##_store) 2385 2386 static struct kobject *hugepages_kobj; 2387 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 2388 2389 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 2390 2391 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 2392 { 2393 int i; 2394 2395 for (i = 0; i < HUGE_MAX_HSTATE; i++) 2396 if (hstate_kobjs[i] == kobj) { 2397 if (nidp) 2398 *nidp = NUMA_NO_NODE; 2399 return &hstates[i]; 2400 } 2401 2402 return kobj_to_node_hstate(kobj, nidp); 2403 } 2404 2405 static ssize_t nr_hugepages_show_common(struct kobject *kobj, 2406 struct kobj_attribute *attr, char *buf) 2407 { 2408 struct hstate *h; 2409 unsigned long nr_huge_pages; 2410 int nid; 2411 2412 h = kobj_to_hstate(kobj, &nid); 2413 if (nid == NUMA_NO_NODE) 2414 nr_huge_pages = h->nr_huge_pages; 2415 else 2416 nr_huge_pages = h->nr_huge_pages_node[nid]; 2417 2418 return sprintf(buf, "%lu\n", nr_huge_pages); 2419 } 2420 2421 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 2422 struct hstate *h, int nid, 2423 unsigned long count, size_t len) 2424 { 2425 int err; 2426 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 2427 2428 if (hstate_is_gigantic(h) && !gigantic_page_supported()) { 2429 err = -EINVAL; 2430 goto out; 2431 } 2432 2433 if (nid == NUMA_NO_NODE) { 2434 /* 2435 * global hstate attribute 2436 */ 2437 if (!(obey_mempolicy && 2438 init_nodemask_of_mempolicy(nodes_allowed))) { 2439 NODEMASK_FREE(nodes_allowed); 2440 nodes_allowed = &node_states[N_MEMORY]; 2441 } 2442 } else if (nodes_allowed) { 2443 /* 2444 * per node hstate attribute: adjust count to global, 2445 * but restrict alloc/free to the specified node. 2446 */ 2447 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 2448 init_nodemask_of_node(nodes_allowed, nid); 2449 } else 2450 nodes_allowed = &node_states[N_MEMORY]; 2451 2452 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 2453 2454 if (nodes_allowed != &node_states[N_MEMORY]) 2455 NODEMASK_FREE(nodes_allowed); 2456 2457 return len; 2458 out: 2459 NODEMASK_FREE(nodes_allowed); 2460 return err; 2461 } 2462 2463 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 2464 struct kobject *kobj, const char *buf, 2465 size_t len) 2466 { 2467 struct hstate *h; 2468 unsigned long count; 2469 int nid; 2470 int err; 2471 2472 err = kstrtoul(buf, 10, &count); 2473 if (err) 2474 return err; 2475 2476 h = kobj_to_hstate(kobj, &nid); 2477 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 2478 } 2479 2480 static ssize_t nr_hugepages_show(struct kobject *kobj, 2481 struct kobj_attribute *attr, char *buf) 2482 { 2483 return nr_hugepages_show_common(kobj, attr, buf); 2484 } 2485 2486 static ssize_t nr_hugepages_store(struct kobject *kobj, 2487 struct kobj_attribute *attr, const char *buf, size_t len) 2488 { 2489 return nr_hugepages_store_common(false, kobj, buf, len); 2490 } 2491 HSTATE_ATTR(nr_hugepages); 2492 2493 #ifdef CONFIG_NUMA 2494 2495 /* 2496 * hstate attribute for optionally mempolicy-based constraint on persistent 2497 * huge page alloc/free. 2498 */ 2499 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 2500 struct kobj_attribute *attr, char *buf) 2501 { 2502 return nr_hugepages_show_common(kobj, attr, buf); 2503 } 2504 2505 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 2506 struct kobj_attribute *attr, const char *buf, size_t len) 2507 { 2508 return nr_hugepages_store_common(true, kobj, buf, len); 2509 } 2510 HSTATE_ATTR(nr_hugepages_mempolicy); 2511 #endif 2512 2513 2514 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 2515 struct kobj_attribute *attr, char *buf) 2516 { 2517 struct hstate *h = kobj_to_hstate(kobj, NULL); 2518 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 2519 } 2520 2521 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 2522 struct kobj_attribute *attr, const char *buf, size_t count) 2523 { 2524 int err; 2525 unsigned long input; 2526 struct hstate *h = kobj_to_hstate(kobj, NULL); 2527 2528 if (hstate_is_gigantic(h)) 2529 return -EINVAL; 2530 2531 err = kstrtoul(buf, 10, &input); 2532 if (err) 2533 return err; 2534 2535 spin_lock(&hugetlb_lock); 2536 h->nr_overcommit_huge_pages = input; 2537 spin_unlock(&hugetlb_lock); 2538 2539 return count; 2540 } 2541 HSTATE_ATTR(nr_overcommit_hugepages); 2542 2543 static ssize_t free_hugepages_show(struct kobject *kobj, 2544 struct kobj_attribute *attr, char *buf) 2545 { 2546 struct hstate *h; 2547 unsigned long free_huge_pages; 2548 int nid; 2549 2550 h = kobj_to_hstate(kobj, &nid); 2551 if (nid == NUMA_NO_NODE) 2552 free_huge_pages = h->free_huge_pages; 2553 else 2554 free_huge_pages = h->free_huge_pages_node[nid]; 2555 2556 return sprintf(buf, "%lu\n", free_huge_pages); 2557 } 2558 HSTATE_ATTR_RO(free_hugepages); 2559 2560 static ssize_t resv_hugepages_show(struct kobject *kobj, 2561 struct kobj_attribute *attr, char *buf) 2562 { 2563 struct hstate *h = kobj_to_hstate(kobj, NULL); 2564 return sprintf(buf, "%lu\n", h->resv_huge_pages); 2565 } 2566 HSTATE_ATTR_RO(resv_hugepages); 2567 2568 static ssize_t surplus_hugepages_show(struct kobject *kobj, 2569 struct kobj_attribute *attr, char *buf) 2570 { 2571 struct hstate *h; 2572 unsigned long surplus_huge_pages; 2573 int nid; 2574 2575 h = kobj_to_hstate(kobj, &nid); 2576 if (nid == NUMA_NO_NODE) 2577 surplus_huge_pages = h->surplus_huge_pages; 2578 else 2579 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 2580 2581 return sprintf(buf, "%lu\n", surplus_huge_pages); 2582 } 2583 HSTATE_ATTR_RO(surplus_hugepages); 2584 2585 static struct attribute *hstate_attrs[] = { 2586 &nr_hugepages_attr.attr, 2587 &nr_overcommit_hugepages_attr.attr, 2588 &free_hugepages_attr.attr, 2589 &resv_hugepages_attr.attr, 2590 &surplus_hugepages_attr.attr, 2591 #ifdef CONFIG_NUMA 2592 &nr_hugepages_mempolicy_attr.attr, 2593 #endif 2594 NULL, 2595 }; 2596 2597 static struct attribute_group hstate_attr_group = { 2598 .attrs = hstate_attrs, 2599 }; 2600 2601 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 2602 struct kobject **hstate_kobjs, 2603 struct attribute_group *hstate_attr_group) 2604 { 2605 int retval; 2606 int hi = hstate_index(h); 2607 2608 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 2609 if (!hstate_kobjs[hi]) 2610 return -ENOMEM; 2611 2612 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 2613 if (retval) 2614 kobject_put(hstate_kobjs[hi]); 2615 2616 return retval; 2617 } 2618 2619 static void __init hugetlb_sysfs_init(void) 2620 { 2621 struct hstate *h; 2622 int err; 2623 2624 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 2625 if (!hugepages_kobj) 2626 return; 2627 2628 for_each_hstate(h) { 2629 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 2630 hstate_kobjs, &hstate_attr_group); 2631 if (err) 2632 pr_err("Hugetlb: Unable to add hstate %s", h->name); 2633 } 2634 } 2635 2636 #ifdef CONFIG_NUMA 2637 2638 /* 2639 * node_hstate/s - associate per node hstate attributes, via their kobjects, 2640 * with node devices in node_devices[] using a parallel array. The array 2641 * index of a node device or _hstate == node id. 2642 * This is here to avoid any static dependency of the node device driver, in 2643 * the base kernel, on the hugetlb module. 2644 */ 2645 struct node_hstate { 2646 struct kobject *hugepages_kobj; 2647 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 2648 }; 2649 static struct node_hstate node_hstates[MAX_NUMNODES]; 2650 2651 /* 2652 * A subset of global hstate attributes for node devices 2653 */ 2654 static struct attribute *per_node_hstate_attrs[] = { 2655 &nr_hugepages_attr.attr, 2656 &free_hugepages_attr.attr, 2657 &surplus_hugepages_attr.attr, 2658 NULL, 2659 }; 2660 2661 static struct attribute_group per_node_hstate_attr_group = { 2662 .attrs = per_node_hstate_attrs, 2663 }; 2664 2665 /* 2666 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 2667 * Returns node id via non-NULL nidp. 2668 */ 2669 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 2670 { 2671 int nid; 2672 2673 for (nid = 0; nid < nr_node_ids; nid++) { 2674 struct node_hstate *nhs = &node_hstates[nid]; 2675 int i; 2676 for (i = 0; i < HUGE_MAX_HSTATE; i++) 2677 if (nhs->hstate_kobjs[i] == kobj) { 2678 if (nidp) 2679 *nidp = nid; 2680 return &hstates[i]; 2681 } 2682 } 2683 2684 BUG(); 2685 return NULL; 2686 } 2687 2688 /* 2689 * Unregister hstate attributes from a single node device. 2690 * No-op if no hstate attributes attached. 2691 */ 2692 static void hugetlb_unregister_node(struct node *node) 2693 { 2694 struct hstate *h; 2695 struct node_hstate *nhs = &node_hstates[node->dev.id]; 2696 2697 if (!nhs->hugepages_kobj) 2698 return; /* no hstate attributes */ 2699 2700 for_each_hstate(h) { 2701 int idx = hstate_index(h); 2702 if (nhs->hstate_kobjs[idx]) { 2703 kobject_put(nhs->hstate_kobjs[idx]); 2704 nhs->hstate_kobjs[idx] = NULL; 2705 } 2706 } 2707 2708 kobject_put(nhs->hugepages_kobj); 2709 nhs->hugepages_kobj = NULL; 2710 } 2711 2712 2713 /* 2714 * Register hstate attributes for a single node device. 2715 * No-op if attributes already registered. 2716 */ 2717 static void hugetlb_register_node(struct node *node) 2718 { 2719 struct hstate *h; 2720 struct node_hstate *nhs = &node_hstates[node->dev.id]; 2721 int err; 2722 2723 if (nhs->hugepages_kobj) 2724 return; /* already allocated */ 2725 2726 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 2727 &node->dev.kobj); 2728 if (!nhs->hugepages_kobj) 2729 return; 2730 2731 for_each_hstate(h) { 2732 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 2733 nhs->hstate_kobjs, 2734 &per_node_hstate_attr_group); 2735 if (err) { 2736 pr_err("Hugetlb: Unable to add hstate %s for node %d\n", 2737 h->name, node->dev.id); 2738 hugetlb_unregister_node(node); 2739 break; 2740 } 2741 } 2742 } 2743 2744 /* 2745 * hugetlb init time: register hstate attributes for all registered node 2746 * devices of nodes that have memory. All on-line nodes should have 2747 * registered their associated device by this time. 2748 */ 2749 static void __init hugetlb_register_all_nodes(void) 2750 { 2751 int nid; 2752 2753 for_each_node_state(nid, N_MEMORY) { 2754 struct node *node = node_devices[nid]; 2755 if (node->dev.id == nid) 2756 hugetlb_register_node(node); 2757 } 2758 2759 /* 2760 * Let the node device driver know we're here so it can 2761 * [un]register hstate attributes on node hotplug. 2762 */ 2763 register_hugetlbfs_with_node(hugetlb_register_node, 2764 hugetlb_unregister_node); 2765 } 2766 #else /* !CONFIG_NUMA */ 2767 2768 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 2769 { 2770 BUG(); 2771 if (nidp) 2772 *nidp = -1; 2773 return NULL; 2774 } 2775 2776 static void hugetlb_register_all_nodes(void) { } 2777 2778 #endif 2779 2780 static int __init hugetlb_init(void) 2781 { 2782 int i; 2783 2784 if (!hugepages_supported()) 2785 return 0; 2786 2787 if (!size_to_hstate(default_hstate_size)) { 2788 default_hstate_size = HPAGE_SIZE; 2789 if (!size_to_hstate(default_hstate_size)) 2790 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 2791 } 2792 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); 2793 if (default_hstate_max_huge_pages) { 2794 if (!default_hstate.max_huge_pages) 2795 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 2796 } 2797 2798 hugetlb_init_hstates(); 2799 gather_bootmem_prealloc(); 2800 report_hugepages(); 2801 2802 hugetlb_sysfs_init(); 2803 hugetlb_register_all_nodes(); 2804 hugetlb_cgroup_file_init(); 2805 2806 #ifdef CONFIG_SMP 2807 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 2808 #else 2809 num_fault_mutexes = 1; 2810 #endif 2811 hugetlb_fault_mutex_table = 2812 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); 2813 BUG_ON(!hugetlb_fault_mutex_table); 2814 2815 for (i = 0; i < num_fault_mutexes; i++) 2816 mutex_init(&hugetlb_fault_mutex_table[i]); 2817 return 0; 2818 } 2819 subsys_initcall(hugetlb_init); 2820 2821 /* Should be called on processing a hugepagesz=... option */ 2822 void __init hugetlb_bad_size(void) 2823 { 2824 parsed_valid_hugepagesz = false; 2825 } 2826 2827 void __init hugetlb_add_hstate(unsigned int order) 2828 { 2829 struct hstate *h; 2830 unsigned long i; 2831 2832 if (size_to_hstate(PAGE_SIZE << order)) { 2833 pr_warn("hugepagesz= specified twice, ignoring\n"); 2834 return; 2835 } 2836 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 2837 BUG_ON(order == 0); 2838 h = &hstates[hugetlb_max_hstate++]; 2839 h->order = order; 2840 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 2841 h->nr_huge_pages = 0; 2842 h->free_huge_pages = 0; 2843 for (i = 0; i < MAX_NUMNODES; ++i) 2844 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 2845 INIT_LIST_HEAD(&h->hugepage_activelist); 2846 h->next_nid_to_alloc = first_memory_node; 2847 h->next_nid_to_free = first_memory_node; 2848 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 2849 huge_page_size(h)/1024); 2850 2851 parsed_hstate = h; 2852 } 2853 2854 static int __init hugetlb_nrpages_setup(char *s) 2855 { 2856 unsigned long *mhp; 2857 static unsigned long *last_mhp; 2858 2859 if (!parsed_valid_hugepagesz) { 2860 pr_warn("hugepages = %s preceded by " 2861 "an unsupported hugepagesz, ignoring\n", s); 2862 parsed_valid_hugepagesz = true; 2863 return 1; 2864 } 2865 /* 2866 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, 2867 * so this hugepages= parameter goes to the "default hstate". 2868 */ 2869 else if (!hugetlb_max_hstate) 2870 mhp = &default_hstate_max_huge_pages; 2871 else 2872 mhp = &parsed_hstate->max_huge_pages; 2873 2874 if (mhp == last_mhp) { 2875 pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); 2876 return 1; 2877 } 2878 2879 if (sscanf(s, "%lu", mhp) <= 0) 2880 *mhp = 0; 2881 2882 /* 2883 * Global state is always initialized later in hugetlb_init. 2884 * But we need to allocate >= MAX_ORDER hstates here early to still 2885 * use the bootmem allocator. 2886 */ 2887 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) 2888 hugetlb_hstate_alloc_pages(parsed_hstate); 2889 2890 last_mhp = mhp; 2891 2892 return 1; 2893 } 2894 __setup("hugepages=", hugetlb_nrpages_setup); 2895 2896 static int __init hugetlb_default_setup(char *s) 2897 { 2898 default_hstate_size = memparse(s, &s); 2899 return 1; 2900 } 2901 __setup("default_hugepagesz=", hugetlb_default_setup); 2902 2903 static unsigned int cpuset_mems_nr(unsigned int *array) 2904 { 2905 int node; 2906 unsigned int nr = 0; 2907 2908 for_each_node_mask(node, cpuset_current_mems_allowed) 2909 nr += array[node]; 2910 2911 return nr; 2912 } 2913 2914 #ifdef CONFIG_SYSCTL 2915 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 2916 struct ctl_table *table, int write, 2917 void __user *buffer, size_t *length, loff_t *ppos) 2918 { 2919 struct hstate *h = &default_hstate; 2920 unsigned long tmp = h->max_huge_pages; 2921 int ret; 2922 2923 if (!hugepages_supported()) 2924 return -EOPNOTSUPP; 2925 2926 table->data = &tmp; 2927 table->maxlen = sizeof(unsigned long); 2928 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2929 if (ret) 2930 goto out; 2931 2932 if (write) 2933 ret = __nr_hugepages_store_common(obey_mempolicy, h, 2934 NUMA_NO_NODE, tmp, *length); 2935 out: 2936 return ret; 2937 } 2938 2939 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 2940 void __user *buffer, size_t *length, loff_t *ppos) 2941 { 2942 2943 return hugetlb_sysctl_handler_common(false, table, write, 2944 buffer, length, ppos); 2945 } 2946 2947 #ifdef CONFIG_NUMA 2948 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 2949 void __user *buffer, size_t *length, loff_t *ppos) 2950 { 2951 return hugetlb_sysctl_handler_common(true, table, write, 2952 buffer, length, ppos); 2953 } 2954 #endif /* CONFIG_NUMA */ 2955 2956 int hugetlb_overcommit_handler(struct ctl_table *table, int write, 2957 void __user *buffer, 2958 size_t *length, loff_t *ppos) 2959 { 2960 struct hstate *h = &default_hstate; 2961 unsigned long tmp; 2962 int ret; 2963 2964 if (!hugepages_supported()) 2965 return -EOPNOTSUPP; 2966 2967 tmp = h->nr_overcommit_huge_pages; 2968 2969 if (write && hstate_is_gigantic(h)) 2970 return -EINVAL; 2971 2972 table->data = &tmp; 2973 table->maxlen = sizeof(unsigned long); 2974 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2975 if (ret) 2976 goto out; 2977 2978 if (write) { 2979 spin_lock(&hugetlb_lock); 2980 h->nr_overcommit_huge_pages = tmp; 2981 spin_unlock(&hugetlb_lock); 2982 } 2983 out: 2984 return ret; 2985 } 2986 2987 #endif /* CONFIG_SYSCTL */ 2988 2989 void hugetlb_report_meminfo(struct seq_file *m) 2990 { 2991 struct hstate *h = &default_hstate; 2992 if (!hugepages_supported()) 2993 return; 2994 seq_printf(m, 2995 "HugePages_Total: %5lu\n" 2996 "HugePages_Free: %5lu\n" 2997 "HugePages_Rsvd: %5lu\n" 2998 "HugePages_Surp: %5lu\n" 2999 "Hugepagesize: %8lu kB\n", 3000 h->nr_huge_pages, 3001 h->free_huge_pages, 3002 h->resv_huge_pages, 3003 h->surplus_huge_pages, 3004 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 3005 } 3006 3007 int hugetlb_report_node_meminfo(int nid, char *buf) 3008 { 3009 struct hstate *h = &default_hstate; 3010 if (!hugepages_supported()) 3011 return 0; 3012 return sprintf(buf, 3013 "Node %d HugePages_Total: %5u\n" 3014 "Node %d HugePages_Free: %5u\n" 3015 "Node %d HugePages_Surp: %5u\n", 3016 nid, h->nr_huge_pages_node[nid], 3017 nid, h->free_huge_pages_node[nid], 3018 nid, h->surplus_huge_pages_node[nid]); 3019 } 3020 3021 void hugetlb_show_meminfo(void) 3022 { 3023 struct hstate *h; 3024 int nid; 3025 3026 if (!hugepages_supported()) 3027 return; 3028 3029 for_each_node_state(nid, N_MEMORY) 3030 for_each_hstate(h) 3031 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 3032 nid, 3033 h->nr_huge_pages_node[nid], 3034 h->free_huge_pages_node[nid], 3035 h->surplus_huge_pages_node[nid], 3036 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 3037 } 3038 3039 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) 3040 { 3041 seq_printf(m, "HugetlbPages:\t%8lu kB\n", 3042 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); 3043 } 3044 3045 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 3046 unsigned long hugetlb_total_pages(void) 3047 { 3048 struct hstate *h; 3049 unsigned long nr_total_pages = 0; 3050 3051 for_each_hstate(h) 3052 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 3053 return nr_total_pages; 3054 } 3055 3056 static int hugetlb_acct_memory(struct hstate *h, long delta) 3057 { 3058 int ret = -ENOMEM; 3059 3060 spin_lock(&hugetlb_lock); 3061 /* 3062 * When cpuset is configured, it breaks the strict hugetlb page 3063 * reservation as the accounting is done on a global variable. Such 3064 * reservation is completely rubbish in the presence of cpuset because 3065 * the reservation is not checked against page availability for the 3066 * current cpuset. Application can still potentially OOM'ed by kernel 3067 * with lack of free htlb page in cpuset that the task is in. 3068 * Attempt to enforce strict accounting with cpuset is almost 3069 * impossible (or too ugly) because cpuset is too fluid that 3070 * task or memory node can be dynamically moved between cpusets. 3071 * 3072 * The change of semantics for shared hugetlb mapping with cpuset is 3073 * undesirable. However, in order to preserve some of the semantics, 3074 * we fall back to check against current free page availability as 3075 * a best attempt and hopefully to minimize the impact of changing 3076 * semantics that cpuset has. 3077 */ 3078 if (delta > 0) { 3079 if (gather_surplus_pages(h, delta) < 0) 3080 goto out; 3081 3082 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { 3083 return_unused_surplus_pages(h, delta); 3084 goto out; 3085 } 3086 } 3087 3088 ret = 0; 3089 if (delta < 0) 3090 return_unused_surplus_pages(h, (unsigned long) -delta); 3091 3092 out: 3093 spin_unlock(&hugetlb_lock); 3094 return ret; 3095 } 3096 3097 static void hugetlb_vm_op_open(struct vm_area_struct *vma) 3098 { 3099 struct resv_map *resv = vma_resv_map(vma); 3100 3101 /* 3102 * This new VMA should share its siblings reservation map if present. 3103 * The VMA will only ever have a valid reservation map pointer where 3104 * it is being copied for another still existing VMA. As that VMA 3105 * has a reference to the reservation map it cannot disappear until 3106 * after this open call completes. It is therefore safe to take a 3107 * new reference here without additional locking. 3108 */ 3109 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3110 kref_get(&resv->refs); 3111 } 3112 3113 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 3114 { 3115 struct hstate *h = hstate_vma(vma); 3116 struct resv_map *resv = vma_resv_map(vma); 3117 struct hugepage_subpool *spool = subpool_vma(vma); 3118 unsigned long reserve, start, end; 3119 long gbl_reserve; 3120 3121 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3122 return; 3123 3124 start = vma_hugecache_offset(h, vma, vma->vm_start); 3125 end = vma_hugecache_offset(h, vma, vma->vm_end); 3126 3127 reserve = (end - start) - region_count(resv, start, end); 3128 3129 kref_put(&resv->refs, resv_map_release); 3130 3131 if (reserve) { 3132 /* 3133 * Decrement reserve counts. The global reserve count may be 3134 * adjusted if the subpool has a minimum size. 3135 */ 3136 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 3137 hugetlb_acct_memory(h, -gbl_reserve); 3138 } 3139 } 3140 3141 /* 3142 * We cannot handle pagefaults against hugetlb pages at all. They cause 3143 * handle_mm_fault() to try to instantiate regular-sized pages in the 3144 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 3145 * this far. 3146 */ 3147 static int hugetlb_vm_op_fault(struct vm_fault *vmf) 3148 { 3149 BUG(); 3150 return 0; 3151 } 3152 3153 const struct vm_operations_struct hugetlb_vm_ops = { 3154 .fault = hugetlb_vm_op_fault, 3155 .open = hugetlb_vm_op_open, 3156 .close = hugetlb_vm_op_close, 3157 }; 3158 3159 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 3160 int writable) 3161 { 3162 pte_t entry; 3163 3164 if (writable) { 3165 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 3166 vma->vm_page_prot))); 3167 } else { 3168 entry = huge_pte_wrprotect(mk_huge_pte(page, 3169 vma->vm_page_prot)); 3170 } 3171 entry = pte_mkyoung(entry); 3172 entry = pte_mkhuge(entry); 3173 entry = arch_make_huge_pte(entry, vma, page, writable); 3174 3175 return entry; 3176 } 3177 3178 static void set_huge_ptep_writable(struct vm_area_struct *vma, 3179 unsigned long address, pte_t *ptep) 3180 { 3181 pte_t entry; 3182 3183 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 3184 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 3185 update_mmu_cache(vma, address, ptep); 3186 } 3187 3188 static int is_hugetlb_entry_migration(pte_t pte) 3189 { 3190 swp_entry_t swp; 3191 3192 if (huge_pte_none(pte) || pte_present(pte)) 3193 return 0; 3194 swp = pte_to_swp_entry(pte); 3195 if (non_swap_entry(swp) && is_migration_entry(swp)) 3196 return 1; 3197 else 3198 return 0; 3199 } 3200 3201 static int is_hugetlb_entry_hwpoisoned(pte_t pte) 3202 { 3203 swp_entry_t swp; 3204 3205 if (huge_pte_none(pte) || pte_present(pte)) 3206 return 0; 3207 swp = pte_to_swp_entry(pte); 3208 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) 3209 return 1; 3210 else 3211 return 0; 3212 } 3213 3214 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 3215 struct vm_area_struct *vma) 3216 { 3217 pte_t *src_pte, *dst_pte, entry; 3218 struct page *ptepage; 3219 unsigned long addr; 3220 int cow; 3221 struct hstate *h = hstate_vma(vma); 3222 unsigned long sz = huge_page_size(h); 3223 unsigned long mmun_start; /* For mmu_notifiers */ 3224 unsigned long mmun_end; /* For mmu_notifiers */ 3225 int ret = 0; 3226 3227 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 3228 3229 mmun_start = vma->vm_start; 3230 mmun_end = vma->vm_end; 3231 if (cow) 3232 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); 3233 3234 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 3235 spinlock_t *src_ptl, *dst_ptl; 3236 src_pte = huge_pte_offset(src, addr); 3237 if (!src_pte) 3238 continue; 3239 dst_pte = huge_pte_alloc(dst, addr, sz); 3240 if (!dst_pte) { 3241 ret = -ENOMEM; 3242 break; 3243 } 3244 3245 /* If the pagetables are shared don't copy or take references */ 3246 if (dst_pte == src_pte) 3247 continue; 3248 3249 dst_ptl = huge_pte_lock(h, dst, dst_pte); 3250 src_ptl = huge_pte_lockptr(h, src, src_pte); 3251 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 3252 entry = huge_ptep_get(src_pte); 3253 if (huge_pte_none(entry)) { /* skip none entry */ 3254 ; 3255 } else if (unlikely(is_hugetlb_entry_migration(entry) || 3256 is_hugetlb_entry_hwpoisoned(entry))) { 3257 swp_entry_t swp_entry = pte_to_swp_entry(entry); 3258 3259 if (is_write_migration_entry(swp_entry) && cow) { 3260 /* 3261 * COW mappings require pages in both 3262 * parent and child to be set to read. 3263 */ 3264 make_migration_entry_read(&swp_entry); 3265 entry = swp_entry_to_pte(swp_entry); 3266 set_huge_pte_at(src, addr, src_pte, entry); 3267 } 3268 set_huge_pte_at(dst, addr, dst_pte, entry); 3269 } else { 3270 if (cow) { 3271 huge_ptep_set_wrprotect(src, addr, src_pte); 3272 mmu_notifier_invalidate_range(src, mmun_start, 3273 mmun_end); 3274 } 3275 entry = huge_ptep_get(src_pte); 3276 ptepage = pte_page(entry); 3277 get_page(ptepage); 3278 page_dup_rmap(ptepage, true); 3279 set_huge_pte_at(dst, addr, dst_pte, entry); 3280 hugetlb_count_add(pages_per_huge_page(h), dst); 3281 } 3282 spin_unlock(src_ptl); 3283 spin_unlock(dst_ptl); 3284 } 3285 3286 if (cow) 3287 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); 3288 3289 return ret; 3290 } 3291 3292 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 3293 unsigned long start, unsigned long end, 3294 struct page *ref_page) 3295 { 3296 struct mm_struct *mm = vma->vm_mm; 3297 unsigned long address; 3298 pte_t *ptep; 3299 pte_t pte; 3300 spinlock_t *ptl; 3301 struct page *page; 3302 struct hstate *h = hstate_vma(vma); 3303 unsigned long sz = huge_page_size(h); 3304 const unsigned long mmun_start = start; /* For mmu_notifiers */ 3305 const unsigned long mmun_end = end; /* For mmu_notifiers */ 3306 3307 WARN_ON(!is_vm_hugetlb_page(vma)); 3308 BUG_ON(start & ~huge_page_mask(h)); 3309 BUG_ON(end & ~huge_page_mask(h)); 3310 3311 /* 3312 * This is a hugetlb vma, all the pte entries should point 3313 * to huge page. 3314 */ 3315 tlb_remove_check_page_size_change(tlb, sz); 3316 tlb_start_vma(tlb, vma); 3317 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 3318 address = start; 3319 for (; address < end; address += sz) { 3320 ptep = huge_pte_offset(mm, address); 3321 if (!ptep) 3322 continue; 3323 3324 ptl = huge_pte_lock(h, mm, ptep); 3325 if (huge_pmd_unshare(mm, &address, ptep)) { 3326 spin_unlock(ptl); 3327 continue; 3328 } 3329 3330 pte = huge_ptep_get(ptep); 3331 if (huge_pte_none(pte)) { 3332 spin_unlock(ptl); 3333 continue; 3334 } 3335 3336 /* 3337 * Migrating hugepage or HWPoisoned hugepage is already 3338 * unmapped and its refcount is dropped, so just clear pte here. 3339 */ 3340 if (unlikely(!pte_present(pte))) { 3341 huge_pte_clear(mm, address, ptep); 3342 spin_unlock(ptl); 3343 continue; 3344 } 3345 3346 page = pte_page(pte); 3347 /* 3348 * If a reference page is supplied, it is because a specific 3349 * page is being unmapped, not a range. Ensure the page we 3350 * are about to unmap is the actual page of interest. 3351 */ 3352 if (ref_page) { 3353 if (page != ref_page) { 3354 spin_unlock(ptl); 3355 continue; 3356 } 3357 /* 3358 * Mark the VMA as having unmapped its page so that 3359 * future faults in this VMA will fail rather than 3360 * looking like data was lost 3361 */ 3362 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 3363 } 3364 3365 pte = huge_ptep_get_and_clear(mm, address, ptep); 3366 tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 3367 if (huge_pte_dirty(pte)) 3368 set_page_dirty(page); 3369 3370 hugetlb_count_sub(pages_per_huge_page(h), mm); 3371 page_remove_rmap(page, true); 3372 3373 spin_unlock(ptl); 3374 tlb_remove_page_size(tlb, page, huge_page_size(h)); 3375 /* 3376 * Bail out after unmapping reference page if supplied 3377 */ 3378 if (ref_page) 3379 break; 3380 } 3381 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3382 tlb_end_vma(tlb, vma); 3383 } 3384 3385 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 3386 struct vm_area_struct *vma, unsigned long start, 3387 unsigned long end, struct page *ref_page) 3388 { 3389 __unmap_hugepage_range(tlb, vma, start, end, ref_page); 3390 3391 /* 3392 * Clear this flag so that x86's huge_pmd_share page_table_shareable 3393 * test will fail on a vma being torn down, and not grab a page table 3394 * on its way out. We're lucky that the flag has such an appropriate 3395 * name, and can in fact be safely cleared here. We could clear it 3396 * before the __unmap_hugepage_range above, but all that's necessary 3397 * is to clear it before releasing the i_mmap_rwsem. This works 3398 * because in the context this is called, the VMA is about to be 3399 * destroyed and the i_mmap_rwsem is held. 3400 */ 3401 vma->vm_flags &= ~VM_MAYSHARE; 3402 } 3403 3404 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 3405 unsigned long end, struct page *ref_page) 3406 { 3407 struct mm_struct *mm; 3408 struct mmu_gather tlb; 3409 3410 mm = vma->vm_mm; 3411 3412 tlb_gather_mmu(&tlb, mm, start, end); 3413 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 3414 tlb_finish_mmu(&tlb, start, end); 3415 } 3416 3417 /* 3418 * This is called when the original mapper is failing to COW a MAP_PRIVATE 3419 * mappping it owns the reserve page for. The intention is to unmap the page 3420 * from other VMAs and let the children be SIGKILLed if they are faulting the 3421 * same region. 3422 */ 3423 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 3424 struct page *page, unsigned long address) 3425 { 3426 struct hstate *h = hstate_vma(vma); 3427 struct vm_area_struct *iter_vma; 3428 struct address_space *mapping; 3429 pgoff_t pgoff; 3430 3431 /* 3432 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 3433 * from page cache lookup which is in HPAGE_SIZE units. 3434 */ 3435 address = address & huge_page_mask(h); 3436 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 3437 vma->vm_pgoff; 3438 mapping = vma->vm_file->f_mapping; 3439 3440 /* 3441 * Take the mapping lock for the duration of the table walk. As 3442 * this mapping should be shared between all the VMAs, 3443 * __unmap_hugepage_range() is called as the lock is already held 3444 */ 3445 i_mmap_lock_write(mapping); 3446 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 3447 /* Do not unmap the current VMA */ 3448 if (iter_vma == vma) 3449 continue; 3450 3451 /* 3452 * Shared VMAs have their own reserves and do not affect 3453 * MAP_PRIVATE accounting but it is possible that a shared 3454 * VMA is using the same page so check and skip such VMAs. 3455 */ 3456 if (iter_vma->vm_flags & VM_MAYSHARE) 3457 continue; 3458 3459 /* 3460 * Unmap the page from other VMAs without their own reserves. 3461 * They get marked to be SIGKILLed if they fault in these 3462 * areas. This is because a future no-page fault on this VMA 3463 * could insert a zeroed page instead of the data existing 3464 * from the time of fork. This would look like data corruption 3465 */ 3466 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 3467 unmap_hugepage_range(iter_vma, address, 3468 address + huge_page_size(h), page); 3469 } 3470 i_mmap_unlock_write(mapping); 3471 } 3472 3473 /* 3474 * Hugetlb_cow() should be called with page lock of the original hugepage held. 3475 * Called with hugetlb_instantiation_mutex held and pte_page locked so we 3476 * cannot race with other handlers or page migration. 3477 * Keep the pte_same checks anyway to make transition from the mutex easier. 3478 */ 3479 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 3480 unsigned long address, pte_t *ptep, 3481 struct page *pagecache_page, spinlock_t *ptl) 3482 { 3483 pte_t pte; 3484 struct hstate *h = hstate_vma(vma); 3485 struct page *old_page, *new_page; 3486 int ret = 0, outside_reserve = 0; 3487 unsigned long mmun_start; /* For mmu_notifiers */ 3488 unsigned long mmun_end; /* For mmu_notifiers */ 3489 3490 pte = huge_ptep_get(ptep); 3491 old_page = pte_page(pte); 3492 3493 retry_avoidcopy: 3494 /* If no-one else is actually using this page, avoid the copy 3495 * and just make the page writable */ 3496 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 3497 page_move_anon_rmap(old_page, vma); 3498 set_huge_ptep_writable(vma, address, ptep); 3499 return 0; 3500 } 3501 3502 /* 3503 * If the process that created a MAP_PRIVATE mapping is about to 3504 * perform a COW due to a shared page count, attempt to satisfy 3505 * the allocation without using the existing reserves. The pagecache 3506 * page is used to determine if the reserve at this address was 3507 * consumed or not. If reserves were used, a partial faulted mapping 3508 * at the time of fork() could consume its reserves on COW instead 3509 * of the full address range. 3510 */ 3511 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 3512 old_page != pagecache_page) 3513 outside_reserve = 1; 3514 3515 get_page(old_page); 3516 3517 /* 3518 * Drop page table lock as buddy allocator may be called. It will 3519 * be acquired again before returning to the caller, as expected. 3520 */ 3521 spin_unlock(ptl); 3522 new_page = alloc_huge_page(vma, address, outside_reserve); 3523 3524 if (IS_ERR(new_page)) { 3525 /* 3526 * If a process owning a MAP_PRIVATE mapping fails to COW, 3527 * it is due to references held by a child and an insufficient 3528 * huge page pool. To guarantee the original mappers 3529 * reliability, unmap the page from child processes. The child 3530 * may get SIGKILLed if it later faults. 3531 */ 3532 if (outside_reserve) { 3533 put_page(old_page); 3534 BUG_ON(huge_pte_none(pte)); 3535 unmap_ref_private(mm, vma, old_page, address); 3536 BUG_ON(huge_pte_none(pte)); 3537 spin_lock(ptl); 3538 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 3539 if (likely(ptep && 3540 pte_same(huge_ptep_get(ptep), pte))) 3541 goto retry_avoidcopy; 3542 /* 3543 * race occurs while re-acquiring page table 3544 * lock, and our job is done. 3545 */ 3546 return 0; 3547 } 3548 3549 ret = (PTR_ERR(new_page) == -ENOMEM) ? 3550 VM_FAULT_OOM : VM_FAULT_SIGBUS; 3551 goto out_release_old; 3552 } 3553 3554 /* 3555 * When the original hugepage is shared one, it does not have 3556 * anon_vma prepared. 3557 */ 3558 if (unlikely(anon_vma_prepare(vma))) { 3559 ret = VM_FAULT_OOM; 3560 goto out_release_all; 3561 } 3562 3563 copy_user_huge_page(new_page, old_page, address, vma, 3564 pages_per_huge_page(h)); 3565 __SetPageUptodate(new_page); 3566 set_page_huge_active(new_page); 3567 3568 mmun_start = address & huge_page_mask(h); 3569 mmun_end = mmun_start + huge_page_size(h); 3570 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 3571 3572 /* 3573 * Retake the page table lock to check for racing updates 3574 * before the page tables are altered 3575 */ 3576 spin_lock(ptl); 3577 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 3578 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 3579 ClearPagePrivate(new_page); 3580 3581 /* Break COW */ 3582 huge_ptep_clear_flush(vma, address, ptep); 3583 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 3584 set_huge_pte_at(mm, address, ptep, 3585 make_huge_pte(vma, new_page, 1)); 3586 page_remove_rmap(old_page, true); 3587 hugepage_add_new_anon_rmap(new_page, vma, address); 3588 /* Make the old page be freed below */ 3589 new_page = old_page; 3590 } 3591 spin_unlock(ptl); 3592 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3593 out_release_all: 3594 restore_reserve_on_error(h, vma, address, new_page); 3595 put_page(new_page); 3596 out_release_old: 3597 put_page(old_page); 3598 3599 spin_lock(ptl); /* Caller expects lock to be held */ 3600 return ret; 3601 } 3602 3603 /* Return the pagecache page at a given address within a VMA */ 3604 static struct page *hugetlbfs_pagecache_page(struct hstate *h, 3605 struct vm_area_struct *vma, unsigned long address) 3606 { 3607 struct address_space *mapping; 3608 pgoff_t idx; 3609 3610 mapping = vma->vm_file->f_mapping; 3611 idx = vma_hugecache_offset(h, vma, address); 3612 3613 return find_lock_page(mapping, idx); 3614 } 3615 3616 /* 3617 * Return whether there is a pagecache page to back given address within VMA. 3618 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 3619 */ 3620 static bool hugetlbfs_pagecache_present(struct hstate *h, 3621 struct vm_area_struct *vma, unsigned long address) 3622 { 3623 struct address_space *mapping; 3624 pgoff_t idx; 3625 struct page *page; 3626 3627 mapping = vma->vm_file->f_mapping; 3628 idx = vma_hugecache_offset(h, vma, address); 3629 3630 page = find_get_page(mapping, idx); 3631 if (page) 3632 put_page(page); 3633 return page != NULL; 3634 } 3635 3636 int huge_add_to_page_cache(struct page *page, struct address_space *mapping, 3637 pgoff_t idx) 3638 { 3639 struct inode *inode = mapping->host; 3640 struct hstate *h = hstate_inode(inode); 3641 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 3642 3643 if (err) 3644 return err; 3645 ClearPagePrivate(page); 3646 3647 spin_lock(&inode->i_lock); 3648 inode->i_blocks += blocks_per_huge_page(h); 3649 spin_unlock(&inode->i_lock); 3650 return 0; 3651 } 3652 3653 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 3654 struct address_space *mapping, pgoff_t idx, 3655 unsigned long address, pte_t *ptep, unsigned int flags) 3656 { 3657 struct hstate *h = hstate_vma(vma); 3658 int ret = VM_FAULT_SIGBUS; 3659 int anon_rmap = 0; 3660 unsigned long size; 3661 struct page *page; 3662 pte_t new_pte; 3663 spinlock_t *ptl; 3664 3665 /* 3666 * Currently, we are forced to kill the process in the event the 3667 * original mapper has unmapped pages from the child due to a failed 3668 * COW. Warn that such a situation has occurred as it may not be obvious 3669 */ 3670 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 3671 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 3672 current->pid); 3673 return ret; 3674 } 3675 3676 /* 3677 * Use page lock to guard against racing truncation 3678 * before we get page_table_lock. 3679 */ 3680 retry: 3681 page = find_lock_page(mapping, idx); 3682 if (!page) { 3683 size = i_size_read(mapping->host) >> huge_page_shift(h); 3684 if (idx >= size) 3685 goto out; 3686 3687 /* 3688 * Check for page in userfault range 3689 */ 3690 if (userfaultfd_missing(vma)) { 3691 u32 hash; 3692 struct vm_fault vmf = { 3693 .vma = vma, 3694 .address = address, 3695 .flags = flags, 3696 /* 3697 * Hard to debug if it ends up being 3698 * used by a callee that assumes 3699 * something about the other 3700 * uninitialized fields... same as in 3701 * memory.c 3702 */ 3703 }; 3704 3705 /* 3706 * hugetlb_fault_mutex must be dropped before 3707 * handling userfault. Reacquire after handling 3708 * fault to make calling code simpler. 3709 */ 3710 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, 3711 idx, address); 3712 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 3713 ret = handle_userfault(&vmf, VM_UFFD_MISSING); 3714 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3715 goto out; 3716 } 3717 3718 page = alloc_huge_page(vma, address, 0); 3719 if (IS_ERR(page)) { 3720 ret = PTR_ERR(page); 3721 if (ret == -ENOMEM) 3722 ret = VM_FAULT_OOM; 3723 else 3724 ret = VM_FAULT_SIGBUS; 3725 goto out; 3726 } 3727 clear_huge_page(page, address, pages_per_huge_page(h)); 3728 __SetPageUptodate(page); 3729 set_page_huge_active(page); 3730 3731 if (vma->vm_flags & VM_MAYSHARE) { 3732 int err = huge_add_to_page_cache(page, mapping, idx); 3733 if (err) { 3734 put_page(page); 3735 if (err == -EEXIST) 3736 goto retry; 3737 goto out; 3738 } 3739 } else { 3740 lock_page(page); 3741 if (unlikely(anon_vma_prepare(vma))) { 3742 ret = VM_FAULT_OOM; 3743 goto backout_unlocked; 3744 } 3745 anon_rmap = 1; 3746 } 3747 } else { 3748 /* 3749 * If memory error occurs between mmap() and fault, some process 3750 * don't have hwpoisoned swap entry for errored virtual address. 3751 * So we need to block hugepage fault by PG_hwpoison bit check. 3752 */ 3753 if (unlikely(PageHWPoison(page))) { 3754 ret = VM_FAULT_HWPOISON | 3755 VM_FAULT_SET_HINDEX(hstate_index(h)); 3756 goto backout_unlocked; 3757 } 3758 } 3759 3760 /* 3761 * If we are going to COW a private mapping later, we examine the 3762 * pending reservations for this page now. This will ensure that 3763 * any allocations necessary to record that reservation occur outside 3764 * the spinlock. 3765 */ 3766 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 3767 if (vma_needs_reservation(h, vma, address) < 0) { 3768 ret = VM_FAULT_OOM; 3769 goto backout_unlocked; 3770 } 3771 /* Just decrements count, does not deallocate */ 3772 vma_end_reservation(h, vma, address); 3773 } 3774 3775 ptl = huge_pte_lock(h, mm, ptep); 3776 size = i_size_read(mapping->host) >> huge_page_shift(h); 3777 if (idx >= size) 3778 goto backout; 3779 3780 ret = 0; 3781 if (!huge_pte_none(huge_ptep_get(ptep))) 3782 goto backout; 3783 3784 if (anon_rmap) { 3785 ClearPagePrivate(page); 3786 hugepage_add_new_anon_rmap(page, vma, address); 3787 } else 3788 page_dup_rmap(page, true); 3789 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 3790 && (vma->vm_flags & VM_SHARED))); 3791 set_huge_pte_at(mm, address, ptep, new_pte); 3792 3793 hugetlb_count_add(pages_per_huge_page(h), mm); 3794 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 3795 /* Optimization, do the COW without a second fault */ 3796 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); 3797 } 3798 3799 spin_unlock(ptl); 3800 unlock_page(page); 3801 out: 3802 return ret; 3803 3804 backout: 3805 spin_unlock(ptl); 3806 backout_unlocked: 3807 unlock_page(page); 3808 restore_reserve_on_error(h, vma, address, page); 3809 put_page(page); 3810 goto out; 3811 } 3812 3813 #ifdef CONFIG_SMP 3814 u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3815 struct vm_area_struct *vma, 3816 struct address_space *mapping, 3817 pgoff_t idx, unsigned long address) 3818 { 3819 unsigned long key[2]; 3820 u32 hash; 3821 3822 if (vma->vm_flags & VM_SHARED) { 3823 key[0] = (unsigned long) mapping; 3824 key[1] = idx; 3825 } else { 3826 key[0] = (unsigned long) mm; 3827 key[1] = address >> huge_page_shift(h); 3828 } 3829 3830 hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); 3831 3832 return hash & (num_fault_mutexes - 1); 3833 } 3834 #else 3835 /* 3836 * For uniprocesor systems we always use a single mutex, so just 3837 * return 0 and avoid the hashing overhead. 3838 */ 3839 u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3840 struct vm_area_struct *vma, 3841 struct address_space *mapping, 3842 pgoff_t idx, unsigned long address) 3843 { 3844 return 0; 3845 } 3846 #endif 3847 3848 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3849 unsigned long address, unsigned int flags) 3850 { 3851 pte_t *ptep, entry; 3852 spinlock_t *ptl; 3853 int ret; 3854 u32 hash; 3855 pgoff_t idx; 3856 struct page *page = NULL; 3857 struct page *pagecache_page = NULL; 3858 struct hstate *h = hstate_vma(vma); 3859 struct address_space *mapping; 3860 int need_wait_lock = 0; 3861 3862 address &= huge_page_mask(h); 3863 3864 ptep = huge_pte_offset(mm, address); 3865 if (ptep) { 3866 entry = huge_ptep_get(ptep); 3867 if (unlikely(is_hugetlb_entry_migration(entry))) { 3868 migration_entry_wait_huge(vma, mm, ptep); 3869 return 0; 3870 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 3871 return VM_FAULT_HWPOISON_LARGE | 3872 VM_FAULT_SET_HINDEX(hstate_index(h)); 3873 } else { 3874 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 3875 if (!ptep) 3876 return VM_FAULT_OOM; 3877 } 3878 3879 mapping = vma->vm_file->f_mapping; 3880 idx = vma_hugecache_offset(h, vma, address); 3881 3882 /* 3883 * Serialize hugepage allocation and instantiation, so that we don't 3884 * get spurious allocation failures if two CPUs race to instantiate 3885 * the same page in the page cache. 3886 */ 3887 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); 3888 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3889 3890 entry = huge_ptep_get(ptep); 3891 if (huge_pte_none(entry)) { 3892 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 3893 goto out_mutex; 3894 } 3895 3896 ret = 0; 3897 3898 /* 3899 * entry could be a migration/hwpoison entry at this point, so this 3900 * check prevents the kernel from going below assuming that we have 3901 * a active hugepage in pagecache. This goto expects the 2nd page fault, 3902 * and is_hugetlb_entry_(migration|hwpoisoned) check will properly 3903 * handle it. 3904 */ 3905 if (!pte_present(entry)) 3906 goto out_mutex; 3907 3908 /* 3909 * If we are going to COW the mapping later, we examine the pending 3910 * reservations for this page now. This will ensure that any 3911 * allocations necessary to record that reservation occur outside the 3912 * spinlock. For private mappings, we also lookup the pagecache 3913 * page now as it is used to determine if a reservation has been 3914 * consumed. 3915 */ 3916 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 3917 if (vma_needs_reservation(h, vma, address) < 0) { 3918 ret = VM_FAULT_OOM; 3919 goto out_mutex; 3920 } 3921 /* Just decrements count, does not deallocate */ 3922 vma_end_reservation(h, vma, address); 3923 3924 if (!(vma->vm_flags & VM_MAYSHARE)) 3925 pagecache_page = hugetlbfs_pagecache_page(h, 3926 vma, address); 3927 } 3928 3929 ptl = huge_pte_lock(h, mm, ptep); 3930 3931 /* Check for a racing update before calling hugetlb_cow */ 3932 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 3933 goto out_ptl; 3934 3935 /* 3936 * hugetlb_cow() requires page locks of pte_page(entry) and 3937 * pagecache_page, so here we need take the former one 3938 * when page != pagecache_page or !pagecache_page. 3939 */ 3940 page = pte_page(entry); 3941 if (page != pagecache_page) 3942 if (!trylock_page(page)) { 3943 need_wait_lock = 1; 3944 goto out_ptl; 3945 } 3946 3947 get_page(page); 3948 3949 if (flags & FAULT_FLAG_WRITE) { 3950 if (!huge_pte_write(entry)) { 3951 ret = hugetlb_cow(mm, vma, address, ptep, 3952 pagecache_page, ptl); 3953 goto out_put_page; 3954 } 3955 entry = huge_pte_mkdirty(entry); 3956 } 3957 entry = pte_mkyoung(entry); 3958 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 3959 flags & FAULT_FLAG_WRITE)) 3960 update_mmu_cache(vma, address, ptep); 3961 out_put_page: 3962 if (page != pagecache_page) 3963 unlock_page(page); 3964 put_page(page); 3965 out_ptl: 3966 spin_unlock(ptl); 3967 3968 if (pagecache_page) { 3969 unlock_page(pagecache_page); 3970 put_page(pagecache_page); 3971 } 3972 out_mutex: 3973 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 3974 /* 3975 * Generally it's safe to hold refcount during waiting page lock. But 3976 * here we just wait to defer the next page fault to avoid busy loop and 3977 * the page is not used after unlocked before returning from the current 3978 * page fault. So we are safe from accessing freed page, even if we wait 3979 * here without taking refcount. 3980 */ 3981 if (need_wait_lock) 3982 wait_on_page_locked(page); 3983 return ret; 3984 } 3985 3986 /* 3987 * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with 3988 * modifications for huge pages. 3989 */ 3990 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, 3991 pte_t *dst_pte, 3992 struct vm_area_struct *dst_vma, 3993 unsigned long dst_addr, 3994 unsigned long src_addr, 3995 struct page **pagep) 3996 { 3997 int vm_shared = dst_vma->vm_flags & VM_SHARED; 3998 struct hstate *h = hstate_vma(dst_vma); 3999 pte_t _dst_pte; 4000 spinlock_t *ptl; 4001 int ret; 4002 struct page *page; 4003 4004 if (!*pagep) { 4005 ret = -ENOMEM; 4006 page = alloc_huge_page(dst_vma, dst_addr, 0); 4007 if (IS_ERR(page)) 4008 goto out; 4009 4010 ret = copy_huge_page_from_user(page, 4011 (const void __user *) src_addr, 4012 pages_per_huge_page(h), false); 4013 4014 /* fallback to copy_from_user outside mmap_sem */ 4015 if (unlikely(ret)) { 4016 ret = -EFAULT; 4017 *pagep = page; 4018 /* don't free the page */ 4019 goto out; 4020 } 4021 } else { 4022 page = *pagep; 4023 *pagep = NULL; 4024 } 4025 4026 /* 4027 * The memory barrier inside __SetPageUptodate makes sure that 4028 * preceding stores to the page contents become visible before 4029 * the set_pte_at() write. 4030 */ 4031 __SetPageUptodate(page); 4032 set_page_huge_active(page); 4033 4034 /* 4035 * If shared, add to page cache 4036 */ 4037 if (vm_shared) { 4038 struct address_space *mapping = dst_vma->vm_file->f_mapping; 4039 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); 4040 4041 ret = huge_add_to_page_cache(page, mapping, idx); 4042 if (ret) 4043 goto out_release_nounlock; 4044 } 4045 4046 ptl = huge_pte_lockptr(h, dst_mm, dst_pte); 4047 spin_lock(ptl); 4048 4049 ret = -EEXIST; 4050 if (!huge_pte_none(huge_ptep_get(dst_pte))) 4051 goto out_release_unlock; 4052 4053 if (vm_shared) { 4054 page_dup_rmap(page, true); 4055 } else { 4056 ClearPagePrivate(page); 4057 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); 4058 } 4059 4060 _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); 4061 if (dst_vma->vm_flags & VM_WRITE) 4062 _dst_pte = huge_pte_mkdirty(_dst_pte); 4063 _dst_pte = pte_mkyoung(_dst_pte); 4064 4065 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 4066 4067 (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, 4068 dst_vma->vm_flags & VM_WRITE); 4069 hugetlb_count_add(pages_per_huge_page(h), dst_mm); 4070 4071 /* No need to invalidate - it was non-present before */ 4072 update_mmu_cache(dst_vma, dst_addr, dst_pte); 4073 4074 spin_unlock(ptl); 4075 if (vm_shared) 4076 unlock_page(page); 4077 ret = 0; 4078 out: 4079 return ret; 4080 out_release_unlock: 4081 spin_unlock(ptl); 4082 out_release_nounlock: 4083 if (vm_shared) 4084 unlock_page(page); 4085 put_page(page); 4086 goto out; 4087 } 4088 4089 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 4090 struct page **pages, struct vm_area_struct **vmas, 4091 unsigned long *position, unsigned long *nr_pages, 4092 long i, unsigned int flags, int *nonblocking) 4093 { 4094 unsigned long pfn_offset; 4095 unsigned long vaddr = *position; 4096 unsigned long remainder = *nr_pages; 4097 struct hstate *h = hstate_vma(vma); 4098 4099 while (vaddr < vma->vm_end && remainder) { 4100 pte_t *pte; 4101 spinlock_t *ptl = NULL; 4102 int absent; 4103 struct page *page; 4104 4105 /* 4106 * If we have a pending SIGKILL, don't keep faulting pages and 4107 * potentially allocating memory. 4108 */ 4109 if (unlikely(fatal_signal_pending(current))) { 4110 remainder = 0; 4111 break; 4112 } 4113 4114 /* 4115 * Some archs (sparc64, sh*) have multiple pte_ts to 4116 * each hugepage. We have to make sure we get the 4117 * first, for the page indexing below to work. 4118 * 4119 * Note that page table lock is not held when pte is null. 4120 */ 4121 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 4122 if (pte) 4123 ptl = huge_pte_lock(h, mm, pte); 4124 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 4125 4126 /* 4127 * When coredumping, it suits get_dump_page if we just return 4128 * an error where there's an empty slot with no huge pagecache 4129 * to back it. This way, we avoid allocating a hugepage, and 4130 * the sparse dumpfile avoids allocating disk blocks, but its 4131 * huge holes still show up with zeroes where they need to be. 4132 */ 4133 if (absent && (flags & FOLL_DUMP) && 4134 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 4135 if (pte) 4136 spin_unlock(ptl); 4137 remainder = 0; 4138 break; 4139 } 4140 4141 /* 4142 * We need call hugetlb_fault for both hugepages under migration 4143 * (in which case hugetlb_fault waits for the migration,) and 4144 * hwpoisoned hugepages (in which case we need to prevent the 4145 * caller from accessing to them.) In order to do this, we use 4146 * here is_swap_pte instead of is_hugetlb_entry_migration and 4147 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 4148 * both cases, and because we can't follow correct pages 4149 * directly from any kind of swap entries. 4150 */ 4151 if (absent || is_swap_pte(huge_ptep_get(pte)) || 4152 ((flags & FOLL_WRITE) && 4153 !huge_pte_write(huge_ptep_get(pte)))) { 4154 int ret; 4155 unsigned int fault_flags = 0; 4156 4157 if (pte) 4158 spin_unlock(ptl); 4159 if (flags & FOLL_WRITE) 4160 fault_flags |= FAULT_FLAG_WRITE; 4161 if (nonblocking) 4162 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 4163 if (flags & FOLL_NOWAIT) 4164 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 4165 FAULT_FLAG_RETRY_NOWAIT; 4166 if (flags & FOLL_TRIED) { 4167 VM_WARN_ON_ONCE(fault_flags & 4168 FAULT_FLAG_ALLOW_RETRY); 4169 fault_flags |= FAULT_FLAG_TRIED; 4170 } 4171 ret = hugetlb_fault(mm, vma, vaddr, fault_flags); 4172 if (ret & VM_FAULT_ERROR) { 4173 remainder = 0; 4174 break; 4175 } 4176 if (ret & VM_FAULT_RETRY) { 4177 if (nonblocking) 4178 *nonblocking = 0; 4179 *nr_pages = 0; 4180 /* 4181 * VM_FAULT_RETRY must not return an 4182 * error, it will return zero 4183 * instead. 4184 * 4185 * No need to update "position" as the 4186 * caller will not check it after 4187 * *nr_pages is set to 0. 4188 */ 4189 return i; 4190 } 4191 continue; 4192 } 4193 4194 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 4195 page = pte_page(huge_ptep_get(pte)); 4196 same_page: 4197 if (pages) { 4198 pages[i] = mem_map_offset(page, pfn_offset); 4199 get_page(pages[i]); 4200 } 4201 4202 if (vmas) 4203 vmas[i] = vma; 4204 4205 vaddr += PAGE_SIZE; 4206 ++pfn_offset; 4207 --remainder; 4208 ++i; 4209 if (vaddr < vma->vm_end && remainder && 4210 pfn_offset < pages_per_huge_page(h)) { 4211 /* 4212 * We use pfn_offset to avoid touching the pageframes 4213 * of this compound page. 4214 */ 4215 goto same_page; 4216 } 4217 spin_unlock(ptl); 4218 } 4219 *nr_pages = remainder; 4220 /* 4221 * setting position is actually required only if remainder is 4222 * not zero but it's faster not to add a "if (remainder)" 4223 * branch. 4224 */ 4225 *position = vaddr; 4226 4227 return i ? i : -EFAULT; 4228 } 4229 4230 #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE 4231 /* 4232 * ARCHes with special requirements for evicting HUGETLB backing TLB entries can 4233 * implement this. 4234 */ 4235 #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) 4236 #endif 4237 4238 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 4239 unsigned long address, unsigned long end, pgprot_t newprot) 4240 { 4241 struct mm_struct *mm = vma->vm_mm; 4242 unsigned long start = address; 4243 pte_t *ptep; 4244 pte_t pte; 4245 struct hstate *h = hstate_vma(vma); 4246 unsigned long pages = 0; 4247 4248 BUG_ON(address >= end); 4249 flush_cache_range(vma, address, end); 4250 4251 mmu_notifier_invalidate_range_start(mm, start, end); 4252 i_mmap_lock_write(vma->vm_file->f_mapping); 4253 for (; address < end; address += huge_page_size(h)) { 4254 spinlock_t *ptl; 4255 ptep = huge_pte_offset(mm, address); 4256 if (!ptep) 4257 continue; 4258 ptl = huge_pte_lock(h, mm, ptep); 4259 if (huge_pmd_unshare(mm, &address, ptep)) { 4260 pages++; 4261 spin_unlock(ptl); 4262 continue; 4263 } 4264 pte = huge_ptep_get(ptep); 4265 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 4266 spin_unlock(ptl); 4267 continue; 4268 } 4269 if (unlikely(is_hugetlb_entry_migration(pte))) { 4270 swp_entry_t entry = pte_to_swp_entry(pte); 4271 4272 if (is_write_migration_entry(entry)) { 4273 pte_t newpte; 4274 4275 make_migration_entry_read(&entry); 4276 newpte = swp_entry_to_pte(entry); 4277 set_huge_pte_at(mm, address, ptep, newpte); 4278 pages++; 4279 } 4280 spin_unlock(ptl); 4281 continue; 4282 } 4283 if (!huge_pte_none(pte)) { 4284 pte = huge_ptep_get_and_clear(mm, address, ptep); 4285 pte = pte_mkhuge(huge_pte_modify(pte, newprot)); 4286 pte = arch_make_huge_pte(pte, vma, NULL, 0); 4287 set_huge_pte_at(mm, address, ptep, pte); 4288 pages++; 4289 } 4290 spin_unlock(ptl); 4291 } 4292 /* 4293 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 4294 * may have cleared our pud entry and done put_page on the page table: 4295 * once we release i_mmap_rwsem, another task can do the final put_page 4296 * and that page table be reused and filled with junk. 4297 */ 4298 flush_hugetlb_tlb_range(vma, start, end); 4299 mmu_notifier_invalidate_range(mm, start, end); 4300 i_mmap_unlock_write(vma->vm_file->f_mapping); 4301 mmu_notifier_invalidate_range_end(mm, start, end); 4302 4303 return pages << h->order; 4304 } 4305 4306 int hugetlb_reserve_pages(struct inode *inode, 4307 long from, long to, 4308 struct vm_area_struct *vma, 4309 vm_flags_t vm_flags) 4310 { 4311 long ret, chg; 4312 struct hstate *h = hstate_inode(inode); 4313 struct hugepage_subpool *spool = subpool_inode(inode); 4314 struct resv_map *resv_map; 4315 long gbl_reserve; 4316 4317 /* 4318 * Only apply hugepage reservation if asked. At fault time, an 4319 * attempt will be made for VM_NORESERVE to allocate a page 4320 * without using reserves 4321 */ 4322 if (vm_flags & VM_NORESERVE) 4323 return 0; 4324 4325 /* 4326 * Shared mappings base their reservation on the number of pages that 4327 * are already allocated on behalf of the file. Private mappings need 4328 * to reserve the full area even if read-only as mprotect() may be 4329 * called to make the mapping read-write. Assume !vma is a shm mapping 4330 */ 4331 if (!vma || vma->vm_flags & VM_MAYSHARE) { 4332 resv_map = inode_resv_map(inode); 4333 4334 chg = region_chg(resv_map, from, to); 4335 4336 } else { 4337 resv_map = resv_map_alloc(); 4338 if (!resv_map) 4339 return -ENOMEM; 4340 4341 chg = to - from; 4342 4343 set_vma_resv_map(vma, resv_map); 4344 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 4345 } 4346 4347 if (chg < 0) { 4348 ret = chg; 4349 goto out_err; 4350 } 4351 4352 /* 4353 * There must be enough pages in the subpool for the mapping. If 4354 * the subpool has a minimum size, there may be some global 4355 * reservations already in place (gbl_reserve). 4356 */ 4357 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 4358 if (gbl_reserve < 0) { 4359 ret = -ENOSPC; 4360 goto out_err; 4361 } 4362 4363 /* 4364 * Check enough hugepages are available for the reservation. 4365 * Hand the pages back to the subpool if there are not 4366 */ 4367 ret = hugetlb_acct_memory(h, gbl_reserve); 4368 if (ret < 0) { 4369 /* put back original number of pages, chg */ 4370 (void)hugepage_subpool_put_pages(spool, chg); 4371 goto out_err; 4372 } 4373 4374 /* 4375 * Account for the reservations made. Shared mappings record regions 4376 * that have reservations as they are shared by multiple VMAs. 4377 * When the last VMA disappears, the region map says how much 4378 * the reservation was and the page cache tells how much of 4379 * the reservation was consumed. Private mappings are per-VMA and 4380 * only the consumed reservations are tracked. When the VMA 4381 * disappears, the original reservation is the VMA size and the 4382 * consumed reservations are stored in the map. Hence, nothing 4383 * else has to be done for private mappings here 4384 */ 4385 if (!vma || vma->vm_flags & VM_MAYSHARE) { 4386 long add = region_add(resv_map, from, to); 4387 4388 if (unlikely(chg > add)) { 4389 /* 4390 * pages in this range were added to the reserve 4391 * map between region_chg and region_add. This 4392 * indicates a race with alloc_huge_page. Adjust 4393 * the subpool and reserve counts modified above 4394 * based on the difference. 4395 */ 4396 long rsv_adjust; 4397 4398 rsv_adjust = hugepage_subpool_put_pages(spool, 4399 chg - add); 4400 hugetlb_acct_memory(h, -rsv_adjust); 4401 } 4402 } 4403 return 0; 4404 out_err: 4405 if (!vma || vma->vm_flags & VM_MAYSHARE) 4406 region_abort(resv_map, from, to); 4407 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 4408 kref_put(&resv_map->refs, resv_map_release); 4409 return ret; 4410 } 4411 4412 long hugetlb_unreserve_pages(struct inode *inode, long start, long end, 4413 long freed) 4414 { 4415 struct hstate *h = hstate_inode(inode); 4416 struct resv_map *resv_map = inode_resv_map(inode); 4417 long chg = 0; 4418 struct hugepage_subpool *spool = subpool_inode(inode); 4419 long gbl_reserve; 4420 4421 if (resv_map) { 4422 chg = region_del(resv_map, start, end); 4423 /* 4424 * region_del() can fail in the rare case where a region 4425 * must be split and another region descriptor can not be 4426 * allocated. If end == LONG_MAX, it will not fail. 4427 */ 4428 if (chg < 0) 4429 return chg; 4430 } 4431 4432 spin_lock(&inode->i_lock); 4433 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 4434 spin_unlock(&inode->i_lock); 4435 4436 /* 4437 * If the subpool has a minimum size, the number of global 4438 * reservations to be released may be adjusted. 4439 */ 4440 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 4441 hugetlb_acct_memory(h, -gbl_reserve); 4442 4443 return 0; 4444 } 4445 4446 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 4447 static unsigned long page_table_shareable(struct vm_area_struct *svma, 4448 struct vm_area_struct *vma, 4449 unsigned long addr, pgoff_t idx) 4450 { 4451 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 4452 svma->vm_start; 4453 unsigned long sbase = saddr & PUD_MASK; 4454 unsigned long s_end = sbase + PUD_SIZE; 4455 4456 /* Allow segments to share if only one is marked locked */ 4457 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; 4458 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; 4459 4460 /* 4461 * match the virtual addresses, permission and the alignment of the 4462 * page table page. 4463 */ 4464 if (pmd_index(addr) != pmd_index(saddr) || 4465 vm_flags != svm_flags || 4466 sbase < svma->vm_start || svma->vm_end < s_end) 4467 return 0; 4468 4469 return saddr; 4470 } 4471 4472 static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) 4473 { 4474 unsigned long base = addr & PUD_MASK; 4475 unsigned long end = base + PUD_SIZE; 4476 4477 /* 4478 * check on proper vm_flags and page table alignment 4479 */ 4480 if (vma->vm_flags & VM_MAYSHARE && 4481 vma->vm_start <= base && end <= vma->vm_end) 4482 return true; 4483 return false; 4484 } 4485 4486 /* 4487 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 4488 * and returns the corresponding pte. While this is not necessary for the 4489 * !shared pmd case because we can allocate the pmd later as well, it makes the 4490 * code much cleaner. pmd allocation is essential for the shared case because 4491 * pud has to be populated inside the same i_mmap_rwsem section - otherwise 4492 * racing tasks could either miss the sharing (see huge_pte_offset) or select a 4493 * bad pmd for sharing. 4494 */ 4495 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 4496 { 4497 struct vm_area_struct *vma = find_vma(mm, addr); 4498 struct address_space *mapping = vma->vm_file->f_mapping; 4499 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 4500 vma->vm_pgoff; 4501 struct vm_area_struct *svma; 4502 unsigned long saddr; 4503 pte_t *spte = NULL; 4504 pte_t *pte; 4505 spinlock_t *ptl; 4506 4507 if (!vma_shareable(vma, addr)) 4508 return (pte_t *)pmd_alloc(mm, pud, addr); 4509 4510 i_mmap_lock_write(mapping); 4511 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 4512 if (svma == vma) 4513 continue; 4514 4515 saddr = page_table_shareable(svma, vma, addr, idx); 4516 if (saddr) { 4517 spte = huge_pte_offset(svma->vm_mm, saddr); 4518 if (spte) { 4519 get_page(virt_to_page(spte)); 4520 break; 4521 } 4522 } 4523 } 4524 4525 if (!spte) 4526 goto out; 4527 4528 ptl = huge_pte_lock(hstate_vma(vma), mm, spte); 4529 if (pud_none(*pud)) { 4530 pud_populate(mm, pud, 4531 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 4532 mm_inc_nr_pmds(mm); 4533 } else { 4534 put_page(virt_to_page(spte)); 4535 } 4536 spin_unlock(ptl); 4537 out: 4538 pte = (pte_t *)pmd_alloc(mm, pud, addr); 4539 i_mmap_unlock_write(mapping); 4540 return pte; 4541 } 4542 4543 /* 4544 * unmap huge page backed by shared pte. 4545 * 4546 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 4547 * indicated by page_count > 1, unmap is achieved by clearing pud and 4548 * decrementing the ref count. If count == 1, the pte page is not shared. 4549 * 4550 * called with page table lock held. 4551 * 4552 * returns: 1 successfully unmapped a shared pte page 4553 * 0 the underlying pte page is not shared, or it is the last user 4554 */ 4555 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 4556 { 4557 pgd_t *pgd = pgd_offset(mm, *addr); 4558 pud_t *pud = pud_offset(pgd, *addr); 4559 4560 BUG_ON(page_count(virt_to_page(ptep)) == 0); 4561 if (page_count(virt_to_page(ptep)) == 1) 4562 return 0; 4563 4564 pud_clear(pud); 4565 put_page(virt_to_page(ptep)); 4566 mm_dec_nr_pmds(mm); 4567 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 4568 return 1; 4569 } 4570 #define want_pmd_share() (1) 4571 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 4572 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 4573 { 4574 return NULL; 4575 } 4576 4577 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 4578 { 4579 return 0; 4580 } 4581 #define want_pmd_share() (0) 4582 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 4583 4584 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 4585 pte_t *huge_pte_alloc(struct mm_struct *mm, 4586 unsigned long addr, unsigned long sz) 4587 { 4588 pgd_t *pgd; 4589 pud_t *pud; 4590 pte_t *pte = NULL; 4591 4592 pgd = pgd_offset(mm, addr); 4593 pud = pud_alloc(mm, pgd, addr); 4594 if (pud) { 4595 if (sz == PUD_SIZE) { 4596 pte = (pte_t *)pud; 4597 } else { 4598 BUG_ON(sz != PMD_SIZE); 4599 if (want_pmd_share() && pud_none(*pud)) 4600 pte = huge_pmd_share(mm, addr, pud); 4601 else 4602 pte = (pte_t *)pmd_alloc(mm, pud, addr); 4603 } 4604 } 4605 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); 4606 4607 return pte; 4608 } 4609 4610 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 4611 { 4612 pgd_t *pgd; 4613 pud_t *pud; 4614 pmd_t *pmd = NULL; 4615 4616 pgd = pgd_offset(mm, addr); 4617 if (pgd_present(*pgd)) { 4618 pud = pud_offset(pgd, addr); 4619 if (pud_present(*pud)) { 4620 if (pud_huge(*pud)) 4621 return (pte_t *)pud; 4622 pmd = pmd_offset(pud, addr); 4623 } 4624 } 4625 return (pte_t *) pmd; 4626 } 4627 4628 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 4629 4630 /* 4631 * These functions are overwritable if your architecture needs its own 4632 * behavior. 4633 */ 4634 struct page * __weak 4635 follow_huge_addr(struct mm_struct *mm, unsigned long address, 4636 int write) 4637 { 4638 return ERR_PTR(-EINVAL); 4639 } 4640 4641 struct page * __weak 4642 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 4643 pmd_t *pmd, int flags) 4644 { 4645 struct page *page = NULL; 4646 spinlock_t *ptl; 4647 retry: 4648 ptl = pmd_lockptr(mm, pmd); 4649 spin_lock(ptl); 4650 /* 4651 * make sure that the address range covered by this pmd is not 4652 * unmapped from other threads. 4653 */ 4654 if (!pmd_huge(*pmd)) 4655 goto out; 4656 if (pmd_present(*pmd)) { 4657 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 4658 if (flags & FOLL_GET) 4659 get_page(page); 4660 } else { 4661 if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { 4662 spin_unlock(ptl); 4663 __migration_entry_wait(mm, (pte_t *)pmd, ptl); 4664 goto retry; 4665 } 4666 /* 4667 * hwpoisoned entry is treated as no_page_table in 4668 * follow_page_mask(). 4669 */ 4670 } 4671 out: 4672 spin_unlock(ptl); 4673 return page; 4674 } 4675 4676 struct page * __weak 4677 follow_huge_pud(struct mm_struct *mm, unsigned long address, 4678 pud_t *pud, int flags) 4679 { 4680 if (flags & FOLL_GET) 4681 return NULL; 4682 4683 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); 4684 } 4685 4686 #ifdef CONFIG_MEMORY_FAILURE 4687 4688 /* 4689 * This function is called from memory failure code. 4690 */ 4691 int dequeue_hwpoisoned_huge_page(struct page *hpage) 4692 { 4693 struct hstate *h = page_hstate(hpage); 4694 int nid = page_to_nid(hpage); 4695 int ret = -EBUSY; 4696 4697 spin_lock(&hugetlb_lock); 4698 /* 4699 * Just checking !page_huge_active is not enough, because that could be 4700 * an isolated/hwpoisoned hugepage (which have >0 refcount). 4701 */ 4702 if (!page_huge_active(hpage) && !page_count(hpage)) { 4703 /* 4704 * Hwpoisoned hugepage isn't linked to activelist or freelist, 4705 * but dangling hpage->lru can trigger list-debug warnings 4706 * (this happens when we call unpoison_memory() on it), 4707 * so let it point to itself with list_del_init(). 4708 */ 4709 list_del_init(&hpage->lru); 4710 set_page_refcounted(hpage); 4711 h->free_huge_pages--; 4712 h->free_huge_pages_node[nid]--; 4713 ret = 0; 4714 } 4715 spin_unlock(&hugetlb_lock); 4716 return ret; 4717 } 4718 #endif 4719 4720 bool isolate_huge_page(struct page *page, struct list_head *list) 4721 { 4722 bool ret = true; 4723 4724 VM_BUG_ON_PAGE(!PageHead(page), page); 4725 spin_lock(&hugetlb_lock); 4726 if (!page_huge_active(page) || !get_page_unless_zero(page)) { 4727 ret = false; 4728 goto unlock; 4729 } 4730 clear_page_huge_active(page); 4731 list_move_tail(&page->lru, list); 4732 unlock: 4733 spin_unlock(&hugetlb_lock); 4734 return ret; 4735 } 4736 4737 void putback_active_hugepage(struct page *page) 4738 { 4739 VM_BUG_ON_PAGE(!PageHead(page), page); 4740 spin_lock(&hugetlb_lock); 4741 set_page_huge_active(page); 4742 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 4743 spin_unlock(&hugetlb_lock); 4744 put_page(page); 4745 } 4746