1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Generic hugetlb support. 4 * (C) Nadia Yvette Chambers, April 2004 5 */ 6 #include <linux/list.h> 7 #include <linux/init.h> 8 #include <linux/mm.h> 9 #include <linux/seq_file.h> 10 #include <linux/sysctl.h> 11 #include <linux/highmem.h> 12 #include <linux/mmu_notifier.h> 13 #include <linux/nodemask.h> 14 #include <linux/pagemap.h> 15 #include <linux/mempolicy.h> 16 #include <linux/compiler.h> 17 #include <linux/cpuset.h> 18 #include <linux/mutex.h> 19 #include <linux/memblock.h> 20 #include <linux/sysfs.h> 21 #include <linux/slab.h> 22 #include <linux/sched/mm.h> 23 #include <linux/mmdebug.h> 24 #include <linux/sched/signal.h> 25 #include <linux/rmap.h> 26 #include <linux/string_helpers.h> 27 #include <linux/swap.h> 28 #include <linux/swapops.h> 29 #include <linux/jhash.h> 30 #include <linux/numa.h> 31 #include <linux/llist.h> 32 #include <linux/cma.h> 33 34 #include <asm/page.h> 35 #include <asm/pgalloc.h> 36 #include <asm/tlb.h> 37 38 #include <linux/io.h> 39 #include <linux/hugetlb.h> 40 #include <linux/hugetlb_cgroup.h> 41 #include <linux/node.h> 42 #include <linux/page_owner.h> 43 #include "internal.h" 44 #include "hugetlb_vmemmap.h" 45 46 int hugetlb_max_hstate __read_mostly; 47 unsigned int default_hstate_idx; 48 struct hstate hstates[HUGE_MAX_HSTATE]; 49 50 #ifdef CONFIG_CMA 51 static struct cma *hugetlb_cma[MAX_NUMNODES]; 52 #endif 53 static unsigned long hugetlb_cma_size __initdata; 54 55 /* 56 * Minimum page order among possible hugepage sizes, set to a proper value 57 * at boot time. 58 */ 59 static unsigned int minimum_order __read_mostly = UINT_MAX; 60 61 __initdata LIST_HEAD(huge_boot_pages); 62 63 /* for command line parsing */ 64 static struct hstate * __initdata parsed_hstate; 65 static unsigned long __initdata default_hstate_max_huge_pages; 66 static bool __initdata parsed_valid_hugepagesz = true; 67 static bool __initdata parsed_default_hugepagesz; 68 69 /* 70 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, 71 * free_huge_pages, and surplus_huge_pages. 72 */ 73 DEFINE_SPINLOCK(hugetlb_lock); 74 75 /* 76 * Serializes faults on the same logical page. This is used to 77 * prevent spurious OOMs when the hugepage pool is fully utilized. 78 */ 79 static int num_fault_mutexes; 80 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; 81 82 /* Forward declaration */ 83 static int hugetlb_acct_memory(struct hstate *h, long delta); 84 85 static inline bool subpool_is_free(struct hugepage_subpool *spool) 86 { 87 if (spool->count) 88 return false; 89 if (spool->max_hpages != -1) 90 return spool->used_hpages == 0; 91 if (spool->min_hpages != -1) 92 return spool->rsv_hpages == spool->min_hpages; 93 94 return true; 95 } 96 97 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, 98 unsigned long irq_flags) 99 { 100 spin_unlock_irqrestore(&spool->lock, irq_flags); 101 102 /* If no pages are used, and no other handles to the subpool 103 * remain, give up any reservations based on minimum size and 104 * free the subpool */ 105 if (subpool_is_free(spool)) { 106 if (spool->min_hpages != -1) 107 hugetlb_acct_memory(spool->hstate, 108 -spool->min_hpages); 109 kfree(spool); 110 } 111 } 112 113 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, 114 long min_hpages) 115 { 116 struct hugepage_subpool *spool; 117 118 spool = kzalloc(sizeof(*spool), GFP_KERNEL); 119 if (!spool) 120 return NULL; 121 122 spin_lock_init(&spool->lock); 123 spool->count = 1; 124 spool->max_hpages = max_hpages; 125 spool->hstate = h; 126 spool->min_hpages = min_hpages; 127 128 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { 129 kfree(spool); 130 return NULL; 131 } 132 spool->rsv_hpages = min_hpages; 133 134 return spool; 135 } 136 137 void hugepage_put_subpool(struct hugepage_subpool *spool) 138 { 139 unsigned long flags; 140 141 spin_lock_irqsave(&spool->lock, flags); 142 BUG_ON(!spool->count); 143 spool->count--; 144 unlock_or_release_subpool(spool, flags); 145 } 146 147 /* 148 * Subpool accounting for allocating and reserving pages. 149 * Return -ENOMEM if there are not enough resources to satisfy the 150 * request. Otherwise, return the number of pages by which the 151 * global pools must be adjusted (upward). The returned value may 152 * only be different than the passed value (delta) in the case where 153 * a subpool minimum size must be maintained. 154 */ 155 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, 156 long delta) 157 { 158 long ret = delta; 159 160 if (!spool) 161 return ret; 162 163 spin_lock_irq(&spool->lock); 164 165 if (spool->max_hpages != -1) { /* maximum size accounting */ 166 if ((spool->used_hpages + delta) <= spool->max_hpages) 167 spool->used_hpages += delta; 168 else { 169 ret = -ENOMEM; 170 goto unlock_ret; 171 } 172 } 173 174 /* minimum size accounting */ 175 if (spool->min_hpages != -1 && spool->rsv_hpages) { 176 if (delta > spool->rsv_hpages) { 177 /* 178 * Asking for more reserves than those already taken on 179 * behalf of subpool. Return difference. 180 */ 181 ret = delta - spool->rsv_hpages; 182 spool->rsv_hpages = 0; 183 } else { 184 ret = 0; /* reserves already accounted for */ 185 spool->rsv_hpages -= delta; 186 } 187 } 188 189 unlock_ret: 190 spin_unlock_irq(&spool->lock); 191 return ret; 192 } 193 194 /* 195 * Subpool accounting for freeing and unreserving pages. 196 * Return the number of global page reservations that must be dropped. 197 * The return value may only be different than the passed value (delta) 198 * in the case where a subpool minimum size must be maintained. 199 */ 200 static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, 201 long delta) 202 { 203 long ret = delta; 204 unsigned long flags; 205 206 if (!spool) 207 return delta; 208 209 spin_lock_irqsave(&spool->lock, flags); 210 211 if (spool->max_hpages != -1) /* maximum size accounting */ 212 spool->used_hpages -= delta; 213 214 /* minimum size accounting */ 215 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { 216 if (spool->rsv_hpages + delta <= spool->min_hpages) 217 ret = 0; 218 else 219 ret = spool->rsv_hpages + delta - spool->min_hpages; 220 221 spool->rsv_hpages += delta; 222 if (spool->rsv_hpages > spool->min_hpages) 223 spool->rsv_hpages = spool->min_hpages; 224 } 225 226 /* 227 * If hugetlbfs_put_super couldn't free spool due to an outstanding 228 * quota reference, free it now. 229 */ 230 unlock_or_release_subpool(spool, flags); 231 232 return ret; 233 } 234 235 static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 236 { 237 return HUGETLBFS_SB(inode->i_sb)->spool; 238 } 239 240 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 241 { 242 return subpool_inode(file_inode(vma->vm_file)); 243 } 244 245 /* Helper that removes a struct file_region from the resv_map cache and returns 246 * it for use. 247 */ 248 static struct file_region * 249 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) 250 { 251 struct file_region *nrg = NULL; 252 253 VM_BUG_ON(resv->region_cache_count <= 0); 254 255 resv->region_cache_count--; 256 nrg = list_first_entry(&resv->region_cache, struct file_region, link); 257 list_del(&nrg->link); 258 259 nrg->from = from; 260 nrg->to = to; 261 262 return nrg; 263 } 264 265 static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, 266 struct file_region *rg) 267 { 268 #ifdef CONFIG_CGROUP_HUGETLB 269 nrg->reservation_counter = rg->reservation_counter; 270 nrg->css = rg->css; 271 if (rg->css) 272 css_get(rg->css); 273 #endif 274 } 275 276 /* Helper that records hugetlb_cgroup uncharge info. */ 277 static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, 278 struct hstate *h, 279 struct resv_map *resv, 280 struct file_region *nrg) 281 { 282 #ifdef CONFIG_CGROUP_HUGETLB 283 if (h_cg) { 284 nrg->reservation_counter = 285 &h_cg->rsvd_hugepage[hstate_index(h)]; 286 nrg->css = &h_cg->css; 287 /* 288 * The caller will hold exactly one h_cg->css reference for the 289 * whole contiguous reservation region. But this area might be 290 * scattered when there are already some file_regions reside in 291 * it. As a result, many file_regions may share only one css 292 * reference. In order to ensure that one file_region must hold 293 * exactly one h_cg->css reference, we should do css_get for 294 * each file_region and leave the reference held by caller 295 * untouched. 296 */ 297 css_get(&h_cg->css); 298 if (!resv->pages_per_hpage) 299 resv->pages_per_hpage = pages_per_huge_page(h); 300 /* pages_per_hpage should be the same for all entries in 301 * a resv_map. 302 */ 303 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); 304 } else { 305 nrg->reservation_counter = NULL; 306 nrg->css = NULL; 307 } 308 #endif 309 } 310 311 static void put_uncharge_info(struct file_region *rg) 312 { 313 #ifdef CONFIG_CGROUP_HUGETLB 314 if (rg->css) 315 css_put(rg->css); 316 #endif 317 } 318 319 static bool has_same_uncharge_info(struct file_region *rg, 320 struct file_region *org) 321 { 322 #ifdef CONFIG_CGROUP_HUGETLB 323 return rg && org && 324 rg->reservation_counter == org->reservation_counter && 325 rg->css == org->css; 326 327 #else 328 return true; 329 #endif 330 } 331 332 static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) 333 { 334 struct file_region *nrg = NULL, *prg = NULL; 335 336 prg = list_prev_entry(rg, link); 337 if (&prg->link != &resv->regions && prg->to == rg->from && 338 has_same_uncharge_info(prg, rg)) { 339 prg->to = rg->to; 340 341 list_del(&rg->link); 342 put_uncharge_info(rg); 343 kfree(rg); 344 345 rg = prg; 346 } 347 348 nrg = list_next_entry(rg, link); 349 if (&nrg->link != &resv->regions && nrg->from == rg->to && 350 has_same_uncharge_info(nrg, rg)) { 351 nrg->from = rg->from; 352 353 list_del(&rg->link); 354 put_uncharge_info(rg); 355 kfree(rg); 356 } 357 } 358 359 static inline long 360 hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, 361 long to, struct hstate *h, struct hugetlb_cgroup *cg, 362 long *regions_needed) 363 { 364 struct file_region *nrg; 365 366 if (!regions_needed) { 367 nrg = get_file_region_entry_from_cache(map, from, to); 368 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); 369 list_add(&nrg->link, rg->link.prev); 370 coalesce_file_region(map, nrg); 371 } else 372 *regions_needed += 1; 373 374 return to - from; 375 } 376 377 /* 378 * Must be called with resv->lock held. 379 * 380 * Calling this with regions_needed != NULL will count the number of pages 381 * to be added but will not modify the linked list. And regions_needed will 382 * indicate the number of file_regions needed in the cache to carry out to add 383 * the regions for this range. 384 */ 385 static long add_reservation_in_range(struct resv_map *resv, long f, long t, 386 struct hugetlb_cgroup *h_cg, 387 struct hstate *h, long *regions_needed) 388 { 389 long add = 0; 390 struct list_head *head = &resv->regions; 391 long last_accounted_offset = f; 392 struct file_region *rg = NULL, *trg = NULL; 393 394 if (regions_needed) 395 *regions_needed = 0; 396 397 /* In this loop, we essentially handle an entry for the range 398 * [last_accounted_offset, rg->from), at every iteration, with some 399 * bounds checking. 400 */ 401 list_for_each_entry_safe(rg, trg, head, link) { 402 /* Skip irrelevant regions that start before our range. */ 403 if (rg->from < f) { 404 /* If this region ends after the last accounted offset, 405 * then we need to update last_accounted_offset. 406 */ 407 if (rg->to > last_accounted_offset) 408 last_accounted_offset = rg->to; 409 continue; 410 } 411 412 /* When we find a region that starts beyond our range, we've 413 * finished. 414 */ 415 if (rg->from >= t) 416 break; 417 418 /* Add an entry for last_accounted_offset -> rg->from, and 419 * update last_accounted_offset. 420 */ 421 if (rg->from > last_accounted_offset) 422 add += hugetlb_resv_map_add(resv, rg, 423 last_accounted_offset, 424 rg->from, h, h_cg, 425 regions_needed); 426 427 last_accounted_offset = rg->to; 428 } 429 430 /* Handle the case where our range extends beyond 431 * last_accounted_offset. 432 */ 433 if (last_accounted_offset < t) 434 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, 435 t, h, h_cg, regions_needed); 436 437 VM_BUG_ON(add < 0); 438 return add; 439 } 440 441 /* Must be called with resv->lock acquired. Will drop lock to allocate entries. 442 */ 443 static int allocate_file_region_entries(struct resv_map *resv, 444 int regions_needed) 445 __must_hold(&resv->lock) 446 { 447 struct list_head allocated_regions; 448 int to_allocate = 0, i = 0; 449 struct file_region *trg = NULL, *rg = NULL; 450 451 VM_BUG_ON(regions_needed < 0); 452 453 INIT_LIST_HEAD(&allocated_regions); 454 455 /* 456 * Check for sufficient descriptors in the cache to accommodate 457 * the number of in progress add operations plus regions_needed. 458 * 459 * This is a while loop because when we drop the lock, some other call 460 * to region_add or region_del may have consumed some region_entries, 461 * so we keep looping here until we finally have enough entries for 462 * (adds_in_progress + regions_needed). 463 */ 464 while (resv->region_cache_count < 465 (resv->adds_in_progress + regions_needed)) { 466 to_allocate = resv->adds_in_progress + regions_needed - 467 resv->region_cache_count; 468 469 /* At this point, we should have enough entries in the cache 470 * for all the existing adds_in_progress. We should only be 471 * needing to allocate for regions_needed. 472 */ 473 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); 474 475 spin_unlock(&resv->lock); 476 for (i = 0; i < to_allocate; i++) { 477 trg = kmalloc(sizeof(*trg), GFP_KERNEL); 478 if (!trg) 479 goto out_of_memory; 480 list_add(&trg->link, &allocated_regions); 481 } 482 483 spin_lock(&resv->lock); 484 485 list_splice(&allocated_regions, &resv->region_cache); 486 resv->region_cache_count += to_allocate; 487 } 488 489 return 0; 490 491 out_of_memory: 492 list_for_each_entry_safe(rg, trg, &allocated_regions, link) { 493 list_del(&rg->link); 494 kfree(rg); 495 } 496 return -ENOMEM; 497 } 498 499 /* 500 * Add the huge page range represented by [f, t) to the reserve 501 * map. Regions will be taken from the cache to fill in this range. 502 * Sufficient regions should exist in the cache due to the previous 503 * call to region_chg with the same range, but in some cases the cache will not 504 * have sufficient entries due to races with other code doing region_add or 505 * region_del. The extra needed entries will be allocated. 506 * 507 * regions_needed is the out value provided by a previous call to region_chg. 508 * 509 * Return the number of new huge pages added to the map. This number is greater 510 * than or equal to zero. If file_region entries needed to be allocated for 511 * this operation and we were not able to allocate, it returns -ENOMEM. 512 * region_add of regions of length 1 never allocate file_regions and cannot 513 * fail; region_chg will always allocate at least 1 entry and a region_add for 514 * 1 page will only require at most 1 entry. 515 */ 516 static long region_add(struct resv_map *resv, long f, long t, 517 long in_regions_needed, struct hstate *h, 518 struct hugetlb_cgroup *h_cg) 519 { 520 long add = 0, actual_regions_needed = 0; 521 522 spin_lock(&resv->lock); 523 retry: 524 525 /* Count how many regions are actually needed to execute this add. */ 526 add_reservation_in_range(resv, f, t, NULL, NULL, 527 &actual_regions_needed); 528 529 /* 530 * Check for sufficient descriptors in the cache to accommodate 531 * this add operation. Note that actual_regions_needed may be greater 532 * than in_regions_needed, as the resv_map may have been modified since 533 * the region_chg call. In this case, we need to make sure that we 534 * allocate extra entries, such that we have enough for all the 535 * existing adds_in_progress, plus the excess needed for this 536 * operation. 537 */ 538 if (actual_regions_needed > in_regions_needed && 539 resv->region_cache_count < 540 resv->adds_in_progress + 541 (actual_regions_needed - in_regions_needed)) { 542 /* region_add operation of range 1 should never need to 543 * allocate file_region entries. 544 */ 545 VM_BUG_ON(t - f <= 1); 546 547 if (allocate_file_region_entries( 548 resv, actual_regions_needed - in_regions_needed)) { 549 return -ENOMEM; 550 } 551 552 goto retry; 553 } 554 555 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); 556 557 resv->adds_in_progress -= in_regions_needed; 558 559 spin_unlock(&resv->lock); 560 return add; 561 } 562 563 /* 564 * Examine the existing reserve map and determine how many 565 * huge pages in the specified range [f, t) are NOT currently 566 * represented. This routine is called before a subsequent 567 * call to region_add that will actually modify the reserve 568 * map to add the specified range [f, t). region_chg does 569 * not change the number of huge pages represented by the 570 * map. A number of new file_region structures is added to the cache as a 571 * placeholder, for the subsequent region_add call to use. At least 1 572 * file_region structure is added. 573 * 574 * out_regions_needed is the number of regions added to the 575 * resv->adds_in_progress. This value needs to be provided to a follow up call 576 * to region_add or region_abort for proper accounting. 577 * 578 * Returns the number of huge pages that need to be added to the existing 579 * reservation map for the range [f, t). This number is greater or equal to 580 * zero. -ENOMEM is returned if a new file_region structure or cache entry 581 * is needed and can not be allocated. 582 */ 583 static long region_chg(struct resv_map *resv, long f, long t, 584 long *out_regions_needed) 585 { 586 long chg = 0; 587 588 spin_lock(&resv->lock); 589 590 /* Count how many hugepages in this range are NOT represented. */ 591 chg = add_reservation_in_range(resv, f, t, NULL, NULL, 592 out_regions_needed); 593 594 if (*out_regions_needed == 0) 595 *out_regions_needed = 1; 596 597 if (allocate_file_region_entries(resv, *out_regions_needed)) 598 return -ENOMEM; 599 600 resv->adds_in_progress += *out_regions_needed; 601 602 spin_unlock(&resv->lock); 603 return chg; 604 } 605 606 /* 607 * Abort the in progress add operation. The adds_in_progress field 608 * of the resv_map keeps track of the operations in progress between 609 * calls to region_chg and region_add. Operations are sometimes 610 * aborted after the call to region_chg. In such cases, region_abort 611 * is called to decrement the adds_in_progress counter. regions_needed 612 * is the value returned by the region_chg call, it is used to decrement 613 * the adds_in_progress counter. 614 * 615 * NOTE: The range arguments [f, t) are not needed or used in this 616 * routine. They are kept to make reading the calling code easier as 617 * arguments will match the associated region_chg call. 618 */ 619 static void region_abort(struct resv_map *resv, long f, long t, 620 long regions_needed) 621 { 622 spin_lock(&resv->lock); 623 VM_BUG_ON(!resv->region_cache_count); 624 resv->adds_in_progress -= regions_needed; 625 spin_unlock(&resv->lock); 626 } 627 628 /* 629 * Delete the specified range [f, t) from the reserve map. If the 630 * t parameter is LONG_MAX, this indicates that ALL regions after f 631 * should be deleted. Locate the regions which intersect [f, t) 632 * and either trim, delete or split the existing regions. 633 * 634 * Returns the number of huge pages deleted from the reserve map. 635 * In the normal case, the return value is zero or more. In the 636 * case where a region must be split, a new region descriptor must 637 * be allocated. If the allocation fails, -ENOMEM will be returned. 638 * NOTE: If the parameter t == LONG_MAX, then we will never split 639 * a region and possibly return -ENOMEM. Callers specifying 640 * t == LONG_MAX do not need to check for -ENOMEM error. 641 */ 642 static long region_del(struct resv_map *resv, long f, long t) 643 { 644 struct list_head *head = &resv->regions; 645 struct file_region *rg, *trg; 646 struct file_region *nrg = NULL; 647 long del = 0; 648 649 retry: 650 spin_lock(&resv->lock); 651 list_for_each_entry_safe(rg, trg, head, link) { 652 /* 653 * Skip regions before the range to be deleted. file_region 654 * ranges are normally of the form [from, to). However, there 655 * may be a "placeholder" entry in the map which is of the form 656 * (from, to) with from == to. Check for placeholder entries 657 * at the beginning of the range to be deleted. 658 */ 659 if (rg->to <= f && (rg->to != rg->from || rg->to != f)) 660 continue; 661 662 if (rg->from >= t) 663 break; 664 665 if (f > rg->from && t < rg->to) { /* Must split region */ 666 /* 667 * Check for an entry in the cache before dropping 668 * lock and attempting allocation. 669 */ 670 if (!nrg && 671 resv->region_cache_count > resv->adds_in_progress) { 672 nrg = list_first_entry(&resv->region_cache, 673 struct file_region, 674 link); 675 list_del(&nrg->link); 676 resv->region_cache_count--; 677 } 678 679 if (!nrg) { 680 spin_unlock(&resv->lock); 681 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 682 if (!nrg) 683 return -ENOMEM; 684 goto retry; 685 } 686 687 del += t - f; 688 hugetlb_cgroup_uncharge_file_region( 689 resv, rg, t - f, false); 690 691 /* New entry for end of split region */ 692 nrg->from = t; 693 nrg->to = rg->to; 694 695 copy_hugetlb_cgroup_uncharge_info(nrg, rg); 696 697 INIT_LIST_HEAD(&nrg->link); 698 699 /* Original entry is trimmed */ 700 rg->to = f; 701 702 list_add(&nrg->link, &rg->link); 703 nrg = NULL; 704 break; 705 } 706 707 if (f <= rg->from && t >= rg->to) { /* Remove entire region */ 708 del += rg->to - rg->from; 709 hugetlb_cgroup_uncharge_file_region(resv, rg, 710 rg->to - rg->from, true); 711 list_del(&rg->link); 712 kfree(rg); 713 continue; 714 } 715 716 if (f <= rg->from) { /* Trim beginning of region */ 717 hugetlb_cgroup_uncharge_file_region(resv, rg, 718 t - rg->from, false); 719 720 del += t - rg->from; 721 rg->from = t; 722 } else { /* Trim end of region */ 723 hugetlb_cgroup_uncharge_file_region(resv, rg, 724 rg->to - f, false); 725 726 del += rg->to - f; 727 rg->to = f; 728 } 729 } 730 731 spin_unlock(&resv->lock); 732 kfree(nrg); 733 return del; 734 } 735 736 /* 737 * A rare out of memory error was encountered which prevented removal of 738 * the reserve map region for a page. The huge page itself was free'ed 739 * and removed from the page cache. This routine will adjust the subpool 740 * usage count, and the global reserve count if needed. By incrementing 741 * these counts, the reserve map entry which could not be deleted will 742 * appear as a "reserved" entry instead of simply dangling with incorrect 743 * counts. 744 */ 745 void hugetlb_fix_reserve_counts(struct inode *inode) 746 { 747 struct hugepage_subpool *spool = subpool_inode(inode); 748 long rsv_adjust; 749 bool reserved = false; 750 751 rsv_adjust = hugepage_subpool_get_pages(spool, 1); 752 if (rsv_adjust > 0) { 753 struct hstate *h = hstate_inode(inode); 754 755 if (!hugetlb_acct_memory(h, 1)) 756 reserved = true; 757 } else if (!rsv_adjust) { 758 reserved = true; 759 } 760 761 if (!reserved) 762 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); 763 } 764 765 /* 766 * Count and return the number of huge pages in the reserve map 767 * that intersect with the range [f, t). 768 */ 769 static long region_count(struct resv_map *resv, long f, long t) 770 { 771 struct list_head *head = &resv->regions; 772 struct file_region *rg; 773 long chg = 0; 774 775 spin_lock(&resv->lock); 776 /* Locate each segment we overlap with, and count that overlap. */ 777 list_for_each_entry(rg, head, link) { 778 long seg_from; 779 long seg_to; 780 781 if (rg->to <= f) 782 continue; 783 if (rg->from >= t) 784 break; 785 786 seg_from = max(rg->from, f); 787 seg_to = min(rg->to, t); 788 789 chg += seg_to - seg_from; 790 } 791 spin_unlock(&resv->lock); 792 793 return chg; 794 } 795 796 /* 797 * Convert the address within this vma to the page offset within 798 * the mapping, in pagecache page units; huge pages here. 799 */ 800 static pgoff_t vma_hugecache_offset(struct hstate *h, 801 struct vm_area_struct *vma, unsigned long address) 802 { 803 return ((address - vma->vm_start) >> huge_page_shift(h)) + 804 (vma->vm_pgoff >> huge_page_order(h)); 805 } 806 807 pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 808 unsigned long address) 809 { 810 return vma_hugecache_offset(hstate_vma(vma), vma, address); 811 } 812 EXPORT_SYMBOL_GPL(linear_hugepage_index); 813 814 /* 815 * Return the size of the pages allocated when backing a VMA. In the majority 816 * cases this will be same size as used by the page table entries. 817 */ 818 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 819 { 820 if (vma->vm_ops && vma->vm_ops->pagesize) 821 return vma->vm_ops->pagesize(vma); 822 return PAGE_SIZE; 823 } 824 EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 825 826 /* 827 * Return the page size being used by the MMU to back a VMA. In the majority 828 * of cases, the page size used by the kernel matches the MMU size. On 829 * architectures where it differs, an architecture-specific 'strong' 830 * version of this symbol is required. 831 */ 832 __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 833 { 834 return vma_kernel_pagesize(vma); 835 } 836 837 /* 838 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 839 * bits of the reservation map pointer, which are always clear due to 840 * alignment. 841 */ 842 #define HPAGE_RESV_OWNER (1UL << 0) 843 #define HPAGE_RESV_UNMAPPED (1UL << 1) 844 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) 845 846 /* 847 * These helpers are used to track how many pages are reserved for 848 * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 849 * is guaranteed to have their future faults succeed. 850 * 851 * With the exception of reset_vma_resv_huge_pages() which is called at fork(), 852 * the reserve counters are updated with the hugetlb_lock held. It is safe 853 * to reset the VMA at fork() time as it is not in use yet and there is no 854 * chance of the global counters getting corrupted as a result of the values. 855 * 856 * The private mapping reservation is represented in a subtly different 857 * manner to a shared mapping. A shared mapping has a region map associated 858 * with the underlying file, this region map represents the backing file 859 * pages which have ever had a reservation assigned which this persists even 860 * after the page is instantiated. A private mapping has a region map 861 * associated with the original mmap which is attached to all VMAs which 862 * reference it, this region map represents those offsets which have consumed 863 * reservation ie. where pages have been instantiated. 864 */ 865 static unsigned long get_vma_private_data(struct vm_area_struct *vma) 866 { 867 return (unsigned long)vma->vm_private_data; 868 } 869 870 static void set_vma_private_data(struct vm_area_struct *vma, 871 unsigned long value) 872 { 873 vma->vm_private_data = (void *)value; 874 } 875 876 static void 877 resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, 878 struct hugetlb_cgroup *h_cg, 879 struct hstate *h) 880 { 881 #ifdef CONFIG_CGROUP_HUGETLB 882 if (!h_cg || !h) { 883 resv_map->reservation_counter = NULL; 884 resv_map->pages_per_hpage = 0; 885 resv_map->css = NULL; 886 } else { 887 resv_map->reservation_counter = 888 &h_cg->rsvd_hugepage[hstate_index(h)]; 889 resv_map->pages_per_hpage = pages_per_huge_page(h); 890 resv_map->css = &h_cg->css; 891 } 892 #endif 893 } 894 895 struct resv_map *resv_map_alloc(void) 896 { 897 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 898 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); 899 900 if (!resv_map || !rg) { 901 kfree(resv_map); 902 kfree(rg); 903 return NULL; 904 } 905 906 kref_init(&resv_map->refs); 907 spin_lock_init(&resv_map->lock); 908 INIT_LIST_HEAD(&resv_map->regions); 909 910 resv_map->adds_in_progress = 0; 911 /* 912 * Initialize these to 0. On shared mappings, 0's here indicate these 913 * fields don't do cgroup accounting. On private mappings, these will be 914 * re-initialized to the proper values, to indicate that hugetlb cgroup 915 * reservations are to be un-charged from here. 916 */ 917 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); 918 919 INIT_LIST_HEAD(&resv_map->region_cache); 920 list_add(&rg->link, &resv_map->region_cache); 921 resv_map->region_cache_count = 1; 922 923 return resv_map; 924 } 925 926 void resv_map_release(struct kref *ref) 927 { 928 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 929 struct list_head *head = &resv_map->region_cache; 930 struct file_region *rg, *trg; 931 932 /* Clear out any active regions before we release the map. */ 933 region_del(resv_map, 0, LONG_MAX); 934 935 /* ... and any entries left in the cache */ 936 list_for_each_entry_safe(rg, trg, head, link) { 937 list_del(&rg->link); 938 kfree(rg); 939 } 940 941 VM_BUG_ON(resv_map->adds_in_progress); 942 943 kfree(resv_map); 944 } 945 946 static inline struct resv_map *inode_resv_map(struct inode *inode) 947 { 948 /* 949 * At inode evict time, i_mapping may not point to the original 950 * address space within the inode. This original address space 951 * contains the pointer to the resv_map. So, always use the 952 * address space embedded within the inode. 953 * The VERY common case is inode->mapping == &inode->i_data but, 954 * this may not be true for device special inodes. 955 */ 956 return (struct resv_map *)(&inode->i_data)->private_data; 957 } 958 959 static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 960 { 961 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 962 if (vma->vm_flags & VM_MAYSHARE) { 963 struct address_space *mapping = vma->vm_file->f_mapping; 964 struct inode *inode = mapping->host; 965 966 return inode_resv_map(inode); 967 968 } else { 969 return (struct resv_map *)(get_vma_private_data(vma) & 970 ~HPAGE_RESV_MASK); 971 } 972 } 973 974 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 975 { 976 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 977 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 978 979 set_vma_private_data(vma, (get_vma_private_data(vma) & 980 HPAGE_RESV_MASK) | (unsigned long)map); 981 } 982 983 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 984 { 985 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 986 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 987 988 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 989 } 990 991 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 992 { 993 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 994 995 return (get_vma_private_data(vma) & flag) != 0; 996 } 997 998 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 999 void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 1000 { 1001 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1002 if (!(vma->vm_flags & VM_MAYSHARE)) 1003 vma->vm_private_data = (void *)0; 1004 } 1005 1006 /* Returns true if the VMA has associated reserve pages */ 1007 static bool vma_has_reserves(struct vm_area_struct *vma, long chg) 1008 { 1009 if (vma->vm_flags & VM_NORESERVE) { 1010 /* 1011 * This address is already reserved by other process(chg == 0), 1012 * so, we should decrement reserved count. Without decrementing, 1013 * reserve count remains after releasing inode, because this 1014 * allocated page will go into page cache and is regarded as 1015 * coming from reserved pool in releasing step. Currently, we 1016 * don't have any other solution to deal with this situation 1017 * properly, so add work-around here. 1018 */ 1019 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 1020 return true; 1021 else 1022 return false; 1023 } 1024 1025 /* Shared mappings always use reserves */ 1026 if (vma->vm_flags & VM_MAYSHARE) { 1027 /* 1028 * We know VM_NORESERVE is not set. Therefore, there SHOULD 1029 * be a region map for all pages. The only situation where 1030 * there is no region map is if a hole was punched via 1031 * fallocate. In this case, there really are no reserves to 1032 * use. This situation is indicated if chg != 0. 1033 */ 1034 if (chg) 1035 return false; 1036 else 1037 return true; 1038 } 1039 1040 /* 1041 * Only the process that called mmap() has reserves for 1042 * private mappings. 1043 */ 1044 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1045 /* 1046 * Like the shared case above, a hole punch or truncate 1047 * could have been performed on the private mapping. 1048 * Examine the value of chg to determine if reserves 1049 * actually exist or were previously consumed. 1050 * Very Subtle - The value of chg comes from a previous 1051 * call to vma_needs_reserves(). The reserve map for 1052 * private mappings has different (opposite) semantics 1053 * than that of shared mappings. vma_needs_reserves() 1054 * has already taken this difference in semantics into 1055 * account. Therefore, the meaning of chg is the same 1056 * as in the shared case above. Code could easily be 1057 * combined, but keeping it separate draws attention to 1058 * subtle differences. 1059 */ 1060 if (chg) 1061 return false; 1062 else 1063 return true; 1064 } 1065 1066 return false; 1067 } 1068 1069 static void enqueue_huge_page(struct hstate *h, struct page *page) 1070 { 1071 int nid = page_to_nid(page); 1072 1073 lockdep_assert_held(&hugetlb_lock); 1074 list_move(&page->lru, &h->hugepage_freelists[nid]); 1075 h->free_huge_pages++; 1076 h->free_huge_pages_node[nid]++; 1077 SetHPageFreed(page); 1078 } 1079 1080 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) 1081 { 1082 struct page *page; 1083 bool pin = !!(current->flags & PF_MEMALLOC_PIN); 1084 1085 lockdep_assert_held(&hugetlb_lock); 1086 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { 1087 if (pin && !is_pinnable_page(page)) 1088 continue; 1089 1090 if (PageHWPoison(page)) 1091 continue; 1092 1093 list_move(&page->lru, &h->hugepage_activelist); 1094 set_page_refcounted(page); 1095 ClearHPageFreed(page); 1096 h->free_huge_pages--; 1097 h->free_huge_pages_node[nid]--; 1098 return page; 1099 } 1100 1101 return NULL; 1102 } 1103 1104 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, 1105 nodemask_t *nmask) 1106 { 1107 unsigned int cpuset_mems_cookie; 1108 struct zonelist *zonelist; 1109 struct zone *zone; 1110 struct zoneref *z; 1111 int node = NUMA_NO_NODE; 1112 1113 zonelist = node_zonelist(nid, gfp_mask); 1114 1115 retry_cpuset: 1116 cpuset_mems_cookie = read_mems_allowed_begin(); 1117 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { 1118 struct page *page; 1119 1120 if (!cpuset_zone_allowed(zone, gfp_mask)) 1121 continue; 1122 /* 1123 * no need to ask again on the same node. Pool is node rather than 1124 * zone aware 1125 */ 1126 if (zone_to_nid(zone) == node) 1127 continue; 1128 node = zone_to_nid(zone); 1129 1130 page = dequeue_huge_page_node_exact(h, node); 1131 if (page) 1132 return page; 1133 } 1134 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) 1135 goto retry_cpuset; 1136 1137 return NULL; 1138 } 1139 1140 static struct page *dequeue_huge_page_vma(struct hstate *h, 1141 struct vm_area_struct *vma, 1142 unsigned long address, int avoid_reserve, 1143 long chg) 1144 { 1145 struct page *page; 1146 struct mempolicy *mpol; 1147 gfp_t gfp_mask; 1148 nodemask_t *nodemask; 1149 int nid; 1150 1151 /* 1152 * A child process with MAP_PRIVATE mappings created by their parent 1153 * have no page reserves. This check ensures that reservations are 1154 * not "stolen". The child may still get SIGKILLed 1155 */ 1156 if (!vma_has_reserves(vma, chg) && 1157 h->free_huge_pages - h->resv_huge_pages == 0) 1158 goto err; 1159 1160 /* If reserves cannot be used, ensure enough pages are in the pool */ 1161 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 1162 goto err; 1163 1164 gfp_mask = htlb_alloc_mask(h); 1165 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 1166 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); 1167 if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { 1168 SetHPageRestoreReserve(page); 1169 h->resv_huge_pages--; 1170 } 1171 1172 mpol_cond_put(mpol); 1173 return page; 1174 1175 err: 1176 return NULL; 1177 } 1178 1179 /* 1180 * common helper functions for hstate_next_node_to_{alloc|free}. 1181 * We may have allocated or freed a huge page based on a different 1182 * nodes_allowed previously, so h->next_node_to_{alloc|free} might 1183 * be outside of *nodes_allowed. Ensure that we use an allowed 1184 * node for alloc or free. 1185 */ 1186 static int next_node_allowed(int nid, nodemask_t *nodes_allowed) 1187 { 1188 nid = next_node_in(nid, *nodes_allowed); 1189 VM_BUG_ON(nid >= MAX_NUMNODES); 1190 1191 return nid; 1192 } 1193 1194 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 1195 { 1196 if (!node_isset(nid, *nodes_allowed)) 1197 nid = next_node_allowed(nid, nodes_allowed); 1198 return nid; 1199 } 1200 1201 /* 1202 * returns the previously saved node ["this node"] from which to 1203 * allocate a persistent huge page for the pool and advance the 1204 * next node from which to allocate, handling wrap at end of node 1205 * mask. 1206 */ 1207 static int hstate_next_node_to_alloc(struct hstate *h, 1208 nodemask_t *nodes_allowed) 1209 { 1210 int nid; 1211 1212 VM_BUG_ON(!nodes_allowed); 1213 1214 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 1215 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 1216 1217 return nid; 1218 } 1219 1220 /* 1221 * helper for remove_pool_huge_page() - return the previously saved 1222 * node ["this node"] from which to free a huge page. Advance the 1223 * next node id whether or not we find a free huge page to free so 1224 * that the next attempt to free addresses the next node. 1225 */ 1226 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 1227 { 1228 int nid; 1229 1230 VM_BUG_ON(!nodes_allowed); 1231 1232 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 1233 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 1234 1235 return nid; 1236 } 1237 1238 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 1239 for (nr_nodes = nodes_weight(*mask); \ 1240 nr_nodes > 0 && \ 1241 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 1242 nr_nodes--) 1243 1244 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 1245 for (nr_nodes = nodes_weight(*mask); \ 1246 nr_nodes > 0 && \ 1247 ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 1248 nr_nodes--) 1249 1250 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 1251 static void destroy_compound_gigantic_page(struct page *page, 1252 unsigned int order) 1253 { 1254 int i; 1255 int nr_pages = 1 << order; 1256 struct page *p = page + 1; 1257 1258 atomic_set(compound_mapcount_ptr(page), 0); 1259 atomic_set(compound_pincount_ptr(page), 0); 1260 1261 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1262 clear_compound_head(p); 1263 set_page_refcounted(p); 1264 } 1265 1266 set_compound_order(page, 0); 1267 page[1].compound_nr = 0; 1268 __ClearPageHead(page); 1269 } 1270 1271 static void free_gigantic_page(struct page *page, unsigned int order) 1272 { 1273 /* 1274 * If the page isn't allocated using the cma allocator, 1275 * cma_release() returns false. 1276 */ 1277 #ifdef CONFIG_CMA 1278 if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) 1279 return; 1280 #endif 1281 1282 free_contig_range(page_to_pfn(page), 1 << order); 1283 } 1284 1285 #ifdef CONFIG_CONTIG_ALLOC 1286 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1287 int nid, nodemask_t *nodemask) 1288 { 1289 unsigned long nr_pages = pages_per_huge_page(h); 1290 if (nid == NUMA_NO_NODE) 1291 nid = numa_mem_id(); 1292 1293 #ifdef CONFIG_CMA 1294 { 1295 struct page *page; 1296 int node; 1297 1298 if (hugetlb_cma[nid]) { 1299 page = cma_alloc(hugetlb_cma[nid], nr_pages, 1300 huge_page_order(h), true); 1301 if (page) 1302 return page; 1303 } 1304 1305 if (!(gfp_mask & __GFP_THISNODE)) { 1306 for_each_node_mask(node, *nodemask) { 1307 if (node == nid || !hugetlb_cma[node]) 1308 continue; 1309 1310 page = cma_alloc(hugetlb_cma[node], nr_pages, 1311 huge_page_order(h), true); 1312 if (page) 1313 return page; 1314 } 1315 } 1316 } 1317 #endif 1318 1319 return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); 1320 } 1321 1322 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); 1323 static void prep_compound_gigantic_page(struct page *page, unsigned int order); 1324 #else /* !CONFIG_CONTIG_ALLOC */ 1325 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1326 int nid, nodemask_t *nodemask) 1327 { 1328 return NULL; 1329 } 1330 #endif /* CONFIG_CONTIG_ALLOC */ 1331 1332 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ 1333 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1334 int nid, nodemask_t *nodemask) 1335 { 1336 return NULL; 1337 } 1338 static inline void free_gigantic_page(struct page *page, unsigned int order) { } 1339 static inline void destroy_compound_gigantic_page(struct page *page, 1340 unsigned int order) { } 1341 #endif 1342 1343 /* 1344 * Remove hugetlb page from lists, and update dtor so that page appears 1345 * as just a compound page. A reference is held on the page. 1346 * 1347 * Must be called with hugetlb lock held. 1348 */ 1349 static void remove_hugetlb_page(struct hstate *h, struct page *page, 1350 bool adjust_surplus) 1351 { 1352 int nid = page_to_nid(page); 1353 1354 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); 1355 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); 1356 1357 lockdep_assert_held(&hugetlb_lock); 1358 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1359 return; 1360 1361 list_del(&page->lru); 1362 1363 if (HPageFreed(page)) { 1364 h->free_huge_pages--; 1365 h->free_huge_pages_node[nid]--; 1366 } 1367 if (adjust_surplus) { 1368 h->surplus_huge_pages--; 1369 h->surplus_huge_pages_node[nid]--; 1370 } 1371 1372 set_page_refcounted(page); 1373 set_compound_page_dtor(page, NULL_COMPOUND_DTOR); 1374 1375 h->nr_huge_pages--; 1376 h->nr_huge_pages_node[nid]--; 1377 } 1378 1379 static void __update_and_free_page(struct hstate *h, struct page *page) 1380 { 1381 int i; 1382 struct page *subpage = page; 1383 1384 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1385 return; 1386 1387 for (i = 0; i < pages_per_huge_page(h); 1388 i++, subpage = mem_map_next(subpage, page, i)) { 1389 subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1390 1 << PG_referenced | 1 << PG_dirty | 1391 1 << PG_active | 1 << PG_private | 1392 1 << PG_writeback); 1393 } 1394 if (hstate_is_gigantic(h)) { 1395 destroy_compound_gigantic_page(page, huge_page_order(h)); 1396 free_gigantic_page(page, huge_page_order(h)); 1397 } else { 1398 __free_pages(page, huge_page_order(h)); 1399 } 1400 } 1401 1402 /* 1403 * As update_and_free_page() can be called under any context, so we cannot 1404 * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the 1405 * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate 1406 * the vmemmap pages. 1407 * 1408 * free_hpage_workfn() locklessly retrieves the linked list of pages to be 1409 * freed and frees them one-by-one. As the page->mapping pointer is going 1410 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node 1411 * structure of a lockless linked list of huge pages to be freed. 1412 */ 1413 static LLIST_HEAD(hpage_freelist); 1414 1415 static void free_hpage_workfn(struct work_struct *work) 1416 { 1417 struct llist_node *node; 1418 1419 node = llist_del_all(&hpage_freelist); 1420 1421 while (node) { 1422 struct page *page; 1423 struct hstate *h; 1424 1425 page = container_of((struct address_space **)node, 1426 struct page, mapping); 1427 node = node->next; 1428 page->mapping = NULL; 1429 /* 1430 * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() 1431 * is going to trigger because a previous call to 1432 * remove_hugetlb_page() will set_compound_page_dtor(page, 1433 * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. 1434 */ 1435 h = size_to_hstate(page_size(page)); 1436 1437 __update_and_free_page(h, page); 1438 1439 cond_resched(); 1440 } 1441 } 1442 static DECLARE_WORK(free_hpage_work, free_hpage_workfn); 1443 1444 static inline void flush_free_hpage_work(struct hstate *h) 1445 { 1446 if (free_vmemmap_pages_per_hpage(h)) 1447 flush_work(&free_hpage_work); 1448 } 1449 1450 static void update_and_free_page(struct hstate *h, struct page *page, 1451 bool atomic) 1452 { 1453 if (!free_vmemmap_pages_per_hpage(h) || !atomic) { 1454 __update_and_free_page(h, page); 1455 return; 1456 } 1457 1458 /* 1459 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. 1460 * 1461 * Only call schedule_work() if hpage_freelist is previously 1462 * empty. Otherwise, schedule_work() had been called but the workfn 1463 * hasn't retrieved the list yet. 1464 */ 1465 if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) 1466 schedule_work(&free_hpage_work); 1467 } 1468 1469 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) 1470 { 1471 struct page *page, *t_page; 1472 1473 list_for_each_entry_safe(page, t_page, list, lru) { 1474 update_and_free_page(h, page, false); 1475 cond_resched(); 1476 } 1477 } 1478 1479 struct hstate *size_to_hstate(unsigned long size) 1480 { 1481 struct hstate *h; 1482 1483 for_each_hstate(h) { 1484 if (huge_page_size(h) == size) 1485 return h; 1486 } 1487 return NULL; 1488 } 1489 1490 void free_huge_page(struct page *page) 1491 { 1492 /* 1493 * Can't pass hstate in here because it is called from the 1494 * compound page destructor. 1495 */ 1496 struct hstate *h = page_hstate(page); 1497 int nid = page_to_nid(page); 1498 struct hugepage_subpool *spool = hugetlb_page_subpool(page); 1499 bool restore_reserve; 1500 unsigned long flags; 1501 1502 VM_BUG_ON_PAGE(page_count(page), page); 1503 VM_BUG_ON_PAGE(page_mapcount(page), page); 1504 1505 hugetlb_set_page_subpool(page, NULL); 1506 page->mapping = NULL; 1507 restore_reserve = HPageRestoreReserve(page); 1508 ClearHPageRestoreReserve(page); 1509 1510 /* 1511 * If HPageRestoreReserve was set on page, page allocation consumed a 1512 * reservation. If the page was associated with a subpool, there 1513 * would have been a page reserved in the subpool before allocation 1514 * via hugepage_subpool_get_pages(). Since we are 'restoring' the 1515 * reservation, do not call hugepage_subpool_put_pages() as this will 1516 * remove the reserved page from the subpool. 1517 */ 1518 if (!restore_reserve) { 1519 /* 1520 * A return code of zero implies that the subpool will be 1521 * under its minimum size if the reservation is not restored 1522 * after page is free. Therefore, force restore_reserve 1523 * operation. 1524 */ 1525 if (hugepage_subpool_put_pages(spool, 1) == 0) 1526 restore_reserve = true; 1527 } 1528 1529 spin_lock_irqsave(&hugetlb_lock, flags); 1530 ClearHPageMigratable(page); 1531 hugetlb_cgroup_uncharge_page(hstate_index(h), 1532 pages_per_huge_page(h), page); 1533 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), 1534 pages_per_huge_page(h), page); 1535 if (restore_reserve) 1536 h->resv_huge_pages++; 1537 1538 if (HPageTemporary(page)) { 1539 remove_hugetlb_page(h, page, false); 1540 spin_unlock_irqrestore(&hugetlb_lock, flags); 1541 update_and_free_page(h, page, true); 1542 } else if (h->surplus_huge_pages_node[nid]) { 1543 /* remove the page from active list */ 1544 remove_hugetlb_page(h, page, true); 1545 spin_unlock_irqrestore(&hugetlb_lock, flags); 1546 update_and_free_page(h, page, true); 1547 } else { 1548 arch_clear_hugepage_flags(page); 1549 enqueue_huge_page(h, page); 1550 spin_unlock_irqrestore(&hugetlb_lock, flags); 1551 } 1552 } 1553 1554 /* 1555 * Must be called with the hugetlb lock held 1556 */ 1557 static void __prep_account_new_huge_page(struct hstate *h, int nid) 1558 { 1559 lockdep_assert_held(&hugetlb_lock); 1560 h->nr_huge_pages++; 1561 h->nr_huge_pages_node[nid]++; 1562 } 1563 1564 static void __prep_new_huge_page(struct hstate *h, struct page *page) 1565 { 1566 free_huge_page_vmemmap(h, page); 1567 INIT_LIST_HEAD(&page->lru); 1568 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); 1569 hugetlb_set_page_subpool(page, NULL); 1570 set_hugetlb_cgroup(page, NULL); 1571 set_hugetlb_cgroup_rsvd(page, NULL); 1572 } 1573 1574 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 1575 { 1576 __prep_new_huge_page(h, page); 1577 spin_lock_irq(&hugetlb_lock); 1578 __prep_account_new_huge_page(h, nid); 1579 spin_unlock_irq(&hugetlb_lock); 1580 } 1581 1582 static void prep_compound_gigantic_page(struct page *page, unsigned int order) 1583 { 1584 int i; 1585 int nr_pages = 1 << order; 1586 struct page *p = page + 1; 1587 1588 /* we rely on prep_new_huge_page to set the destructor */ 1589 set_compound_order(page, order); 1590 __ClearPageReserved(page); 1591 __SetPageHead(page); 1592 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1593 /* 1594 * For gigantic hugepages allocated through bootmem at 1595 * boot, it's safer to be consistent with the not-gigantic 1596 * hugepages and clear the PG_reserved bit from all tail pages 1597 * too. Otherwise drivers using get_user_pages() to access tail 1598 * pages may get the reference counting wrong if they see 1599 * PG_reserved set on a tail page (despite the head page not 1600 * having PG_reserved set). Enforcing this consistency between 1601 * head and tail pages allows drivers to optimize away a check 1602 * on the head page when they need know if put_page() is needed 1603 * after get_user_pages(). 1604 */ 1605 __ClearPageReserved(p); 1606 set_page_count(p, 0); 1607 set_compound_head(p, page); 1608 } 1609 atomic_set(compound_mapcount_ptr(page), -1); 1610 atomic_set(compound_pincount_ptr(page), 0); 1611 } 1612 1613 /* 1614 * PageHuge() only returns true for hugetlbfs pages, but not for normal or 1615 * transparent huge pages. See the PageTransHuge() documentation for more 1616 * details. 1617 */ 1618 int PageHuge(struct page *page) 1619 { 1620 if (!PageCompound(page)) 1621 return 0; 1622 1623 page = compound_head(page); 1624 return page[1].compound_dtor == HUGETLB_PAGE_DTOR; 1625 } 1626 EXPORT_SYMBOL_GPL(PageHuge); 1627 1628 /* 1629 * PageHeadHuge() only returns true for hugetlbfs head page, but not for 1630 * normal or transparent huge pages. 1631 */ 1632 int PageHeadHuge(struct page *page_head) 1633 { 1634 if (!PageHead(page_head)) 1635 return 0; 1636 1637 return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; 1638 } 1639 1640 /* 1641 * Find and lock address space (mapping) in write mode. 1642 * 1643 * Upon entry, the page is locked which means that page_mapping() is 1644 * stable. Due to locking order, we can only trylock_write. If we can 1645 * not get the lock, simply return NULL to caller. 1646 */ 1647 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) 1648 { 1649 struct address_space *mapping = page_mapping(hpage); 1650 1651 if (!mapping) 1652 return mapping; 1653 1654 if (i_mmap_trylock_write(mapping)) 1655 return mapping; 1656 1657 return NULL; 1658 } 1659 1660 pgoff_t hugetlb_basepage_index(struct page *page) 1661 { 1662 struct page *page_head = compound_head(page); 1663 pgoff_t index = page_index(page_head); 1664 unsigned long compound_idx; 1665 1666 if (compound_order(page_head) >= MAX_ORDER) 1667 compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 1668 else 1669 compound_idx = page - page_head; 1670 1671 return (index << compound_order(page_head)) + compound_idx; 1672 } 1673 1674 static struct page *alloc_buddy_huge_page(struct hstate *h, 1675 gfp_t gfp_mask, int nid, nodemask_t *nmask, 1676 nodemask_t *node_alloc_noretry) 1677 { 1678 int order = huge_page_order(h); 1679 struct page *page; 1680 bool alloc_try_hard = true; 1681 1682 /* 1683 * By default we always try hard to allocate the page with 1684 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in 1685 * a loop (to adjust global huge page counts) and previous allocation 1686 * failed, do not continue to try hard on the same node. Use the 1687 * node_alloc_noretry bitmap to manage this state information. 1688 */ 1689 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) 1690 alloc_try_hard = false; 1691 gfp_mask |= __GFP_COMP|__GFP_NOWARN; 1692 if (alloc_try_hard) 1693 gfp_mask |= __GFP_RETRY_MAYFAIL; 1694 if (nid == NUMA_NO_NODE) 1695 nid = numa_mem_id(); 1696 page = __alloc_pages(gfp_mask, order, nid, nmask); 1697 if (page) 1698 __count_vm_event(HTLB_BUDDY_PGALLOC); 1699 else 1700 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1701 1702 /* 1703 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this 1704 * indicates an overall state change. Clear bit so that we resume 1705 * normal 'try hard' allocations. 1706 */ 1707 if (node_alloc_noretry && page && !alloc_try_hard) 1708 node_clear(nid, *node_alloc_noretry); 1709 1710 /* 1711 * If we tried hard to get a page but failed, set bit so that 1712 * subsequent attempts will not try as hard until there is an 1713 * overall state change. 1714 */ 1715 if (node_alloc_noretry && !page && alloc_try_hard) 1716 node_set(nid, *node_alloc_noretry); 1717 1718 return page; 1719 } 1720 1721 /* 1722 * Common helper to allocate a fresh hugetlb page. All specific allocators 1723 * should use this function to get new hugetlb pages 1724 */ 1725 static struct page *alloc_fresh_huge_page(struct hstate *h, 1726 gfp_t gfp_mask, int nid, nodemask_t *nmask, 1727 nodemask_t *node_alloc_noretry) 1728 { 1729 struct page *page; 1730 1731 if (hstate_is_gigantic(h)) 1732 page = alloc_gigantic_page(h, gfp_mask, nid, nmask); 1733 else 1734 page = alloc_buddy_huge_page(h, gfp_mask, 1735 nid, nmask, node_alloc_noretry); 1736 if (!page) 1737 return NULL; 1738 1739 if (hstate_is_gigantic(h)) 1740 prep_compound_gigantic_page(page, huge_page_order(h)); 1741 prep_new_huge_page(h, page, page_to_nid(page)); 1742 1743 return page; 1744 } 1745 1746 /* 1747 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved 1748 * manner. 1749 */ 1750 static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 1751 nodemask_t *node_alloc_noretry) 1752 { 1753 struct page *page; 1754 int nr_nodes, node; 1755 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 1756 1757 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1758 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, 1759 node_alloc_noretry); 1760 if (page) 1761 break; 1762 } 1763 1764 if (!page) 1765 return 0; 1766 1767 put_page(page); /* free it into the hugepage allocator */ 1768 1769 return 1; 1770 } 1771 1772 /* 1773 * Remove huge page from pool from next node to free. Attempt to keep 1774 * persistent huge pages more or less balanced over allowed nodes. 1775 * This routine only 'removes' the hugetlb page. The caller must make 1776 * an additional call to free the page to low level allocators. 1777 * Called with hugetlb_lock locked. 1778 */ 1779 static struct page *remove_pool_huge_page(struct hstate *h, 1780 nodemask_t *nodes_allowed, 1781 bool acct_surplus) 1782 { 1783 int nr_nodes, node; 1784 struct page *page = NULL; 1785 1786 lockdep_assert_held(&hugetlb_lock); 1787 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 1788 /* 1789 * If we're returning unused surplus pages, only examine 1790 * nodes with surplus pages. 1791 */ 1792 if ((!acct_surplus || h->surplus_huge_pages_node[node]) && 1793 !list_empty(&h->hugepage_freelists[node])) { 1794 page = list_entry(h->hugepage_freelists[node].next, 1795 struct page, lru); 1796 remove_hugetlb_page(h, page, acct_surplus); 1797 break; 1798 } 1799 } 1800 1801 return page; 1802 } 1803 1804 /* 1805 * Dissolve a given free hugepage into free buddy pages. This function does 1806 * nothing for in-use hugepages and non-hugepages. 1807 * This function returns values like below: 1808 * 1809 * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use 1810 * (allocated or reserved.) 1811 * 0: successfully dissolved free hugepages or the page is not a 1812 * hugepage (considered as already dissolved) 1813 */ 1814 int dissolve_free_huge_page(struct page *page) 1815 { 1816 int rc = -EBUSY; 1817 1818 retry: 1819 /* Not to disrupt normal path by vainly holding hugetlb_lock */ 1820 if (!PageHuge(page)) 1821 return 0; 1822 1823 spin_lock_irq(&hugetlb_lock); 1824 if (!PageHuge(page)) { 1825 rc = 0; 1826 goto out; 1827 } 1828 1829 if (!page_count(page)) { 1830 struct page *head = compound_head(page); 1831 struct hstate *h = page_hstate(head); 1832 if (h->free_huge_pages - h->resv_huge_pages == 0) 1833 goto out; 1834 1835 /* 1836 * We should make sure that the page is already on the free list 1837 * when it is dissolved. 1838 */ 1839 if (unlikely(!HPageFreed(head))) { 1840 spin_unlock_irq(&hugetlb_lock); 1841 cond_resched(); 1842 1843 /* 1844 * Theoretically, we should return -EBUSY when we 1845 * encounter this race. In fact, we have a chance 1846 * to successfully dissolve the page if we do a 1847 * retry. Because the race window is quite small. 1848 * If we seize this opportunity, it is an optimization 1849 * for increasing the success rate of dissolving page. 1850 */ 1851 goto retry; 1852 } 1853 1854 /* 1855 * Move PageHWPoison flag from head page to the raw error page, 1856 * which makes any subpages rather than the error page reusable. 1857 */ 1858 if (PageHWPoison(head) && page != head) { 1859 SetPageHWPoison(page); 1860 ClearPageHWPoison(head); 1861 } 1862 remove_hugetlb_page(h, head, false); 1863 h->max_huge_pages--; 1864 spin_unlock_irq(&hugetlb_lock); 1865 update_and_free_page(h, head, false); 1866 return 0; 1867 } 1868 out: 1869 spin_unlock_irq(&hugetlb_lock); 1870 return rc; 1871 } 1872 1873 /* 1874 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to 1875 * make specified memory blocks removable from the system. 1876 * Note that this will dissolve a free gigantic hugepage completely, if any 1877 * part of it lies within the given range. 1878 * Also note that if dissolve_free_huge_page() returns with an error, all 1879 * free hugepages that were dissolved before that error are lost. 1880 */ 1881 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 1882 { 1883 unsigned long pfn; 1884 struct page *page; 1885 int rc = 0; 1886 1887 if (!hugepages_supported()) 1888 return rc; 1889 1890 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { 1891 page = pfn_to_page(pfn); 1892 rc = dissolve_free_huge_page(page); 1893 if (rc) 1894 break; 1895 } 1896 1897 return rc; 1898 } 1899 1900 /* 1901 * Allocates a fresh surplus page from the page allocator. 1902 */ 1903 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, 1904 int nid, nodemask_t *nmask) 1905 { 1906 struct page *page = NULL; 1907 1908 if (hstate_is_gigantic(h)) 1909 return NULL; 1910 1911 spin_lock_irq(&hugetlb_lock); 1912 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) 1913 goto out_unlock; 1914 spin_unlock_irq(&hugetlb_lock); 1915 1916 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 1917 if (!page) 1918 return NULL; 1919 1920 spin_lock_irq(&hugetlb_lock); 1921 /* 1922 * We could have raced with the pool size change. 1923 * Double check that and simply deallocate the new page 1924 * if we would end up overcommiting the surpluses. Abuse 1925 * temporary page to workaround the nasty free_huge_page 1926 * codeflow 1927 */ 1928 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 1929 SetHPageTemporary(page); 1930 spin_unlock_irq(&hugetlb_lock); 1931 put_page(page); 1932 return NULL; 1933 } else { 1934 h->surplus_huge_pages++; 1935 h->surplus_huge_pages_node[page_to_nid(page)]++; 1936 } 1937 1938 out_unlock: 1939 spin_unlock_irq(&hugetlb_lock); 1940 1941 return page; 1942 } 1943 1944 static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 1945 int nid, nodemask_t *nmask) 1946 { 1947 struct page *page; 1948 1949 if (hstate_is_gigantic(h)) 1950 return NULL; 1951 1952 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 1953 if (!page) 1954 return NULL; 1955 1956 /* 1957 * We do not account these pages as surplus because they are only 1958 * temporary and will be released properly on the last reference 1959 */ 1960 SetHPageTemporary(page); 1961 1962 return page; 1963 } 1964 1965 /* 1966 * Use the VMA's mpolicy to allocate a huge page from the buddy. 1967 */ 1968 static 1969 struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, 1970 struct vm_area_struct *vma, unsigned long addr) 1971 { 1972 struct page *page; 1973 struct mempolicy *mpol; 1974 gfp_t gfp_mask = htlb_alloc_mask(h); 1975 int nid; 1976 nodemask_t *nodemask; 1977 1978 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 1979 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); 1980 mpol_cond_put(mpol); 1981 1982 return page; 1983 } 1984 1985 /* page migration callback function */ 1986 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, 1987 nodemask_t *nmask, gfp_t gfp_mask) 1988 { 1989 spin_lock_irq(&hugetlb_lock); 1990 if (h->free_huge_pages - h->resv_huge_pages > 0) { 1991 struct page *page; 1992 1993 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); 1994 if (page) { 1995 spin_unlock_irq(&hugetlb_lock); 1996 return page; 1997 } 1998 } 1999 spin_unlock_irq(&hugetlb_lock); 2000 2001 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); 2002 } 2003 2004 /* mempolicy aware migration callback */ 2005 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, 2006 unsigned long address) 2007 { 2008 struct mempolicy *mpol; 2009 nodemask_t *nodemask; 2010 struct page *page; 2011 gfp_t gfp_mask; 2012 int node; 2013 2014 gfp_mask = htlb_alloc_mask(h); 2015 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 2016 page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); 2017 mpol_cond_put(mpol); 2018 2019 return page; 2020 } 2021 2022 /* 2023 * Increase the hugetlb pool such that it can accommodate a reservation 2024 * of size 'delta'. 2025 */ 2026 static int gather_surplus_pages(struct hstate *h, long delta) 2027 __must_hold(&hugetlb_lock) 2028 { 2029 struct list_head surplus_list; 2030 struct page *page, *tmp; 2031 int ret; 2032 long i; 2033 long needed, allocated; 2034 bool alloc_ok = true; 2035 2036 lockdep_assert_held(&hugetlb_lock); 2037 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 2038 if (needed <= 0) { 2039 h->resv_huge_pages += delta; 2040 return 0; 2041 } 2042 2043 allocated = 0; 2044 INIT_LIST_HEAD(&surplus_list); 2045 2046 ret = -ENOMEM; 2047 retry: 2048 spin_unlock_irq(&hugetlb_lock); 2049 for (i = 0; i < needed; i++) { 2050 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), 2051 NUMA_NO_NODE, NULL); 2052 if (!page) { 2053 alloc_ok = false; 2054 break; 2055 } 2056 list_add(&page->lru, &surplus_list); 2057 cond_resched(); 2058 } 2059 allocated += i; 2060 2061 /* 2062 * After retaking hugetlb_lock, we need to recalculate 'needed' 2063 * because either resv_huge_pages or free_huge_pages may have changed. 2064 */ 2065 spin_lock_irq(&hugetlb_lock); 2066 needed = (h->resv_huge_pages + delta) - 2067 (h->free_huge_pages + allocated); 2068 if (needed > 0) { 2069 if (alloc_ok) 2070 goto retry; 2071 /* 2072 * We were not able to allocate enough pages to 2073 * satisfy the entire reservation so we free what 2074 * we've allocated so far. 2075 */ 2076 goto free; 2077 } 2078 /* 2079 * The surplus_list now contains _at_least_ the number of extra pages 2080 * needed to accommodate the reservation. Add the appropriate number 2081 * of pages to the hugetlb pool and free the extras back to the buddy 2082 * allocator. Commit the entire reservation here to prevent another 2083 * process from stealing the pages as they are added to the pool but 2084 * before they are reserved. 2085 */ 2086 needed += allocated; 2087 h->resv_huge_pages += delta; 2088 ret = 0; 2089 2090 /* Free the needed pages to the hugetlb pool */ 2091 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 2092 int zeroed; 2093 2094 if ((--needed) < 0) 2095 break; 2096 /* 2097 * This page is now managed by the hugetlb allocator and has 2098 * no users -- drop the buddy allocator's reference. 2099 */ 2100 zeroed = put_page_testzero(page); 2101 VM_BUG_ON_PAGE(!zeroed, page); 2102 enqueue_huge_page(h, page); 2103 } 2104 free: 2105 spin_unlock_irq(&hugetlb_lock); 2106 2107 /* Free unnecessary surplus pages to the buddy allocator */ 2108 list_for_each_entry_safe(page, tmp, &surplus_list, lru) 2109 put_page(page); 2110 spin_lock_irq(&hugetlb_lock); 2111 2112 return ret; 2113 } 2114 2115 /* 2116 * This routine has two main purposes: 2117 * 1) Decrement the reservation count (resv_huge_pages) by the value passed 2118 * in unused_resv_pages. This corresponds to the prior adjustments made 2119 * to the associated reservation map. 2120 * 2) Free any unused surplus pages that may have been allocated to satisfy 2121 * the reservation. As many as unused_resv_pages may be freed. 2122 */ 2123 static void return_unused_surplus_pages(struct hstate *h, 2124 unsigned long unused_resv_pages) 2125 { 2126 unsigned long nr_pages; 2127 struct page *page; 2128 LIST_HEAD(page_list); 2129 2130 lockdep_assert_held(&hugetlb_lock); 2131 /* Uncommit the reservation */ 2132 h->resv_huge_pages -= unused_resv_pages; 2133 2134 /* Cannot return gigantic pages currently */ 2135 if (hstate_is_gigantic(h)) 2136 goto out; 2137 2138 /* 2139 * Part (or even all) of the reservation could have been backed 2140 * by pre-allocated pages. Only free surplus pages. 2141 */ 2142 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 2143 2144 /* 2145 * We want to release as many surplus pages as possible, spread 2146 * evenly across all nodes with memory. Iterate across these nodes 2147 * until we can no longer free unreserved surplus pages. This occurs 2148 * when the nodes with surplus pages have no free pages. 2149 * remove_pool_huge_page() will balance the freed pages across the 2150 * on-line nodes with memory and will handle the hstate accounting. 2151 */ 2152 while (nr_pages--) { 2153 page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); 2154 if (!page) 2155 goto out; 2156 2157 list_add(&page->lru, &page_list); 2158 } 2159 2160 out: 2161 spin_unlock_irq(&hugetlb_lock); 2162 update_and_free_pages_bulk(h, &page_list); 2163 spin_lock_irq(&hugetlb_lock); 2164 } 2165 2166 2167 /* 2168 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation 2169 * are used by the huge page allocation routines to manage reservations. 2170 * 2171 * vma_needs_reservation is called to determine if the huge page at addr 2172 * within the vma has an associated reservation. If a reservation is 2173 * needed, the value 1 is returned. The caller is then responsible for 2174 * managing the global reservation and subpool usage counts. After 2175 * the huge page has been allocated, vma_commit_reservation is called 2176 * to add the page to the reservation map. If the page allocation fails, 2177 * the reservation must be ended instead of committed. vma_end_reservation 2178 * is called in such cases. 2179 * 2180 * In the normal case, vma_commit_reservation returns the same value 2181 * as the preceding vma_needs_reservation call. The only time this 2182 * is not the case is if a reserve map was changed between calls. It 2183 * is the responsibility of the caller to notice the difference and 2184 * take appropriate action. 2185 * 2186 * vma_add_reservation is used in error paths where a reservation must 2187 * be restored when a newly allocated huge page must be freed. It is 2188 * to be called after calling vma_needs_reservation to determine if a 2189 * reservation exists. 2190 * 2191 * vma_del_reservation is used in error paths where an entry in the reserve 2192 * map was created during huge page allocation and must be removed. It is to 2193 * be called after calling vma_needs_reservation to determine if a reservation 2194 * exists. 2195 */ 2196 enum vma_resv_mode { 2197 VMA_NEEDS_RESV, 2198 VMA_COMMIT_RESV, 2199 VMA_END_RESV, 2200 VMA_ADD_RESV, 2201 VMA_DEL_RESV, 2202 }; 2203 static long __vma_reservation_common(struct hstate *h, 2204 struct vm_area_struct *vma, unsigned long addr, 2205 enum vma_resv_mode mode) 2206 { 2207 struct resv_map *resv; 2208 pgoff_t idx; 2209 long ret; 2210 long dummy_out_regions_needed; 2211 2212 resv = vma_resv_map(vma); 2213 if (!resv) 2214 return 1; 2215 2216 idx = vma_hugecache_offset(h, vma, addr); 2217 switch (mode) { 2218 case VMA_NEEDS_RESV: 2219 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 2220 /* We assume that vma_reservation_* routines always operate on 2221 * 1 page, and that adding to resv map a 1 page entry can only 2222 * ever require 1 region. 2223 */ 2224 VM_BUG_ON(dummy_out_regions_needed != 1); 2225 break; 2226 case VMA_COMMIT_RESV: 2227 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2228 /* region_add calls of range 1 should never fail. */ 2229 VM_BUG_ON(ret < 0); 2230 break; 2231 case VMA_END_RESV: 2232 region_abort(resv, idx, idx + 1, 1); 2233 ret = 0; 2234 break; 2235 case VMA_ADD_RESV: 2236 if (vma->vm_flags & VM_MAYSHARE) { 2237 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2238 /* region_add calls of range 1 should never fail. */ 2239 VM_BUG_ON(ret < 0); 2240 } else { 2241 region_abort(resv, idx, idx + 1, 1); 2242 ret = region_del(resv, idx, idx + 1); 2243 } 2244 break; 2245 case VMA_DEL_RESV: 2246 if (vma->vm_flags & VM_MAYSHARE) { 2247 region_abort(resv, idx, idx + 1, 1); 2248 ret = region_del(resv, idx, idx + 1); 2249 } else { 2250 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2251 /* region_add calls of range 1 should never fail. */ 2252 VM_BUG_ON(ret < 0); 2253 } 2254 break; 2255 default: 2256 BUG(); 2257 } 2258 2259 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) 2260 return ret; 2261 /* 2262 * We know private mapping must have HPAGE_RESV_OWNER set. 2263 * 2264 * In most cases, reserves always exist for private mappings. 2265 * However, a file associated with mapping could have been 2266 * hole punched or truncated after reserves were consumed. 2267 * As subsequent fault on such a range will not use reserves. 2268 * Subtle - The reserve map for private mappings has the 2269 * opposite meaning than that of shared mappings. If NO 2270 * entry is in the reserve map, it means a reservation exists. 2271 * If an entry exists in the reserve map, it means the 2272 * reservation has already been consumed. As a result, the 2273 * return value of this routine is the opposite of the 2274 * value returned from reserve map manipulation routines above. 2275 */ 2276 if (ret > 0) 2277 return 0; 2278 if (ret == 0) 2279 return 1; 2280 return ret; 2281 } 2282 2283 static long vma_needs_reservation(struct hstate *h, 2284 struct vm_area_struct *vma, unsigned long addr) 2285 { 2286 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 2287 } 2288 2289 static long vma_commit_reservation(struct hstate *h, 2290 struct vm_area_struct *vma, unsigned long addr) 2291 { 2292 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 2293 } 2294 2295 static void vma_end_reservation(struct hstate *h, 2296 struct vm_area_struct *vma, unsigned long addr) 2297 { 2298 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 2299 } 2300 2301 static long vma_add_reservation(struct hstate *h, 2302 struct vm_area_struct *vma, unsigned long addr) 2303 { 2304 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 2305 } 2306 2307 static long vma_del_reservation(struct hstate *h, 2308 struct vm_area_struct *vma, unsigned long addr) 2309 { 2310 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); 2311 } 2312 2313 /* 2314 * This routine is called to restore reservation information on error paths. 2315 * It should ONLY be called for pages allocated via alloc_huge_page(), and 2316 * the hugetlb mutex should remain held when calling this routine. 2317 * 2318 * It handles two specific cases: 2319 * 1) A reservation was in place and the page consumed the reservation. 2320 * HPageRestoreReserve is set in the page. 2321 * 2) No reservation was in place for the page, so HPageRestoreReserve is 2322 * not set. However, alloc_huge_page always updates the reserve map. 2323 * 2324 * In case 1, free_huge_page later in the error path will increment the 2325 * global reserve count. But, free_huge_page does not have enough context 2326 * to adjust the reservation map. This case deals primarily with private 2327 * mappings. Adjust the reserve map here to be consistent with global 2328 * reserve count adjustments to be made by free_huge_page. Make sure the 2329 * reserve map indicates there is a reservation present. 2330 * 2331 * In case 2, simply undo reserve map modifications done by alloc_huge_page. 2332 */ 2333 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, 2334 unsigned long address, struct page *page) 2335 { 2336 long rc = vma_needs_reservation(h, vma, address); 2337 2338 if (HPageRestoreReserve(page)) { 2339 if (unlikely(rc < 0)) 2340 /* 2341 * Rare out of memory condition in reserve map 2342 * manipulation. Clear HPageRestoreReserve so that 2343 * global reserve count will not be incremented 2344 * by free_huge_page. This will make it appear 2345 * as though the reservation for this page was 2346 * consumed. This may prevent the task from 2347 * faulting in the page at a later time. This 2348 * is better than inconsistent global huge page 2349 * accounting of reserve counts. 2350 */ 2351 ClearHPageRestoreReserve(page); 2352 else if (rc) 2353 (void)vma_add_reservation(h, vma, address); 2354 else 2355 vma_end_reservation(h, vma, address); 2356 } else { 2357 if (!rc) { 2358 /* 2359 * This indicates there is an entry in the reserve map 2360 * added by alloc_huge_page. We know it was added 2361 * before the alloc_huge_page call, otherwise 2362 * HPageRestoreReserve would be set on the page. 2363 * Remove the entry so that a subsequent allocation 2364 * does not consume a reservation. 2365 */ 2366 rc = vma_del_reservation(h, vma, address); 2367 if (rc < 0) 2368 /* 2369 * VERY rare out of memory condition. Since 2370 * we can not delete the entry, set 2371 * HPageRestoreReserve so that the reserve 2372 * count will be incremented when the page 2373 * is freed. This reserve will be consumed 2374 * on a subsequent allocation. 2375 */ 2376 SetHPageRestoreReserve(page); 2377 } else if (rc < 0) { 2378 /* 2379 * Rare out of memory condition from 2380 * vma_needs_reservation call. Memory allocation is 2381 * only attempted if a new entry is needed. Therefore, 2382 * this implies there is not an entry in the 2383 * reserve map. 2384 * 2385 * For shared mappings, no entry in the map indicates 2386 * no reservation. We are done. 2387 */ 2388 if (!(vma->vm_flags & VM_MAYSHARE)) 2389 /* 2390 * For private mappings, no entry indicates 2391 * a reservation is present. Since we can 2392 * not add an entry, set SetHPageRestoreReserve 2393 * on the page so reserve count will be 2394 * incremented when freed. This reserve will 2395 * be consumed on a subsequent allocation. 2396 */ 2397 SetHPageRestoreReserve(page); 2398 } else 2399 /* 2400 * No reservation present, do nothing 2401 */ 2402 vma_end_reservation(h, vma, address); 2403 } 2404 } 2405 2406 /* 2407 * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one 2408 * @h: struct hstate old page belongs to 2409 * @old_page: Old page to dissolve 2410 * @list: List to isolate the page in case we need to 2411 * Returns 0 on success, otherwise negated error. 2412 */ 2413 static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, 2414 struct list_head *list) 2415 { 2416 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 2417 int nid = page_to_nid(old_page); 2418 struct page *new_page; 2419 int ret = 0; 2420 2421 /* 2422 * Before dissolving the page, we need to allocate a new one for the 2423 * pool to remain stable. Here, we allocate the page and 'prep' it 2424 * by doing everything but actually updating counters and adding to 2425 * the pool. This simplifies and let us do most of the processing 2426 * under the lock. 2427 */ 2428 new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); 2429 if (!new_page) 2430 return -ENOMEM; 2431 __prep_new_huge_page(h, new_page); 2432 2433 retry: 2434 spin_lock_irq(&hugetlb_lock); 2435 if (!PageHuge(old_page)) { 2436 /* 2437 * Freed from under us. Drop new_page too. 2438 */ 2439 goto free_new; 2440 } else if (page_count(old_page)) { 2441 /* 2442 * Someone has grabbed the page, try to isolate it here. 2443 * Fail with -EBUSY if not possible. 2444 */ 2445 spin_unlock_irq(&hugetlb_lock); 2446 if (!isolate_huge_page(old_page, list)) 2447 ret = -EBUSY; 2448 spin_lock_irq(&hugetlb_lock); 2449 goto free_new; 2450 } else if (!HPageFreed(old_page)) { 2451 /* 2452 * Page's refcount is 0 but it has not been enqueued in the 2453 * freelist yet. Race window is small, so we can succeed here if 2454 * we retry. 2455 */ 2456 spin_unlock_irq(&hugetlb_lock); 2457 cond_resched(); 2458 goto retry; 2459 } else { 2460 /* 2461 * Ok, old_page is still a genuine free hugepage. Remove it from 2462 * the freelist and decrease the counters. These will be 2463 * incremented again when calling __prep_account_new_huge_page() 2464 * and enqueue_huge_page() for new_page. The counters will remain 2465 * stable since this happens under the lock. 2466 */ 2467 remove_hugetlb_page(h, old_page, false); 2468 2469 /* 2470 * Reference count trick is needed because allocator gives us 2471 * referenced page but the pool requires pages with 0 refcount. 2472 */ 2473 __prep_account_new_huge_page(h, nid); 2474 page_ref_dec(new_page); 2475 enqueue_huge_page(h, new_page); 2476 2477 /* 2478 * Pages have been replaced, we can safely free the old one. 2479 */ 2480 spin_unlock_irq(&hugetlb_lock); 2481 update_and_free_page(h, old_page, false); 2482 } 2483 2484 return ret; 2485 2486 free_new: 2487 spin_unlock_irq(&hugetlb_lock); 2488 update_and_free_page(h, new_page, false); 2489 2490 return ret; 2491 } 2492 2493 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) 2494 { 2495 struct hstate *h; 2496 struct page *head; 2497 int ret = -EBUSY; 2498 2499 /* 2500 * The page might have been dissolved from under our feet, so make sure 2501 * to carefully check the state under the lock. 2502 * Return success when racing as if we dissolved the page ourselves. 2503 */ 2504 spin_lock_irq(&hugetlb_lock); 2505 if (PageHuge(page)) { 2506 head = compound_head(page); 2507 h = page_hstate(head); 2508 } else { 2509 spin_unlock_irq(&hugetlb_lock); 2510 return 0; 2511 } 2512 spin_unlock_irq(&hugetlb_lock); 2513 2514 /* 2515 * Fence off gigantic pages as there is a cyclic dependency between 2516 * alloc_contig_range and them. Return -ENOMEM as this has the effect 2517 * of bailing out right away without further retrying. 2518 */ 2519 if (hstate_is_gigantic(h)) 2520 return -ENOMEM; 2521 2522 if (page_count(head) && isolate_huge_page(head, list)) 2523 ret = 0; 2524 else if (!page_count(head)) 2525 ret = alloc_and_dissolve_huge_page(h, head, list); 2526 2527 return ret; 2528 } 2529 2530 struct page *alloc_huge_page(struct vm_area_struct *vma, 2531 unsigned long addr, int avoid_reserve) 2532 { 2533 struct hugepage_subpool *spool = subpool_vma(vma); 2534 struct hstate *h = hstate_vma(vma); 2535 struct page *page; 2536 long map_chg, map_commit; 2537 long gbl_chg; 2538 int ret, idx; 2539 struct hugetlb_cgroup *h_cg; 2540 bool deferred_reserve; 2541 2542 idx = hstate_index(h); 2543 /* 2544 * Examine the region/reserve map to determine if the process 2545 * has a reservation for the page to be allocated. A return 2546 * code of zero indicates a reservation exists (no change). 2547 */ 2548 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 2549 if (map_chg < 0) 2550 return ERR_PTR(-ENOMEM); 2551 2552 /* 2553 * Processes that did not create the mapping will have no 2554 * reserves as indicated by the region/reserve map. Check 2555 * that the allocation will not exceed the subpool limit. 2556 * Allocations for MAP_NORESERVE mappings also need to be 2557 * checked against any subpool limit. 2558 */ 2559 if (map_chg || avoid_reserve) { 2560 gbl_chg = hugepage_subpool_get_pages(spool, 1); 2561 if (gbl_chg < 0) { 2562 vma_end_reservation(h, vma, addr); 2563 return ERR_PTR(-ENOSPC); 2564 } 2565 2566 /* 2567 * Even though there was no reservation in the region/reserve 2568 * map, there could be reservations associated with the 2569 * subpool that can be used. This would be indicated if the 2570 * return value of hugepage_subpool_get_pages() is zero. 2571 * However, if avoid_reserve is specified we still avoid even 2572 * the subpool reservations. 2573 */ 2574 if (avoid_reserve) 2575 gbl_chg = 1; 2576 } 2577 2578 /* If this allocation is not consuming a reservation, charge it now. 2579 */ 2580 deferred_reserve = map_chg || avoid_reserve; 2581 if (deferred_reserve) { 2582 ret = hugetlb_cgroup_charge_cgroup_rsvd( 2583 idx, pages_per_huge_page(h), &h_cg); 2584 if (ret) 2585 goto out_subpool_put; 2586 } 2587 2588 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 2589 if (ret) 2590 goto out_uncharge_cgroup_reservation; 2591 2592 spin_lock_irq(&hugetlb_lock); 2593 /* 2594 * glb_chg is passed to indicate whether or not a page must be taken 2595 * from the global free pool (global change). gbl_chg == 0 indicates 2596 * a reservation exists for the allocation. 2597 */ 2598 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); 2599 if (!page) { 2600 spin_unlock_irq(&hugetlb_lock); 2601 page = alloc_buddy_huge_page_with_mpol(h, vma, addr); 2602 if (!page) 2603 goto out_uncharge_cgroup; 2604 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 2605 SetHPageRestoreReserve(page); 2606 h->resv_huge_pages--; 2607 } 2608 spin_lock_irq(&hugetlb_lock); 2609 list_add(&page->lru, &h->hugepage_activelist); 2610 /* Fall through */ 2611 } 2612 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 2613 /* If allocation is not consuming a reservation, also store the 2614 * hugetlb_cgroup pointer on the page. 2615 */ 2616 if (deferred_reserve) { 2617 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 2618 h_cg, page); 2619 } 2620 2621 spin_unlock_irq(&hugetlb_lock); 2622 2623 hugetlb_set_page_subpool(page, spool); 2624 2625 map_commit = vma_commit_reservation(h, vma, addr); 2626 if (unlikely(map_chg > map_commit)) { 2627 /* 2628 * The page was added to the reservation map between 2629 * vma_needs_reservation and vma_commit_reservation. 2630 * This indicates a race with hugetlb_reserve_pages. 2631 * Adjust for the subpool count incremented above AND 2632 * in hugetlb_reserve_pages for the same page. Also, 2633 * the reservation count added in hugetlb_reserve_pages 2634 * no longer applies. 2635 */ 2636 long rsv_adjust; 2637 2638 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 2639 hugetlb_acct_memory(h, -rsv_adjust); 2640 if (deferred_reserve) 2641 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), 2642 pages_per_huge_page(h), page); 2643 } 2644 return page; 2645 2646 out_uncharge_cgroup: 2647 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 2648 out_uncharge_cgroup_reservation: 2649 if (deferred_reserve) 2650 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 2651 h_cg); 2652 out_subpool_put: 2653 if (map_chg || avoid_reserve) 2654 hugepage_subpool_put_pages(spool, 1); 2655 vma_end_reservation(h, vma, addr); 2656 return ERR_PTR(-ENOSPC); 2657 } 2658 2659 int alloc_bootmem_huge_page(struct hstate *h) 2660 __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); 2661 int __alloc_bootmem_huge_page(struct hstate *h) 2662 { 2663 struct huge_bootmem_page *m; 2664 int nr_nodes, node; 2665 2666 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 2667 void *addr; 2668 2669 addr = memblock_alloc_try_nid_raw( 2670 huge_page_size(h), huge_page_size(h), 2671 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); 2672 if (addr) { 2673 /* 2674 * Use the beginning of the huge page to store the 2675 * huge_bootmem_page struct (until gather_bootmem 2676 * puts them into the mem_map). 2677 */ 2678 m = addr; 2679 goto found; 2680 } 2681 } 2682 return 0; 2683 2684 found: 2685 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); 2686 /* Put them into a private list first because mem_map is not up yet */ 2687 INIT_LIST_HEAD(&m->list); 2688 list_add(&m->list, &huge_boot_pages); 2689 m->hstate = h; 2690 return 1; 2691 } 2692 2693 static void __init prep_compound_huge_page(struct page *page, 2694 unsigned int order) 2695 { 2696 if (unlikely(order > (MAX_ORDER - 1))) 2697 prep_compound_gigantic_page(page, order); 2698 else 2699 prep_compound_page(page, order); 2700 } 2701 2702 /* Put bootmem huge pages into the standard lists after mem_map is up */ 2703 static void __init gather_bootmem_prealloc(void) 2704 { 2705 struct huge_bootmem_page *m; 2706 2707 list_for_each_entry(m, &huge_boot_pages, list) { 2708 struct page *page = virt_to_page(m); 2709 struct hstate *h = m->hstate; 2710 2711 WARN_ON(page_count(page) != 1); 2712 prep_compound_huge_page(page, huge_page_order(h)); 2713 WARN_ON(PageReserved(page)); 2714 prep_new_huge_page(h, page, page_to_nid(page)); 2715 put_page(page); /* free it into the hugepage allocator */ 2716 2717 /* 2718 * If we had gigantic hugepages allocated at boot time, we need 2719 * to restore the 'stolen' pages to totalram_pages in order to 2720 * fix confusing memory reports from free(1) and another 2721 * side-effects, like CommitLimit going negative. 2722 */ 2723 if (hstate_is_gigantic(h)) 2724 adjust_managed_page_count(page, pages_per_huge_page(h)); 2725 cond_resched(); 2726 } 2727 } 2728 2729 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 2730 { 2731 unsigned long i; 2732 nodemask_t *node_alloc_noretry; 2733 2734 if (!hstate_is_gigantic(h)) { 2735 /* 2736 * Bit mask controlling how hard we retry per-node allocations. 2737 * Ignore errors as lower level routines can deal with 2738 * node_alloc_noretry == NULL. If this kmalloc fails at boot 2739 * time, we are likely in bigger trouble. 2740 */ 2741 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 2742 GFP_KERNEL); 2743 } else { 2744 /* allocations done at boot time */ 2745 node_alloc_noretry = NULL; 2746 } 2747 2748 /* bit mask controlling how hard we retry per-node allocations */ 2749 if (node_alloc_noretry) 2750 nodes_clear(*node_alloc_noretry); 2751 2752 for (i = 0; i < h->max_huge_pages; ++i) { 2753 if (hstate_is_gigantic(h)) { 2754 if (hugetlb_cma_size) { 2755 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); 2756 goto free; 2757 } 2758 if (!alloc_bootmem_huge_page(h)) 2759 break; 2760 } else if (!alloc_pool_huge_page(h, 2761 &node_states[N_MEMORY], 2762 node_alloc_noretry)) 2763 break; 2764 cond_resched(); 2765 } 2766 if (i < h->max_huge_pages) { 2767 char buf[32]; 2768 2769 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 2770 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", 2771 h->max_huge_pages, buf, i); 2772 h->max_huge_pages = i; 2773 } 2774 free: 2775 kfree(node_alloc_noretry); 2776 } 2777 2778 static void __init hugetlb_init_hstates(void) 2779 { 2780 struct hstate *h; 2781 2782 for_each_hstate(h) { 2783 if (minimum_order > huge_page_order(h)) 2784 minimum_order = huge_page_order(h); 2785 2786 /* oversize hugepages were init'ed in early boot */ 2787 if (!hstate_is_gigantic(h)) 2788 hugetlb_hstate_alloc_pages(h); 2789 } 2790 VM_BUG_ON(minimum_order == UINT_MAX); 2791 } 2792 2793 static void __init report_hugepages(void) 2794 { 2795 struct hstate *h; 2796 2797 for_each_hstate(h) { 2798 char buf[32]; 2799 2800 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 2801 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", 2802 buf, h->free_huge_pages); 2803 } 2804 } 2805 2806 #ifdef CONFIG_HIGHMEM 2807 static void try_to_free_low(struct hstate *h, unsigned long count, 2808 nodemask_t *nodes_allowed) 2809 { 2810 int i; 2811 LIST_HEAD(page_list); 2812 2813 lockdep_assert_held(&hugetlb_lock); 2814 if (hstate_is_gigantic(h)) 2815 return; 2816 2817 /* 2818 * Collect pages to be freed on a list, and free after dropping lock 2819 */ 2820 for_each_node_mask(i, *nodes_allowed) { 2821 struct page *page, *next; 2822 struct list_head *freel = &h->hugepage_freelists[i]; 2823 list_for_each_entry_safe(page, next, freel, lru) { 2824 if (count >= h->nr_huge_pages) 2825 goto out; 2826 if (PageHighMem(page)) 2827 continue; 2828 remove_hugetlb_page(h, page, false); 2829 list_add(&page->lru, &page_list); 2830 } 2831 } 2832 2833 out: 2834 spin_unlock_irq(&hugetlb_lock); 2835 update_and_free_pages_bulk(h, &page_list); 2836 spin_lock_irq(&hugetlb_lock); 2837 } 2838 #else 2839 static inline void try_to_free_low(struct hstate *h, unsigned long count, 2840 nodemask_t *nodes_allowed) 2841 { 2842 } 2843 #endif 2844 2845 /* 2846 * Increment or decrement surplus_huge_pages. Keep node-specific counters 2847 * balanced by operating on them in a round-robin fashion. 2848 * Returns 1 if an adjustment was made. 2849 */ 2850 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 2851 int delta) 2852 { 2853 int nr_nodes, node; 2854 2855 lockdep_assert_held(&hugetlb_lock); 2856 VM_BUG_ON(delta != -1 && delta != 1); 2857 2858 if (delta < 0) { 2859 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 2860 if (h->surplus_huge_pages_node[node]) 2861 goto found; 2862 } 2863 } else { 2864 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 2865 if (h->surplus_huge_pages_node[node] < 2866 h->nr_huge_pages_node[node]) 2867 goto found; 2868 } 2869 } 2870 return 0; 2871 2872 found: 2873 h->surplus_huge_pages += delta; 2874 h->surplus_huge_pages_node[node] += delta; 2875 return 1; 2876 } 2877 2878 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 2879 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 2880 nodemask_t *nodes_allowed) 2881 { 2882 unsigned long min_count, ret; 2883 struct page *page; 2884 LIST_HEAD(page_list); 2885 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 2886 2887 /* 2888 * Bit mask controlling how hard we retry per-node allocations. 2889 * If we can not allocate the bit mask, do not attempt to allocate 2890 * the requested huge pages. 2891 */ 2892 if (node_alloc_noretry) 2893 nodes_clear(*node_alloc_noretry); 2894 else 2895 return -ENOMEM; 2896 2897 /* 2898 * resize_lock mutex prevents concurrent adjustments to number of 2899 * pages in hstate via the proc/sysfs interfaces. 2900 */ 2901 mutex_lock(&h->resize_lock); 2902 flush_free_hpage_work(h); 2903 spin_lock_irq(&hugetlb_lock); 2904 2905 /* 2906 * Check for a node specific request. 2907 * Changing node specific huge page count may require a corresponding 2908 * change to the global count. In any case, the passed node mask 2909 * (nodes_allowed) will restrict alloc/free to the specified node. 2910 */ 2911 if (nid != NUMA_NO_NODE) { 2912 unsigned long old_count = count; 2913 2914 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 2915 /* 2916 * User may have specified a large count value which caused the 2917 * above calculation to overflow. In this case, they wanted 2918 * to allocate as many huge pages as possible. Set count to 2919 * largest possible value to align with their intention. 2920 */ 2921 if (count < old_count) 2922 count = ULONG_MAX; 2923 } 2924 2925 /* 2926 * Gigantic pages runtime allocation depend on the capability for large 2927 * page range allocation. 2928 * If the system does not provide this feature, return an error when 2929 * the user tries to allocate gigantic pages but let the user free the 2930 * boottime allocated gigantic pages. 2931 */ 2932 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 2933 if (count > persistent_huge_pages(h)) { 2934 spin_unlock_irq(&hugetlb_lock); 2935 mutex_unlock(&h->resize_lock); 2936 NODEMASK_FREE(node_alloc_noretry); 2937 return -EINVAL; 2938 } 2939 /* Fall through to decrease pool */ 2940 } 2941 2942 /* 2943 * Increase the pool size 2944 * First take pages out of surplus state. Then make up the 2945 * remaining difference by allocating fresh huge pages. 2946 * 2947 * We might race with alloc_surplus_huge_page() here and be unable 2948 * to convert a surplus huge page to a normal huge page. That is 2949 * not critical, though, it just means the overall size of the 2950 * pool might be one hugepage larger than it needs to be, but 2951 * within all the constraints specified by the sysctls. 2952 */ 2953 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 2954 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 2955 break; 2956 } 2957 2958 while (count > persistent_huge_pages(h)) { 2959 /* 2960 * If this allocation races such that we no longer need the 2961 * page, free_huge_page will handle it by freeing the page 2962 * and reducing the surplus. 2963 */ 2964 spin_unlock_irq(&hugetlb_lock); 2965 2966 /* yield cpu to avoid soft lockup */ 2967 cond_resched(); 2968 2969 ret = alloc_pool_huge_page(h, nodes_allowed, 2970 node_alloc_noretry); 2971 spin_lock_irq(&hugetlb_lock); 2972 if (!ret) 2973 goto out; 2974 2975 /* Bail for signals. Probably ctrl-c from user */ 2976 if (signal_pending(current)) 2977 goto out; 2978 } 2979 2980 /* 2981 * Decrease the pool size 2982 * First return free pages to the buddy allocator (being careful 2983 * to keep enough around to satisfy reservations). Then place 2984 * pages into surplus state as needed so the pool will shrink 2985 * to the desired size as pages become free. 2986 * 2987 * By placing pages into the surplus state independent of the 2988 * overcommit value, we are allowing the surplus pool size to 2989 * exceed overcommit. There are few sane options here. Since 2990 * alloc_surplus_huge_page() is checking the global counter, 2991 * though, we'll note that we're not allowed to exceed surplus 2992 * and won't grow the pool anywhere else. Not until one of the 2993 * sysctls are changed, or the surplus pages go out of use. 2994 */ 2995 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 2996 min_count = max(count, min_count); 2997 try_to_free_low(h, min_count, nodes_allowed); 2998 2999 /* 3000 * Collect pages to be removed on list without dropping lock 3001 */ 3002 while (min_count < persistent_huge_pages(h)) { 3003 page = remove_pool_huge_page(h, nodes_allowed, 0); 3004 if (!page) 3005 break; 3006 3007 list_add(&page->lru, &page_list); 3008 } 3009 /* free the pages after dropping lock */ 3010 spin_unlock_irq(&hugetlb_lock); 3011 update_and_free_pages_bulk(h, &page_list); 3012 flush_free_hpage_work(h); 3013 spin_lock_irq(&hugetlb_lock); 3014 3015 while (count < persistent_huge_pages(h)) { 3016 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 3017 break; 3018 } 3019 out: 3020 h->max_huge_pages = persistent_huge_pages(h); 3021 spin_unlock_irq(&hugetlb_lock); 3022 mutex_unlock(&h->resize_lock); 3023 3024 NODEMASK_FREE(node_alloc_noretry); 3025 3026 return 0; 3027 } 3028 3029 #define HSTATE_ATTR_RO(_name) \ 3030 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 3031 3032 #define HSTATE_ATTR(_name) \ 3033 static struct kobj_attribute _name##_attr = \ 3034 __ATTR(_name, 0644, _name##_show, _name##_store) 3035 3036 static struct kobject *hugepages_kobj; 3037 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3038 3039 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 3040 3041 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 3042 { 3043 int i; 3044 3045 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3046 if (hstate_kobjs[i] == kobj) { 3047 if (nidp) 3048 *nidp = NUMA_NO_NODE; 3049 return &hstates[i]; 3050 } 3051 3052 return kobj_to_node_hstate(kobj, nidp); 3053 } 3054 3055 static ssize_t nr_hugepages_show_common(struct kobject *kobj, 3056 struct kobj_attribute *attr, char *buf) 3057 { 3058 struct hstate *h; 3059 unsigned long nr_huge_pages; 3060 int nid; 3061 3062 h = kobj_to_hstate(kobj, &nid); 3063 if (nid == NUMA_NO_NODE) 3064 nr_huge_pages = h->nr_huge_pages; 3065 else 3066 nr_huge_pages = h->nr_huge_pages_node[nid]; 3067 3068 return sysfs_emit(buf, "%lu\n", nr_huge_pages); 3069 } 3070 3071 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 3072 struct hstate *h, int nid, 3073 unsigned long count, size_t len) 3074 { 3075 int err; 3076 nodemask_t nodes_allowed, *n_mask; 3077 3078 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 3079 return -EINVAL; 3080 3081 if (nid == NUMA_NO_NODE) { 3082 /* 3083 * global hstate attribute 3084 */ 3085 if (!(obey_mempolicy && 3086 init_nodemask_of_mempolicy(&nodes_allowed))) 3087 n_mask = &node_states[N_MEMORY]; 3088 else 3089 n_mask = &nodes_allowed; 3090 } else { 3091 /* 3092 * Node specific request. count adjustment happens in 3093 * set_max_huge_pages() after acquiring hugetlb_lock. 3094 */ 3095 init_nodemask_of_node(&nodes_allowed, nid); 3096 n_mask = &nodes_allowed; 3097 } 3098 3099 err = set_max_huge_pages(h, count, nid, n_mask); 3100 3101 return err ? err : len; 3102 } 3103 3104 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 3105 struct kobject *kobj, const char *buf, 3106 size_t len) 3107 { 3108 struct hstate *h; 3109 unsigned long count; 3110 int nid; 3111 int err; 3112 3113 err = kstrtoul(buf, 10, &count); 3114 if (err) 3115 return err; 3116 3117 h = kobj_to_hstate(kobj, &nid); 3118 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 3119 } 3120 3121 static ssize_t nr_hugepages_show(struct kobject *kobj, 3122 struct kobj_attribute *attr, char *buf) 3123 { 3124 return nr_hugepages_show_common(kobj, attr, buf); 3125 } 3126 3127 static ssize_t nr_hugepages_store(struct kobject *kobj, 3128 struct kobj_attribute *attr, const char *buf, size_t len) 3129 { 3130 return nr_hugepages_store_common(false, kobj, buf, len); 3131 } 3132 HSTATE_ATTR(nr_hugepages); 3133 3134 #ifdef CONFIG_NUMA 3135 3136 /* 3137 * hstate attribute for optionally mempolicy-based constraint on persistent 3138 * huge page alloc/free. 3139 */ 3140 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 3141 struct kobj_attribute *attr, 3142 char *buf) 3143 { 3144 return nr_hugepages_show_common(kobj, attr, buf); 3145 } 3146 3147 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 3148 struct kobj_attribute *attr, const char *buf, size_t len) 3149 { 3150 return nr_hugepages_store_common(true, kobj, buf, len); 3151 } 3152 HSTATE_ATTR(nr_hugepages_mempolicy); 3153 #endif 3154 3155 3156 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 3157 struct kobj_attribute *attr, char *buf) 3158 { 3159 struct hstate *h = kobj_to_hstate(kobj, NULL); 3160 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); 3161 } 3162 3163 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 3164 struct kobj_attribute *attr, const char *buf, size_t count) 3165 { 3166 int err; 3167 unsigned long input; 3168 struct hstate *h = kobj_to_hstate(kobj, NULL); 3169 3170 if (hstate_is_gigantic(h)) 3171 return -EINVAL; 3172 3173 err = kstrtoul(buf, 10, &input); 3174 if (err) 3175 return err; 3176 3177 spin_lock_irq(&hugetlb_lock); 3178 h->nr_overcommit_huge_pages = input; 3179 spin_unlock_irq(&hugetlb_lock); 3180 3181 return count; 3182 } 3183 HSTATE_ATTR(nr_overcommit_hugepages); 3184 3185 static ssize_t free_hugepages_show(struct kobject *kobj, 3186 struct kobj_attribute *attr, char *buf) 3187 { 3188 struct hstate *h; 3189 unsigned long free_huge_pages; 3190 int nid; 3191 3192 h = kobj_to_hstate(kobj, &nid); 3193 if (nid == NUMA_NO_NODE) 3194 free_huge_pages = h->free_huge_pages; 3195 else 3196 free_huge_pages = h->free_huge_pages_node[nid]; 3197 3198 return sysfs_emit(buf, "%lu\n", free_huge_pages); 3199 } 3200 HSTATE_ATTR_RO(free_hugepages); 3201 3202 static ssize_t resv_hugepages_show(struct kobject *kobj, 3203 struct kobj_attribute *attr, char *buf) 3204 { 3205 struct hstate *h = kobj_to_hstate(kobj, NULL); 3206 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); 3207 } 3208 HSTATE_ATTR_RO(resv_hugepages); 3209 3210 static ssize_t surplus_hugepages_show(struct kobject *kobj, 3211 struct kobj_attribute *attr, char *buf) 3212 { 3213 struct hstate *h; 3214 unsigned long surplus_huge_pages; 3215 int nid; 3216 3217 h = kobj_to_hstate(kobj, &nid); 3218 if (nid == NUMA_NO_NODE) 3219 surplus_huge_pages = h->surplus_huge_pages; 3220 else 3221 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 3222 3223 return sysfs_emit(buf, "%lu\n", surplus_huge_pages); 3224 } 3225 HSTATE_ATTR_RO(surplus_hugepages); 3226 3227 static struct attribute *hstate_attrs[] = { 3228 &nr_hugepages_attr.attr, 3229 &nr_overcommit_hugepages_attr.attr, 3230 &free_hugepages_attr.attr, 3231 &resv_hugepages_attr.attr, 3232 &surplus_hugepages_attr.attr, 3233 #ifdef CONFIG_NUMA 3234 &nr_hugepages_mempolicy_attr.attr, 3235 #endif 3236 NULL, 3237 }; 3238 3239 static const struct attribute_group hstate_attr_group = { 3240 .attrs = hstate_attrs, 3241 }; 3242 3243 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 3244 struct kobject **hstate_kobjs, 3245 const struct attribute_group *hstate_attr_group) 3246 { 3247 int retval; 3248 int hi = hstate_index(h); 3249 3250 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 3251 if (!hstate_kobjs[hi]) 3252 return -ENOMEM; 3253 3254 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 3255 if (retval) { 3256 kobject_put(hstate_kobjs[hi]); 3257 hstate_kobjs[hi] = NULL; 3258 } 3259 3260 return retval; 3261 } 3262 3263 static void __init hugetlb_sysfs_init(void) 3264 { 3265 struct hstate *h; 3266 int err; 3267 3268 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 3269 if (!hugepages_kobj) 3270 return; 3271 3272 for_each_hstate(h) { 3273 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 3274 hstate_kobjs, &hstate_attr_group); 3275 if (err) 3276 pr_err("HugeTLB: Unable to add hstate %s", h->name); 3277 } 3278 } 3279 3280 #ifdef CONFIG_NUMA 3281 3282 /* 3283 * node_hstate/s - associate per node hstate attributes, via their kobjects, 3284 * with node devices in node_devices[] using a parallel array. The array 3285 * index of a node device or _hstate == node id. 3286 * This is here to avoid any static dependency of the node device driver, in 3287 * the base kernel, on the hugetlb module. 3288 */ 3289 struct node_hstate { 3290 struct kobject *hugepages_kobj; 3291 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3292 }; 3293 static struct node_hstate node_hstates[MAX_NUMNODES]; 3294 3295 /* 3296 * A subset of global hstate attributes for node devices 3297 */ 3298 static struct attribute *per_node_hstate_attrs[] = { 3299 &nr_hugepages_attr.attr, 3300 &free_hugepages_attr.attr, 3301 &surplus_hugepages_attr.attr, 3302 NULL, 3303 }; 3304 3305 static const struct attribute_group per_node_hstate_attr_group = { 3306 .attrs = per_node_hstate_attrs, 3307 }; 3308 3309 /* 3310 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 3311 * Returns node id via non-NULL nidp. 3312 */ 3313 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3314 { 3315 int nid; 3316 3317 for (nid = 0; nid < nr_node_ids; nid++) { 3318 struct node_hstate *nhs = &node_hstates[nid]; 3319 int i; 3320 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3321 if (nhs->hstate_kobjs[i] == kobj) { 3322 if (nidp) 3323 *nidp = nid; 3324 return &hstates[i]; 3325 } 3326 } 3327 3328 BUG(); 3329 return NULL; 3330 } 3331 3332 /* 3333 * Unregister hstate attributes from a single node device. 3334 * No-op if no hstate attributes attached. 3335 */ 3336 static void hugetlb_unregister_node(struct node *node) 3337 { 3338 struct hstate *h; 3339 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3340 3341 if (!nhs->hugepages_kobj) 3342 return; /* no hstate attributes */ 3343 3344 for_each_hstate(h) { 3345 int idx = hstate_index(h); 3346 if (nhs->hstate_kobjs[idx]) { 3347 kobject_put(nhs->hstate_kobjs[idx]); 3348 nhs->hstate_kobjs[idx] = NULL; 3349 } 3350 } 3351 3352 kobject_put(nhs->hugepages_kobj); 3353 nhs->hugepages_kobj = NULL; 3354 } 3355 3356 3357 /* 3358 * Register hstate attributes for a single node device. 3359 * No-op if attributes already registered. 3360 */ 3361 static void hugetlb_register_node(struct node *node) 3362 { 3363 struct hstate *h; 3364 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3365 int err; 3366 3367 if (nhs->hugepages_kobj) 3368 return; /* already allocated */ 3369 3370 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 3371 &node->dev.kobj); 3372 if (!nhs->hugepages_kobj) 3373 return; 3374 3375 for_each_hstate(h) { 3376 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 3377 nhs->hstate_kobjs, 3378 &per_node_hstate_attr_group); 3379 if (err) { 3380 pr_err("HugeTLB: Unable to add hstate %s for node %d\n", 3381 h->name, node->dev.id); 3382 hugetlb_unregister_node(node); 3383 break; 3384 } 3385 } 3386 } 3387 3388 /* 3389 * hugetlb init time: register hstate attributes for all registered node 3390 * devices of nodes that have memory. All on-line nodes should have 3391 * registered their associated device by this time. 3392 */ 3393 static void __init hugetlb_register_all_nodes(void) 3394 { 3395 int nid; 3396 3397 for_each_node_state(nid, N_MEMORY) { 3398 struct node *node = node_devices[nid]; 3399 if (node->dev.id == nid) 3400 hugetlb_register_node(node); 3401 } 3402 3403 /* 3404 * Let the node device driver know we're here so it can 3405 * [un]register hstate attributes on node hotplug. 3406 */ 3407 register_hugetlbfs_with_node(hugetlb_register_node, 3408 hugetlb_unregister_node); 3409 } 3410 #else /* !CONFIG_NUMA */ 3411 3412 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3413 { 3414 BUG(); 3415 if (nidp) 3416 *nidp = -1; 3417 return NULL; 3418 } 3419 3420 static void hugetlb_register_all_nodes(void) { } 3421 3422 #endif 3423 3424 static int __init hugetlb_init(void) 3425 { 3426 int i; 3427 3428 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < 3429 __NR_HPAGEFLAGS); 3430 3431 if (!hugepages_supported()) { 3432 if (hugetlb_max_hstate || default_hstate_max_huge_pages) 3433 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); 3434 return 0; 3435 } 3436 3437 /* 3438 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some 3439 * architectures depend on setup being done here. 3440 */ 3441 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 3442 if (!parsed_default_hugepagesz) { 3443 /* 3444 * If we did not parse a default huge page size, set 3445 * default_hstate_idx to HPAGE_SIZE hstate. And, if the 3446 * number of huge pages for this default size was implicitly 3447 * specified, set that here as well. 3448 * Note that the implicit setting will overwrite an explicit 3449 * setting. A warning will be printed in this case. 3450 */ 3451 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); 3452 if (default_hstate_max_huge_pages) { 3453 if (default_hstate.max_huge_pages) { 3454 char buf[32]; 3455 3456 string_get_size(huge_page_size(&default_hstate), 3457 1, STRING_UNITS_2, buf, 32); 3458 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", 3459 default_hstate.max_huge_pages, buf); 3460 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", 3461 default_hstate_max_huge_pages); 3462 } 3463 default_hstate.max_huge_pages = 3464 default_hstate_max_huge_pages; 3465 } 3466 } 3467 3468 hugetlb_cma_check(); 3469 hugetlb_init_hstates(); 3470 gather_bootmem_prealloc(); 3471 report_hugepages(); 3472 3473 hugetlb_sysfs_init(); 3474 hugetlb_register_all_nodes(); 3475 hugetlb_cgroup_file_init(); 3476 3477 #ifdef CONFIG_SMP 3478 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 3479 #else 3480 num_fault_mutexes = 1; 3481 #endif 3482 hugetlb_fault_mutex_table = 3483 kmalloc_array(num_fault_mutexes, sizeof(struct mutex), 3484 GFP_KERNEL); 3485 BUG_ON(!hugetlb_fault_mutex_table); 3486 3487 for (i = 0; i < num_fault_mutexes; i++) 3488 mutex_init(&hugetlb_fault_mutex_table[i]); 3489 return 0; 3490 } 3491 subsys_initcall(hugetlb_init); 3492 3493 /* Overwritten by architectures with more huge page sizes */ 3494 bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) 3495 { 3496 return size == HPAGE_SIZE; 3497 } 3498 3499 void __init hugetlb_add_hstate(unsigned int order) 3500 { 3501 struct hstate *h; 3502 unsigned long i; 3503 3504 if (size_to_hstate(PAGE_SIZE << order)) { 3505 return; 3506 } 3507 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 3508 BUG_ON(order == 0); 3509 h = &hstates[hugetlb_max_hstate++]; 3510 mutex_init(&h->resize_lock); 3511 h->order = order; 3512 h->mask = ~(huge_page_size(h) - 1); 3513 for (i = 0; i < MAX_NUMNODES; ++i) 3514 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 3515 INIT_LIST_HEAD(&h->hugepage_activelist); 3516 h->next_nid_to_alloc = first_memory_node; 3517 h->next_nid_to_free = first_memory_node; 3518 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 3519 huge_page_size(h)/1024); 3520 3521 parsed_hstate = h; 3522 } 3523 3524 /* 3525 * hugepages command line processing 3526 * hugepages normally follows a valid hugepagsz or default_hugepagsz 3527 * specification. If not, ignore the hugepages value. hugepages can also 3528 * be the first huge page command line option in which case it implicitly 3529 * specifies the number of huge pages for the default size. 3530 */ 3531 static int __init hugepages_setup(char *s) 3532 { 3533 unsigned long *mhp; 3534 static unsigned long *last_mhp; 3535 3536 if (!parsed_valid_hugepagesz) { 3537 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); 3538 parsed_valid_hugepagesz = true; 3539 return 0; 3540 } 3541 3542 /* 3543 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter 3544 * yet, so this hugepages= parameter goes to the "default hstate". 3545 * Otherwise, it goes with the previously parsed hugepagesz or 3546 * default_hugepagesz. 3547 */ 3548 else if (!hugetlb_max_hstate) 3549 mhp = &default_hstate_max_huge_pages; 3550 else 3551 mhp = &parsed_hstate->max_huge_pages; 3552 3553 if (mhp == last_mhp) { 3554 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); 3555 return 0; 3556 } 3557 3558 if (sscanf(s, "%lu", mhp) <= 0) 3559 *mhp = 0; 3560 3561 /* 3562 * Global state is always initialized later in hugetlb_init. 3563 * But we need to allocate gigantic hstates here early to still 3564 * use the bootmem allocator. 3565 */ 3566 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) 3567 hugetlb_hstate_alloc_pages(parsed_hstate); 3568 3569 last_mhp = mhp; 3570 3571 return 1; 3572 } 3573 __setup("hugepages=", hugepages_setup); 3574 3575 /* 3576 * hugepagesz command line processing 3577 * A specific huge page size can only be specified once with hugepagesz. 3578 * hugepagesz is followed by hugepages on the command line. The global 3579 * variable 'parsed_valid_hugepagesz' is used to determine if prior 3580 * hugepagesz argument was valid. 3581 */ 3582 static int __init hugepagesz_setup(char *s) 3583 { 3584 unsigned long size; 3585 struct hstate *h; 3586 3587 parsed_valid_hugepagesz = false; 3588 size = (unsigned long)memparse(s, NULL); 3589 3590 if (!arch_hugetlb_valid_size(size)) { 3591 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); 3592 return 0; 3593 } 3594 3595 h = size_to_hstate(size); 3596 if (h) { 3597 /* 3598 * hstate for this size already exists. This is normally 3599 * an error, but is allowed if the existing hstate is the 3600 * default hstate. More specifically, it is only allowed if 3601 * the number of huge pages for the default hstate was not 3602 * previously specified. 3603 */ 3604 if (!parsed_default_hugepagesz || h != &default_hstate || 3605 default_hstate.max_huge_pages) { 3606 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); 3607 return 0; 3608 } 3609 3610 /* 3611 * No need to call hugetlb_add_hstate() as hstate already 3612 * exists. But, do set parsed_hstate so that a following 3613 * hugepages= parameter will be applied to this hstate. 3614 */ 3615 parsed_hstate = h; 3616 parsed_valid_hugepagesz = true; 3617 return 1; 3618 } 3619 3620 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 3621 parsed_valid_hugepagesz = true; 3622 return 1; 3623 } 3624 __setup("hugepagesz=", hugepagesz_setup); 3625 3626 /* 3627 * default_hugepagesz command line input 3628 * Only one instance of default_hugepagesz allowed on command line. 3629 */ 3630 static int __init default_hugepagesz_setup(char *s) 3631 { 3632 unsigned long size; 3633 3634 parsed_valid_hugepagesz = false; 3635 if (parsed_default_hugepagesz) { 3636 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); 3637 return 0; 3638 } 3639 3640 size = (unsigned long)memparse(s, NULL); 3641 3642 if (!arch_hugetlb_valid_size(size)) { 3643 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); 3644 return 0; 3645 } 3646 3647 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 3648 parsed_valid_hugepagesz = true; 3649 parsed_default_hugepagesz = true; 3650 default_hstate_idx = hstate_index(size_to_hstate(size)); 3651 3652 /* 3653 * The number of default huge pages (for this size) could have been 3654 * specified as the first hugetlb parameter: hugepages=X. If so, 3655 * then default_hstate_max_huge_pages is set. If the default huge 3656 * page size is gigantic (>= MAX_ORDER), then the pages must be 3657 * allocated here from bootmem allocator. 3658 */ 3659 if (default_hstate_max_huge_pages) { 3660 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 3661 if (hstate_is_gigantic(&default_hstate)) 3662 hugetlb_hstate_alloc_pages(&default_hstate); 3663 default_hstate_max_huge_pages = 0; 3664 } 3665 3666 return 1; 3667 } 3668 __setup("default_hugepagesz=", default_hugepagesz_setup); 3669 3670 static unsigned int allowed_mems_nr(struct hstate *h) 3671 { 3672 int node; 3673 unsigned int nr = 0; 3674 nodemask_t *mpol_allowed; 3675 unsigned int *array = h->free_huge_pages_node; 3676 gfp_t gfp_mask = htlb_alloc_mask(h); 3677 3678 mpol_allowed = policy_nodemask_current(gfp_mask); 3679 3680 for_each_node_mask(node, cpuset_current_mems_allowed) { 3681 if (!mpol_allowed || node_isset(node, *mpol_allowed)) 3682 nr += array[node]; 3683 } 3684 3685 return nr; 3686 } 3687 3688 #ifdef CONFIG_SYSCTL 3689 static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, 3690 void *buffer, size_t *length, 3691 loff_t *ppos, unsigned long *out) 3692 { 3693 struct ctl_table dup_table; 3694 3695 /* 3696 * In order to avoid races with __do_proc_doulongvec_minmax(), we 3697 * can duplicate the @table and alter the duplicate of it. 3698 */ 3699 dup_table = *table; 3700 dup_table.data = out; 3701 3702 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); 3703 } 3704 3705 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 3706 struct ctl_table *table, int write, 3707 void *buffer, size_t *length, loff_t *ppos) 3708 { 3709 struct hstate *h = &default_hstate; 3710 unsigned long tmp = h->max_huge_pages; 3711 int ret; 3712 3713 if (!hugepages_supported()) 3714 return -EOPNOTSUPP; 3715 3716 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 3717 &tmp); 3718 if (ret) 3719 goto out; 3720 3721 if (write) 3722 ret = __nr_hugepages_store_common(obey_mempolicy, h, 3723 NUMA_NO_NODE, tmp, *length); 3724 out: 3725 return ret; 3726 } 3727 3728 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 3729 void *buffer, size_t *length, loff_t *ppos) 3730 { 3731 3732 return hugetlb_sysctl_handler_common(false, table, write, 3733 buffer, length, ppos); 3734 } 3735 3736 #ifdef CONFIG_NUMA 3737 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 3738 void *buffer, size_t *length, loff_t *ppos) 3739 { 3740 return hugetlb_sysctl_handler_common(true, table, write, 3741 buffer, length, ppos); 3742 } 3743 #endif /* CONFIG_NUMA */ 3744 3745 int hugetlb_overcommit_handler(struct ctl_table *table, int write, 3746 void *buffer, size_t *length, loff_t *ppos) 3747 { 3748 struct hstate *h = &default_hstate; 3749 unsigned long tmp; 3750 int ret; 3751 3752 if (!hugepages_supported()) 3753 return -EOPNOTSUPP; 3754 3755 tmp = h->nr_overcommit_huge_pages; 3756 3757 if (write && hstate_is_gigantic(h)) 3758 return -EINVAL; 3759 3760 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 3761 &tmp); 3762 if (ret) 3763 goto out; 3764 3765 if (write) { 3766 spin_lock_irq(&hugetlb_lock); 3767 h->nr_overcommit_huge_pages = tmp; 3768 spin_unlock_irq(&hugetlb_lock); 3769 } 3770 out: 3771 return ret; 3772 } 3773 3774 #endif /* CONFIG_SYSCTL */ 3775 3776 void hugetlb_report_meminfo(struct seq_file *m) 3777 { 3778 struct hstate *h; 3779 unsigned long total = 0; 3780 3781 if (!hugepages_supported()) 3782 return; 3783 3784 for_each_hstate(h) { 3785 unsigned long count = h->nr_huge_pages; 3786 3787 total += huge_page_size(h) * count; 3788 3789 if (h == &default_hstate) 3790 seq_printf(m, 3791 "HugePages_Total: %5lu\n" 3792 "HugePages_Free: %5lu\n" 3793 "HugePages_Rsvd: %5lu\n" 3794 "HugePages_Surp: %5lu\n" 3795 "Hugepagesize: %8lu kB\n", 3796 count, 3797 h->free_huge_pages, 3798 h->resv_huge_pages, 3799 h->surplus_huge_pages, 3800 huge_page_size(h) / SZ_1K); 3801 } 3802 3803 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); 3804 } 3805 3806 int hugetlb_report_node_meminfo(char *buf, int len, int nid) 3807 { 3808 struct hstate *h = &default_hstate; 3809 3810 if (!hugepages_supported()) 3811 return 0; 3812 3813 return sysfs_emit_at(buf, len, 3814 "Node %d HugePages_Total: %5u\n" 3815 "Node %d HugePages_Free: %5u\n" 3816 "Node %d HugePages_Surp: %5u\n", 3817 nid, h->nr_huge_pages_node[nid], 3818 nid, h->free_huge_pages_node[nid], 3819 nid, h->surplus_huge_pages_node[nid]); 3820 } 3821 3822 void hugetlb_show_meminfo(void) 3823 { 3824 struct hstate *h; 3825 int nid; 3826 3827 if (!hugepages_supported()) 3828 return; 3829 3830 for_each_node_state(nid, N_MEMORY) 3831 for_each_hstate(h) 3832 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 3833 nid, 3834 h->nr_huge_pages_node[nid], 3835 h->free_huge_pages_node[nid], 3836 h->surplus_huge_pages_node[nid], 3837 huge_page_size(h) / SZ_1K); 3838 } 3839 3840 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) 3841 { 3842 seq_printf(m, "HugetlbPages:\t%8lu kB\n", 3843 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); 3844 } 3845 3846 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 3847 unsigned long hugetlb_total_pages(void) 3848 { 3849 struct hstate *h; 3850 unsigned long nr_total_pages = 0; 3851 3852 for_each_hstate(h) 3853 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 3854 return nr_total_pages; 3855 } 3856 3857 static int hugetlb_acct_memory(struct hstate *h, long delta) 3858 { 3859 int ret = -ENOMEM; 3860 3861 if (!delta) 3862 return 0; 3863 3864 spin_lock_irq(&hugetlb_lock); 3865 /* 3866 * When cpuset is configured, it breaks the strict hugetlb page 3867 * reservation as the accounting is done on a global variable. Such 3868 * reservation is completely rubbish in the presence of cpuset because 3869 * the reservation is not checked against page availability for the 3870 * current cpuset. Application can still potentially OOM'ed by kernel 3871 * with lack of free htlb page in cpuset that the task is in. 3872 * Attempt to enforce strict accounting with cpuset is almost 3873 * impossible (or too ugly) because cpuset is too fluid that 3874 * task or memory node can be dynamically moved between cpusets. 3875 * 3876 * The change of semantics for shared hugetlb mapping with cpuset is 3877 * undesirable. However, in order to preserve some of the semantics, 3878 * we fall back to check against current free page availability as 3879 * a best attempt and hopefully to minimize the impact of changing 3880 * semantics that cpuset has. 3881 * 3882 * Apart from cpuset, we also have memory policy mechanism that 3883 * also determines from which node the kernel will allocate memory 3884 * in a NUMA system. So similar to cpuset, we also should consider 3885 * the memory policy of the current task. Similar to the description 3886 * above. 3887 */ 3888 if (delta > 0) { 3889 if (gather_surplus_pages(h, delta) < 0) 3890 goto out; 3891 3892 if (delta > allowed_mems_nr(h)) { 3893 return_unused_surplus_pages(h, delta); 3894 goto out; 3895 } 3896 } 3897 3898 ret = 0; 3899 if (delta < 0) 3900 return_unused_surplus_pages(h, (unsigned long) -delta); 3901 3902 out: 3903 spin_unlock_irq(&hugetlb_lock); 3904 return ret; 3905 } 3906 3907 static void hugetlb_vm_op_open(struct vm_area_struct *vma) 3908 { 3909 struct resv_map *resv = vma_resv_map(vma); 3910 3911 /* 3912 * This new VMA should share its siblings reservation map if present. 3913 * The VMA will only ever have a valid reservation map pointer where 3914 * it is being copied for another still existing VMA. As that VMA 3915 * has a reference to the reservation map it cannot disappear until 3916 * after this open call completes. It is therefore safe to take a 3917 * new reference here without additional locking. 3918 */ 3919 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3920 kref_get(&resv->refs); 3921 } 3922 3923 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 3924 { 3925 struct hstate *h = hstate_vma(vma); 3926 struct resv_map *resv = vma_resv_map(vma); 3927 struct hugepage_subpool *spool = subpool_vma(vma); 3928 unsigned long reserve, start, end; 3929 long gbl_reserve; 3930 3931 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3932 return; 3933 3934 start = vma_hugecache_offset(h, vma, vma->vm_start); 3935 end = vma_hugecache_offset(h, vma, vma->vm_end); 3936 3937 reserve = (end - start) - region_count(resv, start, end); 3938 hugetlb_cgroup_uncharge_counter(resv, start, end); 3939 if (reserve) { 3940 /* 3941 * Decrement reserve counts. The global reserve count may be 3942 * adjusted if the subpool has a minimum size. 3943 */ 3944 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 3945 hugetlb_acct_memory(h, -gbl_reserve); 3946 } 3947 3948 kref_put(&resv->refs, resv_map_release); 3949 } 3950 3951 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) 3952 { 3953 if (addr & ~(huge_page_mask(hstate_vma(vma)))) 3954 return -EINVAL; 3955 return 0; 3956 } 3957 3958 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) 3959 { 3960 return huge_page_size(hstate_vma(vma)); 3961 } 3962 3963 /* 3964 * We cannot handle pagefaults against hugetlb pages at all. They cause 3965 * handle_mm_fault() to try to instantiate regular-sized pages in the 3966 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get 3967 * this far. 3968 */ 3969 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) 3970 { 3971 BUG(); 3972 return 0; 3973 } 3974 3975 /* 3976 * When a new function is introduced to vm_operations_struct and added 3977 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 3978 * This is because under System V memory model, mappings created via 3979 * shmget/shmat with "huge page" specified are backed by hugetlbfs files, 3980 * their original vm_ops are overwritten with shm_vm_ops. 3981 */ 3982 const struct vm_operations_struct hugetlb_vm_ops = { 3983 .fault = hugetlb_vm_op_fault, 3984 .open = hugetlb_vm_op_open, 3985 .close = hugetlb_vm_op_close, 3986 .may_split = hugetlb_vm_op_split, 3987 .pagesize = hugetlb_vm_op_pagesize, 3988 }; 3989 3990 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 3991 int writable) 3992 { 3993 pte_t entry; 3994 3995 if (writable) { 3996 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 3997 vma->vm_page_prot))); 3998 } else { 3999 entry = huge_pte_wrprotect(mk_huge_pte(page, 4000 vma->vm_page_prot)); 4001 } 4002 entry = pte_mkyoung(entry); 4003 entry = pte_mkhuge(entry); 4004 entry = arch_make_huge_pte(entry, vma, page, writable); 4005 4006 return entry; 4007 } 4008 4009 static void set_huge_ptep_writable(struct vm_area_struct *vma, 4010 unsigned long address, pte_t *ptep) 4011 { 4012 pte_t entry; 4013 4014 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 4015 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 4016 update_mmu_cache(vma, address, ptep); 4017 } 4018 4019 bool is_hugetlb_entry_migration(pte_t pte) 4020 { 4021 swp_entry_t swp; 4022 4023 if (huge_pte_none(pte) || pte_present(pte)) 4024 return false; 4025 swp = pte_to_swp_entry(pte); 4026 if (is_migration_entry(swp)) 4027 return true; 4028 else 4029 return false; 4030 } 4031 4032 static bool is_hugetlb_entry_hwpoisoned(pte_t pte) 4033 { 4034 swp_entry_t swp; 4035 4036 if (huge_pte_none(pte) || pte_present(pte)) 4037 return false; 4038 swp = pte_to_swp_entry(pte); 4039 if (is_hwpoison_entry(swp)) 4040 return true; 4041 else 4042 return false; 4043 } 4044 4045 static void 4046 hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, 4047 struct page *new_page) 4048 { 4049 __SetPageUptodate(new_page); 4050 set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); 4051 hugepage_add_new_anon_rmap(new_page, vma, addr); 4052 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); 4053 ClearHPageRestoreReserve(new_page); 4054 SetHPageMigratable(new_page); 4055 } 4056 4057 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 4058 struct vm_area_struct *vma) 4059 { 4060 pte_t *src_pte, *dst_pte, entry, dst_entry; 4061 struct page *ptepage; 4062 unsigned long addr; 4063 bool cow = is_cow_mapping(vma->vm_flags); 4064 struct hstate *h = hstate_vma(vma); 4065 unsigned long sz = huge_page_size(h); 4066 unsigned long npages = pages_per_huge_page(h); 4067 struct address_space *mapping = vma->vm_file->f_mapping; 4068 struct mmu_notifier_range range; 4069 int ret = 0; 4070 4071 if (cow) { 4072 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, 4073 vma->vm_start, 4074 vma->vm_end); 4075 mmu_notifier_invalidate_range_start(&range); 4076 } else { 4077 /* 4078 * For shared mappings i_mmap_rwsem must be held to call 4079 * huge_pte_alloc, otherwise the returned ptep could go 4080 * away if part of a shared pmd and another thread calls 4081 * huge_pmd_unshare. 4082 */ 4083 i_mmap_lock_read(mapping); 4084 } 4085 4086 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 4087 spinlock_t *src_ptl, *dst_ptl; 4088 src_pte = huge_pte_offset(src, addr, sz); 4089 if (!src_pte) 4090 continue; 4091 dst_pte = huge_pte_alloc(dst, vma, addr, sz); 4092 if (!dst_pte) { 4093 ret = -ENOMEM; 4094 break; 4095 } 4096 4097 /* 4098 * If the pagetables are shared don't copy or take references. 4099 * dst_pte == src_pte is the common case of src/dest sharing. 4100 * 4101 * However, src could have 'unshared' and dst shares with 4102 * another vma. If dst_pte !none, this implies sharing. 4103 * Check here before taking page table lock, and once again 4104 * after taking the lock below. 4105 */ 4106 dst_entry = huge_ptep_get(dst_pte); 4107 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) 4108 continue; 4109 4110 dst_ptl = huge_pte_lock(h, dst, dst_pte); 4111 src_ptl = huge_pte_lockptr(h, src, src_pte); 4112 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 4113 entry = huge_ptep_get(src_pte); 4114 dst_entry = huge_ptep_get(dst_pte); 4115 again: 4116 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { 4117 /* 4118 * Skip if src entry none. Also, skip in the 4119 * unlikely case dst entry !none as this implies 4120 * sharing with another vma. 4121 */ 4122 ; 4123 } else if (unlikely(is_hugetlb_entry_migration(entry) || 4124 is_hugetlb_entry_hwpoisoned(entry))) { 4125 swp_entry_t swp_entry = pte_to_swp_entry(entry); 4126 4127 if (is_write_migration_entry(swp_entry) && cow) { 4128 /* 4129 * COW mappings require pages in both 4130 * parent and child to be set to read. 4131 */ 4132 make_migration_entry_read(&swp_entry); 4133 entry = swp_entry_to_pte(swp_entry); 4134 set_huge_swap_pte_at(src, addr, src_pte, 4135 entry, sz); 4136 } 4137 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); 4138 } else { 4139 entry = huge_ptep_get(src_pte); 4140 ptepage = pte_page(entry); 4141 get_page(ptepage); 4142 4143 /* 4144 * This is a rare case where we see pinned hugetlb 4145 * pages while they're prone to COW. We need to do the 4146 * COW earlier during fork. 4147 * 4148 * When pre-allocating the page or copying data, we 4149 * need to be without the pgtable locks since we could 4150 * sleep during the process. 4151 */ 4152 if (unlikely(page_needs_cow_for_dma(vma, ptepage))) { 4153 pte_t src_pte_old = entry; 4154 struct page *new; 4155 4156 spin_unlock(src_ptl); 4157 spin_unlock(dst_ptl); 4158 /* Do not use reserve as it's private owned */ 4159 new = alloc_huge_page(vma, addr, 1); 4160 if (IS_ERR(new)) { 4161 put_page(ptepage); 4162 ret = PTR_ERR(new); 4163 break; 4164 } 4165 copy_user_huge_page(new, ptepage, addr, vma, 4166 npages); 4167 put_page(ptepage); 4168 4169 /* Install the new huge page if src pte stable */ 4170 dst_ptl = huge_pte_lock(h, dst, dst_pte); 4171 src_ptl = huge_pte_lockptr(h, src, src_pte); 4172 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 4173 entry = huge_ptep_get(src_pte); 4174 if (!pte_same(src_pte_old, entry)) { 4175 restore_reserve_on_error(h, vma, addr, 4176 new); 4177 put_page(new); 4178 /* dst_entry won't change as in child */ 4179 goto again; 4180 } 4181 hugetlb_install_page(vma, dst_pte, addr, new); 4182 spin_unlock(src_ptl); 4183 spin_unlock(dst_ptl); 4184 continue; 4185 } 4186 4187 if (cow) { 4188 /* 4189 * No need to notify as we are downgrading page 4190 * table protection not changing it to point 4191 * to a new page. 4192 * 4193 * See Documentation/vm/mmu_notifier.rst 4194 */ 4195 huge_ptep_set_wrprotect(src, addr, src_pte); 4196 entry = huge_pte_wrprotect(entry); 4197 } 4198 4199 page_dup_rmap(ptepage, true); 4200 set_huge_pte_at(dst, addr, dst_pte, entry); 4201 hugetlb_count_add(npages, dst); 4202 } 4203 spin_unlock(src_ptl); 4204 spin_unlock(dst_ptl); 4205 } 4206 4207 if (cow) 4208 mmu_notifier_invalidate_range_end(&range); 4209 else 4210 i_mmap_unlock_read(mapping); 4211 4212 return ret; 4213 } 4214 4215 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 4216 unsigned long start, unsigned long end, 4217 struct page *ref_page) 4218 { 4219 struct mm_struct *mm = vma->vm_mm; 4220 unsigned long address; 4221 pte_t *ptep; 4222 pte_t pte; 4223 spinlock_t *ptl; 4224 struct page *page; 4225 struct hstate *h = hstate_vma(vma); 4226 unsigned long sz = huge_page_size(h); 4227 struct mmu_notifier_range range; 4228 4229 WARN_ON(!is_vm_hugetlb_page(vma)); 4230 BUG_ON(start & ~huge_page_mask(h)); 4231 BUG_ON(end & ~huge_page_mask(h)); 4232 4233 /* 4234 * This is a hugetlb vma, all the pte entries should point 4235 * to huge page. 4236 */ 4237 tlb_change_page_size(tlb, sz); 4238 tlb_start_vma(tlb, vma); 4239 4240 /* 4241 * If sharing possible, alert mmu notifiers of worst case. 4242 */ 4243 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, 4244 end); 4245 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 4246 mmu_notifier_invalidate_range_start(&range); 4247 address = start; 4248 for (; address < end; address += sz) { 4249 ptep = huge_pte_offset(mm, address, sz); 4250 if (!ptep) 4251 continue; 4252 4253 ptl = huge_pte_lock(h, mm, ptep); 4254 if (huge_pmd_unshare(mm, vma, &address, ptep)) { 4255 spin_unlock(ptl); 4256 /* 4257 * We just unmapped a page of PMDs by clearing a PUD. 4258 * The caller's TLB flush range should cover this area. 4259 */ 4260 continue; 4261 } 4262 4263 pte = huge_ptep_get(ptep); 4264 if (huge_pte_none(pte)) { 4265 spin_unlock(ptl); 4266 continue; 4267 } 4268 4269 /* 4270 * Migrating hugepage or HWPoisoned hugepage is already 4271 * unmapped and its refcount is dropped, so just clear pte here. 4272 */ 4273 if (unlikely(!pte_present(pte))) { 4274 huge_pte_clear(mm, address, ptep, sz); 4275 spin_unlock(ptl); 4276 continue; 4277 } 4278 4279 page = pte_page(pte); 4280 /* 4281 * If a reference page is supplied, it is because a specific 4282 * page is being unmapped, not a range. Ensure the page we 4283 * are about to unmap is the actual page of interest. 4284 */ 4285 if (ref_page) { 4286 if (page != ref_page) { 4287 spin_unlock(ptl); 4288 continue; 4289 } 4290 /* 4291 * Mark the VMA as having unmapped its page so that 4292 * future faults in this VMA will fail rather than 4293 * looking like data was lost 4294 */ 4295 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 4296 } 4297 4298 pte = huge_ptep_get_and_clear(mm, address, ptep); 4299 tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 4300 if (huge_pte_dirty(pte)) 4301 set_page_dirty(page); 4302 4303 hugetlb_count_sub(pages_per_huge_page(h), mm); 4304 page_remove_rmap(page, true); 4305 4306 spin_unlock(ptl); 4307 tlb_remove_page_size(tlb, page, huge_page_size(h)); 4308 /* 4309 * Bail out after unmapping reference page if supplied 4310 */ 4311 if (ref_page) 4312 break; 4313 } 4314 mmu_notifier_invalidate_range_end(&range); 4315 tlb_end_vma(tlb, vma); 4316 } 4317 4318 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 4319 struct vm_area_struct *vma, unsigned long start, 4320 unsigned long end, struct page *ref_page) 4321 { 4322 __unmap_hugepage_range(tlb, vma, start, end, ref_page); 4323 4324 /* 4325 * Clear this flag so that x86's huge_pmd_share page_table_shareable 4326 * test will fail on a vma being torn down, and not grab a page table 4327 * on its way out. We're lucky that the flag has such an appropriate 4328 * name, and can in fact be safely cleared here. We could clear it 4329 * before the __unmap_hugepage_range above, but all that's necessary 4330 * is to clear it before releasing the i_mmap_rwsem. This works 4331 * because in the context this is called, the VMA is about to be 4332 * destroyed and the i_mmap_rwsem is held. 4333 */ 4334 vma->vm_flags &= ~VM_MAYSHARE; 4335 } 4336 4337 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 4338 unsigned long end, struct page *ref_page) 4339 { 4340 struct mmu_gather tlb; 4341 4342 tlb_gather_mmu(&tlb, vma->vm_mm); 4343 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 4344 tlb_finish_mmu(&tlb); 4345 } 4346 4347 /* 4348 * This is called when the original mapper is failing to COW a MAP_PRIVATE 4349 * mapping it owns the reserve page for. The intention is to unmap the page 4350 * from other VMAs and let the children be SIGKILLed if they are faulting the 4351 * same region. 4352 */ 4353 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 4354 struct page *page, unsigned long address) 4355 { 4356 struct hstate *h = hstate_vma(vma); 4357 struct vm_area_struct *iter_vma; 4358 struct address_space *mapping; 4359 pgoff_t pgoff; 4360 4361 /* 4362 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 4363 * from page cache lookup which is in HPAGE_SIZE units. 4364 */ 4365 address = address & huge_page_mask(h); 4366 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 4367 vma->vm_pgoff; 4368 mapping = vma->vm_file->f_mapping; 4369 4370 /* 4371 * Take the mapping lock for the duration of the table walk. As 4372 * this mapping should be shared between all the VMAs, 4373 * __unmap_hugepage_range() is called as the lock is already held 4374 */ 4375 i_mmap_lock_write(mapping); 4376 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 4377 /* Do not unmap the current VMA */ 4378 if (iter_vma == vma) 4379 continue; 4380 4381 /* 4382 * Shared VMAs have their own reserves and do not affect 4383 * MAP_PRIVATE accounting but it is possible that a shared 4384 * VMA is using the same page so check and skip such VMAs. 4385 */ 4386 if (iter_vma->vm_flags & VM_MAYSHARE) 4387 continue; 4388 4389 /* 4390 * Unmap the page from other VMAs without their own reserves. 4391 * They get marked to be SIGKILLed if they fault in these 4392 * areas. This is because a future no-page fault on this VMA 4393 * could insert a zeroed page instead of the data existing 4394 * from the time of fork. This would look like data corruption 4395 */ 4396 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 4397 unmap_hugepage_range(iter_vma, address, 4398 address + huge_page_size(h), page); 4399 } 4400 i_mmap_unlock_write(mapping); 4401 } 4402 4403 /* 4404 * Hugetlb_cow() should be called with page lock of the original hugepage held. 4405 * Called with hugetlb_instantiation_mutex held and pte_page locked so we 4406 * cannot race with other handlers or page migration. 4407 * Keep the pte_same checks anyway to make transition from the mutex easier. 4408 */ 4409 static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 4410 unsigned long address, pte_t *ptep, 4411 struct page *pagecache_page, spinlock_t *ptl) 4412 { 4413 pte_t pte; 4414 struct hstate *h = hstate_vma(vma); 4415 struct page *old_page, *new_page; 4416 int outside_reserve = 0; 4417 vm_fault_t ret = 0; 4418 unsigned long haddr = address & huge_page_mask(h); 4419 struct mmu_notifier_range range; 4420 4421 pte = huge_ptep_get(ptep); 4422 old_page = pte_page(pte); 4423 4424 retry_avoidcopy: 4425 /* If no-one else is actually using this page, avoid the copy 4426 * and just make the page writable */ 4427 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 4428 page_move_anon_rmap(old_page, vma); 4429 set_huge_ptep_writable(vma, haddr, ptep); 4430 return 0; 4431 } 4432 4433 /* 4434 * If the process that created a MAP_PRIVATE mapping is about to 4435 * perform a COW due to a shared page count, attempt to satisfy 4436 * the allocation without using the existing reserves. The pagecache 4437 * page is used to determine if the reserve at this address was 4438 * consumed or not. If reserves were used, a partial faulted mapping 4439 * at the time of fork() could consume its reserves on COW instead 4440 * of the full address range. 4441 */ 4442 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 4443 old_page != pagecache_page) 4444 outside_reserve = 1; 4445 4446 get_page(old_page); 4447 4448 /* 4449 * Drop page table lock as buddy allocator may be called. It will 4450 * be acquired again before returning to the caller, as expected. 4451 */ 4452 spin_unlock(ptl); 4453 new_page = alloc_huge_page(vma, haddr, outside_reserve); 4454 4455 if (IS_ERR(new_page)) { 4456 /* 4457 * If a process owning a MAP_PRIVATE mapping fails to COW, 4458 * it is due to references held by a child and an insufficient 4459 * huge page pool. To guarantee the original mappers 4460 * reliability, unmap the page from child processes. The child 4461 * may get SIGKILLed if it later faults. 4462 */ 4463 if (outside_reserve) { 4464 struct address_space *mapping = vma->vm_file->f_mapping; 4465 pgoff_t idx; 4466 u32 hash; 4467 4468 put_page(old_page); 4469 BUG_ON(huge_pte_none(pte)); 4470 /* 4471 * Drop hugetlb_fault_mutex and i_mmap_rwsem before 4472 * unmapping. unmapping needs to hold i_mmap_rwsem 4473 * in write mode. Dropping i_mmap_rwsem in read mode 4474 * here is OK as COW mappings do not interact with 4475 * PMD sharing. 4476 * 4477 * Reacquire both after unmap operation. 4478 */ 4479 idx = vma_hugecache_offset(h, vma, haddr); 4480 hash = hugetlb_fault_mutex_hash(mapping, idx); 4481 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4482 i_mmap_unlock_read(mapping); 4483 4484 unmap_ref_private(mm, vma, old_page, haddr); 4485 4486 i_mmap_lock_read(mapping); 4487 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4488 spin_lock(ptl); 4489 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4490 if (likely(ptep && 4491 pte_same(huge_ptep_get(ptep), pte))) 4492 goto retry_avoidcopy; 4493 /* 4494 * race occurs while re-acquiring page table 4495 * lock, and our job is done. 4496 */ 4497 return 0; 4498 } 4499 4500 ret = vmf_error(PTR_ERR(new_page)); 4501 goto out_release_old; 4502 } 4503 4504 /* 4505 * When the original hugepage is shared one, it does not have 4506 * anon_vma prepared. 4507 */ 4508 if (unlikely(anon_vma_prepare(vma))) { 4509 ret = VM_FAULT_OOM; 4510 goto out_release_all; 4511 } 4512 4513 copy_user_huge_page(new_page, old_page, address, vma, 4514 pages_per_huge_page(h)); 4515 __SetPageUptodate(new_page); 4516 4517 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, 4518 haddr + huge_page_size(h)); 4519 mmu_notifier_invalidate_range_start(&range); 4520 4521 /* 4522 * Retake the page table lock to check for racing updates 4523 * before the page tables are altered 4524 */ 4525 spin_lock(ptl); 4526 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4527 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 4528 ClearHPageRestoreReserve(new_page); 4529 4530 /* Break COW */ 4531 huge_ptep_clear_flush(vma, haddr, ptep); 4532 mmu_notifier_invalidate_range(mm, range.start, range.end); 4533 set_huge_pte_at(mm, haddr, ptep, 4534 make_huge_pte(vma, new_page, 1)); 4535 page_remove_rmap(old_page, true); 4536 hugepage_add_new_anon_rmap(new_page, vma, haddr); 4537 SetHPageMigratable(new_page); 4538 /* Make the old page be freed below */ 4539 new_page = old_page; 4540 } 4541 spin_unlock(ptl); 4542 mmu_notifier_invalidate_range_end(&range); 4543 out_release_all: 4544 restore_reserve_on_error(h, vma, haddr, new_page); 4545 put_page(new_page); 4546 out_release_old: 4547 put_page(old_page); 4548 4549 spin_lock(ptl); /* Caller expects lock to be held */ 4550 return ret; 4551 } 4552 4553 /* Return the pagecache page at a given address within a VMA */ 4554 static struct page *hugetlbfs_pagecache_page(struct hstate *h, 4555 struct vm_area_struct *vma, unsigned long address) 4556 { 4557 struct address_space *mapping; 4558 pgoff_t idx; 4559 4560 mapping = vma->vm_file->f_mapping; 4561 idx = vma_hugecache_offset(h, vma, address); 4562 4563 return find_lock_page(mapping, idx); 4564 } 4565 4566 /* 4567 * Return whether there is a pagecache page to back given address within VMA. 4568 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 4569 */ 4570 static bool hugetlbfs_pagecache_present(struct hstate *h, 4571 struct vm_area_struct *vma, unsigned long address) 4572 { 4573 struct address_space *mapping; 4574 pgoff_t idx; 4575 struct page *page; 4576 4577 mapping = vma->vm_file->f_mapping; 4578 idx = vma_hugecache_offset(h, vma, address); 4579 4580 page = find_get_page(mapping, idx); 4581 if (page) 4582 put_page(page); 4583 return page != NULL; 4584 } 4585 4586 int huge_add_to_page_cache(struct page *page, struct address_space *mapping, 4587 pgoff_t idx) 4588 { 4589 struct inode *inode = mapping->host; 4590 struct hstate *h = hstate_inode(inode); 4591 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 4592 4593 if (err) 4594 return err; 4595 ClearHPageRestoreReserve(page); 4596 4597 /* 4598 * set page dirty so that it will not be removed from cache/file 4599 * by non-hugetlbfs specific code paths. 4600 */ 4601 set_page_dirty(page); 4602 4603 spin_lock(&inode->i_lock); 4604 inode->i_blocks += blocks_per_huge_page(h); 4605 spin_unlock(&inode->i_lock); 4606 return 0; 4607 } 4608 4609 static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, 4610 struct address_space *mapping, 4611 pgoff_t idx, 4612 unsigned int flags, 4613 unsigned long haddr, 4614 unsigned long reason) 4615 { 4616 vm_fault_t ret; 4617 u32 hash; 4618 struct vm_fault vmf = { 4619 .vma = vma, 4620 .address = haddr, 4621 .flags = flags, 4622 4623 /* 4624 * Hard to debug if it ends up being 4625 * used by a callee that assumes 4626 * something about the other 4627 * uninitialized fields... same as in 4628 * memory.c 4629 */ 4630 }; 4631 4632 /* 4633 * hugetlb_fault_mutex and i_mmap_rwsem must be 4634 * dropped before handling userfault. Reacquire 4635 * after handling fault to make calling code simpler. 4636 */ 4637 hash = hugetlb_fault_mutex_hash(mapping, idx); 4638 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4639 i_mmap_unlock_read(mapping); 4640 ret = handle_userfault(&vmf, reason); 4641 i_mmap_lock_read(mapping); 4642 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4643 4644 return ret; 4645 } 4646 4647 static vm_fault_t hugetlb_no_page(struct mm_struct *mm, 4648 struct vm_area_struct *vma, 4649 struct address_space *mapping, pgoff_t idx, 4650 unsigned long address, pte_t *ptep, unsigned int flags) 4651 { 4652 struct hstate *h = hstate_vma(vma); 4653 vm_fault_t ret = VM_FAULT_SIGBUS; 4654 int anon_rmap = 0; 4655 unsigned long size; 4656 struct page *page; 4657 pte_t new_pte; 4658 spinlock_t *ptl; 4659 unsigned long haddr = address & huge_page_mask(h); 4660 bool new_page = false; 4661 4662 /* 4663 * Currently, we are forced to kill the process in the event the 4664 * original mapper has unmapped pages from the child due to a failed 4665 * COW. Warn that such a situation has occurred as it may not be obvious 4666 */ 4667 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 4668 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 4669 current->pid); 4670 return ret; 4671 } 4672 4673 /* 4674 * We can not race with truncation due to holding i_mmap_rwsem. 4675 * i_size is modified when holding i_mmap_rwsem, so check here 4676 * once for faults beyond end of file. 4677 */ 4678 size = i_size_read(mapping->host) >> huge_page_shift(h); 4679 if (idx >= size) 4680 goto out; 4681 4682 retry: 4683 page = find_lock_page(mapping, idx); 4684 if (!page) { 4685 /* Check for page in userfault range */ 4686 if (userfaultfd_missing(vma)) { 4687 ret = hugetlb_handle_userfault(vma, mapping, idx, 4688 flags, haddr, 4689 VM_UFFD_MISSING); 4690 goto out; 4691 } 4692 4693 page = alloc_huge_page(vma, haddr, 0); 4694 if (IS_ERR(page)) { 4695 /* 4696 * Returning error will result in faulting task being 4697 * sent SIGBUS. The hugetlb fault mutex prevents two 4698 * tasks from racing to fault in the same page which 4699 * could result in false unable to allocate errors. 4700 * Page migration does not take the fault mutex, but 4701 * does a clear then write of pte's under page table 4702 * lock. Page fault code could race with migration, 4703 * notice the clear pte and try to allocate a page 4704 * here. Before returning error, get ptl and make 4705 * sure there really is no pte entry. 4706 */ 4707 ptl = huge_pte_lock(h, mm, ptep); 4708 ret = 0; 4709 if (huge_pte_none(huge_ptep_get(ptep))) 4710 ret = vmf_error(PTR_ERR(page)); 4711 spin_unlock(ptl); 4712 goto out; 4713 } 4714 clear_huge_page(page, address, pages_per_huge_page(h)); 4715 __SetPageUptodate(page); 4716 new_page = true; 4717 4718 if (vma->vm_flags & VM_MAYSHARE) { 4719 int err = huge_add_to_page_cache(page, mapping, idx); 4720 if (err) { 4721 put_page(page); 4722 if (err == -EEXIST) 4723 goto retry; 4724 goto out; 4725 } 4726 } else { 4727 lock_page(page); 4728 if (unlikely(anon_vma_prepare(vma))) { 4729 ret = VM_FAULT_OOM; 4730 goto backout_unlocked; 4731 } 4732 anon_rmap = 1; 4733 } 4734 } else { 4735 /* 4736 * If memory error occurs between mmap() and fault, some process 4737 * don't have hwpoisoned swap entry for errored virtual address. 4738 * So we need to block hugepage fault by PG_hwpoison bit check. 4739 */ 4740 if (unlikely(PageHWPoison(page))) { 4741 ret = VM_FAULT_HWPOISON_LARGE | 4742 VM_FAULT_SET_HINDEX(hstate_index(h)); 4743 goto backout_unlocked; 4744 } 4745 4746 /* Check for page in userfault range. */ 4747 if (userfaultfd_minor(vma)) { 4748 unlock_page(page); 4749 put_page(page); 4750 ret = hugetlb_handle_userfault(vma, mapping, idx, 4751 flags, haddr, 4752 VM_UFFD_MINOR); 4753 goto out; 4754 } 4755 } 4756 4757 /* 4758 * If we are going to COW a private mapping later, we examine the 4759 * pending reservations for this page now. This will ensure that 4760 * any allocations necessary to record that reservation occur outside 4761 * the spinlock. 4762 */ 4763 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 4764 if (vma_needs_reservation(h, vma, haddr) < 0) { 4765 ret = VM_FAULT_OOM; 4766 goto backout_unlocked; 4767 } 4768 /* Just decrements count, does not deallocate */ 4769 vma_end_reservation(h, vma, haddr); 4770 } 4771 4772 ptl = huge_pte_lock(h, mm, ptep); 4773 ret = 0; 4774 if (!huge_pte_none(huge_ptep_get(ptep))) 4775 goto backout; 4776 4777 if (anon_rmap) { 4778 ClearHPageRestoreReserve(page); 4779 hugepage_add_new_anon_rmap(page, vma, haddr); 4780 } else 4781 page_dup_rmap(page, true); 4782 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 4783 && (vma->vm_flags & VM_SHARED))); 4784 set_huge_pte_at(mm, haddr, ptep, new_pte); 4785 4786 hugetlb_count_add(pages_per_huge_page(h), mm); 4787 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 4788 /* Optimization, do the COW without a second fault */ 4789 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); 4790 } 4791 4792 spin_unlock(ptl); 4793 4794 /* 4795 * Only set HPageMigratable in newly allocated pages. Existing pages 4796 * found in the pagecache may not have HPageMigratableset if they have 4797 * been isolated for migration. 4798 */ 4799 if (new_page) 4800 SetHPageMigratable(page); 4801 4802 unlock_page(page); 4803 out: 4804 return ret; 4805 4806 backout: 4807 spin_unlock(ptl); 4808 backout_unlocked: 4809 unlock_page(page); 4810 restore_reserve_on_error(h, vma, haddr, page); 4811 put_page(page); 4812 goto out; 4813 } 4814 4815 #ifdef CONFIG_SMP 4816 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 4817 { 4818 unsigned long key[2]; 4819 u32 hash; 4820 4821 key[0] = (unsigned long) mapping; 4822 key[1] = idx; 4823 4824 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); 4825 4826 return hash & (num_fault_mutexes - 1); 4827 } 4828 #else 4829 /* 4830 * For uniprocessor systems we always use a single mutex, so just 4831 * return 0 and avoid the hashing overhead. 4832 */ 4833 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 4834 { 4835 return 0; 4836 } 4837 #endif 4838 4839 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 4840 unsigned long address, unsigned int flags) 4841 { 4842 pte_t *ptep, entry; 4843 spinlock_t *ptl; 4844 vm_fault_t ret; 4845 u32 hash; 4846 pgoff_t idx; 4847 struct page *page = NULL; 4848 struct page *pagecache_page = NULL; 4849 struct hstate *h = hstate_vma(vma); 4850 struct address_space *mapping; 4851 int need_wait_lock = 0; 4852 unsigned long haddr = address & huge_page_mask(h); 4853 4854 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4855 if (ptep) { 4856 /* 4857 * Since we hold no locks, ptep could be stale. That is 4858 * OK as we are only making decisions based on content and 4859 * not actually modifying content here. 4860 */ 4861 entry = huge_ptep_get(ptep); 4862 if (unlikely(is_hugetlb_entry_migration(entry))) { 4863 migration_entry_wait_huge(vma, mm, ptep); 4864 return 0; 4865 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 4866 return VM_FAULT_HWPOISON_LARGE | 4867 VM_FAULT_SET_HINDEX(hstate_index(h)); 4868 } 4869 4870 /* 4871 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold 4872 * until finished with ptep. This serves two purposes: 4873 * 1) It prevents huge_pmd_unshare from being called elsewhere 4874 * and making the ptep no longer valid. 4875 * 2) It synchronizes us with i_size modifications during truncation. 4876 * 4877 * ptep could have already be assigned via huge_pte_offset. That 4878 * is OK, as huge_pte_alloc will return the same value unless 4879 * something has changed. 4880 */ 4881 mapping = vma->vm_file->f_mapping; 4882 i_mmap_lock_read(mapping); 4883 ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); 4884 if (!ptep) { 4885 i_mmap_unlock_read(mapping); 4886 return VM_FAULT_OOM; 4887 } 4888 4889 /* 4890 * Serialize hugepage allocation and instantiation, so that we don't 4891 * get spurious allocation failures if two CPUs race to instantiate 4892 * the same page in the page cache. 4893 */ 4894 idx = vma_hugecache_offset(h, vma, haddr); 4895 hash = hugetlb_fault_mutex_hash(mapping, idx); 4896 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4897 4898 entry = huge_ptep_get(ptep); 4899 if (huge_pte_none(entry)) { 4900 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 4901 goto out_mutex; 4902 } 4903 4904 ret = 0; 4905 4906 /* 4907 * entry could be a migration/hwpoison entry at this point, so this 4908 * check prevents the kernel from going below assuming that we have 4909 * an active hugepage in pagecache. This goto expects the 2nd page 4910 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will 4911 * properly handle it. 4912 */ 4913 if (!pte_present(entry)) 4914 goto out_mutex; 4915 4916 /* 4917 * If we are going to COW the mapping later, we examine the pending 4918 * reservations for this page now. This will ensure that any 4919 * allocations necessary to record that reservation occur outside the 4920 * spinlock. For private mappings, we also lookup the pagecache 4921 * page now as it is used to determine if a reservation has been 4922 * consumed. 4923 */ 4924 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 4925 if (vma_needs_reservation(h, vma, haddr) < 0) { 4926 ret = VM_FAULT_OOM; 4927 goto out_mutex; 4928 } 4929 /* Just decrements count, does not deallocate */ 4930 vma_end_reservation(h, vma, haddr); 4931 4932 if (!(vma->vm_flags & VM_MAYSHARE)) 4933 pagecache_page = hugetlbfs_pagecache_page(h, 4934 vma, haddr); 4935 } 4936 4937 ptl = huge_pte_lock(h, mm, ptep); 4938 4939 /* Check for a racing update before calling hugetlb_cow */ 4940 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 4941 goto out_ptl; 4942 4943 /* 4944 * hugetlb_cow() requires page locks of pte_page(entry) and 4945 * pagecache_page, so here we need take the former one 4946 * when page != pagecache_page or !pagecache_page. 4947 */ 4948 page = pte_page(entry); 4949 if (page != pagecache_page) 4950 if (!trylock_page(page)) { 4951 need_wait_lock = 1; 4952 goto out_ptl; 4953 } 4954 4955 get_page(page); 4956 4957 if (flags & FAULT_FLAG_WRITE) { 4958 if (!huge_pte_write(entry)) { 4959 ret = hugetlb_cow(mm, vma, address, ptep, 4960 pagecache_page, ptl); 4961 goto out_put_page; 4962 } 4963 entry = huge_pte_mkdirty(entry); 4964 } 4965 entry = pte_mkyoung(entry); 4966 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, 4967 flags & FAULT_FLAG_WRITE)) 4968 update_mmu_cache(vma, haddr, ptep); 4969 out_put_page: 4970 if (page != pagecache_page) 4971 unlock_page(page); 4972 put_page(page); 4973 out_ptl: 4974 spin_unlock(ptl); 4975 4976 if (pagecache_page) { 4977 unlock_page(pagecache_page); 4978 put_page(pagecache_page); 4979 } 4980 out_mutex: 4981 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4982 i_mmap_unlock_read(mapping); 4983 /* 4984 * Generally it's safe to hold refcount during waiting page lock. But 4985 * here we just wait to defer the next page fault to avoid busy loop and 4986 * the page is not used after unlocked before returning from the current 4987 * page fault. So we are safe from accessing freed page, even if we wait 4988 * here without taking refcount. 4989 */ 4990 if (need_wait_lock) 4991 wait_on_page_locked(page); 4992 return ret; 4993 } 4994 4995 #ifdef CONFIG_USERFAULTFD 4996 /* 4997 * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with 4998 * modifications for huge pages. 4999 */ 5000 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, 5001 pte_t *dst_pte, 5002 struct vm_area_struct *dst_vma, 5003 unsigned long dst_addr, 5004 unsigned long src_addr, 5005 enum mcopy_atomic_mode mode, 5006 struct page **pagep) 5007 { 5008 bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); 5009 struct address_space *mapping; 5010 pgoff_t idx; 5011 unsigned long size; 5012 int vm_shared = dst_vma->vm_flags & VM_SHARED; 5013 struct hstate *h = hstate_vma(dst_vma); 5014 pte_t _dst_pte; 5015 spinlock_t *ptl; 5016 int ret; 5017 struct page *page; 5018 int writable; 5019 5020 mapping = dst_vma->vm_file->f_mapping; 5021 idx = vma_hugecache_offset(h, dst_vma, dst_addr); 5022 5023 if (is_continue) { 5024 ret = -EFAULT; 5025 page = find_lock_page(mapping, idx); 5026 if (!page) 5027 goto out; 5028 } else if (!*pagep) { 5029 /* If a page already exists, then it's UFFDIO_COPY for 5030 * a non-missing case. Return -EEXIST. 5031 */ 5032 if (vm_shared && 5033 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 5034 ret = -EEXIST; 5035 goto out; 5036 } 5037 5038 page = alloc_huge_page(dst_vma, dst_addr, 0); 5039 if (IS_ERR(page)) { 5040 ret = -ENOMEM; 5041 goto out; 5042 } 5043 5044 ret = copy_huge_page_from_user(page, 5045 (const void __user *) src_addr, 5046 pages_per_huge_page(h), false); 5047 5048 /* fallback to copy_from_user outside mmap_lock */ 5049 if (unlikely(ret)) { 5050 ret = -ENOENT; 5051 *pagep = page; 5052 /* don't free the page */ 5053 goto out; 5054 } 5055 } else { 5056 page = *pagep; 5057 *pagep = NULL; 5058 } 5059 5060 /* 5061 * The memory barrier inside __SetPageUptodate makes sure that 5062 * preceding stores to the page contents become visible before 5063 * the set_pte_at() write. 5064 */ 5065 __SetPageUptodate(page); 5066 5067 /* Add shared, newly allocated pages to the page cache. */ 5068 if (vm_shared && !is_continue) { 5069 size = i_size_read(mapping->host) >> huge_page_shift(h); 5070 ret = -EFAULT; 5071 if (idx >= size) 5072 goto out_release_nounlock; 5073 5074 /* 5075 * Serialization between remove_inode_hugepages() and 5076 * huge_add_to_page_cache() below happens through the 5077 * hugetlb_fault_mutex_table that here must be hold by 5078 * the caller. 5079 */ 5080 ret = huge_add_to_page_cache(page, mapping, idx); 5081 if (ret) 5082 goto out_release_nounlock; 5083 } 5084 5085 ptl = huge_pte_lockptr(h, dst_mm, dst_pte); 5086 spin_lock(ptl); 5087 5088 /* 5089 * Recheck the i_size after holding PT lock to make sure not 5090 * to leave any page mapped (as page_mapped()) beyond the end 5091 * of the i_size (remove_inode_hugepages() is strict about 5092 * enforcing that). If we bail out here, we'll also leave a 5093 * page in the radix tree in the vm_shared case beyond the end 5094 * of the i_size, but remove_inode_hugepages() will take care 5095 * of it as soon as we drop the hugetlb_fault_mutex_table. 5096 */ 5097 size = i_size_read(mapping->host) >> huge_page_shift(h); 5098 ret = -EFAULT; 5099 if (idx >= size) 5100 goto out_release_unlock; 5101 5102 ret = -EEXIST; 5103 if (!huge_pte_none(huge_ptep_get(dst_pte))) 5104 goto out_release_unlock; 5105 5106 if (vm_shared) { 5107 page_dup_rmap(page, true); 5108 } else { 5109 ClearHPageRestoreReserve(page); 5110 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); 5111 } 5112 5113 /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ 5114 if (is_continue && !vm_shared) 5115 writable = 0; 5116 else 5117 writable = dst_vma->vm_flags & VM_WRITE; 5118 5119 _dst_pte = make_huge_pte(dst_vma, page, writable); 5120 if (writable) 5121 _dst_pte = huge_pte_mkdirty(_dst_pte); 5122 _dst_pte = pte_mkyoung(_dst_pte); 5123 5124 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 5125 5126 (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, 5127 dst_vma->vm_flags & VM_WRITE); 5128 hugetlb_count_add(pages_per_huge_page(h), dst_mm); 5129 5130 /* No need to invalidate - it was non-present before */ 5131 update_mmu_cache(dst_vma, dst_addr, dst_pte); 5132 5133 spin_unlock(ptl); 5134 if (!is_continue) 5135 SetHPageMigratable(page); 5136 if (vm_shared || is_continue) 5137 unlock_page(page); 5138 ret = 0; 5139 out: 5140 return ret; 5141 out_release_unlock: 5142 spin_unlock(ptl); 5143 if (vm_shared || is_continue) 5144 unlock_page(page); 5145 out_release_nounlock: 5146 restore_reserve_on_error(h, dst_vma, dst_addr, page); 5147 put_page(page); 5148 goto out; 5149 } 5150 #endif /* CONFIG_USERFAULTFD */ 5151 5152 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, 5153 int refs, struct page **pages, 5154 struct vm_area_struct **vmas) 5155 { 5156 int nr; 5157 5158 for (nr = 0; nr < refs; nr++) { 5159 if (likely(pages)) 5160 pages[nr] = mem_map_offset(page, nr); 5161 if (vmas) 5162 vmas[nr] = vma; 5163 } 5164 } 5165 5166 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 5167 struct page **pages, struct vm_area_struct **vmas, 5168 unsigned long *position, unsigned long *nr_pages, 5169 long i, unsigned int flags, int *locked) 5170 { 5171 unsigned long pfn_offset; 5172 unsigned long vaddr = *position; 5173 unsigned long remainder = *nr_pages; 5174 struct hstate *h = hstate_vma(vma); 5175 int err = -EFAULT, refs; 5176 5177 while (vaddr < vma->vm_end && remainder) { 5178 pte_t *pte; 5179 spinlock_t *ptl = NULL; 5180 int absent; 5181 struct page *page; 5182 5183 /* 5184 * If we have a pending SIGKILL, don't keep faulting pages and 5185 * potentially allocating memory. 5186 */ 5187 if (fatal_signal_pending(current)) { 5188 remainder = 0; 5189 break; 5190 } 5191 5192 /* 5193 * Some archs (sparc64, sh*) have multiple pte_ts to 5194 * each hugepage. We have to make sure we get the 5195 * first, for the page indexing below to work. 5196 * 5197 * Note that page table lock is not held when pte is null. 5198 */ 5199 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), 5200 huge_page_size(h)); 5201 if (pte) 5202 ptl = huge_pte_lock(h, mm, pte); 5203 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 5204 5205 /* 5206 * When coredumping, it suits get_dump_page if we just return 5207 * an error where there's an empty slot with no huge pagecache 5208 * to back it. This way, we avoid allocating a hugepage, and 5209 * the sparse dumpfile avoids allocating disk blocks, but its 5210 * huge holes still show up with zeroes where they need to be. 5211 */ 5212 if (absent && (flags & FOLL_DUMP) && 5213 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 5214 if (pte) 5215 spin_unlock(ptl); 5216 remainder = 0; 5217 break; 5218 } 5219 5220 /* 5221 * We need call hugetlb_fault for both hugepages under migration 5222 * (in which case hugetlb_fault waits for the migration,) and 5223 * hwpoisoned hugepages (in which case we need to prevent the 5224 * caller from accessing to them.) In order to do this, we use 5225 * here is_swap_pte instead of is_hugetlb_entry_migration and 5226 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 5227 * both cases, and because we can't follow correct pages 5228 * directly from any kind of swap entries. 5229 */ 5230 if (absent || is_swap_pte(huge_ptep_get(pte)) || 5231 ((flags & FOLL_WRITE) && 5232 !huge_pte_write(huge_ptep_get(pte)))) { 5233 vm_fault_t ret; 5234 unsigned int fault_flags = 0; 5235 5236 if (pte) 5237 spin_unlock(ptl); 5238 if (flags & FOLL_WRITE) 5239 fault_flags |= FAULT_FLAG_WRITE; 5240 if (locked) 5241 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 5242 FAULT_FLAG_KILLABLE; 5243 if (flags & FOLL_NOWAIT) 5244 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 5245 FAULT_FLAG_RETRY_NOWAIT; 5246 if (flags & FOLL_TRIED) { 5247 /* 5248 * Note: FAULT_FLAG_ALLOW_RETRY and 5249 * FAULT_FLAG_TRIED can co-exist 5250 */ 5251 fault_flags |= FAULT_FLAG_TRIED; 5252 } 5253 ret = hugetlb_fault(mm, vma, vaddr, fault_flags); 5254 if (ret & VM_FAULT_ERROR) { 5255 err = vm_fault_to_errno(ret, flags); 5256 remainder = 0; 5257 break; 5258 } 5259 if (ret & VM_FAULT_RETRY) { 5260 if (locked && 5261 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) 5262 *locked = 0; 5263 *nr_pages = 0; 5264 /* 5265 * VM_FAULT_RETRY must not return an 5266 * error, it will return zero 5267 * instead. 5268 * 5269 * No need to update "position" as the 5270 * caller will not check it after 5271 * *nr_pages is set to 0. 5272 */ 5273 return i; 5274 } 5275 continue; 5276 } 5277 5278 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 5279 page = pte_page(huge_ptep_get(pte)); 5280 5281 /* 5282 * If subpage information not requested, update counters 5283 * and skip the same_page loop below. 5284 */ 5285 if (!pages && !vmas && !pfn_offset && 5286 (vaddr + huge_page_size(h) < vma->vm_end) && 5287 (remainder >= pages_per_huge_page(h))) { 5288 vaddr += huge_page_size(h); 5289 remainder -= pages_per_huge_page(h); 5290 i += pages_per_huge_page(h); 5291 spin_unlock(ptl); 5292 continue; 5293 } 5294 5295 refs = min3(pages_per_huge_page(h) - pfn_offset, 5296 (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder); 5297 5298 if (pages || vmas) 5299 record_subpages_vmas(mem_map_offset(page, pfn_offset), 5300 vma, refs, 5301 likely(pages) ? pages + i : NULL, 5302 vmas ? vmas + i : NULL); 5303 5304 if (pages) { 5305 /* 5306 * try_grab_compound_head() should always succeed here, 5307 * because: a) we hold the ptl lock, and b) we've just 5308 * checked that the huge page is present in the page 5309 * tables. If the huge page is present, then the tail 5310 * pages must also be present. The ptl prevents the 5311 * head page and tail pages from being rearranged in 5312 * any way. So this page must be available at this 5313 * point, unless the page refcount overflowed: 5314 */ 5315 if (WARN_ON_ONCE(!try_grab_compound_head(pages[i], 5316 refs, 5317 flags))) { 5318 spin_unlock(ptl); 5319 remainder = 0; 5320 err = -ENOMEM; 5321 break; 5322 } 5323 } 5324 5325 vaddr += (refs << PAGE_SHIFT); 5326 remainder -= refs; 5327 i += refs; 5328 5329 spin_unlock(ptl); 5330 } 5331 *nr_pages = remainder; 5332 /* 5333 * setting position is actually required only if remainder is 5334 * not zero but it's faster not to add a "if (remainder)" 5335 * branch. 5336 */ 5337 *position = vaddr; 5338 5339 return i ? i : err; 5340 } 5341 5342 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 5343 unsigned long address, unsigned long end, pgprot_t newprot) 5344 { 5345 struct mm_struct *mm = vma->vm_mm; 5346 unsigned long start = address; 5347 pte_t *ptep; 5348 pte_t pte; 5349 struct hstate *h = hstate_vma(vma); 5350 unsigned long pages = 0; 5351 bool shared_pmd = false; 5352 struct mmu_notifier_range range; 5353 5354 /* 5355 * In the case of shared PMDs, the area to flush could be beyond 5356 * start/end. Set range.start/range.end to cover the maximum possible 5357 * range if PMD sharing is possible. 5358 */ 5359 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 5360 0, vma, mm, start, end); 5361 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 5362 5363 BUG_ON(address >= end); 5364 flush_cache_range(vma, range.start, range.end); 5365 5366 mmu_notifier_invalidate_range_start(&range); 5367 i_mmap_lock_write(vma->vm_file->f_mapping); 5368 for (; address < end; address += huge_page_size(h)) { 5369 spinlock_t *ptl; 5370 ptep = huge_pte_offset(mm, address, huge_page_size(h)); 5371 if (!ptep) 5372 continue; 5373 ptl = huge_pte_lock(h, mm, ptep); 5374 if (huge_pmd_unshare(mm, vma, &address, ptep)) { 5375 pages++; 5376 spin_unlock(ptl); 5377 shared_pmd = true; 5378 continue; 5379 } 5380 pte = huge_ptep_get(ptep); 5381 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 5382 spin_unlock(ptl); 5383 continue; 5384 } 5385 if (unlikely(is_hugetlb_entry_migration(pte))) { 5386 swp_entry_t entry = pte_to_swp_entry(pte); 5387 5388 if (is_write_migration_entry(entry)) { 5389 pte_t newpte; 5390 5391 make_migration_entry_read(&entry); 5392 newpte = swp_entry_to_pte(entry); 5393 set_huge_swap_pte_at(mm, address, ptep, 5394 newpte, huge_page_size(h)); 5395 pages++; 5396 } 5397 spin_unlock(ptl); 5398 continue; 5399 } 5400 if (!huge_pte_none(pte)) { 5401 pte_t old_pte; 5402 5403 old_pte = huge_ptep_modify_prot_start(vma, address, ptep); 5404 pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); 5405 pte = arch_make_huge_pte(pte, vma, NULL, 0); 5406 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); 5407 pages++; 5408 } 5409 spin_unlock(ptl); 5410 } 5411 /* 5412 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 5413 * may have cleared our pud entry and done put_page on the page table: 5414 * once we release i_mmap_rwsem, another task can do the final put_page 5415 * and that page table be reused and filled with junk. If we actually 5416 * did unshare a page of pmds, flush the range corresponding to the pud. 5417 */ 5418 if (shared_pmd) 5419 flush_hugetlb_tlb_range(vma, range.start, range.end); 5420 else 5421 flush_hugetlb_tlb_range(vma, start, end); 5422 /* 5423 * No need to call mmu_notifier_invalidate_range() we are downgrading 5424 * page table protection not changing it to point to a new page. 5425 * 5426 * See Documentation/vm/mmu_notifier.rst 5427 */ 5428 i_mmap_unlock_write(vma->vm_file->f_mapping); 5429 mmu_notifier_invalidate_range_end(&range); 5430 5431 return pages << h->order; 5432 } 5433 5434 /* Return true if reservation was successful, false otherwise. */ 5435 bool hugetlb_reserve_pages(struct inode *inode, 5436 long from, long to, 5437 struct vm_area_struct *vma, 5438 vm_flags_t vm_flags) 5439 { 5440 long chg, add = -1; 5441 struct hstate *h = hstate_inode(inode); 5442 struct hugepage_subpool *spool = subpool_inode(inode); 5443 struct resv_map *resv_map; 5444 struct hugetlb_cgroup *h_cg = NULL; 5445 long gbl_reserve, regions_needed = 0; 5446 5447 /* This should never happen */ 5448 if (from > to) { 5449 VM_WARN(1, "%s called with a negative range\n", __func__); 5450 return false; 5451 } 5452 5453 /* 5454 * Only apply hugepage reservation if asked. At fault time, an 5455 * attempt will be made for VM_NORESERVE to allocate a page 5456 * without using reserves 5457 */ 5458 if (vm_flags & VM_NORESERVE) 5459 return true; 5460 5461 /* 5462 * Shared mappings base their reservation on the number of pages that 5463 * are already allocated on behalf of the file. Private mappings need 5464 * to reserve the full area even if read-only as mprotect() may be 5465 * called to make the mapping read-write. Assume !vma is a shm mapping 5466 */ 5467 if (!vma || vma->vm_flags & VM_MAYSHARE) { 5468 /* 5469 * resv_map can not be NULL as hugetlb_reserve_pages is only 5470 * called for inodes for which resv_maps were created (see 5471 * hugetlbfs_get_inode). 5472 */ 5473 resv_map = inode_resv_map(inode); 5474 5475 chg = region_chg(resv_map, from, to, ®ions_needed); 5476 5477 } else { 5478 /* Private mapping. */ 5479 resv_map = resv_map_alloc(); 5480 if (!resv_map) 5481 return false; 5482 5483 chg = to - from; 5484 5485 set_vma_resv_map(vma, resv_map); 5486 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 5487 } 5488 5489 if (chg < 0) 5490 goto out_err; 5491 5492 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), 5493 chg * pages_per_huge_page(h), &h_cg) < 0) 5494 goto out_err; 5495 5496 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 5497 /* For private mappings, the hugetlb_cgroup uncharge info hangs 5498 * of the resv_map. 5499 */ 5500 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 5501 } 5502 5503 /* 5504 * There must be enough pages in the subpool for the mapping. If 5505 * the subpool has a minimum size, there may be some global 5506 * reservations already in place (gbl_reserve). 5507 */ 5508 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 5509 if (gbl_reserve < 0) 5510 goto out_uncharge_cgroup; 5511 5512 /* 5513 * Check enough hugepages are available for the reservation. 5514 * Hand the pages back to the subpool if there are not 5515 */ 5516 if (hugetlb_acct_memory(h, gbl_reserve) < 0) 5517 goto out_put_pages; 5518 5519 /* 5520 * Account for the reservations made. Shared mappings record regions 5521 * that have reservations as they are shared by multiple VMAs. 5522 * When the last VMA disappears, the region map says how much 5523 * the reservation was and the page cache tells how much of 5524 * the reservation was consumed. Private mappings are per-VMA and 5525 * only the consumed reservations are tracked. When the VMA 5526 * disappears, the original reservation is the VMA size and the 5527 * consumed reservations are stored in the map. Hence, nothing 5528 * else has to be done for private mappings here 5529 */ 5530 if (!vma || vma->vm_flags & VM_MAYSHARE) { 5531 add = region_add(resv_map, from, to, regions_needed, h, h_cg); 5532 5533 if (unlikely(add < 0)) { 5534 hugetlb_acct_memory(h, -gbl_reserve); 5535 goto out_put_pages; 5536 } else if (unlikely(chg > add)) { 5537 /* 5538 * pages in this range were added to the reserve 5539 * map between region_chg and region_add. This 5540 * indicates a race with alloc_huge_page. Adjust 5541 * the subpool and reserve counts modified above 5542 * based on the difference. 5543 */ 5544 long rsv_adjust; 5545 5546 /* 5547 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the 5548 * reference to h_cg->css. See comment below for detail. 5549 */ 5550 hugetlb_cgroup_uncharge_cgroup_rsvd( 5551 hstate_index(h), 5552 (chg - add) * pages_per_huge_page(h), h_cg); 5553 5554 rsv_adjust = hugepage_subpool_put_pages(spool, 5555 chg - add); 5556 hugetlb_acct_memory(h, -rsv_adjust); 5557 } else if (h_cg) { 5558 /* 5559 * The file_regions will hold their own reference to 5560 * h_cg->css. So we should release the reference held 5561 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are 5562 * done. 5563 */ 5564 hugetlb_cgroup_put_rsvd_cgroup(h_cg); 5565 } 5566 } 5567 return true; 5568 5569 out_put_pages: 5570 /* put back original number of pages, chg */ 5571 (void)hugepage_subpool_put_pages(spool, chg); 5572 out_uncharge_cgroup: 5573 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 5574 chg * pages_per_huge_page(h), h_cg); 5575 out_err: 5576 if (!vma || vma->vm_flags & VM_MAYSHARE) 5577 /* Only call region_abort if the region_chg succeeded but the 5578 * region_add failed or didn't run. 5579 */ 5580 if (chg >= 0 && add < 0) 5581 region_abort(resv_map, from, to, regions_needed); 5582 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 5583 kref_put(&resv_map->refs, resv_map_release); 5584 return false; 5585 } 5586 5587 long hugetlb_unreserve_pages(struct inode *inode, long start, long end, 5588 long freed) 5589 { 5590 struct hstate *h = hstate_inode(inode); 5591 struct resv_map *resv_map = inode_resv_map(inode); 5592 long chg = 0; 5593 struct hugepage_subpool *spool = subpool_inode(inode); 5594 long gbl_reserve; 5595 5596 /* 5597 * Since this routine can be called in the evict inode path for all 5598 * hugetlbfs inodes, resv_map could be NULL. 5599 */ 5600 if (resv_map) { 5601 chg = region_del(resv_map, start, end); 5602 /* 5603 * region_del() can fail in the rare case where a region 5604 * must be split and another region descriptor can not be 5605 * allocated. If end == LONG_MAX, it will not fail. 5606 */ 5607 if (chg < 0) 5608 return chg; 5609 } 5610 5611 spin_lock(&inode->i_lock); 5612 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 5613 spin_unlock(&inode->i_lock); 5614 5615 /* 5616 * If the subpool has a minimum size, the number of global 5617 * reservations to be released may be adjusted. 5618 * 5619 * Note that !resv_map implies freed == 0. So (chg - freed) 5620 * won't go negative. 5621 */ 5622 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 5623 hugetlb_acct_memory(h, -gbl_reserve); 5624 5625 return 0; 5626 } 5627 5628 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 5629 static unsigned long page_table_shareable(struct vm_area_struct *svma, 5630 struct vm_area_struct *vma, 5631 unsigned long addr, pgoff_t idx) 5632 { 5633 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 5634 svma->vm_start; 5635 unsigned long sbase = saddr & PUD_MASK; 5636 unsigned long s_end = sbase + PUD_SIZE; 5637 5638 /* Allow segments to share if only one is marked locked */ 5639 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; 5640 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; 5641 5642 /* 5643 * match the virtual addresses, permission and the alignment of the 5644 * page table page. 5645 */ 5646 if (pmd_index(addr) != pmd_index(saddr) || 5647 vm_flags != svm_flags || 5648 !range_in_vma(svma, sbase, s_end)) 5649 return 0; 5650 5651 return saddr; 5652 } 5653 5654 static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) 5655 { 5656 unsigned long base = addr & PUD_MASK; 5657 unsigned long end = base + PUD_SIZE; 5658 5659 /* 5660 * check on proper vm_flags and page table alignment 5661 */ 5662 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) 5663 return true; 5664 return false; 5665 } 5666 5667 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) 5668 { 5669 #ifdef CONFIG_USERFAULTFD 5670 if (uffd_disable_huge_pmd_share(vma)) 5671 return false; 5672 #endif 5673 return vma_shareable(vma, addr); 5674 } 5675 5676 /* 5677 * Determine if start,end range within vma could be mapped by shared pmd. 5678 * If yes, adjust start and end to cover range associated with possible 5679 * shared pmd mappings. 5680 */ 5681 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 5682 unsigned long *start, unsigned long *end) 5683 { 5684 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), 5685 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); 5686 5687 /* 5688 * vma needs to span at least one aligned PUD size, and the range 5689 * must be at least partially within in. 5690 */ 5691 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || 5692 (*end <= v_start) || (*start >= v_end)) 5693 return; 5694 5695 /* Extend the range to be PUD aligned for a worst case scenario */ 5696 if (*start > v_start) 5697 *start = ALIGN_DOWN(*start, PUD_SIZE); 5698 5699 if (*end < v_end) 5700 *end = ALIGN(*end, PUD_SIZE); 5701 } 5702 5703 /* 5704 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 5705 * and returns the corresponding pte. While this is not necessary for the 5706 * !shared pmd case because we can allocate the pmd later as well, it makes the 5707 * code much cleaner. 5708 * 5709 * This routine must be called with i_mmap_rwsem held in at least read mode if 5710 * sharing is possible. For hugetlbfs, this prevents removal of any page 5711 * table entries associated with the address space. This is important as we 5712 * are setting up sharing based on existing page table entries (mappings). 5713 * 5714 * NOTE: This routine is only called from huge_pte_alloc. Some callers of 5715 * huge_pte_alloc know that sharing is not possible and do not take 5716 * i_mmap_rwsem as a performance optimization. This is handled by the 5717 * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is 5718 * only required for subsequent processing. 5719 */ 5720 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, 5721 unsigned long addr, pud_t *pud) 5722 { 5723 struct address_space *mapping = vma->vm_file->f_mapping; 5724 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 5725 vma->vm_pgoff; 5726 struct vm_area_struct *svma; 5727 unsigned long saddr; 5728 pte_t *spte = NULL; 5729 pte_t *pte; 5730 spinlock_t *ptl; 5731 5732 i_mmap_assert_locked(mapping); 5733 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 5734 if (svma == vma) 5735 continue; 5736 5737 saddr = page_table_shareable(svma, vma, addr, idx); 5738 if (saddr) { 5739 spte = huge_pte_offset(svma->vm_mm, saddr, 5740 vma_mmu_pagesize(svma)); 5741 if (spte) { 5742 get_page(virt_to_page(spte)); 5743 break; 5744 } 5745 } 5746 } 5747 5748 if (!spte) 5749 goto out; 5750 5751 ptl = huge_pte_lock(hstate_vma(vma), mm, spte); 5752 if (pud_none(*pud)) { 5753 pud_populate(mm, pud, 5754 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 5755 mm_inc_nr_pmds(mm); 5756 } else { 5757 put_page(virt_to_page(spte)); 5758 } 5759 spin_unlock(ptl); 5760 out: 5761 pte = (pte_t *)pmd_alloc(mm, pud, addr); 5762 return pte; 5763 } 5764 5765 /* 5766 * unmap huge page backed by shared pte. 5767 * 5768 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 5769 * indicated by page_count > 1, unmap is achieved by clearing pud and 5770 * decrementing the ref count. If count == 1, the pte page is not shared. 5771 * 5772 * Called with page table lock held and i_mmap_rwsem held in write mode. 5773 * 5774 * returns: 1 successfully unmapped a shared pte page 5775 * 0 the underlying pte page is not shared, or it is the last user 5776 */ 5777 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 5778 unsigned long *addr, pte_t *ptep) 5779 { 5780 pgd_t *pgd = pgd_offset(mm, *addr); 5781 p4d_t *p4d = p4d_offset(pgd, *addr); 5782 pud_t *pud = pud_offset(p4d, *addr); 5783 5784 i_mmap_assert_write_locked(vma->vm_file->f_mapping); 5785 BUG_ON(page_count(virt_to_page(ptep)) == 0); 5786 if (page_count(virt_to_page(ptep)) == 1) 5787 return 0; 5788 5789 pud_clear(pud); 5790 put_page(virt_to_page(ptep)); 5791 mm_dec_nr_pmds(mm); 5792 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 5793 return 1; 5794 } 5795 5796 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 5797 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, 5798 unsigned long addr, pud_t *pud) 5799 { 5800 return NULL; 5801 } 5802 5803 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 5804 unsigned long *addr, pte_t *ptep) 5805 { 5806 return 0; 5807 } 5808 5809 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 5810 unsigned long *start, unsigned long *end) 5811 { 5812 } 5813 5814 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) 5815 { 5816 return false; 5817 } 5818 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 5819 5820 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 5821 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 5822 unsigned long addr, unsigned long sz) 5823 { 5824 pgd_t *pgd; 5825 p4d_t *p4d; 5826 pud_t *pud; 5827 pte_t *pte = NULL; 5828 5829 pgd = pgd_offset(mm, addr); 5830 p4d = p4d_alloc(mm, pgd, addr); 5831 if (!p4d) 5832 return NULL; 5833 pud = pud_alloc(mm, p4d, addr); 5834 if (pud) { 5835 if (sz == PUD_SIZE) { 5836 pte = (pte_t *)pud; 5837 } else { 5838 BUG_ON(sz != PMD_SIZE); 5839 if (want_pmd_share(vma, addr) && pud_none(*pud)) 5840 pte = huge_pmd_share(mm, vma, addr, pud); 5841 else 5842 pte = (pte_t *)pmd_alloc(mm, pud, addr); 5843 } 5844 } 5845 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); 5846 5847 return pte; 5848 } 5849 5850 /* 5851 * huge_pte_offset() - Walk the page table to resolve the hugepage 5852 * entry at address @addr 5853 * 5854 * Return: Pointer to page table entry (PUD or PMD) for 5855 * address @addr, or NULL if a !p*d_present() entry is encountered and the 5856 * size @sz doesn't match the hugepage size at this level of the page 5857 * table. 5858 */ 5859 pte_t *huge_pte_offset(struct mm_struct *mm, 5860 unsigned long addr, unsigned long sz) 5861 { 5862 pgd_t *pgd; 5863 p4d_t *p4d; 5864 pud_t *pud; 5865 pmd_t *pmd; 5866 5867 pgd = pgd_offset(mm, addr); 5868 if (!pgd_present(*pgd)) 5869 return NULL; 5870 p4d = p4d_offset(pgd, addr); 5871 if (!p4d_present(*p4d)) 5872 return NULL; 5873 5874 pud = pud_offset(p4d, addr); 5875 if (sz == PUD_SIZE) 5876 /* must be pud huge, non-present or none */ 5877 return (pte_t *)pud; 5878 if (!pud_present(*pud)) 5879 return NULL; 5880 /* must have a valid entry and size to go further */ 5881 5882 pmd = pmd_offset(pud, addr); 5883 /* must be pmd huge, non-present or none */ 5884 return (pte_t *)pmd; 5885 } 5886 5887 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 5888 5889 /* 5890 * These functions are overwritable if your architecture needs its own 5891 * behavior. 5892 */ 5893 struct page * __weak 5894 follow_huge_addr(struct mm_struct *mm, unsigned long address, 5895 int write) 5896 { 5897 return ERR_PTR(-EINVAL); 5898 } 5899 5900 struct page * __weak 5901 follow_huge_pd(struct vm_area_struct *vma, 5902 unsigned long address, hugepd_t hpd, int flags, int pdshift) 5903 { 5904 WARN(1, "hugepd follow called with no support for hugepage directory format\n"); 5905 return NULL; 5906 } 5907 5908 struct page * __weak 5909 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 5910 pmd_t *pmd, int flags) 5911 { 5912 struct page *page = NULL; 5913 spinlock_t *ptl; 5914 pte_t pte; 5915 5916 /* FOLL_GET and FOLL_PIN are mutually exclusive. */ 5917 if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == 5918 (FOLL_PIN | FOLL_GET))) 5919 return NULL; 5920 5921 retry: 5922 ptl = pmd_lockptr(mm, pmd); 5923 spin_lock(ptl); 5924 /* 5925 * make sure that the address range covered by this pmd is not 5926 * unmapped from other threads. 5927 */ 5928 if (!pmd_huge(*pmd)) 5929 goto out; 5930 pte = huge_ptep_get((pte_t *)pmd); 5931 if (pte_present(pte)) { 5932 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 5933 /* 5934 * try_grab_page() should always succeed here, because: a) we 5935 * hold the pmd (ptl) lock, and b) we've just checked that the 5936 * huge pmd (head) page is present in the page tables. The ptl 5937 * prevents the head page and tail pages from being rearranged 5938 * in any way. So this page must be available at this point, 5939 * unless the page refcount overflowed: 5940 */ 5941 if (WARN_ON_ONCE(!try_grab_page(page, flags))) { 5942 page = NULL; 5943 goto out; 5944 } 5945 } else { 5946 if (is_hugetlb_entry_migration(pte)) { 5947 spin_unlock(ptl); 5948 __migration_entry_wait(mm, (pte_t *)pmd, ptl); 5949 goto retry; 5950 } 5951 /* 5952 * hwpoisoned entry is treated as no_page_table in 5953 * follow_page_mask(). 5954 */ 5955 } 5956 out: 5957 spin_unlock(ptl); 5958 return page; 5959 } 5960 5961 struct page * __weak 5962 follow_huge_pud(struct mm_struct *mm, unsigned long address, 5963 pud_t *pud, int flags) 5964 { 5965 if (flags & (FOLL_GET | FOLL_PIN)) 5966 return NULL; 5967 5968 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); 5969 } 5970 5971 struct page * __weak 5972 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) 5973 { 5974 if (flags & (FOLL_GET | FOLL_PIN)) 5975 return NULL; 5976 5977 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); 5978 } 5979 5980 bool isolate_huge_page(struct page *page, struct list_head *list) 5981 { 5982 bool ret = true; 5983 5984 spin_lock_irq(&hugetlb_lock); 5985 if (!PageHeadHuge(page) || 5986 !HPageMigratable(page) || 5987 !get_page_unless_zero(page)) { 5988 ret = false; 5989 goto unlock; 5990 } 5991 ClearHPageMigratable(page); 5992 list_move_tail(&page->lru, list); 5993 unlock: 5994 spin_unlock_irq(&hugetlb_lock); 5995 return ret; 5996 } 5997 5998 int get_hwpoison_huge_page(struct page *page, bool *hugetlb) 5999 { 6000 int ret = 0; 6001 6002 *hugetlb = false; 6003 spin_lock_irq(&hugetlb_lock); 6004 if (PageHeadHuge(page)) { 6005 *hugetlb = true; 6006 if (HPageFreed(page) || HPageMigratable(page)) 6007 ret = get_page_unless_zero(page); 6008 else 6009 ret = -EBUSY; 6010 } 6011 spin_unlock_irq(&hugetlb_lock); 6012 return ret; 6013 } 6014 6015 void putback_active_hugepage(struct page *page) 6016 { 6017 spin_lock_irq(&hugetlb_lock); 6018 SetHPageMigratable(page); 6019 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 6020 spin_unlock_irq(&hugetlb_lock); 6021 put_page(page); 6022 } 6023 6024 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) 6025 { 6026 struct hstate *h = page_hstate(oldpage); 6027 6028 hugetlb_cgroup_migrate(oldpage, newpage); 6029 set_page_owner_migrate_reason(newpage, reason); 6030 6031 /* 6032 * transfer temporary state of the new huge page. This is 6033 * reverse to other transitions because the newpage is going to 6034 * be final while the old one will be freed so it takes over 6035 * the temporary status. 6036 * 6037 * Also note that we have to transfer the per-node surplus state 6038 * here as well otherwise the global surplus count will not match 6039 * the per-node's. 6040 */ 6041 if (HPageTemporary(newpage)) { 6042 int old_nid = page_to_nid(oldpage); 6043 int new_nid = page_to_nid(newpage); 6044 6045 SetHPageTemporary(oldpage); 6046 ClearHPageTemporary(newpage); 6047 6048 /* 6049 * There is no need to transfer the per-node surplus state 6050 * when we do not cross the node. 6051 */ 6052 if (new_nid == old_nid) 6053 return; 6054 spin_lock_irq(&hugetlb_lock); 6055 if (h->surplus_huge_pages_node[old_nid]) { 6056 h->surplus_huge_pages_node[old_nid]--; 6057 h->surplus_huge_pages_node[new_nid]++; 6058 } 6059 spin_unlock_irq(&hugetlb_lock); 6060 } 6061 } 6062 6063 /* 6064 * This function will unconditionally remove all the shared pmd pgtable entries 6065 * within the specific vma for a hugetlbfs memory range. 6066 */ 6067 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) 6068 { 6069 struct hstate *h = hstate_vma(vma); 6070 unsigned long sz = huge_page_size(h); 6071 struct mm_struct *mm = vma->vm_mm; 6072 struct mmu_notifier_range range; 6073 unsigned long address, start, end; 6074 spinlock_t *ptl; 6075 pte_t *ptep; 6076 6077 if (!(vma->vm_flags & VM_MAYSHARE)) 6078 return; 6079 6080 start = ALIGN(vma->vm_start, PUD_SIZE); 6081 end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); 6082 6083 if (start >= end) 6084 return; 6085 6086 /* 6087 * No need to call adjust_range_if_pmd_sharing_possible(), because 6088 * we have already done the PUD_SIZE alignment. 6089 */ 6090 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 6091 start, end); 6092 mmu_notifier_invalidate_range_start(&range); 6093 i_mmap_lock_write(vma->vm_file->f_mapping); 6094 for (address = start; address < end; address += PUD_SIZE) { 6095 unsigned long tmp = address; 6096 6097 ptep = huge_pte_offset(mm, address, sz); 6098 if (!ptep) 6099 continue; 6100 ptl = huge_pte_lock(h, mm, ptep); 6101 /* We don't want 'address' to be changed */ 6102 huge_pmd_unshare(mm, vma, &tmp, ptep); 6103 spin_unlock(ptl); 6104 } 6105 flush_hugetlb_tlb_range(vma, start, end); 6106 i_mmap_unlock_write(vma->vm_file->f_mapping); 6107 /* 6108 * No need to call mmu_notifier_invalidate_range(), see 6109 * Documentation/vm/mmu_notifier.rst. 6110 */ 6111 mmu_notifier_invalidate_range_end(&range); 6112 } 6113 6114 #ifdef CONFIG_CMA 6115 static bool cma_reserve_called __initdata; 6116 6117 static int __init cmdline_parse_hugetlb_cma(char *p) 6118 { 6119 hugetlb_cma_size = memparse(p, &p); 6120 return 0; 6121 } 6122 6123 early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); 6124 6125 void __init hugetlb_cma_reserve(int order) 6126 { 6127 unsigned long size, reserved, per_node; 6128 int nid; 6129 6130 cma_reserve_called = true; 6131 6132 if (!hugetlb_cma_size) 6133 return; 6134 6135 if (hugetlb_cma_size < (PAGE_SIZE << order)) { 6136 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", 6137 (PAGE_SIZE << order) / SZ_1M); 6138 return; 6139 } 6140 6141 /* 6142 * If 3 GB area is requested on a machine with 4 numa nodes, 6143 * let's allocate 1 GB on first three nodes and ignore the last one. 6144 */ 6145 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); 6146 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", 6147 hugetlb_cma_size / SZ_1M, per_node / SZ_1M); 6148 6149 reserved = 0; 6150 for_each_node_state(nid, N_ONLINE) { 6151 int res; 6152 char name[CMA_MAX_NAME]; 6153 6154 size = min(per_node, hugetlb_cma_size - reserved); 6155 size = round_up(size, PAGE_SIZE << order); 6156 6157 snprintf(name, sizeof(name), "hugetlb%d", nid); 6158 res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order, 6159 0, false, name, 6160 &hugetlb_cma[nid], nid); 6161 if (res) { 6162 pr_warn("hugetlb_cma: reservation failed: err %d, node %d", 6163 res, nid); 6164 continue; 6165 } 6166 6167 reserved += size; 6168 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", 6169 size / SZ_1M, nid); 6170 6171 if (reserved >= hugetlb_cma_size) 6172 break; 6173 } 6174 } 6175 6176 void __init hugetlb_cma_check(void) 6177 { 6178 if (!hugetlb_cma_size || cma_reserve_called) 6179 return; 6180 6181 pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); 6182 } 6183 6184 #endif /* CONFIG_CMA */ 6185