1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Generic hugetlb support. 4 * (C) Nadia Yvette Chambers, April 2004 5 */ 6 #include <linux/list.h> 7 #include <linux/init.h> 8 #include <linux/mm.h> 9 #include <linux/seq_file.h> 10 #include <linux/sysctl.h> 11 #include <linux/highmem.h> 12 #include <linux/mmu_notifier.h> 13 #include <linux/nodemask.h> 14 #include <linux/pagemap.h> 15 #include <linux/mempolicy.h> 16 #include <linux/compiler.h> 17 #include <linux/cpuset.h> 18 #include <linux/mutex.h> 19 #include <linux/memblock.h> 20 #include <linux/sysfs.h> 21 #include <linux/slab.h> 22 #include <linux/sched/mm.h> 23 #include <linux/mmdebug.h> 24 #include <linux/sched/signal.h> 25 #include <linux/rmap.h> 26 #include <linux/string_helpers.h> 27 #include <linux/swap.h> 28 #include <linux/swapops.h> 29 #include <linux/jhash.h> 30 #include <linux/numa.h> 31 #include <linux/llist.h> 32 #include <linux/cma.h> 33 #include <linux/migrate.h> 34 #include <linux/nospec.h> 35 #include <linux/delayacct.h> 36 37 #include <asm/page.h> 38 #include <asm/pgalloc.h> 39 #include <asm/tlb.h> 40 41 #include <linux/io.h> 42 #include <linux/hugetlb.h> 43 #include <linux/hugetlb_cgroup.h> 44 #include <linux/node.h> 45 #include <linux/page_owner.h> 46 #include "internal.h" 47 #include "hugetlb_vmemmap.h" 48 49 int hugetlb_max_hstate __read_mostly; 50 unsigned int default_hstate_idx; 51 struct hstate hstates[HUGE_MAX_HSTATE]; 52 53 #ifdef CONFIG_CMA 54 static struct cma *hugetlb_cma[MAX_NUMNODES]; 55 static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; 56 static bool hugetlb_cma_page(struct page *page, unsigned int order) 57 { 58 return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, 59 1 << order); 60 } 61 #else 62 static bool hugetlb_cma_page(struct page *page, unsigned int order) 63 { 64 return false; 65 } 66 #endif 67 static unsigned long hugetlb_cma_size __initdata; 68 69 __initdata LIST_HEAD(huge_boot_pages); 70 71 /* for command line parsing */ 72 static struct hstate * __initdata parsed_hstate; 73 static unsigned long __initdata default_hstate_max_huge_pages; 74 static bool __initdata parsed_valid_hugepagesz = true; 75 static bool __initdata parsed_default_hugepagesz; 76 static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; 77 78 /* 79 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, 80 * free_huge_pages, and surplus_huge_pages. 81 */ 82 DEFINE_SPINLOCK(hugetlb_lock); 83 84 /* 85 * Serializes faults on the same logical page. This is used to 86 * prevent spurious OOMs when the hugepage pool is fully utilized. 
87 */ 88 static int num_fault_mutexes; 89 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; 90 91 /* Forward declaration */ 92 static int hugetlb_acct_memory(struct hstate *h, long delta); 93 94 static inline bool subpool_is_free(struct hugepage_subpool *spool) 95 { 96 if (spool->count) 97 return false; 98 if (spool->max_hpages != -1) 99 return spool->used_hpages == 0; 100 if (spool->min_hpages != -1) 101 return spool->rsv_hpages == spool->min_hpages; 102 103 return true; 104 } 105 106 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, 107 unsigned long irq_flags) 108 { 109 spin_unlock_irqrestore(&spool->lock, irq_flags); 110 111 /* If no pages are used, and no other handles to the subpool 112 * remain, give up any reservations based on minimum size and 113 * free the subpool */ 114 if (subpool_is_free(spool)) { 115 if (spool->min_hpages != -1) 116 hugetlb_acct_memory(spool->hstate, 117 -spool->min_hpages); 118 kfree(spool); 119 } 120 } 121 122 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, 123 long min_hpages) 124 { 125 struct hugepage_subpool *spool; 126 127 spool = kzalloc(sizeof(*spool), GFP_KERNEL); 128 if (!spool) 129 return NULL; 130 131 spin_lock_init(&spool->lock); 132 spool->count = 1; 133 spool->max_hpages = max_hpages; 134 spool->hstate = h; 135 spool->min_hpages = min_hpages; 136 137 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { 138 kfree(spool); 139 return NULL; 140 } 141 spool->rsv_hpages = min_hpages; 142 143 return spool; 144 } 145 146 void hugepage_put_subpool(struct hugepage_subpool *spool) 147 { 148 unsigned long flags; 149 150 spin_lock_irqsave(&spool->lock, flags); 151 BUG_ON(!spool->count); 152 spool->count--; 153 unlock_or_release_subpool(spool, flags); 154 } 155 156 /* 157 * Subpool accounting for allocating and reserving pages. 158 * Return -ENOMEM if there are not enough resources to satisfy the 159 * request. Otherwise, return the number of pages by which the 160 * global pools must be adjusted (upward). The returned value may 161 * only be different than the passed value (delta) in the case where 162 * a subpool minimum size must be maintained. 163 */ 164 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, 165 long delta) 166 { 167 long ret = delta; 168 169 if (!spool) 170 return ret; 171 172 spin_lock_irq(&spool->lock); 173 174 if (spool->max_hpages != -1) { /* maximum size accounting */ 175 if ((spool->used_hpages + delta) <= spool->max_hpages) 176 spool->used_hpages += delta; 177 else { 178 ret = -ENOMEM; 179 goto unlock_ret; 180 } 181 } 182 183 /* minimum size accounting */ 184 if (spool->min_hpages != -1 && spool->rsv_hpages) { 185 if (delta > spool->rsv_hpages) { 186 /* 187 * Asking for more reserves than those already taken on 188 * behalf of subpool. Return difference. 189 */ 190 ret = delta - spool->rsv_hpages; 191 spool->rsv_hpages = 0; 192 } else { 193 ret = 0; /* reserves already accounted for */ 194 spool->rsv_hpages -= delta; 195 } 196 } 197 198 unlock_ret: 199 spin_unlock_irq(&spool->lock); 200 return ret; 201 } 202 203 /* 204 * Subpool accounting for freeing and unreserving pages. 205 * Return the number of global page reservations that must be dropped. 206 * The return value may only be different than the passed value (delta) 207 * in the case where a subpool minimum size must be maintained. 
208 */ 209 static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, 210 long delta) 211 { 212 long ret = delta; 213 unsigned long flags; 214 215 if (!spool) 216 return delta; 217 218 spin_lock_irqsave(&spool->lock, flags); 219 220 if (spool->max_hpages != -1) /* maximum size accounting */ 221 spool->used_hpages -= delta; 222 223 /* minimum size accounting */ 224 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { 225 if (spool->rsv_hpages + delta <= spool->min_hpages) 226 ret = 0; 227 else 228 ret = spool->rsv_hpages + delta - spool->min_hpages; 229 230 spool->rsv_hpages += delta; 231 if (spool->rsv_hpages > spool->min_hpages) 232 spool->rsv_hpages = spool->min_hpages; 233 } 234 235 /* 236 * If hugetlbfs_put_super couldn't free spool due to an outstanding 237 * quota reference, free it now. 238 */ 239 unlock_or_release_subpool(spool, flags); 240 241 return ret; 242 } 243 244 static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 245 { 246 return HUGETLBFS_SB(inode->i_sb)->spool; 247 } 248 249 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 250 { 251 return subpool_inode(file_inode(vma->vm_file)); 252 } 253 254 /* Helper that removes a struct file_region from the resv_map cache and returns 255 * it for use. 256 */ 257 static struct file_region * 258 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) 259 { 260 struct file_region *nrg = NULL; 261 262 VM_BUG_ON(resv->region_cache_count <= 0); 263 264 resv->region_cache_count--; 265 nrg = list_first_entry(&resv->region_cache, struct file_region, link); 266 list_del(&nrg->link); 267 268 nrg->from = from; 269 nrg->to = to; 270 271 return nrg; 272 } 273 274 static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, 275 struct file_region *rg) 276 { 277 #ifdef CONFIG_CGROUP_HUGETLB 278 nrg->reservation_counter = rg->reservation_counter; 279 nrg->css = rg->css; 280 if (rg->css) 281 css_get(rg->css); 282 #endif 283 } 284 285 /* Helper that records hugetlb_cgroup uncharge info. */ 286 static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, 287 struct hstate *h, 288 struct resv_map *resv, 289 struct file_region *nrg) 290 { 291 #ifdef CONFIG_CGROUP_HUGETLB 292 if (h_cg) { 293 nrg->reservation_counter = 294 &h_cg->rsvd_hugepage[hstate_index(h)]; 295 nrg->css = &h_cg->css; 296 /* 297 * The caller will hold exactly one h_cg->css reference for the 298 * whole contiguous reservation region. But this area might be 299 * scattered when there are already some file_regions reside in 300 * it. As a result, many file_regions may share only one css 301 * reference. In order to ensure that one file_region must hold 302 * exactly one h_cg->css reference, we should do css_get for 303 * each file_region and leave the reference held by caller 304 * untouched. 305 */ 306 css_get(&h_cg->css); 307 if (!resv->pages_per_hpage) 308 resv->pages_per_hpage = pages_per_huge_page(h); 309 /* pages_per_hpage should be the same for all entries in 310 * a resv_map. 
311 */ 312 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); 313 } else { 314 nrg->reservation_counter = NULL; 315 nrg->css = NULL; 316 } 317 #endif 318 } 319 320 static void put_uncharge_info(struct file_region *rg) 321 { 322 #ifdef CONFIG_CGROUP_HUGETLB 323 if (rg->css) 324 css_put(rg->css); 325 #endif 326 } 327 328 static bool has_same_uncharge_info(struct file_region *rg, 329 struct file_region *org) 330 { 331 #ifdef CONFIG_CGROUP_HUGETLB 332 return rg->reservation_counter == org->reservation_counter && 333 rg->css == org->css; 334 335 #else 336 return true; 337 #endif 338 } 339 340 static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) 341 { 342 struct file_region *nrg = NULL, *prg = NULL; 343 344 prg = list_prev_entry(rg, link); 345 if (&prg->link != &resv->regions && prg->to == rg->from && 346 has_same_uncharge_info(prg, rg)) { 347 prg->to = rg->to; 348 349 list_del(&rg->link); 350 put_uncharge_info(rg); 351 kfree(rg); 352 353 rg = prg; 354 } 355 356 nrg = list_next_entry(rg, link); 357 if (&nrg->link != &resv->regions && nrg->from == rg->to && 358 has_same_uncharge_info(nrg, rg)) { 359 nrg->from = rg->from; 360 361 list_del(&rg->link); 362 put_uncharge_info(rg); 363 kfree(rg); 364 } 365 } 366 367 static inline long 368 hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, 369 long to, struct hstate *h, struct hugetlb_cgroup *cg, 370 long *regions_needed) 371 { 372 struct file_region *nrg; 373 374 if (!regions_needed) { 375 nrg = get_file_region_entry_from_cache(map, from, to); 376 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); 377 list_add(&nrg->link, rg); 378 coalesce_file_region(map, nrg); 379 } else 380 *regions_needed += 1; 381 382 return to - from; 383 } 384 385 /* 386 * Must be called with resv->lock held. 387 * 388 * Calling this with regions_needed != NULL will count the number of pages 389 * to be added but will not modify the linked list. And regions_needed will 390 * indicate the number of file_regions needed in the cache to carry out to add 391 * the regions for this range. 392 */ 393 static long add_reservation_in_range(struct resv_map *resv, long f, long t, 394 struct hugetlb_cgroup *h_cg, 395 struct hstate *h, long *regions_needed) 396 { 397 long add = 0; 398 struct list_head *head = &resv->regions; 399 long last_accounted_offset = f; 400 struct file_region *iter, *trg = NULL; 401 struct list_head *rg = NULL; 402 403 if (regions_needed) 404 *regions_needed = 0; 405 406 /* In this loop, we essentially handle an entry for the range 407 * [last_accounted_offset, iter->from), at every iteration, with some 408 * bounds checking. 409 */ 410 list_for_each_entry_safe(iter, trg, head, link) { 411 /* Skip irrelevant regions that start before our range. */ 412 if (iter->from < f) { 413 /* If this region ends after the last accounted offset, 414 * then we need to update last_accounted_offset. 415 */ 416 if (iter->to > last_accounted_offset) 417 last_accounted_offset = iter->to; 418 continue; 419 } 420 421 /* When we find a region that starts beyond our range, we've 422 * finished. 423 */ 424 if (iter->from >= t) { 425 rg = iter->link.prev; 426 break; 427 } 428 429 /* Add an entry for last_accounted_offset -> iter->from, and 430 * update last_accounted_offset. 
431 */ 432 if (iter->from > last_accounted_offset) 433 add += hugetlb_resv_map_add(resv, iter->link.prev, 434 last_accounted_offset, 435 iter->from, h, h_cg, 436 regions_needed); 437 438 last_accounted_offset = iter->to; 439 } 440 441 /* Handle the case where our range extends beyond 442 * last_accounted_offset. 443 */ 444 if (!rg) 445 rg = head->prev; 446 if (last_accounted_offset < t) 447 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, 448 t, h, h_cg, regions_needed); 449 450 return add; 451 } 452 453 /* Must be called with resv->lock acquired. Will drop lock to allocate entries. 454 */ 455 static int allocate_file_region_entries(struct resv_map *resv, 456 int regions_needed) 457 __must_hold(&resv->lock) 458 { 459 LIST_HEAD(allocated_regions); 460 int to_allocate = 0, i = 0; 461 struct file_region *trg = NULL, *rg = NULL; 462 463 VM_BUG_ON(regions_needed < 0); 464 465 /* 466 * Check for sufficient descriptors in the cache to accommodate 467 * the number of in progress add operations plus regions_needed. 468 * 469 * This is a while loop because when we drop the lock, some other call 470 * to region_add or region_del may have consumed some region_entries, 471 * so we keep looping here until we finally have enough entries for 472 * (adds_in_progress + regions_needed). 473 */ 474 while (resv->region_cache_count < 475 (resv->adds_in_progress + regions_needed)) { 476 to_allocate = resv->adds_in_progress + regions_needed - 477 resv->region_cache_count; 478 479 /* At this point, we should have enough entries in the cache 480 * for all the existing adds_in_progress. We should only be 481 * needing to allocate for regions_needed. 482 */ 483 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); 484 485 spin_unlock(&resv->lock); 486 for (i = 0; i < to_allocate; i++) { 487 trg = kmalloc(sizeof(*trg), GFP_KERNEL); 488 if (!trg) 489 goto out_of_memory; 490 list_add(&trg->link, &allocated_regions); 491 } 492 493 spin_lock(&resv->lock); 494 495 list_splice(&allocated_regions, &resv->region_cache); 496 resv->region_cache_count += to_allocate; 497 } 498 499 return 0; 500 501 out_of_memory: 502 list_for_each_entry_safe(rg, trg, &allocated_regions, link) { 503 list_del(&rg->link); 504 kfree(rg); 505 } 506 return -ENOMEM; 507 } 508 509 /* 510 * Add the huge page range represented by [f, t) to the reserve 511 * map. Regions will be taken from the cache to fill in this range. 512 * Sufficient regions should exist in the cache due to the previous 513 * call to region_chg with the same range, but in some cases the cache will not 514 * have sufficient entries due to races with other code doing region_add or 515 * region_del. The extra needed entries will be allocated. 516 * 517 * regions_needed is the out value provided by a previous call to region_chg. 518 * 519 * Return the number of new huge pages added to the map. This number is greater 520 * than or equal to zero. If file_region entries needed to be allocated for 521 * this operation and we were not able to allocate, it returns -ENOMEM. 522 * region_add of regions of length 1 never allocate file_regions and cannot 523 * fail; region_chg will always allocate at least 1 entry and a region_add for 524 * 1 page will only require at most 1 entry. 
525 */ 526 static long region_add(struct resv_map *resv, long f, long t, 527 long in_regions_needed, struct hstate *h, 528 struct hugetlb_cgroup *h_cg) 529 { 530 long add = 0, actual_regions_needed = 0; 531 532 spin_lock(&resv->lock); 533 retry: 534 535 /* Count how many regions are actually needed to execute this add. */ 536 add_reservation_in_range(resv, f, t, NULL, NULL, 537 &actual_regions_needed); 538 539 /* 540 * Check for sufficient descriptors in the cache to accommodate 541 * this add operation. Note that actual_regions_needed may be greater 542 * than in_regions_needed, as the resv_map may have been modified since 543 * the region_chg call. In this case, we need to make sure that we 544 * allocate extra entries, such that we have enough for all the 545 * existing adds_in_progress, plus the excess needed for this 546 * operation. 547 */ 548 if (actual_regions_needed > in_regions_needed && 549 resv->region_cache_count < 550 resv->adds_in_progress + 551 (actual_regions_needed - in_regions_needed)) { 552 /* region_add operation of range 1 should never need to 553 * allocate file_region entries. 554 */ 555 VM_BUG_ON(t - f <= 1); 556 557 if (allocate_file_region_entries( 558 resv, actual_regions_needed - in_regions_needed)) { 559 return -ENOMEM; 560 } 561 562 goto retry; 563 } 564 565 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); 566 567 resv->adds_in_progress -= in_regions_needed; 568 569 spin_unlock(&resv->lock); 570 return add; 571 } 572 573 /* 574 * Examine the existing reserve map and determine how many 575 * huge pages in the specified range [f, t) are NOT currently 576 * represented. This routine is called before a subsequent 577 * call to region_add that will actually modify the reserve 578 * map to add the specified range [f, t). region_chg does 579 * not change the number of huge pages represented by the 580 * map. A number of new file_region structures is added to the cache as a 581 * placeholder, for the subsequent region_add call to use. At least 1 582 * file_region structure is added. 583 * 584 * out_regions_needed is the number of regions added to the 585 * resv->adds_in_progress. This value needs to be provided to a follow up call 586 * to region_add or region_abort for proper accounting. 587 * 588 * Returns the number of huge pages that need to be added to the existing 589 * reservation map for the range [f, t). This number is greater or equal to 590 * zero. -ENOMEM is returned if a new file_region structure or cache entry 591 * is needed and can not be allocated. 592 */ 593 static long region_chg(struct resv_map *resv, long f, long t, 594 long *out_regions_needed) 595 { 596 long chg = 0; 597 598 spin_lock(&resv->lock); 599 600 /* Count how many hugepages in this range are NOT represented. */ 601 chg = add_reservation_in_range(resv, f, t, NULL, NULL, 602 out_regions_needed); 603 604 if (*out_regions_needed == 0) 605 *out_regions_needed = 1; 606 607 if (allocate_file_region_entries(resv, *out_regions_needed)) 608 return -ENOMEM; 609 610 resv->adds_in_progress += *out_regions_needed; 611 612 spin_unlock(&resv->lock); 613 return chg; 614 } 615 616 /* 617 * Abort the in progress add operation. The adds_in_progress field 618 * of the resv_map keeps track of the operations in progress between 619 * calls to region_chg and region_add. Operations are sometimes 620 * aborted after the call to region_chg. In such cases, region_abort 621 * is called to decrement the adds_in_progress counter. 
regions_needed 622 * is the value returned by the region_chg call, it is used to decrement 623 * the adds_in_progress counter. 624 * 625 * NOTE: The range arguments [f, t) are not needed or used in this 626 * routine. They are kept to make reading the calling code easier as 627 * arguments will match the associated region_chg call. 628 */ 629 static void region_abort(struct resv_map *resv, long f, long t, 630 long regions_needed) 631 { 632 spin_lock(&resv->lock); 633 VM_BUG_ON(!resv->region_cache_count); 634 resv->adds_in_progress -= regions_needed; 635 spin_unlock(&resv->lock); 636 } 637 638 /* 639 * Delete the specified range [f, t) from the reserve map. If the 640 * t parameter is LONG_MAX, this indicates that ALL regions after f 641 * should be deleted. Locate the regions which intersect [f, t) 642 * and either trim, delete or split the existing regions. 643 * 644 * Returns the number of huge pages deleted from the reserve map. 645 * In the normal case, the return value is zero or more. In the 646 * case where a region must be split, a new region descriptor must 647 * be allocated. If the allocation fails, -ENOMEM will be returned. 648 * NOTE: If the parameter t == LONG_MAX, then we will never split 649 * a region and possibly return -ENOMEM. Callers specifying 650 * t == LONG_MAX do not need to check for -ENOMEM error. 651 */ 652 static long region_del(struct resv_map *resv, long f, long t) 653 { 654 struct list_head *head = &resv->regions; 655 struct file_region *rg, *trg; 656 struct file_region *nrg = NULL; 657 long del = 0; 658 659 retry: 660 spin_lock(&resv->lock); 661 list_for_each_entry_safe(rg, trg, head, link) { 662 /* 663 * Skip regions before the range to be deleted. file_region 664 * ranges are normally of the form [from, to). However, there 665 * may be a "placeholder" entry in the map which is of the form 666 * (from, to) with from == to. Check for placeholder entries 667 * at the beginning of the range to be deleted. 668 */ 669 if (rg->to <= f && (rg->to != rg->from || rg->to != f)) 670 continue; 671 672 if (rg->from >= t) 673 break; 674 675 if (f > rg->from && t < rg->to) { /* Must split region */ 676 /* 677 * Check for an entry in the cache before dropping 678 * lock and attempting allocation. 
679 */ 680 if (!nrg && 681 resv->region_cache_count > resv->adds_in_progress) { 682 nrg = list_first_entry(&resv->region_cache, 683 struct file_region, 684 link); 685 list_del(&nrg->link); 686 resv->region_cache_count--; 687 } 688 689 if (!nrg) { 690 spin_unlock(&resv->lock); 691 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 692 if (!nrg) 693 return -ENOMEM; 694 goto retry; 695 } 696 697 del += t - f; 698 hugetlb_cgroup_uncharge_file_region( 699 resv, rg, t - f, false); 700 701 /* New entry for end of split region */ 702 nrg->from = t; 703 nrg->to = rg->to; 704 705 copy_hugetlb_cgroup_uncharge_info(nrg, rg); 706 707 INIT_LIST_HEAD(&nrg->link); 708 709 /* Original entry is trimmed */ 710 rg->to = f; 711 712 list_add(&nrg->link, &rg->link); 713 nrg = NULL; 714 break; 715 } 716 717 if (f <= rg->from && t >= rg->to) { /* Remove entire region */ 718 del += rg->to - rg->from; 719 hugetlb_cgroup_uncharge_file_region(resv, rg, 720 rg->to - rg->from, true); 721 list_del(&rg->link); 722 kfree(rg); 723 continue; 724 } 725 726 if (f <= rg->from) { /* Trim beginning of region */ 727 hugetlb_cgroup_uncharge_file_region(resv, rg, 728 t - rg->from, false); 729 730 del += t - rg->from; 731 rg->from = t; 732 } else { /* Trim end of region */ 733 hugetlb_cgroup_uncharge_file_region(resv, rg, 734 rg->to - f, false); 735 736 del += rg->to - f; 737 rg->to = f; 738 } 739 } 740 741 spin_unlock(&resv->lock); 742 kfree(nrg); 743 return del; 744 } 745 746 /* 747 * A rare out of memory error was encountered which prevented removal of 748 * the reserve map region for a page. The huge page itself was free'ed 749 * and removed from the page cache. This routine will adjust the subpool 750 * usage count, and the global reserve count if needed. By incrementing 751 * these counts, the reserve map entry which could not be deleted will 752 * appear as a "reserved" entry instead of simply dangling with incorrect 753 * counts. 754 */ 755 void hugetlb_fix_reserve_counts(struct inode *inode) 756 { 757 struct hugepage_subpool *spool = subpool_inode(inode); 758 long rsv_adjust; 759 bool reserved = false; 760 761 rsv_adjust = hugepage_subpool_get_pages(spool, 1); 762 if (rsv_adjust > 0) { 763 struct hstate *h = hstate_inode(inode); 764 765 if (!hugetlb_acct_memory(h, 1)) 766 reserved = true; 767 } else if (!rsv_adjust) { 768 reserved = true; 769 } 770 771 if (!reserved) 772 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); 773 } 774 775 /* 776 * Count and return the number of huge pages in the reserve map 777 * that intersect with the range [f, t). 778 */ 779 static long region_count(struct resv_map *resv, long f, long t) 780 { 781 struct list_head *head = &resv->regions; 782 struct file_region *rg; 783 long chg = 0; 784 785 spin_lock(&resv->lock); 786 /* Locate each segment we overlap with, and count that overlap. */ 787 list_for_each_entry(rg, head, link) { 788 long seg_from; 789 long seg_to; 790 791 if (rg->to <= f) 792 continue; 793 if (rg->from >= t) 794 break; 795 796 seg_from = max(rg->from, f); 797 seg_to = min(rg->to, t); 798 799 chg += seg_to - seg_from; 800 } 801 spin_unlock(&resv->lock); 802 803 return chg; 804 } 805 806 /* 807 * Convert the address within this vma to the page offset within 808 * the mapping, in pagecache page units; huge pages here. 
809 */ 810 static pgoff_t vma_hugecache_offset(struct hstate *h, 811 struct vm_area_struct *vma, unsigned long address) 812 { 813 return ((address - vma->vm_start) >> huge_page_shift(h)) + 814 (vma->vm_pgoff >> huge_page_order(h)); 815 } 816 817 pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 818 unsigned long address) 819 { 820 return vma_hugecache_offset(hstate_vma(vma), vma, address); 821 } 822 EXPORT_SYMBOL_GPL(linear_hugepage_index); 823 824 /* 825 * Return the size of the pages allocated when backing a VMA. In the majority 826 * cases this will be same size as used by the page table entries. 827 */ 828 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 829 { 830 if (vma->vm_ops && vma->vm_ops->pagesize) 831 return vma->vm_ops->pagesize(vma); 832 return PAGE_SIZE; 833 } 834 EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 835 836 /* 837 * Return the page size being used by the MMU to back a VMA. In the majority 838 * of cases, the page size used by the kernel matches the MMU size. On 839 * architectures where it differs, an architecture-specific 'strong' 840 * version of this symbol is required. 841 */ 842 __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 843 { 844 return vma_kernel_pagesize(vma); 845 } 846 847 /* 848 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 849 * bits of the reservation map pointer, which are always clear due to 850 * alignment. 851 */ 852 #define HPAGE_RESV_OWNER (1UL << 0) 853 #define HPAGE_RESV_UNMAPPED (1UL << 1) 854 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) 855 856 /* 857 * These helpers are used to track how many pages are reserved for 858 * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 859 * is guaranteed to have their future faults succeed. 860 * 861 * With the exception of reset_vma_resv_huge_pages() which is called at fork(), 862 * the reserve counters are updated with the hugetlb_lock held. It is safe 863 * to reset the VMA at fork() time as it is not in use yet and there is no 864 * chance of the global counters getting corrupted as a result of the values. 865 * 866 * The private mapping reservation is represented in a subtly different 867 * manner to a shared mapping. A shared mapping has a region map associated 868 * with the underlying file, this region map represents the backing file 869 * pages which have ever had a reservation assigned which this persists even 870 * after the page is instantiated. A private mapping has a region map 871 * associated with the original mmap which is attached to all VMAs which 872 * reference it, this region map represents those offsets which have consumed 873 * reservation ie. where pages have been instantiated. 
874 */ 875 static unsigned long get_vma_private_data(struct vm_area_struct *vma) 876 { 877 return (unsigned long)vma->vm_private_data; 878 } 879 880 static void set_vma_private_data(struct vm_area_struct *vma, 881 unsigned long value) 882 { 883 vma->vm_private_data = (void *)value; 884 } 885 886 static void 887 resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, 888 struct hugetlb_cgroup *h_cg, 889 struct hstate *h) 890 { 891 #ifdef CONFIG_CGROUP_HUGETLB 892 if (!h_cg || !h) { 893 resv_map->reservation_counter = NULL; 894 resv_map->pages_per_hpage = 0; 895 resv_map->css = NULL; 896 } else { 897 resv_map->reservation_counter = 898 &h_cg->rsvd_hugepage[hstate_index(h)]; 899 resv_map->pages_per_hpage = pages_per_huge_page(h); 900 resv_map->css = &h_cg->css; 901 } 902 #endif 903 } 904 905 struct resv_map *resv_map_alloc(void) 906 { 907 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 908 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); 909 910 if (!resv_map || !rg) { 911 kfree(resv_map); 912 kfree(rg); 913 return NULL; 914 } 915 916 kref_init(&resv_map->refs); 917 spin_lock_init(&resv_map->lock); 918 INIT_LIST_HEAD(&resv_map->regions); 919 920 resv_map->adds_in_progress = 0; 921 /* 922 * Initialize these to 0. On shared mappings, 0's here indicate these 923 * fields don't do cgroup accounting. On private mappings, these will be 924 * re-initialized to the proper values, to indicate that hugetlb cgroup 925 * reservations are to be un-charged from here. 926 */ 927 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); 928 929 INIT_LIST_HEAD(&resv_map->region_cache); 930 list_add(&rg->link, &resv_map->region_cache); 931 resv_map->region_cache_count = 1; 932 933 return resv_map; 934 } 935 936 void resv_map_release(struct kref *ref) 937 { 938 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 939 struct list_head *head = &resv_map->region_cache; 940 struct file_region *rg, *trg; 941 942 /* Clear out any active regions before we release the map. */ 943 region_del(resv_map, 0, LONG_MAX); 944 945 /* ... and any entries left in the cache */ 946 list_for_each_entry_safe(rg, trg, head, link) { 947 list_del(&rg->link); 948 kfree(rg); 949 } 950 951 VM_BUG_ON(resv_map->adds_in_progress); 952 953 kfree(resv_map); 954 } 955 956 static inline struct resv_map *inode_resv_map(struct inode *inode) 957 { 958 /* 959 * At inode evict time, i_mapping may not point to the original 960 * address space within the inode. This original address space 961 * contains the pointer to the resv_map. So, always use the 962 * address space embedded within the inode. 963 * The VERY common case is inode->mapping == &inode->i_data but, 964 * this may not be true for device special inodes. 
965 */ 966 return (struct resv_map *)(&inode->i_data)->private_data; 967 } 968 969 static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 970 { 971 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 972 if (vma->vm_flags & VM_MAYSHARE) { 973 struct address_space *mapping = vma->vm_file->f_mapping; 974 struct inode *inode = mapping->host; 975 976 return inode_resv_map(inode); 977 978 } else { 979 return (struct resv_map *)(get_vma_private_data(vma) & 980 ~HPAGE_RESV_MASK); 981 } 982 } 983 984 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 985 { 986 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 987 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 988 989 set_vma_private_data(vma, (get_vma_private_data(vma) & 990 HPAGE_RESV_MASK) | (unsigned long)map); 991 } 992 993 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 994 { 995 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 996 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 997 998 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 999 } 1000 1001 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 1002 { 1003 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1004 1005 return (get_vma_private_data(vma) & flag) != 0; 1006 } 1007 1008 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 1009 void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 1010 { 1011 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1012 if (!(vma->vm_flags & VM_MAYSHARE)) 1013 vma->vm_private_data = (void *)0; 1014 } 1015 1016 /* 1017 * Reset and decrement one ref on hugepage private reservation. 1018 * Called with mm->mmap_sem writer semaphore held. 1019 * This function should be only used by move_vma() and operate on 1020 * same sized vma. It should never come here with last ref on the 1021 * reservation. 1022 */ 1023 void clear_vma_resv_huge_pages(struct vm_area_struct *vma) 1024 { 1025 /* 1026 * Clear the old hugetlb private page reservation. 1027 * It has already been transferred to new_vma. 1028 * 1029 * During a mremap() operation of a hugetlb vma we call move_vma() 1030 * which copies vma into new_vma and unmaps vma. After the copy 1031 * operation both new_vma and vma share a reference to the resv_map 1032 * struct, and at that point vma is about to be unmapped. We don't 1033 * want to return the reservation to the pool at unmap of vma because 1034 * the reservation still lives on in new_vma, so simply decrement the 1035 * ref here and remove the resv_map reference from this vma. 1036 */ 1037 struct resv_map *reservations = vma_resv_map(vma); 1038 1039 if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1040 resv_map_put_hugetlb_cgroup_uncharge_info(reservations); 1041 kref_put(&reservations->refs, resv_map_release); 1042 } 1043 1044 reset_vma_resv_huge_pages(vma); 1045 } 1046 1047 /* Returns true if the VMA has associated reserve pages */ 1048 static bool vma_has_reserves(struct vm_area_struct *vma, long chg) 1049 { 1050 if (vma->vm_flags & VM_NORESERVE) { 1051 /* 1052 * This address is already reserved by other process(chg == 0), 1053 * so, we should decrement reserved count. Without decrementing, 1054 * reserve count remains after releasing inode, because this 1055 * allocated page will go into page cache and is regarded as 1056 * coming from reserved pool in releasing step. Currently, we 1057 * don't have any other solution to deal with this situation 1058 * properly, so add work-around here. 
1059 */ 1060 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 1061 return true; 1062 else 1063 return false; 1064 } 1065 1066 /* Shared mappings always use reserves */ 1067 if (vma->vm_flags & VM_MAYSHARE) { 1068 /* 1069 * We know VM_NORESERVE is not set. Therefore, there SHOULD 1070 * be a region map for all pages. The only situation where 1071 * there is no region map is if a hole was punched via 1072 * fallocate. In this case, there really are no reserves to 1073 * use. This situation is indicated if chg != 0. 1074 */ 1075 if (chg) 1076 return false; 1077 else 1078 return true; 1079 } 1080 1081 /* 1082 * Only the process that called mmap() has reserves for 1083 * private mappings. 1084 */ 1085 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1086 /* 1087 * Like the shared case above, a hole punch or truncate 1088 * could have been performed on the private mapping. 1089 * Examine the value of chg to determine if reserves 1090 * actually exist or were previously consumed. 1091 * Very Subtle - The value of chg comes from a previous 1092 * call to vma_needs_reserves(). The reserve map for 1093 * private mappings has different (opposite) semantics 1094 * than that of shared mappings. vma_needs_reserves() 1095 * has already taken this difference in semantics into 1096 * account. Therefore, the meaning of chg is the same 1097 * as in the shared case above. Code could easily be 1098 * combined, but keeping it separate draws attention to 1099 * subtle differences. 1100 */ 1101 if (chg) 1102 return false; 1103 else 1104 return true; 1105 } 1106 1107 return false; 1108 } 1109 1110 static void enqueue_huge_page(struct hstate *h, struct page *page) 1111 { 1112 int nid = page_to_nid(page); 1113 1114 lockdep_assert_held(&hugetlb_lock); 1115 VM_BUG_ON_PAGE(page_count(page), page); 1116 1117 list_move(&page->lru, &h->hugepage_freelists[nid]); 1118 h->free_huge_pages++; 1119 h->free_huge_pages_node[nid]++; 1120 SetHPageFreed(page); 1121 } 1122 1123 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) 1124 { 1125 struct page *page; 1126 bool pin = !!(current->flags & PF_MEMALLOC_PIN); 1127 1128 lockdep_assert_held(&hugetlb_lock); 1129 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { 1130 if (pin && !is_longterm_pinnable_page(page)) 1131 continue; 1132 1133 if (PageHWPoison(page)) 1134 continue; 1135 1136 list_move(&page->lru, &h->hugepage_activelist); 1137 set_page_refcounted(page); 1138 ClearHPageFreed(page); 1139 h->free_huge_pages--; 1140 h->free_huge_pages_node[nid]--; 1141 return page; 1142 } 1143 1144 return NULL; 1145 } 1146 1147 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, 1148 nodemask_t *nmask) 1149 { 1150 unsigned int cpuset_mems_cookie; 1151 struct zonelist *zonelist; 1152 struct zone *zone; 1153 struct zoneref *z; 1154 int node = NUMA_NO_NODE; 1155 1156 zonelist = node_zonelist(nid, gfp_mask); 1157 1158 retry_cpuset: 1159 cpuset_mems_cookie = read_mems_allowed_begin(); 1160 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { 1161 struct page *page; 1162 1163 if (!cpuset_zone_allowed(zone, gfp_mask)) 1164 continue; 1165 /* 1166 * no need to ask again on the same node. 
Pool is node rather than 1167 * zone aware 1168 */ 1169 if (zone_to_nid(zone) == node) 1170 continue; 1171 node = zone_to_nid(zone); 1172 1173 page = dequeue_huge_page_node_exact(h, node); 1174 if (page) 1175 return page; 1176 } 1177 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) 1178 goto retry_cpuset; 1179 1180 return NULL; 1181 } 1182 1183 static struct page *dequeue_huge_page_vma(struct hstate *h, 1184 struct vm_area_struct *vma, 1185 unsigned long address, int avoid_reserve, 1186 long chg) 1187 { 1188 struct page *page = NULL; 1189 struct mempolicy *mpol; 1190 gfp_t gfp_mask; 1191 nodemask_t *nodemask; 1192 int nid; 1193 1194 /* 1195 * A child process with MAP_PRIVATE mappings created by their parent 1196 * have no page reserves. This check ensures that reservations are 1197 * not "stolen". The child may still get SIGKILLed 1198 */ 1199 if (!vma_has_reserves(vma, chg) && 1200 h->free_huge_pages - h->resv_huge_pages == 0) 1201 goto err; 1202 1203 /* If reserves cannot be used, ensure enough pages are in the pool */ 1204 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 1205 goto err; 1206 1207 gfp_mask = htlb_alloc_mask(h); 1208 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 1209 1210 if (mpol_is_preferred_many(mpol)) { 1211 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); 1212 1213 /* Fallback to all nodes if page==NULL */ 1214 nodemask = NULL; 1215 } 1216 1217 if (!page) 1218 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); 1219 1220 if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { 1221 SetHPageRestoreReserve(page); 1222 h->resv_huge_pages--; 1223 } 1224 1225 mpol_cond_put(mpol); 1226 return page; 1227 1228 err: 1229 return NULL; 1230 } 1231 1232 /* 1233 * common helper functions for hstate_next_node_to_{alloc|free}. 1234 * We may have allocated or freed a huge page based on a different 1235 * nodes_allowed previously, so h->next_node_to_{alloc|free} might 1236 * be outside of *nodes_allowed. Ensure that we use an allowed 1237 * node for alloc or free. 1238 */ 1239 static int next_node_allowed(int nid, nodemask_t *nodes_allowed) 1240 { 1241 nid = next_node_in(nid, *nodes_allowed); 1242 VM_BUG_ON(nid >= MAX_NUMNODES); 1243 1244 return nid; 1245 } 1246 1247 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 1248 { 1249 if (!node_isset(nid, *nodes_allowed)) 1250 nid = next_node_allowed(nid, nodes_allowed); 1251 return nid; 1252 } 1253 1254 /* 1255 * returns the previously saved node ["this node"] from which to 1256 * allocate a persistent huge page for the pool and advance the 1257 * next node from which to allocate, handling wrap at end of node 1258 * mask. 1259 */ 1260 static int hstate_next_node_to_alloc(struct hstate *h, 1261 nodemask_t *nodes_allowed) 1262 { 1263 int nid; 1264 1265 VM_BUG_ON(!nodes_allowed); 1266 1267 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 1268 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 1269 1270 return nid; 1271 } 1272 1273 /* 1274 * helper for remove_pool_huge_page() - return the previously saved 1275 * node ["this node"] from which to free a huge page. Advance the 1276 * next node id whether or not we find a free huge page to free so 1277 * that the next attempt to free addresses the next node. 
1278 */ 1279 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 1280 { 1281 int nid; 1282 1283 VM_BUG_ON(!nodes_allowed); 1284 1285 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 1286 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 1287 1288 return nid; 1289 } 1290 1291 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 1292 for (nr_nodes = nodes_weight(*mask); \ 1293 nr_nodes > 0 && \ 1294 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 1295 nr_nodes--) 1296 1297 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 1298 for (nr_nodes = nodes_weight(*mask); \ 1299 nr_nodes > 0 && \ 1300 ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 1301 nr_nodes--) 1302 1303 /* used to demote non-gigantic_huge pages as well */ 1304 static void __destroy_compound_gigantic_page(struct page *page, 1305 unsigned int order, bool demote) 1306 { 1307 int i; 1308 int nr_pages = 1 << order; 1309 struct page *p = page + 1; 1310 1311 atomic_set(compound_mapcount_ptr(page), 0); 1312 atomic_set(compound_pincount_ptr(page), 0); 1313 1314 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1315 p->mapping = NULL; 1316 clear_compound_head(p); 1317 if (!demote) 1318 set_page_refcounted(p); 1319 } 1320 1321 set_compound_order(page, 0); 1322 #ifdef CONFIG_64BIT 1323 page[1].compound_nr = 0; 1324 #endif 1325 __ClearPageHead(page); 1326 } 1327 1328 static void destroy_compound_hugetlb_page_for_demote(struct page *page, 1329 unsigned int order) 1330 { 1331 __destroy_compound_gigantic_page(page, order, true); 1332 } 1333 1334 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 1335 static void destroy_compound_gigantic_page(struct page *page, 1336 unsigned int order) 1337 { 1338 __destroy_compound_gigantic_page(page, order, false); 1339 } 1340 1341 static void free_gigantic_page(struct page *page, unsigned int order) 1342 { 1343 /* 1344 * If the page isn't allocated using the cma allocator, 1345 * cma_release() returns false. 
1346 */ 1347 #ifdef CONFIG_CMA 1348 if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) 1349 return; 1350 #endif 1351 1352 free_contig_range(page_to_pfn(page), 1 << order); 1353 } 1354 1355 #ifdef CONFIG_CONTIG_ALLOC 1356 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1357 int nid, nodemask_t *nodemask) 1358 { 1359 unsigned long nr_pages = pages_per_huge_page(h); 1360 if (nid == NUMA_NO_NODE) 1361 nid = numa_mem_id(); 1362 1363 #ifdef CONFIG_CMA 1364 { 1365 struct page *page; 1366 int node; 1367 1368 if (hugetlb_cma[nid]) { 1369 page = cma_alloc(hugetlb_cma[nid], nr_pages, 1370 huge_page_order(h), true); 1371 if (page) 1372 return page; 1373 } 1374 1375 if (!(gfp_mask & __GFP_THISNODE)) { 1376 for_each_node_mask(node, *nodemask) { 1377 if (node == nid || !hugetlb_cma[node]) 1378 continue; 1379 1380 page = cma_alloc(hugetlb_cma[node], nr_pages, 1381 huge_page_order(h), true); 1382 if (page) 1383 return page; 1384 } 1385 } 1386 } 1387 #endif 1388 1389 return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); 1390 } 1391 1392 #else /* !CONFIG_CONTIG_ALLOC */ 1393 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1394 int nid, nodemask_t *nodemask) 1395 { 1396 return NULL; 1397 } 1398 #endif /* CONFIG_CONTIG_ALLOC */ 1399 1400 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ 1401 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1402 int nid, nodemask_t *nodemask) 1403 { 1404 return NULL; 1405 } 1406 static inline void free_gigantic_page(struct page *page, unsigned int order) { } 1407 static inline void destroy_compound_gigantic_page(struct page *page, 1408 unsigned int order) { } 1409 #endif 1410 1411 /* 1412 * Remove hugetlb page from lists, and update dtor so that page appears 1413 * as just a compound page. 1414 * 1415 * A reference is held on the page, except in the case of demote. 1416 * 1417 * Must be called with hugetlb lock held. 1418 */ 1419 static void __remove_hugetlb_page(struct hstate *h, struct page *page, 1420 bool adjust_surplus, 1421 bool demote) 1422 { 1423 int nid = page_to_nid(page); 1424 1425 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); 1426 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); 1427 1428 lockdep_assert_held(&hugetlb_lock); 1429 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1430 return; 1431 1432 list_del(&page->lru); 1433 1434 if (HPageFreed(page)) { 1435 h->free_huge_pages--; 1436 h->free_huge_pages_node[nid]--; 1437 } 1438 if (adjust_surplus) { 1439 h->surplus_huge_pages--; 1440 h->surplus_huge_pages_node[nid]--; 1441 } 1442 1443 /* 1444 * Very subtle 1445 * 1446 * For non-gigantic pages set the destructor to the normal compound 1447 * page dtor. This is needed in case someone takes an additional 1448 * temporary ref to the page, and freeing is delayed until they drop 1449 * their reference. 1450 * 1451 * For gigantic pages set the destructor to the null dtor. This 1452 * destructor will never be called. Before freeing the gigantic 1453 * page destroy_compound_gigantic_page will turn the compound page 1454 * into a simple group of pages. After this the destructor does not 1455 * apply. 1456 * 1457 * This handles the case where more than one ref is held when and 1458 * after update_and_free_page is called. 1459 * 1460 * In the case of demote we do not ref count the page as it will soon 1461 * be turned into a page of smaller size. 
1462 */ 1463 if (!demote) 1464 set_page_refcounted(page); 1465 if (hstate_is_gigantic(h)) 1466 set_compound_page_dtor(page, NULL_COMPOUND_DTOR); 1467 else 1468 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 1469 1470 h->nr_huge_pages--; 1471 h->nr_huge_pages_node[nid]--; 1472 } 1473 1474 static void remove_hugetlb_page(struct hstate *h, struct page *page, 1475 bool adjust_surplus) 1476 { 1477 __remove_hugetlb_page(h, page, adjust_surplus, false); 1478 } 1479 1480 static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page, 1481 bool adjust_surplus) 1482 { 1483 __remove_hugetlb_page(h, page, adjust_surplus, true); 1484 } 1485 1486 static void add_hugetlb_page(struct hstate *h, struct page *page, 1487 bool adjust_surplus) 1488 { 1489 int zeroed; 1490 int nid = page_to_nid(page); 1491 1492 VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); 1493 1494 lockdep_assert_held(&hugetlb_lock); 1495 1496 INIT_LIST_HEAD(&page->lru); 1497 h->nr_huge_pages++; 1498 h->nr_huge_pages_node[nid]++; 1499 1500 if (adjust_surplus) { 1501 h->surplus_huge_pages++; 1502 h->surplus_huge_pages_node[nid]++; 1503 } 1504 1505 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); 1506 set_page_private(page, 0); 1507 /* 1508 * We have to set HPageVmemmapOptimized again as above 1509 * set_page_private(page, 0) cleared it. 1510 */ 1511 SetHPageVmemmapOptimized(page); 1512 1513 /* 1514 * This page is about to be managed by the hugetlb allocator and 1515 * should have no users. Drop our reference, and check for others 1516 * just in case. 1517 */ 1518 zeroed = put_page_testzero(page); 1519 if (!zeroed) 1520 /* 1521 * It is VERY unlikely soneone else has taken a ref on 1522 * the page. In this case, we simply return as the 1523 * hugetlb destructor (free_huge_page) will be called 1524 * when this other ref is dropped. 1525 */ 1526 return; 1527 1528 arch_clear_hugepage_flags(page); 1529 enqueue_huge_page(h, page); 1530 } 1531 1532 static void __update_and_free_page(struct hstate *h, struct page *page) 1533 { 1534 int i; 1535 struct page *subpage = page; 1536 1537 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1538 return; 1539 1540 /* 1541 * If we don't know which subpages are hwpoisoned, we can't free 1542 * the hugepage, so it's leaked intentionally. 1543 */ 1544 if (HPageRawHwpUnreliable(page)) 1545 return; 1546 1547 if (hugetlb_vmemmap_restore(h, page)) { 1548 spin_lock_irq(&hugetlb_lock); 1549 /* 1550 * If we cannot allocate vmemmap pages, just refuse to free the 1551 * page and put the page back on the hugetlb free list and treat 1552 * as a surplus page. 1553 */ 1554 add_hugetlb_page(h, page, true); 1555 spin_unlock_irq(&hugetlb_lock); 1556 return; 1557 } 1558 1559 /* 1560 * Move PageHWPoison flag from head page to the raw error pages, 1561 * which makes any healthy subpages reusable. 1562 */ 1563 if (unlikely(PageHWPoison(page))) 1564 hugetlb_clear_page_hwpoison(page); 1565 1566 for (i = 0; i < pages_per_huge_page(h); 1567 i++, subpage = mem_map_next(subpage, page, i)) { 1568 subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1569 1 << PG_referenced | 1 << PG_dirty | 1570 1 << PG_active | 1 << PG_private | 1571 1 << PG_writeback); 1572 } 1573 1574 /* 1575 * Non-gigantic pages demoted from CMA allocated gigantic pages 1576 * need to be given back to CMA in free_gigantic_page. 
1577 */ 1578 if (hstate_is_gigantic(h) || 1579 hugetlb_cma_page(page, huge_page_order(h))) { 1580 destroy_compound_gigantic_page(page, huge_page_order(h)); 1581 free_gigantic_page(page, huge_page_order(h)); 1582 } else { 1583 __free_pages(page, huge_page_order(h)); 1584 } 1585 } 1586 1587 /* 1588 * As update_and_free_page() can be called under any context, so we cannot 1589 * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the 1590 * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate 1591 * the vmemmap pages. 1592 * 1593 * free_hpage_workfn() locklessly retrieves the linked list of pages to be 1594 * freed and frees them one-by-one. As the page->mapping pointer is going 1595 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node 1596 * structure of a lockless linked list of huge pages to be freed. 1597 */ 1598 static LLIST_HEAD(hpage_freelist); 1599 1600 static void free_hpage_workfn(struct work_struct *work) 1601 { 1602 struct llist_node *node; 1603 1604 node = llist_del_all(&hpage_freelist); 1605 1606 while (node) { 1607 struct page *page; 1608 struct hstate *h; 1609 1610 page = container_of((struct address_space **)node, 1611 struct page, mapping); 1612 node = node->next; 1613 page->mapping = NULL; 1614 /* 1615 * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() 1616 * is going to trigger because a previous call to 1617 * remove_hugetlb_page() will set_compound_page_dtor(page, 1618 * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. 1619 */ 1620 h = size_to_hstate(page_size(page)); 1621 1622 __update_and_free_page(h, page); 1623 1624 cond_resched(); 1625 } 1626 } 1627 static DECLARE_WORK(free_hpage_work, free_hpage_workfn); 1628 1629 static inline void flush_free_hpage_work(struct hstate *h) 1630 { 1631 if (hugetlb_vmemmap_optimizable(h)) 1632 flush_work(&free_hpage_work); 1633 } 1634 1635 static void update_and_free_page(struct hstate *h, struct page *page, 1636 bool atomic) 1637 { 1638 if (!HPageVmemmapOptimized(page) || !atomic) { 1639 __update_and_free_page(h, page); 1640 return; 1641 } 1642 1643 /* 1644 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. 1645 * 1646 * Only call schedule_work() if hpage_freelist is previously 1647 * empty. Otherwise, schedule_work() had been called but the workfn 1648 * hasn't retrieved the list yet. 1649 */ 1650 if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) 1651 schedule_work(&free_hpage_work); 1652 } 1653 1654 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) 1655 { 1656 struct page *page, *t_page; 1657 1658 list_for_each_entry_safe(page, t_page, list, lru) { 1659 update_and_free_page(h, page, false); 1660 cond_resched(); 1661 } 1662 } 1663 1664 struct hstate *size_to_hstate(unsigned long size) 1665 { 1666 struct hstate *h; 1667 1668 for_each_hstate(h) { 1669 if (huge_page_size(h) == size) 1670 return h; 1671 } 1672 return NULL; 1673 } 1674 1675 void free_huge_page(struct page *page) 1676 { 1677 /* 1678 * Can't pass hstate in here because it is called from the 1679 * compound page destructor. 
1680 */ 1681 struct hstate *h = page_hstate(page); 1682 int nid = page_to_nid(page); 1683 struct hugepage_subpool *spool = hugetlb_page_subpool(page); 1684 bool restore_reserve; 1685 unsigned long flags; 1686 1687 VM_BUG_ON_PAGE(page_count(page), page); 1688 VM_BUG_ON_PAGE(page_mapcount(page), page); 1689 1690 hugetlb_set_page_subpool(page, NULL); 1691 if (PageAnon(page)) 1692 __ClearPageAnonExclusive(page); 1693 page->mapping = NULL; 1694 restore_reserve = HPageRestoreReserve(page); 1695 ClearHPageRestoreReserve(page); 1696 1697 /* 1698 * If HPageRestoreReserve was set on page, page allocation consumed a 1699 * reservation. If the page was associated with a subpool, there 1700 * would have been a page reserved in the subpool before allocation 1701 * via hugepage_subpool_get_pages(). Since we are 'restoring' the 1702 * reservation, do not call hugepage_subpool_put_pages() as this will 1703 * remove the reserved page from the subpool. 1704 */ 1705 if (!restore_reserve) { 1706 /* 1707 * A return code of zero implies that the subpool will be 1708 * under its minimum size if the reservation is not restored 1709 * after page is free. Therefore, force restore_reserve 1710 * operation. 1711 */ 1712 if (hugepage_subpool_put_pages(spool, 1) == 0) 1713 restore_reserve = true; 1714 } 1715 1716 spin_lock_irqsave(&hugetlb_lock, flags); 1717 ClearHPageMigratable(page); 1718 hugetlb_cgroup_uncharge_page(hstate_index(h), 1719 pages_per_huge_page(h), page); 1720 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), 1721 pages_per_huge_page(h), page); 1722 if (restore_reserve) 1723 h->resv_huge_pages++; 1724 1725 if (HPageTemporary(page)) { 1726 remove_hugetlb_page(h, page, false); 1727 spin_unlock_irqrestore(&hugetlb_lock, flags); 1728 update_and_free_page(h, page, true); 1729 } else if (h->surplus_huge_pages_node[nid]) { 1730 /* remove the page from active list */ 1731 remove_hugetlb_page(h, page, true); 1732 spin_unlock_irqrestore(&hugetlb_lock, flags); 1733 update_and_free_page(h, page, true); 1734 } else { 1735 arch_clear_hugepage_flags(page); 1736 enqueue_huge_page(h, page); 1737 spin_unlock_irqrestore(&hugetlb_lock, flags); 1738 } 1739 } 1740 1741 /* 1742 * Must be called with the hugetlb lock held 1743 */ 1744 static void __prep_account_new_huge_page(struct hstate *h, int nid) 1745 { 1746 lockdep_assert_held(&hugetlb_lock); 1747 h->nr_huge_pages++; 1748 h->nr_huge_pages_node[nid]++; 1749 } 1750 1751 static void __prep_new_huge_page(struct hstate *h, struct page *page) 1752 { 1753 hugetlb_vmemmap_optimize(h, page); 1754 INIT_LIST_HEAD(&page->lru); 1755 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); 1756 hugetlb_set_page_subpool(page, NULL); 1757 set_hugetlb_cgroup(page, NULL); 1758 set_hugetlb_cgroup_rsvd(page, NULL); 1759 } 1760 1761 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 1762 { 1763 __prep_new_huge_page(h, page); 1764 spin_lock_irq(&hugetlb_lock); 1765 __prep_account_new_huge_page(h, nid); 1766 spin_unlock_irq(&hugetlb_lock); 1767 } 1768 1769 static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, 1770 bool demote) 1771 { 1772 int i, j; 1773 int nr_pages = 1 << order; 1774 struct page *p = page + 1; 1775 1776 /* we rely on prep_new_huge_page to set the destructor */ 1777 set_compound_order(page, order); 1778 __ClearPageReserved(page); 1779 __SetPageHead(page); 1780 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1781 /* 1782 * For gigantic hugepages allocated through bootmem at 1783 * boot, it's safer to be consistent with 
the not-gigantic 1784 * hugepages and clear the PG_reserved bit from all tail pages 1785 * too. Otherwise drivers using get_user_pages() to access tail 1786 * pages may get the reference counting wrong if they see 1787 * PG_reserved set on a tail page (despite the head page not 1788 * having PG_reserved set). Enforcing this consistency between 1789 * head and tail pages allows drivers to optimize away a check 1790 * on the head page when they need know if put_page() is needed 1791 * after get_user_pages(). 1792 */ 1793 __ClearPageReserved(p); 1794 /* 1795 * Subtle and very unlikely 1796 * 1797 * Gigantic 'page allocators' such as memblock or cma will 1798 * return a set of pages with each page ref counted. We need 1799 * to turn this set of pages into a compound page with tail 1800 * page ref counts set to zero. Code such as speculative page 1801 * cache adding could take a ref on a 'to be' tail page. 1802 * We need to respect any increased ref count, and only set 1803 * the ref count to zero if count is currently 1. If count 1804 * is not 1, we return an error. An error return indicates 1805 * the set of pages can not be converted to a gigantic page. 1806 * The caller who allocated the pages should then discard the 1807 * pages using the appropriate free interface. 1808 * 1809 * In the case of demote, the ref count will be zero. 1810 */ 1811 if (!demote) { 1812 if (!page_ref_freeze(p, 1)) { 1813 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); 1814 goto out_error; 1815 } 1816 } else { 1817 VM_BUG_ON_PAGE(page_count(p), p); 1818 } 1819 set_compound_head(p, page); 1820 } 1821 atomic_set(compound_mapcount_ptr(page), -1); 1822 atomic_set(compound_pincount_ptr(page), 0); 1823 return true; 1824 1825 out_error: 1826 /* undo tail page modifications made above */ 1827 p = page + 1; 1828 for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { 1829 clear_compound_head(p); 1830 set_page_refcounted(p); 1831 } 1832 /* need to clear PG_reserved on remaining tail pages */ 1833 for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) 1834 __ClearPageReserved(p); 1835 set_compound_order(page, 0); 1836 #ifdef CONFIG_64BIT 1837 page[1].compound_nr = 0; 1838 #endif 1839 __ClearPageHead(page); 1840 return false; 1841 } 1842 1843 static bool prep_compound_gigantic_page(struct page *page, unsigned int order) 1844 { 1845 return __prep_compound_gigantic_page(page, order, false); 1846 } 1847 1848 static bool prep_compound_gigantic_page_for_demote(struct page *page, 1849 unsigned int order) 1850 { 1851 return __prep_compound_gigantic_page(page, order, true); 1852 } 1853 1854 /* 1855 * PageHuge() only returns true for hugetlbfs pages, but not for normal or 1856 * transparent huge pages. See the PageTransHuge() documentation for more 1857 * details. 1858 */ 1859 int PageHuge(struct page *page) 1860 { 1861 if (!PageCompound(page)) 1862 return 0; 1863 1864 page = compound_head(page); 1865 return page[1].compound_dtor == HUGETLB_PAGE_DTOR; 1866 } 1867 EXPORT_SYMBOL_GPL(PageHuge); 1868 1869 /* 1870 * PageHeadHuge() only returns true for hugetlbfs head page, but not for 1871 * normal or transparent huge pages. 1872 */ 1873 int PageHeadHuge(struct page *page_head) 1874 { 1875 if (!PageHead(page_head)) 1876 return 0; 1877 1878 return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; 1879 } 1880 EXPORT_SYMBOL_GPL(PageHeadHuge); 1881 1882 /* 1883 * Find and lock address space (mapping) in write mode. 1884 * 1885 * Upon entry, the page is locked which means that page_mapping() is 1886 * stable. 
Due to locking order, we can only trylock_write. If we can 1887 * not get the lock, simply return NULL to caller. 1888 */ 1889 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) 1890 { 1891 struct address_space *mapping = page_mapping(hpage); 1892 1893 if (!mapping) 1894 return mapping; 1895 1896 if (i_mmap_trylock_write(mapping)) 1897 return mapping; 1898 1899 return NULL; 1900 } 1901 1902 pgoff_t hugetlb_basepage_index(struct page *page) 1903 { 1904 struct page *page_head = compound_head(page); 1905 pgoff_t index = page_index(page_head); 1906 unsigned long compound_idx; 1907 1908 if (compound_order(page_head) >= MAX_ORDER) 1909 compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 1910 else 1911 compound_idx = page - page_head; 1912 1913 return (index << compound_order(page_head)) + compound_idx; 1914 } 1915 1916 static struct page *alloc_buddy_huge_page(struct hstate *h, 1917 gfp_t gfp_mask, int nid, nodemask_t *nmask, 1918 nodemask_t *node_alloc_noretry) 1919 { 1920 int order = huge_page_order(h); 1921 struct page *page; 1922 bool alloc_try_hard = true; 1923 1924 /* 1925 * By default we always try hard to allocate the page with 1926 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in 1927 * a loop (to adjust global huge page counts) and previous allocation 1928 * failed, do not continue to try hard on the same node. Use the 1929 * node_alloc_noretry bitmap to manage this state information. 1930 */ 1931 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) 1932 alloc_try_hard = false; 1933 gfp_mask |= __GFP_COMP|__GFP_NOWARN; 1934 if (alloc_try_hard) 1935 gfp_mask |= __GFP_RETRY_MAYFAIL; 1936 if (nid == NUMA_NO_NODE) 1937 nid = numa_mem_id(); 1938 page = __alloc_pages(gfp_mask, order, nid, nmask); 1939 if (page) 1940 __count_vm_event(HTLB_BUDDY_PGALLOC); 1941 else 1942 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1943 1944 /* 1945 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this 1946 * indicates an overall state change. Clear bit so that we resume 1947 * normal 'try hard' allocations. 1948 */ 1949 if (node_alloc_noretry && page && !alloc_try_hard) 1950 node_clear(nid, *node_alloc_noretry); 1951 1952 /* 1953 * If we tried hard to get a page but failed, set bit so that 1954 * subsequent attempts will not try as hard until there is an 1955 * overall state change. 1956 */ 1957 if (node_alloc_noretry && !page && alloc_try_hard) 1958 node_set(nid, *node_alloc_noretry); 1959 1960 return page; 1961 } 1962 1963 /* 1964 * Common helper to allocate a fresh hugetlb page. All specific allocators 1965 * should use this function to get new hugetlb pages 1966 */ 1967 static struct page *alloc_fresh_huge_page(struct hstate *h, 1968 gfp_t gfp_mask, int nid, nodemask_t *nmask, 1969 nodemask_t *node_alloc_noretry) 1970 { 1971 struct page *page; 1972 bool retry = false; 1973 1974 retry: 1975 if (hstate_is_gigantic(h)) 1976 page = alloc_gigantic_page(h, gfp_mask, nid, nmask); 1977 else 1978 page = alloc_buddy_huge_page(h, gfp_mask, 1979 nid, nmask, node_alloc_noretry); 1980 if (!page) 1981 return NULL; 1982 1983 if (hstate_is_gigantic(h)) { 1984 if (!prep_compound_gigantic_page(page, huge_page_order(h))) { 1985 /* 1986 * Rare failure to convert pages to compound page. 1987 * Free pages and try again - ONCE! 
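 * (The only failure mode is an unexpectedly elevated ref count on a
 * tail page; see __prep_compound_gigantic_page() above.)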
1988 */ 1989 free_gigantic_page(page, huge_page_order(h)); 1990 if (!retry) { 1991 retry = true; 1992 goto retry; 1993 } 1994 return NULL; 1995 } 1996 } 1997 prep_new_huge_page(h, page, page_to_nid(page)); 1998 1999 return page; 2000 } 2001 2002 /* 2003 * Allocates a fresh page to the hugetlb allocator pool in a node-interleaved 2004 * manner. 2005 */ 2006 static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 2007 nodemask_t *node_alloc_noretry) 2008 { 2009 struct page *page; 2010 int nr_nodes, node; 2011 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 2012 2013 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 2014 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, 2015 node_alloc_noretry); 2016 if (page) 2017 break; 2018 } 2019 2020 if (!page) 2021 return 0; 2022 2023 put_page(page); /* free it into the hugepage allocator */ 2024 2025 return 1; 2026 } 2027 2028 /* 2029 * Remove a huge page from the pool, from the next node to free. Attempt to keep 2030 * persistent huge pages more or less balanced over allowed nodes. 2031 * This routine only 'removes' the hugetlb page. The caller must make 2032 * an additional call to free the page to low level allocators. 2033 * Called with hugetlb_lock locked. 2034 */ 2035 static struct page *remove_pool_huge_page(struct hstate *h, 2036 nodemask_t *nodes_allowed, 2037 bool acct_surplus) 2038 { 2039 int nr_nodes, node; 2040 struct page *page = NULL; 2041 2042 lockdep_assert_held(&hugetlb_lock); 2043 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 2044 /* 2045 * If we're returning unused surplus pages, only examine 2046 * nodes with surplus pages. 2047 */ 2048 if ((!acct_surplus || h->surplus_huge_pages_node[node]) && 2049 !list_empty(&h->hugepage_freelists[node])) { 2050 page = list_entry(h->hugepage_freelists[node].next, 2051 struct page, lru); 2052 remove_hugetlb_page(h, page, acct_surplus); 2053 break; 2054 } 2055 } 2056 2057 return page; 2058 } 2059 2060 /* 2061 * Dissolve a given free hugepage into free buddy pages. This function does 2062 * nothing for in-use hugepages and non-hugepages. 2063 * This function returns the following values: 2064 * 2065 * -ENOMEM: failed to allocate the vmemmap pages needed to free the hugepage 2066 * when the system is under memory pressure and the feature of 2067 * freeing unused vmemmap pages associated with each hugetlb page 2068 * is enabled. 2069 * -EBUSY: failed to dissolve the free hugepage, or the hugepage is in use 2070 * (allocated or reserved). 2071 * 0: successfully dissolved the free hugepage, or the page is not a 2072 * hugepage (considered already dissolved) 2073 */ 2074 int dissolve_free_huge_page(struct page *page) 2075 { 2076 int rc = -EBUSY; 2077 2078 retry: 2079 /* Not to disrupt the normal path by vainly holding hugetlb_lock */ 2080 if (!PageHuge(page)) 2081 return 0; 2082 2083 spin_lock_irq(&hugetlb_lock); 2084 if (!PageHuge(page)) { 2085 rc = 0; 2086 goto out; 2087 } 2088 2089 if (!page_count(page)) { 2090 struct page *head = compound_head(page); 2091 struct hstate *h = page_hstate(head); 2092 if (h->free_huge_pages - h->resv_huge_pages == 0) 2093 goto out; 2094 2095 /* 2096 * We should make sure that the page is already on the free list 2097 * when it is dissolved. 2098 */ 2099 if (unlikely(!HPageFreed(head))) { 2100 spin_unlock_irq(&hugetlb_lock); 2101 cond_resched(); 2102 2103 /* 2104 * Theoretically, we should return -EBUSY when we 2105 * encounter this race.
In practice, though, the race window 2106 * is quite small and a retry has a good chance of dissolving 2107 * the page, so retrying here is a cheap way to increase the 2108 * success rate of dissolving the page. 2109 * 2110 */ 2111 goto retry; 2112 } 2113 2114 remove_hugetlb_page(h, head, false); 2115 h->max_huge_pages--; 2116 spin_unlock_irq(&hugetlb_lock); 2117 2118 /* 2119 * Normally update_and_free_page will allocate the required vmemmap 2120 * before freeing the page. update_and_free_page will fail to 2121 * free the page if it can not allocate the required vmemmap. We 2122 * need to adjust max_huge_pages if the page is not freed. 2123 * Attempt to allocate vmemmap here so that we can take 2124 * appropriate action on failure. 2125 */ 2126 rc = hugetlb_vmemmap_restore(h, head); 2127 if (!rc) { 2128 update_and_free_page(h, head, false); 2129 } else { 2130 spin_lock_irq(&hugetlb_lock); 2131 add_hugetlb_page(h, head, false); 2132 h->max_huge_pages++; 2133 spin_unlock_irq(&hugetlb_lock); 2134 } 2135 2136 return rc; 2137 } 2138 out: 2139 spin_unlock_irq(&hugetlb_lock); 2140 return rc; 2141 } 2142 2143 /* 2144 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to 2145 * make specified memory blocks removable from the system. 2146 * Note that this will dissolve a free gigantic hugepage completely, if any 2147 * part of it lies within the given range. 2148 * Also note that if dissolve_free_huge_page() returns with an error, all 2149 * free hugepages that were dissolved before that error are lost. 2150 */ 2151 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 2152 { 2153 unsigned long pfn; 2154 struct page *page; 2155 int rc = 0; 2156 unsigned int order; 2157 struct hstate *h; 2158 2159 if (!hugepages_supported()) 2160 return rc; 2161 2162 order = huge_page_order(&default_hstate); 2163 for_each_hstate(h) 2164 order = min(order, huge_page_order(h)); 2165 2166 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { 2167 page = pfn_to_page(pfn); 2168 rc = dissolve_free_huge_page(page); 2169 if (rc) 2170 break; 2171 } 2172 2173 return rc; 2174 } 2175 2176 /* 2177 * Allocates a fresh surplus page from the page allocator. 2178 */ 2179 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, 2180 int nid, nodemask_t *nmask, bool zero_ref) 2181 { 2182 struct page *page = NULL; 2183 bool retry = false; 2184 2185 if (hstate_is_gigantic(h)) 2186 return NULL; 2187 2188 spin_lock_irq(&hugetlb_lock); 2189 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) 2190 goto out_unlock; 2191 spin_unlock_irq(&hugetlb_lock); 2192 2193 retry: 2194 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 2195 if (!page) 2196 return NULL; 2197 2198 spin_lock_irq(&hugetlb_lock); 2199 /* 2200 * We could have raced with the pool size change. 2201 * Double check that and simply deallocate the new page 2202 * if we would end up overcommitting the surpluses. Abuse 2203 * the temporary page flag to work around the nasty free_huge_page 2204 * codeflow. 2205 */ 2206 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 2207 SetHPageTemporary(page); 2208 spin_unlock_irq(&hugetlb_lock); 2209 put_page(page); 2210 return NULL; 2211 } 2212 2213 if (zero_ref) { 2214 /* 2215 * Caller requires a page with zero ref count. 2216 * We will drop ref count here. If someone else is holding 2217 * a ref, the page will be freed when they drop it. Abuse 2218 * temporary page flag to accomplish this.
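 * If the ref count really is inflated, whoever drops the last
 * reference ends up in free_huge_page(), which sees HPageTemporary
 * and releases the page, just like the overcommit race above.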
2219 */ 2220 SetHPageTemporary(page); 2221 if (!put_page_testzero(page)) { 2222 /* 2223 * Unexpected inflated ref count on freshly allocated 2224 * huge. Retry once. 2225 */ 2226 pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); 2227 spin_unlock_irq(&hugetlb_lock); 2228 if (retry) 2229 return NULL; 2230 2231 retry = true; 2232 goto retry; 2233 } 2234 ClearHPageTemporary(page); 2235 } 2236 2237 h->surplus_huge_pages++; 2238 h->surplus_huge_pages_node[page_to_nid(page)]++; 2239 2240 out_unlock: 2241 spin_unlock_irq(&hugetlb_lock); 2242 2243 return page; 2244 } 2245 2246 static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 2247 int nid, nodemask_t *nmask) 2248 { 2249 struct page *page; 2250 2251 if (hstate_is_gigantic(h)) 2252 return NULL; 2253 2254 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 2255 if (!page) 2256 return NULL; 2257 2258 /* 2259 * We do not account these pages as surplus because they are only 2260 * temporary and will be released properly on the last reference 2261 */ 2262 SetHPageTemporary(page); 2263 2264 return page; 2265 } 2266 2267 /* 2268 * Use the VMA's mpolicy to allocate a huge page from the buddy. 2269 */ 2270 static 2271 struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, 2272 struct vm_area_struct *vma, unsigned long addr) 2273 { 2274 struct page *page = NULL; 2275 struct mempolicy *mpol; 2276 gfp_t gfp_mask = htlb_alloc_mask(h); 2277 int nid; 2278 nodemask_t *nodemask; 2279 2280 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 2281 if (mpol_is_preferred_many(mpol)) { 2282 gfp_t gfp = gfp_mask | __GFP_NOWARN; 2283 2284 gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2285 page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); 2286 2287 /* Fallback to all nodes if page==NULL */ 2288 nodemask = NULL; 2289 } 2290 2291 if (!page) 2292 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); 2293 mpol_cond_put(mpol); 2294 return page; 2295 } 2296 2297 /* page migration callback function */ 2298 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, 2299 nodemask_t *nmask, gfp_t gfp_mask) 2300 { 2301 spin_lock_irq(&hugetlb_lock); 2302 if (h->free_huge_pages - h->resv_huge_pages > 0) { 2303 struct page *page; 2304 2305 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); 2306 if (page) { 2307 spin_unlock_irq(&hugetlb_lock); 2308 return page; 2309 } 2310 } 2311 spin_unlock_irq(&hugetlb_lock); 2312 2313 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); 2314 } 2315 2316 /* mempolicy aware migration callback */ 2317 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, 2318 unsigned long address) 2319 { 2320 struct mempolicy *mpol; 2321 nodemask_t *nodemask; 2322 struct page *page; 2323 gfp_t gfp_mask; 2324 int node; 2325 2326 gfp_mask = htlb_alloc_mask(h); 2327 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 2328 page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); 2329 mpol_cond_put(mpol); 2330 2331 return page; 2332 } 2333 2334 /* 2335 * Increase the hugetlb pool such that it can accommodate a reservation 2336 * of size 'delta'. 
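 * In outline: work out how many pages are still needed beyond the
 * current free pool, allocate that many surplus pages with the lock
 * dropped, then retake the lock, recompute the shortfall (the counters
 * may have changed), enqueue what is needed and return any extras to
 * the buddy allocator.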
2337 */ 2338 static int gather_surplus_pages(struct hstate *h, long delta) 2339 __must_hold(&hugetlb_lock) 2340 { 2341 LIST_HEAD(surplus_list); 2342 struct page *page, *tmp; 2343 int ret; 2344 long i; 2345 long needed, allocated; 2346 bool alloc_ok = true; 2347 2348 lockdep_assert_held(&hugetlb_lock); 2349 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 2350 if (needed <= 0) { 2351 h->resv_huge_pages += delta; 2352 return 0; 2353 } 2354 2355 allocated = 0; 2356 2357 ret = -ENOMEM; 2358 retry: 2359 spin_unlock_irq(&hugetlb_lock); 2360 for (i = 0; i < needed; i++) { 2361 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), 2362 NUMA_NO_NODE, NULL, true); 2363 if (!page) { 2364 alloc_ok = false; 2365 break; 2366 } 2367 list_add(&page->lru, &surplus_list); 2368 cond_resched(); 2369 } 2370 allocated += i; 2371 2372 /* 2373 * After retaking hugetlb_lock, we need to recalculate 'needed' 2374 * because either resv_huge_pages or free_huge_pages may have changed. 2375 */ 2376 spin_lock_irq(&hugetlb_lock); 2377 needed = (h->resv_huge_pages + delta) - 2378 (h->free_huge_pages + allocated); 2379 if (needed > 0) { 2380 if (alloc_ok) 2381 goto retry; 2382 /* 2383 * We were not able to allocate enough pages to 2384 * satisfy the entire reservation so we free what 2385 * we've allocated so far. 2386 */ 2387 goto free; 2388 } 2389 /* 2390 * The surplus_list now contains _at_least_ the number of extra pages 2391 * needed to accommodate the reservation. Add the appropriate number 2392 * of pages to the hugetlb pool and free the extras back to the buddy 2393 * allocator. Commit the entire reservation here to prevent another 2394 * process from stealing the pages as they are added to the pool but 2395 * before they are reserved. 2396 */ 2397 needed += allocated; 2398 h->resv_huge_pages += delta; 2399 ret = 0; 2400 2401 /* Free the needed pages to the hugetlb pool */ 2402 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 2403 if ((--needed) < 0) 2404 break; 2405 /* Add the page to the hugetlb allocator */ 2406 enqueue_huge_page(h, page); 2407 } 2408 free: 2409 spin_unlock_irq(&hugetlb_lock); 2410 2411 /* 2412 * Free unnecessary surplus pages to the buddy allocator. 2413 * Pages have no ref count, call free_huge_page directly. 2414 */ 2415 list_for_each_entry_safe(page, tmp, &surplus_list, lru) 2416 free_huge_page(page); 2417 spin_lock_irq(&hugetlb_lock); 2418 2419 return ret; 2420 } 2421 2422 /* 2423 * This routine has two main purposes: 2424 * 1) Decrement the reservation count (resv_huge_pages) by the value passed 2425 * in unused_resv_pages. This corresponds to the prior adjustments made 2426 * to the associated reservation map. 2427 * 2) Free any unused surplus pages that may have been allocated to satisfy 2428 * the reservation. As many as unused_resv_pages may be freed. 2429 */ 2430 static void return_unused_surplus_pages(struct hstate *h, 2431 unsigned long unused_resv_pages) 2432 { 2433 unsigned long nr_pages; 2434 struct page *page; 2435 LIST_HEAD(page_list); 2436 2437 lockdep_assert_held(&hugetlb_lock); 2438 /* Uncommit the reservation */ 2439 h->resv_huge_pages -= unused_resv_pages; 2440 2441 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 2442 goto out; 2443 2444 /* 2445 * Part (or even all) of the reservation could have been backed 2446 * by pre-allocated pages. Only free surplus pages. 
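 * Illustrative example: if unused_resv_pages is 5 but only 3 surplus
 * pages exist, at most 3 pages are handed back to the buddy allocator;
 * the rest of the reservation is simply uncommitted above.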
2447 */ 2448 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 2449 2450 /* 2451 * We want to release as many surplus pages as possible, spread 2452 * evenly across all nodes with memory. Iterate across these nodes 2453 * until we can no longer free unreserved surplus pages. This occurs 2454 * when the nodes with surplus pages have no free pages. 2455 * remove_pool_huge_page() will balance the freed pages across the 2456 * on-line nodes with memory and will handle the hstate accounting. 2457 */ 2458 while (nr_pages--) { 2459 page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); 2460 if (!page) 2461 goto out; 2462 2463 list_add(&page->lru, &page_list); 2464 } 2465 2466 out: 2467 spin_unlock_irq(&hugetlb_lock); 2468 update_and_free_pages_bulk(h, &page_list); 2469 spin_lock_irq(&hugetlb_lock); 2470 } 2471 2472 2473 /* 2474 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation 2475 * are used by the huge page allocation routines to manage reservations. 2476 * 2477 * vma_needs_reservation is called to determine if the huge page at addr 2478 * within the vma has an associated reservation. If a reservation is 2479 * needed, the value 1 is returned. The caller is then responsible for 2480 * managing the global reservation and subpool usage counts. After 2481 * the huge page has been allocated, vma_commit_reservation is called 2482 * to add the page to the reservation map. If the page allocation fails, 2483 * the reservation must be ended instead of committed. vma_end_reservation 2484 * is called in such cases. 2485 * 2486 * In the normal case, vma_commit_reservation returns the same value 2487 * as the preceding vma_needs_reservation call. The only time this 2488 * is not the case is if a reserve map was changed between calls. It 2489 * is the responsibility of the caller to notice the difference and 2490 * take appropriate action. 2491 * 2492 * vma_add_reservation is used in error paths where a reservation must 2493 * be restored when a newly allocated huge page must be freed. It is 2494 * to be called after calling vma_needs_reservation to determine if a 2495 * reservation exists. 2496 * 2497 * vma_del_reservation is used in error paths where an entry in the reserve 2498 * map was created during huge page allocation and must be removed. It is to 2499 * be called after calling vma_needs_reservation to determine if a reservation 2500 * exists. 2501 */ 2502 enum vma_resv_mode { 2503 VMA_NEEDS_RESV, 2504 VMA_COMMIT_RESV, 2505 VMA_END_RESV, 2506 VMA_ADD_RESV, 2507 VMA_DEL_RESV, 2508 }; 2509 static long __vma_reservation_common(struct hstate *h, 2510 struct vm_area_struct *vma, unsigned long addr, 2511 enum vma_resv_mode mode) 2512 { 2513 struct resv_map *resv; 2514 pgoff_t idx; 2515 long ret; 2516 long dummy_out_regions_needed; 2517 2518 resv = vma_resv_map(vma); 2519 if (!resv) 2520 return 1; 2521 2522 idx = vma_hugecache_offset(h, vma, addr); 2523 switch (mode) { 2524 case VMA_NEEDS_RESV: 2525 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 2526 /* We assume that vma_reservation_* routines always operate on 2527 * 1 page, and that adding to resv map a 1 page entry can only 2528 * ever require 1 region. 2529 */ 2530 VM_BUG_ON(dummy_out_regions_needed != 1); 2531 break; 2532 case VMA_COMMIT_RESV: 2533 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2534 /* region_add calls of range 1 should never fail. 
*/ 2535 VM_BUG_ON(ret < 0); 2536 break; 2537 case VMA_END_RESV: 2538 region_abort(resv, idx, idx + 1, 1); 2539 ret = 0; 2540 break; 2541 case VMA_ADD_RESV: 2542 if (vma->vm_flags & VM_MAYSHARE) { 2543 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2544 /* region_add calls of range 1 should never fail. */ 2545 VM_BUG_ON(ret < 0); 2546 } else { 2547 region_abort(resv, idx, idx + 1, 1); 2548 ret = region_del(resv, idx, idx + 1); 2549 } 2550 break; 2551 case VMA_DEL_RESV: 2552 if (vma->vm_flags & VM_MAYSHARE) { 2553 region_abort(resv, idx, idx + 1, 1); 2554 ret = region_del(resv, idx, idx + 1); 2555 } else { 2556 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2557 /* region_add calls of range 1 should never fail. */ 2558 VM_BUG_ON(ret < 0); 2559 } 2560 break; 2561 default: 2562 BUG(); 2563 } 2564 2565 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) 2566 return ret; 2567 /* 2568 * We know private mapping must have HPAGE_RESV_OWNER set. 2569 * 2570 * In most cases, reserves always exist for private mappings. 2571 * However, a file associated with mapping could have been 2572 * hole punched or truncated after reserves were consumed. 2573 * As subsequent fault on such a range will not use reserves. 2574 * Subtle - The reserve map for private mappings has the 2575 * opposite meaning than that of shared mappings. If NO 2576 * entry is in the reserve map, it means a reservation exists. 2577 * If an entry exists in the reserve map, it means the 2578 * reservation has already been consumed. As a result, the 2579 * return value of this routine is the opposite of the 2580 * value returned from reserve map manipulation routines above. 2581 */ 2582 if (ret > 0) 2583 return 0; 2584 if (ret == 0) 2585 return 1; 2586 return ret; 2587 } 2588 2589 static long vma_needs_reservation(struct hstate *h, 2590 struct vm_area_struct *vma, unsigned long addr) 2591 { 2592 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 2593 } 2594 2595 static long vma_commit_reservation(struct hstate *h, 2596 struct vm_area_struct *vma, unsigned long addr) 2597 { 2598 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 2599 } 2600 2601 static void vma_end_reservation(struct hstate *h, 2602 struct vm_area_struct *vma, unsigned long addr) 2603 { 2604 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 2605 } 2606 2607 static long vma_add_reservation(struct hstate *h, 2608 struct vm_area_struct *vma, unsigned long addr) 2609 { 2610 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 2611 } 2612 2613 static long vma_del_reservation(struct hstate *h, 2614 struct vm_area_struct *vma, unsigned long addr) 2615 { 2616 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); 2617 } 2618 2619 /* 2620 * This routine is called to restore reservation information on error paths. 2621 * It should ONLY be called for pages allocated via alloc_huge_page(), and 2622 * the hugetlb mutex should remain held when calling this routine. 2623 * 2624 * It handles two specific cases: 2625 * 1) A reservation was in place and the page consumed the reservation. 2626 * HPageRestoreReserve is set in the page. 2627 * 2) No reservation was in place for the page, so HPageRestoreReserve is 2628 * not set. However, alloc_huge_page always updates the reserve map. 2629 * 2630 * In case 1, free_huge_page later in the error path will increment the 2631 * global reserve count. But, free_huge_page does not have enough context 2632 * to adjust the reservation map. 
This case deals primarily with private 2633 * mappings. Adjust the reserve map here to be consistent with global 2634 * reserve count adjustments to be made by free_huge_page. Make sure the 2635 * reserve map indicates there is a reservation present. 2636 * 2637 * In case 2, simply undo reserve map modifications done by alloc_huge_page. 2638 */ 2639 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, 2640 unsigned long address, struct page *page) 2641 { 2642 long rc = vma_needs_reservation(h, vma, address); 2643 2644 if (HPageRestoreReserve(page)) { 2645 if (unlikely(rc < 0)) 2646 /* 2647 * Rare out of memory condition in reserve map 2648 * manipulation. Clear HPageRestoreReserve so that 2649 * global reserve count will not be incremented 2650 * by free_huge_page. This will make it appear 2651 * as though the reservation for this page was 2652 * consumed. This may prevent the task from 2653 * faulting in the page at a later time. This 2654 * is better than inconsistent global huge page 2655 * accounting of reserve counts. 2656 */ 2657 ClearHPageRestoreReserve(page); 2658 else if (rc) 2659 (void)vma_add_reservation(h, vma, address); 2660 else 2661 vma_end_reservation(h, vma, address); 2662 } else { 2663 if (!rc) { 2664 /* 2665 * This indicates there is an entry in the reserve map 2666 * not added by alloc_huge_page. We know it was added 2667 * before the alloc_huge_page call, otherwise 2668 * HPageRestoreReserve would be set on the page. 2669 * Remove the entry so that a subsequent allocation 2670 * does not consume a reservation. 2671 */ 2672 rc = vma_del_reservation(h, vma, address); 2673 if (rc < 0) 2674 /* 2675 * VERY rare out of memory condition. Since 2676 * we can not delete the entry, set 2677 * HPageRestoreReserve so that the reserve 2678 * count will be incremented when the page 2679 * is freed. This reserve will be consumed 2680 * on a subsequent allocation. 2681 */ 2682 SetHPageRestoreReserve(page); 2683 } else if (rc < 0) { 2684 /* 2685 * Rare out of memory condition from 2686 * vma_needs_reservation call. Memory allocation is 2687 * only attempted if a new entry is needed. Therefore, 2688 * this implies there is not an entry in the 2689 * reserve map. 2690 * 2691 * For shared mappings, no entry in the map indicates 2692 * no reservation. We are done. 2693 */ 2694 if (!(vma->vm_flags & VM_MAYSHARE)) 2695 /* 2696 * For private mappings, no entry indicates 2697 * a reservation is present. Since we can 2698 * not add an entry, set SetHPageRestoreReserve 2699 * on the page so reserve count will be 2700 * incremented when freed. This reserve will 2701 * be consumed on a subsequent allocation. 2702 */ 2703 SetHPageRestoreReserve(page); 2704 } else 2705 /* 2706 * No reservation present, do nothing 2707 */ 2708 vma_end_reservation(h, vma, address); 2709 } 2710 } 2711 2712 /* 2713 * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one 2714 * @h: struct hstate old page belongs to 2715 * @old_page: Old page to dissolve 2716 * @list: List to isolate the page in case we need to 2717 * Returns 0 on success, otherwise negated error. 2718 */ 2719 static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, 2720 struct list_head *list) 2721 { 2722 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 2723 int nid = page_to_nid(old_page); 2724 bool alloc_retry = false; 2725 struct page *new_page; 2726 int ret = 0; 2727 2728 /* 2729 * Before dissolving the page, we need to allocate a new one for the 2730 * pool to remain stable. 
Here, we allocate the page and 'prep' it 2731 * by doing everything but actually updating counters and adding to 2732 * the pool. This simplifies and let us do most of the processing 2733 * under the lock. 2734 */ 2735 alloc_retry: 2736 new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); 2737 if (!new_page) 2738 return -ENOMEM; 2739 /* 2740 * If all goes well, this page will be directly added to the free 2741 * list in the pool. For this the ref count needs to be zero. 2742 * Attempt to drop now, and retry once if needed. It is VERY 2743 * unlikely there is another ref on the page. 2744 * 2745 * If someone else has a reference to the page, it will be freed 2746 * when they drop their ref. Abuse temporary page flag to accomplish 2747 * this. Retry once if there is an inflated ref count. 2748 */ 2749 SetHPageTemporary(new_page); 2750 if (!put_page_testzero(new_page)) { 2751 if (alloc_retry) 2752 return -EBUSY; 2753 2754 alloc_retry = true; 2755 goto alloc_retry; 2756 } 2757 ClearHPageTemporary(new_page); 2758 2759 __prep_new_huge_page(h, new_page); 2760 2761 retry: 2762 spin_lock_irq(&hugetlb_lock); 2763 if (!PageHuge(old_page)) { 2764 /* 2765 * Freed from under us. Drop new_page too. 2766 */ 2767 goto free_new; 2768 } else if (page_count(old_page)) { 2769 /* 2770 * Someone has grabbed the page, try to isolate it here. 2771 * Fail with -EBUSY if not possible. 2772 */ 2773 spin_unlock_irq(&hugetlb_lock); 2774 ret = isolate_hugetlb(old_page, list); 2775 spin_lock_irq(&hugetlb_lock); 2776 goto free_new; 2777 } else if (!HPageFreed(old_page)) { 2778 /* 2779 * Page's refcount is 0 but it has not been enqueued in the 2780 * freelist yet. Race window is small, so we can succeed here if 2781 * we retry. 2782 */ 2783 spin_unlock_irq(&hugetlb_lock); 2784 cond_resched(); 2785 goto retry; 2786 } else { 2787 /* 2788 * Ok, old_page is still a genuine free hugepage. Remove it from 2789 * the freelist and decrease the counters. These will be 2790 * incremented again when calling __prep_account_new_huge_page() 2791 * and enqueue_huge_page() for new_page. The counters will remain 2792 * stable since this happens under the lock. 2793 */ 2794 remove_hugetlb_page(h, old_page, false); 2795 2796 /* 2797 * Ref count on new page is already zero as it was dropped 2798 * earlier. It can be directly added to the pool free list. 2799 */ 2800 __prep_account_new_huge_page(h, nid); 2801 enqueue_huge_page(h, new_page); 2802 2803 /* 2804 * Pages have been replaced, we can safely free the old one. 2805 */ 2806 spin_unlock_irq(&hugetlb_lock); 2807 update_and_free_page(h, old_page, false); 2808 } 2809 2810 return ret; 2811 2812 free_new: 2813 spin_unlock_irq(&hugetlb_lock); 2814 /* Page has a zero ref count, but needs a ref to be freed */ 2815 set_page_refcounted(new_page); 2816 update_and_free_page(h, new_page, false); 2817 2818 return ret; 2819 } 2820 2821 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) 2822 { 2823 struct hstate *h; 2824 struct page *head; 2825 int ret = -EBUSY; 2826 2827 /* 2828 * The page might have been dissolved from under our feet, so make sure 2829 * to carefully check the state under the lock. 2830 * Return success when racing as if we dissolved the page ourselves. 
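 * Returns 0 if the page was isolated, replaced or already gone, -EBUSY
 * if neither was possible, and -ENOMEM for gigantic pages or when a
 * replacement page cannot be allocated.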
2831 */ 2832 spin_lock_irq(&hugetlb_lock); 2833 if (PageHuge(page)) { 2834 head = compound_head(page); 2835 h = page_hstate(head); 2836 } else { 2837 spin_unlock_irq(&hugetlb_lock); 2838 return 0; 2839 } 2840 spin_unlock_irq(&hugetlb_lock); 2841 2842 /* 2843 * Fence off gigantic pages as there is a cyclic dependency between 2844 * alloc_contig_range and them. Return -ENOMEM as this has the effect 2845 * of bailing out right away without further retrying. 2846 */ 2847 if (hstate_is_gigantic(h)) 2848 return -ENOMEM; 2849 2850 if (page_count(head) && !isolate_hugetlb(head, list)) 2851 ret = 0; 2852 else if (!page_count(head)) 2853 ret = alloc_and_dissolve_huge_page(h, head, list); 2854 2855 return ret; 2856 } 2857 2858 struct page *alloc_huge_page(struct vm_area_struct *vma, 2859 unsigned long addr, int avoid_reserve) 2860 { 2861 struct hugepage_subpool *spool = subpool_vma(vma); 2862 struct hstate *h = hstate_vma(vma); 2863 struct page *page; 2864 long map_chg, map_commit; 2865 long gbl_chg; 2866 int ret, idx; 2867 struct hugetlb_cgroup *h_cg; 2868 bool deferred_reserve; 2869 2870 idx = hstate_index(h); 2871 /* 2872 * Examine the region/reserve map to determine if the process 2873 * has a reservation for the page to be allocated. A return 2874 * code of zero indicates a reservation exists (no change). 2875 */ 2876 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 2877 if (map_chg < 0) 2878 return ERR_PTR(-ENOMEM); 2879 2880 /* 2881 * Processes that did not create the mapping will have no 2882 * reserves as indicated by the region/reserve map. Check 2883 * that the allocation will not exceed the subpool limit. 2884 * Allocations for MAP_NORESERVE mappings also need to be 2885 * checked against any subpool limit. 2886 */ 2887 if (map_chg || avoid_reserve) { 2888 gbl_chg = hugepage_subpool_get_pages(spool, 1); 2889 if (gbl_chg < 0) { 2890 vma_end_reservation(h, vma, addr); 2891 return ERR_PTR(-ENOSPC); 2892 } 2893 2894 /* 2895 * Even though there was no reservation in the region/reserve 2896 * map, there could be reservations associated with the 2897 * subpool that can be used. This would be indicated if the 2898 * return value of hugepage_subpool_get_pages() is zero. 2899 * However, if avoid_reserve is specified we still avoid even 2900 * the subpool reservations. 2901 */ 2902 if (avoid_reserve) 2903 gbl_chg = 1; 2904 } 2905 2906 /* If this allocation is not consuming a reservation, charge it now. 2907 */ 2908 deferred_reserve = map_chg || avoid_reserve; 2909 if (deferred_reserve) { 2910 ret = hugetlb_cgroup_charge_cgroup_rsvd( 2911 idx, pages_per_huge_page(h), &h_cg); 2912 if (ret) 2913 goto out_subpool_put; 2914 } 2915 2916 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 2917 if (ret) 2918 goto out_uncharge_cgroup_reservation; 2919 2920 spin_lock_irq(&hugetlb_lock); 2921 /* 2922 * glb_chg is passed to indicate whether or not a page must be taken 2923 * from the global free pool (global change). gbl_chg == 0 indicates 2924 * a reservation exists for the allocation. 
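 * A non-zero gbl_chg (no usable reservation, or avoid_reserve) means the
 * page must come from the free pool proper or, failing that, from a
 * fresh surplus allocation below.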
2925 */ 2926 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); 2927 if (!page) { 2928 spin_unlock_irq(&hugetlb_lock); 2929 page = alloc_buddy_huge_page_with_mpol(h, vma, addr); 2930 if (!page) 2931 goto out_uncharge_cgroup; 2932 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 2933 SetHPageRestoreReserve(page); 2934 h->resv_huge_pages--; 2935 } 2936 spin_lock_irq(&hugetlb_lock); 2937 list_add(&page->lru, &h->hugepage_activelist); 2938 /* Fall through */ 2939 } 2940 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 2941 /* If allocation is not consuming a reservation, also store the 2942 * hugetlb_cgroup pointer on the page. 2943 */ 2944 if (deferred_reserve) { 2945 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 2946 h_cg, page); 2947 } 2948 2949 spin_unlock_irq(&hugetlb_lock); 2950 2951 hugetlb_set_page_subpool(page, spool); 2952 2953 map_commit = vma_commit_reservation(h, vma, addr); 2954 if (unlikely(map_chg > map_commit)) { 2955 /* 2956 * The page was added to the reservation map between 2957 * vma_needs_reservation and vma_commit_reservation. 2958 * This indicates a race with hugetlb_reserve_pages. 2959 * Adjust for the subpool count incremented above AND 2960 * in hugetlb_reserve_pages for the same page. Also, 2961 * the reservation count added in hugetlb_reserve_pages 2962 * no longer applies. 2963 */ 2964 long rsv_adjust; 2965 2966 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 2967 hugetlb_acct_memory(h, -rsv_adjust); 2968 if (deferred_reserve) 2969 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), 2970 pages_per_huge_page(h), page); 2971 } 2972 return page; 2973 2974 out_uncharge_cgroup: 2975 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 2976 out_uncharge_cgroup_reservation: 2977 if (deferred_reserve) 2978 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 2979 h_cg); 2980 out_subpool_put: 2981 if (map_chg || avoid_reserve) 2982 hugepage_subpool_put_pages(spool, 1); 2983 vma_end_reservation(h, vma, addr); 2984 return ERR_PTR(-ENOSPC); 2985 } 2986 2987 int alloc_bootmem_huge_page(struct hstate *h, int nid) 2988 __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); 2989 int __alloc_bootmem_huge_page(struct hstate *h, int nid) 2990 { 2991 struct huge_bootmem_page *m = NULL; /* initialize for clang */ 2992 int nr_nodes, node; 2993 2994 /* do node specific alloc */ 2995 if (nid != NUMA_NO_NODE) { 2996 m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), 2997 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 2998 if (!m) 2999 return 0; 3000 goto found; 3001 } 3002 /* allocate from next node when distributing huge pages */ 3003 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 3004 m = memblock_alloc_try_nid_raw( 3005 huge_page_size(h), huge_page_size(h), 3006 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); 3007 /* 3008 * Use the beginning of the huge page to store the 3009 * huge_bootmem_page struct (until gather_bootmem 3010 * puts them into the mem_map). 3011 */ 3012 if (!m) 3013 return 0; 3014 goto found; 3015 } 3016 3017 found: 3018 /* Put them into a private list first because mem_map is not up yet */ 3019 INIT_LIST_HEAD(&m->list); 3020 list_add(&m->list, &huge_boot_pages); 3021 m->hstate = h; 3022 return 1; 3023 } 3024 3025 /* 3026 * Put bootmem huge pages into the standard lists after mem_map is up. 3027 * Note: This only applies to gigantic (order > MAX_ORDER) pages. 
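 * For example, booting with "hugepagesz=1G hugepages=4" (where the
 * architecture supports 1 GB pages) makes __alloc_bootmem_huge_page()
 * reserve four such ranges from memblock; this function later turns
 * them into proper hugetlb pages and feeds them to the allocator.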
3028 */ 3029 static void __init gather_bootmem_prealloc(void) 3030 { 3031 struct huge_bootmem_page *m; 3032 3033 list_for_each_entry(m, &huge_boot_pages, list) { 3034 struct page *page = virt_to_page(m); 3035 struct hstate *h = m->hstate; 3036 3037 VM_BUG_ON(!hstate_is_gigantic(h)); 3038 WARN_ON(page_count(page) != 1); 3039 if (prep_compound_gigantic_page(page, huge_page_order(h))) { 3040 WARN_ON(PageReserved(page)); 3041 prep_new_huge_page(h, page, page_to_nid(page)); 3042 put_page(page); /* add to the hugepage allocator */ 3043 } else { 3044 /* VERY unlikely inflated ref count on a tail page */ 3045 free_gigantic_page(page, huge_page_order(h)); 3046 } 3047 3048 /* 3049 * We need to restore the 'stolen' pages to totalram_pages 3050 * in order to fix confusing memory reports from free(1) and 3051 * other side-effects, like CommitLimit going negative. 3052 */ 3053 adjust_managed_page_count(page, pages_per_huge_page(h)); 3054 cond_resched(); 3055 } 3056 } 3057 static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) 3058 { 3059 unsigned long i; 3060 char buf[32]; 3061 3062 for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { 3063 if (hstate_is_gigantic(h)) { 3064 if (!alloc_bootmem_huge_page(h, nid)) 3065 break; 3066 } else { 3067 struct page *page; 3068 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 3069 3070 page = alloc_fresh_huge_page(h, gfp_mask, nid, 3071 &node_states[N_MEMORY], NULL); 3072 if (!page) 3073 break; 3074 put_page(page); /* free it into the hugepage allocator */ 3075 } 3076 cond_resched(); 3077 } 3078 if (i == h->max_huge_pages_node[nid]) 3079 return; 3080 3081 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 3082 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", 3083 h->max_huge_pages_node[nid], buf, nid, i); 3084 h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); 3085 h->max_huge_pages_node[nid] = i; 3086 } 3087 3088 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 3089 { 3090 unsigned long i; 3091 nodemask_t *node_alloc_noretry; 3092 bool node_specific_alloc = false; 3093 3094 /* skip gigantic hugepages allocation if hugetlb_cma enabled */ 3095 if (hstate_is_gigantic(h) && hugetlb_cma_size) { 3096 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); 3097 return; 3098 } 3099 3100 /* do node specific alloc */ 3101 for_each_online_node(i) { 3102 if (h->max_huge_pages_node[i] > 0) { 3103 hugetlb_hstate_alloc_pages_onenode(h, i); 3104 node_specific_alloc = true; 3105 } 3106 } 3107 3108 if (node_specific_alloc) 3109 return; 3110 3111 /* below will do all node balanced alloc */ 3112 if (!hstate_is_gigantic(h)) { 3113 /* 3114 * Bit mask controlling how hard we retry per-node allocations. 3115 * Ignore errors as lower level routines can deal with 3116 * node_alloc_noretry == NULL. If this kmalloc fails at boot 3117 * time, we are likely in bigger trouble. 
3118 */ 3119 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 3120 GFP_KERNEL); 3121 } else { 3122 /* allocations done at boot time */ 3123 node_alloc_noretry = NULL; 3124 } 3125 3126 /* bit mask controlling how hard we retry per-node allocations */ 3127 if (node_alloc_noretry) 3128 nodes_clear(*node_alloc_noretry); 3129 3130 for (i = 0; i < h->max_huge_pages; ++i) { 3131 if (hstate_is_gigantic(h)) { 3132 if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) 3133 break; 3134 } else if (!alloc_pool_huge_page(h, 3135 &node_states[N_MEMORY], 3136 node_alloc_noretry)) 3137 break; 3138 cond_resched(); 3139 } 3140 if (i < h->max_huge_pages) { 3141 char buf[32]; 3142 3143 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 3144 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", 3145 h->max_huge_pages, buf, i); 3146 h->max_huge_pages = i; 3147 } 3148 kfree(node_alloc_noretry); 3149 } 3150 3151 static void __init hugetlb_init_hstates(void) 3152 { 3153 struct hstate *h, *h2; 3154 3155 for_each_hstate(h) { 3156 /* oversize hugepages were init'ed in early boot */ 3157 if (!hstate_is_gigantic(h)) 3158 hugetlb_hstate_alloc_pages(h); 3159 3160 /* 3161 * Set demote order for each hstate. Note that 3162 * h->demote_order is initially 0. 3163 * - We can not demote gigantic pages if runtime freeing 3164 * is not supported, so skip this. 3165 * - If CMA allocation is possible, we can not demote 3166 * HUGETLB_PAGE_ORDER or smaller size pages. 3167 */ 3168 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 3169 continue; 3170 if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) 3171 continue; 3172 for_each_hstate(h2) { 3173 if (h2 == h) 3174 continue; 3175 if (h2->order < h->order && 3176 h2->order > h->demote_order) 3177 h->demote_order = h2->order; 3178 } 3179 } 3180 } 3181 3182 static void __init report_hugepages(void) 3183 { 3184 struct hstate *h; 3185 3186 for_each_hstate(h) { 3187 char buf[32]; 3188 3189 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 3190 pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", 3191 buf, h->free_huge_pages); 3192 pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", 3193 hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); 3194 } 3195 } 3196 3197 #ifdef CONFIG_HIGHMEM 3198 static void try_to_free_low(struct hstate *h, unsigned long count, 3199 nodemask_t *nodes_allowed) 3200 { 3201 int i; 3202 LIST_HEAD(page_list); 3203 3204 lockdep_assert_held(&hugetlb_lock); 3205 if (hstate_is_gigantic(h)) 3206 return; 3207 3208 /* 3209 * Collect pages to be freed on a list, and free after dropping lock 3210 */ 3211 for_each_node_mask(i, *nodes_allowed) { 3212 struct page *page, *next; 3213 struct list_head *freel = &h->hugepage_freelists[i]; 3214 list_for_each_entry_safe(page, next, freel, lru) { 3215 if (count >= h->nr_huge_pages) 3216 goto out; 3217 if (PageHighMem(page)) 3218 continue; 3219 remove_hugetlb_page(h, page, false); 3220 list_add(&page->lru, &page_list); 3221 } 3222 } 3223 3224 out: 3225 spin_unlock_irq(&hugetlb_lock); 3226 update_and_free_pages_bulk(h, &page_list); 3227 spin_lock_irq(&hugetlb_lock); 3228 } 3229 #else 3230 static inline void try_to_free_low(struct hstate *h, unsigned long count, 3231 nodemask_t *nodes_allowed) 3232 { 3233 } 3234 #endif 3235 3236 /* 3237 * Increment or decrement surplus_huge_pages. Keep node-specific counters 3238 * balanced by operating on them in a round-robin fashion. 3239 * Returns 1 if an adjustment was made. 
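 * delta must be -1 or +1 (enforced by the VM_BUG_ON() below); a return
 * of 0 means no node in nodes_allowed could take the adjustment.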
3240 */ 3241 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 3242 int delta) 3243 { 3244 int nr_nodes, node; 3245 3246 lockdep_assert_held(&hugetlb_lock); 3247 VM_BUG_ON(delta != -1 && delta != 1); 3248 3249 if (delta < 0) { 3250 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 3251 if (h->surplus_huge_pages_node[node]) 3252 goto found; 3253 } 3254 } else { 3255 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 3256 if (h->surplus_huge_pages_node[node] < 3257 h->nr_huge_pages_node[node]) 3258 goto found; 3259 } 3260 } 3261 return 0; 3262 3263 found: 3264 h->surplus_huge_pages += delta; 3265 h->surplus_huge_pages_node[node] += delta; 3266 return 1; 3267 } 3268 3269 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 3270 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 3271 nodemask_t *nodes_allowed) 3272 { 3273 unsigned long min_count, ret; 3274 struct page *page; 3275 LIST_HEAD(page_list); 3276 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 3277 3278 /* 3279 * Bit mask controlling how hard we retry per-node allocations. 3280 * If we can not allocate the bit mask, do not attempt to allocate 3281 * the requested huge pages. 3282 */ 3283 if (node_alloc_noretry) 3284 nodes_clear(*node_alloc_noretry); 3285 else 3286 return -ENOMEM; 3287 3288 /* 3289 * resize_lock mutex prevents concurrent adjustments to number of 3290 * pages in hstate via the proc/sysfs interfaces. 3291 */ 3292 mutex_lock(&h->resize_lock); 3293 flush_free_hpage_work(h); 3294 spin_lock_irq(&hugetlb_lock); 3295 3296 /* 3297 * Check for a node specific request. 3298 * Changing node specific huge page count may require a corresponding 3299 * change to the global count. In any case, the passed node mask 3300 * (nodes_allowed) will restrict alloc/free to the specified node. 3301 */ 3302 if (nid != NUMA_NO_NODE) { 3303 unsigned long old_count = count; 3304 3305 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 3306 /* 3307 * User may have specified a large count value which caused the 3308 * above calculation to overflow. In this case, they wanted 3309 * to allocate as many huge pages as possible. Set count to 3310 * largest possible value to align with their intention. 3311 */ 3312 if (count < old_count) 3313 count = ULONG_MAX; 3314 } 3315 3316 /* 3317 * Gigantic pages runtime allocation depend on the capability for large 3318 * page range allocation. 3319 * If the system does not provide this feature, return an error when 3320 * the user tries to allocate gigantic pages but let the user free the 3321 * boottime allocated gigantic pages. 3322 */ 3323 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 3324 if (count > persistent_huge_pages(h)) { 3325 spin_unlock_irq(&hugetlb_lock); 3326 mutex_unlock(&h->resize_lock); 3327 NODEMASK_FREE(node_alloc_noretry); 3328 return -EINVAL; 3329 } 3330 /* Fall through to decrease pool */ 3331 } 3332 3333 /* 3334 * Increase the pool size 3335 * First take pages out of surplus state. Then make up the 3336 * remaining difference by allocating fresh huge pages. 3337 * 3338 * We might race with alloc_surplus_huge_page() here and be unable 3339 * to convert a surplus huge page to a normal huge page. That is 3340 * not critical, though, it just means the overall size of the 3341 * pool might be one hugepage larger than it needs to be, but 3342 * within all the constraints specified by the sysctls. 
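 * Illustrative example: with nr_huge_pages = 10 of which 2 are surplus
 * (8 persistent), raising the target to 12 first converts the 2 surplus
 * pages into persistent ones and then allocates 2 fresh huge pages.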
3343 */ 3344 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 3345 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 3346 break; 3347 } 3348 3349 while (count > persistent_huge_pages(h)) { 3350 /* 3351 * If this allocation races such that we no longer need the 3352 * page, free_huge_page will handle it by freeing the page 3353 * and reducing the surplus. 3354 */ 3355 spin_unlock_irq(&hugetlb_lock); 3356 3357 /* yield cpu to avoid soft lockup */ 3358 cond_resched(); 3359 3360 ret = alloc_pool_huge_page(h, nodes_allowed, 3361 node_alloc_noretry); 3362 spin_lock_irq(&hugetlb_lock); 3363 if (!ret) 3364 goto out; 3365 3366 /* Bail for signals. Probably ctrl-c from user */ 3367 if (signal_pending(current)) 3368 goto out; 3369 } 3370 3371 /* 3372 * Decrease the pool size 3373 * First return free pages to the buddy allocator (being careful 3374 * to keep enough around to satisfy reservations). Then place 3375 * pages into surplus state as needed so the pool will shrink 3376 * to the desired size as pages become free. 3377 * 3378 * By placing pages into the surplus state independent of the 3379 * overcommit value, we are allowing the surplus pool size to 3380 * exceed overcommit. There are few sane options here. Since 3381 * alloc_surplus_huge_page() is checking the global counter, 3382 * though, we'll note that we're not allowed to exceed surplus 3383 * and won't grow the pool anywhere else. Not until one of the 3384 * sysctls are changed, or the surplus pages go out of use. 3385 */ 3386 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 3387 min_count = max(count, min_count); 3388 try_to_free_low(h, min_count, nodes_allowed); 3389 3390 /* 3391 * Collect pages to be removed on list without dropping lock 3392 */ 3393 while (min_count < persistent_huge_pages(h)) { 3394 page = remove_pool_huge_page(h, nodes_allowed, 0); 3395 if (!page) 3396 break; 3397 3398 list_add(&page->lru, &page_list); 3399 } 3400 /* free the pages after dropping lock */ 3401 spin_unlock_irq(&hugetlb_lock); 3402 update_and_free_pages_bulk(h, &page_list); 3403 flush_free_hpage_work(h); 3404 spin_lock_irq(&hugetlb_lock); 3405 3406 while (count < persistent_huge_pages(h)) { 3407 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 3408 break; 3409 } 3410 out: 3411 h->max_huge_pages = persistent_huge_pages(h); 3412 spin_unlock_irq(&hugetlb_lock); 3413 mutex_unlock(&h->resize_lock); 3414 3415 NODEMASK_FREE(node_alloc_noretry); 3416 3417 return 0; 3418 } 3419 3420 static int demote_free_huge_page(struct hstate *h, struct page *page) 3421 { 3422 int i, nid = page_to_nid(page); 3423 struct hstate *target_hstate; 3424 struct page *subpage; 3425 int rc = 0; 3426 3427 target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); 3428 3429 remove_hugetlb_page_for_demote(h, page, false); 3430 spin_unlock_irq(&hugetlb_lock); 3431 3432 rc = hugetlb_vmemmap_restore(h, page); 3433 if (rc) { 3434 /* Allocation of vmemmmap failed, we can not demote page */ 3435 spin_lock_irq(&hugetlb_lock); 3436 set_page_refcounted(page); 3437 add_hugetlb_page(h, page, false); 3438 return rc; 3439 } 3440 3441 /* 3442 * Use destroy_compound_hugetlb_page_for_demote for all huge page 3443 * sizes as it will not ref count pages. 3444 */ 3445 destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h)); 3446 3447 /* 3448 * Taking target hstate mutex synchronizes with set_max_huge_pages. 3449 * Without the mutex, pages added to target hstate could be marked 3450 * as surplus. 3451 * 3452 * Note that we already hold h->resize_lock. 
To prevent deadlock, 3453 * use the convention of always taking larger size hstate mutex first. 3454 */ 3455 mutex_lock(&target_hstate->resize_lock); 3456 for (i = 0; i < pages_per_huge_page(h); 3457 i += pages_per_huge_page(target_hstate)) { 3458 subpage = nth_page(page, i); 3459 if (hstate_is_gigantic(target_hstate)) 3460 prep_compound_gigantic_page_for_demote(subpage, 3461 target_hstate->order); 3462 else 3463 prep_compound_page(subpage, target_hstate->order); 3464 set_page_private(subpage, 0); 3465 set_page_refcounted(subpage); 3466 prep_new_huge_page(target_hstate, subpage, nid); 3467 put_page(subpage); 3468 } 3469 mutex_unlock(&target_hstate->resize_lock); 3470 3471 spin_lock_irq(&hugetlb_lock); 3472 3473 /* 3474 * Not absolutely necessary, but for consistency update max_huge_pages 3475 * based on pool changes for the demoted page. 3476 */ 3477 h->max_huge_pages--; 3478 target_hstate->max_huge_pages += 3479 pages_per_huge_page(h) / pages_per_huge_page(target_hstate); 3480 3481 return rc; 3482 } 3483 3484 static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 3485 __must_hold(&hugetlb_lock) 3486 { 3487 int nr_nodes, node; 3488 struct page *page; 3489 3490 lockdep_assert_held(&hugetlb_lock); 3491 3492 /* We should never get here if no demote order */ 3493 if (!h->demote_order) { 3494 pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); 3495 return -EINVAL; /* internal error */ 3496 } 3497 3498 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 3499 list_for_each_entry(page, &h->hugepage_freelists[node], lru) { 3500 if (PageHWPoison(page)) 3501 continue; 3502 3503 return demote_free_huge_page(h, page); 3504 } 3505 } 3506 3507 /* 3508 * Only way to get here is if all pages on free lists are poisoned. 3509 * Return -EBUSY so that caller will not retry. 
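 * (demote_store() treats any error, including this one, as final and
 * stops its demote loop.)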
3510 */ 3511 return -EBUSY; 3512 } 3513 3514 #define HSTATE_ATTR_RO(_name) \ 3515 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 3516 3517 #define HSTATE_ATTR_WO(_name) \ 3518 static struct kobj_attribute _name##_attr = __ATTR_WO(_name) 3519 3520 #define HSTATE_ATTR(_name) \ 3521 static struct kobj_attribute _name##_attr = __ATTR_RW(_name) 3522 3523 static struct kobject *hugepages_kobj; 3524 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3525 3526 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 3527 3528 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 3529 { 3530 int i; 3531 3532 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3533 if (hstate_kobjs[i] == kobj) { 3534 if (nidp) 3535 *nidp = NUMA_NO_NODE; 3536 return &hstates[i]; 3537 } 3538 3539 return kobj_to_node_hstate(kobj, nidp); 3540 } 3541 3542 static ssize_t nr_hugepages_show_common(struct kobject *kobj, 3543 struct kobj_attribute *attr, char *buf) 3544 { 3545 struct hstate *h; 3546 unsigned long nr_huge_pages; 3547 int nid; 3548 3549 h = kobj_to_hstate(kobj, &nid); 3550 if (nid == NUMA_NO_NODE) 3551 nr_huge_pages = h->nr_huge_pages; 3552 else 3553 nr_huge_pages = h->nr_huge_pages_node[nid]; 3554 3555 return sysfs_emit(buf, "%lu\n", nr_huge_pages); 3556 } 3557 3558 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 3559 struct hstate *h, int nid, 3560 unsigned long count, size_t len) 3561 { 3562 int err; 3563 nodemask_t nodes_allowed, *n_mask; 3564 3565 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 3566 return -EINVAL; 3567 3568 if (nid == NUMA_NO_NODE) { 3569 /* 3570 * global hstate attribute 3571 */ 3572 if (!(obey_mempolicy && 3573 init_nodemask_of_mempolicy(&nodes_allowed))) 3574 n_mask = &node_states[N_MEMORY]; 3575 else 3576 n_mask = &nodes_allowed; 3577 } else { 3578 /* 3579 * Node specific request. count adjustment happens in 3580 * set_max_huge_pages() after acquiring hugetlb_lock. 3581 */ 3582 init_nodemask_of_node(&nodes_allowed, nid); 3583 n_mask = &nodes_allowed; 3584 } 3585 3586 err = set_max_huge_pages(h, count, nid, n_mask); 3587 3588 return err ? err : len; 3589 } 3590 3591 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 3592 struct kobject *kobj, const char *buf, 3593 size_t len) 3594 { 3595 struct hstate *h; 3596 unsigned long count; 3597 int nid; 3598 int err; 3599 3600 err = kstrtoul(buf, 10, &count); 3601 if (err) 3602 return err; 3603 3604 h = kobj_to_hstate(kobj, &nid); 3605 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 3606 } 3607 3608 static ssize_t nr_hugepages_show(struct kobject *kobj, 3609 struct kobj_attribute *attr, char *buf) 3610 { 3611 return nr_hugepages_show_common(kobj, attr, buf); 3612 } 3613 3614 static ssize_t nr_hugepages_store(struct kobject *kobj, 3615 struct kobj_attribute *attr, const char *buf, size_t len) 3616 { 3617 return nr_hugepages_store_common(false, kobj, buf, len); 3618 } 3619 HSTATE_ATTR(nr_hugepages); 3620 3621 #ifdef CONFIG_NUMA 3622 3623 /* 3624 * hstate attribute for optionally mempolicy-based constraint on persistent 3625 * huge page alloc/free. 
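 * For example (illustrative command), "numactl -m 0,1 sh -c 'echo 16 >
 * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages_mempolicy'"
 * grows the 2 MB pool using only nodes 0 and 1; the directory name
 * depends on the huge page size.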
3626 */ 3627 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 3628 struct kobj_attribute *attr, 3629 char *buf) 3630 { 3631 return nr_hugepages_show_common(kobj, attr, buf); 3632 } 3633 3634 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 3635 struct kobj_attribute *attr, const char *buf, size_t len) 3636 { 3637 return nr_hugepages_store_common(true, kobj, buf, len); 3638 } 3639 HSTATE_ATTR(nr_hugepages_mempolicy); 3640 #endif 3641 3642 3643 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 3644 struct kobj_attribute *attr, char *buf) 3645 { 3646 struct hstate *h = kobj_to_hstate(kobj, NULL); 3647 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); 3648 } 3649 3650 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 3651 struct kobj_attribute *attr, const char *buf, size_t count) 3652 { 3653 int err; 3654 unsigned long input; 3655 struct hstate *h = kobj_to_hstate(kobj, NULL); 3656 3657 if (hstate_is_gigantic(h)) 3658 return -EINVAL; 3659 3660 err = kstrtoul(buf, 10, &input); 3661 if (err) 3662 return err; 3663 3664 spin_lock_irq(&hugetlb_lock); 3665 h->nr_overcommit_huge_pages = input; 3666 spin_unlock_irq(&hugetlb_lock); 3667 3668 return count; 3669 } 3670 HSTATE_ATTR(nr_overcommit_hugepages); 3671 3672 static ssize_t free_hugepages_show(struct kobject *kobj, 3673 struct kobj_attribute *attr, char *buf) 3674 { 3675 struct hstate *h; 3676 unsigned long free_huge_pages; 3677 int nid; 3678 3679 h = kobj_to_hstate(kobj, &nid); 3680 if (nid == NUMA_NO_NODE) 3681 free_huge_pages = h->free_huge_pages; 3682 else 3683 free_huge_pages = h->free_huge_pages_node[nid]; 3684 3685 return sysfs_emit(buf, "%lu\n", free_huge_pages); 3686 } 3687 HSTATE_ATTR_RO(free_hugepages); 3688 3689 static ssize_t resv_hugepages_show(struct kobject *kobj, 3690 struct kobj_attribute *attr, char *buf) 3691 { 3692 struct hstate *h = kobj_to_hstate(kobj, NULL); 3693 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); 3694 } 3695 HSTATE_ATTR_RO(resv_hugepages); 3696 3697 static ssize_t surplus_hugepages_show(struct kobject *kobj, 3698 struct kobj_attribute *attr, char *buf) 3699 { 3700 struct hstate *h; 3701 unsigned long surplus_huge_pages; 3702 int nid; 3703 3704 h = kobj_to_hstate(kobj, &nid); 3705 if (nid == NUMA_NO_NODE) 3706 surplus_huge_pages = h->surplus_huge_pages; 3707 else 3708 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 3709 3710 return sysfs_emit(buf, "%lu\n", surplus_huge_pages); 3711 } 3712 HSTATE_ATTR_RO(surplus_hugepages); 3713 3714 static ssize_t demote_store(struct kobject *kobj, 3715 struct kobj_attribute *attr, const char *buf, size_t len) 3716 { 3717 unsigned long nr_demote; 3718 unsigned long nr_available; 3719 nodemask_t nodes_allowed, *n_mask; 3720 struct hstate *h; 3721 int err = 0; 3722 int nid; 3723 3724 err = kstrtoul(buf, 10, &nr_demote); 3725 if (err) 3726 return err; 3727 h = kobj_to_hstate(kobj, &nid); 3728 3729 if (nid != NUMA_NO_NODE) { 3730 init_nodemask_of_node(&nodes_allowed, nid); 3731 n_mask = &nodes_allowed; 3732 } else { 3733 n_mask = &node_states[N_MEMORY]; 3734 } 3735 3736 /* Synchronize with other sysfs operations modifying huge pages */ 3737 mutex_lock(&h->resize_lock); 3738 spin_lock_irq(&hugetlb_lock); 3739 3740 while (nr_demote) { 3741 /* 3742 * Check for available pages to demote each time thorough the 3743 * loop as demote_pool_huge_page will drop hugetlb_lock. 
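 * nr_demote counts pages of this (source) hstate; each successful
 * demote_pool_huge_page() call converts one of them into
 * pages_per_huge_page(h) / pages_per_huge_page(target) smaller pages.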
3744 */ 3745 if (nid != NUMA_NO_NODE) 3746 nr_available = h->free_huge_pages_node[nid]; 3747 else 3748 nr_available = h->free_huge_pages; 3749 nr_available -= h->resv_huge_pages; 3750 if (!nr_available) 3751 break; 3752 3753 err = demote_pool_huge_page(h, n_mask); 3754 if (err) 3755 break; 3756 3757 nr_demote--; 3758 } 3759 3760 spin_unlock_irq(&hugetlb_lock); 3761 mutex_unlock(&h->resize_lock); 3762 3763 if (err) 3764 return err; 3765 return len; 3766 } 3767 HSTATE_ATTR_WO(demote); 3768 3769 static ssize_t demote_size_show(struct kobject *kobj, 3770 struct kobj_attribute *attr, char *buf) 3771 { 3772 struct hstate *h = kobj_to_hstate(kobj, NULL); 3773 unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; 3774 3775 return sysfs_emit(buf, "%lukB\n", demote_size); 3776 } 3777 3778 static ssize_t demote_size_store(struct kobject *kobj, 3779 struct kobj_attribute *attr, 3780 const char *buf, size_t count) 3781 { 3782 struct hstate *h, *demote_hstate; 3783 unsigned long demote_size; 3784 unsigned int demote_order; 3785 3786 demote_size = (unsigned long)memparse(buf, NULL); 3787 3788 demote_hstate = size_to_hstate(demote_size); 3789 if (!demote_hstate) 3790 return -EINVAL; 3791 demote_order = demote_hstate->order; 3792 if (demote_order < HUGETLB_PAGE_ORDER) 3793 return -EINVAL; 3794 3795 /* demote order must be smaller than hstate order */ 3796 h = kobj_to_hstate(kobj, NULL); 3797 if (demote_order >= h->order) 3798 return -EINVAL; 3799 3800 /* resize_lock synchronizes access to demote size and writes */ 3801 mutex_lock(&h->resize_lock); 3802 h->demote_order = demote_order; 3803 mutex_unlock(&h->resize_lock); 3804 3805 return count; 3806 } 3807 HSTATE_ATTR(demote_size); 3808 3809 static struct attribute *hstate_attrs[] = { 3810 &nr_hugepages_attr.attr, 3811 &nr_overcommit_hugepages_attr.attr, 3812 &free_hugepages_attr.attr, 3813 &resv_hugepages_attr.attr, 3814 &surplus_hugepages_attr.attr, 3815 #ifdef CONFIG_NUMA 3816 &nr_hugepages_mempolicy_attr.attr, 3817 #endif 3818 NULL, 3819 }; 3820 3821 static const struct attribute_group hstate_attr_group = { 3822 .attrs = hstate_attrs, 3823 }; 3824 3825 static struct attribute *hstate_demote_attrs[] = { 3826 &demote_size_attr.attr, 3827 &demote_attr.attr, 3828 NULL, 3829 }; 3830 3831 static const struct attribute_group hstate_demote_attr_group = { 3832 .attrs = hstate_demote_attrs, 3833 }; 3834 3835 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 3836 struct kobject **hstate_kobjs, 3837 const struct attribute_group *hstate_attr_group) 3838 { 3839 int retval; 3840 int hi = hstate_index(h); 3841 3842 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 3843 if (!hstate_kobjs[hi]) 3844 return -ENOMEM; 3845 3846 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 3847 if (retval) { 3848 kobject_put(hstate_kobjs[hi]); 3849 hstate_kobjs[hi] = NULL; 3850 return retval; 3851 } 3852 3853 if (h->demote_order) { 3854 retval = sysfs_create_group(hstate_kobjs[hi], 3855 &hstate_demote_attr_group); 3856 if (retval) { 3857 pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); 3858 sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); 3859 kobject_put(hstate_kobjs[hi]); 3860 hstate_kobjs[hi] = NULL; 3861 return retval; 3862 } 3863 } 3864 3865 return 0; 3866 } 3867 3868 static void __init hugetlb_sysfs_init(void) 3869 { 3870 struct hstate *h; 3871 int err; 3872 3873 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 3874 if (!hugepages_kobj) 3875 return; 3876 3877 
for_each_hstate(h) { 3878 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 3879 hstate_kobjs, &hstate_attr_group); 3880 if (err) 3881 pr_err("HugeTLB: Unable to add hstate %s", h->name); 3882 } 3883 } 3884 3885 #ifdef CONFIG_NUMA 3886 3887 /* 3888 * node_hstate/s - associate per node hstate attributes, via their kobjects, 3889 * with node devices in node_devices[] using a parallel array. The array 3890 * index of a node device or _hstate == node id. 3891 * This is here to avoid any static dependency of the node device driver, in 3892 * the base kernel, on the hugetlb module. 3893 */ 3894 struct node_hstate { 3895 struct kobject *hugepages_kobj; 3896 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3897 }; 3898 static struct node_hstate node_hstates[MAX_NUMNODES]; 3899 3900 /* 3901 * A subset of global hstate attributes for node devices 3902 */ 3903 static struct attribute *per_node_hstate_attrs[] = { 3904 &nr_hugepages_attr.attr, 3905 &free_hugepages_attr.attr, 3906 &surplus_hugepages_attr.attr, 3907 NULL, 3908 }; 3909 3910 static const struct attribute_group per_node_hstate_attr_group = { 3911 .attrs = per_node_hstate_attrs, 3912 }; 3913 3914 /* 3915 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 3916 * Returns node id via non-NULL nidp. 3917 */ 3918 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3919 { 3920 int nid; 3921 3922 for (nid = 0; nid < nr_node_ids; nid++) { 3923 struct node_hstate *nhs = &node_hstates[nid]; 3924 int i; 3925 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3926 if (nhs->hstate_kobjs[i] == kobj) { 3927 if (nidp) 3928 *nidp = nid; 3929 return &hstates[i]; 3930 } 3931 } 3932 3933 BUG(); 3934 return NULL; 3935 } 3936 3937 /* 3938 * Unregister hstate attributes from a single node device. 3939 * No-op if no hstate attributes attached. 3940 */ 3941 static void hugetlb_unregister_node(struct node *node) 3942 { 3943 struct hstate *h; 3944 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3945 3946 if (!nhs->hugepages_kobj) 3947 return; /* no hstate attributes */ 3948 3949 for_each_hstate(h) { 3950 int idx = hstate_index(h); 3951 struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; 3952 3953 if (!hstate_kobj) 3954 continue; 3955 if (h->demote_order) 3956 sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); 3957 sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); 3958 kobject_put(hstate_kobj); 3959 nhs->hstate_kobjs[idx] = NULL; 3960 } 3961 3962 kobject_put(nhs->hugepages_kobj); 3963 nhs->hugepages_kobj = NULL; 3964 } 3965 3966 3967 /* 3968 * Register hstate attributes for a single node device. 3969 * No-op if attributes already registered. 3970 */ 3971 static void hugetlb_register_node(struct node *node) 3972 { 3973 struct hstate *h; 3974 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3975 int err; 3976 3977 if (nhs->hugepages_kobj) 3978 return; /* already allocated */ 3979 3980 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 3981 &node->dev.kobj); 3982 if (!nhs->hugepages_kobj) 3983 return; 3984 3985 for_each_hstate(h) { 3986 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 3987 nhs->hstate_kobjs, 3988 &per_node_hstate_attr_group); 3989 if (err) { 3990 pr_err("HugeTLB: Unable to add hstate %s for node %d\n", 3991 h->name, node->dev.id); 3992 hugetlb_unregister_node(node); 3993 break; 3994 } 3995 } 3996 } 3997 3998 /* 3999 * hugetlb init time: register hstate attributes for all registered node 4000 * devices of nodes that have memory. 
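* (For example, the per-node attributes then appear under
 * /sys/devices/system/node/nodeN/hugepages/hugepages-<size>kB/, mirroring the
 * global /sys/kernel/mm/hugepages/ entries but limited to the subset listed in
 * per_node_hstate_attr_group above.)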
All on-line nodes should have 4001 * registered their associated device by this time. 4002 */ 4003 static void __init hugetlb_register_all_nodes(void) 4004 { 4005 int nid; 4006 4007 for_each_node_state(nid, N_MEMORY) { 4008 struct node *node = node_devices[nid]; 4009 if (node->dev.id == nid) 4010 hugetlb_register_node(node); 4011 } 4012 4013 /* 4014 * Let the node device driver know we're here so it can 4015 * [un]register hstate attributes on node hotplug. 4016 */ 4017 register_hugetlbfs_with_node(hugetlb_register_node, 4018 hugetlb_unregister_node); 4019 } 4020 #else /* !CONFIG_NUMA */ 4021 4022 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 4023 { 4024 BUG(); 4025 if (nidp) 4026 *nidp = -1; 4027 return NULL; 4028 } 4029 4030 static void hugetlb_register_all_nodes(void) { } 4031 4032 #endif 4033 4034 #ifdef CONFIG_CMA 4035 static void __init hugetlb_cma_check(void); 4036 #else 4037 static inline __init void hugetlb_cma_check(void) 4038 { 4039 } 4040 #endif 4041 4042 static int __init hugetlb_init(void) 4043 { 4044 int i; 4045 4046 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < 4047 __NR_HPAGEFLAGS); 4048 4049 if (!hugepages_supported()) { 4050 if (hugetlb_max_hstate || default_hstate_max_huge_pages) 4051 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); 4052 return 0; 4053 } 4054 4055 /* 4056 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some 4057 * architectures depend on setup being done here. 4058 */ 4059 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 4060 if (!parsed_default_hugepagesz) { 4061 /* 4062 * If we did not parse a default huge page size, set 4063 * default_hstate_idx to HPAGE_SIZE hstate. And, if the 4064 * number of huge pages for this default size was implicitly 4065 * specified, set that here as well. 4066 * Note that the implicit setting will overwrite an explicit 4067 * setting. A warning will be printed in this case. 
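* For example (illustrative): on a system whose HPAGE_SIZE is 2 MB, the command
 * line "hugepages=512 hugepagesz=2M hugepages=128" ends up applying the
 * implicit 512 to the 2 MB default hstate, and the two pr_warn() calls below
 * report that the explicit 128 was ignored.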
4068 */ 4069 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); 4070 if (default_hstate_max_huge_pages) { 4071 if (default_hstate.max_huge_pages) { 4072 char buf[32]; 4073 4074 string_get_size(huge_page_size(&default_hstate), 4075 1, STRING_UNITS_2, buf, 32); 4076 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", 4077 default_hstate.max_huge_pages, buf); 4078 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", 4079 default_hstate_max_huge_pages); 4080 } 4081 default_hstate.max_huge_pages = 4082 default_hstate_max_huge_pages; 4083 4084 for_each_online_node(i) 4085 default_hstate.max_huge_pages_node[i] = 4086 default_hugepages_in_node[i]; 4087 } 4088 } 4089 4090 hugetlb_cma_check(); 4091 hugetlb_init_hstates(); 4092 gather_bootmem_prealloc(); 4093 report_hugepages(); 4094 4095 hugetlb_sysfs_init(); 4096 hugetlb_register_all_nodes(); 4097 hugetlb_cgroup_file_init(); 4098 4099 #ifdef CONFIG_SMP 4100 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 4101 #else 4102 num_fault_mutexes = 1; 4103 #endif 4104 hugetlb_fault_mutex_table = 4105 kmalloc_array(num_fault_mutexes, sizeof(struct mutex), 4106 GFP_KERNEL); 4107 BUG_ON(!hugetlb_fault_mutex_table); 4108 4109 for (i = 0; i < num_fault_mutexes; i++) 4110 mutex_init(&hugetlb_fault_mutex_table[i]); 4111 return 0; 4112 } 4113 subsys_initcall(hugetlb_init); 4114 4115 /* Overwritten by architectures with more huge page sizes */ 4116 bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) 4117 { 4118 return size == HPAGE_SIZE; 4119 } 4120 4121 void __init hugetlb_add_hstate(unsigned int order) 4122 { 4123 struct hstate *h; 4124 unsigned long i; 4125 4126 if (size_to_hstate(PAGE_SIZE << order)) { 4127 return; 4128 } 4129 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 4130 BUG_ON(order == 0); 4131 h = &hstates[hugetlb_max_hstate++]; 4132 mutex_init(&h->resize_lock); 4133 h->order = order; 4134 h->mask = ~(huge_page_size(h) - 1); 4135 for (i = 0; i < MAX_NUMNODES; ++i) 4136 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 4137 INIT_LIST_HEAD(&h->hugepage_activelist); 4138 h->next_nid_to_alloc = first_memory_node; 4139 h->next_nid_to_free = first_memory_node; 4140 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 4141 huge_page_size(h)/SZ_1K); 4142 4143 parsed_hstate = h; 4144 } 4145 4146 bool __init __weak hugetlb_node_alloc_supported(void) 4147 { 4148 return true; 4149 } 4150 4151 static void __init hugepages_clear_pages_in_node(void) 4152 { 4153 if (!hugetlb_max_hstate) { 4154 default_hstate_max_huge_pages = 0; 4155 memset(default_hugepages_in_node, 0, 4156 sizeof(default_hugepages_in_node)); 4157 } else { 4158 parsed_hstate->max_huge_pages = 0; 4159 memset(parsed_hstate->max_huge_pages_node, 0, 4160 sizeof(parsed_hstate->max_huge_pages_node)); 4161 } 4162 } 4163 4164 /* 4165 * hugepages command line processing 4166 * hugepages normally follows a valid hugepagesz or default_hugepagesz 4167 * specification. If not, ignore the hugepages value. hugepages can also 4168 * be the first huge page command line option in which case it implicitly 4169 * specifies the number of huge pages for the default size.
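* For example (illustrative): "hugepagesz=2M hugepages=0:2,1:4" requests two
 * 2 MB pages on node 0 and four on node 1 using the node format parsed below,
 * while a plain "hugepages=6" leaves the node placement to the allocator.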
4170 */ 4171 static int __init hugepages_setup(char *s) 4172 { 4173 unsigned long *mhp; 4174 static unsigned long *last_mhp; 4175 int node = NUMA_NO_NODE; 4176 int count; 4177 unsigned long tmp; 4178 char *p = s; 4179 4180 if (!parsed_valid_hugepagesz) { 4181 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); 4182 parsed_valid_hugepagesz = true; 4183 return 1; 4184 } 4185 4186 /* 4187 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter 4188 * yet, so this hugepages= parameter goes to the "default hstate". 4189 * Otherwise, it goes with the previously parsed hugepagesz or 4190 * default_hugepagesz. 4191 */ 4192 else if (!hugetlb_max_hstate) 4193 mhp = &default_hstate_max_huge_pages; 4194 else 4195 mhp = &parsed_hstate->max_huge_pages; 4196 4197 if (mhp == last_mhp) { 4198 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); 4199 return 1; 4200 } 4201 4202 while (*p) { 4203 count = 0; 4204 if (sscanf(p, "%lu%n", &tmp, &count) != 1) 4205 goto invalid; 4206 /* Parameter is node format */ 4207 if (p[count] == ':') { 4208 if (!hugetlb_node_alloc_supported()) { 4209 pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); 4210 return 1; 4211 } 4212 if (tmp >= MAX_NUMNODES || !node_online(tmp)) 4213 goto invalid; 4214 node = array_index_nospec(tmp, MAX_NUMNODES); 4215 p += count + 1; 4216 /* Parse hugepages */ 4217 if (sscanf(p, "%lu%n", &tmp, &count) != 1) 4218 goto invalid; 4219 if (!hugetlb_max_hstate) 4220 default_hugepages_in_node[node] = tmp; 4221 else 4222 parsed_hstate->max_huge_pages_node[node] = tmp; 4223 *mhp += tmp; 4224 /* Go to parse next node*/ 4225 if (p[count] == ',') 4226 p += count + 1; 4227 else 4228 break; 4229 } else { 4230 if (p != s) 4231 goto invalid; 4232 *mhp = tmp; 4233 break; 4234 } 4235 } 4236 4237 /* 4238 * Global state is always initialized later in hugetlb_init. 4239 * But we need to allocate gigantic hstates here early to still 4240 * use the bootmem allocator. 4241 */ 4242 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) 4243 hugetlb_hstate_alloc_pages(parsed_hstate); 4244 4245 last_mhp = mhp; 4246 4247 return 1; 4248 4249 invalid: 4250 pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); 4251 hugepages_clear_pages_in_node(); 4252 return 1; 4253 } 4254 __setup("hugepages=", hugepages_setup); 4255 4256 /* 4257 * hugepagesz command line processing 4258 * A specific huge page size can only be specified once with hugepagesz. 4259 * hugepagesz is followed by hugepages on the command line. The global 4260 * variable 'parsed_valid_hugepagesz' is used to determine if prior 4261 * hugepagesz argument was valid. 4262 */ 4263 static int __init hugepagesz_setup(char *s) 4264 { 4265 unsigned long size; 4266 struct hstate *h; 4267 4268 parsed_valid_hugepagesz = false; 4269 size = (unsigned long)memparse(s, NULL); 4270 4271 if (!arch_hugetlb_valid_size(size)) { 4272 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); 4273 return 1; 4274 } 4275 4276 h = size_to_hstate(size); 4277 if (h) { 4278 /* 4279 * hstate for this size already exists. This is normally 4280 * an error, but is allowed if the existing hstate is the 4281 * default hstate. More specifically, it is only allowed if 4282 * the number of huge pages for the default hstate was not 4283 * previously specified. 
4284 */ 4285 if (!parsed_default_hugepagesz || h != &default_hstate || 4286 default_hstate.max_huge_pages) { 4287 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); 4288 return 1; 4289 } 4290 4291 /* 4292 * No need to call hugetlb_add_hstate() as hstate already 4293 * exists. But, do set parsed_hstate so that a following 4294 * hugepages= parameter will be applied to this hstate. 4295 */ 4296 parsed_hstate = h; 4297 parsed_valid_hugepagesz = true; 4298 return 1; 4299 } 4300 4301 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 4302 parsed_valid_hugepagesz = true; 4303 return 1; 4304 } 4305 __setup("hugepagesz=", hugepagesz_setup); 4306 4307 /* 4308 * default_hugepagesz command line input 4309 * Only one instance of default_hugepagesz allowed on command line. 4310 */ 4311 static int __init default_hugepagesz_setup(char *s) 4312 { 4313 unsigned long size; 4314 int i; 4315 4316 parsed_valid_hugepagesz = false; 4317 if (parsed_default_hugepagesz) { 4318 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); 4319 return 1; 4320 } 4321 4322 size = (unsigned long)memparse(s, NULL); 4323 4324 if (!arch_hugetlb_valid_size(size)) { 4325 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); 4326 return 1; 4327 } 4328 4329 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 4330 parsed_valid_hugepagesz = true; 4331 parsed_default_hugepagesz = true; 4332 default_hstate_idx = hstate_index(size_to_hstate(size)); 4333 4334 /* 4335 * The number of default huge pages (for this size) could have been 4336 * specified as the first hugetlb parameter: hugepages=X. If so, 4337 * then default_hstate_max_huge_pages is set. If the default huge 4338 * page size is gigantic (>= MAX_ORDER), then the pages must be 4339 * allocated here from bootmem allocator. 
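* For example (illustrative): "hugepages=2 default_hugepagesz=1G" is a valid
 * ordering on an architecture that supports 1 GB huge pages; the 2 is
 * remembered in default_hstate_max_huge_pages and, because a 1 GB hstate is
 * gigantic, the two pages are allocated from bootmem right here rather than
 * later in hugetlb_init().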
4340 */ 4341 if (default_hstate_max_huge_pages) { 4342 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 4343 for_each_online_node(i) 4344 default_hstate.max_huge_pages_node[i] = 4345 default_hugepages_in_node[i]; 4346 if (hstate_is_gigantic(&default_hstate)) 4347 hugetlb_hstate_alloc_pages(&default_hstate); 4348 default_hstate_max_huge_pages = 0; 4349 } 4350 4351 return 1; 4352 } 4353 __setup("default_hugepagesz=", default_hugepagesz_setup); 4354 4355 static nodemask_t *policy_mbind_nodemask(gfp_t gfp) 4356 { 4357 #ifdef CONFIG_NUMA 4358 struct mempolicy *mpol = get_task_policy(current); 4359 4360 /* 4361 * Only enforce MPOL_BIND policy which overlaps with cpuset policy 4362 * (from policy_nodemask) specifically for hugetlb case 4363 */ 4364 if (mpol->mode == MPOL_BIND && 4365 (apply_policy_zone(mpol, gfp_zone(gfp)) && 4366 cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) 4367 return &mpol->nodes; 4368 #endif 4369 return NULL; 4370 } 4371 4372 static unsigned int allowed_mems_nr(struct hstate *h) 4373 { 4374 int node; 4375 unsigned int nr = 0; 4376 nodemask_t *mbind_nodemask; 4377 unsigned int *array = h->free_huge_pages_node; 4378 gfp_t gfp_mask = htlb_alloc_mask(h); 4379 4380 mbind_nodemask = policy_mbind_nodemask(gfp_mask); 4381 for_each_node_mask(node, cpuset_current_mems_allowed) { 4382 if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) 4383 nr += array[node]; 4384 } 4385 4386 return nr; 4387 } 4388 4389 #ifdef CONFIG_SYSCTL 4390 static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, 4391 void *buffer, size_t *length, 4392 loff_t *ppos, unsigned long *out) 4393 { 4394 struct ctl_table dup_table; 4395 4396 /* 4397 * In order to avoid races with __do_proc_doulongvec_minmax(), we 4398 * can duplicate the @table and alter the duplicate of it. 
4399 */ 4400 dup_table = *table; 4401 dup_table.data = out; 4402 4403 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); 4404 } 4405 4406 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 4407 struct ctl_table *table, int write, 4408 void *buffer, size_t *length, loff_t *ppos) 4409 { 4410 struct hstate *h = &default_hstate; 4411 unsigned long tmp = h->max_huge_pages; 4412 int ret; 4413 4414 if (!hugepages_supported()) 4415 return -EOPNOTSUPP; 4416 4417 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 4418 &tmp); 4419 if (ret) 4420 goto out; 4421 4422 if (write) 4423 ret = __nr_hugepages_store_common(obey_mempolicy, h, 4424 NUMA_NO_NODE, tmp, *length); 4425 out: 4426 return ret; 4427 } 4428 4429 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 4430 void *buffer, size_t *length, loff_t *ppos) 4431 { 4432 4433 return hugetlb_sysctl_handler_common(false, table, write, 4434 buffer, length, ppos); 4435 } 4436 4437 #ifdef CONFIG_NUMA 4438 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 4439 void *buffer, size_t *length, loff_t *ppos) 4440 { 4441 return hugetlb_sysctl_handler_common(true, table, write, 4442 buffer, length, ppos); 4443 } 4444 #endif /* CONFIG_NUMA */ 4445 4446 int hugetlb_overcommit_handler(struct ctl_table *table, int write, 4447 void *buffer, size_t *length, loff_t *ppos) 4448 { 4449 struct hstate *h = &default_hstate; 4450 unsigned long tmp; 4451 int ret; 4452 4453 if (!hugepages_supported()) 4454 return -EOPNOTSUPP; 4455 4456 tmp = h->nr_overcommit_huge_pages; 4457 4458 if (write && hstate_is_gigantic(h)) 4459 return -EINVAL; 4460 4461 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 4462 &tmp); 4463 if (ret) 4464 goto out; 4465 4466 if (write) { 4467 spin_lock_irq(&hugetlb_lock); 4468 h->nr_overcommit_huge_pages = tmp; 4469 spin_unlock_irq(&hugetlb_lock); 4470 } 4471 out: 4472 return ret; 4473 } 4474 4475 #endif /* CONFIG_SYSCTL */ 4476 4477 void hugetlb_report_meminfo(struct seq_file *m) 4478 { 4479 struct hstate *h; 4480 unsigned long total = 0; 4481 4482 if (!hugepages_supported()) 4483 return; 4484 4485 for_each_hstate(h) { 4486 unsigned long count = h->nr_huge_pages; 4487 4488 total += huge_page_size(h) * count; 4489 4490 if (h == &default_hstate) 4491 seq_printf(m, 4492 "HugePages_Total: %5lu\n" 4493 "HugePages_Free: %5lu\n" 4494 "HugePages_Rsvd: %5lu\n" 4495 "HugePages_Surp: %5lu\n" 4496 "Hugepagesize: %8lu kB\n", 4497 count, 4498 h->free_huge_pages, 4499 h->resv_huge_pages, 4500 h->surplus_huge_pages, 4501 huge_page_size(h) / SZ_1K); 4502 } 4503 4504 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); 4505 } 4506 4507 int hugetlb_report_node_meminfo(char *buf, int len, int nid) 4508 { 4509 struct hstate *h = &default_hstate; 4510 4511 if (!hugepages_supported()) 4512 return 0; 4513 4514 return sysfs_emit_at(buf, len, 4515 "Node %d HugePages_Total: %5u\n" 4516 "Node %d HugePages_Free: %5u\n" 4517 "Node %d HugePages_Surp: %5u\n", 4518 nid, h->nr_huge_pages_node[nid], 4519 nid, h->free_huge_pages_node[nid], 4520 nid, h->surplus_huge_pages_node[nid]); 4521 } 4522 4523 void hugetlb_show_meminfo_node(int nid) 4524 { 4525 struct hstate *h; 4526 4527 if (!hugepages_supported()) 4528 return; 4529 4530 for_each_hstate(h) 4531 printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 4532 nid, 4533 h->nr_huge_pages_node[nid], 4534 h->free_huge_pages_node[nid], 4535 h->surplus_huge_pages_node[nid], 4536 huge_page_size(h) / 
SZ_1K); 4537 } 4538 4539 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) 4540 { 4541 seq_printf(m, "HugetlbPages:\t%8lu kB\n", 4542 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); 4543 } 4544 4545 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ 4546 unsigned long hugetlb_total_pages(void) 4547 { 4548 struct hstate *h; 4549 unsigned long nr_total_pages = 0; 4550 4551 for_each_hstate(h) 4552 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 4553 return nr_total_pages; 4554 } 4555 4556 static int hugetlb_acct_memory(struct hstate *h, long delta) 4557 { 4558 int ret = -ENOMEM; 4559 4560 if (!delta) 4561 return 0; 4562 4563 spin_lock_irq(&hugetlb_lock); 4564 /* 4565 * When cpuset is configured, it breaks the strict hugetlb page 4566 * reservation as the accounting is done on a global variable. Such 4567 * reservation is completely rubbish in the presence of cpuset because 4568 * the reservation is not checked against page availability for the 4569 * current cpuset. Applications can still potentially be OOM-killed by the kernel 4570 * for lack of free hugetlb pages in the cpuset that the task is in. 4571 * Attempting to enforce strict accounting with cpuset is almost 4572 * impossible (or too ugly) because cpusets are so fluid that 4573 * tasks or memory nodes can be dynamically moved between cpusets. 4574 * 4575 * The change of semantics for shared hugetlb mapping with cpuset is 4576 * undesirable. However, in order to preserve some of the semantics, 4577 * we fall back to checking against current free page availability as 4578 * a best-effort attempt, hopefully minimizing the impact of changing 4579 * the semantics that cpuset has. 4580 * 4581 * Apart from cpuset, we also have the memory policy mechanism, which 4582 * also determines from which node the kernel will allocate memory 4583 * in a NUMA system. So, similar to cpuset, we should also consider 4584 * the memory policy of the current task, as in the description 4585 * above. 4586 */ 4587 if (delta > 0) { 4588 if (gather_surplus_pages(h, delta) < 0) 4589 goto out; 4590 4591 if (delta > allowed_mems_nr(h)) { 4592 return_unused_surplus_pages(h, delta); 4593 goto out; 4594 } 4595 } 4596 4597 ret = 0; 4598 if (delta < 0) 4599 return_unused_surplus_pages(h, (unsigned long) -delta); 4600 4601 out: 4602 spin_unlock_irq(&hugetlb_lock); 4603 return ret; 4604 } 4605 4606 static void hugetlb_vm_op_open(struct vm_area_struct *vma) 4607 { 4608 struct resv_map *resv = vma_resv_map(vma); 4609 4610 /* 4611 * This new VMA should share its sibling's reservation map if present. 4612 * The VMA will only ever have a valid reservation map pointer where 4613 * it is being copied for another still existing VMA. As that VMA 4614 * has a reference to the reservation map it cannot disappear until 4615 * after this open call completes. It is therefore safe to take a 4616 * new reference here without additional locking.
4617 */ 4618 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 4619 resv_map_dup_hugetlb_cgroup_uncharge_info(resv); 4620 kref_get(&resv->refs); 4621 } 4622 } 4623 4624 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 4625 { 4626 struct hstate *h = hstate_vma(vma); 4627 struct resv_map *resv = vma_resv_map(vma); 4628 struct hugepage_subpool *spool = subpool_vma(vma); 4629 unsigned long reserve, start, end; 4630 long gbl_reserve; 4631 4632 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 4633 return; 4634 4635 start = vma_hugecache_offset(h, vma, vma->vm_start); 4636 end = vma_hugecache_offset(h, vma, vma->vm_end); 4637 4638 reserve = (end - start) - region_count(resv, start, end); 4639 hugetlb_cgroup_uncharge_counter(resv, start, end); 4640 if (reserve) { 4641 /* 4642 * Decrement reserve counts. The global reserve count may be 4643 * adjusted if the subpool has a minimum size. 4644 */ 4645 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 4646 hugetlb_acct_memory(h, -gbl_reserve); 4647 } 4648 4649 kref_put(&resv->refs, resv_map_release); 4650 } 4651 4652 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) 4653 { 4654 if (addr & ~(huge_page_mask(hstate_vma(vma)))) 4655 return -EINVAL; 4656 return 0; 4657 } 4658 4659 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) 4660 { 4661 return huge_page_size(hstate_vma(vma)); 4662 } 4663 4664 /* 4665 * We cannot handle pagefaults against hugetlb pages at all. They cause 4666 * handle_mm_fault() to try to instantiate regular-sized pages in the 4667 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get 4668 * this far. 4669 */ 4670 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) 4671 { 4672 BUG(); 4673 return 0; 4674 } 4675 4676 /* 4677 * When a new function is introduced to vm_operations_struct and added 4678 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 4679 * This is because, under the System V memory model, mappings created via 4680 * shmget/shmat with "huge page" specified are backed by hugetlbfs files, 4681 * and their original vm_ops are overwritten with shm_vm_ops.
4682 */ 4683 const struct vm_operations_struct hugetlb_vm_ops = { 4684 .fault = hugetlb_vm_op_fault, 4685 .open = hugetlb_vm_op_open, 4686 .close = hugetlb_vm_op_close, 4687 .may_split = hugetlb_vm_op_split, 4688 .pagesize = hugetlb_vm_op_pagesize, 4689 }; 4690 4691 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 4692 int writable) 4693 { 4694 pte_t entry; 4695 unsigned int shift = huge_page_shift(hstate_vma(vma)); 4696 4697 if (writable) { 4698 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 4699 vma->vm_page_prot))); 4700 } else { 4701 entry = huge_pte_wrprotect(mk_huge_pte(page, 4702 vma->vm_page_prot)); 4703 } 4704 entry = pte_mkyoung(entry); 4705 entry = arch_make_huge_pte(entry, shift, vma->vm_flags); 4706 4707 return entry; 4708 } 4709 4710 static void set_huge_ptep_writable(struct vm_area_struct *vma, 4711 unsigned long address, pte_t *ptep) 4712 { 4713 pte_t entry; 4714 4715 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 4716 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 4717 update_mmu_cache(vma, address, ptep); 4718 } 4719 4720 bool is_hugetlb_entry_migration(pte_t pte) 4721 { 4722 swp_entry_t swp; 4723 4724 if (huge_pte_none(pte) || pte_present(pte)) 4725 return false; 4726 swp = pte_to_swp_entry(pte); 4727 if (is_migration_entry(swp)) 4728 return true; 4729 else 4730 return false; 4731 } 4732 4733 static bool is_hugetlb_entry_hwpoisoned(pte_t pte) 4734 { 4735 swp_entry_t swp; 4736 4737 if (huge_pte_none(pte) || pte_present(pte)) 4738 return false; 4739 swp = pte_to_swp_entry(pte); 4740 if (is_hwpoison_entry(swp)) 4741 return true; 4742 else 4743 return false; 4744 } 4745 4746 static void 4747 hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, 4748 struct page *new_page) 4749 { 4750 __SetPageUptodate(new_page); 4751 hugepage_add_new_anon_rmap(new_page, vma, addr); 4752 set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); 4753 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); 4754 ClearHPageRestoreReserve(new_page); 4755 SetHPageMigratable(new_page); 4756 } 4757 4758 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 4759 struct vm_area_struct *dst_vma, 4760 struct vm_area_struct *src_vma) 4761 { 4762 pte_t *src_pte, *dst_pte, entry; 4763 struct page *ptepage; 4764 unsigned long addr; 4765 bool cow = is_cow_mapping(src_vma->vm_flags); 4766 struct hstate *h = hstate_vma(src_vma); 4767 unsigned long sz = huge_page_size(h); 4768 unsigned long npages = pages_per_huge_page(h); 4769 struct address_space *mapping = src_vma->vm_file->f_mapping; 4770 struct mmu_notifier_range range; 4771 unsigned long last_addr_mask; 4772 int ret = 0; 4773 4774 if (cow) { 4775 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src, 4776 src_vma->vm_start, 4777 src_vma->vm_end); 4778 mmu_notifier_invalidate_range_start(&range); 4779 mmap_assert_write_locked(src); 4780 raw_write_seqcount_begin(&src->write_protect_seq); 4781 } else { 4782 /* 4783 * For shared mappings i_mmap_rwsem must be held to call 4784 * huge_pte_alloc, otherwise the returned ptep could go 4785 * away if part of a shared pmd and another thread calls 4786 * huge_pmd_unshare. 
4787 */ 4788 i_mmap_lock_read(mapping); 4789 } 4790 4791 last_addr_mask = hugetlb_mask_last_page(h); 4792 for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { 4793 spinlock_t *src_ptl, *dst_ptl; 4794 src_pte = huge_pte_offset(src, addr, sz); 4795 if (!src_pte) { 4796 addr |= last_addr_mask; 4797 continue; 4798 } 4799 dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); 4800 if (!dst_pte) { 4801 ret = -ENOMEM; 4802 break; 4803 } 4804 4805 /* 4806 * If the pagetables are shared don't copy or take references. 4807 * 4808 * dst_pte == src_pte is the common case of src/dest sharing. 4809 * However, src could have 'unshared' and dst shares with 4810 * another vma. So page_count of ptep page is checked instead 4811 * to reliably determine whether pte is shared. 4812 */ 4813 if (page_count(virt_to_page(dst_pte)) > 1) { 4814 addr |= last_addr_mask; 4815 continue; 4816 } 4817 4818 dst_ptl = huge_pte_lock(h, dst, dst_pte); 4819 src_ptl = huge_pte_lockptr(h, src, src_pte); 4820 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 4821 entry = huge_ptep_get(src_pte); 4822 again: 4823 if (huge_pte_none(entry)) { 4824 /* 4825 * Skip if src entry none. 4826 */ 4827 ; 4828 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { 4829 bool uffd_wp = huge_pte_uffd_wp(entry); 4830 4831 if (!userfaultfd_wp(dst_vma) && uffd_wp) 4832 entry = huge_pte_clear_uffd_wp(entry); 4833 set_huge_pte_at(dst, addr, dst_pte, entry); 4834 } else if (unlikely(is_hugetlb_entry_migration(entry))) { 4835 swp_entry_t swp_entry = pte_to_swp_entry(entry); 4836 bool uffd_wp = huge_pte_uffd_wp(entry); 4837 4838 if (!is_readable_migration_entry(swp_entry) && cow) { 4839 /* 4840 * COW mappings require pages in both 4841 * parent and child to be set to read. 4842 */ 4843 swp_entry = make_readable_migration_entry( 4844 swp_offset(swp_entry)); 4845 entry = swp_entry_to_pte(swp_entry); 4846 if (userfaultfd_wp(src_vma) && uffd_wp) 4847 entry = huge_pte_mkuffd_wp(entry); 4848 set_huge_pte_at(src, addr, src_pte, entry); 4849 } 4850 if (!userfaultfd_wp(dst_vma) && uffd_wp) 4851 entry = huge_pte_clear_uffd_wp(entry); 4852 set_huge_pte_at(dst, addr, dst_pte, entry); 4853 } else if (unlikely(is_pte_marker(entry))) { 4854 /* 4855 * We copy the pte marker only if the dst vma has 4856 * uffd-wp enabled. 4857 */ 4858 if (userfaultfd_wp(dst_vma)) 4859 set_huge_pte_at(dst, addr, dst_pte, entry); 4860 } else { 4861 entry = huge_ptep_get(src_pte); 4862 ptepage = pte_page(entry); 4863 get_page(ptepage); 4864 4865 /* 4866 * Failing to duplicate the anon rmap is a rare case 4867 * where we see pinned hugetlb pages while they're 4868 * prone to COW. We need to do the COW earlier during 4869 * fork. 4870 * 4871 * When pre-allocating the page or copying data, we 4872 * need to be without the pgtable locks since we could 4873 * sleep during the process. 
4874 */ 4875 if (!PageAnon(ptepage)) { 4876 page_dup_file_rmap(ptepage, true); 4877 } else if (page_try_dup_anon_rmap(ptepage, true, 4878 src_vma)) { 4879 pte_t src_pte_old = entry; 4880 struct page *new; 4881 4882 spin_unlock(src_ptl); 4883 spin_unlock(dst_ptl); 4884 /* Do not use reserve as it's private owned */ 4885 new = alloc_huge_page(dst_vma, addr, 1); 4886 if (IS_ERR(new)) { 4887 put_page(ptepage); 4888 ret = PTR_ERR(new); 4889 break; 4890 } 4891 copy_user_huge_page(new, ptepage, addr, dst_vma, 4892 npages); 4893 put_page(ptepage); 4894 4895 /* Install the new huge page if src pte stable */ 4896 dst_ptl = huge_pte_lock(h, dst, dst_pte); 4897 src_ptl = huge_pte_lockptr(h, src, src_pte); 4898 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 4899 entry = huge_ptep_get(src_pte); 4900 if (!pte_same(src_pte_old, entry)) { 4901 restore_reserve_on_error(h, dst_vma, addr, 4902 new); 4903 put_page(new); 4904 /* huge_ptep of dst_pte won't change as in child */ 4905 goto again; 4906 } 4907 hugetlb_install_page(dst_vma, dst_pte, addr, new); 4908 spin_unlock(src_ptl); 4909 spin_unlock(dst_ptl); 4910 continue; 4911 } 4912 4913 if (cow) { 4914 /* 4915 * No need to notify as we are downgrading page 4916 * table protection not changing it to point 4917 * to a new page. 4918 * 4919 * See Documentation/mm/mmu_notifier.rst 4920 */ 4921 huge_ptep_set_wrprotect(src, addr, src_pte); 4922 entry = huge_pte_wrprotect(entry); 4923 } 4924 4925 set_huge_pte_at(dst, addr, dst_pte, entry); 4926 hugetlb_count_add(npages, dst); 4927 } 4928 spin_unlock(src_ptl); 4929 spin_unlock(dst_ptl); 4930 } 4931 4932 if (cow) { 4933 raw_write_seqcount_end(&src->write_protect_seq); 4934 mmu_notifier_invalidate_range_end(&range); 4935 } else { 4936 i_mmap_unlock_read(mapping); 4937 } 4938 4939 return ret; 4940 } 4941 4942 static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, 4943 unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte) 4944 { 4945 struct hstate *h = hstate_vma(vma); 4946 struct mm_struct *mm = vma->vm_mm; 4947 spinlock_t *src_ptl, *dst_ptl; 4948 pte_t pte; 4949 4950 dst_ptl = huge_pte_lock(h, mm, dst_pte); 4951 src_ptl = huge_pte_lockptr(h, mm, src_pte); 4952 4953 /* 4954 * We don't have to worry about the ordering of src and dst ptlocks 4955 * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock. 4956 */ 4957 if (src_ptl != dst_ptl) 4958 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 4959 4960 pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); 4961 set_huge_pte_at(mm, new_addr, dst_pte, pte); 4962 4963 if (src_ptl != dst_ptl) 4964 spin_unlock(src_ptl); 4965 spin_unlock(dst_ptl); 4966 } 4967 4968 int move_hugetlb_page_tables(struct vm_area_struct *vma, 4969 struct vm_area_struct *new_vma, 4970 unsigned long old_addr, unsigned long new_addr, 4971 unsigned long len) 4972 { 4973 struct hstate *h = hstate_vma(vma); 4974 struct address_space *mapping = vma->vm_file->f_mapping; 4975 unsigned long sz = huge_page_size(h); 4976 struct mm_struct *mm = vma->vm_mm; 4977 unsigned long old_end = old_addr + len; 4978 unsigned long last_addr_mask; 4979 pte_t *src_pte, *dst_pte; 4980 struct mmu_notifier_range range; 4981 bool shared_pmd = false; 4982 4983 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, 4984 old_end); 4985 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 4986 /* 4987 * In case of shared PMDs, we should cover the maximum possible 4988 * range. 
4989 */ 4990 flush_cache_range(vma, range.start, range.end); 4991 4992 mmu_notifier_invalidate_range_start(&range); 4993 last_addr_mask = hugetlb_mask_last_page(h); 4994 /* Prevent race with file truncation */ 4995 i_mmap_lock_write(mapping); 4996 for (; old_addr < old_end; old_addr += sz, new_addr += sz) { 4997 src_pte = huge_pte_offset(mm, old_addr, sz); 4998 if (!src_pte) { 4999 old_addr |= last_addr_mask; 5000 new_addr |= last_addr_mask; 5001 continue; 5002 } 5003 if (huge_pte_none(huge_ptep_get(src_pte))) 5004 continue; 5005 5006 if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { 5007 shared_pmd = true; 5008 old_addr |= last_addr_mask; 5009 new_addr |= last_addr_mask; 5010 continue; 5011 } 5012 5013 dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); 5014 if (!dst_pte) 5015 break; 5016 5017 move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte); 5018 } 5019 5020 if (shared_pmd) 5021 flush_tlb_range(vma, range.start, range.end); 5022 else 5023 flush_tlb_range(vma, old_end - len, old_end); 5024 mmu_notifier_invalidate_range_end(&range); 5025 i_mmap_unlock_write(mapping); 5026 5027 return len + old_addr - old_end; 5028 } 5029 5030 static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 5031 unsigned long start, unsigned long end, 5032 struct page *ref_page, zap_flags_t zap_flags) 5033 { 5034 struct mm_struct *mm = vma->vm_mm; 5035 unsigned long address; 5036 pte_t *ptep; 5037 pte_t pte; 5038 spinlock_t *ptl; 5039 struct page *page; 5040 struct hstate *h = hstate_vma(vma); 5041 unsigned long sz = huge_page_size(h); 5042 struct mmu_notifier_range range; 5043 unsigned long last_addr_mask; 5044 bool force_flush = false; 5045 5046 WARN_ON(!is_vm_hugetlb_page(vma)); 5047 BUG_ON(start & ~huge_page_mask(h)); 5048 BUG_ON(end & ~huge_page_mask(h)); 5049 5050 /* 5051 * This is a hugetlb vma, all the pte entries should point 5052 * to huge page. 5053 */ 5054 tlb_change_page_size(tlb, sz); 5055 tlb_start_vma(tlb, vma); 5056 5057 /* 5058 * If sharing possible, alert mmu notifiers of worst case. 5059 */ 5060 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, 5061 end); 5062 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 5063 mmu_notifier_invalidate_range_start(&range); 5064 last_addr_mask = hugetlb_mask_last_page(h); 5065 address = start; 5066 for (; address < end; address += sz) { 5067 ptep = huge_pte_offset(mm, address, sz); 5068 if (!ptep) { 5069 address |= last_addr_mask; 5070 continue; 5071 } 5072 5073 ptl = huge_pte_lock(h, mm, ptep); 5074 if (huge_pmd_unshare(mm, vma, address, ptep)) { 5075 spin_unlock(ptl); 5076 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); 5077 force_flush = true; 5078 address |= last_addr_mask; 5079 continue; 5080 } 5081 5082 pte = huge_ptep_get(ptep); 5083 if (huge_pte_none(pte)) { 5084 spin_unlock(ptl); 5085 continue; 5086 } 5087 5088 /* 5089 * Migrating hugepage or HWPoisoned hugepage is already 5090 * unmapped and its refcount is dropped, so just clear pte here. 5091 */ 5092 if (unlikely(!pte_present(pte))) { 5093 /* 5094 * If the pte was wr-protected by uffd-wp in any of the 5095 * swap forms, meanwhile the caller does not want to 5096 * drop the uffd-wp bit in this zap, then replace the 5097 * pte with a marker. 
5098 */ 5099 if (pte_swp_uffd_wp_any(pte) && 5100 !(zap_flags & ZAP_FLAG_DROP_MARKER)) 5101 set_huge_pte_at(mm, address, ptep, 5102 make_pte_marker(PTE_MARKER_UFFD_WP)); 5103 else 5104 huge_pte_clear(mm, address, ptep, sz); 5105 spin_unlock(ptl); 5106 continue; 5107 } 5108 5109 page = pte_page(pte); 5110 /* 5111 * If a reference page is supplied, it is because a specific 5112 * page is being unmapped, not a range. Ensure the page we 5113 * are about to unmap is the actual page of interest. 5114 */ 5115 if (ref_page) { 5116 if (page != ref_page) { 5117 spin_unlock(ptl); 5118 continue; 5119 } 5120 /* 5121 * Mark the VMA as having unmapped its page so that 5122 * future faults in this VMA will fail rather than 5123 * looking like data was lost. 5124 */ 5125 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 5126 } 5127 5128 pte = huge_ptep_get_and_clear(mm, address, ptep); 5129 tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 5130 if (huge_pte_dirty(pte)) 5131 set_page_dirty(page); 5132 /* Leave a uffd-wp pte marker if needed */ 5133 if (huge_pte_uffd_wp(pte) && 5134 !(zap_flags & ZAP_FLAG_DROP_MARKER)) 5135 set_huge_pte_at(mm, address, ptep, 5136 make_pte_marker(PTE_MARKER_UFFD_WP)); 5137 hugetlb_count_sub(pages_per_huge_page(h), mm); 5138 page_remove_rmap(page, vma, true); 5139 5140 spin_unlock(ptl); 5141 tlb_remove_page_size(tlb, page, huge_page_size(h)); 5142 /* 5143 * Bail out after unmapping the reference page, if one was supplied. 5144 */ 5145 if (ref_page) 5146 break; 5147 } 5148 mmu_notifier_invalidate_range_end(&range); 5149 tlb_end_vma(tlb, vma); 5150 5151 /* 5152 * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We 5153 * could defer the flush until now, since by holding i_mmap_rwsem we 5154 * guaranteed that the last reference would not be dropped. But we must 5155 * do the flushing before we return, as otherwise i_mmap_rwsem will be 5156 * dropped and the last reference to the shared PMDs page might be 5157 * dropped as well. 5158 * 5159 * In theory we could defer the freeing of the PMD pages as well, but 5160 * huge_pmd_unshare() relies on the exact page_count for the PMD page to 5161 * detect sharing, so we cannot defer the release of the page either. 5162 * Instead, do the flush now. 5163 */ 5164 if (force_flush) 5165 tlb_flush_mmu_tlbonly(tlb); 5166 } 5167 5168 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 5169 struct vm_area_struct *vma, unsigned long start, 5170 unsigned long end, struct page *ref_page, 5171 zap_flags_t zap_flags) 5172 { 5173 __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); 5174 5175 /* 5176 * Clear this flag so that x86's huge_pmd_share page_table_shareable 5177 * test will fail on a vma being torn down, and not grab a page table 5178 * on its way out. We're lucky that the flag has such an appropriate 5179 * name, and can in fact be safely cleared here. We could clear it 5180 * before the __unmap_hugepage_range above, but all that's necessary 5181 * is to clear it before releasing the i_mmap_rwsem. This works 5182 * because in the context this is called, the VMA is about to be 5183 * destroyed and the i_mmap_rwsem is held.
5184 */ 5185 vma->vm_flags &= ~VM_MAYSHARE; 5186 } 5187 5188 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 5189 unsigned long end, struct page *ref_page, 5190 zap_flags_t zap_flags) 5191 { 5192 struct mmu_gather tlb; 5193 5194 tlb_gather_mmu(&tlb, vma->vm_mm); 5195 __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); 5196 tlb_finish_mmu(&tlb); 5197 } 5198 5199 /* 5200 * This is called when the original mapper is failing to COW a MAP_PRIVATE 5201 * mapping it owns the reserve page for. The intention is to unmap the page 5202 * from other VMAs and let the children be SIGKILLed if they are faulting the 5203 * same region. 5204 */ 5205 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 5206 struct page *page, unsigned long address) 5207 { 5208 struct hstate *h = hstate_vma(vma); 5209 struct vm_area_struct *iter_vma; 5210 struct address_space *mapping; 5211 pgoff_t pgoff; 5212 5213 /* 5214 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 5215 * from page cache lookup which is in HPAGE_SIZE units. 5216 */ 5217 address = address & huge_page_mask(h); 5218 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 5219 vma->vm_pgoff; 5220 mapping = vma->vm_file->f_mapping; 5221 5222 /* 5223 * Take the mapping lock for the duration of the table walk. As 5224 * this mapping should be shared between all the VMAs, 5225 * __unmap_hugepage_range() is called as the lock is already held 5226 */ 5227 i_mmap_lock_write(mapping); 5228 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 5229 /* Do not unmap the current VMA */ 5230 if (iter_vma == vma) 5231 continue; 5232 5233 /* 5234 * Shared VMAs have their own reserves and do not affect 5235 * MAP_PRIVATE accounting but it is possible that a shared 5236 * VMA is using the same page so check and skip such VMAs. 5237 */ 5238 if (iter_vma->vm_flags & VM_MAYSHARE) 5239 continue; 5240 5241 /* 5242 * Unmap the page from other VMAs without their own reserves. 5243 * They get marked to be SIGKILLed if they fault in these 5244 * areas. This is because a future no-page fault on this VMA 5245 * could insert a zeroed page instead of the data existing 5246 * from the time of fork. This would look like data corruption 5247 */ 5248 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 5249 unmap_hugepage_range(iter_vma, address, 5250 address + huge_page_size(h), page, 0); 5251 } 5252 i_mmap_unlock_write(mapping); 5253 } 5254 5255 /* 5256 * hugetlb_wp() should be called with page lock of the original hugepage held. 5257 * Called with hugetlb_fault_mutex_table held and pte_page locked so we 5258 * cannot race with other handlers or page migration. 5259 * Keep the pte_same checks anyway to make transition from the mutex easier. 5260 */ 5261 static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, 5262 unsigned long address, pte_t *ptep, unsigned int flags, 5263 struct page *pagecache_page, spinlock_t *ptl) 5264 { 5265 const bool unshare = flags & FAULT_FLAG_UNSHARE; 5266 pte_t pte; 5267 struct hstate *h = hstate_vma(vma); 5268 struct page *old_page, *new_page; 5269 int outside_reserve = 0; 5270 vm_fault_t ret = 0; 5271 unsigned long haddr = address & huge_page_mask(h); 5272 struct mmu_notifier_range range; 5273 5274 VM_BUG_ON(unshare && (flags & FOLL_WRITE)); 5275 VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); 5276 5277 /* 5278 * hugetlb does not support FOLL_FORCE-style write faults that keep the 5279 * PTE mapped R/O such as maybe_mkwrite() would do. 
5280 */ 5281 if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) 5282 return VM_FAULT_SIGSEGV; 5283 5284 /* Let's take out MAP_SHARED mappings first. */ 5285 if (vma->vm_flags & VM_MAYSHARE) { 5286 if (unlikely(unshare)) 5287 return 0; 5288 set_huge_ptep_writable(vma, haddr, ptep); 5289 return 0; 5290 } 5291 5292 pte = huge_ptep_get(ptep); 5293 old_page = pte_page(pte); 5294 5295 delayacct_wpcopy_start(); 5296 5297 retry_avoidcopy: 5298 /* 5299 * If no-one else is actually using this page, we're the exclusive 5300 * owner and can reuse this page. 5301 */ 5302 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 5303 if (!PageAnonExclusive(old_page)) 5304 page_move_anon_rmap(old_page, vma); 5305 if (likely(!unshare)) 5306 set_huge_ptep_writable(vma, haddr, ptep); 5307 5308 delayacct_wpcopy_end(); 5309 return 0; 5310 } 5311 VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), 5312 old_page); 5313 5314 /* 5315 * If the process that created a MAP_PRIVATE mapping is about to 5316 * perform a COW due to a shared page count, attempt to satisfy 5317 * the allocation without using the existing reserves. The pagecache 5318 * page is used to determine if the reserve at this address was 5319 * consumed or not. If reserves were used, a partial faulted mapping 5320 * at the time of fork() could consume its reserves on COW instead 5321 * of the full address range. 5322 */ 5323 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 5324 old_page != pagecache_page) 5325 outside_reserve = 1; 5326 5327 get_page(old_page); 5328 5329 /* 5330 * Drop page table lock as buddy allocator may be called. It will 5331 * be acquired again before returning to the caller, as expected. 5332 */ 5333 spin_unlock(ptl); 5334 new_page = alloc_huge_page(vma, haddr, outside_reserve); 5335 5336 if (IS_ERR(new_page)) { 5337 /* 5338 * If a process owning a MAP_PRIVATE mapping fails to COW, 5339 * it is due to references held by a child and an insufficient 5340 * huge page pool. To guarantee the original mappers 5341 * reliability, unmap the page from child processes. The child 5342 * may get SIGKILLed if it later faults. 5343 */ 5344 if (outside_reserve) { 5345 struct address_space *mapping = vma->vm_file->f_mapping; 5346 pgoff_t idx; 5347 u32 hash; 5348 5349 put_page(old_page); 5350 /* 5351 * Drop hugetlb_fault_mutex and i_mmap_rwsem before 5352 * unmapping. unmapping needs to hold i_mmap_rwsem 5353 * in write mode. Dropping i_mmap_rwsem in read mode 5354 * here is OK as COW mappings do not interact with 5355 * PMD sharing. 5356 * 5357 * Reacquire both after unmap operation. 5358 */ 5359 idx = vma_hugecache_offset(h, vma, haddr); 5360 hash = hugetlb_fault_mutex_hash(mapping, idx); 5361 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 5362 i_mmap_unlock_read(mapping); 5363 5364 unmap_ref_private(mm, vma, old_page, haddr); 5365 5366 i_mmap_lock_read(mapping); 5367 mutex_lock(&hugetlb_fault_mutex_table[hash]); 5368 spin_lock(ptl); 5369 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 5370 if (likely(ptep && 5371 pte_same(huge_ptep_get(ptep), pte))) 5372 goto retry_avoidcopy; 5373 /* 5374 * race occurs while re-acquiring page table 5375 * lock, and our job is done. 5376 */ 5377 delayacct_wpcopy_end(); 5378 return 0; 5379 } 5380 5381 ret = vmf_error(PTR_ERR(new_page)); 5382 goto out_release_old; 5383 } 5384 5385 /* 5386 * When the original hugepage is shared one, it does not have 5387 * anon_vma prepared. 
5388 */ 5389 if (unlikely(anon_vma_prepare(vma))) { 5390 ret = VM_FAULT_OOM; 5391 goto out_release_all; 5392 } 5393 5394 copy_user_huge_page(new_page, old_page, address, vma, 5395 pages_per_huge_page(h)); 5396 __SetPageUptodate(new_page); 5397 5398 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, 5399 haddr + huge_page_size(h)); 5400 mmu_notifier_invalidate_range_start(&range); 5401 5402 /* 5403 * Retake the page table lock to check for racing updates 5404 * before the page tables are altered 5405 */ 5406 spin_lock(ptl); 5407 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 5408 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 5409 ClearHPageRestoreReserve(new_page); 5410 5411 /* Break COW or unshare */ 5412 huge_ptep_clear_flush(vma, haddr, ptep); 5413 mmu_notifier_invalidate_range(mm, range.start, range.end); 5414 page_remove_rmap(old_page, vma, true); 5415 hugepage_add_new_anon_rmap(new_page, vma, haddr); 5416 set_huge_pte_at(mm, haddr, ptep, 5417 make_huge_pte(vma, new_page, !unshare)); 5418 SetHPageMigratable(new_page); 5419 /* Make the old page be freed below */ 5420 new_page = old_page; 5421 } 5422 spin_unlock(ptl); 5423 mmu_notifier_invalidate_range_end(&range); 5424 out_release_all: 5425 /* 5426 * No restore in case of successful pagetable update (Break COW or 5427 * unshare) 5428 */ 5429 if (new_page != old_page) 5430 restore_reserve_on_error(h, vma, haddr, new_page); 5431 put_page(new_page); 5432 out_release_old: 5433 put_page(old_page); 5434 5435 spin_lock(ptl); /* Caller expects lock to be held */ 5436 5437 delayacct_wpcopy_end(); 5438 return ret; 5439 } 5440 5441 /* 5442 * Return whether there is a pagecache page to back given address within VMA. 5443 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 5444 */ 5445 static bool hugetlbfs_pagecache_present(struct hstate *h, 5446 struct vm_area_struct *vma, unsigned long address) 5447 { 5448 struct address_space *mapping; 5449 pgoff_t idx; 5450 struct page *page; 5451 5452 mapping = vma->vm_file->f_mapping; 5453 idx = vma_hugecache_offset(h, vma, address); 5454 5455 page = find_get_page(mapping, idx); 5456 if (page) 5457 put_page(page); 5458 return page != NULL; 5459 } 5460 5461 int huge_add_to_page_cache(struct page *page, struct address_space *mapping, 5462 pgoff_t idx) 5463 { 5464 struct folio *folio = page_folio(page); 5465 struct inode *inode = mapping->host; 5466 struct hstate *h = hstate_inode(inode); 5467 int err; 5468 5469 __folio_set_locked(folio); 5470 err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); 5471 5472 if (unlikely(err)) { 5473 __folio_clear_locked(folio); 5474 return err; 5475 } 5476 ClearHPageRestoreReserve(page); 5477 5478 /* 5479 * mark folio dirty so that it will not be removed from cache/file 5480 * by non-hugetlbfs specific code paths. 
5481 */ 5482 folio_mark_dirty(folio); 5483 5484 spin_lock(&inode->i_lock); 5485 inode->i_blocks += blocks_per_huge_page(h); 5486 spin_unlock(&inode->i_lock); 5487 return 0; 5488 } 5489 5490 static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, 5491 struct address_space *mapping, 5492 pgoff_t idx, 5493 unsigned int flags, 5494 unsigned long haddr, 5495 unsigned long addr, 5496 unsigned long reason) 5497 { 5498 vm_fault_t ret; 5499 u32 hash; 5500 struct vm_fault vmf = { 5501 .vma = vma, 5502 .address = haddr, 5503 .real_address = addr, 5504 .flags = flags, 5505 5506 /* 5507 * Hard to debug if it ends up being 5508 * used by a callee that assumes 5509 * something about the other 5510 * uninitialized fields... same as in 5511 * memory.c 5512 */ 5513 }; 5514 5515 /* 5516 * hugetlb_fault_mutex and i_mmap_rwsem must be 5517 * dropped before handling userfault. Reacquire 5518 * after handling fault to make calling code simpler. 5519 */ 5520 hash = hugetlb_fault_mutex_hash(mapping, idx); 5521 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 5522 i_mmap_unlock_read(mapping); 5523 ret = handle_userfault(&vmf, reason); 5524 i_mmap_lock_read(mapping); 5525 mutex_lock(&hugetlb_fault_mutex_table[hash]); 5526 5527 return ret; 5528 } 5529 5530 static vm_fault_t hugetlb_no_page(struct mm_struct *mm, 5531 struct vm_area_struct *vma, 5532 struct address_space *mapping, pgoff_t idx, 5533 unsigned long address, pte_t *ptep, 5534 pte_t old_pte, unsigned int flags) 5535 { 5536 struct hstate *h = hstate_vma(vma); 5537 vm_fault_t ret = VM_FAULT_SIGBUS; 5538 int anon_rmap = 0; 5539 unsigned long size; 5540 struct page *page; 5541 pte_t new_pte; 5542 spinlock_t *ptl; 5543 unsigned long haddr = address & huge_page_mask(h); 5544 bool new_page, new_pagecache_page = false; 5545 5546 /* 5547 * Currently, we are forced to kill the process in the event the 5548 * original mapper has unmapped pages from the child due to a failed 5549 * COW/unsharing. Warn that such a situation has occurred as it may not 5550 * be obvious. 5551 */ 5552 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 5553 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 5554 current->pid); 5555 return ret; 5556 } 5557 5558 /* 5559 * We can not race with truncation due to holding i_mmap_rwsem. 5560 * i_size is modified when holding i_mmap_rwsem, so check here 5561 * once for faults beyond end of file. 5562 */ 5563 size = i_size_read(mapping->host) >> huge_page_shift(h); 5564 if (idx >= size) 5565 goto out; 5566 5567 new_page = false; 5568 page = find_lock_page(mapping, idx); 5569 if (!page) { 5570 /* Check for page in userfault range */ 5571 if (userfaultfd_missing(vma)) { 5572 ret = hugetlb_handle_userfault(vma, mapping, idx, 5573 flags, haddr, address, 5574 VM_UFFD_MISSING); 5575 goto out; 5576 } 5577 5578 page = alloc_huge_page(vma, haddr, 0); 5579 if (IS_ERR(page)) { 5580 /* 5581 * Returning error will result in faulting task being 5582 * sent SIGBUS. The hugetlb fault mutex prevents two 5583 * tasks from racing to fault in the same page which 5584 * could result in false unable to allocate errors. 5585 * Page migration does not take the fault mutex, but 5586 * does a clear then write of pte's under page table 5587 * lock. Page fault code could race with migration, 5588 * notice the clear pte and try to allocate a page 5589 * here. Before returning error, get ptl and make 5590 * sure there really is no pte entry. 
5591 */
5592 ptl = huge_pte_lock(h, mm, ptep);
5593 ret = 0;
5594 if (huge_pte_none(huge_ptep_get(ptep)))
5595 ret = vmf_error(PTR_ERR(page));
5596 spin_unlock(ptl);
5597 goto out;
5598 }
5599 clear_huge_page(page, address, pages_per_huge_page(h));
5600 __SetPageUptodate(page);
5601 new_page = true;
5602
5603 if (vma->vm_flags & VM_MAYSHARE) {
5604 int err = huge_add_to_page_cache(page, mapping, idx);
5605 if (err) {
5606 /*
5607 * err can't be -EEXIST which implies someone
5608 * else consumed the reservation since hugetlb
5609 * fault mutex is held when adding a hugetlb page
5610 * to the page cache. So it's safe to call
5611 * restore_reserve_on_error() here.
5612 */
5613 restore_reserve_on_error(h, vma, haddr, page);
5614 put_page(page);
5615 goto out;
5616 }
5617 new_pagecache_page = true;
5618 } else {
5619 lock_page(page);
5620 if (unlikely(anon_vma_prepare(vma))) {
5621 ret = VM_FAULT_OOM;
5622 goto backout_unlocked;
5623 }
5624 anon_rmap = 1;
5625 }
5626 } else {
5627 /*
5628 * If a memory error occurs between mmap() and fault, some processes
5629 * don't have a hwpoisoned swap entry for the errored virtual address.
5630 * So we need to block hugepage fault by PG_hwpoison bit check.
5631 */
5632 if (unlikely(PageHWPoison(page))) {
5633 ret = VM_FAULT_HWPOISON_LARGE |
5634 VM_FAULT_SET_HINDEX(hstate_index(h));
5635 goto backout_unlocked;
5636 }
5637
5638 /* Check for page in userfault range. */
5639 if (userfaultfd_minor(vma)) {
5640 unlock_page(page);
5641 put_page(page);
5642 ret = hugetlb_handle_userfault(vma, mapping, idx,
5643 flags, haddr, address,
5644 VM_UFFD_MINOR);
5645 goto out;
5646 }
5647 }
5648
5649 /*
5650 * If we are going to COW a private mapping later, we examine the
5651 * pending reservations for this page now. This will ensure that
5652 * any allocations necessary to record that reservation occur outside
5653 * the spinlock.
5654 */
5655 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5656 if (vma_needs_reservation(h, vma, haddr) < 0) {
5657 ret = VM_FAULT_OOM;
5658 goto backout_unlocked;
5659 }
5660 /* Just decrements count, does not deallocate */
5661 vma_end_reservation(h, vma, haddr);
5662 }
5663
5664 ptl = huge_pte_lock(h, mm, ptep);
5665 ret = 0;
5666 /* If pte changed from under us, retry */
5667 if (!pte_same(huge_ptep_get(ptep), old_pte))
5668 goto backout;
5669
5670 if (anon_rmap) {
5671 ClearHPageRestoreReserve(page);
5672 hugepage_add_new_anon_rmap(page, vma, haddr);
5673 } else
5674 page_dup_file_rmap(page, true);
5675 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
5676 && (vma->vm_flags & VM_SHARED)));
5677 /*
5678 * If this pte was previously wr-protected, keep it wr-protected even
5679 * if populated.
5680 */
5681 if (unlikely(pte_marker_uffd_wp(old_pte)))
5682 new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
5683 set_huge_pte_at(mm, haddr, ptep, new_pte);
5684
5685 hugetlb_count_add(pages_per_huge_page(h), mm);
5686 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5687 /* Optimization, do the COW without a second fault */
5688 ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
5689 }
5690
5691 spin_unlock(ptl);
5692
5693 /*
5694 * Only set HPageMigratable in newly allocated pages. Existing pages
5695 * found in the pagecache may not have HPageMigratable set if they have
5696 * been isolated for migration.
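 * (isolate_hugetlb() clears HPageMigratable while a page sits on an
 * isolation list and putback_active_hugepage() sets it again, so
 * re-setting the flag here for an existing page could undo an isolation
 * in progress.)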
5697 */ 5698 if (new_page) 5699 SetHPageMigratable(page); 5700 5701 unlock_page(page); 5702 out: 5703 return ret; 5704 5705 backout: 5706 spin_unlock(ptl); 5707 backout_unlocked: 5708 unlock_page(page); 5709 /* restore reserve for newly allocated pages not in page cache */ 5710 if (new_page && !new_pagecache_page) 5711 restore_reserve_on_error(h, vma, haddr, page); 5712 put_page(page); 5713 goto out; 5714 } 5715 5716 #ifdef CONFIG_SMP 5717 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 5718 { 5719 unsigned long key[2]; 5720 u32 hash; 5721 5722 key[0] = (unsigned long) mapping; 5723 key[1] = idx; 5724 5725 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); 5726 5727 return hash & (num_fault_mutexes - 1); 5728 } 5729 #else 5730 /* 5731 * For uniprocessor systems we always use a single mutex, so just 5732 * return 0 and avoid the hashing overhead. 5733 */ 5734 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 5735 { 5736 return 0; 5737 } 5738 #endif 5739 5740 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 5741 unsigned long address, unsigned int flags) 5742 { 5743 pte_t *ptep, entry; 5744 spinlock_t *ptl; 5745 vm_fault_t ret; 5746 u32 hash; 5747 pgoff_t idx; 5748 struct page *page = NULL; 5749 struct page *pagecache_page = NULL; 5750 struct hstate *h = hstate_vma(vma); 5751 struct address_space *mapping; 5752 int need_wait_lock = 0; 5753 unsigned long haddr = address & huge_page_mask(h); 5754 5755 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 5756 if (ptep) { 5757 /* 5758 * Since we hold no locks, ptep could be stale. That is 5759 * OK as we are only making decisions based on content and 5760 * not actually modifying content here. 5761 */ 5762 entry = huge_ptep_get(ptep); 5763 if (unlikely(is_hugetlb_entry_migration(entry))) { 5764 migration_entry_wait_huge(vma, ptep); 5765 return 0; 5766 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 5767 return VM_FAULT_HWPOISON_LARGE | 5768 VM_FAULT_SET_HINDEX(hstate_index(h)); 5769 } 5770 5771 /* 5772 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold 5773 * until finished with ptep. This serves two purposes: 5774 * 1) It prevents huge_pmd_unshare from being called elsewhere 5775 * and making the ptep no longer valid. 5776 * 2) It synchronizes us with i_size modifications during truncation. 5777 * 5778 * ptep could have already be assigned via huge_pte_offset. That 5779 * is OK, as huge_pte_alloc will return the same value unless 5780 * something has changed. 5781 */ 5782 mapping = vma->vm_file->f_mapping; 5783 i_mmap_lock_read(mapping); 5784 ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); 5785 if (!ptep) { 5786 i_mmap_unlock_read(mapping); 5787 return VM_FAULT_OOM; 5788 } 5789 5790 /* 5791 * Serialize hugepage allocation and instantiation, so that we don't 5792 * get spurious allocation failures if two CPUs race to instantiate 5793 * the same page in the page cache. 
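 * The mutex to take is chosen by hashing (mapping, index) via
 * hugetlb_fault_mutex_hash() above, so faults on different pages are
 * spread across num_fault_mutexes locks rather than serialized behind a
 * single global lock.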
5794 */ 5795 idx = vma_hugecache_offset(h, vma, haddr); 5796 hash = hugetlb_fault_mutex_hash(mapping, idx); 5797 mutex_lock(&hugetlb_fault_mutex_table[hash]); 5798 5799 entry = huge_ptep_get(ptep); 5800 /* PTE markers should be handled the same way as none pte */ 5801 if (huge_pte_none_mostly(entry)) { 5802 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, 5803 entry, flags); 5804 goto out_mutex; 5805 } 5806 5807 ret = 0; 5808 5809 /* 5810 * entry could be a migration/hwpoison entry at this point, so this 5811 * check prevents the kernel from going below assuming that we have 5812 * an active hugepage in pagecache. This goto expects the 2nd page 5813 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will 5814 * properly handle it. 5815 */ 5816 if (!pte_present(entry)) 5817 goto out_mutex; 5818 5819 /* 5820 * If we are going to COW/unshare the mapping later, we examine the 5821 * pending reservations for this page now. This will ensure that any 5822 * allocations necessary to record that reservation occur outside the 5823 * spinlock. Also lookup the pagecache page now as it is used to 5824 * determine if a reservation has been consumed. 5825 */ 5826 if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && 5827 !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { 5828 if (vma_needs_reservation(h, vma, haddr) < 0) { 5829 ret = VM_FAULT_OOM; 5830 goto out_mutex; 5831 } 5832 /* Just decrements count, does not deallocate */ 5833 vma_end_reservation(h, vma, haddr); 5834 5835 pagecache_page = find_lock_page(mapping, idx); 5836 } 5837 5838 ptl = huge_pte_lock(h, mm, ptep); 5839 5840 /* Check for a racing update before calling hugetlb_wp() */ 5841 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 5842 goto out_ptl; 5843 5844 /* Handle userfault-wp first, before trying to lock more pages */ 5845 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && 5846 (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 5847 struct vm_fault vmf = { 5848 .vma = vma, 5849 .address = haddr, 5850 .real_address = address, 5851 .flags = flags, 5852 }; 5853 5854 spin_unlock(ptl); 5855 if (pagecache_page) { 5856 unlock_page(pagecache_page); 5857 put_page(pagecache_page); 5858 } 5859 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 5860 i_mmap_unlock_read(mapping); 5861 return handle_userfault(&vmf, VM_UFFD_WP); 5862 } 5863 5864 /* 5865 * hugetlb_wp() requires page locks of pte_page(entry) and 5866 * pagecache_page, so here we need take the former one 5867 * when page != pagecache_page or !pagecache_page. 
5868 */ 5869 page = pte_page(entry); 5870 if (page != pagecache_page) 5871 if (!trylock_page(page)) { 5872 need_wait_lock = 1; 5873 goto out_ptl; 5874 } 5875 5876 get_page(page); 5877 5878 if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { 5879 if (!huge_pte_write(entry)) { 5880 ret = hugetlb_wp(mm, vma, address, ptep, flags, 5881 pagecache_page, ptl); 5882 goto out_put_page; 5883 } else if (likely(flags & FAULT_FLAG_WRITE)) { 5884 entry = huge_pte_mkdirty(entry); 5885 } 5886 } 5887 entry = pte_mkyoung(entry); 5888 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, 5889 flags & FAULT_FLAG_WRITE)) 5890 update_mmu_cache(vma, haddr, ptep); 5891 out_put_page: 5892 if (page != pagecache_page) 5893 unlock_page(page); 5894 put_page(page); 5895 out_ptl: 5896 spin_unlock(ptl); 5897 5898 if (pagecache_page) { 5899 unlock_page(pagecache_page); 5900 put_page(pagecache_page); 5901 } 5902 out_mutex: 5903 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 5904 i_mmap_unlock_read(mapping); 5905 /* 5906 * Generally it's safe to hold refcount during waiting page lock. But 5907 * here we just wait to defer the next page fault to avoid busy loop and 5908 * the page is not used after unlocked before returning from the current 5909 * page fault. So we are safe from accessing freed page, even if we wait 5910 * here without taking refcount. 5911 */ 5912 if (need_wait_lock) 5913 wait_on_page_locked(page); 5914 return ret; 5915 } 5916 5917 #ifdef CONFIG_USERFAULTFD 5918 /* 5919 * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with 5920 * modifications for huge pages. 5921 */ 5922 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, 5923 pte_t *dst_pte, 5924 struct vm_area_struct *dst_vma, 5925 unsigned long dst_addr, 5926 unsigned long src_addr, 5927 enum mcopy_atomic_mode mode, 5928 struct page **pagep, 5929 bool wp_copy) 5930 { 5931 bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); 5932 struct hstate *h = hstate_vma(dst_vma); 5933 struct address_space *mapping = dst_vma->vm_file->f_mapping; 5934 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); 5935 unsigned long size; 5936 int vm_shared = dst_vma->vm_flags & VM_SHARED; 5937 pte_t _dst_pte; 5938 spinlock_t *ptl; 5939 int ret = -ENOMEM; 5940 struct page *page; 5941 int writable; 5942 bool page_in_pagecache = false; 5943 5944 if (is_continue) { 5945 ret = -EFAULT; 5946 page = find_lock_page(mapping, idx); 5947 if (!page) 5948 goto out; 5949 page_in_pagecache = true; 5950 } else if (!*pagep) { 5951 /* If a page already exists, then it's UFFDIO_COPY for 5952 * a non-missing case. Return -EEXIST. 5953 */ 5954 if (vm_shared && 5955 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 5956 ret = -EEXIST; 5957 goto out; 5958 } 5959 5960 page = alloc_huge_page(dst_vma, dst_addr, 0); 5961 if (IS_ERR(page)) { 5962 ret = -ENOMEM; 5963 goto out; 5964 } 5965 5966 ret = copy_huge_page_from_user(page, 5967 (const void __user *) src_addr, 5968 pages_per_huge_page(h), false); 5969 5970 /* fallback to copy_from_user outside mmap_lock */ 5971 if (unlikely(ret)) { 5972 ret = -ENOENT; 5973 /* Free the allocated page which may have 5974 * consumed a reservation. 5975 */ 5976 restore_reserve_on_error(h, dst_vma, dst_addr, page); 5977 put_page(page); 5978 5979 /* Allocate a temporary page to hold the copied 5980 * contents. 
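 * Returning -ENOENT asks the caller to drop mmap_lock, perform the user
 * copy into this temporary page, and then call back in with *pagep still
 * set; that retry is handled by the branch below that runs when *pagep
 * is already set.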
5981 */ 5982 page = alloc_huge_page_vma(h, dst_vma, dst_addr); 5983 if (!page) { 5984 ret = -ENOMEM; 5985 goto out; 5986 } 5987 *pagep = page; 5988 /* Set the outparam pagep and return to the caller to 5989 * copy the contents outside the lock. Don't free the 5990 * page. 5991 */ 5992 goto out; 5993 } 5994 } else { 5995 if (vm_shared && 5996 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 5997 put_page(*pagep); 5998 ret = -EEXIST; 5999 *pagep = NULL; 6000 goto out; 6001 } 6002 6003 page = alloc_huge_page(dst_vma, dst_addr, 0); 6004 if (IS_ERR(page)) { 6005 put_page(*pagep); 6006 ret = -ENOMEM; 6007 *pagep = NULL; 6008 goto out; 6009 } 6010 copy_user_huge_page(page, *pagep, dst_addr, dst_vma, 6011 pages_per_huge_page(h)); 6012 put_page(*pagep); 6013 *pagep = NULL; 6014 } 6015 6016 /* 6017 * The memory barrier inside __SetPageUptodate makes sure that 6018 * preceding stores to the page contents become visible before 6019 * the set_pte_at() write. 6020 */ 6021 __SetPageUptodate(page); 6022 6023 /* Add shared, newly allocated pages to the page cache. */ 6024 if (vm_shared && !is_continue) { 6025 size = i_size_read(mapping->host) >> huge_page_shift(h); 6026 ret = -EFAULT; 6027 if (idx >= size) 6028 goto out_release_nounlock; 6029 6030 /* 6031 * Serialization between remove_inode_hugepages() and 6032 * huge_add_to_page_cache() below happens through the 6033 * hugetlb_fault_mutex_table that here must be hold by 6034 * the caller. 6035 */ 6036 ret = huge_add_to_page_cache(page, mapping, idx); 6037 if (ret) 6038 goto out_release_nounlock; 6039 page_in_pagecache = true; 6040 } 6041 6042 ptl = huge_pte_lock(h, dst_mm, dst_pte); 6043 6044 /* 6045 * Recheck the i_size after holding PT lock to make sure not 6046 * to leave any page mapped (as page_mapped()) beyond the end 6047 * of the i_size (remove_inode_hugepages() is strict about 6048 * enforcing that). If we bail out here, we'll also leave a 6049 * page in the radix tree in the vm_shared case beyond the end 6050 * of the i_size, but remove_inode_hugepages() will take care 6051 * of it as soon as we drop the hugetlb_fault_mutex_table. 6052 */ 6053 size = i_size_read(mapping->host) >> huge_page_shift(h); 6054 ret = -EFAULT; 6055 if (idx >= size) 6056 goto out_release_unlock; 6057 6058 ret = -EEXIST; 6059 /* 6060 * We allow to overwrite a pte marker: consider when both MISSING|WP 6061 * registered, we firstly wr-protect a none pte which has no page cache 6062 * page backing it, then access the page. 6063 */ 6064 if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) 6065 goto out_release_unlock; 6066 6067 if (page_in_pagecache) { 6068 page_dup_file_rmap(page, true); 6069 } else { 6070 ClearHPageRestoreReserve(page); 6071 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); 6072 } 6073 6074 /* 6075 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY 6076 * with wp flag set, don't set pte write bit. 6077 */ 6078 if (wp_copy || (is_continue && !vm_shared)) 6079 writable = 0; 6080 else 6081 writable = dst_vma->vm_flags & VM_WRITE; 6082 6083 _dst_pte = make_huge_pte(dst_vma, page, writable); 6084 /* 6085 * Always mark UFFDIO_COPY page dirty; note that this may not be 6086 * extremely important for hugetlbfs for now since swapping is not 6087 * supported, but we should still be clear in that this page cannot be 6088 * thrown away at will, even if write bit not set. 
6089 */ 6090 _dst_pte = huge_pte_mkdirty(_dst_pte); 6091 _dst_pte = pte_mkyoung(_dst_pte); 6092 6093 if (wp_copy) 6094 _dst_pte = huge_pte_mkuffd_wp(_dst_pte); 6095 6096 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 6097 6098 hugetlb_count_add(pages_per_huge_page(h), dst_mm); 6099 6100 /* No need to invalidate - it was non-present before */ 6101 update_mmu_cache(dst_vma, dst_addr, dst_pte); 6102 6103 spin_unlock(ptl); 6104 if (!is_continue) 6105 SetHPageMigratable(page); 6106 if (vm_shared || is_continue) 6107 unlock_page(page); 6108 ret = 0; 6109 out: 6110 return ret; 6111 out_release_unlock: 6112 spin_unlock(ptl); 6113 if (vm_shared || is_continue) 6114 unlock_page(page); 6115 out_release_nounlock: 6116 if (!page_in_pagecache) 6117 restore_reserve_on_error(h, dst_vma, dst_addr, page); 6118 put_page(page); 6119 goto out; 6120 } 6121 #endif /* CONFIG_USERFAULTFD */ 6122 6123 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, 6124 int refs, struct page **pages, 6125 struct vm_area_struct **vmas) 6126 { 6127 int nr; 6128 6129 for (nr = 0; nr < refs; nr++) { 6130 if (likely(pages)) 6131 pages[nr] = mem_map_offset(page, nr); 6132 if (vmas) 6133 vmas[nr] = vma; 6134 } 6135 } 6136 6137 static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, 6138 bool *unshare) 6139 { 6140 pte_t pteval = huge_ptep_get(pte); 6141 6142 *unshare = false; 6143 if (is_swap_pte(pteval)) 6144 return true; 6145 if (huge_pte_write(pteval)) 6146 return false; 6147 if (flags & FOLL_WRITE) 6148 return true; 6149 if (gup_must_unshare(flags, pte_page(pteval))) { 6150 *unshare = true; 6151 return true; 6152 } 6153 return false; 6154 } 6155 6156 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 6157 struct page **pages, struct vm_area_struct **vmas, 6158 unsigned long *position, unsigned long *nr_pages, 6159 long i, unsigned int flags, int *locked) 6160 { 6161 unsigned long pfn_offset; 6162 unsigned long vaddr = *position; 6163 unsigned long remainder = *nr_pages; 6164 struct hstate *h = hstate_vma(vma); 6165 int err = -EFAULT, refs; 6166 6167 while (vaddr < vma->vm_end && remainder) { 6168 pte_t *pte; 6169 spinlock_t *ptl = NULL; 6170 bool unshare = false; 6171 int absent; 6172 struct page *page; 6173 6174 /* 6175 * If we have a pending SIGKILL, don't keep faulting pages and 6176 * potentially allocating memory. 6177 */ 6178 if (fatal_signal_pending(current)) { 6179 remainder = 0; 6180 break; 6181 } 6182 6183 /* 6184 * Some archs (sparc64, sh*) have multiple pte_ts to 6185 * each hugepage. We have to make sure we get the 6186 * first, for the page indexing below to work. 6187 * 6188 * Note that page table lock is not held when pte is null. 6189 */ 6190 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), 6191 huge_page_size(h)); 6192 if (pte) 6193 ptl = huge_pte_lock(h, mm, pte); 6194 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 6195 6196 /* 6197 * When coredumping, it suits get_dump_page if we just return 6198 * an error where there's an empty slot with no huge pagecache 6199 * to back it. This way, we avoid allocating a hugepage, and 6200 * the sparse dumpfile avoids allocating disk blocks, but its 6201 * huge holes still show up with zeroes where they need to be. 
6202 */ 6203 if (absent && (flags & FOLL_DUMP) && 6204 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 6205 if (pte) 6206 spin_unlock(ptl); 6207 remainder = 0; 6208 break; 6209 } 6210 6211 /* 6212 * We need call hugetlb_fault for both hugepages under migration 6213 * (in which case hugetlb_fault waits for the migration,) and 6214 * hwpoisoned hugepages (in which case we need to prevent the 6215 * caller from accessing to them.) In order to do this, we use 6216 * here is_swap_pte instead of is_hugetlb_entry_migration and 6217 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 6218 * both cases, and because we can't follow correct pages 6219 * directly from any kind of swap entries. 6220 */ 6221 if (absent || 6222 __follow_hugetlb_must_fault(flags, pte, &unshare)) { 6223 vm_fault_t ret; 6224 unsigned int fault_flags = 0; 6225 6226 if (pte) 6227 spin_unlock(ptl); 6228 if (flags & FOLL_WRITE) 6229 fault_flags |= FAULT_FLAG_WRITE; 6230 else if (unshare) 6231 fault_flags |= FAULT_FLAG_UNSHARE; 6232 if (locked) 6233 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 6234 FAULT_FLAG_KILLABLE; 6235 if (flags & FOLL_NOWAIT) 6236 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 6237 FAULT_FLAG_RETRY_NOWAIT; 6238 if (flags & FOLL_TRIED) { 6239 /* 6240 * Note: FAULT_FLAG_ALLOW_RETRY and 6241 * FAULT_FLAG_TRIED can co-exist 6242 */ 6243 fault_flags |= FAULT_FLAG_TRIED; 6244 } 6245 ret = hugetlb_fault(mm, vma, vaddr, fault_flags); 6246 if (ret & VM_FAULT_ERROR) { 6247 err = vm_fault_to_errno(ret, flags); 6248 remainder = 0; 6249 break; 6250 } 6251 if (ret & VM_FAULT_RETRY) { 6252 if (locked && 6253 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) 6254 *locked = 0; 6255 *nr_pages = 0; 6256 /* 6257 * VM_FAULT_RETRY must not return an 6258 * error, it will return zero 6259 * instead. 6260 * 6261 * No need to update "position" as the 6262 * caller will not check it after 6263 * *nr_pages is set to 0. 6264 */ 6265 return i; 6266 } 6267 continue; 6268 } 6269 6270 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 6271 page = pte_page(huge_ptep_get(pte)); 6272 6273 VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && 6274 !PageAnonExclusive(page), page); 6275 6276 /* 6277 * If subpage information not requested, update counters 6278 * and skip the same_page loop below. 6279 */ 6280 if (!pages && !vmas && !pfn_offset && 6281 (vaddr + huge_page_size(h) < vma->vm_end) && 6282 (remainder >= pages_per_huge_page(h))) { 6283 vaddr += huge_page_size(h); 6284 remainder -= pages_per_huge_page(h); 6285 i += pages_per_huge_page(h); 6286 spin_unlock(ptl); 6287 continue; 6288 } 6289 6290 /* vaddr may not be aligned to PAGE_SIZE */ 6291 refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, 6292 (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); 6293 6294 if (pages || vmas) 6295 record_subpages_vmas(mem_map_offset(page, pfn_offset), 6296 vma, refs, 6297 likely(pages) ? pages + i : NULL, 6298 vmas ? vmas + i : NULL); 6299 6300 if (pages) { 6301 /* 6302 * try_grab_folio() should always succeed here, 6303 * because: a) we hold the ptl lock, and b) we've just 6304 * checked that the huge page is present in the page 6305 * tables. If the huge page is present, then the tail 6306 * pages must also be present. The ptl prevents the 6307 * head page and tail pages from being rearranged in 6308 * any way. 
So this page must be available at this 6309 * point, unless the page refcount overflowed: 6310 */ 6311 if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, 6312 flags))) { 6313 spin_unlock(ptl); 6314 remainder = 0; 6315 err = -ENOMEM; 6316 break; 6317 } 6318 } 6319 6320 vaddr += (refs << PAGE_SHIFT); 6321 remainder -= refs; 6322 i += refs; 6323 6324 spin_unlock(ptl); 6325 } 6326 *nr_pages = remainder; 6327 /* 6328 * setting position is actually required only if remainder is 6329 * not zero but it's faster not to add a "if (remainder)" 6330 * branch. 6331 */ 6332 *position = vaddr; 6333 6334 return i ? i : err; 6335 } 6336 6337 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 6338 unsigned long address, unsigned long end, 6339 pgprot_t newprot, unsigned long cp_flags) 6340 { 6341 struct mm_struct *mm = vma->vm_mm; 6342 unsigned long start = address; 6343 pte_t *ptep; 6344 pte_t pte; 6345 struct hstate *h = hstate_vma(vma); 6346 unsigned long pages = 0, psize = huge_page_size(h); 6347 bool shared_pmd = false; 6348 struct mmu_notifier_range range; 6349 unsigned long last_addr_mask; 6350 bool uffd_wp = cp_flags & MM_CP_UFFD_WP; 6351 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; 6352 6353 /* 6354 * In the case of shared PMDs, the area to flush could be beyond 6355 * start/end. Set range.start/range.end to cover the maximum possible 6356 * range if PMD sharing is possible. 6357 */ 6358 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 6359 0, vma, mm, start, end); 6360 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 6361 6362 BUG_ON(address >= end); 6363 flush_cache_range(vma, range.start, range.end); 6364 6365 mmu_notifier_invalidate_range_start(&range); 6366 last_addr_mask = hugetlb_mask_last_page(h); 6367 i_mmap_lock_write(vma->vm_file->f_mapping); 6368 for (; address < end; address += psize) { 6369 spinlock_t *ptl; 6370 ptep = huge_pte_offset(mm, address, psize); 6371 if (!ptep) { 6372 address |= last_addr_mask; 6373 continue; 6374 } 6375 ptl = huge_pte_lock(h, mm, ptep); 6376 if (huge_pmd_unshare(mm, vma, address, ptep)) { 6377 /* 6378 * When uffd-wp is enabled on the vma, unshare 6379 * shouldn't happen at all. Warn about it if it 6380 * happened due to some reason. 6381 */ 6382 WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); 6383 pages++; 6384 spin_unlock(ptl); 6385 shared_pmd = true; 6386 address |= last_addr_mask; 6387 continue; 6388 } 6389 pte = huge_ptep_get(ptep); 6390 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 6391 spin_unlock(ptl); 6392 continue; 6393 } 6394 if (unlikely(is_hugetlb_entry_migration(pte))) { 6395 swp_entry_t entry = pte_to_swp_entry(pte); 6396 struct page *page = pfn_swap_entry_to_page(entry); 6397 6398 if (!is_readable_migration_entry(entry)) { 6399 pte_t newpte; 6400 6401 if (PageAnon(page)) 6402 entry = make_readable_exclusive_migration_entry( 6403 swp_offset(entry)); 6404 else 6405 entry = make_readable_migration_entry( 6406 swp_offset(entry)); 6407 newpte = swp_entry_to_pte(entry); 6408 if (uffd_wp) 6409 newpte = pte_swp_mkuffd_wp(newpte); 6410 else if (uffd_wp_resolve) 6411 newpte = pte_swp_clear_uffd_wp(newpte); 6412 set_huge_pte_at(mm, address, ptep, newpte); 6413 pages++; 6414 } 6415 spin_unlock(ptl); 6416 continue; 6417 } 6418 if (unlikely(pte_marker_uffd_wp(pte))) { 6419 /* 6420 * This is changing a non-present pte into a none pte, 6421 * no need for huge_ptep_modify_prot_start/commit(). 
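 * (The old entry is only a software pte marker, not a present mapping,
 * so clearing it directly is fine; the modify_prot_start/commit helpers
 * exist to transform present ptes without losing concurrent hardware
 * access/dirty bit updates.)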
6422 */ 6423 if (uffd_wp_resolve) 6424 huge_pte_clear(mm, address, ptep, psize); 6425 } 6426 if (!huge_pte_none(pte)) { 6427 pte_t old_pte; 6428 unsigned int shift = huge_page_shift(hstate_vma(vma)); 6429 6430 old_pte = huge_ptep_modify_prot_start(vma, address, ptep); 6431 pte = huge_pte_modify(old_pte, newprot); 6432 pte = arch_make_huge_pte(pte, shift, vma->vm_flags); 6433 if (uffd_wp) 6434 pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte)); 6435 else if (uffd_wp_resolve) 6436 pte = huge_pte_clear_uffd_wp(pte); 6437 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); 6438 pages++; 6439 } else { 6440 /* None pte */ 6441 if (unlikely(uffd_wp)) 6442 /* Safe to modify directly (none->non-present). */ 6443 set_huge_pte_at(mm, address, ptep, 6444 make_pte_marker(PTE_MARKER_UFFD_WP)); 6445 } 6446 spin_unlock(ptl); 6447 } 6448 /* 6449 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 6450 * may have cleared our pud entry and done put_page on the page table: 6451 * once we release i_mmap_rwsem, another task can do the final put_page 6452 * and that page table be reused and filled with junk. If we actually 6453 * did unshare a page of pmds, flush the range corresponding to the pud. 6454 */ 6455 if (shared_pmd) 6456 flush_hugetlb_tlb_range(vma, range.start, range.end); 6457 else 6458 flush_hugetlb_tlb_range(vma, start, end); 6459 /* 6460 * No need to call mmu_notifier_invalidate_range() we are downgrading 6461 * page table protection not changing it to point to a new page. 6462 * 6463 * See Documentation/mm/mmu_notifier.rst 6464 */ 6465 i_mmap_unlock_write(vma->vm_file->f_mapping); 6466 mmu_notifier_invalidate_range_end(&range); 6467 6468 return pages << h->order; 6469 } 6470 6471 /* Return true if reservation was successful, false otherwise. */ 6472 bool hugetlb_reserve_pages(struct inode *inode, 6473 long from, long to, 6474 struct vm_area_struct *vma, 6475 vm_flags_t vm_flags) 6476 { 6477 long chg, add = -1; 6478 struct hstate *h = hstate_inode(inode); 6479 struct hugepage_subpool *spool = subpool_inode(inode); 6480 struct resv_map *resv_map; 6481 struct hugetlb_cgroup *h_cg = NULL; 6482 long gbl_reserve, regions_needed = 0; 6483 6484 /* This should never happen */ 6485 if (from > to) { 6486 VM_WARN(1, "%s called with a negative range\n", __func__); 6487 return false; 6488 } 6489 6490 /* 6491 * Only apply hugepage reservation if asked. At fault time, an 6492 * attempt will be made for VM_NORESERVE to allocate a page 6493 * without using reserves 6494 */ 6495 if (vm_flags & VM_NORESERVE) 6496 return true; 6497 6498 /* 6499 * Shared mappings base their reservation on the number of pages that 6500 * are already allocated on behalf of the file. Private mappings need 6501 * to reserve the full area even if read-only as mprotect() may be 6502 * called to make the mapping read-write. Assume !vma is a shm mapping 6503 */ 6504 if (!vma || vma->vm_flags & VM_MAYSHARE) { 6505 /* 6506 * resv_map can not be NULL as hugetlb_reserve_pages is only 6507 * called for inodes for which resv_maps were created (see 6508 * hugetlbfs_get_inode). 6509 */ 6510 resv_map = inode_resv_map(inode); 6511 6512 chg = region_chg(resv_map, from, to, ®ions_needed); 6513 6514 } else { 6515 /* Private mapping. 
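 * A new reservation map is allocated and attached to the VMA, which
 * becomes the reservation owner (HPAGE_RESV_OWNER), and the full
 * 'to - from' range is charged up front.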
*/ 6516 resv_map = resv_map_alloc(); 6517 if (!resv_map) 6518 return false; 6519 6520 chg = to - from; 6521 6522 set_vma_resv_map(vma, resv_map); 6523 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 6524 } 6525 6526 if (chg < 0) 6527 goto out_err; 6528 6529 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), 6530 chg * pages_per_huge_page(h), &h_cg) < 0) 6531 goto out_err; 6532 6533 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 6534 /* For private mappings, the hugetlb_cgroup uncharge info hangs 6535 * of the resv_map. 6536 */ 6537 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 6538 } 6539 6540 /* 6541 * There must be enough pages in the subpool for the mapping. If 6542 * the subpool has a minimum size, there may be some global 6543 * reservations already in place (gbl_reserve). 6544 */ 6545 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 6546 if (gbl_reserve < 0) 6547 goto out_uncharge_cgroup; 6548 6549 /* 6550 * Check enough hugepages are available for the reservation. 6551 * Hand the pages back to the subpool if there are not 6552 */ 6553 if (hugetlb_acct_memory(h, gbl_reserve) < 0) 6554 goto out_put_pages; 6555 6556 /* 6557 * Account for the reservations made. Shared mappings record regions 6558 * that have reservations as they are shared by multiple VMAs. 6559 * When the last VMA disappears, the region map says how much 6560 * the reservation was and the page cache tells how much of 6561 * the reservation was consumed. Private mappings are per-VMA and 6562 * only the consumed reservations are tracked. When the VMA 6563 * disappears, the original reservation is the VMA size and the 6564 * consumed reservations are stored in the map. Hence, nothing 6565 * else has to be done for private mappings here 6566 */ 6567 if (!vma || vma->vm_flags & VM_MAYSHARE) { 6568 add = region_add(resv_map, from, to, regions_needed, h, h_cg); 6569 6570 if (unlikely(add < 0)) { 6571 hugetlb_acct_memory(h, -gbl_reserve); 6572 goto out_put_pages; 6573 } else if (unlikely(chg > add)) { 6574 /* 6575 * pages in this range were added to the reserve 6576 * map between region_chg and region_add. This 6577 * indicates a race with alloc_huge_page. Adjust 6578 * the subpool and reserve counts modified above 6579 * based on the difference. 6580 */ 6581 long rsv_adjust; 6582 6583 /* 6584 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the 6585 * reference to h_cg->css. See comment below for detail. 6586 */ 6587 hugetlb_cgroup_uncharge_cgroup_rsvd( 6588 hstate_index(h), 6589 (chg - add) * pages_per_huge_page(h), h_cg); 6590 6591 rsv_adjust = hugepage_subpool_put_pages(spool, 6592 chg - add); 6593 hugetlb_acct_memory(h, -rsv_adjust); 6594 } else if (h_cg) { 6595 /* 6596 * The file_regions will hold their own reference to 6597 * h_cg->css. So we should release the reference held 6598 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are 6599 * done. 6600 */ 6601 hugetlb_cgroup_put_rsvd_cgroup(h_cg); 6602 } 6603 } 6604 return true; 6605 6606 out_put_pages: 6607 /* put back original number of pages, chg */ 6608 (void)hugepage_subpool_put_pages(spool, chg); 6609 out_uncharge_cgroup: 6610 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 6611 chg * pages_per_huge_page(h), h_cg); 6612 out_err: 6613 if (!vma || vma->vm_flags & VM_MAYSHARE) 6614 /* Only call region_abort if the region_chg succeeded but the 6615 * region_add failed or didn't run. 
6616 */
6617 if (chg >= 0 && add < 0)
6618 region_abort(resv_map, from, to, regions_needed);
6619 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
6620 kref_put(&resv_map->refs, resv_map_release);
6621 return false;
6622 }
6623
6624 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
6625 long freed)
6626 {
6627 struct hstate *h = hstate_inode(inode);
6628 struct resv_map *resv_map = inode_resv_map(inode);
6629 long chg = 0;
6630 struct hugepage_subpool *spool = subpool_inode(inode);
6631 long gbl_reserve;
6632
6633 /*
6634 * Since this routine can be called in the evict inode path for all
6635 * hugetlbfs inodes, resv_map could be NULL.
6636 */
6637 if (resv_map) {
6638 chg = region_del(resv_map, start, end);
6639 /*
6640 * region_del() can fail in the rare case where a region
6641 * must be split and another region descriptor cannot be
6642 * allocated. If end == LONG_MAX, it will not fail.
6643 */
6644 if (chg < 0)
6645 return chg;
6646 }
6647
6648 spin_lock(&inode->i_lock);
6649 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
6650 spin_unlock(&inode->i_lock);
6651
6652 /*
6653 * If the subpool has a minimum size, the number of global
6654 * reservations to be released may be adjusted.
6655 *
6656 * Note that !resv_map implies freed == 0. So (chg - freed)
6657 * won't go negative.
6658 */
6659 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
6660 hugetlb_acct_memory(h, -gbl_reserve);
6661
6662 return 0;
6663 }
6664
6665 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
6666 static unsigned long page_table_shareable(struct vm_area_struct *svma,
6667 struct vm_area_struct *vma,
6668 unsigned long addr, pgoff_t idx)
6669 {
6670 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
6671 svma->vm_start;
6672 unsigned long sbase = saddr & PUD_MASK;
6673 unsigned long s_end = sbase + PUD_SIZE;
6674
6675 /* Allow segments to share if only one is marked locked */
6676 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
6677 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
6678
6679 /*
6680 * match the virtual addresses, permissions and the alignment of the
6681 * page table page.
6682 */
6683 if (pmd_index(addr) != pmd_index(saddr) ||
6684 vm_flags != svm_flags ||
6685 !range_in_vma(svma, sbase, s_end))
6686 return 0;
6687
6688 return saddr;
6689 }
6690
6691 static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
6692 {
6693 unsigned long base = addr & PUD_MASK;
6694 unsigned long end = base + PUD_SIZE;
6695
6696 /*
6697 * check on proper vm_flags and page table alignment
6698 */
6699 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
6700 return true;
6701 return false;
6702 }
6703
6704 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6705 {
6706 #ifdef CONFIG_USERFAULTFD
6707 if (uffd_disable_huge_pmd_share(vma))
6708 return false;
6709 #endif
6710 return vma_shareable(vma, addr);
6711 }
6712
6713 /*
6714 * Determine if the start,end range within vma could be mapped by a shared pmd.
6715 * If yes, adjust start and end to cover the range associated with possible
6716 * shared pmd mappings.
6717 */
6718 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6719 unsigned long *start, unsigned long *end)
6720 {
6721 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
6722 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6723
6724 /*
6725 * vma needs to span at least one aligned PUD size, and the range
6726 * must be at least partially within it.
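 * Illustrative example, assuming a 1GiB PUD_SIZE: for a VMA spanning
 * [1GiB, 3GiB) and a requested range of [1536MiB, 1792MiB), v_start and
 * v_end are 1GiB and 3GiB, so the range below is widened to the
 * PUD-aligned [1GiB, 2GiB).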
6727 */ 6728 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || 6729 (*end <= v_start) || (*start >= v_end)) 6730 return; 6731 6732 /* Extend the range to be PUD aligned for a worst case scenario */ 6733 if (*start > v_start) 6734 *start = ALIGN_DOWN(*start, PUD_SIZE); 6735 6736 if (*end < v_end) 6737 *end = ALIGN(*end, PUD_SIZE); 6738 } 6739 6740 /* 6741 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 6742 * and returns the corresponding pte. While this is not necessary for the 6743 * !shared pmd case because we can allocate the pmd later as well, it makes the 6744 * code much cleaner. 6745 * 6746 * This routine must be called with i_mmap_rwsem held in at least read mode if 6747 * sharing is possible. For hugetlbfs, this prevents removal of any page 6748 * table entries associated with the address space. This is important as we 6749 * are setting up sharing based on existing page table entries (mappings). 6750 */ 6751 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, 6752 unsigned long addr, pud_t *pud) 6753 { 6754 struct address_space *mapping = vma->vm_file->f_mapping; 6755 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 6756 vma->vm_pgoff; 6757 struct vm_area_struct *svma; 6758 unsigned long saddr; 6759 pte_t *spte = NULL; 6760 pte_t *pte; 6761 spinlock_t *ptl; 6762 6763 i_mmap_assert_locked(mapping); 6764 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 6765 if (svma == vma) 6766 continue; 6767 6768 saddr = page_table_shareable(svma, vma, addr, idx); 6769 if (saddr) { 6770 spte = huge_pte_offset(svma->vm_mm, saddr, 6771 vma_mmu_pagesize(svma)); 6772 if (spte) { 6773 get_page(virt_to_page(spte)); 6774 break; 6775 } 6776 } 6777 } 6778 6779 if (!spte) 6780 goto out; 6781 6782 ptl = huge_pte_lock(hstate_vma(vma), mm, spte); 6783 if (pud_none(*pud)) { 6784 pud_populate(mm, pud, 6785 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 6786 mm_inc_nr_pmds(mm); 6787 } else { 6788 put_page(virt_to_page(spte)); 6789 } 6790 spin_unlock(ptl); 6791 out: 6792 pte = (pte_t *)pmd_alloc(mm, pud, addr); 6793 return pte; 6794 } 6795 6796 /* 6797 * unmap huge page backed by shared pte. 6798 * 6799 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 6800 * indicated by page_count > 1, unmap is achieved by clearing pud and 6801 * decrementing the ref count. If count == 1, the pte page is not shared. 6802 * 6803 * Called with page table lock held and i_mmap_rwsem held in write mode. 
6804 * 6805 * returns: 1 successfully unmapped a shared pte page 6806 * 0 the underlying pte page is not shared, or it is the last user 6807 */ 6808 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 6809 unsigned long addr, pte_t *ptep) 6810 { 6811 pgd_t *pgd = pgd_offset(mm, addr); 6812 p4d_t *p4d = p4d_offset(pgd, addr); 6813 pud_t *pud = pud_offset(p4d, addr); 6814 6815 i_mmap_assert_write_locked(vma->vm_file->f_mapping); 6816 BUG_ON(page_count(virt_to_page(ptep)) == 0); 6817 if (page_count(virt_to_page(ptep)) == 1) 6818 return 0; 6819 6820 pud_clear(pud); 6821 put_page(virt_to_page(ptep)); 6822 mm_dec_nr_pmds(mm); 6823 return 1; 6824 } 6825 6826 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 6827 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, 6828 unsigned long addr, pud_t *pud) 6829 { 6830 return NULL; 6831 } 6832 6833 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 6834 unsigned long addr, pte_t *ptep) 6835 { 6836 return 0; 6837 } 6838 6839 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 6840 unsigned long *start, unsigned long *end) 6841 { 6842 } 6843 6844 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) 6845 { 6846 return false; 6847 } 6848 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 6849 6850 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 6851 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 6852 unsigned long addr, unsigned long sz) 6853 { 6854 pgd_t *pgd; 6855 p4d_t *p4d; 6856 pud_t *pud; 6857 pte_t *pte = NULL; 6858 6859 pgd = pgd_offset(mm, addr); 6860 p4d = p4d_alloc(mm, pgd, addr); 6861 if (!p4d) 6862 return NULL; 6863 pud = pud_alloc(mm, p4d, addr); 6864 if (pud) { 6865 if (sz == PUD_SIZE) { 6866 pte = (pte_t *)pud; 6867 } else { 6868 BUG_ON(sz != PMD_SIZE); 6869 if (want_pmd_share(vma, addr) && pud_none(*pud)) 6870 pte = huge_pmd_share(mm, vma, addr, pud); 6871 else 6872 pte = (pte_t *)pmd_alloc(mm, pud, addr); 6873 } 6874 } 6875 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); 6876 6877 return pte; 6878 } 6879 6880 /* 6881 * huge_pte_offset() - Walk the page table to resolve the hugepage 6882 * entry at address @addr 6883 * 6884 * Return: Pointer to page table entry (PUD or PMD) for 6885 * address @addr, or NULL if a !p*d_present() entry is encountered and the 6886 * size @sz doesn't match the hugepage size at this level of the page 6887 * table. 6888 */ 6889 pte_t *huge_pte_offset(struct mm_struct *mm, 6890 unsigned long addr, unsigned long sz) 6891 { 6892 pgd_t *pgd; 6893 p4d_t *p4d; 6894 pud_t *pud; 6895 pmd_t *pmd; 6896 6897 pgd = pgd_offset(mm, addr); 6898 if (!pgd_present(*pgd)) 6899 return NULL; 6900 p4d = p4d_offset(pgd, addr); 6901 if (!p4d_present(*p4d)) 6902 return NULL; 6903 6904 pud = pud_offset(p4d, addr); 6905 if (sz == PUD_SIZE) 6906 /* must be pud huge, non-present or none */ 6907 return (pte_t *)pud; 6908 if (!pud_present(*pud)) 6909 return NULL; 6910 /* must have a valid entry and size to go further */ 6911 6912 pmd = pmd_offset(pud, addr); 6913 /* must be pmd huge, non-present or none */ 6914 return (pte_t *)pmd; 6915 } 6916 6917 /* 6918 * Return a mask that can be used to update an address to the last huge 6919 * page in a page table page mapping size. Used to skip non-present 6920 * page table entries when linearly scanning address ranges. Architectures 6921 * with unique huge page to page table relationships can define their own 6922 * version of this routine. 
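 * For PMD-sized huge pages, for instance, the mask is PUD_SIZE - PMD_SIZE:
 * a caller such as hugetlb_change_protection() does
 * 'address |= last_addr_mask' when it finds no page table, which moves
 * address to the last PMD slot of the current PUD so that the loop's
 * next 'address += psize' lands on the following PUD boundary.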
6923 */ 6924 unsigned long hugetlb_mask_last_page(struct hstate *h) 6925 { 6926 unsigned long hp_size = huge_page_size(h); 6927 6928 if (hp_size == PUD_SIZE) 6929 return P4D_SIZE - PUD_SIZE; 6930 else if (hp_size == PMD_SIZE) 6931 return PUD_SIZE - PMD_SIZE; 6932 else 6933 return 0UL; 6934 } 6935 6936 #else 6937 6938 /* See description above. Architectures can provide their own version. */ 6939 __weak unsigned long hugetlb_mask_last_page(struct hstate *h) 6940 { 6941 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 6942 if (huge_page_size(h) == PMD_SIZE) 6943 return PUD_SIZE - PMD_SIZE; 6944 #endif 6945 return 0UL; 6946 } 6947 6948 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 6949 6950 /* 6951 * These functions are overwritable if your architecture needs its own 6952 * behavior. 6953 */ 6954 struct page * __weak 6955 follow_huge_addr(struct mm_struct *mm, unsigned long address, 6956 int write) 6957 { 6958 return ERR_PTR(-EINVAL); 6959 } 6960 6961 struct page * __weak 6962 follow_huge_pd(struct vm_area_struct *vma, 6963 unsigned long address, hugepd_t hpd, int flags, int pdshift) 6964 { 6965 WARN(1, "hugepd follow called with no support for hugepage directory format\n"); 6966 return NULL; 6967 } 6968 6969 struct page * __weak 6970 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 6971 pmd_t *pmd, int flags) 6972 { 6973 struct page *page = NULL; 6974 spinlock_t *ptl; 6975 pte_t pte; 6976 6977 /* 6978 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via 6979 * follow_hugetlb_page(). 6980 */ 6981 if (WARN_ON_ONCE(flags & FOLL_PIN)) 6982 return NULL; 6983 6984 retry: 6985 ptl = pmd_lockptr(mm, pmd); 6986 spin_lock(ptl); 6987 /* 6988 * make sure that the address range covered by this pmd is not 6989 * unmapped from other threads. 6990 */ 6991 if (!pmd_huge(*pmd)) 6992 goto out; 6993 pte = huge_ptep_get((pte_t *)pmd); 6994 if (pte_present(pte)) { 6995 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 6996 /* 6997 * try_grab_page() should always succeed here, because: a) we 6998 * hold the pmd (ptl) lock, and b) we've just checked that the 6999 * huge pmd (head) page is present in the page tables. The ptl 7000 * prevents the head page and tail pages from being rearranged 7001 * in any way. So this page must be available at this point, 7002 * unless the page refcount overflowed: 7003 */ 7004 if (WARN_ON_ONCE(!try_grab_page(page, flags))) { 7005 page = NULL; 7006 goto out; 7007 } 7008 } else { 7009 if (is_hugetlb_entry_migration(pte)) { 7010 spin_unlock(ptl); 7011 __migration_entry_wait_huge((pte_t *)pmd, ptl); 7012 goto retry; 7013 } 7014 /* 7015 * hwpoisoned entry is treated as no_page_table in 7016 * follow_page_mask(). 
7017 */ 7018 } 7019 out: 7020 spin_unlock(ptl); 7021 return page; 7022 } 7023 7024 struct page * __weak 7025 follow_huge_pud(struct mm_struct *mm, unsigned long address, 7026 pud_t *pud, int flags) 7027 { 7028 struct page *page = NULL; 7029 spinlock_t *ptl; 7030 pte_t pte; 7031 7032 if (WARN_ON_ONCE(flags & FOLL_PIN)) 7033 return NULL; 7034 7035 retry: 7036 ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); 7037 if (!pud_huge(*pud)) 7038 goto out; 7039 pte = huge_ptep_get((pte_t *)pud); 7040 if (pte_present(pte)) { 7041 page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); 7042 if (WARN_ON_ONCE(!try_grab_page(page, flags))) { 7043 page = NULL; 7044 goto out; 7045 } 7046 } else { 7047 if (is_hugetlb_entry_migration(pte)) { 7048 spin_unlock(ptl); 7049 __migration_entry_wait(mm, (pte_t *)pud, ptl); 7050 goto retry; 7051 } 7052 /* 7053 * hwpoisoned entry is treated as no_page_table in 7054 * follow_page_mask(). 7055 */ 7056 } 7057 out: 7058 spin_unlock(ptl); 7059 return page; 7060 } 7061 7062 struct page * __weak 7063 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) 7064 { 7065 if (flags & (FOLL_GET | FOLL_PIN)) 7066 return NULL; 7067 7068 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); 7069 } 7070 7071 int isolate_hugetlb(struct page *page, struct list_head *list) 7072 { 7073 int ret = 0; 7074 7075 spin_lock_irq(&hugetlb_lock); 7076 if (!PageHeadHuge(page) || 7077 !HPageMigratable(page) || 7078 !get_page_unless_zero(page)) { 7079 ret = -EBUSY; 7080 goto unlock; 7081 } 7082 ClearHPageMigratable(page); 7083 list_move_tail(&page->lru, list); 7084 unlock: 7085 spin_unlock_irq(&hugetlb_lock); 7086 return ret; 7087 } 7088 7089 int get_hwpoison_huge_page(struct page *page, bool *hugetlb) 7090 { 7091 int ret = 0; 7092 7093 *hugetlb = false; 7094 spin_lock_irq(&hugetlb_lock); 7095 if (PageHeadHuge(page)) { 7096 *hugetlb = true; 7097 if (HPageFreed(page)) 7098 ret = 0; 7099 else if (HPageMigratable(page)) 7100 ret = get_page_unless_zero(page); 7101 else 7102 ret = -EBUSY; 7103 } 7104 spin_unlock_irq(&hugetlb_lock); 7105 return ret; 7106 } 7107 7108 int get_huge_page_for_hwpoison(unsigned long pfn, int flags) 7109 { 7110 int ret; 7111 7112 spin_lock_irq(&hugetlb_lock); 7113 ret = __get_huge_page_for_hwpoison(pfn, flags); 7114 spin_unlock_irq(&hugetlb_lock); 7115 return ret; 7116 } 7117 7118 void putback_active_hugepage(struct page *page) 7119 { 7120 spin_lock_irq(&hugetlb_lock); 7121 SetHPageMigratable(page); 7122 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 7123 spin_unlock_irq(&hugetlb_lock); 7124 put_page(page); 7125 } 7126 7127 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) 7128 { 7129 struct hstate *h = page_hstate(oldpage); 7130 7131 hugetlb_cgroup_migrate(oldpage, newpage); 7132 set_page_owner_migrate_reason(newpage, reason); 7133 7134 /* 7135 * transfer temporary state of the new huge page. This is 7136 * reverse to other transitions because the newpage is going to 7137 * be final while the old one will be freed so it takes over 7138 * the temporary status. 7139 * 7140 * Also note that we have to transfer the per-node surplus state 7141 * here as well otherwise the global surplus count will not match 7142 * the per-node's. 
7143 */ 7144 if (HPageTemporary(newpage)) { 7145 int old_nid = page_to_nid(oldpage); 7146 int new_nid = page_to_nid(newpage); 7147 7148 SetHPageTemporary(oldpage); 7149 ClearHPageTemporary(newpage); 7150 7151 /* 7152 * There is no need to transfer the per-node surplus state 7153 * when we do not cross the node. 7154 */ 7155 if (new_nid == old_nid) 7156 return; 7157 spin_lock_irq(&hugetlb_lock); 7158 if (h->surplus_huge_pages_node[old_nid]) { 7159 h->surplus_huge_pages_node[old_nid]--; 7160 h->surplus_huge_pages_node[new_nid]++; 7161 } 7162 spin_unlock_irq(&hugetlb_lock); 7163 } 7164 } 7165 7166 /* 7167 * This function will unconditionally remove all the shared pmd pgtable entries 7168 * within the specific vma for a hugetlbfs memory range. 7169 */ 7170 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) 7171 { 7172 struct hstate *h = hstate_vma(vma); 7173 unsigned long sz = huge_page_size(h); 7174 struct mm_struct *mm = vma->vm_mm; 7175 struct mmu_notifier_range range; 7176 unsigned long address, start, end; 7177 spinlock_t *ptl; 7178 pte_t *ptep; 7179 7180 if (!(vma->vm_flags & VM_MAYSHARE)) 7181 return; 7182 7183 start = ALIGN(vma->vm_start, PUD_SIZE); 7184 end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); 7185 7186 if (start >= end) 7187 return; 7188 7189 flush_cache_range(vma, start, end); 7190 /* 7191 * No need to call adjust_range_if_pmd_sharing_possible(), because 7192 * we have already done the PUD_SIZE alignment. 7193 */ 7194 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 7195 start, end); 7196 mmu_notifier_invalidate_range_start(&range); 7197 i_mmap_lock_write(vma->vm_file->f_mapping); 7198 for (address = start; address < end; address += PUD_SIZE) { 7199 ptep = huge_pte_offset(mm, address, sz); 7200 if (!ptep) 7201 continue; 7202 ptl = huge_pte_lock(h, mm, ptep); 7203 huge_pmd_unshare(mm, vma, address, ptep); 7204 spin_unlock(ptl); 7205 } 7206 flush_hugetlb_tlb_range(vma, start, end); 7207 i_mmap_unlock_write(vma->vm_file->f_mapping); 7208 /* 7209 * No need to call mmu_notifier_invalidate_range(), see 7210 * Documentation/mm/mmu_notifier.rst. 7211 */ 7212 mmu_notifier_invalidate_range_end(&range); 7213 } 7214 7215 #ifdef CONFIG_CMA 7216 static bool cma_reserve_called __initdata; 7217 7218 static int __init cmdline_parse_hugetlb_cma(char *p) 7219 { 7220 int nid, count = 0; 7221 unsigned long tmp; 7222 char *s = p; 7223 7224 while (*s) { 7225 if (sscanf(s, "%lu%n", &tmp, &count) != 1) 7226 break; 7227 7228 if (s[count] == ':') { 7229 if (tmp >= MAX_NUMNODES) 7230 break; 7231 nid = array_index_nospec(tmp, MAX_NUMNODES); 7232 7233 s += count + 1; 7234 tmp = memparse(s, &s); 7235 hugetlb_cma_size_in_node[nid] = tmp; 7236 hugetlb_cma_size += tmp; 7237 7238 /* 7239 * Skip the separator if have one, otherwise 7240 * break the parsing. 
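 * Illustrative usage: 'hugetlb_cma=4G' reserves 4GiB of CMA spread
 * across the online nodes, while 'hugetlb_cma=0:1G,2:1G' reserves 1GiB
 * on node 0 and 1GiB on node 2 only.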
7241 */ 7242 if (*s == ',') 7243 s++; 7244 else 7245 break; 7246 } else { 7247 hugetlb_cma_size = memparse(p, &p); 7248 break; 7249 } 7250 } 7251 7252 return 0; 7253 } 7254 7255 early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); 7256 7257 void __init hugetlb_cma_reserve(int order) 7258 { 7259 unsigned long size, reserved, per_node; 7260 bool node_specific_cma_alloc = false; 7261 int nid; 7262 7263 cma_reserve_called = true; 7264 7265 if (!hugetlb_cma_size) 7266 return; 7267 7268 for (nid = 0; nid < MAX_NUMNODES; nid++) { 7269 if (hugetlb_cma_size_in_node[nid] == 0) 7270 continue; 7271 7272 if (!node_online(nid)) { 7273 pr_warn("hugetlb_cma: invalid node %d specified\n", nid); 7274 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; 7275 hugetlb_cma_size_in_node[nid] = 0; 7276 continue; 7277 } 7278 7279 if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { 7280 pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", 7281 nid, (PAGE_SIZE << order) / SZ_1M); 7282 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; 7283 hugetlb_cma_size_in_node[nid] = 0; 7284 } else { 7285 node_specific_cma_alloc = true; 7286 } 7287 } 7288 7289 /* Validate the CMA size again in case some invalid nodes specified. */ 7290 if (!hugetlb_cma_size) 7291 return; 7292 7293 if (hugetlb_cma_size < (PAGE_SIZE << order)) { 7294 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", 7295 (PAGE_SIZE << order) / SZ_1M); 7296 hugetlb_cma_size = 0; 7297 return; 7298 } 7299 7300 if (!node_specific_cma_alloc) { 7301 /* 7302 * If 3 GB area is requested on a machine with 4 numa nodes, 7303 * let's allocate 1 GB on first three nodes and ignore the last one. 7304 */ 7305 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); 7306 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", 7307 hugetlb_cma_size / SZ_1M, per_node / SZ_1M); 7308 } 7309 7310 reserved = 0; 7311 for_each_online_node(nid) { 7312 int res; 7313 char name[CMA_MAX_NAME]; 7314 7315 if (node_specific_cma_alloc) { 7316 if (hugetlb_cma_size_in_node[nid] == 0) 7317 continue; 7318 7319 size = hugetlb_cma_size_in_node[nid]; 7320 } else { 7321 size = min(per_node, hugetlb_cma_size - reserved); 7322 } 7323 7324 size = round_up(size, PAGE_SIZE << order); 7325 7326 snprintf(name, sizeof(name), "hugetlb%d", nid); 7327 /* 7328 * Note that 'order per bit' is based on smallest size that 7329 * may be returned to CMA allocator in the case of 7330 * huge page demotion. 7331 */ 7332 res = cma_declare_contiguous_nid(0, size, 0, 7333 PAGE_SIZE << HUGETLB_PAGE_ORDER, 7334 0, false, name, 7335 &hugetlb_cma[nid], nid); 7336 if (res) { 7337 pr_warn("hugetlb_cma: reservation failed: err %d, node %d", 7338 res, nid); 7339 continue; 7340 } 7341 7342 reserved += size; 7343 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", 7344 size / SZ_1M, nid); 7345 7346 if (reserved >= hugetlb_cma_size) 7347 break; 7348 } 7349 7350 if (!reserved) 7351 /* 7352 * hugetlb_cma_size is used to determine if allocations from 7353 * cma are possible. Set to zero if no cma regions are set up. 7354 */ 7355 hugetlb_cma_size = 0; 7356 } 7357 7358 static void __init hugetlb_cma_check(void) 7359 { 7360 if (!hugetlb_cma_size || cma_reserve_called) 7361 return; 7362 7363 pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); 7364 } 7365 7366 #endif /* CONFIG_CMA */ 7367