1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Generic hugetlb support. 4 * (C) Nadia Yvette Chambers, April 2004 5 */ 6 #include <linux/list.h> 7 #include <linux/init.h> 8 #include <linux/mm.h> 9 #include <linux/seq_file.h> 10 #include <linux/sysctl.h> 11 #include <linux/highmem.h> 12 #include <linux/mmu_notifier.h> 13 #include <linux/nodemask.h> 14 #include <linux/pagemap.h> 15 #include <linux/mempolicy.h> 16 #include <linux/compiler.h> 17 #include <linux/cpuset.h> 18 #include <linux/mutex.h> 19 #include <linux/memblock.h> 20 #include <linux/sysfs.h> 21 #include <linux/slab.h> 22 #include <linux/sched/mm.h> 23 #include <linux/mmdebug.h> 24 #include <linux/sched/signal.h> 25 #include <linux/rmap.h> 26 #include <linux/string_helpers.h> 27 #include <linux/swap.h> 28 #include <linux/swapops.h> 29 #include <linux/jhash.h> 30 #include <linux/numa.h> 31 #include <linux/llist.h> 32 #include <linux/cma.h> 33 #include <linux/migrate.h> 34 35 #include <asm/page.h> 36 #include <asm/pgalloc.h> 37 #include <asm/tlb.h> 38 39 #include <linux/io.h> 40 #include <linux/hugetlb.h> 41 #include <linux/hugetlb_cgroup.h> 42 #include <linux/node.h> 43 #include <linux/page_owner.h> 44 #include "internal.h" 45 #include "hugetlb_vmemmap.h" 46 47 int hugetlb_max_hstate __read_mostly; 48 unsigned int default_hstate_idx; 49 struct hstate hstates[HUGE_MAX_HSTATE]; 50 51 #ifdef CONFIG_CMA 52 static struct cma *hugetlb_cma[MAX_NUMNODES]; 53 #endif 54 static unsigned long hugetlb_cma_size __initdata; 55 56 /* 57 * Minimum page order among possible hugepage sizes, set to a proper value 58 * at boot time. 59 */ 60 static unsigned int minimum_order __read_mostly = UINT_MAX; 61 62 __initdata LIST_HEAD(huge_boot_pages); 63 64 /* for command line parsing */ 65 static struct hstate * __initdata parsed_hstate; 66 static unsigned long __initdata default_hstate_max_huge_pages; 67 static bool __initdata parsed_valid_hugepagesz = true; 68 static bool __initdata parsed_default_hugepagesz; 69 70 /* 71 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, 72 * free_huge_pages, and surplus_huge_pages. 73 */ 74 DEFINE_SPINLOCK(hugetlb_lock); 75 76 /* 77 * Serializes faults on the same logical page. This is used to 78 * prevent spurious OOMs when the hugepage pool is fully utilized. 
79 */ 80 static int num_fault_mutexes; 81 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; 82 83 /* Forward declaration */ 84 static int hugetlb_acct_memory(struct hstate *h, long delta); 85 86 static inline bool subpool_is_free(struct hugepage_subpool *spool) 87 { 88 if (spool->count) 89 return false; 90 if (spool->max_hpages != -1) 91 return spool->used_hpages == 0; 92 if (spool->min_hpages != -1) 93 return spool->rsv_hpages == spool->min_hpages; 94 95 return true; 96 } 97 98 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, 99 unsigned long irq_flags) 100 { 101 spin_unlock_irqrestore(&spool->lock, irq_flags); 102 103 /* If no pages are used, and no other handles to the subpool 104 * remain, give up any reservations based on minimum size and 105 * free the subpool */ 106 if (subpool_is_free(spool)) { 107 if (spool->min_hpages != -1) 108 hugetlb_acct_memory(spool->hstate, 109 -spool->min_hpages); 110 kfree(spool); 111 } 112 } 113 114 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, 115 long min_hpages) 116 { 117 struct hugepage_subpool *spool; 118 119 spool = kzalloc(sizeof(*spool), GFP_KERNEL); 120 if (!spool) 121 return NULL; 122 123 spin_lock_init(&spool->lock); 124 spool->count = 1; 125 spool->max_hpages = max_hpages; 126 spool->hstate = h; 127 spool->min_hpages = min_hpages; 128 129 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { 130 kfree(spool); 131 return NULL; 132 } 133 spool->rsv_hpages = min_hpages; 134 135 return spool; 136 } 137 138 void hugepage_put_subpool(struct hugepage_subpool *spool) 139 { 140 unsigned long flags; 141 142 spin_lock_irqsave(&spool->lock, flags); 143 BUG_ON(!spool->count); 144 spool->count--; 145 unlock_or_release_subpool(spool, flags); 146 } 147 148 /* 149 * Subpool accounting for allocating and reserving pages. 150 * Return -ENOMEM if there are not enough resources to satisfy the 151 * request. Otherwise, return the number of pages by which the 152 * global pools must be adjusted (upward). The returned value may 153 * only be different than the passed value (delta) in the case where 154 * a subpool minimum size must be maintained. 155 */ 156 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, 157 long delta) 158 { 159 long ret = delta; 160 161 if (!spool) 162 return ret; 163 164 spin_lock_irq(&spool->lock); 165 166 if (spool->max_hpages != -1) { /* maximum size accounting */ 167 if ((spool->used_hpages + delta) <= spool->max_hpages) 168 spool->used_hpages += delta; 169 else { 170 ret = -ENOMEM; 171 goto unlock_ret; 172 } 173 } 174 175 /* minimum size accounting */ 176 if (spool->min_hpages != -1 && spool->rsv_hpages) { 177 if (delta > spool->rsv_hpages) { 178 /* 179 * Asking for more reserves than those already taken on 180 * behalf of subpool. Return difference. 181 */ 182 ret = delta - spool->rsv_hpages; 183 spool->rsv_hpages = 0; 184 } else { 185 ret = 0; /* reserves already accounted for */ 186 spool->rsv_hpages -= delta; 187 } 188 } 189 190 unlock_ret: 191 spin_unlock_irq(&spool->lock); 192 return ret; 193 } 194 195 /* 196 * Subpool accounting for freeing and unreserving pages. 197 * Return the number of global page reservations that must be dropped. 198 * The return value may only be different than the passed value (delta) 199 * in the case where a subpool minimum size must be maintained. 
200 */ 201 static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, 202 long delta) 203 { 204 long ret = delta; 205 unsigned long flags; 206 207 if (!spool) 208 return delta; 209 210 spin_lock_irqsave(&spool->lock, flags); 211 212 if (spool->max_hpages != -1) /* maximum size accounting */ 213 spool->used_hpages -= delta; 214 215 /* minimum size accounting */ 216 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { 217 if (spool->rsv_hpages + delta <= spool->min_hpages) 218 ret = 0; 219 else 220 ret = spool->rsv_hpages + delta - spool->min_hpages; 221 222 spool->rsv_hpages += delta; 223 if (spool->rsv_hpages > spool->min_hpages) 224 spool->rsv_hpages = spool->min_hpages; 225 } 226 227 /* 228 * If hugetlbfs_put_super couldn't free spool due to an outstanding 229 * quota reference, free it now. 230 */ 231 unlock_or_release_subpool(spool, flags); 232 233 return ret; 234 } 235 236 static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 237 { 238 return HUGETLBFS_SB(inode->i_sb)->spool; 239 } 240 241 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 242 { 243 return subpool_inode(file_inode(vma->vm_file)); 244 } 245 246 /* Helper that removes a struct file_region from the resv_map cache and returns 247 * it for use. 248 */ 249 static struct file_region * 250 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) 251 { 252 struct file_region *nrg = NULL; 253 254 VM_BUG_ON(resv->region_cache_count <= 0); 255 256 resv->region_cache_count--; 257 nrg = list_first_entry(&resv->region_cache, struct file_region, link); 258 list_del(&nrg->link); 259 260 nrg->from = from; 261 nrg->to = to; 262 263 return nrg; 264 } 265 266 static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, 267 struct file_region *rg) 268 { 269 #ifdef CONFIG_CGROUP_HUGETLB 270 nrg->reservation_counter = rg->reservation_counter; 271 nrg->css = rg->css; 272 if (rg->css) 273 css_get(rg->css); 274 #endif 275 } 276 277 /* Helper that records hugetlb_cgroup uncharge info. */ 278 static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, 279 struct hstate *h, 280 struct resv_map *resv, 281 struct file_region *nrg) 282 { 283 #ifdef CONFIG_CGROUP_HUGETLB 284 if (h_cg) { 285 nrg->reservation_counter = 286 &h_cg->rsvd_hugepage[hstate_index(h)]; 287 nrg->css = &h_cg->css; 288 /* 289 * The caller will hold exactly one h_cg->css reference for the 290 * whole contiguous reservation region. But this area might be 291 * scattered when there are already some file_regions reside in 292 * it. As a result, many file_regions may share only one css 293 * reference. In order to ensure that one file_region must hold 294 * exactly one h_cg->css reference, we should do css_get for 295 * each file_region and leave the reference held by caller 296 * untouched. 297 */ 298 css_get(&h_cg->css); 299 if (!resv->pages_per_hpage) 300 resv->pages_per_hpage = pages_per_huge_page(h); 301 /* pages_per_hpage should be the same for all entries in 302 * a resv_map. 
303 */ 304 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); 305 } else { 306 nrg->reservation_counter = NULL; 307 nrg->css = NULL; 308 } 309 #endif 310 } 311 312 static void put_uncharge_info(struct file_region *rg) 313 { 314 #ifdef CONFIG_CGROUP_HUGETLB 315 if (rg->css) 316 css_put(rg->css); 317 #endif 318 } 319 320 static bool has_same_uncharge_info(struct file_region *rg, 321 struct file_region *org) 322 { 323 #ifdef CONFIG_CGROUP_HUGETLB 324 return rg && org && 325 rg->reservation_counter == org->reservation_counter && 326 rg->css == org->css; 327 328 #else 329 return true; 330 #endif 331 } 332 333 static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) 334 { 335 struct file_region *nrg = NULL, *prg = NULL; 336 337 prg = list_prev_entry(rg, link); 338 if (&prg->link != &resv->regions && prg->to == rg->from && 339 has_same_uncharge_info(prg, rg)) { 340 prg->to = rg->to; 341 342 list_del(&rg->link); 343 put_uncharge_info(rg); 344 kfree(rg); 345 346 rg = prg; 347 } 348 349 nrg = list_next_entry(rg, link); 350 if (&nrg->link != &resv->regions && nrg->from == rg->to && 351 has_same_uncharge_info(nrg, rg)) { 352 nrg->from = rg->from; 353 354 list_del(&rg->link); 355 put_uncharge_info(rg); 356 kfree(rg); 357 } 358 } 359 360 static inline long 361 hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, 362 long to, struct hstate *h, struct hugetlb_cgroup *cg, 363 long *regions_needed) 364 { 365 struct file_region *nrg; 366 367 if (!regions_needed) { 368 nrg = get_file_region_entry_from_cache(map, from, to); 369 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); 370 list_add(&nrg->link, rg->link.prev); 371 coalesce_file_region(map, nrg); 372 } else 373 *regions_needed += 1; 374 375 return to - from; 376 } 377 378 /* 379 * Must be called with resv->lock held. 380 * 381 * Calling this with regions_needed != NULL will count the number of pages 382 * to be added but will not modify the linked list. And regions_needed will 383 * indicate the number of file_regions needed in the cache to carry out to add 384 * the regions for this range. 385 */ 386 static long add_reservation_in_range(struct resv_map *resv, long f, long t, 387 struct hugetlb_cgroup *h_cg, 388 struct hstate *h, long *regions_needed) 389 { 390 long add = 0; 391 struct list_head *head = &resv->regions; 392 long last_accounted_offset = f; 393 struct file_region *rg = NULL, *trg = NULL; 394 395 if (regions_needed) 396 *regions_needed = 0; 397 398 /* In this loop, we essentially handle an entry for the range 399 * [last_accounted_offset, rg->from), at every iteration, with some 400 * bounds checking. 401 */ 402 list_for_each_entry_safe(rg, trg, head, link) { 403 /* Skip irrelevant regions that start before our range. */ 404 if (rg->from < f) { 405 /* If this region ends after the last accounted offset, 406 * then we need to update last_accounted_offset. 407 */ 408 if (rg->to > last_accounted_offset) 409 last_accounted_offset = rg->to; 410 continue; 411 } 412 413 /* When we find a region that starts beyond our range, we've 414 * finished. 415 */ 416 if (rg->from >= t) 417 break; 418 419 /* Add an entry for last_accounted_offset -> rg->from, and 420 * update last_accounted_offset. 421 */ 422 if (rg->from > last_accounted_offset) 423 add += hugetlb_resv_map_add(resv, rg, 424 last_accounted_offset, 425 rg->from, h, h_cg, 426 regions_needed); 427 428 last_accounted_offset = rg->to; 429 } 430 431 /* Handle the case where our range extends beyond 432 * last_accounted_offset. 
433 */ 434 if (last_accounted_offset < t) 435 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, 436 t, h, h_cg, regions_needed); 437 438 VM_BUG_ON(add < 0); 439 return add; 440 } 441 442 /* Must be called with resv->lock acquired. Will drop lock to allocate entries. 443 */ 444 static int allocate_file_region_entries(struct resv_map *resv, 445 int regions_needed) 446 __must_hold(&resv->lock) 447 { 448 struct list_head allocated_regions; 449 int to_allocate = 0, i = 0; 450 struct file_region *trg = NULL, *rg = NULL; 451 452 VM_BUG_ON(regions_needed < 0); 453 454 INIT_LIST_HEAD(&allocated_regions); 455 456 /* 457 * Check for sufficient descriptors in the cache to accommodate 458 * the number of in progress add operations plus regions_needed. 459 * 460 * This is a while loop because when we drop the lock, some other call 461 * to region_add or region_del may have consumed some region_entries, 462 * so we keep looping here until we finally have enough entries for 463 * (adds_in_progress + regions_needed). 464 */ 465 while (resv->region_cache_count < 466 (resv->adds_in_progress + regions_needed)) { 467 to_allocate = resv->adds_in_progress + regions_needed - 468 resv->region_cache_count; 469 470 /* At this point, we should have enough entries in the cache 471 * for all the existing adds_in_progress. We should only be 472 * needing to allocate for regions_needed. 473 */ 474 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); 475 476 spin_unlock(&resv->lock); 477 for (i = 0; i < to_allocate; i++) { 478 trg = kmalloc(sizeof(*trg), GFP_KERNEL); 479 if (!trg) 480 goto out_of_memory; 481 list_add(&trg->link, &allocated_regions); 482 } 483 484 spin_lock(&resv->lock); 485 486 list_splice(&allocated_regions, &resv->region_cache); 487 resv->region_cache_count += to_allocate; 488 } 489 490 return 0; 491 492 out_of_memory: 493 list_for_each_entry_safe(rg, trg, &allocated_regions, link) { 494 list_del(&rg->link); 495 kfree(rg); 496 } 497 return -ENOMEM; 498 } 499 500 /* 501 * Add the huge page range represented by [f, t) to the reserve 502 * map. Regions will be taken from the cache to fill in this range. 503 * Sufficient regions should exist in the cache due to the previous 504 * call to region_chg with the same range, but in some cases the cache will not 505 * have sufficient entries due to races with other code doing region_add or 506 * region_del. The extra needed entries will be allocated. 507 * 508 * regions_needed is the out value provided by a previous call to region_chg. 509 * 510 * Return the number of new huge pages added to the map. This number is greater 511 * than or equal to zero. If file_region entries needed to be allocated for 512 * this operation and we were not able to allocate, it returns -ENOMEM. 513 * region_add of regions of length 1 never allocate file_regions and cannot 514 * fail; region_chg will always allocate at least 1 entry and a region_add for 515 * 1 page will only require at most 1 entry. 516 */ 517 static long region_add(struct resv_map *resv, long f, long t, 518 long in_regions_needed, struct hstate *h, 519 struct hugetlb_cgroup *h_cg) 520 { 521 long add = 0, actual_regions_needed = 0; 522 523 spin_lock(&resv->lock); 524 retry: 525 526 /* Count how many regions are actually needed to execute this add. */ 527 add_reservation_in_range(resv, f, t, NULL, NULL, 528 &actual_regions_needed); 529 530 /* 531 * Check for sufficient descriptors in the cache to accommodate 532 * this add operation. 
Note that actual_regions_needed may be greater 533 * than in_regions_needed, as the resv_map may have been modified since 534 * the region_chg call. In this case, we need to make sure that we 535 * allocate extra entries, such that we have enough for all the 536 * existing adds_in_progress, plus the excess needed for this 537 * operation. 538 */ 539 if (actual_regions_needed > in_regions_needed && 540 resv->region_cache_count < 541 resv->adds_in_progress + 542 (actual_regions_needed - in_regions_needed)) { 543 /* region_add operation of range 1 should never need to 544 * allocate file_region entries. 545 */ 546 VM_BUG_ON(t - f <= 1); 547 548 if (allocate_file_region_entries( 549 resv, actual_regions_needed - in_regions_needed)) { 550 return -ENOMEM; 551 } 552 553 goto retry; 554 } 555 556 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); 557 558 resv->adds_in_progress -= in_regions_needed; 559 560 spin_unlock(&resv->lock); 561 return add; 562 } 563 564 /* 565 * Examine the existing reserve map and determine how many 566 * huge pages in the specified range [f, t) are NOT currently 567 * represented. This routine is called before a subsequent 568 * call to region_add that will actually modify the reserve 569 * map to add the specified range [f, t). region_chg does 570 * not change the number of huge pages represented by the 571 * map. A number of new file_region structures is added to the cache as a 572 * placeholder, for the subsequent region_add call to use. At least 1 573 * file_region structure is added. 574 * 575 * out_regions_needed is the number of regions added to the 576 * resv->adds_in_progress. This value needs to be provided to a follow up call 577 * to region_add or region_abort for proper accounting. 578 * 579 * Returns the number of huge pages that need to be added to the existing 580 * reservation map for the range [f, t). This number is greater or equal to 581 * zero. -ENOMEM is returned if a new file_region structure or cache entry 582 * is needed and can not be allocated. 583 */ 584 static long region_chg(struct resv_map *resv, long f, long t, 585 long *out_regions_needed) 586 { 587 long chg = 0; 588 589 spin_lock(&resv->lock); 590 591 /* Count how many hugepages in this range are NOT represented. */ 592 chg = add_reservation_in_range(resv, f, t, NULL, NULL, 593 out_regions_needed); 594 595 if (*out_regions_needed == 0) 596 *out_regions_needed = 1; 597 598 if (allocate_file_region_entries(resv, *out_regions_needed)) 599 return -ENOMEM; 600 601 resv->adds_in_progress += *out_regions_needed; 602 603 spin_unlock(&resv->lock); 604 return chg; 605 } 606 607 /* 608 * Abort the in progress add operation. The adds_in_progress field 609 * of the resv_map keeps track of the operations in progress between 610 * calls to region_chg and region_add. Operations are sometimes 611 * aborted after the call to region_chg. In such cases, region_abort 612 * is called to decrement the adds_in_progress counter. regions_needed 613 * is the value returned by the region_chg call, it is used to decrement 614 * the adds_in_progress counter. 615 * 616 * NOTE: The range arguments [f, t) are not needed or used in this 617 * routine. They are kept to make reading the calling code easier as 618 * arguments will match the associated region_chg call. 
619 */ 620 static void region_abort(struct resv_map *resv, long f, long t, 621 long regions_needed) 622 { 623 spin_lock(&resv->lock); 624 VM_BUG_ON(!resv->region_cache_count); 625 resv->adds_in_progress -= regions_needed; 626 spin_unlock(&resv->lock); 627 } 628 629 /* 630 * Delete the specified range [f, t) from the reserve map. If the 631 * t parameter is LONG_MAX, this indicates that ALL regions after f 632 * should be deleted. Locate the regions which intersect [f, t) 633 * and either trim, delete or split the existing regions. 634 * 635 * Returns the number of huge pages deleted from the reserve map. 636 * In the normal case, the return value is zero or more. In the 637 * case where a region must be split, a new region descriptor must 638 * be allocated. If the allocation fails, -ENOMEM will be returned. 639 * NOTE: If the parameter t == LONG_MAX, then we will never split 640 * a region and possibly return -ENOMEM. Callers specifying 641 * t == LONG_MAX do not need to check for -ENOMEM error. 642 */ 643 static long region_del(struct resv_map *resv, long f, long t) 644 { 645 struct list_head *head = &resv->regions; 646 struct file_region *rg, *trg; 647 struct file_region *nrg = NULL; 648 long del = 0; 649 650 retry: 651 spin_lock(&resv->lock); 652 list_for_each_entry_safe(rg, trg, head, link) { 653 /* 654 * Skip regions before the range to be deleted. file_region 655 * ranges are normally of the form [from, to). However, there 656 * may be a "placeholder" entry in the map which is of the form 657 * (from, to) with from == to. Check for placeholder entries 658 * at the beginning of the range to be deleted. 659 */ 660 if (rg->to <= f && (rg->to != rg->from || rg->to != f)) 661 continue; 662 663 if (rg->from >= t) 664 break; 665 666 if (f > rg->from && t < rg->to) { /* Must split region */ 667 /* 668 * Check for an entry in the cache before dropping 669 * lock and attempting allocation. 670 */ 671 if (!nrg && 672 resv->region_cache_count > resv->adds_in_progress) { 673 nrg = list_first_entry(&resv->region_cache, 674 struct file_region, 675 link); 676 list_del(&nrg->link); 677 resv->region_cache_count--; 678 } 679 680 if (!nrg) { 681 spin_unlock(&resv->lock); 682 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 683 if (!nrg) 684 return -ENOMEM; 685 goto retry; 686 } 687 688 del += t - f; 689 hugetlb_cgroup_uncharge_file_region( 690 resv, rg, t - f, false); 691 692 /* New entry for end of split region */ 693 nrg->from = t; 694 nrg->to = rg->to; 695 696 copy_hugetlb_cgroup_uncharge_info(nrg, rg); 697 698 INIT_LIST_HEAD(&nrg->link); 699 700 /* Original entry is trimmed */ 701 rg->to = f; 702 703 list_add(&nrg->link, &rg->link); 704 nrg = NULL; 705 break; 706 } 707 708 if (f <= rg->from && t >= rg->to) { /* Remove entire region */ 709 del += rg->to - rg->from; 710 hugetlb_cgroup_uncharge_file_region(resv, rg, 711 rg->to - rg->from, true); 712 list_del(&rg->link); 713 kfree(rg); 714 continue; 715 } 716 717 if (f <= rg->from) { /* Trim beginning of region */ 718 hugetlb_cgroup_uncharge_file_region(resv, rg, 719 t - rg->from, false); 720 721 del += t - rg->from; 722 rg->from = t; 723 } else { /* Trim end of region */ 724 hugetlb_cgroup_uncharge_file_region(resv, rg, 725 rg->to - f, false); 726 727 del += rg->to - f; 728 rg->to = f; 729 } 730 } 731 732 spin_unlock(&resv->lock); 733 kfree(nrg); 734 return del; 735 } 736 737 /* 738 * A rare out of memory error was encountered which prevented removal of 739 * the reserve map region for a page. 
The huge page itself was free'ed 740 * and removed from the page cache. This routine will adjust the subpool 741 * usage count, and the global reserve count if needed. By incrementing 742 * these counts, the reserve map entry which could not be deleted will 743 * appear as a "reserved" entry instead of simply dangling with incorrect 744 * counts. 745 */ 746 void hugetlb_fix_reserve_counts(struct inode *inode) 747 { 748 struct hugepage_subpool *spool = subpool_inode(inode); 749 long rsv_adjust; 750 bool reserved = false; 751 752 rsv_adjust = hugepage_subpool_get_pages(spool, 1); 753 if (rsv_adjust > 0) { 754 struct hstate *h = hstate_inode(inode); 755 756 if (!hugetlb_acct_memory(h, 1)) 757 reserved = true; 758 } else if (!rsv_adjust) { 759 reserved = true; 760 } 761 762 if (!reserved) 763 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); 764 } 765 766 /* 767 * Count and return the number of huge pages in the reserve map 768 * that intersect with the range [f, t). 769 */ 770 static long region_count(struct resv_map *resv, long f, long t) 771 { 772 struct list_head *head = &resv->regions; 773 struct file_region *rg; 774 long chg = 0; 775 776 spin_lock(&resv->lock); 777 /* Locate each segment we overlap with, and count that overlap. */ 778 list_for_each_entry(rg, head, link) { 779 long seg_from; 780 long seg_to; 781 782 if (rg->to <= f) 783 continue; 784 if (rg->from >= t) 785 break; 786 787 seg_from = max(rg->from, f); 788 seg_to = min(rg->to, t); 789 790 chg += seg_to - seg_from; 791 } 792 spin_unlock(&resv->lock); 793 794 return chg; 795 } 796 797 /* 798 * Convert the address within this vma to the page offset within 799 * the mapping, in pagecache page units; huge pages here. 800 */ 801 static pgoff_t vma_hugecache_offset(struct hstate *h, 802 struct vm_area_struct *vma, unsigned long address) 803 { 804 return ((address - vma->vm_start) >> huge_page_shift(h)) + 805 (vma->vm_pgoff >> huge_page_order(h)); 806 } 807 808 pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 809 unsigned long address) 810 { 811 return vma_hugecache_offset(hstate_vma(vma), vma, address); 812 } 813 EXPORT_SYMBOL_GPL(linear_hugepage_index); 814 815 /* 816 * Return the size of the pages allocated when backing a VMA. In the majority 817 * cases this will be same size as used by the page table entries. 818 */ 819 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 820 { 821 if (vma->vm_ops && vma->vm_ops->pagesize) 822 return vma->vm_ops->pagesize(vma); 823 return PAGE_SIZE; 824 } 825 EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 826 827 /* 828 * Return the page size being used by the MMU to back a VMA. In the majority 829 * of cases, the page size used by the kernel matches the MMU size. On 830 * architectures where it differs, an architecture-specific 'strong' 831 * version of this symbol is required. 832 */ 833 __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 834 { 835 return vma_kernel_pagesize(vma); 836 } 837 838 /* 839 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 840 * bits of the reservation map pointer, which are always clear due to 841 * alignment. 842 */ 843 #define HPAGE_RESV_OWNER (1UL << 0) 844 #define HPAGE_RESV_UNMAPPED (1UL << 1) 845 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) 846 847 /* 848 * These helpers are used to track how many pages are reserved for 849 * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 850 * is guaranteed to have their future faults succeed. 
851 * 852 * With the exception of reset_vma_resv_huge_pages() which is called at fork(), 853 * the reserve counters are updated with the hugetlb_lock held. It is safe 854 * to reset the VMA at fork() time as it is not in use yet and there is no 855 * chance of the global counters getting corrupted as a result of the values. 856 * 857 * The private mapping reservation is represented in a subtly different 858 * manner to a shared mapping. A shared mapping has a region map associated 859 * with the underlying file, this region map represents the backing file 860 * pages which have ever had a reservation assigned which this persists even 861 * after the page is instantiated. A private mapping has a region map 862 * associated with the original mmap which is attached to all VMAs which 863 * reference it, this region map represents those offsets which have consumed 864 * reservation ie. where pages have been instantiated. 865 */ 866 static unsigned long get_vma_private_data(struct vm_area_struct *vma) 867 { 868 return (unsigned long)vma->vm_private_data; 869 } 870 871 static void set_vma_private_data(struct vm_area_struct *vma, 872 unsigned long value) 873 { 874 vma->vm_private_data = (void *)value; 875 } 876 877 static void 878 resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, 879 struct hugetlb_cgroup *h_cg, 880 struct hstate *h) 881 { 882 #ifdef CONFIG_CGROUP_HUGETLB 883 if (!h_cg || !h) { 884 resv_map->reservation_counter = NULL; 885 resv_map->pages_per_hpage = 0; 886 resv_map->css = NULL; 887 } else { 888 resv_map->reservation_counter = 889 &h_cg->rsvd_hugepage[hstate_index(h)]; 890 resv_map->pages_per_hpage = pages_per_huge_page(h); 891 resv_map->css = &h_cg->css; 892 } 893 #endif 894 } 895 896 struct resv_map *resv_map_alloc(void) 897 { 898 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 899 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); 900 901 if (!resv_map || !rg) { 902 kfree(resv_map); 903 kfree(rg); 904 return NULL; 905 } 906 907 kref_init(&resv_map->refs); 908 spin_lock_init(&resv_map->lock); 909 INIT_LIST_HEAD(&resv_map->regions); 910 911 resv_map->adds_in_progress = 0; 912 /* 913 * Initialize these to 0. On shared mappings, 0's here indicate these 914 * fields don't do cgroup accounting. On private mappings, these will be 915 * re-initialized to the proper values, to indicate that hugetlb cgroup 916 * reservations are to be un-charged from here. 917 */ 918 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); 919 920 INIT_LIST_HEAD(&resv_map->region_cache); 921 list_add(&rg->link, &resv_map->region_cache); 922 resv_map->region_cache_count = 1; 923 924 return resv_map; 925 } 926 927 void resv_map_release(struct kref *ref) 928 { 929 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 930 struct list_head *head = &resv_map->region_cache; 931 struct file_region *rg, *trg; 932 933 /* Clear out any active regions before we release the map. */ 934 region_del(resv_map, 0, LONG_MAX); 935 936 /* ... and any entries left in the cache */ 937 list_for_each_entry_safe(rg, trg, head, link) { 938 list_del(&rg->link); 939 kfree(rg); 940 } 941 942 VM_BUG_ON(resv_map->adds_in_progress); 943 944 kfree(resv_map); 945 } 946 947 static inline struct resv_map *inode_resv_map(struct inode *inode) 948 { 949 /* 950 * At inode evict time, i_mapping may not point to the original 951 * address space within the inode. This original address space 952 * contains the pointer to the resv_map. 
So, always use the 953 * address space embedded within the inode. 954 * The VERY common case is inode->mapping == &inode->i_data but, 955 * this may not be true for device special inodes. 956 */ 957 return (struct resv_map *)(&inode->i_data)->private_data; 958 } 959 960 static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 961 { 962 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 963 if (vma->vm_flags & VM_MAYSHARE) { 964 struct address_space *mapping = vma->vm_file->f_mapping; 965 struct inode *inode = mapping->host; 966 967 return inode_resv_map(inode); 968 969 } else { 970 return (struct resv_map *)(get_vma_private_data(vma) & 971 ~HPAGE_RESV_MASK); 972 } 973 } 974 975 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 976 { 977 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 978 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 979 980 set_vma_private_data(vma, (get_vma_private_data(vma) & 981 HPAGE_RESV_MASK) | (unsigned long)map); 982 } 983 984 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 985 { 986 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 987 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 988 989 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 990 } 991 992 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 993 { 994 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 995 996 return (get_vma_private_data(vma) & flag) != 0; 997 } 998 999 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 1000 void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 1001 { 1002 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1003 if (!(vma->vm_flags & VM_MAYSHARE)) 1004 vma->vm_private_data = (void *)0; 1005 } 1006 1007 /* Returns true if the VMA has associated reserve pages */ 1008 static bool vma_has_reserves(struct vm_area_struct *vma, long chg) 1009 { 1010 if (vma->vm_flags & VM_NORESERVE) { 1011 /* 1012 * This address is already reserved by other process(chg == 0), 1013 * so, we should decrement reserved count. Without decrementing, 1014 * reserve count remains after releasing inode, because this 1015 * allocated page will go into page cache and is regarded as 1016 * coming from reserved pool in releasing step. Currently, we 1017 * don't have any other solution to deal with this situation 1018 * properly, so add work-around here. 1019 */ 1020 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 1021 return true; 1022 else 1023 return false; 1024 } 1025 1026 /* Shared mappings always use reserves */ 1027 if (vma->vm_flags & VM_MAYSHARE) { 1028 /* 1029 * We know VM_NORESERVE is not set. Therefore, there SHOULD 1030 * be a region map for all pages. The only situation where 1031 * there is no region map is if a hole was punched via 1032 * fallocate. In this case, there really are no reserves to 1033 * use. This situation is indicated if chg != 0. 1034 */ 1035 if (chg) 1036 return false; 1037 else 1038 return true; 1039 } 1040 1041 /* 1042 * Only the process that called mmap() has reserves for 1043 * private mappings. 1044 */ 1045 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1046 /* 1047 * Like the shared case above, a hole punch or truncate 1048 * could have been performed on the private mapping. 1049 * Examine the value of chg to determine if reserves 1050 * actually exist or were previously consumed. 1051 * Very Subtle - The value of chg comes from a previous 1052 * call to vma_needs_reserves(). 
The reserve map for 1053 * private mappings has different (opposite) semantics 1054 * than that of shared mappings. vma_needs_reserves() 1055 * has already taken this difference in semantics into 1056 * account. Therefore, the meaning of chg is the same 1057 * as in the shared case above. Code could easily be 1058 * combined, but keeping it separate draws attention to 1059 * subtle differences. 1060 */ 1061 if (chg) 1062 return false; 1063 else 1064 return true; 1065 } 1066 1067 return false; 1068 } 1069 1070 static void enqueue_huge_page(struct hstate *h, struct page *page) 1071 { 1072 int nid = page_to_nid(page); 1073 1074 lockdep_assert_held(&hugetlb_lock); 1075 VM_BUG_ON_PAGE(page_count(page), page); 1076 1077 list_move(&page->lru, &h->hugepage_freelists[nid]); 1078 h->free_huge_pages++; 1079 h->free_huge_pages_node[nid]++; 1080 SetHPageFreed(page); 1081 } 1082 1083 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) 1084 { 1085 struct page *page; 1086 bool pin = !!(current->flags & PF_MEMALLOC_PIN); 1087 1088 lockdep_assert_held(&hugetlb_lock); 1089 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { 1090 if (pin && !is_pinnable_page(page)) 1091 continue; 1092 1093 if (PageHWPoison(page)) 1094 continue; 1095 1096 list_move(&page->lru, &h->hugepage_activelist); 1097 set_page_refcounted(page); 1098 ClearHPageFreed(page); 1099 h->free_huge_pages--; 1100 h->free_huge_pages_node[nid]--; 1101 return page; 1102 } 1103 1104 return NULL; 1105 } 1106 1107 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, 1108 nodemask_t *nmask) 1109 { 1110 unsigned int cpuset_mems_cookie; 1111 struct zonelist *zonelist; 1112 struct zone *zone; 1113 struct zoneref *z; 1114 int node = NUMA_NO_NODE; 1115 1116 zonelist = node_zonelist(nid, gfp_mask); 1117 1118 retry_cpuset: 1119 cpuset_mems_cookie = read_mems_allowed_begin(); 1120 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { 1121 struct page *page; 1122 1123 if (!cpuset_zone_allowed(zone, gfp_mask)) 1124 continue; 1125 /* 1126 * no need to ask again on the same node. Pool is node rather than 1127 * zone aware 1128 */ 1129 if (zone_to_nid(zone) == node) 1130 continue; 1131 node = zone_to_nid(zone); 1132 1133 page = dequeue_huge_page_node_exact(h, node); 1134 if (page) 1135 return page; 1136 } 1137 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) 1138 goto retry_cpuset; 1139 1140 return NULL; 1141 } 1142 1143 static struct page *dequeue_huge_page_vma(struct hstate *h, 1144 struct vm_area_struct *vma, 1145 unsigned long address, int avoid_reserve, 1146 long chg) 1147 { 1148 struct page *page = NULL; 1149 struct mempolicy *mpol; 1150 gfp_t gfp_mask; 1151 nodemask_t *nodemask; 1152 int nid; 1153 1154 /* 1155 * A child process with MAP_PRIVATE mappings created by their parent 1156 * have no page reserves. This check ensures that reservations are 1157 * not "stolen". 
The child may still get SIGKILLed 1158 */ 1159 if (!vma_has_reserves(vma, chg) && 1160 h->free_huge_pages - h->resv_huge_pages == 0) 1161 goto err; 1162 1163 /* If reserves cannot be used, ensure enough pages are in the pool */ 1164 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 1165 goto err; 1166 1167 gfp_mask = htlb_alloc_mask(h); 1168 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 1169 1170 if (mpol_is_preferred_many(mpol)) { 1171 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); 1172 1173 /* Fallback to all nodes if page==NULL */ 1174 nodemask = NULL; 1175 } 1176 1177 if (!page) 1178 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); 1179 1180 if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { 1181 SetHPageRestoreReserve(page); 1182 h->resv_huge_pages--; 1183 } 1184 1185 mpol_cond_put(mpol); 1186 return page; 1187 1188 err: 1189 return NULL; 1190 } 1191 1192 /* 1193 * common helper functions for hstate_next_node_to_{alloc|free}. 1194 * We may have allocated or freed a huge page based on a different 1195 * nodes_allowed previously, so h->next_node_to_{alloc|free} might 1196 * be outside of *nodes_allowed. Ensure that we use an allowed 1197 * node for alloc or free. 1198 */ 1199 static int next_node_allowed(int nid, nodemask_t *nodes_allowed) 1200 { 1201 nid = next_node_in(nid, *nodes_allowed); 1202 VM_BUG_ON(nid >= MAX_NUMNODES); 1203 1204 return nid; 1205 } 1206 1207 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 1208 { 1209 if (!node_isset(nid, *nodes_allowed)) 1210 nid = next_node_allowed(nid, nodes_allowed); 1211 return nid; 1212 } 1213 1214 /* 1215 * returns the previously saved node ["this node"] from which to 1216 * allocate a persistent huge page for the pool and advance the 1217 * next node from which to allocate, handling wrap at end of node 1218 * mask. 1219 */ 1220 static int hstate_next_node_to_alloc(struct hstate *h, 1221 nodemask_t *nodes_allowed) 1222 { 1223 int nid; 1224 1225 VM_BUG_ON(!nodes_allowed); 1226 1227 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 1228 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 1229 1230 return nid; 1231 } 1232 1233 /* 1234 * helper for remove_pool_huge_page() - return the previously saved 1235 * node ["this node"] from which to free a huge page. Advance the 1236 * next node id whether or not we find a free huge page to free so 1237 * that the next attempt to free addresses the next node. 
1238 */ 1239 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 1240 { 1241 int nid; 1242 1243 VM_BUG_ON(!nodes_allowed); 1244 1245 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 1246 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 1247 1248 return nid; 1249 } 1250 1251 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 1252 for (nr_nodes = nodes_weight(*mask); \ 1253 nr_nodes > 0 && \ 1254 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 1255 nr_nodes--) 1256 1257 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 1258 for (nr_nodes = nodes_weight(*mask); \ 1259 nr_nodes > 0 && \ 1260 ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 1261 nr_nodes--) 1262 1263 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 1264 static void destroy_compound_gigantic_page(struct page *page, 1265 unsigned int order) 1266 { 1267 int i; 1268 int nr_pages = 1 << order; 1269 struct page *p = page + 1; 1270 1271 atomic_set(compound_mapcount_ptr(page), 0); 1272 atomic_set(compound_pincount_ptr(page), 0); 1273 1274 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1275 clear_compound_head(p); 1276 set_page_refcounted(p); 1277 } 1278 1279 set_compound_order(page, 0); 1280 page[1].compound_nr = 0; 1281 __ClearPageHead(page); 1282 } 1283 1284 static void free_gigantic_page(struct page *page, unsigned int order) 1285 { 1286 /* 1287 * If the page isn't allocated using the cma allocator, 1288 * cma_release() returns false. 1289 */ 1290 #ifdef CONFIG_CMA 1291 if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) 1292 return; 1293 #endif 1294 1295 free_contig_range(page_to_pfn(page), 1 << order); 1296 } 1297 1298 #ifdef CONFIG_CONTIG_ALLOC 1299 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1300 int nid, nodemask_t *nodemask) 1301 { 1302 unsigned long nr_pages = pages_per_huge_page(h); 1303 if (nid == NUMA_NO_NODE) 1304 nid = numa_mem_id(); 1305 1306 #ifdef CONFIG_CMA 1307 { 1308 struct page *page; 1309 int node; 1310 1311 if (hugetlb_cma[nid]) { 1312 page = cma_alloc(hugetlb_cma[nid], nr_pages, 1313 huge_page_order(h), true); 1314 if (page) 1315 return page; 1316 } 1317 1318 if (!(gfp_mask & __GFP_THISNODE)) { 1319 for_each_node_mask(node, *nodemask) { 1320 if (node == nid || !hugetlb_cma[node]) 1321 continue; 1322 1323 page = cma_alloc(hugetlb_cma[node], nr_pages, 1324 huge_page_order(h), true); 1325 if (page) 1326 return page; 1327 } 1328 } 1329 } 1330 #endif 1331 1332 return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); 1333 } 1334 1335 #else /* !CONFIG_CONTIG_ALLOC */ 1336 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1337 int nid, nodemask_t *nodemask) 1338 { 1339 return NULL; 1340 } 1341 #endif /* CONFIG_CONTIG_ALLOC */ 1342 1343 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ 1344 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1345 int nid, nodemask_t *nodemask) 1346 { 1347 return NULL; 1348 } 1349 static inline void free_gigantic_page(struct page *page, unsigned int order) { } 1350 static inline void destroy_compound_gigantic_page(struct page *page, 1351 unsigned int order) { } 1352 #endif 1353 1354 /* 1355 * Remove hugetlb page from lists, and update dtor so that page appears 1356 * as just a compound page. A reference is held on the page. 1357 * 1358 * Must be called with hugetlb lock held. 
1359 */ 1360 static void remove_hugetlb_page(struct hstate *h, struct page *page, 1361 bool adjust_surplus) 1362 { 1363 int nid = page_to_nid(page); 1364 1365 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); 1366 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); 1367 1368 lockdep_assert_held(&hugetlb_lock); 1369 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1370 return; 1371 1372 list_del(&page->lru); 1373 1374 if (HPageFreed(page)) { 1375 h->free_huge_pages--; 1376 h->free_huge_pages_node[nid]--; 1377 } 1378 if (adjust_surplus) { 1379 h->surplus_huge_pages--; 1380 h->surplus_huge_pages_node[nid]--; 1381 } 1382 1383 /* 1384 * Very subtle 1385 * 1386 * For non-gigantic pages set the destructor to the normal compound 1387 * page dtor. This is needed in case someone takes an additional 1388 * temporary ref to the page, and freeing is delayed until they drop 1389 * their reference. 1390 * 1391 * For gigantic pages set the destructor to the null dtor. This 1392 * destructor will never be called. Before freeing the gigantic 1393 * page destroy_compound_gigantic_page will turn the compound page 1394 * into a simple group of pages. After this the destructor does not 1395 * apply. 1396 * 1397 * This handles the case where more than one ref is held when and 1398 * after update_and_free_page is called. 1399 */ 1400 set_page_refcounted(page); 1401 if (hstate_is_gigantic(h)) 1402 set_compound_page_dtor(page, NULL_COMPOUND_DTOR); 1403 else 1404 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 1405 1406 h->nr_huge_pages--; 1407 h->nr_huge_pages_node[nid]--; 1408 } 1409 1410 static void add_hugetlb_page(struct hstate *h, struct page *page, 1411 bool adjust_surplus) 1412 { 1413 int zeroed; 1414 int nid = page_to_nid(page); 1415 1416 VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); 1417 1418 lockdep_assert_held(&hugetlb_lock); 1419 1420 INIT_LIST_HEAD(&page->lru); 1421 h->nr_huge_pages++; 1422 h->nr_huge_pages_node[nid]++; 1423 1424 if (adjust_surplus) { 1425 h->surplus_huge_pages++; 1426 h->surplus_huge_pages_node[nid]++; 1427 } 1428 1429 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); 1430 set_page_private(page, 0); 1431 SetHPageVmemmapOptimized(page); 1432 1433 /* 1434 * This page is about to be managed by the hugetlb allocator and 1435 * should have no users. Drop our reference, and check for others 1436 * just in case. 1437 */ 1438 zeroed = put_page_testzero(page); 1439 if (!zeroed) 1440 /* 1441 * It is VERY unlikely soneone else has taken a ref on 1442 * the page. In this case, we simply return as the 1443 * hugetlb destructor (free_huge_page) will be called 1444 * when this other ref is dropped. 1445 */ 1446 return; 1447 1448 arch_clear_hugepage_flags(page); 1449 enqueue_huge_page(h, page); 1450 } 1451 1452 static void __update_and_free_page(struct hstate *h, struct page *page) 1453 { 1454 int i; 1455 struct page *subpage = page; 1456 1457 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1458 return; 1459 1460 if (alloc_huge_page_vmemmap(h, page)) { 1461 spin_lock_irq(&hugetlb_lock); 1462 /* 1463 * If we cannot allocate vmemmap pages, just refuse to free the 1464 * page and put the page back on the hugetlb free list and treat 1465 * as a surplus page. 
1466 */ 1467 add_hugetlb_page(h, page, true); 1468 spin_unlock_irq(&hugetlb_lock); 1469 return; 1470 } 1471 1472 for (i = 0; i < pages_per_huge_page(h); 1473 i++, subpage = mem_map_next(subpage, page, i)) { 1474 subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1475 1 << PG_referenced | 1 << PG_dirty | 1476 1 << PG_active | 1 << PG_private | 1477 1 << PG_writeback); 1478 } 1479 if (hstate_is_gigantic(h)) { 1480 destroy_compound_gigantic_page(page, huge_page_order(h)); 1481 free_gigantic_page(page, huge_page_order(h)); 1482 } else { 1483 __free_pages(page, huge_page_order(h)); 1484 } 1485 } 1486 1487 /* 1488 * As update_and_free_page() can be called under any context, so we cannot 1489 * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the 1490 * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate 1491 * the vmemmap pages. 1492 * 1493 * free_hpage_workfn() locklessly retrieves the linked list of pages to be 1494 * freed and frees them one-by-one. As the page->mapping pointer is going 1495 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node 1496 * structure of a lockless linked list of huge pages to be freed. 1497 */ 1498 static LLIST_HEAD(hpage_freelist); 1499 1500 static void free_hpage_workfn(struct work_struct *work) 1501 { 1502 struct llist_node *node; 1503 1504 node = llist_del_all(&hpage_freelist); 1505 1506 while (node) { 1507 struct page *page; 1508 struct hstate *h; 1509 1510 page = container_of((struct address_space **)node, 1511 struct page, mapping); 1512 node = node->next; 1513 page->mapping = NULL; 1514 /* 1515 * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() 1516 * is going to trigger because a previous call to 1517 * remove_hugetlb_page() will set_compound_page_dtor(page, 1518 * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. 1519 */ 1520 h = size_to_hstate(page_size(page)); 1521 1522 __update_and_free_page(h, page); 1523 1524 cond_resched(); 1525 } 1526 } 1527 static DECLARE_WORK(free_hpage_work, free_hpage_workfn); 1528 1529 static inline void flush_free_hpage_work(struct hstate *h) 1530 { 1531 if (free_vmemmap_pages_per_hpage(h)) 1532 flush_work(&free_hpage_work); 1533 } 1534 1535 static void update_and_free_page(struct hstate *h, struct page *page, 1536 bool atomic) 1537 { 1538 if (!HPageVmemmapOptimized(page) || !atomic) { 1539 __update_and_free_page(h, page); 1540 return; 1541 } 1542 1543 /* 1544 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. 1545 * 1546 * Only call schedule_work() if hpage_freelist is previously 1547 * empty. Otherwise, schedule_work() had been called but the workfn 1548 * hasn't retrieved the list yet. 1549 */ 1550 if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) 1551 schedule_work(&free_hpage_work); 1552 } 1553 1554 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) 1555 { 1556 struct page *page, *t_page; 1557 1558 list_for_each_entry_safe(page, t_page, list, lru) { 1559 update_and_free_page(h, page, false); 1560 cond_resched(); 1561 } 1562 } 1563 1564 struct hstate *size_to_hstate(unsigned long size) 1565 { 1566 struct hstate *h; 1567 1568 for_each_hstate(h) { 1569 if (huge_page_size(h) == size) 1570 return h; 1571 } 1572 return NULL; 1573 } 1574 1575 void free_huge_page(struct page *page) 1576 { 1577 /* 1578 * Can't pass hstate in here because it is called from the 1579 * compound page destructor. 
1580 */ 1581 struct hstate *h = page_hstate(page); 1582 int nid = page_to_nid(page); 1583 struct hugepage_subpool *spool = hugetlb_page_subpool(page); 1584 bool restore_reserve; 1585 unsigned long flags; 1586 1587 VM_BUG_ON_PAGE(page_count(page), page); 1588 VM_BUG_ON_PAGE(page_mapcount(page), page); 1589 1590 hugetlb_set_page_subpool(page, NULL); 1591 page->mapping = NULL; 1592 restore_reserve = HPageRestoreReserve(page); 1593 ClearHPageRestoreReserve(page); 1594 1595 /* 1596 * If HPageRestoreReserve was set on page, page allocation consumed a 1597 * reservation. If the page was associated with a subpool, there 1598 * would have been a page reserved in the subpool before allocation 1599 * via hugepage_subpool_get_pages(). Since we are 'restoring' the 1600 * reservation, do not call hugepage_subpool_put_pages() as this will 1601 * remove the reserved page from the subpool. 1602 */ 1603 if (!restore_reserve) { 1604 /* 1605 * A return code of zero implies that the subpool will be 1606 * under its minimum size if the reservation is not restored 1607 * after page is free. Therefore, force restore_reserve 1608 * operation. 1609 */ 1610 if (hugepage_subpool_put_pages(spool, 1) == 0) 1611 restore_reserve = true; 1612 } 1613 1614 spin_lock_irqsave(&hugetlb_lock, flags); 1615 ClearHPageMigratable(page); 1616 hugetlb_cgroup_uncharge_page(hstate_index(h), 1617 pages_per_huge_page(h), page); 1618 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), 1619 pages_per_huge_page(h), page); 1620 if (restore_reserve) 1621 h->resv_huge_pages++; 1622 1623 if (HPageTemporary(page)) { 1624 remove_hugetlb_page(h, page, false); 1625 spin_unlock_irqrestore(&hugetlb_lock, flags); 1626 update_and_free_page(h, page, true); 1627 } else if (h->surplus_huge_pages_node[nid]) { 1628 /* remove the page from active list */ 1629 remove_hugetlb_page(h, page, true); 1630 spin_unlock_irqrestore(&hugetlb_lock, flags); 1631 update_and_free_page(h, page, true); 1632 } else { 1633 arch_clear_hugepage_flags(page); 1634 enqueue_huge_page(h, page); 1635 spin_unlock_irqrestore(&hugetlb_lock, flags); 1636 } 1637 } 1638 1639 /* 1640 * Must be called with the hugetlb lock held 1641 */ 1642 static void __prep_account_new_huge_page(struct hstate *h, int nid) 1643 { 1644 lockdep_assert_held(&hugetlb_lock); 1645 h->nr_huge_pages++; 1646 h->nr_huge_pages_node[nid]++; 1647 } 1648 1649 static void __prep_new_huge_page(struct hstate *h, struct page *page) 1650 { 1651 free_huge_page_vmemmap(h, page); 1652 INIT_LIST_HEAD(&page->lru); 1653 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); 1654 hugetlb_set_page_subpool(page, NULL); 1655 set_hugetlb_cgroup(page, NULL); 1656 set_hugetlb_cgroup_rsvd(page, NULL); 1657 } 1658 1659 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 1660 { 1661 __prep_new_huge_page(h, page); 1662 spin_lock_irq(&hugetlb_lock); 1663 __prep_account_new_huge_page(h, nid); 1664 spin_unlock_irq(&hugetlb_lock); 1665 } 1666 1667 static bool prep_compound_gigantic_page(struct page *page, unsigned int order) 1668 { 1669 int i, j; 1670 int nr_pages = 1 << order; 1671 struct page *p = page + 1; 1672 1673 /* we rely on prep_new_huge_page to set the destructor */ 1674 set_compound_order(page, order); 1675 __ClearPageReserved(page); 1676 __SetPageHead(page); 1677 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1678 /* 1679 * For gigantic hugepages allocated through bootmem at 1680 * boot, it's safer to be consistent with the not-gigantic 1681 * hugepages and clear the PG_reserved bit from all tail pages 
1682 * too. Otherwise drivers using get_user_pages() to access tail 1683 * pages may get the reference counting wrong if they see 1684 * PG_reserved set on a tail page (despite the head page not 1685 * having PG_reserved set). Enforcing this consistency between 1686 * head and tail pages allows drivers to optimize away a check 1687 * on the head page when they need know if put_page() is needed 1688 * after get_user_pages(). 1689 */ 1690 __ClearPageReserved(p); 1691 /* 1692 * Subtle and very unlikely 1693 * 1694 * Gigantic 'page allocators' such as memblock or cma will 1695 * return a set of pages with each page ref counted. We need 1696 * to turn this set of pages into a compound page with tail 1697 * page ref counts set to zero. Code such as speculative page 1698 * cache adding could take a ref on a 'to be' tail page. 1699 * We need to respect any increased ref count, and only set 1700 * the ref count to zero if count is currently 1. If count 1701 * is not 1, we return an error. An error return indicates 1702 * the set of pages can not be converted to a gigantic page. 1703 * The caller who allocated the pages should then discard the 1704 * pages using the appropriate free interface. 1705 */ 1706 if (!page_ref_freeze(p, 1)) { 1707 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); 1708 goto out_error; 1709 } 1710 set_page_count(p, 0); 1711 set_compound_head(p, page); 1712 } 1713 atomic_set(compound_mapcount_ptr(page), -1); 1714 atomic_set(compound_pincount_ptr(page), 0); 1715 return true; 1716 1717 out_error: 1718 /* undo tail page modifications made above */ 1719 p = page + 1; 1720 for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { 1721 clear_compound_head(p); 1722 set_page_refcounted(p); 1723 } 1724 /* need to clear PG_reserved on remaining tail pages */ 1725 for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) 1726 __ClearPageReserved(p); 1727 set_compound_order(page, 0); 1728 page[1].compound_nr = 0; 1729 __ClearPageHead(page); 1730 return false; 1731 } 1732 1733 /* 1734 * PageHuge() only returns true for hugetlbfs pages, but not for normal or 1735 * transparent huge pages. See the PageTransHuge() documentation for more 1736 * details. 1737 */ 1738 int PageHuge(struct page *page) 1739 { 1740 if (!PageCompound(page)) 1741 return 0; 1742 1743 page = compound_head(page); 1744 return page[1].compound_dtor == HUGETLB_PAGE_DTOR; 1745 } 1746 EXPORT_SYMBOL_GPL(PageHuge); 1747 1748 /* 1749 * PageHeadHuge() only returns true for hugetlbfs head page, but not for 1750 * normal or transparent huge pages. 1751 */ 1752 int PageHeadHuge(struct page *page_head) 1753 { 1754 if (!PageHead(page_head)) 1755 return 0; 1756 1757 return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; 1758 } 1759 1760 /* 1761 * Find and lock address space (mapping) in write mode. 1762 * 1763 * Upon entry, the page is locked which means that page_mapping() is 1764 * stable. Due to locking order, we can only trylock_write. If we can 1765 * not get the lock, simply return NULL to caller. 
1766 */ 1767 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) 1768 { 1769 struct address_space *mapping = page_mapping(hpage); 1770 1771 if (!mapping) 1772 return mapping; 1773 1774 if (i_mmap_trylock_write(mapping)) 1775 return mapping; 1776 1777 return NULL; 1778 } 1779 1780 pgoff_t hugetlb_basepage_index(struct page *page) 1781 { 1782 struct page *page_head = compound_head(page); 1783 pgoff_t index = page_index(page_head); 1784 unsigned long compound_idx; 1785 1786 if (compound_order(page_head) >= MAX_ORDER) 1787 compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 1788 else 1789 compound_idx = page - page_head; 1790 1791 return (index << compound_order(page_head)) + compound_idx; 1792 } 1793 1794 static struct page *alloc_buddy_huge_page(struct hstate *h, 1795 gfp_t gfp_mask, int nid, nodemask_t *nmask, 1796 nodemask_t *node_alloc_noretry) 1797 { 1798 int order = huge_page_order(h); 1799 struct page *page; 1800 bool alloc_try_hard = true; 1801 1802 /* 1803 * By default we always try hard to allocate the page with 1804 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in 1805 * a loop (to adjust global huge page counts) and previous allocation 1806 * failed, do not continue to try hard on the same node. Use the 1807 * node_alloc_noretry bitmap to manage this state information. 1808 */ 1809 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) 1810 alloc_try_hard = false; 1811 gfp_mask |= __GFP_COMP|__GFP_NOWARN; 1812 if (alloc_try_hard) 1813 gfp_mask |= __GFP_RETRY_MAYFAIL; 1814 if (nid == NUMA_NO_NODE) 1815 nid = numa_mem_id(); 1816 page = __alloc_pages(gfp_mask, order, nid, nmask); 1817 if (page) 1818 __count_vm_event(HTLB_BUDDY_PGALLOC); 1819 else 1820 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1821 1822 /* 1823 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this 1824 * indicates an overall state change. Clear bit so that we resume 1825 * normal 'try hard' allocations. 1826 */ 1827 if (node_alloc_noretry && page && !alloc_try_hard) 1828 node_clear(nid, *node_alloc_noretry); 1829 1830 /* 1831 * If we tried hard to get a page but failed, set bit so that 1832 * subsequent attempts will not try as hard until there is an 1833 * overall state change. 1834 */ 1835 if (node_alloc_noretry && !page && alloc_try_hard) 1836 node_set(nid, *node_alloc_noretry); 1837 1838 return page; 1839 } 1840 1841 /* 1842 * Common helper to allocate a fresh hugetlb page. All specific allocators 1843 * should use this function to get new hugetlb pages 1844 */ 1845 static struct page *alloc_fresh_huge_page(struct hstate *h, 1846 gfp_t gfp_mask, int nid, nodemask_t *nmask, 1847 nodemask_t *node_alloc_noretry) 1848 { 1849 struct page *page; 1850 bool retry = false; 1851 1852 retry: 1853 if (hstate_is_gigantic(h)) 1854 page = alloc_gigantic_page(h, gfp_mask, nid, nmask); 1855 else 1856 page = alloc_buddy_huge_page(h, gfp_mask, 1857 nid, nmask, node_alloc_noretry); 1858 if (!page) 1859 return NULL; 1860 1861 if (hstate_is_gigantic(h)) { 1862 if (!prep_compound_gigantic_page(page, huge_page_order(h))) { 1863 /* 1864 * Rare failure to convert pages to compound page. 1865 * Free pages and try again - ONCE! 1866 */ 1867 free_gigantic_page(page, huge_page_order(h)); 1868 if (!retry) { 1869 retry = true; 1870 goto retry; 1871 } 1872 return NULL; 1873 } 1874 } 1875 prep_new_huge_page(h, page, page_to_nid(page)); 1876 1877 return page; 1878 } 1879 1880 /* 1881 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved 1882 * manner. 
1883 */ 1884 static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 1885 nodemask_t *node_alloc_noretry) 1886 { 1887 struct page *page; 1888 int nr_nodes, node; 1889 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 1890 1891 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1892 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, 1893 node_alloc_noretry); 1894 if (page) 1895 break; 1896 } 1897 1898 if (!page) 1899 return 0; 1900 1901 put_page(page); /* free it into the hugepage allocator */ 1902 1903 return 1; 1904 } 1905 1906 /* 1907 * Remove huge page from pool from next node to free. Attempt to keep 1908 * persistent huge pages more or less balanced over allowed nodes. 1909 * This routine only 'removes' the hugetlb page. The caller must make 1910 * an additional call to free the page to low level allocators. 1911 * Called with hugetlb_lock locked. 1912 */ 1913 static struct page *remove_pool_huge_page(struct hstate *h, 1914 nodemask_t *nodes_allowed, 1915 bool acct_surplus) 1916 { 1917 int nr_nodes, node; 1918 struct page *page = NULL; 1919 1920 lockdep_assert_held(&hugetlb_lock); 1921 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 1922 /* 1923 * If we're returning unused surplus pages, only examine 1924 * nodes with surplus pages. 1925 */ 1926 if ((!acct_surplus || h->surplus_huge_pages_node[node]) && 1927 !list_empty(&h->hugepage_freelists[node])) { 1928 page = list_entry(h->hugepage_freelists[node].next, 1929 struct page, lru); 1930 remove_hugetlb_page(h, page, acct_surplus); 1931 break; 1932 } 1933 } 1934 1935 return page; 1936 } 1937 1938 /* 1939 * Dissolve a given free hugepage into free buddy pages. This function does 1940 * nothing for in-use hugepages and non-hugepages. 1941 * This function returns values like below: 1942 * 1943 * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages 1944 * when the system is under memory pressure and the feature of 1945 * freeing unused vmemmap pages associated with each hugetlb page 1946 * is enabled. 1947 * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use 1948 * (allocated or reserved.) 1949 * 0: successfully dissolved free hugepages or the page is not a 1950 * hugepage (considered as already dissolved) 1951 */ 1952 int dissolve_free_huge_page(struct page *page) 1953 { 1954 int rc = -EBUSY; 1955 1956 retry: 1957 /* Not to disrupt normal path by vainly holding hugetlb_lock */ 1958 if (!PageHuge(page)) 1959 return 0; 1960 1961 spin_lock_irq(&hugetlb_lock); 1962 if (!PageHuge(page)) { 1963 rc = 0; 1964 goto out; 1965 } 1966 1967 if (!page_count(page)) { 1968 struct page *head = compound_head(page); 1969 struct hstate *h = page_hstate(head); 1970 if (h->free_huge_pages - h->resv_huge_pages == 0) 1971 goto out; 1972 1973 /* 1974 * We should make sure that the page is already on the free list 1975 * when it is dissolved. 1976 */ 1977 if (unlikely(!HPageFreed(head))) { 1978 spin_unlock_irq(&hugetlb_lock); 1979 cond_resched(); 1980 1981 /* 1982 * Theoretically, we should return -EBUSY when we 1983 * encounter this race. In fact, we have a chance 1984 * to successfully dissolve the page if we do a 1985 * retry. Because the race window is quite small. 1986 * If we seize this opportunity, it is an optimization 1987 * for increasing the success rate of dissolving page. 
 */
                        goto retry;
                }

                remove_hugetlb_page(h, head, false);
                h->max_huge_pages--;
                spin_unlock_irq(&hugetlb_lock);

                /*
                 * Normally update_and_free_page will allocate required vmemmap
                 * before freeing the page.  update_and_free_page will fail to
                 * free the page if it cannot allocate the required vmemmap.  We
                 * need to adjust max_huge_pages if the page is not freed.
                 * Attempt to allocate vmemmap here so that we can take
                 * appropriate action on failure.
                 */
                rc = alloc_huge_page_vmemmap(h, head);
                if (!rc) {
                        /*
                         * Move the PageHWPoison flag from the head page to the
                         * raw error page, which makes any subpages other than
                         * the error page reusable.
                         */
                        if (PageHWPoison(head) && page != head) {
                                SetPageHWPoison(page);
                                ClearPageHWPoison(head);
                        }
                        update_and_free_page(h, head, false);
                } else {
                        spin_lock_irq(&hugetlb_lock);
                        add_hugetlb_page(h, head, false);
                        h->max_huge_pages++;
                        spin_unlock_irq(&hugetlb_lock);
                }

                return rc;
        }
out:
        spin_unlock_irq(&hugetlb_lock);
        return rc;
}

/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_huge_page() returns with an error, all
 * free hugepages that were dissolved before that error are lost.
 */
int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;
        struct page *page;
        int rc = 0;

        if (!hugepages_supported())
                return rc;

        for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
                page = pfn_to_page(pfn);
                rc = dissolve_free_huge_page(page);
                if (rc)
                        break;
        }

        return rc;
}

/*
 * Allocates a fresh surplus page from the page allocator.
 */
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nmask, bool zero_ref)
{
        struct page *page = NULL;
        bool retry = false;

        if (hstate_is_gigantic(h))
                return NULL;

        spin_lock_irq(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
                goto out_unlock;
        spin_unlock_irq(&hugetlb_lock);

retry:
        page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
        if (!page)
                return NULL;

        spin_lock_irq(&hugetlb_lock);
        /*
         * We could have raced with the pool size change.
         * Double check that and simply deallocate the new page
         * if we would end up overcommitting the surpluses.  Abuse
         * the temporary page flag to work around the nasty
         * free_huge_page code flow.
         */
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                SetHPageTemporary(page);
                spin_unlock_irq(&hugetlb_lock);
                put_page(page);
                return NULL;
        }

        if (zero_ref) {
                /*
                 * Caller requires a page with zero ref count.
                 * We will drop the ref count here.  If someone else is holding
                 * a ref, the page will be freed when they drop it.  Abuse the
                 * temporary page flag to accomplish this.
                 */
                SetHPageTemporary(page);
                if (!put_page_testzero(page)) {
                        /*
                         * Unexpected inflated ref count on freshly allocated
                         * huge page.  Retry once.
2106 */ 2107 pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); 2108 spin_unlock_irq(&hugetlb_lock); 2109 if (retry) 2110 return NULL; 2111 2112 retry = true; 2113 goto retry; 2114 } 2115 ClearHPageTemporary(page); 2116 } 2117 2118 h->surplus_huge_pages++; 2119 h->surplus_huge_pages_node[page_to_nid(page)]++; 2120 2121 out_unlock: 2122 spin_unlock_irq(&hugetlb_lock); 2123 2124 return page; 2125 } 2126 2127 static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 2128 int nid, nodemask_t *nmask) 2129 { 2130 struct page *page; 2131 2132 if (hstate_is_gigantic(h)) 2133 return NULL; 2134 2135 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 2136 if (!page) 2137 return NULL; 2138 2139 /* 2140 * We do not account these pages as surplus because they are only 2141 * temporary and will be released properly on the last reference 2142 */ 2143 SetHPageTemporary(page); 2144 2145 return page; 2146 } 2147 2148 /* 2149 * Use the VMA's mpolicy to allocate a huge page from the buddy. 2150 */ 2151 static 2152 struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, 2153 struct vm_area_struct *vma, unsigned long addr) 2154 { 2155 struct page *page = NULL; 2156 struct mempolicy *mpol; 2157 gfp_t gfp_mask = htlb_alloc_mask(h); 2158 int nid; 2159 nodemask_t *nodemask; 2160 2161 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 2162 if (mpol_is_preferred_many(mpol)) { 2163 gfp_t gfp = gfp_mask | __GFP_NOWARN; 2164 2165 gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2166 page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); 2167 2168 /* Fallback to all nodes if page==NULL */ 2169 nodemask = NULL; 2170 } 2171 2172 if (!page) 2173 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); 2174 mpol_cond_put(mpol); 2175 return page; 2176 } 2177 2178 /* page migration callback function */ 2179 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, 2180 nodemask_t *nmask, gfp_t gfp_mask) 2181 { 2182 spin_lock_irq(&hugetlb_lock); 2183 if (h->free_huge_pages - h->resv_huge_pages > 0) { 2184 struct page *page; 2185 2186 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); 2187 if (page) { 2188 spin_unlock_irq(&hugetlb_lock); 2189 return page; 2190 } 2191 } 2192 spin_unlock_irq(&hugetlb_lock); 2193 2194 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); 2195 } 2196 2197 /* mempolicy aware migration callback */ 2198 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, 2199 unsigned long address) 2200 { 2201 struct mempolicy *mpol; 2202 nodemask_t *nodemask; 2203 struct page *page; 2204 gfp_t gfp_mask; 2205 int node; 2206 2207 gfp_mask = htlb_alloc_mask(h); 2208 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 2209 page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); 2210 mpol_cond_put(mpol); 2211 2212 return page; 2213 } 2214 2215 /* 2216 * Increase the hugetlb pool such that it can accommodate a reservation 2217 * of size 'delta'. 
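 *
 * A worked example of the sizing below (the concrete numbers are only
 * illustrative): with resv_huge_pages == 10, free_huge_pages == 6 and
 * delta == 4, needed = (10 + 4) - 6 = 8, so up to eight surplus pages are
 * requested from the buddy allocator before the reservation is committed.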
2218 */ 2219 static int gather_surplus_pages(struct hstate *h, long delta) 2220 __must_hold(&hugetlb_lock) 2221 { 2222 struct list_head surplus_list; 2223 struct page *page, *tmp; 2224 int ret; 2225 long i; 2226 long needed, allocated; 2227 bool alloc_ok = true; 2228 2229 lockdep_assert_held(&hugetlb_lock); 2230 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 2231 if (needed <= 0) { 2232 h->resv_huge_pages += delta; 2233 return 0; 2234 } 2235 2236 allocated = 0; 2237 INIT_LIST_HEAD(&surplus_list); 2238 2239 ret = -ENOMEM; 2240 retry: 2241 spin_unlock_irq(&hugetlb_lock); 2242 for (i = 0; i < needed; i++) { 2243 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), 2244 NUMA_NO_NODE, NULL, true); 2245 if (!page) { 2246 alloc_ok = false; 2247 break; 2248 } 2249 list_add(&page->lru, &surplus_list); 2250 cond_resched(); 2251 } 2252 allocated += i; 2253 2254 /* 2255 * After retaking hugetlb_lock, we need to recalculate 'needed' 2256 * because either resv_huge_pages or free_huge_pages may have changed. 2257 */ 2258 spin_lock_irq(&hugetlb_lock); 2259 needed = (h->resv_huge_pages + delta) - 2260 (h->free_huge_pages + allocated); 2261 if (needed > 0) { 2262 if (alloc_ok) 2263 goto retry; 2264 /* 2265 * We were not able to allocate enough pages to 2266 * satisfy the entire reservation so we free what 2267 * we've allocated so far. 2268 */ 2269 goto free; 2270 } 2271 /* 2272 * The surplus_list now contains _at_least_ the number of extra pages 2273 * needed to accommodate the reservation. Add the appropriate number 2274 * of pages to the hugetlb pool and free the extras back to the buddy 2275 * allocator. Commit the entire reservation here to prevent another 2276 * process from stealing the pages as they are added to the pool but 2277 * before they are reserved. 2278 */ 2279 needed += allocated; 2280 h->resv_huge_pages += delta; 2281 ret = 0; 2282 2283 /* Free the needed pages to the hugetlb pool */ 2284 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 2285 if ((--needed) < 0) 2286 break; 2287 /* Add the page to the hugetlb allocator */ 2288 enqueue_huge_page(h, page); 2289 } 2290 free: 2291 spin_unlock_irq(&hugetlb_lock); 2292 2293 /* 2294 * Free unnecessary surplus pages to the buddy allocator. 2295 * Pages have no ref count, call free_huge_page directly. 2296 */ 2297 list_for_each_entry_safe(page, tmp, &surplus_list, lru) 2298 free_huge_page(page); 2299 spin_lock_irq(&hugetlb_lock); 2300 2301 return ret; 2302 } 2303 2304 /* 2305 * This routine has two main purposes: 2306 * 1) Decrement the reservation count (resv_huge_pages) by the value passed 2307 * in unused_resv_pages. This corresponds to the prior adjustments made 2308 * to the associated reservation map. 2309 * 2) Free any unused surplus pages that may have been allocated to satisfy 2310 * the reservation. As many as unused_resv_pages may be freed. 2311 */ 2312 static void return_unused_surplus_pages(struct hstate *h, 2313 unsigned long unused_resv_pages) 2314 { 2315 unsigned long nr_pages; 2316 struct page *page; 2317 LIST_HEAD(page_list); 2318 2319 lockdep_assert_held(&hugetlb_lock); 2320 /* Uncommit the reservation */ 2321 h->resv_huge_pages -= unused_resv_pages; 2322 2323 /* Cannot return gigantic pages currently */ 2324 if (hstate_is_gigantic(h)) 2325 goto out; 2326 2327 /* 2328 * Part (or even all) of the reservation could have been backed 2329 * by pre-allocated pages. Only free surplus pages. 
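 *
 * For example, if 6 reserved pages go unused but only 2 of them were backed
 * by surplus pages (surplus_huge_pages == 2), then nr_pages = min(6, 2) = 2
 * and only those two pages are handed back to the buddy allocator; the other
 * four stay in the pre-allocated pool.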
2330 */ 2331 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 2332 2333 /* 2334 * We want to release as many surplus pages as possible, spread 2335 * evenly across all nodes with memory. Iterate across these nodes 2336 * until we can no longer free unreserved surplus pages. This occurs 2337 * when the nodes with surplus pages have no free pages. 2338 * remove_pool_huge_page() will balance the freed pages across the 2339 * on-line nodes with memory and will handle the hstate accounting. 2340 */ 2341 while (nr_pages--) { 2342 page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); 2343 if (!page) 2344 goto out; 2345 2346 list_add(&page->lru, &page_list); 2347 } 2348 2349 out: 2350 spin_unlock_irq(&hugetlb_lock); 2351 update_and_free_pages_bulk(h, &page_list); 2352 spin_lock_irq(&hugetlb_lock); 2353 } 2354 2355 2356 /* 2357 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation 2358 * are used by the huge page allocation routines to manage reservations. 2359 * 2360 * vma_needs_reservation is called to determine if the huge page at addr 2361 * within the vma has an associated reservation. If a reservation is 2362 * needed, the value 1 is returned. The caller is then responsible for 2363 * managing the global reservation and subpool usage counts. After 2364 * the huge page has been allocated, vma_commit_reservation is called 2365 * to add the page to the reservation map. If the page allocation fails, 2366 * the reservation must be ended instead of committed. vma_end_reservation 2367 * is called in such cases. 2368 * 2369 * In the normal case, vma_commit_reservation returns the same value 2370 * as the preceding vma_needs_reservation call. The only time this 2371 * is not the case is if a reserve map was changed between calls. It 2372 * is the responsibility of the caller to notice the difference and 2373 * take appropriate action. 2374 * 2375 * vma_add_reservation is used in error paths where a reservation must 2376 * be restored when a newly allocated huge page must be freed. It is 2377 * to be called after calling vma_needs_reservation to determine if a 2378 * reservation exists. 2379 * 2380 * vma_del_reservation is used in error paths where an entry in the reserve 2381 * map was created during huge page allocation and must be removed. It is to 2382 * be called after calling vma_needs_reservation to determine if a reservation 2383 * exists. 2384 */ 2385 enum vma_resv_mode { 2386 VMA_NEEDS_RESV, 2387 VMA_COMMIT_RESV, 2388 VMA_END_RESV, 2389 VMA_ADD_RESV, 2390 VMA_DEL_RESV, 2391 }; 2392 static long __vma_reservation_common(struct hstate *h, 2393 struct vm_area_struct *vma, unsigned long addr, 2394 enum vma_resv_mode mode) 2395 { 2396 struct resv_map *resv; 2397 pgoff_t idx; 2398 long ret; 2399 long dummy_out_regions_needed; 2400 2401 resv = vma_resv_map(vma); 2402 if (!resv) 2403 return 1; 2404 2405 idx = vma_hugecache_offset(h, vma, addr); 2406 switch (mode) { 2407 case VMA_NEEDS_RESV: 2408 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 2409 /* We assume that vma_reservation_* routines always operate on 2410 * 1 page, and that adding to resv map a 1 page entry can only 2411 * ever require 1 region. 2412 */ 2413 VM_BUG_ON(dummy_out_regions_needed != 1); 2414 break; 2415 case VMA_COMMIT_RESV: 2416 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2417 /* region_add calls of range 1 should never fail. 
*/ 2418 VM_BUG_ON(ret < 0); 2419 break; 2420 case VMA_END_RESV: 2421 region_abort(resv, idx, idx + 1, 1); 2422 ret = 0; 2423 break; 2424 case VMA_ADD_RESV: 2425 if (vma->vm_flags & VM_MAYSHARE) { 2426 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2427 /* region_add calls of range 1 should never fail. */ 2428 VM_BUG_ON(ret < 0); 2429 } else { 2430 region_abort(resv, idx, idx + 1, 1); 2431 ret = region_del(resv, idx, idx + 1); 2432 } 2433 break; 2434 case VMA_DEL_RESV: 2435 if (vma->vm_flags & VM_MAYSHARE) { 2436 region_abort(resv, idx, idx + 1, 1); 2437 ret = region_del(resv, idx, idx + 1); 2438 } else { 2439 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2440 /* region_add calls of range 1 should never fail. */ 2441 VM_BUG_ON(ret < 0); 2442 } 2443 break; 2444 default: 2445 BUG(); 2446 } 2447 2448 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) 2449 return ret; 2450 /* 2451 * We know private mapping must have HPAGE_RESV_OWNER set. 2452 * 2453 * In most cases, reserves always exist for private mappings. 2454 * However, a file associated with mapping could have been 2455 * hole punched or truncated after reserves were consumed. 2456 * As subsequent fault on such a range will not use reserves. 2457 * Subtle - The reserve map for private mappings has the 2458 * opposite meaning than that of shared mappings. If NO 2459 * entry is in the reserve map, it means a reservation exists. 2460 * If an entry exists in the reserve map, it means the 2461 * reservation has already been consumed. As a result, the 2462 * return value of this routine is the opposite of the 2463 * value returned from reserve map manipulation routines above. 2464 */ 2465 if (ret > 0) 2466 return 0; 2467 if (ret == 0) 2468 return 1; 2469 return ret; 2470 } 2471 2472 static long vma_needs_reservation(struct hstate *h, 2473 struct vm_area_struct *vma, unsigned long addr) 2474 { 2475 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 2476 } 2477 2478 static long vma_commit_reservation(struct hstate *h, 2479 struct vm_area_struct *vma, unsigned long addr) 2480 { 2481 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 2482 } 2483 2484 static void vma_end_reservation(struct hstate *h, 2485 struct vm_area_struct *vma, unsigned long addr) 2486 { 2487 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 2488 } 2489 2490 static long vma_add_reservation(struct hstate *h, 2491 struct vm_area_struct *vma, unsigned long addr) 2492 { 2493 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 2494 } 2495 2496 static long vma_del_reservation(struct hstate *h, 2497 struct vm_area_struct *vma, unsigned long addr) 2498 { 2499 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); 2500 } 2501 2502 /* 2503 * This routine is called to restore reservation information on error paths. 2504 * It should ONLY be called for pages allocated via alloc_huge_page(), and 2505 * the hugetlb mutex should remain held when calling this routine. 2506 * 2507 * It handles two specific cases: 2508 * 1) A reservation was in place and the page consumed the reservation. 2509 * HPageRestoreReserve is set in the page. 2510 * 2) No reservation was in place for the page, so HPageRestoreReserve is 2511 * not set. However, alloc_huge_page always updates the reserve map. 2512 * 2513 * In case 1, free_huge_page later in the error path will increment the 2514 * global reserve count. But, free_huge_page does not have enough context 2515 * to adjust the reservation map. 
This case deals primarily with private 2516 * mappings. Adjust the reserve map here to be consistent with global 2517 * reserve count adjustments to be made by free_huge_page. Make sure the 2518 * reserve map indicates there is a reservation present. 2519 * 2520 * In case 2, simply undo reserve map modifications done by alloc_huge_page. 2521 */ 2522 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, 2523 unsigned long address, struct page *page) 2524 { 2525 long rc = vma_needs_reservation(h, vma, address); 2526 2527 if (HPageRestoreReserve(page)) { 2528 if (unlikely(rc < 0)) 2529 /* 2530 * Rare out of memory condition in reserve map 2531 * manipulation. Clear HPageRestoreReserve so that 2532 * global reserve count will not be incremented 2533 * by free_huge_page. This will make it appear 2534 * as though the reservation for this page was 2535 * consumed. This may prevent the task from 2536 * faulting in the page at a later time. This 2537 * is better than inconsistent global huge page 2538 * accounting of reserve counts. 2539 */ 2540 ClearHPageRestoreReserve(page); 2541 else if (rc) 2542 (void)vma_add_reservation(h, vma, address); 2543 else 2544 vma_end_reservation(h, vma, address); 2545 } else { 2546 if (!rc) { 2547 /* 2548 * This indicates there is an entry in the reserve map 2549 * not added by alloc_huge_page. We know it was added 2550 * before the alloc_huge_page call, otherwise 2551 * HPageRestoreReserve would be set on the page. 2552 * Remove the entry so that a subsequent allocation 2553 * does not consume a reservation. 2554 */ 2555 rc = vma_del_reservation(h, vma, address); 2556 if (rc < 0) 2557 /* 2558 * VERY rare out of memory condition. Since 2559 * we can not delete the entry, set 2560 * HPageRestoreReserve so that the reserve 2561 * count will be incremented when the page 2562 * is freed. This reserve will be consumed 2563 * on a subsequent allocation. 2564 */ 2565 SetHPageRestoreReserve(page); 2566 } else if (rc < 0) { 2567 /* 2568 * Rare out of memory condition from 2569 * vma_needs_reservation call. Memory allocation is 2570 * only attempted if a new entry is needed. Therefore, 2571 * this implies there is not an entry in the 2572 * reserve map. 2573 * 2574 * For shared mappings, no entry in the map indicates 2575 * no reservation. We are done. 2576 */ 2577 if (!(vma->vm_flags & VM_MAYSHARE)) 2578 /* 2579 * For private mappings, no entry indicates 2580 * a reservation is present. Since we can 2581 * not add an entry, set SetHPageRestoreReserve 2582 * on the page so reserve count will be 2583 * incremented when freed. This reserve will 2584 * be consumed on a subsequent allocation. 2585 */ 2586 SetHPageRestoreReserve(page); 2587 } else 2588 /* 2589 * No reservation present, do nothing 2590 */ 2591 vma_end_reservation(h, vma, address); 2592 } 2593 } 2594 2595 /* 2596 * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one 2597 * @h: struct hstate old page belongs to 2598 * @old_page: Old page to dissolve 2599 * @list: List to isolate the page in case we need to 2600 * Returns 0 on success, otherwise negated error. 2601 */ 2602 static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, 2603 struct list_head *list) 2604 { 2605 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 2606 int nid = page_to_nid(old_page); 2607 bool alloc_retry = false; 2608 struct page *new_page; 2609 int ret = 0; 2610 2611 /* 2612 * Before dissolving the page, we need to allocate a new one for the 2613 * pool to remain stable. 
Here, we allocate the page and 'prep' it 2614 * by doing everything but actually updating counters and adding to 2615 * the pool. This simplifies and let us do most of the processing 2616 * under the lock. 2617 */ 2618 alloc_retry: 2619 new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); 2620 if (!new_page) 2621 return -ENOMEM; 2622 /* 2623 * If all goes well, this page will be directly added to the free 2624 * list in the pool. For this the ref count needs to be zero. 2625 * Attempt to drop now, and retry once if needed. It is VERY 2626 * unlikely there is another ref on the page. 2627 * 2628 * If someone else has a reference to the page, it will be freed 2629 * when they drop their ref. Abuse temporary page flag to accomplish 2630 * this. Retry once if there is an inflated ref count. 2631 */ 2632 SetHPageTemporary(new_page); 2633 if (!put_page_testzero(new_page)) { 2634 if (alloc_retry) 2635 return -EBUSY; 2636 2637 alloc_retry = true; 2638 goto alloc_retry; 2639 } 2640 ClearHPageTemporary(new_page); 2641 2642 __prep_new_huge_page(h, new_page); 2643 2644 retry: 2645 spin_lock_irq(&hugetlb_lock); 2646 if (!PageHuge(old_page)) { 2647 /* 2648 * Freed from under us. Drop new_page too. 2649 */ 2650 goto free_new; 2651 } else if (page_count(old_page)) { 2652 /* 2653 * Someone has grabbed the page, try to isolate it here. 2654 * Fail with -EBUSY if not possible. 2655 */ 2656 spin_unlock_irq(&hugetlb_lock); 2657 if (!isolate_huge_page(old_page, list)) 2658 ret = -EBUSY; 2659 spin_lock_irq(&hugetlb_lock); 2660 goto free_new; 2661 } else if (!HPageFreed(old_page)) { 2662 /* 2663 * Page's refcount is 0 but it has not been enqueued in the 2664 * freelist yet. Race window is small, so we can succeed here if 2665 * we retry. 2666 */ 2667 spin_unlock_irq(&hugetlb_lock); 2668 cond_resched(); 2669 goto retry; 2670 } else { 2671 /* 2672 * Ok, old_page is still a genuine free hugepage. Remove it from 2673 * the freelist and decrease the counters. These will be 2674 * incremented again when calling __prep_account_new_huge_page() 2675 * and enqueue_huge_page() for new_page. The counters will remain 2676 * stable since this happens under the lock. 2677 */ 2678 remove_hugetlb_page(h, old_page, false); 2679 2680 /* 2681 * Ref count on new page is already zero as it was dropped 2682 * earlier. It can be directly added to the pool free list. 2683 */ 2684 __prep_account_new_huge_page(h, nid); 2685 enqueue_huge_page(h, new_page); 2686 2687 /* 2688 * Pages have been replaced, we can safely free the old one. 2689 */ 2690 spin_unlock_irq(&hugetlb_lock); 2691 update_and_free_page(h, old_page, false); 2692 } 2693 2694 return ret; 2695 2696 free_new: 2697 spin_unlock_irq(&hugetlb_lock); 2698 /* Page has a zero ref count, but needs a ref to be freed */ 2699 set_page_refcounted(new_page); 2700 update_and_free_page(h, new_page, false); 2701 2702 return ret; 2703 } 2704 2705 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) 2706 { 2707 struct hstate *h; 2708 struct page *head; 2709 int ret = -EBUSY; 2710 2711 /* 2712 * The page might have been dissolved from under our feet, so make sure 2713 * to carefully check the state under the lock. 2714 * Return success when racing as if we dissolved the page ourselves. 
2715 */ 2716 spin_lock_irq(&hugetlb_lock); 2717 if (PageHuge(page)) { 2718 head = compound_head(page); 2719 h = page_hstate(head); 2720 } else { 2721 spin_unlock_irq(&hugetlb_lock); 2722 return 0; 2723 } 2724 spin_unlock_irq(&hugetlb_lock); 2725 2726 /* 2727 * Fence off gigantic pages as there is a cyclic dependency between 2728 * alloc_contig_range and them. Return -ENOMEM as this has the effect 2729 * of bailing out right away without further retrying. 2730 */ 2731 if (hstate_is_gigantic(h)) 2732 return -ENOMEM; 2733 2734 if (page_count(head) && isolate_huge_page(head, list)) 2735 ret = 0; 2736 else if (!page_count(head)) 2737 ret = alloc_and_dissolve_huge_page(h, head, list); 2738 2739 return ret; 2740 } 2741 2742 struct page *alloc_huge_page(struct vm_area_struct *vma, 2743 unsigned long addr, int avoid_reserve) 2744 { 2745 struct hugepage_subpool *spool = subpool_vma(vma); 2746 struct hstate *h = hstate_vma(vma); 2747 struct page *page; 2748 long map_chg, map_commit; 2749 long gbl_chg; 2750 int ret, idx; 2751 struct hugetlb_cgroup *h_cg; 2752 bool deferred_reserve; 2753 2754 idx = hstate_index(h); 2755 /* 2756 * Examine the region/reserve map to determine if the process 2757 * has a reservation for the page to be allocated. A return 2758 * code of zero indicates a reservation exists (no change). 2759 */ 2760 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 2761 if (map_chg < 0) 2762 return ERR_PTR(-ENOMEM); 2763 2764 /* 2765 * Processes that did not create the mapping will have no 2766 * reserves as indicated by the region/reserve map. Check 2767 * that the allocation will not exceed the subpool limit. 2768 * Allocations for MAP_NORESERVE mappings also need to be 2769 * checked against any subpool limit. 2770 */ 2771 if (map_chg || avoid_reserve) { 2772 gbl_chg = hugepage_subpool_get_pages(spool, 1); 2773 if (gbl_chg < 0) { 2774 vma_end_reservation(h, vma, addr); 2775 return ERR_PTR(-ENOSPC); 2776 } 2777 2778 /* 2779 * Even though there was no reservation in the region/reserve 2780 * map, there could be reservations associated with the 2781 * subpool that can be used. This would be indicated if the 2782 * return value of hugepage_subpool_get_pages() is zero. 2783 * However, if avoid_reserve is specified we still avoid even 2784 * the subpool reservations. 2785 */ 2786 if (avoid_reserve) 2787 gbl_chg = 1; 2788 } 2789 2790 /* If this allocation is not consuming a reservation, charge it now. 2791 */ 2792 deferred_reserve = map_chg || avoid_reserve; 2793 if (deferred_reserve) { 2794 ret = hugetlb_cgroup_charge_cgroup_rsvd( 2795 idx, pages_per_huge_page(h), &h_cg); 2796 if (ret) 2797 goto out_subpool_put; 2798 } 2799 2800 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 2801 if (ret) 2802 goto out_uncharge_cgroup_reservation; 2803 2804 spin_lock_irq(&hugetlb_lock); 2805 /* 2806 * glb_chg is passed to indicate whether or not a page must be taken 2807 * from the global free pool (global change). gbl_chg == 0 indicates 2808 * a reservation exists for the allocation. 
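 *
 * As a rough summary of the cases set up above (ignoring races):
 *
 *  - map_chg == 0 and !avoid_reserve: the vma holds a reservation, the
 *    subpool was not charged, and gbl_chg == 0 lets the dequeue below dip
 *    into resv_huge_pages.
 *  - map_chg != 0, !avoid_reserve, and hugepage_subpool_get_pages()
 *    returned 0: a subpool reservation backs the allocation, so
 *    gbl_chg == 0 as well.
 *  - otherwise (no reservation anywhere, or avoid_reserve): gbl_chg == 1,
 *    so a page is only dequeued if free pages exist beyond the reserved
 *    ones, and failing that a surplus page is allocated further below.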
2809 */ 2810 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); 2811 if (!page) { 2812 spin_unlock_irq(&hugetlb_lock); 2813 page = alloc_buddy_huge_page_with_mpol(h, vma, addr); 2814 if (!page) 2815 goto out_uncharge_cgroup; 2816 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 2817 SetHPageRestoreReserve(page); 2818 h->resv_huge_pages--; 2819 } 2820 spin_lock_irq(&hugetlb_lock); 2821 list_add(&page->lru, &h->hugepage_activelist); 2822 /* Fall through */ 2823 } 2824 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 2825 /* If allocation is not consuming a reservation, also store the 2826 * hugetlb_cgroup pointer on the page. 2827 */ 2828 if (deferred_reserve) { 2829 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 2830 h_cg, page); 2831 } 2832 2833 spin_unlock_irq(&hugetlb_lock); 2834 2835 hugetlb_set_page_subpool(page, spool); 2836 2837 map_commit = vma_commit_reservation(h, vma, addr); 2838 if (unlikely(map_chg > map_commit)) { 2839 /* 2840 * The page was added to the reservation map between 2841 * vma_needs_reservation and vma_commit_reservation. 2842 * This indicates a race with hugetlb_reserve_pages. 2843 * Adjust for the subpool count incremented above AND 2844 * in hugetlb_reserve_pages for the same page. Also, 2845 * the reservation count added in hugetlb_reserve_pages 2846 * no longer applies. 2847 */ 2848 long rsv_adjust; 2849 2850 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 2851 hugetlb_acct_memory(h, -rsv_adjust); 2852 if (deferred_reserve) 2853 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), 2854 pages_per_huge_page(h), page); 2855 } 2856 return page; 2857 2858 out_uncharge_cgroup: 2859 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 2860 out_uncharge_cgroup_reservation: 2861 if (deferred_reserve) 2862 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 2863 h_cg); 2864 out_subpool_put: 2865 if (map_chg || avoid_reserve) 2866 hugepage_subpool_put_pages(spool, 1); 2867 vma_end_reservation(h, vma, addr); 2868 return ERR_PTR(-ENOSPC); 2869 } 2870 2871 int alloc_bootmem_huge_page(struct hstate *h) 2872 __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); 2873 int __alloc_bootmem_huge_page(struct hstate *h) 2874 { 2875 struct huge_bootmem_page *m; 2876 int nr_nodes, node; 2877 2878 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 2879 void *addr; 2880 2881 addr = memblock_alloc_try_nid_raw( 2882 huge_page_size(h), huge_page_size(h), 2883 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); 2884 if (addr) { 2885 /* 2886 * Use the beginning of the huge page to store the 2887 * huge_bootmem_page struct (until gather_bootmem 2888 * puts them into the mem_map). 2889 */ 2890 m = addr; 2891 goto found; 2892 } 2893 } 2894 return 0; 2895 2896 found: 2897 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); 2898 /* Put them into a private list first because mem_map is not up yet */ 2899 INIT_LIST_HEAD(&m->list); 2900 list_add(&m->list, &huge_boot_pages); 2901 m->hstate = h; 2902 return 1; 2903 } 2904 2905 /* 2906 * Put bootmem huge pages into the standard lists after mem_map is up. 2907 * Note: This only applies to gigantic (order > MAX_ORDER) pages. 
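 *
 * For example, booting with "hugepagesz=1G hugepages=2" on an architecture
 * where 1G pages are gigantic makes __alloc_bootmem_huge_page() reserve two
 * 1G-aligned regions from memblock early in boot; this routine later turns
 * each of them into a proper compound hugetlb page and feeds it to the pool.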
2908 */ 2909 static void __init gather_bootmem_prealloc(void) 2910 { 2911 struct huge_bootmem_page *m; 2912 2913 list_for_each_entry(m, &huge_boot_pages, list) { 2914 struct page *page = virt_to_page(m); 2915 struct hstate *h = m->hstate; 2916 2917 VM_BUG_ON(!hstate_is_gigantic(h)); 2918 WARN_ON(page_count(page) != 1); 2919 if (prep_compound_gigantic_page(page, huge_page_order(h))) { 2920 WARN_ON(PageReserved(page)); 2921 prep_new_huge_page(h, page, page_to_nid(page)); 2922 put_page(page); /* add to the hugepage allocator */ 2923 } else { 2924 /* VERY unlikely inflated ref count on a tail page */ 2925 free_gigantic_page(page, huge_page_order(h)); 2926 } 2927 2928 /* 2929 * We need to restore the 'stolen' pages to totalram_pages 2930 * in order to fix confusing memory reports from free(1) and 2931 * other side-effects, like CommitLimit going negative. 2932 */ 2933 adjust_managed_page_count(page, pages_per_huge_page(h)); 2934 cond_resched(); 2935 } 2936 } 2937 2938 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 2939 { 2940 unsigned long i; 2941 nodemask_t *node_alloc_noretry; 2942 2943 if (!hstate_is_gigantic(h)) { 2944 /* 2945 * Bit mask controlling how hard we retry per-node allocations. 2946 * Ignore errors as lower level routines can deal with 2947 * node_alloc_noretry == NULL. If this kmalloc fails at boot 2948 * time, we are likely in bigger trouble. 2949 */ 2950 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 2951 GFP_KERNEL); 2952 } else { 2953 /* allocations done at boot time */ 2954 node_alloc_noretry = NULL; 2955 } 2956 2957 /* bit mask controlling how hard we retry per-node allocations */ 2958 if (node_alloc_noretry) 2959 nodes_clear(*node_alloc_noretry); 2960 2961 for (i = 0; i < h->max_huge_pages; ++i) { 2962 if (hstate_is_gigantic(h)) { 2963 if (hugetlb_cma_size) { 2964 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); 2965 goto free; 2966 } 2967 if (!alloc_bootmem_huge_page(h)) 2968 break; 2969 } else if (!alloc_pool_huge_page(h, 2970 &node_states[N_MEMORY], 2971 node_alloc_noretry)) 2972 break; 2973 cond_resched(); 2974 } 2975 if (i < h->max_huge_pages) { 2976 char buf[32]; 2977 2978 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 2979 pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", 2980 h->max_huge_pages, buf, i); 2981 h->max_huge_pages = i; 2982 } 2983 free: 2984 kfree(node_alloc_noretry); 2985 } 2986 2987 static void __init hugetlb_init_hstates(void) 2988 { 2989 struct hstate *h; 2990 2991 for_each_hstate(h) { 2992 if (minimum_order > huge_page_order(h)) 2993 minimum_order = huge_page_order(h); 2994 2995 /* oversize hugepages were init'ed in early boot */ 2996 if (!hstate_is_gigantic(h)) 2997 hugetlb_hstate_alloc_pages(h); 2998 } 2999 VM_BUG_ON(minimum_order == UINT_MAX); 3000 } 3001 3002 static void __init report_hugepages(void) 3003 { 3004 struct hstate *h; 3005 3006 for_each_hstate(h) { 3007 char buf[32]; 3008 3009 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 3010 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", 3011 buf, h->free_huge_pages); 3012 } 3013 } 3014 3015 #ifdef CONFIG_HIGHMEM 3016 static void try_to_free_low(struct hstate *h, unsigned long count, 3017 nodemask_t *nodes_allowed) 3018 { 3019 int i; 3020 LIST_HEAD(page_list); 3021 3022 lockdep_assert_held(&hugetlb_lock); 3023 if (hstate_is_gigantic(h)) 3024 return; 3025 3026 /* 3027 * Collect pages to be freed on a list, and free after dropping lock 3028 */ 3029 for_each_node_mask(i, *nodes_allowed) { 3030 struct page *page, *next; 3031 struct list_head *freel = &h->hugepage_freelists[i]; 3032 list_for_each_entry_safe(page, next, freel, lru) { 3033 if (count >= h->nr_huge_pages) 3034 goto out; 3035 if (PageHighMem(page)) 3036 continue; 3037 remove_hugetlb_page(h, page, false); 3038 list_add(&page->lru, &page_list); 3039 } 3040 } 3041 3042 out: 3043 spin_unlock_irq(&hugetlb_lock); 3044 update_and_free_pages_bulk(h, &page_list); 3045 spin_lock_irq(&hugetlb_lock); 3046 } 3047 #else 3048 static inline void try_to_free_low(struct hstate *h, unsigned long count, 3049 nodemask_t *nodes_allowed) 3050 { 3051 } 3052 #endif 3053 3054 /* 3055 * Increment or decrement surplus_huge_pages. Keep node-specific counters 3056 * balanced by operating on them in a round-robin fashion. 3057 * Returns 1 if an adjustment was made. 3058 */ 3059 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 3060 int delta) 3061 { 3062 int nr_nodes, node; 3063 3064 lockdep_assert_held(&hugetlb_lock); 3065 VM_BUG_ON(delta != -1 && delta != 1); 3066 3067 if (delta < 0) { 3068 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 3069 if (h->surplus_huge_pages_node[node]) 3070 goto found; 3071 } 3072 } else { 3073 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 3074 if (h->surplus_huge_pages_node[node] < 3075 h->nr_huge_pages_node[node]) 3076 goto found; 3077 } 3078 } 3079 return 0; 3080 3081 found: 3082 h->surplus_huge_pages += delta; 3083 h->surplus_huge_pages_node[node] += delta; 3084 return 1; 3085 } 3086 3087 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 3088 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 3089 nodemask_t *nodes_allowed) 3090 { 3091 unsigned long min_count, ret; 3092 struct page *page; 3093 LIST_HEAD(page_list); 3094 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 3095 3096 /* 3097 * Bit mask controlling how hard we retry per-node allocations. 3098 * If we can not allocate the bit mask, do not attempt to allocate 3099 * the requested huge pages. 
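 *
 * For context, this function backs both the nr_hugepages sysctl and the
 * sysfs knobs, e.g.
 *
 *      echo 64 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 *
 * arrives here via nr_hugepages_store() -> __nr_hugepages_store_common()
 * with count == 64 and nid == NUMA_NO_NODE.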
3100 */ 3101 if (node_alloc_noretry) 3102 nodes_clear(*node_alloc_noretry); 3103 else 3104 return -ENOMEM; 3105 3106 /* 3107 * resize_lock mutex prevents concurrent adjustments to number of 3108 * pages in hstate via the proc/sysfs interfaces. 3109 */ 3110 mutex_lock(&h->resize_lock); 3111 flush_free_hpage_work(h); 3112 spin_lock_irq(&hugetlb_lock); 3113 3114 /* 3115 * Check for a node specific request. 3116 * Changing node specific huge page count may require a corresponding 3117 * change to the global count. In any case, the passed node mask 3118 * (nodes_allowed) will restrict alloc/free to the specified node. 3119 */ 3120 if (nid != NUMA_NO_NODE) { 3121 unsigned long old_count = count; 3122 3123 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 3124 /* 3125 * User may have specified a large count value which caused the 3126 * above calculation to overflow. In this case, they wanted 3127 * to allocate as many huge pages as possible. Set count to 3128 * largest possible value to align with their intention. 3129 */ 3130 if (count < old_count) 3131 count = ULONG_MAX; 3132 } 3133 3134 /* 3135 * Gigantic pages runtime allocation depend on the capability for large 3136 * page range allocation. 3137 * If the system does not provide this feature, return an error when 3138 * the user tries to allocate gigantic pages but let the user free the 3139 * boottime allocated gigantic pages. 3140 */ 3141 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 3142 if (count > persistent_huge_pages(h)) { 3143 spin_unlock_irq(&hugetlb_lock); 3144 mutex_unlock(&h->resize_lock); 3145 NODEMASK_FREE(node_alloc_noretry); 3146 return -EINVAL; 3147 } 3148 /* Fall through to decrease pool */ 3149 } 3150 3151 /* 3152 * Increase the pool size 3153 * First take pages out of surplus state. Then make up the 3154 * remaining difference by allocating fresh huge pages. 3155 * 3156 * We might race with alloc_surplus_huge_page() here and be unable 3157 * to convert a surplus huge page to a normal huge page. That is 3158 * not critical, though, it just means the overall size of the 3159 * pool might be one hugepage larger than it needs to be, but 3160 * within all the constraints specified by the sysctls. 3161 */ 3162 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 3163 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 3164 break; 3165 } 3166 3167 while (count > persistent_huge_pages(h)) { 3168 /* 3169 * If this allocation races such that we no longer need the 3170 * page, free_huge_page will handle it by freeing the page 3171 * and reducing the surplus. 3172 */ 3173 spin_unlock_irq(&hugetlb_lock); 3174 3175 /* yield cpu to avoid soft lockup */ 3176 cond_resched(); 3177 3178 ret = alloc_pool_huge_page(h, nodes_allowed, 3179 node_alloc_noretry); 3180 spin_lock_irq(&hugetlb_lock); 3181 if (!ret) 3182 goto out; 3183 3184 /* Bail for signals. Probably ctrl-c from user */ 3185 if (signal_pending(current)) 3186 goto out; 3187 } 3188 3189 /* 3190 * Decrease the pool size 3191 * First return free pages to the buddy allocator (being careful 3192 * to keep enough around to satisfy reservations). Then place 3193 * pages into surplus state as needed so the pool will shrink 3194 * to the desired size as pages become free. 3195 * 3196 * By placing pages into the surplus state independent of the 3197 * overcommit value, we are allowing the surplus pool size to 3198 * exceed overcommit. There are few sane options here. 
Since 3199 * alloc_surplus_huge_page() is checking the global counter, 3200 * though, we'll note that we're not allowed to exceed surplus 3201 * and won't grow the pool anywhere else. Not until one of the 3202 * sysctls are changed, or the surplus pages go out of use. 3203 */ 3204 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 3205 min_count = max(count, min_count); 3206 try_to_free_low(h, min_count, nodes_allowed); 3207 3208 /* 3209 * Collect pages to be removed on list without dropping lock 3210 */ 3211 while (min_count < persistent_huge_pages(h)) { 3212 page = remove_pool_huge_page(h, nodes_allowed, 0); 3213 if (!page) 3214 break; 3215 3216 list_add(&page->lru, &page_list); 3217 } 3218 /* free the pages after dropping lock */ 3219 spin_unlock_irq(&hugetlb_lock); 3220 update_and_free_pages_bulk(h, &page_list); 3221 flush_free_hpage_work(h); 3222 spin_lock_irq(&hugetlb_lock); 3223 3224 while (count < persistent_huge_pages(h)) { 3225 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 3226 break; 3227 } 3228 out: 3229 h->max_huge_pages = persistent_huge_pages(h); 3230 spin_unlock_irq(&hugetlb_lock); 3231 mutex_unlock(&h->resize_lock); 3232 3233 NODEMASK_FREE(node_alloc_noretry); 3234 3235 return 0; 3236 } 3237 3238 #define HSTATE_ATTR_RO(_name) \ 3239 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 3240 3241 #define HSTATE_ATTR(_name) \ 3242 static struct kobj_attribute _name##_attr = \ 3243 __ATTR(_name, 0644, _name##_show, _name##_store) 3244 3245 static struct kobject *hugepages_kobj; 3246 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3247 3248 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 3249 3250 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 3251 { 3252 int i; 3253 3254 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3255 if (hstate_kobjs[i] == kobj) { 3256 if (nidp) 3257 *nidp = NUMA_NO_NODE; 3258 return &hstates[i]; 3259 } 3260 3261 return kobj_to_node_hstate(kobj, nidp); 3262 } 3263 3264 static ssize_t nr_hugepages_show_common(struct kobject *kobj, 3265 struct kobj_attribute *attr, char *buf) 3266 { 3267 struct hstate *h; 3268 unsigned long nr_huge_pages; 3269 int nid; 3270 3271 h = kobj_to_hstate(kobj, &nid); 3272 if (nid == NUMA_NO_NODE) 3273 nr_huge_pages = h->nr_huge_pages; 3274 else 3275 nr_huge_pages = h->nr_huge_pages_node[nid]; 3276 3277 return sysfs_emit(buf, "%lu\n", nr_huge_pages); 3278 } 3279 3280 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 3281 struct hstate *h, int nid, 3282 unsigned long count, size_t len) 3283 { 3284 int err; 3285 nodemask_t nodes_allowed, *n_mask; 3286 3287 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 3288 return -EINVAL; 3289 3290 if (nid == NUMA_NO_NODE) { 3291 /* 3292 * global hstate attribute 3293 */ 3294 if (!(obey_mempolicy && 3295 init_nodemask_of_mempolicy(&nodes_allowed))) 3296 n_mask = &node_states[N_MEMORY]; 3297 else 3298 n_mask = &nodes_allowed; 3299 } else { 3300 /* 3301 * Node specific request. count adjustment happens in 3302 * set_max_huge_pages() after acquiring hugetlb_lock. 3303 */ 3304 init_nodemask_of_node(&nodes_allowed, nid); 3305 n_mask = &nodes_allowed; 3306 } 3307 3308 err = set_max_huge_pages(h, count, nid, n_mask); 3309 3310 return err ? 
err : len; 3311 } 3312 3313 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 3314 struct kobject *kobj, const char *buf, 3315 size_t len) 3316 { 3317 struct hstate *h; 3318 unsigned long count; 3319 int nid; 3320 int err; 3321 3322 err = kstrtoul(buf, 10, &count); 3323 if (err) 3324 return err; 3325 3326 h = kobj_to_hstate(kobj, &nid); 3327 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 3328 } 3329 3330 static ssize_t nr_hugepages_show(struct kobject *kobj, 3331 struct kobj_attribute *attr, char *buf) 3332 { 3333 return nr_hugepages_show_common(kobj, attr, buf); 3334 } 3335 3336 static ssize_t nr_hugepages_store(struct kobject *kobj, 3337 struct kobj_attribute *attr, const char *buf, size_t len) 3338 { 3339 return nr_hugepages_store_common(false, kobj, buf, len); 3340 } 3341 HSTATE_ATTR(nr_hugepages); 3342 3343 #ifdef CONFIG_NUMA 3344 3345 /* 3346 * hstate attribute for optionally mempolicy-based constraint on persistent 3347 * huge page alloc/free. 3348 */ 3349 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 3350 struct kobj_attribute *attr, 3351 char *buf) 3352 { 3353 return nr_hugepages_show_common(kobj, attr, buf); 3354 } 3355 3356 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 3357 struct kobj_attribute *attr, const char *buf, size_t len) 3358 { 3359 return nr_hugepages_store_common(true, kobj, buf, len); 3360 } 3361 HSTATE_ATTR(nr_hugepages_mempolicy); 3362 #endif 3363 3364 3365 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 3366 struct kobj_attribute *attr, char *buf) 3367 { 3368 struct hstate *h = kobj_to_hstate(kobj, NULL); 3369 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); 3370 } 3371 3372 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 3373 struct kobj_attribute *attr, const char *buf, size_t count) 3374 { 3375 int err; 3376 unsigned long input; 3377 struct hstate *h = kobj_to_hstate(kobj, NULL); 3378 3379 if (hstate_is_gigantic(h)) 3380 return -EINVAL; 3381 3382 err = kstrtoul(buf, 10, &input); 3383 if (err) 3384 return err; 3385 3386 spin_lock_irq(&hugetlb_lock); 3387 h->nr_overcommit_huge_pages = input; 3388 spin_unlock_irq(&hugetlb_lock); 3389 3390 return count; 3391 } 3392 HSTATE_ATTR(nr_overcommit_hugepages); 3393 3394 static ssize_t free_hugepages_show(struct kobject *kobj, 3395 struct kobj_attribute *attr, char *buf) 3396 { 3397 struct hstate *h; 3398 unsigned long free_huge_pages; 3399 int nid; 3400 3401 h = kobj_to_hstate(kobj, &nid); 3402 if (nid == NUMA_NO_NODE) 3403 free_huge_pages = h->free_huge_pages; 3404 else 3405 free_huge_pages = h->free_huge_pages_node[nid]; 3406 3407 return sysfs_emit(buf, "%lu\n", free_huge_pages); 3408 } 3409 HSTATE_ATTR_RO(free_hugepages); 3410 3411 static ssize_t resv_hugepages_show(struct kobject *kobj, 3412 struct kobj_attribute *attr, char *buf) 3413 { 3414 struct hstate *h = kobj_to_hstate(kobj, NULL); 3415 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); 3416 } 3417 HSTATE_ATTR_RO(resv_hugepages); 3418 3419 static ssize_t surplus_hugepages_show(struct kobject *kobj, 3420 struct kobj_attribute *attr, char *buf) 3421 { 3422 struct hstate *h; 3423 unsigned long surplus_huge_pages; 3424 int nid; 3425 3426 h = kobj_to_hstate(kobj, &nid); 3427 if (nid == NUMA_NO_NODE) 3428 surplus_huge_pages = h->surplus_huge_pages; 3429 else 3430 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 3431 3432 return sysfs_emit(buf, "%lu\n", surplus_huge_pages); 3433 } 3434 HSTATE_ATTR_RO(surplus_hugepages); 3435 3436 static 
struct attribute *hstate_attrs[] = { 3437 &nr_hugepages_attr.attr, 3438 &nr_overcommit_hugepages_attr.attr, 3439 &free_hugepages_attr.attr, 3440 &resv_hugepages_attr.attr, 3441 &surplus_hugepages_attr.attr, 3442 #ifdef CONFIG_NUMA 3443 &nr_hugepages_mempolicy_attr.attr, 3444 #endif 3445 NULL, 3446 }; 3447 3448 static const struct attribute_group hstate_attr_group = { 3449 .attrs = hstate_attrs, 3450 }; 3451 3452 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 3453 struct kobject **hstate_kobjs, 3454 const struct attribute_group *hstate_attr_group) 3455 { 3456 int retval; 3457 int hi = hstate_index(h); 3458 3459 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 3460 if (!hstate_kobjs[hi]) 3461 return -ENOMEM; 3462 3463 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 3464 if (retval) { 3465 kobject_put(hstate_kobjs[hi]); 3466 hstate_kobjs[hi] = NULL; 3467 } 3468 3469 return retval; 3470 } 3471 3472 static void __init hugetlb_sysfs_init(void) 3473 { 3474 struct hstate *h; 3475 int err; 3476 3477 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 3478 if (!hugepages_kobj) 3479 return; 3480 3481 for_each_hstate(h) { 3482 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 3483 hstate_kobjs, &hstate_attr_group); 3484 if (err) 3485 pr_err("HugeTLB: Unable to add hstate %s", h->name); 3486 } 3487 } 3488 3489 #ifdef CONFIG_NUMA 3490 3491 /* 3492 * node_hstate/s - associate per node hstate attributes, via their kobjects, 3493 * with node devices in node_devices[] using a parallel array. The array 3494 * index of a node device or _hstate == node id. 3495 * This is here to avoid any static dependency of the node device driver, in 3496 * the base kernel, on the hugetlb module. 3497 */ 3498 struct node_hstate { 3499 struct kobject *hugepages_kobj; 3500 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3501 }; 3502 static struct node_hstate node_hstates[MAX_NUMNODES]; 3503 3504 /* 3505 * A subset of global hstate attributes for node devices 3506 */ 3507 static struct attribute *per_node_hstate_attrs[] = { 3508 &nr_hugepages_attr.attr, 3509 &free_hugepages_attr.attr, 3510 &surplus_hugepages_attr.attr, 3511 NULL, 3512 }; 3513 3514 static const struct attribute_group per_node_hstate_attr_group = { 3515 .attrs = per_node_hstate_attrs, 3516 }; 3517 3518 /* 3519 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 3520 * Returns node id via non-NULL nidp. 3521 */ 3522 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3523 { 3524 int nid; 3525 3526 for (nid = 0; nid < nr_node_ids; nid++) { 3527 struct node_hstate *nhs = &node_hstates[nid]; 3528 int i; 3529 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3530 if (nhs->hstate_kobjs[i] == kobj) { 3531 if (nidp) 3532 *nidp = nid; 3533 return &hstates[i]; 3534 } 3535 } 3536 3537 BUG(); 3538 return NULL; 3539 } 3540 3541 /* 3542 * Unregister hstate attributes from a single node device. 3543 * No-op if no hstate attributes attached. 
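 *
 * The attributes in question are the per-node copies registered by
 * hugetlb_register_node() below, e.g.
 *
 *      /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
 *
 * as opposed to the global files under /sys/kernel/mm/hugepages/.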
3544 */ 3545 static void hugetlb_unregister_node(struct node *node) 3546 { 3547 struct hstate *h; 3548 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3549 3550 if (!nhs->hugepages_kobj) 3551 return; /* no hstate attributes */ 3552 3553 for_each_hstate(h) { 3554 int idx = hstate_index(h); 3555 if (nhs->hstate_kobjs[idx]) { 3556 kobject_put(nhs->hstate_kobjs[idx]); 3557 nhs->hstate_kobjs[idx] = NULL; 3558 } 3559 } 3560 3561 kobject_put(nhs->hugepages_kobj); 3562 nhs->hugepages_kobj = NULL; 3563 } 3564 3565 3566 /* 3567 * Register hstate attributes for a single node device. 3568 * No-op if attributes already registered. 3569 */ 3570 static void hugetlb_register_node(struct node *node) 3571 { 3572 struct hstate *h; 3573 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3574 int err; 3575 3576 if (nhs->hugepages_kobj) 3577 return; /* already allocated */ 3578 3579 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 3580 &node->dev.kobj); 3581 if (!nhs->hugepages_kobj) 3582 return; 3583 3584 for_each_hstate(h) { 3585 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 3586 nhs->hstate_kobjs, 3587 &per_node_hstate_attr_group); 3588 if (err) { 3589 pr_err("HugeTLB: Unable to add hstate %s for node %d\n", 3590 h->name, node->dev.id); 3591 hugetlb_unregister_node(node); 3592 break; 3593 } 3594 } 3595 } 3596 3597 /* 3598 * hugetlb init time: register hstate attributes for all registered node 3599 * devices of nodes that have memory. All on-line nodes should have 3600 * registered their associated device by this time. 3601 */ 3602 static void __init hugetlb_register_all_nodes(void) 3603 { 3604 int nid; 3605 3606 for_each_node_state(nid, N_MEMORY) { 3607 struct node *node = node_devices[nid]; 3608 if (node->dev.id == nid) 3609 hugetlb_register_node(node); 3610 } 3611 3612 /* 3613 * Let the node device driver know we're here so it can 3614 * [un]register hstate attributes on node hotplug. 3615 */ 3616 register_hugetlbfs_with_node(hugetlb_register_node, 3617 hugetlb_unregister_node); 3618 } 3619 #else /* !CONFIG_NUMA */ 3620 3621 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3622 { 3623 BUG(); 3624 if (nidp) 3625 *nidp = -1; 3626 return NULL; 3627 } 3628 3629 static void hugetlb_register_all_nodes(void) { } 3630 3631 #endif 3632 3633 static int __init hugetlb_init(void) 3634 { 3635 int i; 3636 3637 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < 3638 __NR_HPAGEFLAGS); 3639 3640 if (!hugepages_supported()) { 3641 if (hugetlb_max_hstate || default_hstate_max_huge_pages) 3642 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); 3643 return 0; 3644 } 3645 3646 /* 3647 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some 3648 * architectures depend on setup being done here. 3649 */ 3650 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 3651 if (!parsed_default_hugepagesz) { 3652 /* 3653 * If we did not parse a default huge page size, set 3654 * default_hstate_idx to HPAGE_SIZE hstate. And, if the 3655 * number of huge pages for this default size was implicitly 3656 * specified, set that here as well. 3657 * Note that the implicit setting will overwrite an explicit 3658 * setting. A warning will be printed in this case. 
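 *
 * For example, booting with just "hugepages=512" (and no hugepagesz= or
 * default_hugepagesz= at all) takes this branch: the HPAGE_SIZE hstate
 * becomes the default and its max_huge_pages is set to 512 from
 * default_hstate_max_huge_pages below.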
 */
                default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
                if (default_hstate_max_huge_pages) {
                        if (default_hstate.max_huge_pages) {
                                char buf[32];

                                string_get_size(huge_page_size(&default_hstate),
                                        1, STRING_UNITS_2, buf, 32);
                                pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
                                        default_hstate.max_huge_pages, buf);
                                pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
                                        default_hstate_max_huge_pages);
                        }
                        default_hstate.max_huge_pages =
                                default_hstate_max_huge_pages;
                }
        }

        hugetlb_cma_check();
        hugetlb_init_hstates();
        gather_bootmem_prealloc();
        report_hugepages();

        hugetlb_sysfs_init();
        hugetlb_register_all_nodes();
        hugetlb_cgroup_file_init();

#ifdef CONFIG_SMP
        num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
        num_fault_mutexes = 1;
#endif
        hugetlb_fault_mutex_table =
                kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
                              GFP_KERNEL);
        BUG_ON(!hugetlb_fault_mutex_table);

        for (i = 0; i < num_fault_mutexes; i++)
                mutex_init(&hugetlb_fault_mutex_table[i]);
        return 0;
}
subsys_initcall(hugetlb_init);

/* Overwritten by architectures with more huge page sizes */
bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
        return size == HPAGE_SIZE;
}

void __init hugetlb_add_hstate(unsigned int order)
{
        struct hstate *h;
        unsigned long i;

        if (size_to_hstate(PAGE_SIZE << order)) {
                return;
        }
        BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
        BUG_ON(order == 0);
        h = &hstates[hugetlb_max_hstate++];
        mutex_init(&h->resize_lock);
        h->order = order;
        h->mask = ~(huge_page_size(h) - 1);
        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&h->hugepage_freelists[i]);
        INIT_LIST_HEAD(&h->hugepage_activelist);
        h->next_nid_to_alloc = first_memory_node;
        h->next_nid_to_free = first_memory_node;
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
        hugetlb_vmemmap_init(h);

        parsed_hstate = h;
}

/*
 * hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
 * specification. If not, ignore the hugepages value. hugepages can also
 * be the first huge page command line option, in which case it implicitly
 * specifies the number of huge pages for the default size.
 */
static int __init hugepages_setup(char *s)
{
        unsigned long *mhp;
        static unsigned long *last_mhp;

        if (!parsed_valid_hugepagesz) {
                pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
                parsed_valid_hugepagesz = true;
                return 0;
        }

        /*
         * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
         * yet, so this hugepages= parameter goes to the "default hstate".
         * Otherwise, it goes with the previously parsed hugepagesz or
         * default_hugepagesz.
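         *
         * For example, with "hugepagesz=1G hugepages=2 hugepagesz=2M hugepages=512"
         * the first hugepages= value applies to the 1G hstate and the second
         * to the 2M hstate, whereas a bare leading "hugepages=N" would be
         * stored in default_hstate_max_huge_pages instead.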
3757 */ 3758 else if (!hugetlb_max_hstate) 3759 mhp = &default_hstate_max_huge_pages; 3760 else 3761 mhp = &parsed_hstate->max_huge_pages; 3762 3763 if (mhp == last_mhp) { 3764 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); 3765 return 0; 3766 } 3767 3768 if (sscanf(s, "%lu", mhp) <= 0) 3769 *mhp = 0; 3770 3771 /* 3772 * Global state is always initialized later in hugetlb_init. 3773 * But we need to allocate gigantic hstates here early to still 3774 * use the bootmem allocator. 3775 */ 3776 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) 3777 hugetlb_hstate_alloc_pages(parsed_hstate); 3778 3779 last_mhp = mhp; 3780 3781 return 1; 3782 } 3783 __setup("hugepages=", hugepages_setup); 3784 3785 /* 3786 * hugepagesz command line processing 3787 * A specific huge page size can only be specified once with hugepagesz. 3788 * hugepagesz is followed by hugepages on the command line. The global 3789 * variable 'parsed_valid_hugepagesz' is used to determine if prior 3790 * hugepagesz argument was valid. 3791 */ 3792 static int __init hugepagesz_setup(char *s) 3793 { 3794 unsigned long size; 3795 struct hstate *h; 3796 3797 parsed_valid_hugepagesz = false; 3798 size = (unsigned long)memparse(s, NULL); 3799 3800 if (!arch_hugetlb_valid_size(size)) { 3801 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); 3802 return 0; 3803 } 3804 3805 h = size_to_hstate(size); 3806 if (h) { 3807 /* 3808 * hstate for this size already exists. This is normally 3809 * an error, but is allowed if the existing hstate is the 3810 * default hstate. More specifically, it is only allowed if 3811 * the number of huge pages for the default hstate was not 3812 * previously specified. 3813 */ 3814 if (!parsed_default_hugepagesz || h != &default_hstate || 3815 default_hstate.max_huge_pages) { 3816 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); 3817 return 0; 3818 } 3819 3820 /* 3821 * No need to call hugetlb_add_hstate() as hstate already 3822 * exists. But, do set parsed_hstate so that a following 3823 * hugepages= parameter will be applied to this hstate. 3824 */ 3825 parsed_hstate = h; 3826 parsed_valid_hugepagesz = true; 3827 return 1; 3828 } 3829 3830 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 3831 parsed_valid_hugepagesz = true; 3832 return 1; 3833 } 3834 __setup("hugepagesz=", hugepagesz_setup); 3835 3836 /* 3837 * default_hugepagesz command line input 3838 * Only one instance of default_hugepagesz allowed on command line. 3839 */ 3840 static int __init default_hugepagesz_setup(char *s) 3841 { 3842 unsigned long size; 3843 3844 parsed_valid_hugepagesz = false; 3845 if (parsed_default_hugepagesz) { 3846 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); 3847 return 0; 3848 } 3849 3850 size = (unsigned long)memparse(s, NULL); 3851 3852 if (!arch_hugetlb_valid_size(size)) { 3853 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); 3854 return 0; 3855 } 3856 3857 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 3858 parsed_valid_hugepagesz = true; 3859 parsed_default_hugepagesz = true; 3860 default_hstate_idx = hstate_index(size_to_hstate(size)); 3861 3862 /* 3863 * The number of default huge pages (for this size) could have been 3864 * specified as the first hugetlb parameter: hugepages=X. If so, 3865 * then default_hstate_max_huge_pages is set. If the default huge 3866 * page size is gigantic (>= MAX_ORDER), then the pages must be 3867 * allocated here from bootmem allocator. 
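 * For example (illustrative): with "hugepages=2 default_hugepagesz=1G" the two gigantic pages are allocated right here, while the bootmem allocator is still available (see the similar comment in hugepages_setup()).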
3868 */ 3869 if (default_hstate_max_huge_pages) { 3870 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 3871 if (hstate_is_gigantic(&default_hstate)) 3872 hugetlb_hstate_alloc_pages(&default_hstate); 3873 default_hstate_max_huge_pages = 0; 3874 } 3875 3876 return 1; 3877 } 3878 __setup("default_hugepagesz=", default_hugepagesz_setup); 3879 3880 static unsigned int allowed_mems_nr(struct hstate *h) 3881 { 3882 int node; 3883 unsigned int nr = 0; 3884 nodemask_t *mpol_allowed; 3885 unsigned int *array = h->free_huge_pages_node; 3886 gfp_t gfp_mask = htlb_alloc_mask(h); 3887 3888 mpol_allowed = policy_nodemask_current(gfp_mask); 3889 3890 for_each_node_mask(node, cpuset_current_mems_allowed) { 3891 if (!mpol_allowed || node_isset(node, *mpol_allowed)) 3892 nr += array[node]; 3893 } 3894 3895 return nr; 3896 } 3897 3898 #ifdef CONFIG_SYSCTL 3899 static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, 3900 void *buffer, size_t *length, 3901 loff_t *ppos, unsigned long *out) 3902 { 3903 struct ctl_table dup_table; 3904 3905 /* 3906 * In order to avoid races with __do_proc_doulongvec_minmax(), we 3907 * can duplicate the @table and alter the duplicate of it. 3908 */ 3909 dup_table = *table; 3910 dup_table.data = out; 3911 3912 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); 3913 } 3914 3915 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 3916 struct ctl_table *table, int write, 3917 void *buffer, size_t *length, loff_t *ppos) 3918 { 3919 struct hstate *h = &default_hstate; 3920 unsigned long tmp = h->max_huge_pages; 3921 int ret; 3922 3923 if (!hugepages_supported()) 3924 return -EOPNOTSUPP; 3925 3926 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 3927 &tmp); 3928 if (ret) 3929 goto out; 3930 3931 if (write) 3932 ret = __nr_hugepages_store_common(obey_mempolicy, h, 3933 NUMA_NO_NODE, tmp, *length); 3934 out: 3935 return ret; 3936 } 3937 3938 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 3939 void *buffer, size_t *length, loff_t *ppos) 3940 { 3941 3942 return hugetlb_sysctl_handler_common(false, table, write, 3943 buffer, length, ppos); 3944 } 3945 3946 #ifdef CONFIG_NUMA 3947 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 3948 void *buffer, size_t *length, loff_t *ppos) 3949 { 3950 return hugetlb_sysctl_handler_common(true, table, write, 3951 buffer, length, ppos); 3952 } 3953 #endif /* CONFIG_NUMA */ 3954 3955 int hugetlb_overcommit_handler(struct ctl_table *table, int write, 3956 void *buffer, size_t *length, loff_t *ppos) 3957 { 3958 struct hstate *h = &default_hstate; 3959 unsigned long tmp; 3960 int ret; 3961 3962 if (!hugepages_supported()) 3963 return -EOPNOTSUPP; 3964 3965 tmp = h->nr_overcommit_huge_pages; 3966 3967 if (write && hstate_is_gigantic(h)) 3968 return -EINVAL; 3969 3970 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 3971 &tmp); 3972 if (ret) 3973 goto out; 3974 3975 if (write) { 3976 spin_lock_irq(&hugetlb_lock); 3977 h->nr_overcommit_huge_pages = tmp; 3978 spin_unlock_irq(&hugetlb_lock); 3979 } 3980 out: 3981 return ret; 3982 } 3983 3984 #endif /* CONFIG_SYSCTL */ 3985 3986 void hugetlb_report_meminfo(struct seq_file *m) 3987 { 3988 struct hstate *h; 3989 unsigned long total = 0; 3990 3991 if (!hugepages_supported()) 3992 return; 3993 3994 for_each_hstate(h) { 3995 unsigned long count = h->nr_huge_pages; 3996 3997 total += huge_page_size(h) * count; 3998 3999 if (h == &default_hstate) 4000 seq_printf(m, 4001 
"HugePages_Total: %5lu\n" 4002 "HugePages_Free: %5lu\n" 4003 "HugePages_Rsvd: %5lu\n" 4004 "HugePages_Surp: %5lu\n" 4005 "Hugepagesize: %8lu kB\n", 4006 count, 4007 h->free_huge_pages, 4008 h->resv_huge_pages, 4009 h->surplus_huge_pages, 4010 huge_page_size(h) / SZ_1K); 4011 } 4012 4013 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); 4014 } 4015 4016 int hugetlb_report_node_meminfo(char *buf, int len, int nid) 4017 { 4018 struct hstate *h = &default_hstate; 4019 4020 if (!hugepages_supported()) 4021 return 0; 4022 4023 return sysfs_emit_at(buf, len, 4024 "Node %d HugePages_Total: %5u\n" 4025 "Node %d HugePages_Free: %5u\n" 4026 "Node %d HugePages_Surp: %5u\n", 4027 nid, h->nr_huge_pages_node[nid], 4028 nid, h->free_huge_pages_node[nid], 4029 nid, h->surplus_huge_pages_node[nid]); 4030 } 4031 4032 void hugetlb_show_meminfo(void) 4033 { 4034 struct hstate *h; 4035 int nid; 4036 4037 if (!hugepages_supported()) 4038 return; 4039 4040 for_each_node_state(nid, N_MEMORY) 4041 for_each_hstate(h) 4042 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 4043 nid, 4044 h->nr_huge_pages_node[nid], 4045 h->free_huge_pages_node[nid], 4046 h->surplus_huge_pages_node[nid], 4047 huge_page_size(h) / SZ_1K); 4048 } 4049 4050 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) 4051 { 4052 seq_printf(m, "HugetlbPages:\t%8lu kB\n", 4053 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); 4054 } 4055 4056 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 4057 unsigned long hugetlb_total_pages(void) 4058 { 4059 struct hstate *h; 4060 unsigned long nr_total_pages = 0; 4061 4062 for_each_hstate(h) 4063 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 4064 return nr_total_pages; 4065 } 4066 4067 static int hugetlb_acct_memory(struct hstate *h, long delta) 4068 { 4069 int ret = -ENOMEM; 4070 4071 if (!delta) 4072 return 0; 4073 4074 spin_lock_irq(&hugetlb_lock); 4075 /* 4076 * When cpuset is configured, it breaks the strict hugetlb page 4077 * reservation as the accounting is done on a global variable. Such 4078 * reservation is completely rubbish in the presence of cpuset because 4079 * the reservation is not checked against page availability for the 4080 * current cpuset. Application can still potentially OOM'ed by kernel 4081 * with lack of free htlb page in cpuset that the task is in. 4082 * Attempt to enforce strict accounting with cpuset is almost 4083 * impossible (or too ugly) because cpuset is too fluid that 4084 * task or memory node can be dynamically moved between cpusets. 4085 * 4086 * The change of semantics for shared hugetlb mapping with cpuset is 4087 * undesirable. However, in order to preserve some of the semantics, 4088 * we fall back to check against current free page availability as 4089 * a best attempt and hopefully to minimize the impact of changing 4090 * semantics that cpuset has. 4091 * 4092 * Apart from cpuset, we also have memory policy mechanism that 4093 * also determines from which node the kernel will allocate memory 4094 * in a NUMA system. So similar to cpuset, we also should consider 4095 * the memory policy of the current task. Similar to the description 4096 * above. 
4097 */ 4098 if (delta > 0) { 4099 if (gather_surplus_pages(h, delta) < 0) 4100 goto out; 4101 4102 if (delta > allowed_mems_nr(h)) { 4103 return_unused_surplus_pages(h, delta); 4104 goto out; 4105 } 4106 } 4107 4108 ret = 0; 4109 if (delta < 0) 4110 return_unused_surplus_pages(h, (unsigned long) -delta); 4111 4112 out: 4113 spin_unlock_irq(&hugetlb_lock); 4114 return ret; 4115 } 4116 4117 static void hugetlb_vm_op_open(struct vm_area_struct *vma) 4118 { 4119 struct resv_map *resv = vma_resv_map(vma); 4120 4121 /* 4122 * This new VMA should share its sibling's reservation map if present. 4123 * The VMA will only ever have a valid reservation map pointer where 4124 * it is being copied for another still existing VMA. As that VMA 4125 * has a reference to the reservation map it cannot disappear until 4126 * after this open call completes. It is therefore safe to take a 4127 * new reference here without additional locking. 4128 */ 4129 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 4130 resv_map_dup_hugetlb_cgroup_uncharge_info(resv); 4131 kref_get(&resv->refs); 4132 } 4133 } 4134 4135 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 4136 { 4137 struct hstate *h = hstate_vma(vma); 4138 struct resv_map *resv = vma_resv_map(vma); 4139 struct hugepage_subpool *spool = subpool_vma(vma); 4140 unsigned long reserve, start, end; 4141 long gbl_reserve; 4142 4143 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 4144 return; 4145 4146 start = vma_hugecache_offset(h, vma, vma->vm_start); 4147 end = vma_hugecache_offset(h, vma, vma->vm_end); 4148 4149 reserve = (end - start) - region_count(resv, start, end); 4150 hugetlb_cgroup_uncharge_counter(resv, start, end); 4151 if (reserve) { 4152 /* 4153 * Decrement reserve counts. The global reserve count may be 4154 * adjusted if the subpool has a minimum size. 4155 */ 4156 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 4157 hugetlb_acct_memory(h, -gbl_reserve); 4158 } 4159 4160 kref_put(&resv->refs, resv_map_release); 4161 } 4162 4163 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) 4164 { 4165 if (addr & ~(huge_page_mask(hstate_vma(vma)))) 4166 return -EINVAL; 4167 return 0; 4168 } 4169 4170 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) 4171 { 4172 return huge_page_size(hstate_vma(vma)); 4173 } 4174 4175 /* 4176 * We cannot handle pagefaults against hugetlb pages at all. They cause 4177 * handle_mm_fault() to try to instantiate regular-sized pages in the 4178 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get 4179 * this far. 4180 */ 4181 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) 4182 { 4183 BUG(); 4184 return 0; 4185 } 4186 4187 /* 4188 * When a new function is introduced to vm_operations_struct and added 4189 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 4190 * This is because, under the System V memory model, mappings created via 4191 * shmget/shmat with "huge page" specified are backed by hugetlbfs files, 4192 * and their original vm_ops are overwritten with shm_vm_ops.
4193 */ 4194 const struct vm_operations_struct hugetlb_vm_ops = { 4195 .fault = hugetlb_vm_op_fault, 4196 .open = hugetlb_vm_op_open, 4197 .close = hugetlb_vm_op_close, 4198 .may_split = hugetlb_vm_op_split, 4199 .pagesize = hugetlb_vm_op_pagesize, 4200 }; 4201 4202 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 4203 int writable) 4204 { 4205 pte_t entry; 4206 unsigned int shift = huge_page_shift(hstate_vma(vma)); 4207 4208 if (writable) { 4209 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 4210 vma->vm_page_prot))); 4211 } else { 4212 entry = huge_pte_wrprotect(mk_huge_pte(page, 4213 vma->vm_page_prot)); 4214 } 4215 entry = pte_mkyoung(entry); 4216 entry = pte_mkhuge(entry); 4217 entry = arch_make_huge_pte(entry, shift, vma->vm_flags); 4218 4219 return entry; 4220 } 4221 4222 static void set_huge_ptep_writable(struct vm_area_struct *vma, 4223 unsigned long address, pte_t *ptep) 4224 { 4225 pte_t entry; 4226 4227 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 4228 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 4229 update_mmu_cache(vma, address, ptep); 4230 } 4231 4232 bool is_hugetlb_entry_migration(pte_t pte) 4233 { 4234 swp_entry_t swp; 4235 4236 if (huge_pte_none(pte) || pte_present(pte)) 4237 return false; 4238 swp = pte_to_swp_entry(pte); 4239 if (is_migration_entry(swp)) 4240 return true; 4241 else 4242 return false; 4243 } 4244 4245 static bool is_hugetlb_entry_hwpoisoned(pte_t pte) 4246 { 4247 swp_entry_t swp; 4248 4249 if (huge_pte_none(pte) || pte_present(pte)) 4250 return false; 4251 swp = pte_to_swp_entry(pte); 4252 if (is_hwpoison_entry(swp)) 4253 return true; 4254 else 4255 return false; 4256 } 4257 4258 static void 4259 hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, 4260 struct page *new_page) 4261 { 4262 __SetPageUptodate(new_page); 4263 set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); 4264 hugepage_add_new_anon_rmap(new_page, vma, addr); 4265 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); 4266 ClearHPageRestoreReserve(new_page); 4267 SetHPageMigratable(new_page); 4268 } 4269 4270 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 4271 struct vm_area_struct *vma) 4272 { 4273 pte_t *src_pte, *dst_pte, entry, dst_entry; 4274 struct page *ptepage; 4275 unsigned long addr; 4276 bool cow = is_cow_mapping(vma->vm_flags); 4277 struct hstate *h = hstate_vma(vma); 4278 unsigned long sz = huge_page_size(h); 4279 unsigned long npages = pages_per_huge_page(h); 4280 struct address_space *mapping = vma->vm_file->f_mapping; 4281 struct mmu_notifier_range range; 4282 int ret = 0; 4283 4284 if (cow) { 4285 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, 4286 vma->vm_start, 4287 vma->vm_end); 4288 mmu_notifier_invalidate_range_start(&range); 4289 } else { 4290 /* 4291 * For shared mappings i_mmap_rwsem must be held to call 4292 * huge_pte_alloc, otherwise the returned ptep could go 4293 * away if part of a shared pmd and another thread calls 4294 * huge_pmd_unshare. 4295 */ 4296 i_mmap_lock_read(mapping); 4297 } 4298 4299 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 4300 spinlock_t *src_ptl, *dst_ptl; 4301 src_pte = huge_pte_offset(src, addr, sz); 4302 if (!src_pte) 4303 continue; 4304 dst_pte = huge_pte_alloc(dst, vma, addr, sz); 4305 if (!dst_pte) { 4306 ret = -ENOMEM; 4307 break; 4308 } 4309 4310 /* 4311 * If the pagetables are shared don't copy or take references. 
4312 * dst_pte == src_pte is the common case of src/dest sharing. 4313 * 4314 * However, src could have 'unshared' and dst shares with 4315 * another vma. If dst_pte !none, this implies sharing. 4316 * Check here before taking page table lock, and once again 4317 * after taking the lock below. 4318 */ 4319 dst_entry = huge_ptep_get(dst_pte); 4320 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) 4321 continue; 4322 4323 dst_ptl = huge_pte_lock(h, dst, dst_pte); 4324 src_ptl = huge_pte_lockptr(h, src, src_pte); 4325 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 4326 entry = huge_ptep_get(src_pte); 4327 dst_entry = huge_ptep_get(dst_pte); 4328 again: 4329 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { 4330 /* 4331 * Skip if src entry none. Also, skip in the 4332 * unlikely case dst entry !none as this implies 4333 * sharing with another vma. 4334 */ 4335 ; 4336 } else if (unlikely(is_hugetlb_entry_migration(entry) || 4337 is_hugetlb_entry_hwpoisoned(entry))) { 4338 swp_entry_t swp_entry = pte_to_swp_entry(entry); 4339 4340 if (is_writable_migration_entry(swp_entry) && cow) { 4341 /* 4342 * COW mappings require pages in both 4343 * parent and child to be set to read. 4344 */ 4345 swp_entry = make_readable_migration_entry( 4346 swp_offset(swp_entry)); 4347 entry = swp_entry_to_pte(swp_entry); 4348 set_huge_swap_pte_at(src, addr, src_pte, 4349 entry, sz); 4350 } 4351 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); 4352 } else { 4353 entry = huge_ptep_get(src_pte); 4354 ptepage = pte_page(entry); 4355 get_page(ptepage); 4356 4357 /* 4358 * This is a rare case where we see pinned hugetlb 4359 * pages while they're prone to COW. We need to do the 4360 * COW earlier during fork. 4361 * 4362 * When pre-allocating the page or copying data, we 4363 * need to be without the pgtable locks since we could 4364 * sleep during the process. 4365 */ 4366 if (unlikely(page_needs_cow_for_dma(vma, ptepage))) { 4367 pte_t src_pte_old = entry; 4368 struct page *new; 4369 4370 spin_unlock(src_ptl); 4371 spin_unlock(dst_ptl); 4372 /* Do not use reserve as it's private owned */ 4373 new = alloc_huge_page(vma, addr, 1); 4374 if (IS_ERR(new)) { 4375 put_page(ptepage); 4376 ret = PTR_ERR(new); 4377 break; 4378 } 4379 copy_user_huge_page(new, ptepage, addr, vma, 4380 npages); 4381 put_page(ptepage); 4382 4383 /* Install the new huge page if src pte stable */ 4384 dst_ptl = huge_pte_lock(h, dst, dst_pte); 4385 src_ptl = huge_pte_lockptr(h, src, src_pte); 4386 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 4387 entry = huge_ptep_get(src_pte); 4388 if (!pte_same(src_pte_old, entry)) { 4389 restore_reserve_on_error(h, vma, addr, 4390 new); 4391 put_page(new); 4392 /* dst_entry won't change as in child */ 4393 goto again; 4394 } 4395 hugetlb_install_page(vma, dst_pte, addr, new); 4396 spin_unlock(src_ptl); 4397 spin_unlock(dst_ptl); 4398 continue; 4399 } 4400 4401 if (cow) { 4402 /* 4403 * No need to notify as we are downgrading page 4404 * table protection not changing it to point 4405 * to a new page. 
4406 * 4407 * See Documentation/vm/mmu_notifier.rst 4408 */ 4409 huge_ptep_set_wrprotect(src, addr, src_pte); 4410 entry = huge_pte_wrprotect(entry); 4411 } 4412 4413 page_dup_rmap(ptepage, true); 4414 set_huge_pte_at(dst, addr, dst_pte, entry); 4415 hugetlb_count_add(npages, dst); 4416 } 4417 spin_unlock(src_ptl); 4418 spin_unlock(dst_ptl); 4419 } 4420 4421 if (cow) 4422 mmu_notifier_invalidate_range_end(&range); 4423 else 4424 i_mmap_unlock_read(mapping); 4425 4426 return ret; 4427 } 4428 4429 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 4430 unsigned long start, unsigned long end, 4431 struct page *ref_page) 4432 { 4433 struct mm_struct *mm = vma->vm_mm; 4434 unsigned long address; 4435 pte_t *ptep; 4436 pte_t pte; 4437 spinlock_t *ptl; 4438 struct page *page; 4439 struct hstate *h = hstate_vma(vma); 4440 unsigned long sz = huge_page_size(h); 4441 struct mmu_notifier_range range; 4442 4443 WARN_ON(!is_vm_hugetlb_page(vma)); 4444 BUG_ON(start & ~huge_page_mask(h)); 4445 BUG_ON(end & ~huge_page_mask(h)); 4446 4447 /* 4448 * This is a hugetlb vma, all the pte entries should point 4449 * to huge page. 4450 */ 4451 tlb_change_page_size(tlb, sz); 4452 tlb_start_vma(tlb, vma); 4453 4454 /* 4455 * If sharing possible, alert mmu notifiers of worst case. 4456 */ 4457 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, 4458 end); 4459 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 4460 mmu_notifier_invalidate_range_start(&range); 4461 address = start; 4462 for (; address < end; address += sz) { 4463 ptep = huge_pte_offset(mm, address, sz); 4464 if (!ptep) 4465 continue; 4466 4467 ptl = huge_pte_lock(h, mm, ptep); 4468 if (huge_pmd_unshare(mm, vma, &address, ptep)) { 4469 spin_unlock(ptl); 4470 /* 4471 * We just unmapped a page of PMDs by clearing a PUD. 4472 * The caller's TLB flush range should cover this area. 4473 */ 4474 continue; 4475 } 4476 4477 pte = huge_ptep_get(ptep); 4478 if (huge_pte_none(pte)) { 4479 spin_unlock(ptl); 4480 continue; 4481 } 4482 4483 /* 4484 * Migrating hugepage or HWPoisoned hugepage is already 4485 * unmapped and its refcount is dropped, so just clear pte here. 4486 */ 4487 if (unlikely(!pte_present(pte))) { 4488 huge_pte_clear(mm, address, ptep, sz); 4489 spin_unlock(ptl); 4490 continue; 4491 } 4492 4493 page = pte_page(pte); 4494 /* 4495 * If a reference page is supplied, it is because a specific 4496 * page is being unmapped, not a range. Ensure the page we 4497 * are about to unmap is the actual page of interest. 
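 * (A non-NULL ref_page is passed by unmap_ref_private() below when tearing down other private mappings of a page that could not be COWed.)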
4498 */ 4499 if (ref_page) { 4500 if (page != ref_page) { 4501 spin_unlock(ptl); 4502 continue; 4503 } 4504 /* 4505 * Mark the VMA as having unmapped its page so that 4506 * future faults in this VMA will fail rather than 4507 * looking like data was lost 4508 */ 4509 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 4510 } 4511 4512 pte = huge_ptep_get_and_clear(mm, address, ptep); 4513 tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 4514 if (huge_pte_dirty(pte)) 4515 set_page_dirty(page); 4516 4517 hugetlb_count_sub(pages_per_huge_page(h), mm); 4518 page_remove_rmap(page, true); 4519 4520 spin_unlock(ptl); 4521 tlb_remove_page_size(tlb, page, huge_page_size(h)); 4522 /* 4523 * Bail out after unmapping reference page if supplied 4524 */ 4525 if (ref_page) 4526 break; 4527 } 4528 mmu_notifier_invalidate_range_end(&range); 4529 tlb_end_vma(tlb, vma); 4530 } 4531 4532 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 4533 struct vm_area_struct *vma, unsigned long start, 4534 unsigned long end, struct page *ref_page) 4535 { 4536 __unmap_hugepage_range(tlb, vma, start, end, ref_page); 4537 4538 /* 4539 * Clear this flag so that x86's huge_pmd_share page_table_shareable 4540 * test will fail on a vma being torn down, and not grab a page table 4541 * on its way out. We're lucky that the flag has such an appropriate 4542 * name, and can in fact be safely cleared here. We could clear it 4543 * before the __unmap_hugepage_range above, but all that's necessary 4544 * is to clear it before releasing the i_mmap_rwsem. This works 4545 * because in the context this is called, the VMA is about to be 4546 * destroyed and the i_mmap_rwsem is held. 4547 */ 4548 vma->vm_flags &= ~VM_MAYSHARE; 4549 } 4550 4551 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 4552 unsigned long end, struct page *ref_page) 4553 { 4554 struct mmu_gather tlb; 4555 4556 tlb_gather_mmu(&tlb, vma->vm_mm); 4557 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 4558 tlb_finish_mmu(&tlb); 4559 } 4560 4561 /* 4562 * This is called when the original mapper is failing to COW a MAP_PRIVATE 4563 * mapping it owns the reserve page for. The intention is to unmap the page 4564 * from other VMAs and let the children be SIGKILLed if they are faulting the 4565 * same region. 4566 */ 4567 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 4568 struct page *page, unsigned long address) 4569 { 4570 struct hstate *h = hstate_vma(vma); 4571 struct vm_area_struct *iter_vma; 4572 struct address_space *mapping; 4573 pgoff_t pgoff; 4574 4575 /* 4576 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 4577 * from page cache lookup which is in HPAGE_SIZE units. 4578 */ 4579 address = address & huge_page_mask(h); 4580 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 4581 vma->vm_pgoff; 4582 mapping = vma->vm_file->f_mapping; 4583 4584 /* 4585 * Take the mapping lock for the duration of the table walk. As 4586 * this mapping should be shared between all the VMAs, 4587 * __unmap_hugepage_range() is called as the lock is already held 4588 */ 4589 i_mmap_lock_write(mapping); 4590 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 4591 /* Do not unmap the current VMA */ 4592 if (iter_vma == vma) 4593 continue; 4594 4595 /* 4596 * Shared VMAs have their own reserves and do not affect 4597 * MAP_PRIVATE accounting but it is possible that a shared 4598 * VMA is using the same page so check and skip such VMAs. 
4599 */ 4600 if (iter_vma->vm_flags & VM_MAYSHARE) 4601 continue; 4602 4603 /* 4604 * Unmap the page from other VMAs without their own reserves. 4605 * They get marked to be SIGKILLed if they fault in these 4606 * areas. This is because a future no-page fault on this VMA 4607 * could insert a zeroed page instead of the data existing 4608 * from the time of fork. This would look like data corruption 4609 */ 4610 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 4611 unmap_hugepage_range(iter_vma, address, 4612 address + huge_page_size(h), page); 4613 } 4614 i_mmap_unlock_write(mapping); 4615 } 4616 4617 /* 4618 * Hugetlb_cow() should be called with page lock of the original hugepage held. 4619 * Called with hugetlb_instantiation_mutex held and pte_page locked so we 4620 * cannot race with other handlers or page migration. 4621 * Keep the pte_same checks anyway to make transition from the mutex easier. 4622 */ 4623 static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 4624 unsigned long address, pte_t *ptep, 4625 struct page *pagecache_page, spinlock_t *ptl) 4626 { 4627 pte_t pte; 4628 struct hstate *h = hstate_vma(vma); 4629 struct page *old_page, *new_page; 4630 int outside_reserve = 0; 4631 vm_fault_t ret = 0; 4632 unsigned long haddr = address & huge_page_mask(h); 4633 struct mmu_notifier_range range; 4634 4635 pte = huge_ptep_get(ptep); 4636 old_page = pte_page(pte); 4637 4638 retry_avoidcopy: 4639 /* If no-one else is actually using this page, avoid the copy 4640 * and just make the page writable */ 4641 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 4642 page_move_anon_rmap(old_page, vma); 4643 set_huge_ptep_writable(vma, haddr, ptep); 4644 return 0; 4645 } 4646 4647 /* 4648 * If the process that created a MAP_PRIVATE mapping is about to 4649 * perform a COW due to a shared page count, attempt to satisfy 4650 * the allocation without using the existing reserves. The pagecache 4651 * page is used to determine if the reserve at this address was 4652 * consumed or not. If reserves were used, a partial faulted mapping 4653 * at the time of fork() could consume its reserves on COW instead 4654 * of the full address range. 4655 */ 4656 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 4657 old_page != pagecache_page) 4658 outside_reserve = 1; 4659 4660 get_page(old_page); 4661 4662 /* 4663 * Drop page table lock as buddy allocator may be called. It will 4664 * be acquired again before returning to the caller, as expected. 4665 */ 4666 spin_unlock(ptl); 4667 new_page = alloc_huge_page(vma, haddr, outside_reserve); 4668 4669 if (IS_ERR(new_page)) { 4670 /* 4671 * If a process owning a MAP_PRIVATE mapping fails to COW, 4672 * it is due to references held by a child and an insufficient 4673 * huge page pool. To guarantee the original mappers 4674 * reliability, unmap the page from child processes. The child 4675 * may get SIGKILLed if it later faults. 4676 */ 4677 if (outside_reserve) { 4678 struct address_space *mapping = vma->vm_file->f_mapping; 4679 pgoff_t idx; 4680 u32 hash; 4681 4682 put_page(old_page); 4683 BUG_ON(huge_pte_none(pte)); 4684 /* 4685 * Drop hugetlb_fault_mutex and i_mmap_rwsem before 4686 * unmapping. unmapping needs to hold i_mmap_rwsem 4687 * in write mode. Dropping i_mmap_rwsem in read mode 4688 * here is OK as COW mappings do not interact with 4689 * PMD sharing. 4690 * 4691 * Reacquire both after unmap operation. 
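 * They are reacquired in the same order the fault path takes them: i_mmap_rwsem in read mode first, then the fault mutex, and finally the page table lock.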
4692 */ 4693 idx = vma_hugecache_offset(h, vma, haddr); 4694 hash = hugetlb_fault_mutex_hash(mapping, idx); 4695 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4696 i_mmap_unlock_read(mapping); 4697 4698 unmap_ref_private(mm, vma, old_page, haddr); 4699 4700 i_mmap_lock_read(mapping); 4701 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4702 spin_lock(ptl); 4703 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4704 if (likely(ptep && 4705 pte_same(huge_ptep_get(ptep), pte))) 4706 goto retry_avoidcopy; 4707 /* 4708 * race occurs while re-acquiring page table 4709 * lock, and our job is done. 4710 */ 4711 return 0; 4712 } 4713 4714 ret = vmf_error(PTR_ERR(new_page)); 4715 goto out_release_old; 4716 } 4717 4718 /* 4719 * When the original hugepage is shared one, it does not have 4720 * anon_vma prepared. 4721 */ 4722 if (unlikely(anon_vma_prepare(vma))) { 4723 ret = VM_FAULT_OOM; 4724 goto out_release_all; 4725 } 4726 4727 copy_user_huge_page(new_page, old_page, address, vma, 4728 pages_per_huge_page(h)); 4729 __SetPageUptodate(new_page); 4730 4731 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, 4732 haddr + huge_page_size(h)); 4733 mmu_notifier_invalidate_range_start(&range); 4734 4735 /* 4736 * Retake the page table lock to check for racing updates 4737 * before the page tables are altered 4738 */ 4739 spin_lock(ptl); 4740 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4741 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 4742 ClearHPageRestoreReserve(new_page); 4743 4744 /* Break COW */ 4745 huge_ptep_clear_flush(vma, haddr, ptep); 4746 mmu_notifier_invalidate_range(mm, range.start, range.end); 4747 set_huge_pte_at(mm, haddr, ptep, 4748 make_huge_pte(vma, new_page, 1)); 4749 page_remove_rmap(old_page, true); 4750 hugepage_add_new_anon_rmap(new_page, vma, haddr); 4751 SetHPageMigratable(new_page); 4752 /* Make the old page be freed below */ 4753 new_page = old_page; 4754 } 4755 spin_unlock(ptl); 4756 mmu_notifier_invalidate_range_end(&range); 4757 out_release_all: 4758 /* No restore in case of successful pagetable update (Break COW) */ 4759 if (new_page != old_page) 4760 restore_reserve_on_error(h, vma, haddr, new_page); 4761 put_page(new_page); 4762 out_release_old: 4763 put_page(old_page); 4764 4765 spin_lock(ptl); /* Caller expects lock to be held */ 4766 return ret; 4767 } 4768 4769 /* Return the pagecache page at a given address within a VMA */ 4770 static struct page *hugetlbfs_pagecache_page(struct hstate *h, 4771 struct vm_area_struct *vma, unsigned long address) 4772 { 4773 struct address_space *mapping; 4774 pgoff_t idx; 4775 4776 mapping = vma->vm_file->f_mapping; 4777 idx = vma_hugecache_offset(h, vma, address); 4778 4779 return find_lock_page(mapping, idx); 4780 } 4781 4782 /* 4783 * Return whether there is a pagecache page to back given address within VMA. 4784 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
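 * Only a transient reference is taken via find_get_page() and dropped again before returning.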
4785 */ 4786 static bool hugetlbfs_pagecache_present(struct hstate *h, 4787 struct vm_area_struct *vma, unsigned long address) 4788 { 4789 struct address_space *mapping; 4790 pgoff_t idx; 4791 struct page *page; 4792 4793 mapping = vma->vm_file->f_mapping; 4794 idx = vma_hugecache_offset(h, vma, address); 4795 4796 page = find_get_page(mapping, idx); 4797 if (page) 4798 put_page(page); 4799 return page != NULL; 4800 } 4801 4802 int huge_add_to_page_cache(struct page *page, struct address_space *mapping, 4803 pgoff_t idx) 4804 { 4805 struct inode *inode = mapping->host; 4806 struct hstate *h = hstate_inode(inode); 4807 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 4808 4809 if (err) 4810 return err; 4811 ClearHPageRestoreReserve(page); 4812 4813 /* 4814 * set page dirty so that it will not be removed from cache/file 4815 * by non-hugetlbfs specific code paths. 4816 */ 4817 set_page_dirty(page); 4818 4819 spin_lock(&inode->i_lock); 4820 inode->i_blocks += blocks_per_huge_page(h); 4821 spin_unlock(&inode->i_lock); 4822 return 0; 4823 } 4824 4825 static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, 4826 struct address_space *mapping, 4827 pgoff_t idx, 4828 unsigned int flags, 4829 unsigned long haddr, 4830 unsigned long reason) 4831 { 4832 vm_fault_t ret; 4833 u32 hash; 4834 struct vm_fault vmf = { 4835 .vma = vma, 4836 .address = haddr, 4837 .flags = flags, 4838 4839 /* 4840 * Hard to debug if it ends up being 4841 * used by a callee that assumes 4842 * something about the other 4843 * uninitialized fields... same as in 4844 * memory.c 4845 */ 4846 }; 4847 4848 /* 4849 * hugetlb_fault_mutex and i_mmap_rwsem must be 4850 * dropped before handling userfault. Reacquire 4851 * after handling fault to make calling code simpler. 4852 */ 4853 hash = hugetlb_fault_mutex_hash(mapping, idx); 4854 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4855 i_mmap_unlock_read(mapping); 4856 ret = handle_userfault(&vmf, reason); 4857 i_mmap_lock_read(mapping); 4858 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4859 4860 return ret; 4861 } 4862 4863 static vm_fault_t hugetlb_no_page(struct mm_struct *mm, 4864 struct vm_area_struct *vma, 4865 struct address_space *mapping, pgoff_t idx, 4866 unsigned long address, pte_t *ptep, unsigned int flags) 4867 { 4868 struct hstate *h = hstate_vma(vma); 4869 vm_fault_t ret = VM_FAULT_SIGBUS; 4870 int anon_rmap = 0; 4871 unsigned long size; 4872 struct page *page; 4873 pte_t new_pte; 4874 spinlock_t *ptl; 4875 unsigned long haddr = address & huge_page_mask(h); 4876 bool new_page, new_pagecache_page = false; 4877 4878 /* 4879 * Currently, we are forced to kill the process in the event the 4880 * original mapper has unmapped pages from the child due to a failed 4881 * COW. Warn that such a situation has occurred as it may not be obvious 4882 */ 4883 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 4884 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 4885 current->pid); 4886 return ret; 4887 } 4888 4889 /* 4890 * We can not race with truncation due to holding i_mmap_rwsem. 4891 * i_size is modified when holding i_mmap_rwsem, so check here 4892 * once for faults beyond end of file. 
4893 */ 4894 size = i_size_read(mapping->host) >> huge_page_shift(h); 4895 if (idx >= size) 4896 goto out; 4897 4898 retry: 4899 new_page = false; 4900 page = find_lock_page(mapping, idx); 4901 if (!page) { 4902 /* Check for page in userfault range */ 4903 if (userfaultfd_missing(vma)) { 4904 ret = hugetlb_handle_userfault(vma, mapping, idx, 4905 flags, haddr, 4906 VM_UFFD_MISSING); 4907 goto out; 4908 } 4909 4910 page = alloc_huge_page(vma, haddr, 0); 4911 if (IS_ERR(page)) { 4912 /* 4913 * Returning error will result in faulting task being 4914 * sent SIGBUS. The hugetlb fault mutex prevents two 4915 * tasks from racing to fault in the same page which 4916 * could result in false unable to allocate errors. 4917 * Page migration does not take the fault mutex, but 4918 * does a clear then write of pte's under page table 4919 * lock. Page fault code could race with migration, 4920 * notice the clear pte and try to allocate a page 4921 * here. Before returning error, get ptl and make 4922 * sure there really is no pte entry. 4923 */ 4924 ptl = huge_pte_lock(h, mm, ptep); 4925 ret = 0; 4926 if (huge_pte_none(huge_ptep_get(ptep))) 4927 ret = vmf_error(PTR_ERR(page)); 4928 spin_unlock(ptl); 4929 goto out; 4930 } 4931 clear_huge_page(page, address, pages_per_huge_page(h)); 4932 __SetPageUptodate(page); 4933 new_page = true; 4934 4935 if (vma->vm_flags & VM_MAYSHARE) { 4936 int err = huge_add_to_page_cache(page, mapping, idx); 4937 if (err) { 4938 put_page(page); 4939 if (err == -EEXIST) 4940 goto retry; 4941 goto out; 4942 } 4943 new_pagecache_page = true; 4944 } else { 4945 lock_page(page); 4946 if (unlikely(anon_vma_prepare(vma))) { 4947 ret = VM_FAULT_OOM; 4948 goto backout_unlocked; 4949 } 4950 anon_rmap = 1; 4951 } 4952 } else { 4953 /* 4954 * If memory error occurs between mmap() and fault, some process 4955 * don't have hwpoisoned swap entry for errored virtual address. 4956 * So we need to block hugepage fault by PG_hwpoison bit check. 4957 */ 4958 if (unlikely(PageHWPoison(page))) { 4959 ret = VM_FAULT_HWPOISON_LARGE | 4960 VM_FAULT_SET_HINDEX(hstate_index(h)); 4961 goto backout_unlocked; 4962 } 4963 4964 /* Check for page in userfault range. */ 4965 if (userfaultfd_minor(vma)) { 4966 unlock_page(page); 4967 put_page(page); 4968 ret = hugetlb_handle_userfault(vma, mapping, idx, 4969 flags, haddr, 4970 VM_UFFD_MINOR); 4971 goto out; 4972 } 4973 } 4974 4975 /* 4976 * If we are going to COW a private mapping later, we examine the 4977 * pending reservations for this page now. This will ensure that 4978 * any allocations necessary to record that reservation occur outside 4979 * the spinlock. 
4980 */ 4981 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 4982 if (vma_needs_reservation(h, vma, haddr) < 0) { 4983 ret = VM_FAULT_OOM; 4984 goto backout_unlocked; 4985 } 4986 /* Just decrements count, does not deallocate */ 4987 vma_end_reservation(h, vma, haddr); 4988 } 4989 4990 ptl = huge_pte_lock(h, mm, ptep); 4991 ret = 0; 4992 if (!huge_pte_none(huge_ptep_get(ptep))) 4993 goto backout; 4994 4995 if (anon_rmap) { 4996 ClearHPageRestoreReserve(page); 4997 hugepage_add_new_anon_rmap(page, vma, haddr); 4998 } else 4999 page_dup_rmap(page, true); 5000 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 5001 && (vma->vm_flags & VM_SHARED))); 5002 set_huge_pte_at(mm, haddr, ptep, new_pte); 5003 5004 hugetlb_count_add(pages_per_huge_page(h), mm); 5005 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 5006 /* Optimization, do the COW without a second fault */ 5007 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); 5008 } 5009 5010 spin_unlock(ptl); 5011 5012 /* 5013 * Only set HPageMigratable in newly allocated pages. Existing pages 5014 * found in the pagecache may not have HPageMigratableset if they have 5015 * been isolated for migration. 5016 */ 5017 if (new_page) 5018 SetHPageMigratable(page); 5019 5020 unlock_page(page); 5021 out: 5022 return ret; 5023 5024 backout: 5025 spin_unlock(ptl); 5026 backout_unlocked: 5027 unlock_page(page); 5028 /* restore reserve for newly allocated pages not in page cache */ 5029 if (new_page && !new_pagecache_page) 5030 restore_reserve_on_error(h, vma, haddr, page); 5031 put_page(page); 5032 goto out; 5033 } 5034 5035 #ifdef CONFIG_SMP 5036 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 5037 { 5038 unsigned long key[2]; 5039 u32 hash; 5040 5041 key[0] = (unsigned long) mapping; 5042 key[1] = idx; 5043 5044 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); 5045 5046 return hash & (num_fault_mutexes - 1); 5047 } 5048 #else 5049 /* 5050 * For uniprocessor systems we always use a single mutex, so just 5051 * return 0 and avoid the hashing overhead. 5052 */ 5053 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 5054 { 5055 return 0; 5056 } 5057 #endif 5058 5059 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 5060 unsigned long address, unsigned int flags) 5061 { 5062 pte_t *ptep, entry; 5063 spinlock_t *ptl; 5064 vm_fault_t ret; 5065 u32 hash; 5066 pgoff_t idx; 5067 struct page *page = NULL; 5068 struct page *pagecache_page = NULL; 5069 struct hstate *h = hstate_vma(vma); 5070 struct address_space *mapping; 5071 int need_wait_lock = 0; 5072 unsigned long haddr = address & huge_page_mask(h); 5073 5074 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 5075 if (ptep) { 5076 /* 5077 * Since we hold no locks, ptep could be stale. That is 5078 * OK as we are only making decisions based on content and 5079 * not actually modifying content here. 5080 */ 5081 entry = huge_ptep_get(ptep); 5082 if (unlikely(is_hugetlb_entry_migration(entry))) { 5083 migration_entry_wait_huge(vma, mm, ptep); 5084 return 0; 5085 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 5086 return VM_FAULT_HWPOISON_LARGE | 5087 VM_FAULT_SET_HINDEX(hstate_index(h)); 5088 } 5089 5090 /* 5091 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold 5092 * until finished with ptep. This serves two purposes: 5093 * 1) It prevents huge_pmd_unshare from being called elsewhere 5094 * and making the ptep no longer valid. 
5095 * 2) It synchronizes us with i_size modifications during truncation. 5096 * 5097 * ptep could have already be assigned via huge_pte_offset. That 5098 * is OK, as huge_pte_alloc will return the same value unless 5099 * something has changed. 5100 */ 5101 mapping = vma->vm_file->f_mapping; 5102 i_mmap_lock_read(mapping); 5103 ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); 5104 if (!ptep) { 5105 i_mmap_unlock_read(mapping); 5106 return VM_FAULT_OOM; 5107 } 5108 5109 /* 5110 * Serialize hugepage allocation and instantiation, so that we don't 5111 * get spurious allocation failures if two CPUs race to instantiate 5112 * the same page in the page cache. 5113 */ 5114 idx = vma_hugecache_offset(h, vma, haddr); 5115 hash = hugetlb_fault_mutex_hash(mapping, idx); 5116 mutex_lock(&hugetlb_fault_mutex_table[hash]); 5117 5118 entry = huge_ptep_get(ptep); 5119 if (huge_pte_none(entry)) { 5120 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 5121 goto out_mutex; 5122 } 5123 5124 ret = 0; 5125 5126 /* 5127 * entry could be a migration/hwpoison entry at this point, so this 5128 * check prevents the kernel from going below assuming that we have 5129 * an active hugepage in pagecache. This goto expects the 2nd page 5130 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will 5131 * properly handle it. 5132 */ 5133 if (!pte_present(entry)) 5134 goto out_mutex; 5135 5136 /* 5137 * If we are going to COW the mapping later, we examine the pending 5138 * reservations for this page now. This will ensure that any 5139 * allocations necessary to record that reservation occur outside the 5140 * spinlock. For private mappings, we also lookup the pagecache 5141 * page now as it is used to determine if a reservation has been 5142 * consumed. 5143 */ 5144 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 5145 if (vma_needs_reservation(h, vma, haddr) < 0) { 5146 ret = VM_FAULT_OOM; 5147 goto out_mutex; 5148 } 5149 /* Just decrements count, does not deallocate */ 5150 vma_end_reservation(h, vma, haddr); 5151 5152 if (!(vma->vm_flags & VM_MAYSHARE)) 5153 pagecache_page = hugetlbfs_pagecache_page(h, 5154 vma, haddr); 5155 } 5156 5157 ptl = huge_pte_lock(h, mm, ptep); 5158 5159 /* Check for a racing update before calling hugetlb_cow */ 5160 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 5161 goto out_ptl; 5162 5163 /* 5164 * hugetlb_cow() requires page locks of pte_page(entry) and 5165 * pagecache_page, so here we need take the former one 5166 * when page != pagecache_page or !pagecache_page. 5167 */ 5168 page = pte_page(entry); 5169 if (page != pagecache_page) 5170 if (!trylock_page(page)) { 5171 need_wait_lock = 1; 5172 goto out_ptl; 5173 } 5174 5175 get_page(page); 5176 5177 if (flags & FAULT_FLAG_WRITE) { 5178 if (!huge_pte_write(entry)) { 5179 ret = hugetlb_cow(mm, vma, address, ptep, 5180 pagecache_page, ptl); 5181 goto out_put_page; 5182 } 5183 entry = huge_pte_mkdirty(entry); 5184 } 5185 entry = pte_mkyoung(entry); 5186 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, 5187 flags & FAULT_FLAG_WRITE)) 5188 update_mmu_cache(vma, haddr, ptep); 5189 out_put_page: 5190 if (page != pagecache_page) 5191 unlock_page(page); 5192 put_page(page); 5193 out_ptl: 5194 spin_unlock(ptl); 5195 5196 if (pagecache_page) { 5197 unlock_page(pagecache_page); 5198 put_page(pagecache_page); 5199 } 5200 out_mutex: 5201 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 5202 i_mmap_unlock_read(mapping); 5203 /* 5204 * Generally it's safe to hold refcount during waiting page lock. 
But 5205 * here we just wait to defer the next page fault to avoid busy loop and 5206 * the page is not used after unlocked before returning from the current 5207 * page fault. So we are safe from accessing freed page, even if we wait 5208 * here without taking refcount. 5209 */ 5210 if (need_wait_lock) 5211 wait_on_page_locked(page); 5212 return ret; 5213 } 5214 5215 #ifdef CONFIG_USERFAULTFD 5216 /* 5217 * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with 5218 * modifications for huge pages. 5219 */ 5220 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, 5221 pte_t *dst_pte, 5222 struct vm_area_struct *dst_vma, 5223 unsigned long dst_addr, 5224 unsigned long src_addr, 5225 enum mcopy_atomic_mode mode, 5226 struct page **pagep) 5227 { 5228 bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); 5229 struct hstate *h = hstate_vma(dst_vma); 5230 struct address_space *mapping = dst_vma->vm_file->f_mapping; 5231 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); 5232 unsigned long size; 5233 int vm_shared = dst_vma->vm_flags & VM_SHARED; 5234 pte_t _dst_pte; 5235 spinlock_t *ptl; 5236 int ret = -ENOMEM; 5237 struct page *page; 5238 int writable; 5239 bool new_pagecache_page = false; 5240 5241 if (is_continue) { 5242 ret = -EFAULT; 5243 page = find_lock_page(mapping, idx); 5244 if (!page) 5245 goto out; 5246 } else if (!*pagep) { 5247 /* If a page already exists, then it's UFFDIO_COPY for 5248 * a non-missing case. Return -EEXIST. 5249 */ 5250 if (vm_shared && 5251 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 5252 ret = -EEXIST; 5253 goto out; 5254 } 5255 5256 page = alloc_huge_page(dst_vma, dst_addr, 0); 5257 if (IS_ERR(page)) { 5258 ret = -ENOMEM; 5259 goto out; 5260 } 5261 5262 ret = copy_huge_page_from_user(page, 5263 (const void __user *) src_addr, 5264 pages_per_huge_page(h), false); 5265 5266 /* fallback to copy_from_user outside mmap_lock */ 5267 if (unlikely(ret)) { 5268 ret = -ENOENT; 5269 /* Free the allocated page which may have 5270 * consumed a reservation. 5271 */ 5272 restore_reserve_on_error(h, dst_vma, dst_addr, page); 5273 put_page(page); 5274 5275 /* Allocate a temporary page to hold the copied 5276 * contents. 5277 */ 5278 page = alloc_huge_page_vma(h, dst_vma, dst_addr); 5279 if (!page) { 5280 ret = -ENOMEM; 5281 goto out; 5282 } 5283 *pagep = page; 5284 /* Set the outparam pagep and return to the caller to 5285 * copy the contents outside the lock. Don't free the 5286 * page. 5287 */ 5288 goto out; 5289 } 5290 } else { 5291 if (vm_shared && 5292 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 5293 put_page(*pagep); 5294 ret = -EEXIST; 5295 *pagep = NULL; 5296 goto out; 5297 } 5298 5299 page = alloc_huge_page(dst_vma, dst_addr, 0); 5300 if (IS_ERR(page)) { 5301 ret = -ENOMEM; 5302 *pagep = NULL; 5303 goto out; 5304 } 5305 copy_huge_page(page, *pagep); 5306 put_page(*pagep); 5307 *pagep = NULL; 5308 } 5309 5310 /* 5311 * The memory barrier inside __SetPageUptodate makes sure that 5312 * preceding stores to the page contents become visible before 5313 * the set_pte_at() write. 5314 */ 5315 __SetPageUptodate(page); 5316 5317 /* Add shared, newly allocated pages to the page cache. 
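 * This is what makes the new page visible to other processes mapping the same hugetlbfs file; the MCOPY_ATOMIC_CONTINUE case skips this because the page was found in the page cache already.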
*/ 5318 if (vm_shared && !is_continue) { 5319 size = i_size_read(mapping->host) >> huge_page_shift(h); 5320 ret = -EFAULT; 5321 if (idx >= size) 5322 goto out_release_nounlock; 5323 5324 /* 5325 * Serialization between remove_inode_hugepages() and 5326 * huge_add_to_page_cache() below happens through the 5327 * hugetlb_fault_mutex_table that here must be hold by 5328 * the caller. 5329 */ 5330 ret = huge_add_to_page_cache(page, mapping, idx); 5331 if (ret) 5332 goto out_release_nounlock; 5333 new_pagecache_page = true; 5334 } 5335 5336 ptl = huge_pte_lockptr(h, dst_mm, dst_pte); 5337 spin_lock(ptl); 5338 5339 /* 5340 * Recheck the i_size after holding PT lock to make sure not 5341 * to leave any page mapped (as page_mapped()) beyond the end 5342 * of the i_size (remove_inode_hugepages() is strict about 5343 * enforcing that). If we bail out here, we'll also leave a 5344 * page in the radix tree in the vm_shared case beyond the end 5345 * of the i_size, but remove_inode_hugepages() will take care 5346 * of it as soon as we drop the hugetlb_fault_mutex_table. 5347 */ 5348 size = i_size_read(mapping->host) >> huge_page_shift(h); 5349 ret = -EFAULT; 5350 if (idx >= size) 5351 goto out_release_unlock; 5352 5353 ret = -EEXIST; 5354 if (!huge_pte_none(huge_ptep_get(dst_pte))) 5355 goto out_release_unlock; 5356 5357 if (vm_shared) { 5358 page_dup_rmap(page, true); 5359 } else { 5360 ClearHPageRestoreReserve(page); 5361 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); 5362 } 5363 5364 /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ 5365 if (is_continue && !vm_shared) 5366 writable = 0; 5367 else 5368 writable = dst_vma->vm_flags & VM_WRITE; 5369 5370 _dst_pte = make_huge_pte(dst_vma, page, writable); 5371 if (writable) 5372 _dst_pte = huge_pte_mkdirty(_dst_pte); 5373 _dst_pte = pte_mkyoung(_dst_pte); 5374 5375 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 5376 5377 (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, 5378 dst_vma->vm_flags & VM_WRITE); 5379 hugetlb_count_add(pages_per_huge_page(h), dst_mm); 5380 5381 /* No need to invalidate - it was non-present before */ 5382 update_mmu_cache(dst_vma, dst_addr, dst_pte); 5383 5384 spin_unlock(ptl); 5385 if (!is_continue) 5386 SetHPageMigratable(page); 5387 if (vm_shared || is_continue) 5388 unlock_page(page); 5389 ret = 0; 5390 out: 5391 return ret; 5392 out_release_unlock: 5393 spin_unlock(ptl); 5394 if (vm_shared || is_continue) 5395 unlock_page(page); 5396 out_release_nounlock: 5397 if (!new_pagecache_page) 5398 restore_reserve_on_error(h, dst_vma, dst_addr, page); 5399 put_page(page); 5400 goto out; 5401 } 5402 #endif /* CONFIG_USERFAULTFD */ 5403 5404 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, 5405 int refs, struct page **pages, 5406 struct vm_area_struct **vmas) 5407 { 5408 int nr; 5409 5410 for (nr = 0; nr < refs; nr++) { 5411 if (likely(pages)) 5412 pages[nr] = mem_map_offset(page, nr); 5413 if (vmas) 5414 vmas[nr] = vma; 5415 } 5416 } 5417 5418 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 5419 struct page **pages, struct vm_area_struct **vmas, 5420 unsigned long *position, unsigned long *nr_pages, 5421 long i, unsigned int flags, int *locked) 5422 { 5423 unsigned long pfn_offset; 5424 unsigned long vaddr = *position; 5425 unsigned long remainder = *nr_pages; 5426 struct hstate *h = hstate_vma(vma); 5427 int err = -EFAULT, refs; 5428 5429 while (vaddr < vma->vm_end && remainder) { 5430 pte_t *pte; 5431 spinlock_t *ptl = 
NULL; 5432 int absent; 5433 struct page *page; 5434 5435 /* 5436 * If we have a pending SIGKILL, don't keep faulting pages and 5437 * potentially allocating memory. 5438 */ 5439 if (fatal_signal_pending(current)) { 5440 remainder = 0; 5441 break; 5442 } 5443 5444 /* 5445 * Some archs (sparc64, sh*) have multiple pte_ts for 5446 * each hugepage. We have to make sure we get the 5447 * first, for the page indexing below to work. 5448 * 5449 * Note that the page table lock is not held when pte is null. 5450 */ 5451 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), 5452 huge_page_size(h)); 5453 if (pte) 5454 ptl = huge_pte_lock(h, mm, pte); 5455 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 5456 5457 /* 5458 * When coredumping, it suits get_dump_page if we just return 5459 * an error where there's an empty slot with no huge pagecache 5460 * to back it. This way, we avoid allocating a hugepage, and 5461 * the sparse dumpfile avoids allocating disk blocks, but its 5462 * huge holes still show up with zeroes where they need to be. 5463 */ 5464 if (absent && (flags & FOLL_DUMP) && 5465 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 5466 if (pte) 5467 spin_unlock(ptl); 5468 remainder = 0; 5469 break; 5470 } 5471 5472 /* 5473 * We need to call hugetlb_fault for both hugepages under migration 5474 * (in which case hugetlb_fault waits for the migration) and 5475 * hwpoisoned hugepages (in which case we need to prevent the 5476 * caller from accessing them). In order to do this, we use 5477 * is_swap_pte here instead of is_hugetlb_entry_migration and 5478 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 5479 * both cases, and because we can't follow pages correctly 5480 * from any kind of swap entry. 5481 */ 5482 if (absent || is_swap_pte(huge_ptep_get(pte)) || 5483 ((flags & FOLL_WRITE) && 5484 !huge_pte_write(huge_ptep_get(pte)))) { 5485 vm_fault_t ret; 5486 unsigned int fault_flags = 0; 5487 5488 if (pte) 5489 spin_unlock(ptl); 5490 if (flags & FOLL_WRITE) 5491 fault_flags |= FAULT_FLAG_WRITE; 5492 if (locked) 5493 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 5494 FAULT_FLAG_KILLABLE; 5495 if (flags & FOLL_NOWAIT) 5496 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 5497 FAULT_FLAG_RETRY_NOWAIT; 5498 if (flags & FOLL_TRIED) { 5499 /* 5500 * Note: FAULT_FLAG_ALLOW_RETRY and 5501 * FAULT_FLAG_TRIED can co-exist 5502 */ 5503 fault_flags |= FAULT_FLAG_TRIED; 5504 } 5505 ret = hugetlb_fault(mm, vma, vaddr, fault_flags); 5506 if (ret & VM_FAULT_ERROR) { 5507 err = vm_fault_to_errno(ret, flags); 5508 remainder = 0; 5509 break; 5510 } 5511 if (ret & VM_FAULT_RETRY) { 5512 if (locked && 5513 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) 5514 *locked = 0; 5515 *nr_pages = 0; 5516 /* 5517 * VM_FAULT_RETRY must not return an 5518 * error; it will return zero 5519 * instead. 5520 * 5521 * No need to update "position" as the 5522 * caller will not check it after 5523 * *nr_pages is set to 0. 5524 */ 5525 return i; 5526 } 5527 continue; 5528 } 5529 5530 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 5531 page = pte_page(huge_ptep_get(pte)); 5532 5533 /* 5534 * If subpage information is not requested, update counters 5535 * and skip the same_page loop below.
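 * This is the common case for callers that pass NULL pages and vmas and only want the range populated (e.g. mm_populate()).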
5536 */ 5537 if (!pages && !vmas && !pfn_offset && 5538 (vaddr + huge_page_size(h) < vma->vm_end) && 5539 (remainder >= pages_per_huge_page(h))) { 5540 vaddr += huge_page_size(h); 5541 remainder -= pages_per_huge_page(h); 5542 i += pages_per_huge_page(h); 5543 spin_unlock(ptl); 5544 continue; 5545 } 5546 5547 /* vaddr may not be aligned to PAGE_SIZE */ 5548 refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, 5549 (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); 5550 5551 if (pages || vmas) 5552 record_subpages_vmas(mem_map_offset(page, pfn_offset), 5553 vma, refs, 5554 likely(pages) ? pages + i : NULL, 5555 vmas ? vmas + i : NULL); 5556 5557 if (pages) { 5558 /* 5559 * try_grab_compound_head() should always succeed here, 5560 * because: a) we hold the ptl lock, and b) we've just 5561 * checked that the huge page is present in the page 5562 * tables. If the huge page is present, then the tail 5563 * pages must also be present. The ptl prevents the 5564 * head page and tail pages from being rearranged in 5565 * any way. So this page must be available at this 5566 * point, unless the page refcount overflowed: 5567 */ 5568 if (WARN_ON_ONCE(!try_grab_compound_head(pages[i], 5569 refs, 5570 flags))) { 5571 spin_unlock(ptl); 5572 remainder = 0; 5573 err = -ENOMEM; 5574 break; 5575 } 5576 } 5577 5578 vaddr += (refs << PAGE_SHIFT); 5579 remainder -= refs; 5580 i += refs; 5581 5582 spin_unlock(ptl); 5583 } 5584 *nr_pages = remainder; 5585 /* 5586 * setting position is actually required only if remainder is 5587 * not zero but it's faster not to add a "if (remainder)" 5588 * branch. 5589 */ 5590 *position = vaddr; 5591 5592 return i ? i : err; 5593 } 5594 5595 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 5596 unsigned long address, unsigned long end, pgprot_t newprot) 5597 { 5598 struct mm_struct *mm = vma->vm_mm; 5599 unsigned long start = address; 5600 pte_t *ptep; 5601 pte_t pte; 5602 struct hstate *h = hstate_vma(vma); 5603 unsigned long pages = 0; 5604 bool shared_pmd = false; 5605 struct mmu_notifier_range range; 5606 5607 /* 5608 * In the case of shared PMDs, the area to flush could be beyond 5609 * start/end. Set range.start/range.end to cover the maximum possible 5610 * range if PMD sharing is possible. 
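 * (adjust_range_if_pmd_sharing_possible() rounds the range out to PUD_SIZE boundaries when this VMA could be sharing huge PMDs.)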
5611 */ 5612 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 5613 0, vma, mm, start, end); 5614 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 5615 5616 BUG_ON(address >= end); 5617 flush_cache_range(vma, range.start, range.end); 5618 5619 mmu_notifier_invalidate_range_start(&range); 5620 i_mmap_lock_write(vma->vm_file->f_mapping); 5621 for (; address < end; address += huge_page_size(h)) { 5622 spinlock_t *ptl; 5623 ptep = huge_pte_offset(mm, address, huge_page_size(h)); 5624 if (!ptep) 5625 continue; 5626 ptl = huge_pte_lock(h, mm, ptep); 5627 if (huge_pmd_unshare(mm, vma, &address, ptep)) { 5628 pages++; 5629 spin_unlock(ptl); 5630 shared_pmd = true; 5631 continue; 5632 } 5633 pte = huge_ptep_get(ptep); 5634 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 5635 spin_unlock(ptl); 5636 continue; 5637 } 5638 if (unlikely(is_hugetlb_entry_migration(pte))) { 5639 swp_entry_t entry = pte_to_swp_entry(pte); 5640 5641 if (is_writable_migration_entry(entry)) { 5642 pte_t newpte; 5643 5644 entry = make_readable_migration_entry( 5645 swp_offset(entry)); 5646 newpte = swp_entry_to_pte(entry); 5647 set_huge_swap_pte_at(mm, address, ptep, 5648 newpte, huge_page_size(h)); 5649 pages++; 5650 } 5651 spin_unlock(ptl); 5652 continue; 5653 } 5654 if (!huge_pte_none(pte)) { 5655 pte_t old_pte; 5656 unsigned int shift = huge_page_shift(hstate_vma(vma)); 5657 5658 old_pte = huge_ptep_modify_prot_start(vma, address, ptep); 5659 pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); 5660 pte = arch_make_huge_pte(pte, shift, vma->vm_flags); 5661 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); 5662 pages++; 5663 } 5664 spin_unlock(ptl); 5665 } 5666 /* 5667 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 5668 * may have cleared our pud entry and done put_page on the page table: 5669 * once we release i_mmap_rwsem, another task can do the final put_page 5670 * and that page table be reused and filled with junk. If we actually 5671 * did unshare a page of pmds, flush the range corresponding to the pud. 5672 */ 5673 if (shared_pmd) 5674 flush_hugetlb_tlb_range(vma, range.start, range.end); 5675 else 5676 flush_hugetlb_tlb_range(vma, start, end); 5677 /* 5678 * No need to call mmu_notifier_invalidate_range() we are downgrading 5679 * page table protection not changing it to point to a new page. 5680 * 5681 * See Documentation/vm/mmu_notifier.rst 5682 */ 5683 i_mmap_unlock_write(vma->vm_file->f_mapping); 5684 mmu_notifier_invalidate_range_end(&range); 5685 5686 return pages << h->order; 5687 } 5688 5689 /* Return true if reservation was successful, false otherwise. */ 5690 bool hugetlb_reserve_pages(struct inode *inode, 5691 long from, long to, 5692 struct vm_area_struct *vma, 5693 vm_flags_t vm_flags) 5694 { 5695 long chg, add = -1; 5696 struct hstate *h = hstate_inode(inode); 5697 struct hugepage_subpool *spool = subpool_inode(inode); 5698 struct resv_map *resv_map; 5699 struct hugetlb_cgroup *h_cg = NULL; 5700 long gbl_reserve, regions_needed = 0; 5701 5702 /* This should never happen */ 5703 if (from > to) { 5704 VM_WARN(1, "%s called with a negative range\n", __func__); 5705 return false; 5706 } 5707 5708 /* 5709 * Only apply hugepage reservation if asked. 
At fault time, an 5710 * attempt will be made for VM_NORESERVE to allocate a page 5711 * without using reserves. 5712 */ 5713 if (vm_flags & VM_NORESERVE) 5714 return true; 5715 5716 /* 5717 * Shared mappings base their reservation on the number of pages that 5718 * are already allocated on behalf of the file. Private mappings need 5719 * to reserve the full area even if read-only, as mprotect() may be 5720 * called to make the mapping read-write. Assume !vma is a shm mapping. 5721 */ 5722 if (!vma || vma->vm_flags & VM_MAYSHARE) { 5723 /* 5724 * resv_map cannot be NULL as hugetlb_reserve_pages is only 5725 * called for inodes for which resv_maps were created (see 5726 * hugetlbfs_get_inode). 5727 */ 5728 resv_map = inode_resv_map(inode); 5729 5730 chg = region_chg(resv_map, from, to, &regions_needed); 5731 5732 } else { 5733 /* Private mapping. */ 5734 resv_map = resv_map_alloc(); 5735 if (!resv_map) 5736 return false; 5737 5738 chg = to - from; 5739 5740 set_vma_resv_map(vma, resv_map); 5741 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 5742 } 5743 5744 if (chg < 0) 5745 goto out_err; 5746 5747 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), 5748 chg * pages_per_huge_page(h), &h_cg) < 0) 5749 goto out_err; 5750 5751 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 5752 /* For private mappings, the hugetlb_cgroup uncharge info hangs 5753 * off the resv_map. 5754 */ 5755 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 5756 } 5757 5758 /* 5759 * There must be enough pages in the subpool for the mapping. If 5760 * the subpool has a minimum size, there may be some global 5761 * reservations already in place (gbl_reserve). 5762 */ 5763 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 5764 if (gbl_reserve < 0) 5765 goto out_uncharge_cgroup; 5766 5767 /* 5768 * Check that enough hugepages are available for the reservation. 5769 * Hand the pages back to the subpool if there are not. 5770 */ 5771 if (hugetlb_acct_memory(h, gbl_reserve) < 0) 5772 goto out_put_pages; 5773 5774 /* 5775 * Account for the reservations made. Shared mappings record regions 5776 * that have reservations as they are shared by multiple VMAs. 5777 * When the last VMA disappears, the region map says how much 5778 * the reservation was and the page cache tells how much of 5779 * the reservation was consumed. Private mappings are per-VMA and 5780 * only the consumed reservations are tracked. When the VMA 5781 * disappears, the original reservation is the VMA size and the 5782 * consumed reservations are stored in the map. Hence, nothing 5783 * else has to be done for private mappings here. 5784 */ 5785 if (!vma || vma->vm_flags & VM_MAYSHARE) { 5786 add = region_add(resv_map, from, to, regions_needed, h, h_cg); 5787 5788 if (unlikely(add < 0)) { 5789 hugetlb_acct_memory(h, -gbl_reserve); 5790 goto out_put_pages; 5791 } else if (unlikely(chg > add)) { 5792 /* 5793 * Pages in this range were added to the reserve 5794 * map between region_chg and region_add. This 5795 * indicates a race with alloc_huge_page. Adjust 5796 * the subpool and reserve counts modified above 5797 * based on the difference. 5798 */ 5799 long rsv_adjust; 5800 5801 /* 5802 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the 5803 * reference to h_cg->css. See the comment below for details.
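* Only the excess (chg - add) reservations are uncharged and handed back to the subpool below; the add entries that did make it into the reserve map keep the cgroup charge and subpool accounting taken above.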
5804 */ 5805 hugetlb_cgroup_uncharge_cgroup_rsvd( 5806 hstate_index(h), 5807 (chg - add) * pages_per_huge_page(h), h_cg); 5808 5809 rsv_adjust = hugepage_subpool_put_pages(spool, 5810 chg - add); 5811 hugetlb_acct_memory(h, -rsv_adjust); 5812 } else if (h_cg) { 5813 /* 5814 * The file_regions will hold their own reference to 5815 * h_cg->css. So we should release the reference held 5816 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are 5817 * done. 5818 */ 5819 hugetlb_cgroup_put_rsvd_cgroup(h_cg); 5820 } 5821 } 5822 return true; 5823 5824 out_put_pages: 5825 /* put back original number of pages, chg */ 5826 (void)hugepage_subpool_put_pages(spool, chg); 5827 out_uncharge_cgroup: 5828 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 5829 chg * pages_per_huge_page(h), h_cg); 5830 out_err: 5831 if (!vma || vma->vm_flags & VM_MAYSHARE) 5832 /* Only call region_abort if the region_chg succeeded but the 5833 * region_add failed or didn't run. 5834 */ 5835 if (chg >= 0 && add < 0) 5836 region_abort(resv_map, from, to, regions_needed); 5837 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 5838 kref_put(&resv_map->refs, resv_map_release); 5839 return false; 5840 } 5841 5842 long hugetlb_unreserve_pages(struct inode *inode, long start, long end, 5843 long freed) 5844 { 5845 struct hstate *h = hstate_inode(inode); 5846 struct resv_map *resv_map = inode_resv_map(inode); 5847 long chg = 0; 5848 struct hugepage_subpool *spool = subpool_inode(inode); 5849 long gbl_reserve; 5850 5851 /* 5852 * Since this routine can be called in the evict inode path for all 5853 * hugetlbfs inodes, resv_map could be NULL. 5854 */ 5855 if (resv_map) { 5856 chg = region_del(resv_map, start, end); 5857 /* 5858 * region_del() can fail in the rare case where a region 5859 * must be split and another region descriptor can not be 5860 * allocated. If end == LONG_MAX, it will not fail. 5861 */ 5862 if (chg < 0) 5863 return chg; 5864 } 5865 5866 spin_lock(&inode->i_lock); 5867 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 5868 spin_unlock(&inode->i_lock); 5869 5870 /* 5871 * If the subpool has a minimum size, the number of global 5872 * reservations to be released may be adjusted. 5873 * 5874 * Note that !resv_map implies freed == 0. So (chg - freed) 5875 * won't go negative. 5876 */ 5877 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 5878 hugetlb_acct_memory(h, -gbl_reserve); 5879 5880 return 0; 5881 } 5882 5883 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 5884 static unsigned long page_table_shareable(struct vm_area_struct *svma, 5885 struct vm_area_struct *vma, 5886 unsigned long addr, pgoff_t idx) 5887 { 5888 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 5889 svma->vm_start; 5890 unsigned long sbase = saddr & PUD_MASK; 5891 unsigned long s_end = sbase + PUD_SIZE; 5892 5893 /* Allow segments to share if only one is marked locked */ 5894 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; 5895 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; 5896 5897 /* 5898 * match the virtual addresses, permission and the alignment of the 5899 * page table page. 
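* Put differently, both VMAs must map the same file offset at the same position inside a PUD_SIZE-aligned window, with identical protections (ignoring VM_LOCKED), and the candidate window [sbase, s_end) must lie entirely inside svma; otherwise 0 is returned and nothing is shared.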
5900 */ 5901 if (pmd_index(addr) != pmd_index(saddr) || 5902 vm_flags != svm_flags || 5903 !range_in_vma(svma, sbase, s_end)) 5904 return 0; 5905 5906 return saddr; 5907 } 5908 5909 static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) 5910 { 5911 unsigned long base = addr & PUD_MASK; 5912 unsigned long end = base + PUD_SIZE; 5913 5914 /* 5915 * Check for proper vm_flags and page table alignment. 5916 */ 5917 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) 5918 return true; 5919 return false; 5920 } 5921 5922 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) 5923 { 5924 #ifdef CONFIG_USERFAULTFD 5925 if (uffd_disable_huge_pmd_share(vma)) 5926 return false; 5927 #endif 5928 return vma_shareable(vma, addr); 5929 } 5930 5931 /* 5932 * Determine if the start,end range within vma could be mapped by a shared pmd. 5933 * If yes, adjust start and end to cover the range associated with possible 5934 * shared pmd mappings. 5935 */ 5936 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 5937 unsigned long *start, unsigned long *end) 5938 { 5939 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), 5940 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); 5941 5942 /* 5943 * vma needs to span at least one aligned PUD size, and the range 5944 * must be at least partially within it. 5945 */ 5946 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || 5947 (*end <= v_start) || (*start >= v_end)) 5948 return; 5949 5950 /* Extend the range to be PUD aligned for a worst case scenario */ 5951 if (*start > v_start) 5952 *start = ALIGN_DOWN(*start, PUD_SIZE); 5953 5954 if (*end < v_end) 5955 *end = ALIGN(*end, PUD_SIZE); 5956 } 5957 5958 /* 5959 * Search for a shareable pmd page for hugetlb. In any case it calls pmd_alloc() 5960 * and returns the corresponding pte. While this is not necessary for the 5961 * !shared pmd case because we can allocate the pmd later as well, it makes the 5962 * code much cleaner. 5963 * 5964 * This routine must be called with i_mmap_rwsem held in at least read mode if 5965 * sharing is possible. For hugetlbfs, this prevents removal of any page 5966 * table entries associated with the address space. This is important as we 5967 * are setting up sharing based on existing page table entries (mappings). 5968 * 5969 * NOTE: This routine is only called from huge_pte_alloc. Some callers of 5970 * huge_pte_alloc know that sharing is not possible and do not take 5971 * i_mmap_rwsem as a performance optimization. This is handled by the 5972 * !vma_shareable check at the beginning of the routine. i_mmap_rwsem is 5973 * only required for subsequent processing.
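* As a rough sketch of that contract, a caller that may end up sharing does: i_mmap_lock_read(mapping); ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); ... use ptep ...; i_mmap_unlock_read(mapping); (this is what the hugetlb fault path does), so the entries found by the interval tree walk below cannot be freed underneath us.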
5974 */ 5975 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, 5976 unsigned long addr, pud_t *pud) 5977 { 5978 struct address_space *mapping = vma->vm_file->f_mapping; 5979 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 5980 vma->vm_pgoff; 5981 struct vm_area_struct *svma; 5982 unsigned long saddr; 5983 pte_t *spte = NULL; 5984 pte_t *pte; 5985 spinlock_t *ptl; 5986 5987 i_mmap_assert_locked(mapping); 5988 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 5989 if (svma == vma) 5990 continue; 5991 5992 saddr = page_table_shareable(svma, vma, addr, idx); 5993 if (saddr) { 5994 spte = huge_pte_offset(svma->vm_mm, saddr, 5995 vma_mmu_pagesize(svma)); 5996 if (spte) { 5997 get_page(virt_to_page(spte)); 5998 break; 5999 } 6000 } 6001 } 6002 6003 if (!spte) 6004 goto out; 6005 6006 ptl = huge_pte_lock(hstate_vma(vma), mm, spte); 6007 if (pud_none(*pud)) { 6008 pud_populate(mm, pud, 6009 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 6010 mm_inc_nr_pmds(mm); 6011 } else { 6012 put_page(virt_to_page(spte)); 6013 } 6014 spin_unlock(ptl); 6015 out: 6016 pte = (pte_t *)pmd_alloc(mm, pud, addr); 6017 return pte; 6018 } 6019 6020 /* 6021 * unmap huge page backed by shared pte. 6022 * 6023 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 6024 * indicated by page_count > 1, unmap is achieved by clearing pud and 6025 * decrementing the ref count. If count == 1, the pte page is not shared. 6026 * 6027 * Called with page table lock held and i_mmap_rwsem held in write mode. 6028 * 6029 * returns: 1 successfully unmapped a shared pte page 6030 * 0 the underlying pte page is not shared, or it is the last user 6031 */ 6032 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 6033 unsigned long *addr, pte_t *ptep) 6034 { 6035 pgd_t *pgd = pgd_offset(mm, *addr); 6036 p4d_t *p4d = p4d_offset(pgd, *addr); 6037 pud_t *pud = pud_offset(p4d, *addr); 6038 6039 i_mmap_assert_write_locked(vma->vm_file->f_mapping); 6040 BUG_ON(page_count(virt_to_page(ptep)) == 0); 6041 if (page_count(virt_to_page(ptep)) == 1) 6042 return 0; 6043 6044 pud_clear(pud); 6045 put_page(virt_to_page(ptep)); 6046 mm_dec_nr_pmds(mm); 6047 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 6048 return 1; 6049 } 6050 6051 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 6052 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, 6053 unsigned long addr, pud_t *pud) 6054 { 6055 return NULL; 6056 } 6057 6058 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 6059 unsigned long *addr, pte_t *ptep) 6060 { 6061 return 0; 6062 } 6063 6064 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 6065 unsigned long *start, unsigned long *end) 6066 { 6067 } 6068 6069 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) 6070 { 6071 return false; 6072 } 6073 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 6074 6075 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 6076 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 6077 unsigned long addr, unsigned long sz) 6078 { 6079 pgd_t *pgd; 6080 p4d_t *p4d; 6081 pud_t *pud; 6082 pte_t *pte = NULL; 6083 6084 pgd = pgd_offset(mm, addr); 6085 p4d = p4d_alloc(mm, pgd, addr); 6086 if (!p4d) 6087 return NULL; 6088 pud = pud_alloc(mm, p4d, addr); 6089 if (pud) { 6090 if (sz == PUD_SIZE) { 6091 pte = (pte_t *)pud; 6092 } else { 6093 BUG_ON(sz != PMD_SIZE); 6094 if (want_pmd_share(vma, addr) && pud_none(*pud)) 6095 pte = huge_pmd_share(mm, vma, addr, pud); 6096 
else 6097 pte = (pte_t *)pmd_alloc(mm, pud, addr); 6098 } 6099 } 6100 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); 6101 6102 return pte; 6103 } 6104 6105 /* 6106 * huge_pte_offset() - Walk the page table to resolve the hugepage 6107 * entry at address @addr 6108 * 6109 * Return: Pointer to page table entry (PUD or PMD) for 6110 * address @addr, or NULL if a !p*d_present() entry is encountered and the 6111 * size @sz doesn't match the hugepage size at this level of the page 6112 * table. 6113 */ 6114 pte_t *huge_pte_offset(struct mm_struct *mm, 6115 unsigned long addr, unsigned long sz) 6116 { 6117 pgd_t *pgd; 6118 p4d_t *p4d; 6119 pud_t *pud; 6120 pmd_t *pmd; 6121 6122 pgd = pgd_offset(mm, addr); 6123 if (!pgd_present(*pgd)) 6124 return NULL; 6125 p4d = p4d_offset(pgd, addr); 6126 if (!p4d_present(*p4d)) 6127 return NULL; 6128 6129 pud = pud_offset(p4d, addr); 6130 if (sz == PUD_SIZE) 6131 /* must be pud huge, non-present or none */ 6132 return (pte_t *)pud; 6133 if (!pud_present(*pud)) 6134 return NULL; 6135 /* must have a valid entry and size to go further */ 6136 6137 pmd = pmd_offset(pud, addr); 6138 /* must be pmd huge, non-present or none */ 6139 return (pte_t *)pmd; 6140 } 6141 6142 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 6143 6144 /* 6145 * These functions are overwritable if your architecture needs its own 6146 * behavior. 6147 */ 6148 struct page * __weak 6149 follow_huge_addr(struct mm_struct *mm, unsigned long address, 6150 int write) 6151 { 6152 return ERR_PTR(-EINVAL); 6153 } 6154 6155 struct page * __weak 6156 follow_huge_pd(struct vm_area_struct *vma, 6157 unsigned long address, hugepd_t hpd, int flags, int pdshift) 6158 { 6159 WARN(1, "hugepd follow called with no support for hugepage directory format\n"); 6160 return NULL; 6161 } 6162 6163 struct page * __weak 6164 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 6165 pmd_t *pmd, int flags) 6166 { 6167 struct page *page = NULL; 6168 spinlock_t *ptl; 6169 pte_t pte; 6170 6171 /* FOLL_GET and FOLL_PIN are mutually exclusive. */ 6172 if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == 6173 (FOLL_PIN | FOLL_GET))) 6174 return NULL; 6175 6176 retry: 6177 ptl = pmd_lockptr(mm, pmd); 6178 spin_lock(ptl); 6179 /* 6180 * make sure that the address range covered by this pmd is not 6181 * unmapped from other threads. 6182 */ 6183 if (!pmd_huge(*pmd)) 6184 goto out; 6185 pte = huge_ptep_get((pte_t *)pmd); 6186 if (pte_present(pte)) { 6187 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 6188 /* 6189 * try_grab_page() should always succeed here, because: a) we 6190 * hold the pmd (ptl) lock, and b) we've just checked that the 6191 * huge pmd (head) page is present in the page tables. The ptl 6192 * prevents the head page and tail pages from being rearranged 6193 * in any way. So this page must be available at this point, 6194 * unless the page refcount overflowed: 6195 */ 6196 if (WARN_ON_ONCE(!try_grab_page(page, flags))) { 6197 page = NULL; 6198 goto out; 6199 } 6200 } else { 6201 if (is_hugetlb_entry_migration(pte)) { 6202 spin_unlock(ptl); 6203 __migration_entry_wait(mm, (pte_t *)pmd, ptl); 6204 goto retry; 6205 } 6206 /* 6207 * hwpoisoned entry is treated as no_page_table in 6208 * follow_page_mask(). 
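* We therefore leave page as NULL and fall through, so the caller sees the same result as it would for an empty page table entry.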
6209 */ 6210 } 6211 out: 6212 spin_unlock(ptl); 6213 return page; 6214 } 6215 6216 struct page * __weak 6217 follow_huge_pud(struct mm_struct *mm, unsigned long address, 6218 pud_t *pud, int flags) 6219 { 6220 if (flags & (FOLL_GET | FOLL_PIN)) 6221 return NULL; 6222 6223 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); 6224 } 6225 6226 struct page * __weak 6227 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) 6228 { 6229 if (flags & (FOLL_GET | FOLL_PIN)) 6230 return NULL; 6231 6232 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); 6233 } 6234 6235 bool isolate_huge_page(struct page *page, struct list_head *list) 6236 { 6237 bool ret = true; 6238 6239 spin_lock_irq(&hugetlb_lock); 6240 if (!PageHeadHuge(page) || 6241 !HPageMigratable(page) || 6242 !get_page_unless_zero(page)) { 6243 ret = false; 6244 goto unlock; 6245 } 6246 ClearHPageMigratable(page); 6247 list_move_tail(&page->lru, list); 6248 unlock: 6249 spin_unlock_irq(&hugetlb_lock); 6250 return ret; 6251 } 6252 6253 int get_hwpoison_huge_page(struct page *page, bool *hugetlb) 6254 { 6255 int ret = 0; 6256 6257 *hugetlb = false; 6258 spin_lock_irq(&hugetlb_lock); 6259 if (PageHeadHuge(page)) { 6260 *hugetlb = true; 6261 if (HPageFreed(page) || HPageMigratable(page)) 6262 ret = get_page_unless_zero(page); 6263 else 6264 ret = -EBUSY; 6265 } 6266 spin_unlock_irq(&hugetlb_lock); 6267 return ret; 6268 } 6269 6270 void putback_active_hugepage(struct page *page) 6271 { 6272 spin_lock_irq(&hugetlb_lock); 6273 SetHPageMigratable(page); 6274 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 6275 spin_unlock_irq(&hugetlb_lock); 6276 put_page(page); 6277 } 6278 6279 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) 6280 { 6281 struct hstate *h = page_hstate(oldpage); 6282 6283 hugetlb_cgroup_migrate(oldpage, newpage); 6284 set_page_owner_migrate_reason(newpage, reason); 6285 6286 /* 6287 * transfer temporary state of the new huge page. This is 6288 * reverse to other transitions because the newpage is going to 6289 * be final while the old one will be freed so it takes over 6290 * the temporary status. 6291 * 6292 * Also note that we have to transfer the per-node surplus state 6293 * here as well otherwise the global surplus count will not match 6294 * the per-node's. 6295 */ 6296 if (HPageTemporary(newpage)) { 6297 int old_nid = page_to_nid(oldpage); 6298 int new_nid = page_to_nid(newpage); 6299 6300 SetHPageTemporary(oldpage); 6301 ClearHPageTemporary(newpage); 6302 6303 /* 6304 * There is no need to transfer the per-node surplus state 6305 * when we do not cross the node. 6306 */ 6307 if (new_nid == old_nid) 6308 return; 6309 spin_lock_irq(&hugetlb_lock); 6310 if (h->surplus_huge_pages_node[old_nid]) { 6311 h->surplus_huge_pages_node[old_nid]--; 6312 h->surplus_huge_pages_node[new_nid]++; 6313 } 6314 spin_unlock_irq(&hugetlb_lock); 6315 } 6316 } 6317 6318 /* 6319 * This function will unconditionally remove all the shared pmd pgtable entries 6320 * within the specific vma for a hugetlbfs memory range. 
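* Only the PUD_SIZE-aligned portion of the VMA is walked, since sharing is only ever established on PUD_SIZE-aligned boundaries, and the TLB is flushed for that whole range once all shared entries have been dropped.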
6321 */ 6322 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) 6323 { 6324 struct hstate *h = hstate_vma(vma); 6325 unsigned long sz = huge_page_size(h); 6326 struct mm_struct *mm = vma->vm_mm; 6327 struct mmu_notifier_range range; 6328 unsigned long address, start, end; 6329 spinlock_t *ptl; 6330 pte_t *ptep; 6331 6332 if (!(vma->vm_flags & VM_MAYSHARE)) 6333 return; 6334 6335 start = ALIGN(vma->vm_start, PUD_SIZE); 6336 end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); 6337 6338 if (start >= end) 6339 return; 6340 6341 /* 6342 * No need to call adjust_range_if_pmd_sharing_possible(), because 6343 * we have already done the PUD_SIZE alignment. 6344 */ 6345 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 6346 start, end); 6347 mmu_notifier_invalidate_range_start(&range); 6348 i_mmap_lock_write(vma->vm_file->f_mapping); 6349 for (address = start; address < end; address += PUD_SIZE) { 6350 unsigned long tmp = address; 6351 6352 ptep = huge_pte_offset(mm, address, sz); 6353 if (!ptep) 6354 continue; 6355 ptl = huge_pte_lock(h, mm, ptep); 6356 /* We don't want 'address' to be changed */ 6357 huge_pmd_unshare(mm, vma, &tmp, ptep); 6358 spin_unlock(ptl); 6359 } 6360 flush_hugetlb_tlb_range(vma, start, end); 6361 i_mmap_unlock_write(vma->vm_file->f_mapping); 6362 /* 6363 * No need to call mmu_notifier_invalidate_range(), see 6364 * Documentation/vm/mmu_notifier.rst. 6365 */ 6366 mmu_notifier_invalidate_range_end(&range); 6367 } 6368 6369 #ifdef CONFIG_CMA 6370 static bool cma_reserve_called __initdata; 6371 6372 static int __init cmdline_parse_hugetlb_cma(char *p) 6373 { 6374 hugetlb_cma_size = memparse(p, &p); 6375 return 0; 6376 } 6377 6378 early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); 6379 6380 void __init hugetlb_cma_reserve(int order) 6381 { 6382 unsigned long size, reserved, per_node; 6383 int nid; 6384 6385 cma_reserve_called = true; 6386 6387 if (!hugetlb_cma_size) 6388 return; 6389 6390 if (hugetlb_cma_size < (PAGE_SIZE << order)) { 6391 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", 6392 (PAGE_SIZE << order) / SZ_1M); 6393 return; 6394 } 6395 6396 /* 6397 * If 3 GB area is requested on a machine with 4 numa nodes, 6398 * let's allocate 1 GB on first three nodes and ignore the last one. 6399 */ 6400 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); 6401 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", 6402 hugetlb_cma_size / SZ_1M, per_node / SZ_1M); 6403 6404 reserved = 0; 6405 for_each_node_state(nid, N_ONLINE) { 6406 int res; 6407 char name[CMA_MAX_NAME]; 6408 6409 size = min(per_node, hugetlb_cma_size - reserved); 6410 size = round_up(size, PAGE_SIZE << order); 6411 6412 snprintf(name, sizeof(name), "hugetlb%d", nid); 6413 res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order, 6414 0, false, name, 6415 &hugetlb_cma[nid], nid); 6416 if (res) { 6417 pr_warn("hugetlb_cma: reservation failed: err %d, node %d", 6418 res, nid); 6419 continue; 6420 } 6421 6422 reserved += size; 6423 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", 6424 size / SZ_1M, nid); 6425 6426 if (reserved >= hugetlb_cma_size) 6427 break; 6428 } 6429 } 6430 6431 void __init hugetlb_cma_check(void) 6432 { 6433 if (!hugetlb_cma_size || cma_reserve_called) 6434 return; 6435 6436 pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); 6437 } 6438 6439 #endif /* CONFIG_CMA */ 6440
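/* Example: booting with "hugetlb_cma=2G" (any memparse-style size works) makes hugetlb_cma_reserve() spread 2 GiB of CMA across the online NUMA nodes, rounding each per-node slice up to the gigantic page size, so gigantic pages can later be allocated from (and returned to) these per-node CMA areas at runtime. */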