// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>

#include <asm/page.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

static struct cma *hugetlb_cma[MAX_NUMNODES];

/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
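 *
 * The mutex to take for a given fault is picked by hashing the faulting
 * mapping and page index (see hugetlb_fault_mutex_hash()), so faults on
 * distinct pages can proceed in parallel while faults racing on the same
 * page are serialized.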
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool */
	if (free) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;

	return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock(&spool->lock);
	return ret;
}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
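 *
 * For example (illustrative numbers): with min_hpages == 10, rsv_hpages == 8
 * and used_hpages falling below the minimum, putting back delta == 5 pages
 * refills the subpool reserve to 10 and tells the caller to drop only
 * 5 + 8 - 10 == 3 global reservations.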
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return delta;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
	 * quota reference, free it now.
	 */
	unlock_or_release_subpool(spool);

	return ret;
}

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

/* Helper that removes a struct file_region from the resv_map cache and returns
 * it for use.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
	struct file_region *nrg = NULL;

	VM_BUG_ON(resv->region_cache_count <= 0);

	resv->region_cache_count--;
	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
	VM_BUG_ON(!nrg);
	list_del(&nrg->link);

	nrg->from = from;
	nrg->to = to;

	return nrg;
}

static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
					      struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	nrg->reservation_counter = rg->reservation_counter;
	nrg->css = rg->css;
	if (rg->css)
		css_get(rg->css);
#endif
}

/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
						struct hstate *h,
						struct resv_map *resv,
						struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg) {
		nrg->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		nrg->css = &h_cg->css;
		if (!resv->pages_per_hpage)
			resv->pages_per_hpage = pages_per_huge_page(h);
		/* pages_per_hpage should be the same for all entries in
		 * a resv_map.
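		 * A resv_map only ever tracks reservations for a single file
		 * or VMA, and therefore for a single hstate, so the value
		 * recorded on the first charged region must match all later
		 * ones.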
		 */
		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
	} else {
		nrg->reservation_counter = NULL;
		nrg->css = NULL;
	}
#endif
}

static bool has_same_uncharge_info(struct file_region *rg,
				   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
	return rg && org &&
	       rg->reservation_counter == org->reservation_counter &&
	       rg->css == org->css;

#else
	return true;
#endif
}

static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
	struct file_region *nrg = NULL, *prg = NULL;

	prg = list_prev_entry(rg, link);
	if (&prg->link != &resv->regions && prg->to == rg->from &&
	    has_same_uncharge_info(prg, rg)) {
		prg->to = rg->to;

		list_del(&rg->link);
		kfree(rg);

		coalesce_file_region(resv, prg);
		return;
	}

	nrg = list_next_entry(rg, link);
	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
	    has_same_uncharge_info(nrg, rg)) {
		nrg->from = rg->from;

		list_del(&rg->link);
		kfree(rg);

		coalesce_file_region(resv, nrg);
		return;
	}
}

/* Must be called with resv->lock held. Calling this with count_only == true
 * will count the number of pages to be added but will not modify the linked
 * list. If regions_needed != NULL and count_only == true, then regions_needed
 * will indicate the number of file_regions needed in the cache to carry out
 * the addition of regions for this range.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
				     struct hugetlb_cgroup *h_cg,
				     struct hstate *h, long *regions_needed,
				     bool count_only)
{
	long add = 0;
	struct list_head *head = &resv->regions;
	long last_accounted_offset = f;
	struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;

	if (regions_needed)
		*regions_needed = 0;

	/* In this loop, we essentially handle an entry for the range
	 * [last_accounted_offset, rg->from), at every iteration, with some
	 * bounds checking.
	 */
	list_for_each_entry_safe(rg, trg, head, link) {
		/* Skip irrelevant regions that start before our range. */
		if (rg->from < f) {
			/* If this region ends after the last accounted offset,
			 * then we need to update last_accounted_offset.
			 */
			if (rg->to > last_accounted_offset)
				last_accounted_offset = rg->to;
			continue;
		}

		/* When we find a region that starts beyond our range, we've
		 * finished.
		 */
		if (rg->from > t)
			break;

		/* Add an entry for last_accounted_offset -> rg->from, and
		 * update last_accounted_offset.
		 */
		if (rg->from > last_accounted_offset) {
			add += rg->from - last_accounted_offset;
			if (!count_only) {
				nrg = get_file_region_entry_from_cache(
					resv, last_accounted_offset, rg->from);
				record_hugetlb_cgroup_uncharge_info(h_cg, h,
								    resv, nrg);
				list_add(&nrg->link, rg->link.prev);
				coalesce_file_region(resv, nrg);
			} else if (regions_needed)
				*regions_needed += 1;
		}

		last_accounted_offset = rg->to;
	}

	/* Handle the case where our range extends beyond
	 * last_accounted_offset.
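	 *
	 * For example, adding [0, 4) to a map that only holds [1, 2) accounts
	 * one page for [0, 1) in the loop above and two more pages for [2, 4)
	 * here, for a total of 3.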
	 */
	if (last_accounted_offset < t) {
		add += t - last_accounted_offset;
		if (!count_only) {
			nrg = get_file_region_entry_from_cache(
				resv, last_accounted_offset, t);
			record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
			list_add(&nrg->link, rg->link.prev);
			coalesce_file_region(resv, nrg);
		} else if (regions_needed)
			*regions_needed += 1;
	}

	VM_BUG_ON(add < 0);
	return add;
}

/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
 */
static int allocate_file_region_entries(struct resv_map *resv,
					int regions_needed)
	__must_hold(&resv->lock)
{
	struct list_head allocated_regions;
	int to_allocate = 0, i = 0;
	struct file_region *trg = NULL, *rg = NULL;

	VM_BUG_ON(regions_needed < 0);

	INIT_LIST_HEAD(&allocated_regions);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations plus regions_needed.
	 *
	 * This is a while loop because when we drop the lock, some other call
	 * to region_add or region_del may have consumed some region_entries,
	 * so we keep looping here until we finally have enough entries for
	 * (adds_in_progress + regions_needed).
	 */
	while (resv->region_cache_count <
	       (resv->adds_in_progress + regions_needed)) {
		to_allocate = resv->adds_in_progress + regions_needed -
			      resv->region_cache_count;

		/* At this point, we should have enough entries in the cache
		 * for all the existing adds_in_progress. We should only be
		 * needing to allocate for regions_needed.
		 */
		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

		spin_unlock(&resv->lock);
		for (i = 0; i < to_allocate; i++) {
			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
			if (!trg)
				goto out_of_memory;
			list_add(&trg->link, &allocated_regions);
		}

		spin_lock(&resv->lock);

		list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
			list_del(&rg->link);
			list_add(&rg->link, &resv->region_cache);
			resv->region_cache_count++;
		}
	}

	return 0;

out_of_memory:
	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
		list_del(&rg->link);
		kfree(rg);
	}
	return -ENOMEM;
}

/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del.  The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is
 * greater than or equal to zero.  If file_region entries needed to be
 * allocated for this operation and we were not able to allocate, it returns
 * -ENOMEM.  region_add of regions of length 1 never allocates file_regions
 * and cannot fail; region_chg will always allocate at least 1 entry and a
 * region_add for 1 page will only require at most 1 entry.
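 *
 * The usual calling sequence (illustrative) is:
 *
 *	chg = region_chg(resv, f, t, &regions_needed);
 *	< charge/allocate the page, which may fail >
 *	region_add(resv, f, t, regions_needed, h, h_cg);	on success, or
 *	region_abort(resv, f, t, regions_needed);		on failure.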
 */
static long region_add(struct resv_map *resv, long f, long t,
		       long in_regions_needed, struct hstate *h,
		       struct hugetlb_cgroup *h_cg)
{
	long add = 0, actual_regions_needed = 0;

	spin_lock(&resv->lock);
retry:

	/* Count how many regions are actually needed to execute this add. */
	add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
				 true);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * this add operation. Note that actual_regions_needed may be greater
	 * than in_regions_needed, as the resv_map may have been modified since
	 * the region_chg call. In this case, we need to make sure that we
	 * allocate extra entries, such that we have enough for all the
	 * existing adds_in_progress, plus the excess needed for this
	 * operation.
	 */
	if (actual_regions_needed > in_regions_needed &&
	    resv->region_cache_count <
		    resv->adds_in_progress +
			    (actual_regions_needed - in_regions_needed)) {
		/* region_add operation of range 1 should never need to
		 * allocate file_region entries.
		 */
		VM_BUG_ON(t - f <= 1);

		if (allocate_file_region_entries(
			    resv, actual_regions_needed - in_regions_needed)) {
			return -ENOMEM;
		}

		goto retry;
	}

	add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);

	resv->adds_in_progress -= in_regions_needed;

	spin_unlock(&resv->lock);
	VM_BUG_ON(add < 0);
	return add;
}

/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  A number of new file_region structures is added to the cache as a
 * placeholder, for the subsequent region_add call to use. At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress.  This value needs to be provided to a follow up call
 * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater than or
 * equal to zero.  -ENOMEM is returned if a new file_region structure or
 * cache entry is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
		       long *out_regions_needed)
{
	long chg = 0;

	spin_lock(&resv->lock);

	/* Count how many hugepages in this range are NOT represented. */
	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
				       out_regions_needed, true);

	if (*out_regions_needed == 0)
		*out_regions_needed = 1;

	if (allocate_file_region_entries(resv, *out_regions_needed))
		return -ENOMEM;

	resv->adds_in_progress += *out_regions_needed;

	spin_unlock(&resv->lock);
	return chg;
}

/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter.  regions_needed
 * is the value returned by the region_chg call; it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
			 long regions_needed)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress -= regions_needed;
	spin_unlock(&resv->lock);
}

/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	struct file_region *nrg = NULL;
	long del = 0;

retry:
	spin_lock(&resv->lock);
	list_for_each_entry_safe(rg, trg, head, link) {
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
			continue;

		if (rg->from >= t)
			break;

		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
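			 *
			 * A split turns one region [from, to) into [from, f)
			 * plus [t, to), so one extra file_region entry is
			 * needed to describe the tail piece.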
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
							struct file_region,
							link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}

			if (!nrg) {
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;

			copy_hugetlb_cgroup_uncharge_info(nrg, rg);

			INIT_LIST_HEAD(&nrg->link);

			/* Original entry is trimmed */
			rg->to = f;

			hugetlb_cgroup_uncharge_file_region(
				resv, rg, nrg->to - nrg->from);

			list_add(&nrg->link, &rg->link);
			nrg = NULL;
			break;
		}

		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
			del += rg->to - rg->from;
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    rg->to - rg->from);
			list_del(&rg->link);
			kfree(rg);
			continue;
		}

		if (f <= rg->from) {	/* Trim beginning of region */
			/* uncharge before rg->from is advanced to t */
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    t - rg->from);

			del += t - rg->from;
			rg->from = t;
		} else {		/* Trim end of region */
			/* uncharge before rg->to is pulled back to f */
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    rg->to - f);

			del += rg->to - f;
			rg->to = f;
		}
	}

	spin_unlock(&resv->lock);
	kfree(nrg);
	return del;
}

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was freed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;

	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
	if (rsv_adjust) {
		struct hstate *h = hstate_inode(inode);

		hugetlb_acct_memory(h, 1);
	}
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}
	spin_unlock(&resv->lock);

	return chg;
}

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
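 *
 * For example, with 2MB huge pages on a 4KB base page size, an address
 * 4MB past vma->vm_start maps to huge page offset (vma->vm_pgoff >> 9) + 2.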
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}

pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);

/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as used by the page table entries.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->pagesize)
		return vma->vm_ops->pagesize(vma);
	return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file, this region map represents the backing file
 * pages which have ever had a reservation assigned; this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it, this region map represents those offsets which have consumed
 * reservation ie. where pages have been instantiated.
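 *
 * In short: for a shared mapping a region map entry means a reservation
 * exists (or once existed) for that offset, while for a private mapping an
 * entry means the reservation for that offset has already been consumed.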
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}

static void
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
					  struct hugetlb_cgroup *h_cg,
					  struct hstate *h)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (!h_cg || !h) {
		resv_map->reservation_counter = NULL;
		resv_map->pages_per_hpage = 0;
		resv_map->css = NULL;
	} else {
		resv_map->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		resv_map->pages_per_hpage = pages_per_huge_page(h);
		resv_map->css = &h_cg->css;
	}
#endif
}

struct resv_map *resv_map_alloc(void)
{
	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);

	if (!resv_map || !rg) {
		kfree(resv_map);
		kfree(rg);
		return NULL;
	}

	kref_init(&resv_map->refs);
	spin_lock_init(&resv_map->lock);
	INIT_LIST_HEAD(&resv_map->regions);

	resv_map->adds_in_progress = 0;
	/*
	 * Initialize these to 0. On shared mappings, 0's here indicate these
	 * fields don't do cgroup accounting. On private mappings, these will be
	 * re-initialized to the proper values, to indicate that hugetlb cgroup
	 * reservations are to be un-charged from here.
	 */
	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);

	INIT_LIST_HEAD(&resv_map->region_cache);
	list_add(&rg->link, &resv_map->region_cache);
	resv_map->region_cache_count = 1;

	return resv_map;
}

void resv_map_release(struct kref *ref)
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
	struct list_head *head = &resv_map->region_cache;
	struct file_region *rg, *trg;

	/* Clear out any active regions before we release the map. */
	region_del(resv_map, 0, LONG_MAX);

	/* ... and any entries left in the cache */
	list_for_each_entry_safe(rg, trg, head, link) {
		list_del(&rg->link);
		kfree(rg);
	}

	VM_BUG_ON(resv_map->adds_in_progress);

	kfree(resv_map);
}

static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	/*
	 * At inode evict time, i_mapping may not point to the original
	 * address space within the inode.  This original address space
	 * contains the pointer to the resv_map.  So, always use the
	 * address space embedded within the inode.
	 * The VERY common case is inode->mapping == &inode->i_data but,
	 * this may not be true for device special inodes.
	 */
	return (struct resv_map *)(&inode->i_data)->private_data;
}

static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (vma->vm_flags & VM_MAYSHARE) {
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct inode *inode = mapping->host;

		return inode_resv_map(inode);

	} else {
		return (struct resv_map *)(get_vma_private_data(vma) &
							~HPAGE_RESV_MASK);
	}
}

static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, (get_vma_private_data(vma) &
				HPAGE_RESV_MASK) | (unsigned long)map);
}

static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}

static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

	return (get_vma_private_data(vma) & flag) != 0;
}

/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (!(vma->vm_flags & VM_MAYSHARE))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
	if (vma->vm_flags & VM_NORESERVE) {
		/*
		 * This address is already reserved by another process
		 * (chg == 0), so we should decrement the reserved count.
		 * Without decrementing, the reserve count remains after
		 * releasing the inode, because this allocated page will go
		 * into the page cache and is regarded as coming from the
		 * reserved pool in the releasing step.  Currently, we
		 * don't have any other solution to deal with this situation
		 * properly, so add a work-around here.
		 */
		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
			return true;
		else
			return false;
	}

	/* Shared mappings always use reserves */
	if (vma->vm_flags & VM_MAYSHARE) {
		/*
		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
		 * be a region map for all pages.  The only situation where
		 * there is no region map is if a hole was punched via
		 * fallocate.  In this case, there really are no reserves to
		 * use.  This situation is indicated if chg != 0.
		 */
		if (chg)
			return false;
		else
			return true;
	}

	/*
	 * Only the process that called mmap() has reserves for
	 * private mappings.
	 */
	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/*
		 * Like the shared case above, a hole punch or truncate
		 * could have been performed on the private mapping.
		 * Examine the value of chg to determine if reserves
		 * actually exist or were previously consumed.
		 * Very Subtle - The value of chg comes from a previous
		 * call to vma_needs_reserves().  The reserve map for
		 * private mappings has different (opposite) semantics
		 * than that of shared mappings.  vma_needs_reserves()
		 * has already taken this difference in semantics into
		 * account.  Therefore, the meaning of chg is the same
		 * as in the shared case above.  Code could easily be
		 * combined, but keeping it separate draws attention to
		 * subtle differences.
		 */
		if (chg)
			return false;
		else
			return true;
	}

	return false;
}

static void enqueue_huge_page(struct hstate *h, struct page *page)
{
	int nid = page_to_nid(page);
	list_move(&page->lru, &h->hugepage_freelists[nid]);
	h->free_huge_pages++;
	h->free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
	struct page *page;

	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
		if (!PageHWPoison(page))
			break;
	/*
	 * if 'non-isolated free hugepage' not found on the list,
	 * the allocation fails.
	 */
	if (&h->hugepage_freelists[nid] == &page->lru)
		return NULL;
	list_move(&page->lru, &h->hugepage_activelist);
	set_page_refcounted(page);
	h->free_huge_pages--;
	h->free_huge_pages_node[nid]--;
	return page;
}

static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
		nodemask_t *nmask)
{
	unsigned int cpuset_mems_cookie;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;
	int node = NUMA_NO_NODE;

	zonelist = node_zonelist(nid, gfp_mask);

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
		struct page *page;

		if (!cpuset_zone_allowed(zone, gfp_mask))
			continue;
		/*
		 * no need to ask again on the same node. Pool is node rather than
		 * zone aware
		 */
		if (zone_to_nid(zone) == node)
			continue;
		node = zone_to_nid(zone);

		page = dequeue_huge_page_node_exact(h, node);
		if (page)
			return page;
	}
	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;

	return NULL;
}

/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
	if (hugepage_movable_supported(h))
		return GFP_HIGHUSER_MOVABLE;
	else
		return GFP_HIGHUSER;
}

static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve,
				long chg)
{
	struct page *page;
	struct mempolicy *mpol;
	gfp_t gfp_mask;
	nodemask_t *nodemask;
	int nid;

	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed.
	 */
	if (!vma_has_reserves(vma, chg) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	gfp_mask = htlb_alloc_mask(h);
	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
		SetPagePrivate(page);
		h->resv_huge_pages--;
	}

	mpol_cond_put(mpol);
	return page;

err:
	return NULL;
}

/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node_in(nid, *nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}

/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
static int hstate_next_node_to_alloc(struct hstate *h,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);

	return nid;
}

/*
 * helper for free_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);

	return nid;
}

#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
		nr_nodes--)

#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
		nr_nodes--)

#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
					unsigned int order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	atomic_set(compound_mapcount_ptr(page), 0);
	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);

	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		clear_compound_head(p);
		set_page_refcounted(p);
	}

	set_compound_order(page, 0);
	__ClearPageHead(page);
}

static void free_gigantic_page(struct page *page, unsigned int order)
{
	/*
	 * If the page isn't allocated using the cma allocator,
	 * cma_release() returns false.
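	 * In that case the page came from alloc_contig_pages() and is
	 * handed back via free_contig_range() below instead.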
	 */
	if (IS_ENABLED(CONFIG_CMA) &&
	    cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
		return;

	free_contig_range(page_to_pfn(page), 1 << order);
}

#ifdef CONFIG_CONTIG_ALLOC
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
		int nid, nodemask_t *nodemask)
{
	unsigned long nr_pages = 1UL << huge_page_order(h);

	if (IS_ENABLED(CONFIG_CMA)) {
		struct page *page;
		int node;

		for_each_node_mask(node, *nodemask) {
			if (!hugetlb_cma[node])
				continue;

			page = cma_alloc(hugetlb_cma[node], nr_pages,
					 huge_page_order(h), true);
			if (page)
				return page;
		}
	}

	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
					int nid, nodemask_t *nodemask)
{
	return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */

#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
					int nid, nodemask_t *nodemask)
{
	return NULL;
}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned int order) { }
#endif

static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
		return;

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_private |
				1 << PG_writeback);
	}
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
	set_page_refcounted(page);
	if (hstate_is_gigantic(h)) {
		/*
		 * Temporarily drop the hugetlb_lock, because
		 * we might block in free_gigantic_page().
		 */
		spin_unlock(&hugetlb_lock);
		destroy_compound_gigantic_page(page, huge_page_order(h));
		free_gigantic_page(page, huge_page_order(h));
		spin_lock(&hugetlb_lock);
	} else {
		__free_pages(page, huge_page_order(h));
	}
}

struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}

/*
 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
 * to hstate->hugepage_activelist.)
 *
 * This function can be called for tail pages, but never returns true for them.
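 * The "active" state is kept in the PG_private bit of the first tail page,
 * so only a head page can ever test true here.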
 */
bool page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHuge(page), page);
	return PageHead(page) && PagePrivate(&page[1]);
}

/* never called for tail page */
static void set_page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	SetPagePrivate(&page[1]);
}

static void clear_page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	ClearPagePrivate(&page[1]);
}

/*
 * Internal hugetlb specific page flag. Do not use outside of the hugetlb
 * code
 */
static inline bool PageHugeTemporary(struct page *page)
{
	if (!PageHuge(page))
		return false;

	return (unsigned long)page[2].mapping == -1U;
}

static inline void SetPageHugeTemporary(struct page *page)
{
	page[2].mapping = (void *)-1U;
}

static inline void ClearPageHugeTemporary(struct page *page)
{
	page[2].mapping = NULL;
}

static void __free_huge_page(struct page *page)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * compound page destructor.
	 */
	struct hstate *h = page_hstate(page);
	int nid = page_to_nid(page);
	struct hugepage_subpool *spool =
		(struct hugepage_subpool *)page_private(page);
	bool restore_reserve;

	VM_BUG_ON_PAGE(page_count(page), page);
	VM_BUG_ON_PAGE(page_mapcount(page), page);

	set_page_private(page, 0);
	page->mapping = NULL;
	restore_reserve = PagePrivate(page);
	ClearPagePrivate(page);

	/*
	 * If PagePrivate() was set on page, page allocation consumed a
	 * reservation.  If the page was associated with a subpool, there
	 * would have been a page reserved in the subpool before allocation
	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
	 * reservation, do not call hugepage_subpool_put_pages() as this will
	 * remove the reserved page from the subpool.
	 */
	if (!restore_reserve) {
		/*
		 * A return code of zero implies that the subpool will be
		 * under its minimum size if the reservation is not restored
		 * after page is free.  Therefore, force restore_reserve
		 * operation.
		 */
		if (hugepage_subpool_put_pages(spool, 1) == 0)
			restore_reserve = true;
	}

	spin_lock(&hugetlb_lock);
	clear_page_huge_active(page);
	hugetlb_cgroup_uncharge_page(hstate_index(h),
				     pages_per_huge_page(h), page);
	hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
					  pages_per_huge_page(h), page);
	if (restore_reserve)
		h->resv_huge_pages++;

	if (PageHugeTemporary(page)) {
		list_del(&page->lru);
		ClearPageHugeTemporary(page);
		update_and_free_page(h, page);
	} else if (h->surplus_huge_pages_node[nid]) {
		/* remove the page from active list */
		list_del(&page->lru);
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	} else {
		arch_clear_hugepage_flags(page);
		enqueue_huge_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
}

/*
 * As free_huge_page() can be called from a non-task context, we have
 * to defer the actual freeing in a workqueue to prevent potential
 * hugetlb_lock deadlock.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to
 * be freed and frees them one-by-one.
 * As the page->mapping pointer is going to be cleared in __free_huge_page()
 * anyway, it is reused as the llist_node structure of a lockless linked list
 * of huge pages to be freed.
 */
static LLIST_HEAD(hpage_freelist);

static void free_hpage_workfn(struct work_struct *work)
{
	struct llist_node *node;
	struct page *page;

	node = llist_del_all(&hpage_freelist);

	while (node) {
		page = container_of((struct address_space **)node,
				     struct page, mapping);
		node = node->next;
		__free_huge_page(page);
	}
}
static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

void free_huge_page(struct page *page)
{
	/*
	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
	 */
	if (!in_task()) {
		/*
		 * Only call schedule_work() if hpage_freelist is previously
		 * empty. Otherwise, schedule_work() had been called but the
		 * workfn hasn't retrieved the list yet.
		 */
		if (llist_add((struct llist_node *)&page->mapping,
			      &hpage_freelist))
			schedule_work(&free_hpage_work);
		return;
	}

	__free_huge_page(page);
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	INIT_LIST_HEAD(&page->lru);
	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
	spin_lock(&hugetlb_lock);
	set_hugetlb_cgroup(page, NULL);
	set_hugetlb_cgroup_rsvd(page, NULL);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
	spin_unlock(&hugetlb_lock);
}

static void prep_compound_gigantic_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	/* we rely on prep_new_huge_page to set the destructor */
	set_compound_order(page, order);
	__ClearPageReserved(page);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		/*
		 * For gigantic hugepages allocated through bootmem at
		 * boot, it's safer to be consistent with the not-gigantic
		 * hugepages and clear the PG_reserved bit from all tail pages
		 * too.  Otherwise drivers using get_user_pages() to access tail
		 * pages may get the reference counting wrong if they see
		 * PG_reserved set on a tail page (despite the head page not
		 * having PG_reserved set).  Enforcing this consistency between
		 * head and tail pages allows drivers to optimize away a check
		 * on the head page when they need to know if put_page() is
		 * needed after get_user_pages().
		 */
		__ClearPageReserved(p);
		set_page_count(p, 0);
		set_compound_head(p, page);
	}
	atomic_set(compound_mapcount_ptr(page), -1);

	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);
}

/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
int PageHuge(struct page *page)
{
	if (!PageCompound(page))
		return 0;

	page = compound_head(page);
	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);

/*
 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
 * normal or transparent huge pages.
 */
int PageHeadHuge(struct page *page_head)
{
	if (!PageHead(page_head))
		return 0;

	return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
}

/*
 * Find address_space associated with hugetlbfs page.
 * Upon entry page is locked and page 'was' mapped although mapped state
 * could change.  If necessary, use anon_vma to find vma and associated
 * address space.  The returned mapping may be stale, but it can not be
 * invalid as page lock (which is held) is required to destroy mapping.
 */
static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
{
	struct anon_vma *anon_vma;
	pgoff_t pgoff_start, pgoff_end;
	struct anon_vma_chain *avc;
	struct address_space *mapping = page_mapping(hpage);

	/* Simple file based mapping */
	if (mapping)
		return mapping;

	/*
	 * Even anonymous hugetlbfs mappings are associated with an
	 * underlying hugetlbfs file (see hugetlb_file_setup in mmap
	 * code).  Find a vma associated with the anonymous vma, and
	 * use the file pointer to get address_space.
	 */
	anon_vma = page_lock_anon_vma_read(hpage);
	if (!anon_vma)
		return mapping;  /* NULL */

	/* Use first found vma */
	pgoff_start = page_to_pgoff(hpage);
	pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
					pgoff_start, pgoff_end) {
		struct vm_area_struct *vma = avc->vma;

		mapping = vma->vm_file->f_mapping;
		break;
	}

	anon_vma_unlock_read(anon_vma);
	return mapping;
}

/*
 * Find and lock address space (mapping) in write mode.
 *
 * Upon entry, the page is locked which allows us to find the mapping
 * even in the case of an anon page.  However, locking order dictates
 * the i_mmap_rwsem be acquired BEFORE the page lock.  This is hugetlbfs
 * specific.  So, we first try to lock the sema while still holding the
 * page lock.  If this works, great!  If not, then we need to drop the
 * page lock and then acquire i_mmap_rwsem and reacquire page lock.  Of
 * course, need to revalidate state along the way.
 */
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
{
	struct address_space *mapping, *mapping2;

	mapping = _get_hugetlb_page_mapping(hpage);
retry:
	if (!mapping)
		return mapping;

	/*
	 * If no contention, take lock and return
	 */
	if (i_mmap_trylock_write(mapping))
		return mapping;

	/*
	 * Must drop page lock and wait on mapping sema.
	 * Note:  Once page lock is dropped, mapping could become invalid.
	 * As a hack, increase map count until we lock page again.
	 */
	atomic_inc(&hpage->_mapcount);
	unlock_page(hpage);
	i_mmap_lock_write(mapping);
	lock_page(hpage);
	atomic_add_negative(-1, &hpage->_mapcount);

	/* verify page is still mapped */
	if (!page_mapped(hpage)) {
		i_mmap_unlock_write(mapping);
		return NULL;
	}

	/*
	 * Get address space again and verify it is the same one
	 * we locked.  If not, drop lock and retry.
	 */
	mapping2 = _get_hugetlb_page_mapping(hpage);
	if (mapping2 != mapping) {
		i_mmap_unlock_write(mapping);
		mapping = mapping2;
		goto retry;
	}

	return mapping;
}

pgoff_t __basepage_index(struct page *page)
{
	struct page *page_head = compound_head(page);
	pgoff_t index = page_index(page_head);
	unsigned long compound_idx;

	if (!PageHuge(page_head))
		return page_index(page);

	if (compound_order(page_head) >= MAX_ORDER)
		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
	else
		compound_idx = page - page_head;

	return (index << compound_order(page_head)) + compound_idx;
}

static struct page *alloc_buddy_huge_page(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask,
		nodemask_t *node_alloc_noretry)
{
	int order = huge_page_order(h);
	struct page *page;
	bool alloc_try_hard = true;

	/*
	 * By default we always try hard to allocate the page with
	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
	 * a loop (to adjust global huge page counts) and previous allocation
	 * failed, do not continue to try hard on the same node.  Use the
	 * node_alloc_noretry bitmap to manage this state information.
	 */
	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
		alloc_try_hard = false;
	gfp_mask |= __GFP_COMP|__GFP_NOWARN;
	if (alloc_try_hard)
		gfp_mask |= __GFP_RETRY_MAYFAIL;
	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();
	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
	if (page)
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	/*
	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
	 * indicates an overall state change.  Clear bit so that we resume
	 * normal 'try hard' allocations.
	 */
	if (node_alloc_noretry && page && !alloc_try_hard)
		node_clear(nid, *node_alloc_noretry);

	/*
	 * If we tried hard to get a page but failed, set bit so that
	 * subsequent attempts will not try as hard until there is an
	 * overall state change.
	 */
	if (node_alloc_noretry && !page && alloc_try_hard)
		node_set(nid, *node_alloc_noretry);

	return page;
}

/*
 * Common helper to allocate a fresh hugetlb page. All specific allocators
 * should use this function to get new hugetlb pages
 */
static struct page *alloc_fresh_huge_page(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask,
		nodemask_t *node_alloc_noretry)
{
	struct page *page;

	if (hstate_is_gigantic(h))
		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
	else
		page = alloc_buddy_huge_page(h, gfp_mask,
				nid, nmask, node_alloc_noretry);
	if (!page)
		return NULL;

	if (hstate_is_gigantic(h))
		prep_compound_gigantic_page(page, huge_page_order(h));
	prep_new_huge_page(h, page, page_to_nid(page));

	return page;
}

/*
 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
 * manner.
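 * Returns 1 if a page was allocated and freed into the pool, 0 otherwise.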
 */
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
				nodemask_t *node_alloc_noretry)
{
	struct page *page;
	int nr_nodes, node;
	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
						node_alloc_noretry);
		if (page)
			break;
	}

	if (!page)
		return 0;

	put_page(page); /* free it into the hugepage allocator */

	return 1;
}

/*
 * Free huge page from pool from next node to free.
 * Attempt to keep persistent huge pages more or less
 * balanced over allowed nodes.
 * Called with hugetlb_lock locked.
 */
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
							 bool acct_surplus)
{
	int nr_nodes, node;
	int ret = 0;

	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
		/*
		 * If we're returning unused surplus pages, only examine
		 * nodes with surplus pages.
		 */
		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
		    !list_empty(&h->hugepage_freelists[node])) {
			struct page *page =
				list_entry(h->hugepage_freelists[node].next,
					  struct page, lru);
			list_del(&page->lru);
			h->free_huge_pages--;
			h->free_huge_pages_node[node]--;
			if (acct_surplus) {
				h->surplus_huge_pages--;
				h->surplus_huge_pages_node[node]--;
			}
			update_and_free_page(h, page);
			ret = 1;
			break;
		}
	}

	return ret;
}

/*
 * Dissolve a given free hugepage into free buddy pages. This function does
 * nothing for in-use hugepages and non-hugepages.
 * This function returns values like below:
 *
 * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
 *	   (allocated or reserved.)
 * 0:	   successfully dissolved free hugepages or the page is not a
 *	   hugepage (considered as already dissolved)
 */
int dissolve_free_huge_page(struct page *page)
{
	int rc = -EBUSY;

	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!PageHuge(page))
		return 0;

	spin_lock(&hugetlb_lock);
	if (!PageHuge(page)) {
		rc = 0;
		goto out;
	}

	if (!page_count(page)) {
		struct page *head = compound_head(page);
		struct hstate *h = page_hstate(head);
		int nid = page_to_nid(head);
		if (h->free_huge_pages - h->resv_huge_pages == 0)
			goto out;
		/*
		 * Move PageHWPoison flag from head page to the raw error page,
		 * which makes any subpages other than the error page reusable.
		 */
		if (PageHWPoison(head) && page != head) {
			SetPageHWPoison(page);
			ClearPageHWPoison(head);
		}
		list_del(&head->lru);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
		h->max_huge_pages--;
		update_and_free_page(h, head);
		rc = 0;
	}
out:
	spin_unlock(&hugetlb_lock);
	return rc;
}

/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_huge_page() returns with an error, all
 * free hugepages that were dissolved before that error are lost.
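 * The scan steps through the range in units of the smallest supported huge
 * page size (minimum_order).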
1876 */ 1877 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 1878 { 1879 unsigned long pfn; 1880 struct page *page; 1881 int rc = 0; 1882 1883 if (!hugepages_supported()) 1884 return rc; 1885 1886 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { 1887 page = pfn_to_page(pfn); 1888 rc = dissolve_free_huge_page(page); 1889 if (rc) 1890 break; 1891 } 1892 1893 return rc; 1894 } 1895 1896 /* 1897 * Allocates a fresh surplus page from the page allocator. 1898 */ 1899 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, 1900 int nid, nodemask_t *nmask) 1901 { 1902 struct page *page = NULL; 1903 1904 if (hstate_is_gigantic(h)) 1905 return NULL; 1906 1907 spin_lock(&hugetlb_lock); 1908 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) 1909 goto out_unlock; 1910 spin_unlock(&hugetlb_lock); 1911 1912 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 1913 if (!page) 1914 return NULL; 1915 1916 spin_lock(&hugetlb_lock); 1917 /* 1918 * We could have raced with the pool size change. 1919 * Double check that and simply deallocate the new page 1920 * if we would end up overcommiting the surpluses. Abuse 1921 * temporary page to workaround the nasty free_huge_page 1922 * codeflow 1923 */ 1924 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 1925 SetPageHugeTemporary(page); 1926 spin_unlock(&hugetlb_lock); 1927 put_page(page); 1928 return NULL; 1929 } else { 1930 h->surplus_huge_pages++; 1931 h->surplus_huge_pages_node[page_to_nid(page)]++; 1932 } 1933 1934 out_unlock: 1935 spin_unlock(&hugetlb_lock); 1936 1937 return page; 1938 } 1939 1940 struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 1941 int nid, nodemask_t *nmask) 1942 { 1943 struct page *page; 1944 1945 if (hstate_is_gigantic(h)) 1946 return NULL; 1947 1948 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 1949 if (!page) 1950 return NULL; 1951 1952 /* 1953 * We do not account these pages as surplus because they are only 1954 * temporary and will be released properly on the last reference 1955 */ 1956 SetPageHugeTemporary(page); 1957 1958 return page; 1959 } 1960 1961 /* 1962 * Use the VMA's mpolicy to allocate a huge page from the buddy. 
1963 */ 1964 static 1965 struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, 1966 struct vm_area_struct *vma, unsigned long addr) 1967 { 1968 struct page *page; 1969 struct mempolicy *mpol; 1970 gfp_t gfp_mask = htlb_alloc_mask(h); 1971 int nid; 1972 nodemask_t *nodemask; 1973 1974 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 1975 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); 1976 mpol_cond_put(mpol); 1977 1978 return page; 1979 } 1980 1981 /* page migration callback function */ 1982 struct page *alloc_huge_page_node(struct hstate *h, int nid) 1983 { 1984 gfp_t gfp_mask = htlb_alloc_mask(h); 1985 struct page *page = NULL; 1986 1987 if (nid != NUMA_NO_NODE) 1988 gfp_mask |= __GFP_THISNODE; 1989 1990 spin_lock(&hugetlb_lock); 1991 if (h->free_huge_pages - h->resv_huge_pages > 0) 1992 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL); 1993 spin_unlock(&hugetlb_lock); 1994 1995 if (!page) 1996 page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); 1997 1998 return page; 1999 } 2000 2001 /* page migration callback function */ 2002 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, 2003 nodemask_t *nmask) 2004 { 2005 gfp_t gfp_mask = htlb_alloc_mask(h); 2006 2007 spin_lock(&hugetlb_lock); 2008 if (h->free_huge_pages - h->resv_huge_pages > 0) { 2009 struct page *page; 2010 2011 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); 2012 if (page) { 2013 spin_unlock(&hugetlb_lock); 2014 return page; 2015 } 2016 } 2017 spin_unlock(&hugetlb_lock); 2018 2019 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); 2020 } 2021 2022 /* mempolicy aware migration callback */ 2023 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, 2024 unsigned long address) 2025 { 2026 struct mempolicy *mpol; 2027 nodemask_t *nodemask; 2028 struct page *page; 2029 gfp_t gfp_mask; 2030 int node; 2031 2032 gfp_mask = htlb_alloc_mask(h); 2033 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 2034 page = alloc_huge_page_nodemask(h, node, nodemask); 2035 mpol_cond_put(mpol); 2036 2037 return page; 2038 } 2039 2040 /* 2041 * Increase the hugetlb pool such that it can accommodate a reservation 2042 * of size 'delta'. 2043 */ 2044 static int gather_surplus_pages(struct hstate *h, int delta) 2045 __must_hold(&hugetlb_lock) 2046 { 2047 struct list_head surplus_list; 2048 struct page *page, *tmp; 2049 int ret, i; 2050 int needed, allocated; 2051 bool alloc_ok = true; 2052 2053 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 2054 if (needed <= 0) { 2055 h->resv_huge_pages += delta; 2056 return 0; 2057 } 2058 2059 allocated = 0; 2060 INIT_LIST_HEAD(&surplus_list); 2061 2062 ret = -ENOMEM; 2063 retry: 2064 spin_unlock(&hugetlb_lock); 2065 for (i = 0; i < needed; i++) { 2066 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), 2067 NUMA_NO_NODE, NULL); 2068 if (!page) { 2069 alloc_ok = false; 2070 break; 2071 } 2072 list_add(&page->lru, &surplus_list); 2073 cond_resched(); 2074 } 2075 allocated += i; 2076 2077 /* 2078 * After retaking hugetlb_lock, we need to recalculate 'needed' 2079 * because either resv_huge_pages or free_huge_pages may have changed. 2080 */ 2081 spin_lock(&hugetlb_lock); 2082 needed = (h->resv_huge_pages + delta) - 2083 (h->free_huge_pages + allocated); 2084 if (needed > 0) { 2085 if (alloc_ok) 2086 goto retry; 2087 /* 2088 * We were not able to allocate enough pages to 2089 * satisfy the entire reservation so we free what 2090 * we've allocated so far. 
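 *
 * (The "free" label below drops hugetlb_lock and hands every page still
 * on surplus_list back to the buddy allocator via put_page(); ret is left
 * at -ENOMEM on this path, so the caller sees that the reservation could
 * not be extended.)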
2091  */
2092 		goto free;
2093 	}
2094 	/*
2095 	 * The surplus_list now contains _at_least_ the number of extra pages
2096 	 * needed to accommodate the reservation.  Add the appropriate number
2097 	 * of pages to the hugetlb pool and free the extras back to the buddy
2098 	 * allocator.  Commit the entire reservation here to prevent another
2099 	 * process from stealing the pages as they are added to the pool but
2100 	 * before they are reserved.
2101 	 */
2102 	needed += allocated;
2103 	h->resv_huge_pages += delta;
2104 	ret = 0;
2105 
2106 	/* Free the needed pages to the hugetlb pool */
2107 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
2108 		if ((--needed) < 0)
2109 			break;
2110 		/*
2111 		 * This page is now managed by the hugetlb allocator and has
2112 		 * no users -- drop the buddy allocator's reference.
2113 		 */
2114 		put_page_testzero(page);
2115 		VM_BUG_ON_PAGE(page_count(page), page);
2116 		enqueue_huge_page(h, page);
2117 	}
2118 free:
2119 	spin_unlock(&hugetlb_lock);
2120 
2121 	/* Free unnecessary surplus pages to the buddy allocator */
2122 	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
2123 		put_page(page);
2124 	spin_lock(&hugetlb_lock);
2125 
2126 	return ret;
2127 }
2128 
2129 /*
2130  * This routine has two main purposes:
2131  * 1) Decrement the reservation count (resv_huge_pages) by the value passed
2132  *    in unused_resv_pages.  This corresponds to the prior adjustments made
2133  *    to the associated reservation map.
2134  * 2) Free any unused surplus pages that may have been allocated to satisfy
2135  *    the reservation.  As many as unused_resv_pages may be freed.
2136  *
2137  * Called with hugetlb_lock held.  However, the lock could be dropped (and
2138  * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
2139  * we must make sure nobody else can claim pages we are in the process of
2140  * freeing.  Do this by ensuring resv_huge_pages is always greater than the
2141  * number of huge pages we plan to free when dropping the lock.
2142  */
2143 static void return_unused_surplus_pages(struct hstate *h,
2144 					unsigned long unused_resv_pages)
2145 {
2146 	unsigned long nr_pages;
2147 
2148 	/* Cannot return gigantic pages currently */
2149 	if (hstate_is_gigantic(h))
2150 		goto out;
2151 
2152 	/*
2153 	 * Part (or even all) of the reservation could have been backed
2154 	 * by pre-allocated pages.  Only free surplus pages.
2155 	 */
2156 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2157 
2158 	/*
2159 	 * We want to release as many surplus pages as possible, spread
2160 	 * evenly across all nodes with memory.  Iterate across these nodes
2161 	 * until we can no longer free unreserved surplus pages.  This occurs
2162 	 * when the nodes with surplus pages have no free pages.
2163 	 * free_pool_huge_page() will balance the freed pages across the
2164 	 * on-line nodes with memory and will handle the hstate accounting.
2165 	 *
2166 	 * Note that we decrement resv_huge_pages as we free the pages.  If
2167 	 * we drop the lock, resv_huge_pages will still be sufficiently large
2168 	 * to cover subsequent pages we may free.
2169 	 */
2170 	while (nr_pages--) {
2171 		h->resv_huge_pages--;
2172 		unused_resv_pages--;
2173 		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
2174 			goto out;
2175 		cond_resched_lock(&hugetlb_lock);
2176 	}
2177 
2178 out:
2179 	/* Fully uncommit the reservation */
2180 	h->resv_huge_pages -= unused_resv_pages;
2181 }
2182 
2183 
2184 /*
2185  * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
2186  * are used by the huge page allocation routines to manage reservations.
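 *
 * Simplified calling sequence (condensed from alloc_huge_page() below;
 * cgroup/subpool charging and race handling are omitted):
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	if (chg < 0)
 *		return ERR_PTR(-ENOMEM);
 *	page = ...;			/* dequeue or allocate the huge page */
 *	if (!page) {
 *		vma_end_reservation(h, vma, addr);
 *		return ERR_PTR(-ENOSPC);
 *	}
 *	vma_commit_reservation(h, vma, addr);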
2187 * 2188 * vma_needs_reservation is called to determine if the huge page at addr 2189 * within the vma has an associated reservation. If a reservation is 2190 * needed, the value 1 is returned. The caller is then responsible for 2191 * managing the global reservation and subpool usage counts. After 2192 * the huge page has been allocated, vma_commit_reservation is called 2193 * to add the page to the reservation map. If the page allocation fails, 2194 * the reservation must be ended instead of committed. vma_end_reservation 2195 * is called in such cases. 2196 * 2197 * In the normal case, vma_commit_reservation returns the same value 2198 * as the preceding vma_needs_reservation call. The only time this 2199 * is not the case is if a reserve map was changed between calls. It 2200 * is the responsibility of the caller to notice the difference and 2201 * take appropriate action. 2202 * 2203 * vma_add_reservation is used in error paths where a reservation must 2204 * be restored when a newly allocated huge page must be freed. It is 2205 * to be called after calling vma_needs_reservation to determine if a 2206 * reservation exists. 2207 */ 2208 enum vma_resv_mode { 2209 VMA_NEEDS_RESV, 2210 VMA_COMMIT_RESV, 2211 VMA_END_RESV, 2212 VMA_ADD_RESV, 2213 }; 2214 static long __vma_reservation_common(struct hstate *h, 2215 struct vm_area_struct *vma, unsigned long addr, 2216 enum vma_resv_mode mode) 2217 { 2218 struct resv_map *resv; 2219 pgoff_t idx; 2220 long ret; 2221 long dummy_out_regions_needed; 2222 2223 resv = vma_resv_map(vma); 2224 if (!resv) 2225 return 1; 2226 2227 idx = vma_hugecache_offset(h, vma, addr); 2228 switch (mode) { 2229 case VMA_NEEDS_RESV: 2230 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 2231 /* We assume that vma_reservation_* routines always operate on 2232 * 1 page, and that adding to resv map a 1 page entry can only 2233 * ever require 1 region. 2234 */ 2235 VM_BUG_ON(dummy_out_regions_needed != 1); 2236 break; 2237 case VMA_COMMIT_RESV: 2238 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2239 /* region_add calls of range 1 should never fail. */ 2240 VM_BUG_ON(ret < 0); 2241 break; 2242 case VMA_END_RESV: 2243 region_abort(resv, idx, idx + 1, 1); 2244 ret = 0; 2245 break; 2246 case VMA_ADD_RESV: 2247 if (vma->vm_flags & VM_MAYSHARE) { 2248 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2249 /* region_add calls of range 1 should never fail. */ 2250 VM_BUG_ON(ret < 0); 2251 } else { 2252 region_abort(resv, idx, idx + 1, 1); 2253 ret = region_del(resv, idx, idx + 1); 2254 } 2255 break; 2256 default: 2257 BUG(); 2258 } 2259 2260 if (vma->vm_flags & VM_MAYSHARE) 2261 return ret; 2262 else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { 2263 /* 2264 * In most cases, reserves always exist for private mappings. 2265 * However, a file associated with mapping could have been 2266 * hole punched or truncated after reserves were consumed. 2267 * As subsequent fault on such a range will not use reserves. 2268 * Subtle - The reserve map for private mappings has the 2269 * opposite meaning than that of shared mappings. If NO 2270 * entry is in the reserve map, it means a reservation exists. 2271 * If an entry exists in the reserve map, it means the 2272 * reservation has already been consumed. As a result, the 2273 * return value of this routine is the opposite of the 2274 * value returned from reserve map manipulation routines above. 2275 */ 2276 if (ret) 2277 return 0; 2278 else 2279 return 1; 2280 } 2281 else 2282 return ret < 0 ? 
ret : 0; 2283 } 2284 2285 static long vma_needs_reservation(struct hstate *h, 2286 struct vm_area_struct *vma, unsigned long addr) 2287 { 2288 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 2289 } 2290 2291 static long vma_commit_reservation(struct hstate *h, 2292 struct vm_area_struct *vma, unsigned long addr) 2293 { 2294 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 2295 } 2296 2297 static void vma_end_reservation(struct hstate *h, 2298 struct vm_area_struct *vma, unsigned long addr) 2299 { 2300 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 2301 } 2302 2303 static long vma_add_reservation(struct hstate *h, 2304 struct vm_area_struct *vma, unsigned long addr) 2305 { 2306 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 2307 } 2308 2309 /* 2310 * This routine is called to restore a reservation on error paths. In the 2311 * specific error paths, a huge page was allocated (via alloc_huge_page) 2312 * and is about to be freed. If a reservation for the page existed, 2313 * alloc_huge_page would have consumed the reservation and set PagePrivate 2314 * in the newly allocated page. When the page is freed via free_huge_page, 2315 * the global reservation count will be incremented if PagePrivate is set. 2316 * However, free_huge_page can not adjust the reserve map. Adjust the 2317 * reserve map here to be consistent with global reserve count adjustments 2318 * to be made by free_huge_page. 2319 */ 2320 static void restore_reserve_on_error(struct hstate *h, 2321 struct vm_area_struct *vma, unsigned long address, 2322 struct page *page) 2323 { 2324 if (unlikely(PagePrivate(page))) { 2325 long rc = vma_needs_reservation(h, vma, address); 2326 2327 if (unlikely(rc < 0)) { 2328 /* 2329 * Rare out of memory condition in reserve map 2330 * manipulation. Clear PagePrivate so that 2331 * global reserve count will not be incremented 2332 * by free_huge_page. This will make it appear 2333 * as though the reservation for this page was 2334 * consumed. This may prevent the task from 2335 * faulting in the page at a later time. This 2336 * is better than inconsistent global huge page 2337 * accounting of reserve counts. 2338 */ 2339 ClearPagePrivate(page); 2340 } else if (rc) { 2341 rc = vma_add_reservation(h, vma, address); 2342 if (unlikely(rc < 0)) 2343 /* 2344 * See above comment about rare out of 2345 * memory condition. 2346 */ 2347 ClearPagePrivate(page); 2348 } else 2349 vma_end_reservation(h, vma, address); 2350 } 2351 } 2352 2353 struct page *alloc_huge_page(struct vm_area_struct *vma, 2354 unsigned long addr, int avoid_reserve) 2355 { 2356 struct hugepage_subpool *spool = subpool_vma(vma); 2357 struct hstate *h = hstate_vma(vma); 2358 struct page *page; 2359 long map_chg, map_commit; 2360 long gbl_chg; 2361 int ret, idx; 2362 struct hugetlb_cgroup *h_cg; 2363 bool deferred_reserve; 2364 2365 idx = hstate_index(h); 2366 /* 2367 * Examine the region/reserve map to determine if the process 2368 * has a reservation for the page to be allocated. A return 2369 * code of zero indicates a reservation exists (no change). 2370 */ 2371 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 2372 if (map_chg < 0) 2373 return ERR_PTR(-ENOMEM); 2374 2375 /* 2376 * Processes that did not create the mapping will have no 2377 * reserves as indicated by the region/reserve map. Check 2378 * that the allocation will not exceed the subpool limit. 2379 * Allocations for MAP_NORESERVE mappings also need to be 2380 * checked against any subpool limit. 
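 *
 * (After the checks below, gbl_chg == 0 means an existing reservation,
 * per-VMA or subpool, will back the page; a non-zero value means the page
 * must come out of the global free pool or be newly allocated as a
 * surplus page.)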
2381 */ 2382 if (map_chg || avoid_reserve) { 2383 gbl_chg = hugepage_subpool_get_pages(spool, 1); 2384 if (gbl_chg < 0) { 2385 vma_end_reservation(h, vma, addr); 2386 return ERR_PTR(-ENOSPC); 2387 } 2388 2389 /* 2390 * Even though there was no reservation in the region/reserve 2391 * map, there could be reservations associated with the 2392 * subpool that can be used. This would be indicated if the 2393 * return value of hugepage_subpool_get_pages() is zero. 2394 * However, if avoid_reserve is specified we still avoid even 2395 * the subpool reservations. 2396 */ 2397 if (avoid_reserve) 2398 gbl_chg = 1; 2399 } 2400 2401 /* If this allocation is not consuming a reservation, charge it now. 2402 */ 2403 deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); 2404 if (deferred_reserve) { 2405 ret = hugetlb_cgroup_charge_cgroup_rsvd( 2406 idx, pages_per_huge_page(h), &h_cg); 2407 if (ret) 2408 goto out_subpool_put; 2409 } 2410 2411 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 2412 if (ret) 2413 goto out_uncharge_cgroup_reservation; 2414 2415 spin_lock(&hugetlb_lock); 2416 /* 2417 * glb_chg is passed to indicate whether or not a page must be taken 2418 * from the global free pool (global change). gbl_chg == 0 indicates 2419 * a reservation exists for the allocation. 2420 */ 2421 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); 2422 if (!page) { 2423 spin_unlock(&hugetlb_lock); 2424 page = alloc_buddy_huge_page_with_mpol(h, vma, addr); 2425 if (!page) 2426 goto out_uncharge_cgroup; 2427 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 2428 SetPagePrivate(page); 2429 h->resv_huge_pages--; 2430 } 2431 spin_lock(&hugetlb_lock); 2432 list_move(&page->lru, &h->hugepage_activelist); 2433 /* Fall through */ 2434 } 2435 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 2436 /* If allocation is not consuming a reservation, also store the 2437 * hugetlb_cgroup pointer on the page. 2438 */ 2439 if (deferred_reserve) { 2440 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 2441 h_cg, page); 2442 } 2443 2444 spin_unlock(&hugetlb_lock); 2445 2446 set_page_private(page, (unsigned long)spool); 2447 2448 map_commit = vma_commit_reservation(h, vma, addr); 2449 if (unlikely(map_chg > map_commit)) { 2450 /* 2451 * The page was added to the reservation map between 2452 * vma_needs_reservation and vma_commit_reservation. 2453 * This indicates a race with hugetlb_reserve_pages. 2454 * Adjust for the subpool count incremented above AND 2455 * in hugetlb_reserve_pages for the same page. Also, 2456 * the reservation count added in hugetlb_reserve_pages 2457 * no longer applies. 
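 *
 * Put another way: this call site and hugetlb_reserve_pages() both
 * charged the subpool for the same page, so one subpool page is returned
 * below and hugetlb_acct_memory() drops the matching global reservation.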
2458 */ 2459 long rsv_adjust; 2460 2461 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 2462 hugetlb_acct_memory(h, -rsv_adjust); 2463 } 2464 return page; 2465 2466 out_uncharge_cgroup: 2467 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 2468 out_uncharge_cgroup_reservation: 2469 if (deferred_reserve) 2470 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 2471 h_cg); 2472 out_subpool_put: 2473 if (map_chg || avoid_reserve) 2474 hugepage_subpool_put_pages(spool, 1); 2475 vma_end_reservation(h, vma, addr); 2476 return ERR_PTR(-ENOSPC); 2477 } 2478 2479 int alloc_bootmem_huge_page(struct hstate *h) 2480 __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); 2481 int __alloc_bootmem_huge_page(struct hstate *h) 2482 { 2483 struct huge_bootmem_page *m; 2484 int nr_nodes, node; 2485 2486 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 2487 void *addr; 2488 2489 addr = memblock_alloc_try_nid_raw( 2490 huge_page_size(h), huge_page_size(h), 2491 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); 2492 if (addr) { 2493 /* 2494 * Use the beginning of the huge page to store the 2495 * huge_bootmem_page struct (until gather_bootmem 2496 * puts them into the mem_map). 2497 */ 2498 m = addr; 2499 goto found; 2500 } 2501 } 2502 return 0; 2503 2504 found: 2505 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); 2506 /* Put them into a private list first because mem_map is not up yet */ 2507 INIT_LIST_HEAD(&m->list); 2508 list_add(&m->list, &huge_boot_pages); 2509 m->hstate = h; 2510 return 1; 2511 } 2512 2513 static void __init prep_compound_huge_page(struct page *page, 2514 unsigned int order) 2515 { 2516 if (unlikely(order > (MAX_ORDER - 1))) 2517 prep_compound_gigantic_page(page, order); 2518 else 2519 prep_compound_page(page, order); 2520 } 2521 2522 /* Put bootmem huge pages into the standard lists after mem_map is up */ 2523 static void __init gather_bootmem_prealloc(void) 2524 { 2525 struct huge_bootmem_page *m; 2526 2527 list_for_each_entry(m, &huge_boot_pages, list) { 2528 struct page *page = virt_to_page(m); 2529 struct hstate *h = m->hstate; 2530 2531 WARN_ON(page_count(page) != 1); 2532 prep_compound_huge_page(page, h->order); 2533 WARN_ON(PageReserved(page)); 2534 prep_new_huge_page(h, page, page_to_nid(page)); 2535 put_page(page); /* free it into the hugepage allocator */ 2536 2537 /* 2538 * If we had gigantic hugepages allocated at boot time, we need 2539 * to restore the 'stolen' pages to totalram_pages in order to 2540 * fix confusing memory reports from free(1) and another 2541 * side-effects, like CommitLimit going negative. 2542 */ 2543 if (hstate_is_gigantic(h)) 2544 adjust_managed_page_count(page, 1 << h->order); 2545 cond_resched(); 2546 } 2547 } 2548 2549 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 2550 { 2551 unsigned long i; 2552 nodemask_t *node_alloc_noretry; 2553 2554 if (!hstate_is_gigantic(h)) { 2555 /* 2556 * Bit mask controlling how hard we retry per-node allocations. 2557 * Ignore errors as lower level routines can deal with 2558 * node_alloc_noretry == NULL. If this kmalloc fails at boot 2559 * time, we are likely in bigger trouble. 
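 *
 * (node_alloc_noretry is a nodemask_t treated as a bitmap of nodes whose
 * previous allocation attempt failed; alloc_buddy_huge_page() sets and
 * clears bits in it so later iterations stop retrying hard on nodes that
 * are already depleted.)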
2560 */ 2561 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 2562 GFP_KERNEL); 2563 } else { 2564 /* allocations done at boot time */ 2565 node_alloc_noretry = NULL; 2566 } 2567 2568 /* bit mask controlling how hard we retry per-node allocations */ 2569 if (node_alloc_noretry) 2570 nodes_clear(*node_alloc_noretry); 2571 2572 for (i = 0; i < h->max_huge_pages; ++i) { 2573 if (hstate_is_gigantic(h)) { 2574 if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) { 2575 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); 2576 break; 2577 } 2578 if (!alloc_bootmem_huge_page(h)) 2579 break; 2580 } else if (!alloc_pool_huge_page(h, 2581 &node_states[N_MEMORY], 2582 node_alloc_noretry)) 2583 break; 2584 cond_resched(); 2585 } 2586 if (i < h->max_huge_pages) { 2587 char buf[32]; 2588 2589 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 2590 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", 2591 h->max_huge_pages, buf, i); 2592 h->max_huge_pages = i; 2593 } 2594 2595 kfree(node_alloc_noretry); 2596 } 2597 2598 static void __init hugetlb_init_hstates(void) 2599 { 2600 struct hstate *h; 2601 2602 for_each_hstate(h) { 2603 if (minimum_order > huge_page_order(h)) 2604 minimum_order = huge_page_order(h); 2605 2606 /* oversize hugepages were init'ed in early boot */ 2607 if (!hstate_is_gigantic(h)) 2608 hugetlb_hstate_alloc_pages(h); 2609 } 2610 VM_BUG_ON(minimum_order == UINT_MAX); 2611 } 2612 2613 static void __init report_hugepages(void) 2614 { 2615 struct hstate *h; 2616 2617 for_each_hstate(h) { 2618 char buf[32]; 2619 2620 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 2621 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", 2622 buf, h->free_huge_pages); 2623 } 2624 } 2625 2626 #ifdef CONFIG_HIGHMEM 2627 static void try_to_free_low(struct hstate *h, unsigned long count, 2628 nodemask_t *nodes_allowed) 2629 { 2630 int i; 2631 2632 if (hstate_is_gigantic(h)) 2633 return; 2634 2635 for_each_node_mask(i, *nodes_allowed) { 2636 struct page *page, *next; 2637 struct list_head *freel = &h->hugepage_freelists[i]; 2638 list_for_each_entry_safe(page, next, freel, lru) { 2639 if (count >= h->nr_huge_pages) 2640 return; 2641 if (PageHighMem(page)) 2642 continue; 2643 list_del(&page->lru); 2644 update_and_free_page(h, page); 2645 h->free_huge_pages--; 2646 h->free_huge_pages_node[page_to_nid(page)]--; 2647 } 2648 } 2649 } 2650 #else 2651 static inline void try_to_free_low(struct hstate *h, unsigned long count, 2652 nodemask_t *nodes_allowed) 2653 { 2654 } 2655 #endif 2656 2657 /* 2658 * Increment or decrement surplus_huge_pages. Keep node-specific counters 2659 * balanced by operating on them in a round-robin fashion. 2660 * Returns 1 if an adjustment was made. 
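 *
 * Caller sketch (mirroring set_max_huge_pages() below): growing the pool
 * first absorbs existing surplus pages, shrinking it converts persistent
 * pages back into surplus ones:
 *
 *	while (h->surplus_huge_pages && count > persistent_huge_pages(h))
 *		if (!adjust_pool_surplus(h, nodes_allowed, -1))
 *			break;
 *	...
 *	while (count < persistent_huge_pages(h))
 *		if (!adjust_pool_surplus(h, nodes_allowed, 1))
 *			break;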
2661 */ 2662 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 2663 int delta) 2664 { 2665 int nr_nodes, node; 2666 2667 VM_BUG_ON(delta != -1 && delta != 1); 2668 2669 if (delta < 0) { 2670 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 2671 if (h->surplus_huge_pages_node[node]) 2672 goto found; 2673 } 2674 } else { 2675 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 2676 if (h->surplus_huge_pages_node[node] < 2677 h->nr_huge_pages_node[node]) 2678 goto found; 2679 } 2680 } 2681 return 0; 2682 2683 found: 2684 h->surplus_huge_pages += delta; 2685 h->surplus_huge_pages_node[node] += delta; 2686 return 1; 2687 } 2688 2689 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 2690 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 2691 nodemask_t *nodes_allowed) 2692 { 2693 unsigned long min_count, ret; 2694 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 2695 2696 /* 2697 * Bit mask controlling how hard we retry per-node allocations. 2698 * If we can not allocate the bit mask, do not attempt to allocate 2699 * the requested huge pages. 2700 */ 2701 if (node_alloc_noretry) 2702 nodes_clear(*node_alloc_noretry); 2703 else 2704 return -ENOMEM; 2705 2706 spin_lock(&hugetlb_lock); 2707 2708 /* 2709 * Check for a node specific request. 2710 * Changing node specific huge page count may require a corresponding 2711 * change to the global count. In any case, the passed node mask 2712 * (nodes_allowed) will restrict alloc/free to the specified node. 2713 */ 2714 if (nid != NUMA_NO_NODE) { 2715 unsigned long old_count = count; 2716 2717 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 2718 /* 2719 * User may have specified a large count value which caused the 2720 * above calculation to overflow. In this case, they wanted 2721 * to allocate as many huge pages as possible. Set count to 2722 * largest possible value to align with their intention. 2723 */ 2724 if (count < old_count) 2725 count = ULONG_MAX; 2726 } 2727 2728 /* 2729 * Gigantic pages runtime allocation depend on the capability for large 2730 * page range allocation. 2731 * If the system does not provide this feature, return an error when 2732 * the user tries to allocate gigantic pages but let the user free the 2733 * boottime allocated gigantic pages. 2734 */ 2735 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 2736 if (count > persistent_huge_pages(h)) { 2737 spin_unlock(&hugetlb_lock); 2738 NODEMASK_FREE(node_alloc_noretry); 2739 return -EINVAL; 2740 } 2741 /* Fall through to decrease pool */ 2742 } 2743 2744 /* 2745 * Increase the pool size 2746 * First take pages out of surplus state. Then make up the 2747 * remaining difference by allocating fresh huge pages. 2748 * 2749 * We might race with alloc_surplus_huge_page() here and be unable 2750 * to convert a surplus huge page to a normal huge page. That is 2751 * not critical, though, it just means the overall size of the 2752 * pool might be one hugepage larger than it needs to be, but 2753 * within all the constraints specified by the sysctls. 2754 */ 2755 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 2756 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 2757 break; 2758 } 2759 2760 while (count > persistent_huge_pages(h)) { 2761 /* 2762 * If this allocation races such that we no longer need the 2763 * page, free_huge_page will handle it by freeing the page 2764 * and reducing the surplus. 
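 *
 * (hugetlb_lock is dropped across the allocation below because it can
 * sleep; the persistent page count is re-evaluated on every loop
 * iteration once the lock has been retaken.)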
2765 */ 2766 spin_unlock(&hugetlb_lock); 2767 2768 /* yield cpu to avoid soft lockup */ 2769 cond_resched(); 2770 2771 ret = alloc_pool_huge_page(h, nodes_allowed, 2772 node_alloc_noretry); 2773 spin_lock(&hugetlb_lock); 2774 if (!ret) 2775 goto out; 2776 2777 /* Bail for signals. Probably ctrl-c from user */ 2778 if (signal_pending(current)) 2779 goto out; 2780 } 2781 2782 /* 2783 * Decrease the pool size 2784 * First return free pages to the buddy allocator (being careful 2785 * to keep enough around to satisfy reservations). Then place 2786 * pages into surplus state as needed so the pool will shrink 2787 * to the desired size as pages become free. 2788 * 2789 * By placing pages into the surplus state independent of the 2790 * overcommit value, we are allowing the surplus pool size to 2791 * exceed overcommit. There are few sane options here. Since 2792 * alloc_surplus_huge_page() is checking the global counter, 2793 * though, we'll note that we're not allowed to exceed surplus 2794 * and won't grow the pool anywhere else. Not until one of the 2795 * sysctls are changed, or the surplus pages go out of use. 2796 */ 2797 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 2798 min_count = max(count, min_count); 2799 try_to_free_low(h, min_count, nodes_allowed); 2800 while (min_count < persistent_huge_pages(h)) { 2801 if (!free_pool_huge_page(h, nodes_allowed, 0)) 2802 break; 2803 cond_resched_lock(&hugetlb_lock); 2804 } 2805 while (count < persistent_huge_pages(h)) { 2806 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 2807 break; 2808 } 2809 out: 2810 h->max_huge_pages = persistent_huge_pages(h); 2811 spin_unlock(&hugetlb_lock); 2812 2813 NODEMASK_FREE(node_alloc_noretry); 2814 2815 return 0; 2816 } 2817 2818 #define HSTATE_ATTR_RO(_name) \ 2819 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 2820 2821 #define HSTATE_ATTR(_name) \ 2822 static struct kobj_attribute _name##_attr = \ 2823 __ATTR(_name, 0644, _name##_show, _name##_store) 2824 2825 static struct kobject *hugepages_kobj; 2826 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 2827 2828 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 2829 2830 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 2831 { 2832 int i; 2833 2834 for (i = 0; i < HUGE_MAX_HSTATE; i++) 2835 if (hstate_kobjs[i] == kobj) { 2836 if (nidp) 2837 *nidp = NUMA_NO_NODE; 2838 return &hstates[i]; 2839 } 2840 2841 return kobj_to_node_hstate(kobj, nidp); 2842 } 2843 2844 static ssize_t nr_hugepages_show_common(struct kobject *kobj, 2845 struct kobj_attribute *attr, char *buf) 2846 { 2847 struct hstate *h; 2848 unsigned long nr_huge_pages; 2849 int nid; 2850 2851 h = kobj_to_hstate(kobj, &nid); 2852 if (nid == NUMA_NO_NODE) 2853 nr_huge_pages = h->nr_huge_pages; 2854 else 2855 nr_huge_pages = h->nr_huge_pages_node[nid]; 2856 2857 return sprintf(buf, "%lu\n", nr_huge_pages); 2858 } 2859 2860 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 2861 struct hstate *h, int nid, 2862 unsigned long count, size_t len) 2863 { 2864 int err; 2865 nodemask_t nodes_allowed, *n_mask; 2866 2867 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 2868 return -EINVAL; 2869 2870 if (nid == NUMA_NO_NODE) { 2871 /* 2872 * global hstate attribute 2873 */ 2874 if (!(obey_mempolicy && 2875 init_nodemask_of_mempolicy(&nodes_allowed))) 2876 n_mask = &node_states[N_MEMORY]; 2877 else 2878 n_mask = &nodes_allowed; 2879 } else { 2880 /* 2881 * Node specific request. 
count adjustment happens in 2882 * set_max_huge_pages() after acquiring hugetlb_lock. 2883 */ 2884 init_nodemask_of_node(&nodes_allowed, nid); 2885 n_mask = &nodes_allowed; 2886 } 2887 2888 err = set_max_huge_pages(h, count, nid, n_mask); 2889 2890 return err ? err : len; 2891 } 2892 2893 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 2894 struct kobject *kobj, const char *buf, 2895 size_t len) 2896 { 2897 struct hstate *h; 2898 unsigned long count; 2899 int nid; 2900 int err; 2901 2902 err = kstrtoul(buf, 10, &count); 2903 if (err) 2904 return err; 2905 2906 h = kobj_to_hstate(kobj, &nid); 2907 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 2908 } 2909 2910 static ssize_t nr_hugepages_show(struct kobject *kobj, 2911 struct kobj_attribute *attr, char *buf) 2912 { 2913 return nr_hugepages_show_common(kobj, attr, buf); 2914 } 2915 2916 static ssize_t nr_hugepages_store(struct kobject *kobj, 2917 struct kobj_attribute *attr, const char *buf, size_t len) 2918 { 2919 return nr_hugepages_store_common(false, kobj, buf, len); 2920 } 2921 HSTATE_ATTR(nr_hugepages); 2922 2923 #ifdef CONFIG_NUMA 2924 2925 /* 2926 * hstate attribute for optionally mempolicy-based constraint on persistent 2927 * huge page alloc/free. 2928 */ 2929 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 2930 struct kobj_attribute *attr, char *buf) 2931 { 2932 return nr_hugepages_show_common(kobj, attr, buf); 2933 } 2934 2935 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 2936 struct kobj_attribute *attr, const char *buf, size_t len) 2937 { 2938 return nr_hugepages_store_common(true, kobj, buf, len); 2939 } 2940 HSTATE_ATTR(nr_hugepages_mempolicy); 2941 #endif 2942 2943 2944 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 2945 struct kobj_attribute *attr, char *buf) 2946 { 2947 struct hstate *h = kobj_to_hstate(kobj, NULL); 2948 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 2949 } 2950 2951 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 2952 struct kobj_attribute *attr, const char *buf, size_t count) 2953 { 2954 int err; 2955 unsigned long input; 2956 struct hstate *h = kobj_to_hstate(kobj, NULL); 2957 2958 if (hstate_is_gigantic(h)) 2959 return -EINVAL; 2960 2961 err = kstrtoul(buf, 10, &input); 2962 if (err) 2963 return err; 2964 2965 spin_lock(&hugetlb_lock); 2966 h->nr_overcommit_huge_pages = input; 2967 spin_unlock(&hugetlb_lock); 2968 2969 return count; 2970 } 2971 HSTATE_ATTR(nr_overcommit_hugepages); 2972 2973 static ssize_t free_hugepages_show(struct kobject *kobj, 2974 struct kobj_attribute *attr, char *buf) 2975 { 2976 struct hstate *h; 2977 unsigned long free_huge_pages; 2978 int nid; 2979 2980 h = kobj_to_hstate(kobj, &nid); 2981 if (nid == NUMA_NO_NODE) 2982 free_huge_pages = h->free_huge_pages; 2983 else 2984 free_huge_pages = h->free_huge_pages_node[nid]; 2985 2986 return sprintf(buf, "%lu\n", free_huge_pages); 2987 } 2988 HSTATE_ATTR_RO(free_hugepages); 2989 2990 static ssize_t resv_hugepages_show(struct kobject *kobj, 2991 struct kobj_attribute *attr, char *buf) 2992 { 2993 struct hstate *h = kobj_to_hstate(kobj, NULL); 2994 return sprintf(buf, "%lu\n", h->resv_huge_pages); 2995 } 2996 HSTATE_ATTR_RO(resv_hugepages); 2997 2998 static ssize_t surplus_hugepages_show(struct kobject *kobj, 2999 struct kobj_attribute *attr, char *buf) 3000 { 3001 struct hstate *h; 3002 unsigned long surplus_huge_pages; 3003 int nid; 3004 3005 h = kobj_to_hstate(kobj, &nid); 3006 if (nid == NUMA_NO_NODE) 3007 
surplus_huge_pages = h->surplus_huge_pages; 3008 else 3009 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 3010 3011 return sprintf(buf, "%lu\n", surplus_huge_pages); 3012 } 3013 HSTATE_ATTR_RO(surplus_hugepages); 3014 3015 static struct attribute *hstate_attrs[] = { 3016 &nr_hugepages_attr.attr, 3017 &nr_overcommit_hugepages_attr.attr, 3018 &free_hugepages_attr.attr, 3019 &resv_hugepages_attr.attr, 3020 &surplus_hugepages_attr.attr, 3021 #ifdef CONFIG_NUMA 3022 &nr_hugepages_mempolicy_attr.attr, 3023 #endif 3024 NULL, 3025 }; 3026 3027 static const struct attribute_group hstate_attr_group = { 3028 .attrs = hstate_attrs, 3029 }; 3030 3031 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 3032 struct kobject **hstate_kobjs, 3033 const struct attribute_group *hstate_attr_group) 3034 { 3035 int retval; 3036 int hi = hstate_index(h); 3037 3038 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 3039 if (!hstate_kobjs[hi]) 3040 return -ENOMEM; 3041 3042 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 3043 if (retval) 3044 kobject_put(hstate_kobjs[hi]); 3045 3046 return retval; 3047 } 3048 3049 static void __init hugetlb_sysfs_init(void) 3050 { 3051 struct hstate *h; 3052 int err; 3053 3054 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 3055 if (!hugepages_kobj) 3056 return; 3057 3058 for_each_hstate(h) { 3059 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 3060 hstate_kobjs, &hstate_attr_group); 3061 if (err) 3062 pr_err("HugeTLB: Unable to add hstate %s", h->name); 3063 } 3064 } 3065 3066 #ifdef CONFIG_NUMA 3067 3068 /* 3069 * node_hstate/s - associate per node hstate attributes, via their kobjects, 3070 * with node devices in node_devices[] using a parallel array. The array 3071 * index of a node device or _hstate == node id. 3072 * This is here to avoid any static dependency of the node device driver, in 3073 * the base kernel, on the hugetlb module. 3074 */ 3075 struct node_hstate { 3076 struct kobject *hugepages_kobj; 3077 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3078 }; 3079 static struct node_hstate node_hstates[MAX_NUMNODES]; 3080 3081 /* 3082 * A subset of global hstate attributes for node devices 3083 */ 3084 static struct attribute *per_node_hstate_attrs[] = { 3085 &nr_hugepages_attr.attr, 3086 &free_hugepages_attr.attr, 3087 &surplus_hugepages_attr.attr, 3088 NULL, 3089 }; 3090 3091 static const struct attribute_group per_node_hstate_attr_group = { 3092 .attrs = per_node_hstate_attrs, 3093 }; 3094 3095 /* 3096 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 3097 * Returns node id via non-NULL nidp. 3098 */ 3099 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3100 { 3101 int nid; 3102 3103 for (nid = 0; nid < nr_node_ids; nid++) { 3104 struct node_hstate *nhs = &node_hstates[nid]; 3105 int i; 3106 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3107 if (nhs->hstate_kobjs[i] == kobj) { 3108 if (nidp) 3109 *nidp = nid; 3110 return &hstates[i]; 3111 } 3112 } 3113 3114 BUG(); 3115 return NULL; 3116 } 3117 3118 /* 3119 * Unregister hstate attributes from a single node device. 3120 * No-op if no hstate attributes attached. 
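 *
 * Concretely, this tears down the per-node sysfs directories created by
 * hugetlb_register_node() below, e.g. (for a 2 MB hstate)
 * /sys/devices/system/node/nodeN/hugepages/hugepages-2048kB/ with its
 * nr_hugepages, free_hugepages and surplus_hugepages files.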
3121 */ 3122 static void hugetlb_unregister_node(struct node *node) 3123 { 3124 struct hstate *h; 3125 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3126 3127 if (!nhs->hugepages_kobj) 3128 return; /* no hstate attributes */ 3129 3130 for_each_hstate(h) { 3131 int idx = hstate_index(h); 3132 if (nhs->hstate_kobjs[idx]) { 3133 kobject_put(nhs->hstate_kobjs[idx]); 3134 nhs->hstate_kobjs[idx] = NULL; 3135 } 3136 } 3137 3138 kobject_put(nhs->hugepages_kobj); 3139 nhs->hugepages_kobj = NULL; 3140 } 3141 3142 3143 /* 3144 * Register hstate attributes for a single node device. 3145 * No-op if attributes already registered. 3146 */ 3147 static void hugetlb_register_node(struct node *node) 3148 { 3149 struct hstate *h; 3150 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3151 int err; 3152 3153 if (nhs->hugepages_kobj) 3154 return; /* already allocated */ 3155 3156 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 3157 &node->dev.kobj); 3158 if (!nhs->hugepages_kobj) 3159 return; 3160 3161 for_each_hstate(h) { 3162 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 3163 nhs->hstate_kobjs, 3164 &per_node_hstate_attr_group); 3165 if (err) { 3166 pr_err("HugeTLB: Unable to add hstate %s for node %d\n", 3167 h->name, node->dev.id); 3168 hugetlb_unregister_node(node); 3169 break; 3170 } 3171 } 3172 } 3173 3174 /* 3175 * hugetlb init time: register hstate attributes for all registered node 3176 * devices of nodes that have memory. All on-line nodes should have 3177 * registered their associated device by this time. 3178 */ 3179 static void __init hugetlb_register_all_nodes(void) 3180 { 3181 int nid; 3182 3183 for_each_node_state(nid, N_MEMORY) { 3184 struct node *node = node_devices[nid]; 3185 if (node->dev.id == nid) 3186 hugetlb_register_node(node); 3187 } 3188 3189 /* 3190 * Let the node device driver know we're here so it can 3191 * [un]register hstate attributes on node hotplug. 3192 */ 3193 register_hugetlbfs_with_node(hugetlb_register_node, 3194 hugetlb_unregister_node); 3195 } 3196 #else /* !CONFIG_NUMA */ 3197 3198 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3199 { 3200 BUG(); 3201 if (nidp) 3202 *nidp = -1; 3203 return NULL; 3204 } 3205 3206 static void hugetlb_register_all_nodes(void) { } 3207 3208 #endif 3209 3210 static int __init hugetlb_init(void) 3211 { 3212 int i; 3213 3214 if (!hugepages_supported()) { 3215 if (hugetlb_max_hstate || default_hstate_max_huge_pages) 3216 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); 3217 return 0; 3218 } 3219 3220 /* 3221 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some 3222 * architectures depend on setup being done here. 3223 */ 3224 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 3225 if (!parsed_default_hugepagesz) { 3226 /* 3227 * If we did not parse a default huge page size, set 3228 * default_hstate_idx to HPAGE_SIZE hstate. And, if the 3229 * number of huge pages for this default size was implicitly 3230 * specified, set that here as well. 3231 * Note that the implicit setting will overwrite an explicit 3232 * setting. A warning will be printed in this case. 
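 *
 * Example (assuming an architecture whose default huge page size is 2 MB):
 * the command line "hugepages=256 hugepagesz=2M hugepages=128" records 256
 * in default_hstate_max_huge_pages before any size is parsed and 128 in
 * the 2 MB hstate; since no default_hugepagesz= was given, the warning
 * below is printed and 256 wins.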
3233 */ 3234 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); 3235 if (default_hstate_max_huge_pages) { 3236 if (default_hstate.max_huge_pages) { 3237 char buf[32]; 3238 3239 string_get_size(huge_page_size(&default_hstate), 3240 1, STRING_UNITS_2, buf, 32); 3241 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", 3242 default_hstate.max_huge_pages, buf); 3243 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", 3244 default_hstate_max_huge_pages); 3245 } 3246 default_hstate.max_huge_pages = 3247 default_hstate_max_huge_pages; 3248 } 3249 } 3250 3251 hugetlb_cma_check(); 3252 hugetlb_init_hstates(); 3253 gather_bootmem_prealloc(); 3254 report_hugepages(); 3255 3256 hugetlb_sysfs_init(); 3257 hugetlb_register_all_nodes(); 3258 hugetlb_cgroup_file_init(); 3259 3260 #ifdef CONFIG_SMP 3261 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 3262 #else 3263 num_fault_mutexes = 1; 3264 #endif 3265 hugetlb_fault_mutex_table = 3266 kmalloc_array(num_fault_mutexes, sizeof(struct mutex), 3267 GFP_KERNEL); 3268 BUG_ON(!hugetlb_fault_mutex_table); 3269 3270 for (i = 0; i < num_fault_mutexes; i++) 3271 mutex_init(&hugetlb_fault_mutex_table[i]); 3272 return 0; 3273 } 3274 subsys_initcall(hugetlb_init); 3275 3276 /* Overwritten by architectures with more huge page sizes */ 3277 bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) 3278 { 3279 return size == HPAGE_SIZE; 3280 } 3281 3282 void __init hugetlb_add_hstate(unsigned int order) 3283 { 3284 struct hstate *h; 3285 unsigned long i; 3286 3287 if (size_to_hstate(PAGE_SIZE << order)) { 3288 return; 3289 } 3290 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 3291 BUG_ON(order == 0); 3292 h = &hstates[hugetlb_max_hstate++]; 3293 h->order = order; 3294 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 3295 h->nr_huge_pages = 0; 3296 h->free_huge_pages = 0; 3297 for (i = 0; i < MAX_NUMNODES; ++i) 3298 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 3299 INIT_LIST_HEAD(&h->hugepage_activelist); 3300 h->next_nid_to_alloc = first_memory_node; 3301 h->next_nid_to_free = first_memory_node; 3302 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 3303 huge_page_size(h)/1024); 3304 3305 parsed_hstate = h; 3306 } 3307 3308 /* 3309 * hugepages command line processing 3310 * hugepages normally follows a valid hugepagsz or default_hugepagsz 3311 * specification. If not, ignore the hugepages value. hugepages can also 3312 * be the first huge page command line option in which case it implicitly 3313 * specifies the number of huge pages for the default size. 3314 */ 3315 static int __init hugepages_setup(char *s) 3316 { 3317 unsigned long *mhp; 3318 static unsigned long *last_mhp; 3319 3320 if (!parsed_valid_hugepagesz) { 3321 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); 3322 parsed_valid_hugepagesz = true; 3323 return 0; 3324 } 3325 3326 /* 3327 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter 3328 * yet, so this hugepages= parameter goes to the "default hstate". 3329 * Otherwise, it goes with the previously parsed hugepagesz or 3330 * default_hugepagesz. 
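 *
 * For example, "hugepagesz=1G hugepages=4 hugepages=8" rejects the second
 * hugepages= (the same target specified twice without an intervening
 * hugepagesz=), while a lone "hugepages=4" applies to the architecture's
 * default huge page size.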
3331 */ 3332 else if (!hugetlb_max_hstate) 3333 mhp = &default_hstate_max_huge_pages; 3334 else 3335 mhp = &parsed_hstate->max_huge_pages; 3336 3337 if (mhp == last_mhp) { 3338 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); 3339 return 0; 3340 } 3341 3342 if (sscanf(s, "%lu", mhp) <= 0) 3343 *mhp = 0; 3344 3345 /* 3346 * Global state is always initialized later in hugetlb_init. 3347 * But we need to allocate >= MAX_ORDER hstates here early to still 3348 * use the bootmem allocator. 3349 */ 3350 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) 3351 hugetlb_hstate_alloc_pages(parsed_hstate); 3352 3353 last_mhp = mhp; 3354 3355 return 1; 3356 } 3357 __setup("hugepages=", hugepages_setup); 3358 3359 /* 3360 * hugepagesz command line processing 3361 * A specific huge page size can only be specified once with hugepagesz. 3362 * hugepagesz is followed by hugepages on the command line. The global 3363 * variable 'parsed_valid_hugepagesz' is used to determine if prior 3364 * hugepagesz argument was valid. 3365 */ 3366 static int __init hugepagesz_setup(char *s) 3367 { 3368 unsigned long size; 3369 struct hstate *h; 3370 3371 parsed_valid_hugepagesz = false; 3372 size = (unsigned long)memparse(s, NULL); 3373 3374 if (!arch_hugetlb_valid_size(size)) { 3375 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); 3376 return 0; 3377 } 3378 3379 h = size_to_hstate(size); 3380 if (h) { 3381 /* 3382 * hstate for this size already exists. This is normally 3383 * an error, but is allowed if the existing hstate is the 3384 * default hstate. More specifically, it is only allowed if 3385 * the number of huge pages for the default hstate was not 3386 * previously specified. 3387 */ 3388 if (!parsed_default_hugepagesz || h != &default_hstate || 3389 default_hstate.max_huge_pages) { 3390 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); 3391 return 0; 3392 } 3393 3394 /* 3395 * No need to call hugetlb_add_hstate() as hstate already 3396 * exists. But, do set parsed_hstate so that a following 3397 * hugepages= parameter will be applied to this hstate. 3398 */ 3399 parsed_hstate = h; 3400 parsed_valid_hugepagesz = true; 3401 return 1; 3402 } 3403 3404 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 3405 parsed_valid_hugepagesz = true; 3406 return 1; 3407 } 3408 __setup("hugepagesz=", hugepagesz_setup); 3409 3410 /* 3411 * default_hugepagesz command line input 3412 * Only one instance of default_hugepagesz allowed on command line. 3413 */ 3414 static int __init default_hugepagesz_setup(char *s) 3415 { 3416 unsigned long size; 3417 3418 parsed_valid_hugepagesz = false; 3419 if (parsed_default_hugepagesz) { 3420 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); 3421 return 0; 3422 } 3423 3424 size = (unsigned long)memparse(s, NULL); 3425 3426 if (!arch_hugetlb_valid_size(size)) { 3427 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); 3428 return 0; 3429 } 3430 3431 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 3432 parsed_valid_hugepagesz = true; 3433 parsed_default_hugepagesz = true; 3434 default_hstate_idx = hstate_index(size_to_hstate(size)); 3435 3436 /* 3437 * The number of default huge pages (for this size) could have been 3438 * specified as the first hugetlb parameter: hugepages=X. If so, 3439 * then default_hstate_max_huge_pages is set. If the default huge 3440 * page size is gigantic (>= MAX_ORDER), then the pages must be 3441 * allocated here from bootmem allocator. 
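 *
 * For example, "hugepages=16 default_hugepagesz=1G" works even though the
 * count comes first: the 16 is parked in default_hstate_max_huge_pages,
 * and the gigantic pages are then carved out right here, while the
 * bootmem allocator is still usable.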
3442 */ 3443 if (default_hstate_max_huge_pages) { 3444 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 3445 if (hstate_is_gigantic(&default_hstate)) 3446 hugetlb_hstate_alloc_pages(&default_hstate); 3447 default_hstate_max_huge_pages = 0; 3448 } 3449 3450 return 1; 3451 } 3452 __setup("default_hugepagesz=", default_hugepagesz_setup); 3453 3454 static unsigned int cpuset_mems_nr(unsigned int *array) 3455 { 3456 int node; 3457 unsigned int nr = 0; 3458 3459 for_each_node_mask(node, cpuset_current_mems_allowed) 3460 nr += array[node]; 3461 3462 return nr; 3463 } 3464 3465 #ifdef CONFIG_SYSCTL 3466 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 3467 struct ctl_table *table, int write, 3468 void *buffer, size_t *length, loff_t *ppos) 3469 { 3470 struct hstate *h = &default_hstate; 3471 unsigned long tmp = h->max_huge_pages; 3472 int ret; 3473 3474 if (!hugepages_supported()) 3475 return -EOPNOTSUPP; 3476 3477 table->data = &tmp; 3478 table->maxlen = sizeof(unsigned long); 3479 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 3480 if (ret) 3481 goto out; 3482 3483 if (write) 3484 ret = __nr_hugepages_store_common(obey_mempolicy, h, 3485 NUMA_NO_NODE, tmp, *length); 3486 out: 3487 return ret; 3488 } 3489 3490 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 3491 void *buffer, size_t *length, loff_t *ppos) 3492 { 3493 3494 return hugetlb_sysctl_handler_common(false, table, write, 3495 buffer, length, ppos); 3496 } 3497 3498 #ifdef CONFIG_NUMA 3499 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 3500 void *buffer, size_t *length, loff_t *ppos) 3501 { 3502 return hugetlb_sysctl_handler_common(true, table, write, 3503 buffer, length, ppos); 3504 } 3505 #endif /* CONFIG_NUMA */ 3506 3507 int hugetlb_overcommit_handler(struct ctl_table *table, int write, 3508 void *buffer, size_t *length, loff_t *ppos) 3509 { 3510 struct hstate *h = &default_hstate; 3511 unsigned long tmp; 3512 int ret; 3513 3514 if (!hugepages_supported()) 3515 return -EOPNOTSUPP; 3516 3517 tmp = h->nr_overcommit_huge_pages; 3518 3519 if (write && hstate_is_gigantic(h)) 3520 return -EINVAL; 3521 3522 table->data = &tmp; 3523 table->maxlen = sizeof(unsigned long); 3524 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 3525 if (ret) 3526 goto out; 3527 3528 if (write) { 3529 spin_lock(&hugetlb_lock); 3530 h->nr_overcommit_huge_pages = tmp; 3531 spin_unlock(&hugetlb_lock); 3532 } 3533 out: 3534 return ret; 3535 } 3536 3537 #endif /* CONFIG_SYSCTL */ 3538 3539 void hugetlb_report_meminfo(struct seq_file *m) 3540 { 3541 struct hstate *h; 3542 unsigned long total = 0; 3543 3544 if (!hugepages_supported()) 3545 return; 3546 3547 for_each_hstate(h) { 3548 unsigned long count = h->nr_huge_pages; 3549 3550 total += (PAGE_SIZE << huge_page_order(h)) * count; 3551 3552 if (h == &default_hstate) 3553 seq_printf(m, 3554 "HugePages_Total: %5lu\n" 3555 "HugePages_Free: %5lu\n" 3556 "HugePages_Rsvd: %5lu\n" 3557 "HugePages_Surp: %5lu\n" 3558 "Hugepagesize: %8lu kB\n", 3559 count, 3560 h->free_huge_pages, 3561 h->resv_huge_pages, 3562 h->surplus_huge_pages, 3563 (PAGE_SIZE << huge_page_order(h)) / 1024); 3564 } 3565 3566 seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); 3567 } 3568 3569 int hugetlb_report_node_meminfo(int nid, char *buf) 3570 { 3571 struct hstate *h = &default_hstate; 3572 if (!hugepages_supported()) 3573 return 0; 3574 return sprintf(buf, 3575 "Node %d HugePages_Total: %5u\n" 3576 "Node %d HugePages_Free: %5u\n" 3577 "Node %d 
HugePages_Surp: %5u\n",
3578 		nid, h->nr_huge_pages_node[nid],
3579 		nid, h->free_huge_pages_node[nid],
3580 		nid, h->surplus_huge_pages_node[nid]);
3581 }
3582 
3583 void hugetlb_show_meminfo(void)
3584 {
3585 	struct hstate *h;
3586 	int nid;
3587 
3588 	if (!hugepages_supported())
3589 		return;
3590 
3591 	for_each_node_state(nid, N_MEMORY)
3592 		for_each_hstate(h)
3593 			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
3594 				nid,
3595 				h->nr_huge_pages_node[nid],
3596 				h->free_huge_pages_node[nid],
3597 				h->surplus_huge_pages_node[nid],
3598 				1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
3599 }
3600 
3601 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
3602 {
3603 	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
3604 		   atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
3605 }
3606 
3607 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
3608 unsigned long hugetlb_total_pages(void)
3609 {
3610 	struct hstate *h;
3611 	unsigned long nr_total_pages = 0;
3612 
3613 	for_each_hstate(h)
3614 		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
3615 	return nr_total_pages;
3616 }
3617 
3618 static int hugetlb_acct_memory(struct hstate *h, long delta)
3619 {
3620 	int ret = -ENOMEM;
3621 
3622 	spin_lock(&hugetlb_lock);
3623 	/*
3624 	 * When cpuset is configured, it breaks the strict hugetlb page
3625 	 * reservation as the accounting is done on a global variable.  Such
3626 	 * a reservation is completely rubbish in the presence of cpuset
3627 	 * because the reservation is not checked against page availability
3628 	 * for the current cpuset.  An application can still potentially be
3629 	 * OOM-killed by the kernel for lack of free hugetlb pages in the
3630 	 * cpuset that the task is in.  Attempting to enforce strict
3631 	 * accounting with cpuset is almost impossible (or too ugly), because
3632 	 * cpusets are too fluid: tasks and memory nodes can be moved between
3633 	 * cpusets at any time.
3634 	 *
3635 	 * The change of semantics for shared hugetlb mappings with cpuset is
3636 	 * undesirable.  However, in order to preserve some of the semantics,
3637 	 * we fall back to checking current free page availability as a best
3638 	 * attempt, hopefully minimizing the impact of the semantic change
3639 	 * that cpuset introduces.
3640 	 */
3641 	if (delta > 0) {
3642 		if (gather_surplus_pages(h, delta) < 0)
3643 			goto out;
3644 
3645 		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
3646 			return_unused_surplus_pages(h, delta);
3647 			goto out;
3648 		}
3649 	}
3650 
3651 	ret = 0;
3652 	if (delta < 0)
3653 		return_unused_surplus_pages(h, (unsigned long) -delta);
3654 
3655 out:
3656 	spin_unlock(&hugetlb_lock);
3657 	return ret;
3658 }
3659 
3660 static void hugetlb_vm_op_open(struct vm_area_struct *vma)
3661 {
3662 	struct resv_map *resv = vma_resv_map(vma);
3663 
3664 	/*
3665 	 * This new VMA should share its sibling's reservation map if present.
3666 	 * The VMA will only ever have a valid reservation map pointer where
3667 	 * it is being copied for another still existing VMA.  As that VMA
3668 	 * has a reference to the reservation map it cannot disappear until
3669 	 * after this open call completes.  It is therefore safe to take a
	 * new reference here without additional locking.
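	 *
	 * (A typical trigger is a split of a private hugetlb VMA, e.g. by a
	 * partial munmap(): both halves then point at the same resv_map, so
	 * the reservation owner takes an extra reference on it here.)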
3670 */ 3671 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3672 kref_get(&resv->refs); 3673 } 3674 3675 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 3676 { 3677 struct hstate *h = hstate_vma(vma); 3678 struct resv_map *resv = vma_resv_map(vma); 3679 struct hugepage_subpool *spool = subpool_vma(vma); 3680 unsigned long reserve, start, end; 3681 long gbl_reserve; 3682 3683 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3684 return; 3685 3686 start = vma_hugecache_offset(h, vma, vma->vm_start); 3687 end = vma_hugecache_offset(h, vma, vma->vm_end); 3688 3689 reserve = (end - start) - region_count(resv, start, end); 3690 hugetlb_cgroup_uncharge_counter(resv, start, end); 3691 if (reserve) { 3692 /* 3693 * Decrement reserve counts. The global reserve count may be 3694 * adjusted if the subpool has a minimum size. 3695 */ 3696 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 3697 hugetlb_acct_memory(h, -gbl_reserve); 3698 } 3699 3700 kref_put(&resv->refs, resv_map_release); 3701 } 3702 3703 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) 3704 { 3705 if (addr & ~(huge_page_mask(hstate_vma(vma)))) 3706 return -EINVAL; 3707 return 0; 3708 } 3709 3710 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) 3711 { 3712 struct hstate *hstate = hstate_vma(vma); 3713 3714 return 1UL << huge_page_shift(hstate); 3715 } 3716 3717 /* 3718 * We cannot handle pagefaults against hugetlb pages at all. They cause 3719 * handle_mm_fault() to try to instantiate regular-sized pages in the 3720 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 3721 * this far. 3722 */ 3723 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) 3724 { 3725 BUG(); 3726 return 0; 3727 } 3728 3729 /* 3730 * When a new function is introduced to vm_operations_struct and added 3731 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 3732 * This is because under System V memory model, mappings created via 3733 * shmget/shmat with "huge page" specified are backed by hugetlbfs files, 3734 * their original vm_ops are overwritten with shm_vm_ops. 
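 *
 * Userspace sketch of such a mapping (illustrative only; assumes a 2 MB
 * default huge page size and sufficient pages in the pool):
 *
 *	int id = shmget(IPC_PRIVATE, 2UL << 20,
 *			SHM_HUGETLB | IPC_CREAT | 0600);
 *	void *p = shmat(id, NULL, 0);
 *
 * The resulting VMA is backed by a hugetlbfs file, but its vm_ops end up
 * pointing at shm_vm_ops rather than hugetlb_vm_ops directly.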
3735 */ 3736 const struct vm_operations_struct hugetlb_vm_ops = { 3737 .fault = hugetlb_vm_op_fault, 3738 .open = hugetlb_vm_op_open, 3739 .close = hugetlb_vm_op_close, 3740 .split = hugetlb_vm_op_split, 3741 .pagesize = hugetlb_vm_op_pagesize, 3742 }; 3743 3744 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 3745 int writable) 3746 { 3747 pte_t entry; 3748 3749 if (writable) { 3750 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 3751 vma->vm_page_prot))); 3752 } else { 3753 entry = huge_pte_wrprotect(mk_huge_pte(page, 3754 vma->vm_page_prot)); 3755 } 3756 entry = pte_mkyoung(entry); 3757 entry = pte_mkhuge(entry); 3758 entry = arch_make_huge_pte(entry, vma, page, writable); 3759 3760 return entry; 3761 } 3762 3763 static void set_huge_ptep_writable(struct vm_area_struct *vma, 3764 unsigned long address, pte_t *ptep) 3765 { 3766 pte_t entry; 3767 3768 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 3769 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 3770 update_mmu_cache(vma, address, ptep); 3771 } 3772 3773 bool is_hugetlb_entry_migration(pte_t pte) 3774 { 3775 swp_entry_t swp; 3776 3777 if (huge_pte_none(pte) || pte_present(pte)) 3778 return false; 3779 swp = pte_to_swp_entry(pte); 3780 if (non_swap_entry(swp) && is_migration_entry(swp)) 3781 return true; 3782 else 3783 return false; 3784 } 3785 3786 static int is_hugetlb_entry_hwpoisoned(pte_t pte) 3787 { 3788 swp_entry_t swp; 3789 3790 if (huge_pte_none(pte) || pte_present(pte)) 3791 return 0; 3792 swp = pte_to_swp_entry(pte); 3793 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) 3794 return 1; 3795 else 3796 return 0; 3797 } 3798 3799 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 3800 struct vm_area_struct *vma) 3801 { 3802 pte_t *src_pte, *dst_pte, entry, dst_entry; 3803 struct page *ptepage; 3804 unsigned long addr; 3805 int cow; 3806 struct hstate *h = hstate_vma(vma); 3807 unsigned long sz = huge_page_size(h); 3808 struct address_space *mapping = vma->vm_file->f_mapping; 3809 struct mmu_notifier_range range; 3810 int ret = 0; 3811 3812 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 3813 3814 if (cow) { 3815 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, 3816 vma->vm_start, 3817 vma->vm_end); 3818 mmu_notifier_invalidate_range_start(&range); 3819 } else { 3820 /* 3821 * For shared mappings i_mmap_rwsem must be held to call 3822 * huge_pte_alloc, otherwise the returned ptep could go 3823 * away if part of a shared pmd and another thread calls 3824 * huge_pmd_unshare. 3825 */ 3826 i_mmap_lock_read(mapping); 3827 } 3828 3829 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 3830 spinlock_t *src_ptl, *dst_ptl; 3831 src_pte = huge_pte_offset(src, addr, sz); 3832 if (!src_pte) 3833 continue; 3834 dst_pte = huge_pte_alloc(dst, addr, sz); 3835 if (!dst_pte) { 3836 ret = -ENOMEM; 3837 break; 3838 } 3839 3840 /* 3841 * If the pagetables are shared don't copy or take references. 3842 * dst_pte == src_pte is the common case of src/dest sharing. 3843 * 3844 * However, src could have 'unshared' and dst shares with 3845 * another vma. If dst_pte !none, this implies sharing. 3846 * Check here before taking page table lock, and once again 3847 * after taking the lock below. 
3848 */ 3849 dst_entry = huge_ptep_get(dst_pte); 3850 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) 3851 continue; 3852 3853 dst_ptl = huge_pte_lock(h, dst, dst_pte); 3854 src_ptl = huge_pte_lockptr(h, src, src_pte); 3855 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 3856 entry = huge_ptep_get(src_pte); 3857 dst_entry = huge_ptep_get(dst_pte); 3858 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { 3859 /* 3860 * Skip if src entry none. Also, skip in the 3861 * unlikely case dst entry !none as this implies 3862 * sharing with another vma. 3863 */ 3864 ; 3865 } else if (unlikely(is_hugetlb_entry_migration(entry) || 3866 is_hugetlb_entry_hwpoisoned(entry))) { 3867 swp_entry_t swp_entry = pte_to_swp_entry(entry); 3868 3869 if (is_write_migration_entry(swp_entry) && cow) { 3870 /* 3871 * COW mappings require pages in both 3872 * parent and child to be set to read. 3873 */ 3874 make_migration_entry_read(&swp_entry); 3875 entry = swp_entry_to_pte(swp_entry); 3876 set_huge_swap_pte_at(src, addr, src_pte, 3877 entry, sz); 3878 } 3879 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); 3880 } else { 3881 if (cow) { 3882 /* 3883 * No need to notify as we are downgrading page 3884 * table protection not changing it to point 3885 * to a new page. 3886 * 3887 * See Documentation/vm/mmu_notifier.rst 3888 */ 3889 huge_ptep_set_wrprotect(src, addr, src_pte); 3890 } 3891 entry = huge_ptep_get(src_pte); 3892 ptepage = pte_page(entry); 3893 get_page(ptepage); 3894 page_dup_rmap(ptepage, true); 3895 set_huge_pte_at(dst, addr, dst_pte, entry); 3896 hugetlb_count_add(pages_per_huge_page(h), dst); 3897 } 3898 spin_unlock(src_ptl); 3899 spin_unlock(dst_ptl); 3900 } 3901 3902 if (cow) 3903 mmu_notifier_invalidate_range_end(&range); 3904 else 3905 i_mmap_unlock_read(mapping); 3906 3907 return ret; 3908 } 3909 3910 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 3911 unsigned long start, unsigned long end, 3912 struct page *ref_page) 3913 { 3914 struct mm_struct *mm = vma->vm_mm; 3915 unsigned long address; 3916 pte_t *ptep; 3917 pte_t pte; 3918 spinlock_t *ptl; 3919 struct page *page; 3920 struct hstate *h = hstate_vma(vma); 3921 unsigned long sz = huge_page_size(h); 3922 struct mmu_notifier_range range; 3923 3924 WARN_ON(!is_vm_hugetlb_page(vma)); 3925 BUG_ON(start & ~huge_page_mask(h)); 3926 BUG_ON(end & ~huge_page_mask(h)); 3927 3928 /* 3929 * This is a hugetlb vma, all the pte entries should point 3930 * to huge page. 3931 */ 3932 tlb_change_page_size(tlb, sz); 3933 tlb_start_vma(tlb, vma); 3934 3935 /* 3936 * If sharing possible, alert mmu notifiers of worst case. 3937 */ 3938 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, 3939 end); 3940 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 3941 mmu_notifier_invalidate_range_start(&range); 3942 address = start; 3943 for (; address < end; address += sz) { 3944 ptep = huge_pte_offset(mm, address, sz); 3945 if (!ptep) 3946 continue; 3947 3948 ptl = huge_pte_lock(h, mm, ptep); 3949 if (huge_pmd_unshare(mm, &address, ptep)) { 3950 spin_unlock(ptl); 3951 /* 3952 * We just unmapped a page of PMDs by clearing a PUD. 3953 * The caller's TLB flush range should cover this area. 3954 */ 3955 continue; 3956 } 3957 3958 pte = huge_ptep_get(ptep); 3959 if (huge_pte_none(pte)) { 3960 spin_unlock(ptl); 3961 continue; 3962 } 3963 3964 /* 3965 * Migrating hugepage or HWPoisoned hugepage is already 3966 * unmapped and its refcount is dropped, so just clear pte here. 
3967 */ 3968 if (unlikely(!pte_present(pte))) { 3969 huge_pte_clear(mm, address, ptep, sz); 3970 spin_unlock(ptl); 3971 continue; 3972 } 3973 3974 page = pte_page(pte); 3975 /* 3976 * If a reference page is supplied, it is because a specific 3977 * page is being unmapped, not a range. Ensure the page we 3978 * are about to unmap is the actual page of interest. 3979 */ 3980 if (ref_page) { 3981 if (page != ref_page) { 3982 spin_unlock(ptl); 3983 continue; 3984 } 3985 /* 3986 * Mark the VMA as having unmapped its page so that 3987 * future faults in this VMA will fail rather than 3988 * looking like data was lost 3989 */ 3990 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 3991 } 3992 3993 pte = huge_ptep_get_and_clear(mm, address, ptep); 3994 tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 3995 if (huge_pte_dirty(pte)) 3996 set_page_dirty(page); 3997 3998 hugetlb_count_sub(pages_per_huge_page(h), mm); 3999 page_remove_rmap(page, true); 4000 4001 spin_unlock(ptl); 4002 tlb_remove_page_size(tlb, page, huge_page_size(h)); 4003 /* 4004 * Bail out after unmapping reference page if supplied 4005 */ 4006 if (ref_page) 4007 break; 4008 } 4009 mmu_notifier_invalidate_range_end(&range); 4010 tlb_end_vma(tlb, vma); 4011 } 4012 4013 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 4014 struct vm_area_struct *vma, unsigned long start, 4015 unsigned long end, struct page *ref_page) 4016 { 4017 __unmap_hugepage_range(tlb, vma, start, end, ref_page); 4018 4019 /* 4020 * Clear this flag so that x86's huge_pmd_share page_table_shareable 4021 * test will fail on a vma being torn down, and not grab a page table 4022 * on its way out. We're lucky that the flag has such an appropriate 4023 * name, and can in fact be safely cleared here. We could clear it 4024 * before the __unmap_hugepage_range above, but all that's necessary 4025 * is to clear it before releasing the i_mmap_rwsem. This works 4026 * because in the context this is called, the VMA is about to be 4027 * destroyed and the i_mmap_rwsem is held. 4028 */ 4029 vma->vm_flags &= ~VM_MAYSHARE; 4030 } 4031 4032 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 4033 unsigned long end, struct page *ref_page) 4034 { 4035 struct mm_struct *mm; 4036 struct mmu_gather tlb; 4037 unsigned long tlb_start = start; 4038 unsigned long tlb_end = end; 4039 4040 /* 4041 * If shared PMDs were possibly used within this vma range, adjust 4042 * start/end for worst case tlb flushing. 4043 * Note that we can not be sure if PMDs are shared until we try to 4044 * unmap pages. However, we want to make sure TLB flushing covers 4045 * the largest possible range. 4046 */ 4047 adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); 4048 4049 mm = vma->vm_mm; 4050 4051 tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); 4052 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 4053 tlb_finish_mmu(&tlb, tlb_start, tlb_end); 4054 } 4055 4056 /* 4057 * This is called when the original mapper is failing to COW a MAP_PRIVATE 4058 * mappping it owns the reserve page for. The intention is to unmap the page 4059 * from other VMAs and let the children be SIGKILLed if they are faulting the 4060 * same region. 
4061 */ 4062 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 4063 struct page *page, unsigned long address) 4064 { 4065 struct hstate *h = hstate_vma(vma); 4066 struct vm_area_struct *iter_vma; 4067 struct address_space *mapping; 4068 pgoff_t pgoff; 4069 4070 /* 4071 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 4072 * from page cache lookup which is in HPAGE_SIZE units. 4073 */ 4074 address = address & huge_page_mask(h); 4075 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 4076 vma->vm_pgoff; 4077 mapping = vma->vm_file->f_mapping; 4078 4079 /* 4080 * Take the mapping lock for the duration of the table walk. As 4081 * this mapping should be shared between all the VMAs, 4082 * __unmap_hugepage_range() is called as the lock is already held 4083 */ 4084 i_mmap_lock_write(mapping); 4085 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 4086 /* Do not unmap the current VMA */ 4087 if (iter_vma == vma) 4088 continue; 4089 4090 /* 4091 * Shared VMAs have their own reserves and do not affect 4092 * MAP_PRIVATE accounting but it is possible that a shared 4093 * VMA is using the same page so check and skip such VMAs. 4094 */ 4095 if (iter_vma->vm_flags & VM_MAYSHARE) 4096 continue; 4097 4098 /* 4099 * Unmap the page from other VMAs without their own reserves. 4100 * They get marked to be SIGKILLed if they fault in these 4101 * areas. This is because a future no-page fault on this VMA 4102 * could insert a zeroed page instead of the data existing 4103 * from the time of fork. This would look like data corruption 4104 */ 4105 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 4106 unmap_hugepage_range(iter_vma, address, 4107 address + huge_page_size(h), page); 4108 } 4109 i_mmap_unlock_write(mapping); 4110 } 4111 4112 /* 4113 * Hugetlb_cow() should be called with page lock of the original hugepage held. 4114 * Called with hugetlb_instantiation_mutex held and pte_page locked so we 4115 * cannot race with other handlers or page migration. 4116 * Keep the pte_same checks anyway to make transition from the mutex easier. 4117 */ 4118 static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 4119 unsigned long address, pte_t *ptep, 4120 struct page *pagecache_page, spinlock_t *ptl) 4121 { 4122 pte_t pte; 4123 struct hstate *h = hstate_vma(vma); 4124 struct page *old_page, *new_page; 4125 int outside_reserve = 0; 4126 vm_fault_t ret = 0; 4127 unsigned long haddr = address & huge_page_mask(h); 4128 struct mmu_notifier_range range; 4129 4130 pte = huge_ptep_get(ptep); 4131 old_page = pte_page(pte); 4132 4133 retry_avoidcopy: 4134 /* If no-one else is actually using this page, avoid the copy 4135 * and just make the page writable */ 4136 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 4137 page_move_anon_rmap(old_page, vma); 4138 set_huge_ptep_writable(vma, haddr, ptep); 4139 return 0; 4140 } 4141 4142 /* 4143 * If the process that created a MAP_PRIVATE mapping is about to 4144 * perform a COW due to a shared page count, attempt to satisfy 4145 * the allocation without using the existing reserves. The pagecache 4146 * page is used to determine if the reserve at this address was 4147 * consumed or not. If reserves were used, a partial faulted mapping 4148 * at the time of fork() could consume its reserves on COW instead 4149 * of the full address range. 
4150 */ 4151 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 4152 old_page != pagecache_page) 4153 outside_reserve = 1; 4154 4155 get_page(old_page); 4156 4157 /* 4158 * Drop page table lock as buddy allocator may be called. It will 4159 * be acquired again before returning to the caller, as expected. 4160 */ 4161 spin_unlock(ptl); 4162 new_page = alloc_huge_page(vma, haddr, outside_reserve); 4163 4164 if (IS_ERR(new_page)) { 4165 /* 4166 * If a process owning a MAP_PRIVATE mapping fails to COW, 4167 * it is due to references held by a child and an insufficient 4168 * huge page pool. To guarantee the original mappers 4169 * reliability, unmap the page from child processes. The child 4170 * may get SIGKILLed if it later faults. 4171 */ 4172 if (outside_reserve) { 4173 put_page(old_page); 4174 BUG_ON(huge_pte_none(pte)); 4175 unmap_ref_private(mm, vma, old_page, haddr); 4176 BUG_ON(huge_pte_none(pte)); 4177 spin_lock(ptl); 4178 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4179 if (likely(ptep && 4180 pte_same(huge_ptep_get(ptep), pte))) 4181 goto retry_avoidcopy; 4182 /* 4183 * race occurs while re-acquiring page table 4184 * lock, and our job is done. 4185 */ 4186 return 0; 4187 } 4188 4189 ret = vmf_error(PTR_ERR(new_page)); 4190 goto out_release_old; 4191 } 4192 4193 /* 4194 * When the original hugepage is shared one, it does not have 4195 * anon_vma prepared. 4196 */ 4197 if (unlikely(anon_vma_prepare(vma))) { 4198 ret = VM_FAULT_OOM; 4199 goto out_release_all; 4200 } 4201 4202 copy_user_huge_page(new_page, old_page, address, vma, 4203 pages_per_huge_page(h)); 4204 __SetPageUptodate(new_page); 4205 4206 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, 4207 haddr + huge_page_size(h)); 4208 mmu_notifier_invalidate_range_start(&range); 4209 4210 /* 4211 * Retake the page table lock to check for racing updates 4212 * before the page tables are altered 4213 */ 4214 spin_lock(ptl); 4215 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4216 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 4217 ClearPagePrivate(new_page); 4218 4219 /* Break COW */ 4220 huge_ptep_clear_flush(vma, haddr, ptep); 4221 mmu_notifier_invalidate_range(mm, range.start, range.end); 4222 set_huge_pte_at(mm, haddr, ptep, 4223 make_huge_pte(vma, new_page, 1)); 4224 page_remove_rmap(old_page, true); 4225 hugepage_add_new_anon_rmap(new_page, vma, haddr); 4226 set_page_huge_active(new_page); 4227 /* Make the old page be freed below */ 4228 new_page = old_page; 4229 } 4230 spin_unlock(ptl); 4231 mmu_notifier_invalidate_range_end(&range); 4232 out_release_all: 4233 restore_reserve_on_error(h, vma, haddr, new_page); 4234 put_page(new_page); 4235 out_release_old: 4236 put_page(old_page); 4237 4238 spin_lock(ptl); /* Caller expects lock to be held */ 4239 return ret; 4240 } 4241 4242 /* Return the pagecache page at a given address within a VMA */ 4243 static struct page *hugetlbfs_pagecache_page(struct hstate *h, 4244 struct vm_area_struct *vma, unsigned long address) 4245 { 4246 struct address_space *mapping; 4247 pgoff_t idx; 4248 4249 mapping = vma->vm_file->f_mapping; 4250 idx = vma_hugecache_offset(h, vma, address); 4251 4252 return find_lock_page(mapping, idx); 4253 } 4254 4255 /* 4256 * Return whether there is a pagecache page to back given address within VMA. 4257 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
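 *
 * Its caller uses it roughly like this (sketch; the real code is in
 * follow_hugetlb_page() below):
 *
 *	if (absent && (flags & FOLL_DUMP) &&
 *	    !hugetlbfs_pagecache_present(h, vma, vaddr))
 *		-- report a hole rather than faulting in a page, so that
 *		   sparse core dumps keep their holes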
4258 */ 4259 static bool hugetlbfs_pagecache_present(struct hstate *h, 4260 struct vm_area_struct *vma, unsigned long address) 4261 { 4262 struct address_space *mapping; 4263 pgoff_t idx; 4264 struct page *page; 4265 4266 mapping = vma->vm_file->f_mapping; 4267 idx = vma_hugecache_offset(h, vma, address); 4268 4269 page = find_get_page(mapping, idx); 4270 if (page) 4271 put_page(page); 4272 return page != NULL; 4273 } 4274 4275 int huge_add_to_page_cache(struct page *page, struct address_space *mapping, 4276 pgoff_t idx) 4277 { 4278 struct inode *inode = mapping->host; 4279 struct hstate *h = hstate_inode(inode); 4280 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 4281 4282 if (err) 4283 return err; 4284 ClearPagePrivate(page); 4285 4286 /* 4287 * set page dirty so that it will not be removed from cache/file 4288 * by non-hugetlbfs specific code paths. 4289 */ 4290 set_page_dirty(page); 4291 4292 spin_lock(&inode->i_lock); 4293 inode->i_blocks += blocks_per_huge_page(h); 4294 spin_unlock(&inode->i_lock); 4295 return 0; 4296 } 4297 4298 static vm_fault_t hugetlb_no_page(struct mm_struct *mm, 4299 struct vm_area_struct *vma, 4300 struct address_space *mapping, pgoff_t idx, 4301 unsigned long address, pte_t *ptep, unsigned int flags) 4302 { 4303 struct hstate *h = hstate_vma(vma); 4304 vm_fault_t ret = VM_FAULT_SIGBUS; 4305 int anon_rmap = 0; 4306 unsigned long size; 4307 struct page *page; 4308 pte_t new_pte; 4309 spinlock_t *ptl; 4310 unsigned long haddr = address & huge_page_mask(h); 4311 bool new_page = false; 4312 4313 /* 4314 * Currently, we are forced to kill the process in the event the 4315 * original mapper has unmapped pages from the child due to a failed 4316 * COW. Warn that such a situation has occurred as it may not be obvious 4317 */ 4318 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 4319 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 4320 current->pid); 4321 return ret; 4322 } 4323 4324 /* 4325 * We can not race with truncation due to holding i_mmap_rwsem. 4326 * i_size is modified when holding i_mmap_rwsem, so check here 4327 * once for faults beyond end of file. 4328 */ 4329 size = i_size_read(mapping->host) >> huge_page_shift(h); 4330 if (idx >= size) 4331 goto out; 4332 4333 retry: 4334 page = find_lock_page(mapping, idx); 4335 if (!page) { 4336 /* 4337 * Check for page in userfault range 4338 */ 4339 if (userfaultfd_missing(vma)) { 4340 u32 hash; 4341 struct vm_fault vmf = { 4342 .vma = vma, 4343 .address = haddr, 4344 .flags = flags, 4345 /* 4346 * Hard to debug if it ends up being 4347 * used by a callee that assumes 4348 * something about the other 4349 * uninitialized fields... same as in 4350 * memory.c 4351 */ 4352 }; 4353 4354 /* 4355 * hugetlb_fault_mutex and i_mmap_rwsem must be 4356 * dropped before handling userfault. Reacquire 4357 * after handling fault to make calling code simpler. 4358 */ 4359 hash = hugetlb_fault_mutex_hash(mapping, idx); 4360 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4361 i_mmap_unlock_read(mapping); 4362 ret = handle_userfault(&vmf, VM_UFFD_MISSING); 4363 i_mmap_lock_read(mapping); 4364 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4365 goto out; 4366 } 4367 4368 page = alloc_huge_page(vma, haddr, 0); 4369 if (IS_ERR(page)) { 4370 /* 4371 * Returning error will result in faulting task being 4372 * sent SIGBUS. The hugetlb fault mutex prevents two 4373 * tasks from racing to fault in the same page which 4374 * could result in false unable to allocate errors. 
4375 * Page migration does not take the fault mutex, but 4376 * does a clear then write of pte's under page table 4377 * lock. Page fault code could race with migration, 4378 * notice the clear pte and try to allocate a page 4379 * here. Before returning error, get ptl and make 4380 * sure there really is no pte entry. 4381 */ 4382 ptl = huge_pte_lock(h, mm, ptep); 4383 if (!huge_pte_none(huge_ptep_get(ptep))) { 4384 ret = 0; 4385 spin_unlock(ptl); 4386 goto out; 4387 } 4388 spin_unlock(ptl); 4389 ret = vmf_error(PTR_ERR(page)); 4390 goto out; 4391 } 4392 clear_huge_page(page, address, pages_per_huge_page(h)); 4393 __SetPageUptodate(page); 4394 new_page = true; 4395 4396 if (vma->vm_flags & VM_MAYSHARE) { 4397 int err = huge_add_to_page_cache(page, mapping, idx); 4398 if (err) { 4399 put_page(page); 4400 if (err == -EEXIST) 4401 goto retry; 4402 goto out; 4403 } 4404 } else { 4405 lock_page(page); 4406 if (unlikely(anon_vma_prepare(vma))) { 4407 ret = VM_FAULT_OOM; 4408 goto backout_unlocked; 4409 } 4410 anon_rmap = 1; 4411 } 4412 } else { 4413 /* 4414 * If memory error occurs between mmap() and fault, some process 4415 * don't have hwpoisoned swap entry for errored virtual address. 4416 * So we need to block hugepage fault by PG_hwpoison bit check. 4417 */ 4418 if (unlikely(PageHWPoison(page))) { 4419 ret = VM_FAULT_HWPOISON | 4420 VM_FAULT_SET_HINDEX(hstate_index(h)); 4421 goto backout_unlocked; 4422 } 4423 } 4424 4425 /* 4426 * If we are going to COW a private mapping later, we examine the 4427 * pending reservations for this page now. This will ensure that 4428 * any allocations necessary to record that reservation occur outside 4429 * the spinlock. 4430 */ 4431 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 4432 if (vma_needs_reservation(h, vma, haddr) < 0) { 4433 ret = VM_FAULT_OOM; 4434 goto backout_unlocked; 4435 } 4436 /* Just decrements count, does not deallocate */ 4437 vma_end_reservation(h, vma, haddr); 4438 } 4439 4440 ptl = huge_pte_lock(h, mm, ptep); 4441 ret = 0; 4442 if (!huge_pte_none(huge_ptep_get(ptep))) 4443 goto backout; 4444 4445 if (anon_rmap) { 4446 ClearPagePrivate(page); 4447 hugepage_add_new_anon_rmap(page, vma, haddr); 4448 } else 4449 page_dup_rmap(page, true); 4450 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 4451 && (vma->vm_flags & VM_SHARED))); 4452 set_huge_pte_at(mm, haddr, ptep, new_pte); 4453 4454 hugetlb_count_add(pages_per_huge_page(h), mm); 4455 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 4456 /* Optimization, do the COW without a second fault */ 4457 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); 4458 } 4459 4460 spin_unlock(ptl); 4461 4462 /* 4463 * Only make newly allocated pages active. Existing pages found 4464 * in the pagecache could be !page_huge_active() if they have been 4465 * isolated for migration. 
4466 */ 4467 if (new_page) 4468 set_page_huge_active(page); 4469 4470 unlock_page(page); 4471 out: 4472 return ret; 4473 4474 backout: 4475 spin_unlock(ptl); 4476 backout_unlocked: 4477 unlock_page(page); 4478 restore_reserve_on_error(h, vma, haddr, page); 4479 put_page(page); 4480 goto out; 4481 } 4482 4483 #ifdef CONFIG_SMP 4484 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 4485 { 4486 unsigned long key[2]; 4487 u32 hash; 4488 4489 key[0] = (unsigned long) mapping; 4490 key[1] = idx; 4491 4492 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); 4493 4494 return hash & (num_fault_mutexes - 1); 4495 } 4496 #else 4497 /* 4498 * For uniprocesor systems we always use a single mutex, so just 4499 * return 0 and avoid the hashing overhead. 4500 */ 4501 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 4502 { 4503 return 0; 4504 } 4505 #endif 4506 4507 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 4508 unsigned long address, unsigned int flags) 4509 { 4510 pte_t *ptep, entry; 4511 spinlock_t *ptl; 4512 vm_fault_t ret; 4513 u32 hash; 4514 pgoff_t idx; 4515 struct page *page = NULL; 4516 struct page *pagecache_page = NULL; 4517 struct hstate *h = hstate_vma(vma); 4518 struct address_space *mapping; 4519 int need_wait_lock = 0; 4520 unsigned long haddr = address & huge_page_mask(h); 4521 4522 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4523 if (ptep) { 4524 /* 4525 * Since we hold no locks, ptep could be stale. That is 4526 * OK as we are only making decisions based on content and 4527 * not actually modifying content here. 4528 */ 4529 entry = huge_ptep_get(ptep); 4530 if (unlikely(is_hugetlb_entry_migration(entry))) { 4531 migration_entry_wait_huge(vma, mm, ptep); 4532 return 0; 4533 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 4534 return VM_FAULT_HWPOISON_LARGE | 4535 VM_FAULT_SET_HINDEX(hstate_index(h)); 4536 } else { 4537 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); 4538 if (!ptep) 4539 return VM_FAULT_OOM; 4540 } 4541 4542 /* 4543 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold 4544 * until finished with ptep. This serves two purposes: 4545 * 1) It prevents huge_pmd_unshare from being called elsewhere 4546 * and making the ptep no longer valid. 4547 * 2) It synchronizes us with i_size modifications during truncation. 4548 * 4549 * ptep could have already be assigned via huge_pte_offset. That 4550 * is OK, as huge_pte_alloc will return the same value unless 4551 * something has changed. 4552 */ 4553 mapping = vma->vm_file->f_mapping; 4554 i_mmap_lock_read(mapping); 4555 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); 4556 if (!ptep) { 4557 i_mmap_unlock_read(mapping); 4558 return VM_FAULT_OOM; 4559 } 4560 4561 /* 4562 * Serialize hugepage allocation and instantiation, so that we don't 4563 * get spurious allocation failures if two CPUs race to instantiate 4564 * the same page in the page cache. 4565 */ 4566 idx = vma_hugecache_offset(h, vma, haddr); 4567 hash = hugetlb_fault_mutex_hash(mapping, idx); 4568 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4569 4570 entry = huge_ptep_get(ptep); 4571 if (huge_pte_none(entry)) { 4572 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 4573 goto out_mutex; 4574 } 4575 4576 ret = 0; 4577 4578 /* 4579 * entry could be a migration/hwpoison entry at this point, so this 4580 * check prevents the kernel from going below assuming that we have 4581 * an active hugepage in pagecache. 
This goto expects the 2nd page 4582 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will 4583 * properly handle it. 4584 */ 4585 if (!pte_present(entry)) 4586 goto out_mutex; 4587 4588 /* 4589 * If we are going to COW the mapping later, we examine the pending 4590 * reservations for this page now. This will ensure that any 4591 * allocations necessary to record that reservation occur outside the 4592 * spinlock. For private mappings, we also lookup the pagecache 4593 * page now as it is used to determine if a reservation has been 4594 * consumed. 4595 */ 4596 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 4597 if (vma_needs_reservation(h, vma, haddr) < 0) { 4598 ret = VM_FAULT_OOM; 4599 goto out_mutex; 4600 } 4601 /* Just decrements count, does not deallocate */ 4602 vma_end_reservation(h, vma, haddr); 4603 4604 if (!(vma->vm_flags & VM_MAYSHARE)) 4605 pagecache_page = hugetlbfs_pagecache_page(h, 4606 vma, haddr); 4607 } 4608 4609 ptl = huge_pte_lock(h, mm, ptep); 4610 4611 /* Check for a racing update before calling hugetlb_cow */ 4612 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 4613 goto out_ptl; 4614 4615 /* 4616 * hugetlb_cow() requires page locks of pte_page(entry) and 4617 * pagecache_page, so here we need take the former one 4618 * when page != pagecache_page or !pagecache_page. 4619 */ 4620 page = pte_page(entry); 4621 if (page != pagecache_page) 4622 if (!trylock_page(page)) { 4623 need_wait_lock = 1; 4624 goto out_ptl; 4625 } 4626 4627 get_page(page); 4628 4629 if (flags & FAULT_FLAG_WRITE) { 4630 if (!huge_pte_write(entry)) { 4631 ret = hugetlb_cow(mm, vma, address, ptep, 4632 pagecache_page, ptl); 4633 goto out_put_page; 4634 } 4635 entry = huge_pte_mkdirty(entry); 4636 } 4637 entry = pte_mkyoung(entry); 4638 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, 4639 flags & FAULT_FLAG_WRITE)) 4640 update_mmu_cache(vma, haddr, ptep); 4641 out_put_page: 4642 if (page != pagecache_page) 4643 unlock_page(page); 4644 put_page(page); 4645 out_ptl: 4646 spin_unlock(ptl); 4647 4648 if (pagecache_page) { 4649 unlock_page(pagecache_page); 4650 put_page(pagecache_page); 4651 } 4652 out_mutex: 4653 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4654 i_mmap_unlock_read(mapping); 4655 /* 4656 * Generally it's safe to hold refcount during waiting page lock. But 4657 * here we just wait to defer the next page fault to avoid busy loop and 4658 * the page is not used after unlocked before returning from the current 4659 * page fault. So we are safe from accessing freed page, even if we wait 4660 * here without taking refcount. 4661 */ 4662 if (need_wait_lock) 4663 wait_on_page_locked(page); 4664 return ret; 4665 } 4666 4667 /* 4668 * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with 4669 * modifications for huge pages. 
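 *
 * For context, the userspace side that ends up here looks roughly like this
 * (illustrative sketch, error handling omitted; dst must be a huge page
 * aligned address in a registered hugetlbfs VMA and len a multiple of the
 * huge page size):
 *
 *	struct uffdio_copy copy = {
 *		.dst  = dst_addr,
 *		.src  = (unsigned long)src_buf,
 *		.len  = huge_page_size,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_COPY, &copy);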
4670 */ 4671 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, 4672 pte_t *dst_pte, 4673 struct vm_area_struct *dst_vma, 4674 unsigned long dst_addr, 4675 unsigned long src_addr, 4676 struct page **pagep) 4677 { 4678 struct address_space *mapping; 4679 pgoff_t idx; 4680 unsigned long size; 4681 int vm_shared = dst_vma->vm_flags & VM_SHARED; 4682 struct hstate *h = hstate_vma(dst_vma); 4683 pte_t _dst_pte; 4684 spinlock_t *ptl; 4685 int ret; 4686 struct page *page; 4687 4688 if (!*pagep) { 4689 ret = -ENOMEM; 4690 page = alloc_huge_page(dst_vma, dst_addr, 0); 4691 if (IS_ERR(page)) 4692 goto out; 4693 4694 ret = copy_huge_page_from_user(page, 4695 (const void __user *) src_addr, 4696 pages_per_huge_page(h), false); 4697 4698 /* fallback to copy_from_user outside mmap_lock */ 4699 if (unlikely(ret)) { 4700 ret = -ENOENT; 4701 *pagep = page; 4702 /* don't free the page */ 4703 goto out; 4704 } 4705 } else { 4706 page = *pagep; 4707 *pagep = NULL; 4708 } 4709 4710 /* 4711 * The memory barrier inside __SetPageUptodate makes sure that 4712 * preceding stores to the page contents become visible before 4713 * the set_pte_at() write. 4714 */ 4715 __SetPageUptodate(page); 4716 4717 mapping = dst_vma->vm_file->f_mapping; 4718 idx = vma_hugecache_offset(h, dst_vma, dst_addr); 4719 4720 /* 4721 * If shared, add to page cache 4722 */ 4723 if (vm_shared) { 4724 size = i_size_read(mapping->host) >> huge_page_shift(h); 4725 ret = -EFAULT; 4726 if (idx >= size) 4727 goto out_release_nounlock; 4728 4729 /* 4730 * Serialization between remove_inode_hugepages() and 4731 * huge_add_to_page_cache() below happens through the 4732 * hugetlb_fault_mutex_table that here must be hold by 4733 * the caller. 4734 */ 4735 ret = huge_add_to_page_cache(page, mapping, idx); 4736 if (ret) 4737 goto out_release_nounlock; 4738 } 4739 4740 ptl = huge_pte_lockptr(h, dst_mm, dst_pte); 4741 spin_lock(ptl); 4742 4743 /* 4744 * Recheck the i_size after holding PT lock to make sure not 4745 * to leave any page mapped (as page_mapped()) beyond the end 4746 * of the i_size (remove_inode_hugepages() is strict about 4747 * enforcing that). If we bail out here, we'll also leave a 4748 * page in the radix tree in the vm_shared case beyond the end 4749 * of the i_size, but remove_inode_hugepages() will take care 4750 * of it as soon as we drop the hugetlb_fault_mutex_table. 
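 *
 * The mutex in question is taken by the UFFDIO_COPY path in
 * mm/userfaultfd.c; the expected caller pattern is roughly (sketch):
 *
 *	hash = hugetlb_fault_mutex_hash(mapping, idx);
 *	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 *	err = hugetlb_mcopy_atomic_pte(...);
 *	mutex_unlock(&hugetlb_fault_mutex_table[hash]);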
4751 */ 4752 size = i_size_read(mapping->host) >> huge_page_shift(h); 4753 ret = -EFAULT; 4754 if (idx >= size) 4755 goto out_release_unlock; 4756 4757 ret = -EEXIST; 4758 if (!huge_pte_none(huge_ptep_get(dst_pte))) 4759 goto out_release_unlock; 4760 4761 if (vm_shared) { 4762 page_dup_rmap(page, true); 4763 } else { 4764 ClearPagePrivate(page); 4765 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); 4766 } 4767 4768 _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); 4769 if (dst_vma->vm_flags & VM_WRITE) 4770 _dst_pte = huge_pte_mkdirty(_dst_pte); 4771 _dst_pte = pte_mkyoung(_dst_pte); 4772 4773 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 4774 4775 (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, 4776 dst_vma->vm_flags & VM_WRITE); 4777 hugetlb_count_add(pages_per_huge_page(h), dst_mm); 4778 4779 /* No need to invalidate - it was non-present before */ 4780 update_mmu_cache(dst_vma, dst_addr, dst_pte); 4781 4782 spin_unlock(ptl); 4783 set_page_huge_active(page); 4784 if (vm_shared) 4785 unlock_page(page); 4786 ret = 0; 4787 out: 4788 return ret; 4789 out_release_unlock: 4790 spin_unlock(ptl); 4791 if (vm_shared) 4792 unlock_page(page); 4793 out_release_nounlock: 4794 put_page(page); 4795 goto out; 4796 } 4797 4798 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 4799 struct page **pages, struct vm_area_struct **vmas, 4800 unsigned long *position, unsigned long *nr_pages, 4801 long i, unsigned int flags, int *locked) 4802 { 4803 unsigned long pfn_offset; 4804 unsigned long vaddr = *position; 4805 unsigned long remainder = *nr_pages; 4806 struct hstate *h = hstate_vma(vma); 4807 int err = -EFAULT; 4808 4809 while (vaddr < vma->vm_end && remainder) { 4810 pte_t *pte; 4811 spinlock_t *ptl = NULL; 4812 int absent; 4813 struct page *page; 4814 4815 /* 4816 * If we have a pending SIGKILL, don't keep faulting pages and 4817 * potentially allocating memory. 4818 */ 4819 if (fatal_signal_pending(current)) { 4820 remainder = 0; 4821 break; 4822 } 4823 4824 /* 4825 * Some archs (sparc64, sh*) have multiple pte_ts to 4826 * each hugepage. We have to make sure we get the 4827 * first, for the page indexing below to work. 4828 * 4829 * Note that page table lock is not held when pte is null. 4830 */ 4831 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), 4832 huge_page_size(h)); 4833 if (pte) 4834 ptl = huge_pte_lock(h, mm, pte); 4835 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 4836 4837 /* 4838 * When coredumping, it suits get_dump_page if we just return 4839 * an error where there's an empty slot with no huge pagecache 4840 * to back it. This way, we avoid allocating a hugepage, and 4841 * the sparse dumpfile avoids allocating disk blocks, but its 4842 * huge holes still show up with zeroes where they need to be. 4843 */ 4844 if (absent && (flags & FOLL_DUMP) && 4845 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 4846 if (pte) 4847 spin_unlock(ptl); 4848 remainder = 0; 4849 break; 4850 } 4851 4852 /* 4853 * We need call hugetlb_fault for both hugepages under migration 4854 * (in which case hugetlb_fault waits for the migration,) and 4855 * hwpoisoned hugepages (in which case we need to prevent the 4856 * caller from accessing to them.) In order to do this, we use 4857 * here is_swap_pte instead of is_hugetlb_entry_migration and 4858 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 4859 * both cases, and because we can't follow correct pages 4860 * directly from any kind of swap entries. 
4861 */ 4862 if (absent || is_swap_pte(huge_ptep_get(pte)) || 4863 ((flags & FOLL_WRITE) && 4864 !huge_pte_write(huge_ptep_get(pte)))) { 4865 vm_fault_t ret; 4866 unsigned int fault_flags = 0; 4867 4868 if (pte) 4869 spin_unlock(ptl); 4870 if (flags & FOLL_WRITE) 4871 fault_flags |= FAULT_FLAG_WRITE; 4872 if (locked) 4873 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 4874 FAULT_FLAG_KILLABLE; 4875 if (flags & FOLL_NOWAIT) 4876 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 4877 FAULT_FLAG_RETRY_NOWAIT; 4878 if (flags & FOLL_TRIED) { 4879 /* 4880 * Note: FAULT_FLAG_ALLOW_RETRY and 4881 * FAULT_FLAG_TRIED can co-exist 4882 */ 4883 fault_flags |= FAULT_FLAG_TRIED; 4884 } 4885 ret = hugetlb_fault(mm, vma, vaddr, fault_flags); 4886 if (ret & VM_FAULT_ERROR) { 4887 err = vm_fault_to_errno(ret, flags); 4888 remainder = 0; 4889 break; 4890 } 4891 if (ret & VM_FAULT_RETRY) { 4892 if (locked && 4893 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) 4894 *locked = 0; 4895 *nr_pages = 0; 4896 /* 4897 * VM_FAULT_RETRY must not return an 4898 * error, it will return zero 4899 * instead. 4900 * 4901 * No need to update "position" as the 4902 * caller will not check it after 4903 * *nr_pages is set to 0. 4904 */ 4905 return i; 4906 } 4907 continue; 4908 } 4909 4910 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 4911 page = pte_page(huge_ptep_get(pte)); 4912 4913 /* 4914 * If subpage information not requested, update counters 4915 * and skip the same_page loop below. 4916 */ 4917 if (!pages && !vmas && !pfn_offset && 4918 (vaddr + huge_page_size(h) < vma->vm_end) && 4919 (remainder >= pages_per_huge_page(h))) { 4920 vaddr += huge_page_size(h); 4921 remainder -= pages_per_huge_page(h); 4922 i += pages_per_huge_page(h); 4923 spin_unlock(ptl); 4924 continue; 4925 } 4926 4927 same_page: 4928 if (pages) { 4929 pages[i] = mem_map_offset(page, pfn_offset); 4930 /* 4931 * try_grab_page() should always succeed here, because: 4932 * a) we hold the ptl lock, and b) we've just checked 4933 * that the huge page is present in the page tables. If 4934 * the huge page is present, then the tail pages must 4935 * also be present. The ptl prevents the head page and 4936 * tail pages from being rearranged in any way. So this 4937 * page must be available at this point, unless the page 4938 * refcount overflowed: 4939 */ 4940 if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) { 4941 spin_unlock(ptl); 4942 remainder = 0; 4943 err = -ENOMEM; 4944 break; 4945 } 4946 } 4947 4948 if (vmas) 4949 vmas[i] = vma; 4950 4951 vaddr += PAGE_SIZE; 4952 ++pfn_offset; 4953 --remainder; 4954 ++i; 4955 if (vaddr < vma->vm_end && remainder && 4956 pfn_offset < pages_per_huge_page(h)) { 4957 /* 4958 * We use pfn_offset to avoid touching the pageframes 4959 * of this compound page. 4960 */ 4961 goto same_page; 4962 } 4963 spin_unlock(ptl); 4964 } 4965 *nr_pages = remainder; 4966 /* 4967 * setting position is actually required only if remainder is 4968 * not zero but it's faster not to add a "if (remainder)" 4969 * branch. 4970 */ 4971 *position = vaddr; 4972 4973 return i ? i : err; 4974 } 4975 4976 #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE 4977 /* 4978 * ARCHes with special requirements for evicting HUGETLB backing TLB entries can 4979 * implement this. 
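 *
 * An architecture that needs this would provide, in its hugetlb/tlbflush
 * headers, something along the lines of (sketch):
 *
 *	#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
 *	void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
 *				     unsigned long start, unsigned long end);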
4980 */ 4981 #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) 4982 #endif 4983 4984 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 4985 unsigned long address, unsigned long end, pgprot_t newprot) 4986 { 4987 struct mm_struct *mm = vma->vm_mm; 4988 unsigned long start = address; 4989 pte_t *ptep; 4990 pte_t pte; 4991 struct hstate *h = hstate_vma(vma); 4992 unsigned long pages = 0; 4993 bool shared_pmd = false; 4994 struct mmu_notifier_range range; 4995 4996 /* 4997 * In the case of shared PMDs, the area to flush could be beyond 4998 * start/end. Set range.start/range.end to cover the maximum possible 4999 * range if PMD sharing is possible. 5000 */ 5001 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 5002 0, vma, mm, start, end); 5003 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 5004 5005 BUG_ON(address >= end); 5006 flush_cache_range(vma, range.start, range.end); 5007 5008 mmu_notifier_invalidate_range_start(&range); 5009 i_mmap_lock_write(vma->vm_file->f_mapping); 5010 for (; address < end; address += huge_page_size(h)) { 5011 spinlock_t *ptl; 5012 ptep = huge_pte_offset(mm, address, huge_page_size(h)); 5013 if (!ptep) 5014 continue; 5015 ptl = huge_pte_lock(h, mm, ptep); 5016 if (huge_pmd_unshare(mm, &address, ptep)) { 5017 pages++; 5018 spin_unlock(ptl); 5019 shared_pmd = true; 5020 continue; 5021 } 5022 pte = huge_ptep_get(ptep); 5023 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 5024 spin_unlock(ptl); 5025 continue; 5026 } 5027 if (unlikely(is_hugetlb_entry_migration(pte))) { 5028 swp_entry_t entry = pte_to_swp_entry(pte); 5029 5030 if (is_write_migration_entry(entry)) { 5031 pte_t newpte; 5032 5033 make_migration_entry_read(&entry); 5034 newpte = swp_entry_to_pte(entry); 5035 set_huge_swap_pte_at(mm, address, ptep, 5036 newpte, huge_page_size(h)); 5037 pages++; 5038 } 5039 spin_unlock(ptl); 5040 continue; 5041 } 5042 if (!huge_pte_none(pte)) { 5043 pte_t old_pte; 5044 5045 old_pte = huge_ptep_modify_prot_start(vma, address, ptep); 5046 pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); 5047 pte = arch_make_huge_pte(pte, vma, NULL, 0); 5048 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); 5049 pages++; 5050 } 5051 spin_unlock(ptl); 5052 } 5053 /* 5054 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 5055 * may have cleared our pud entry and done put_page on the page table: 5056 * once we release i_mmap_rwsem, another task can do the final put_page 5057 * and that page table be reused and filled with junk. If we actually 5058 * did unshare a page of pmds, flush the range corresponding to the pud. 5059 */ 5060 if (shared_pmd) 5061 flush_hugetlb_tlb_range(vma, range.start, range.end); 5062 else 5063 flush_hugetlb_tlb_range(vma, start, end); 5064 /* 5065 * No need to call mmu_notifier_invalidate_range() we are downgrading 5066 * page table protection not changing it to point to a new page. 
5067 * 5068 * See Documentation/vm/mmu_notifier.rst 5069 */ 5070 i_mmap_unlock_write(vma->vm_file->f_mapping); 5071 mmu_notifier_invalidate_range_end(&range); 5072 5073 return pages << h->order; 5074 } 5075 5076 int hugetlb_reserve_pages(struct inode *inode, 5077 long from, long to, 5078 struct vm_area_struct *vma, 5079 vm_flags_t vm_flags) 5080 { 5081 long ret, chg, add = -1; 5082 struct hstate *h = hstate_inode(inode); 5083 struct hugepage_subpool *spool = subpool_inode(inode); 5084 struct resv_map *resv_map; 5085 struct hugetlb_cgroup *h_cg = NULL; 5086 long gbl_reserve, regions_needed = 0; 5087 5088 /* This should never happen */ 5089 if (from > to) { 5090 VM_WARN(1, "%s called with a negative range\n", __func__); 5091 return -EINVAL; 5092 } 5093 5094 /* 5095 * Only apply hugepage reservation if asked. At fault time, an 5096 * attempt will be made for VM_NORESERVE to allocate a page 5097 * without using reserves 5098 */ 5099 if (vm_flags & VM_NORESERVE) 5100 return 0; 5101 5102 /* 5103 * Shared mappings base their reservation on the number of pages that 5104 * are already allocated on behalf of the file. Private mappings need 5105 * to reserve the full area even if read-only as mprotect() may be 5106 * called to make the mapping read-write. Assume !vma is a shm mapping 5107 */ 5108 if (!vma || vma->vm_flags & VM_MAYSHARE) { 5109 /* 5110 * resv_map can not be NULL as hugetlb_reserve_pages is only 5111 * called for inodes for which resv_maps were created (see 5112 * hugetlbfs_get_inode). 5113 */ 5114 resv_map = inode_resv_map(inode); 5115 5116 chg = region_chg(resv_map, from, to, ®ions_needed); 5117 5118 } else { 5119 /* Private mapping. */ 5120 resv_map = resv_map_alloc(); 5121 if (!resv_map) 5122 return -ENOMEM; 5123 5124 chg = to - from; 5125 5126 set_vma_resv_map(vma, resv_map); 5127 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 5128 } 5129 5130 if (chg < 0) { 5131 ret = chg; 5132 goto out_err; 5133 } 5134 5135 ret = hugetlb_cgroup_charge_cgroup_rsvd( 5136 hstate_index(h), chg * pages_per_huge_page(h), &h_cg); 5137 5138 if (ret < 0) { 5139 ret = -ENOMEM; 5140 goto out_err; 5141 } 5142 5143 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 5144 /* For private mappings, the hugetlb_cgroup uncharge info hangs 5145 * of the resv_map. 5146 */ 5147 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 5148 } 5149 5150 /* 5151 * There must be enough pages in the subpool for the mapping. If 5152 * the subpool has a minimum size, there may be some global 5153 * reservations already in place (gbl_reserve). 5154 */ 5155 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 5156 if (gbl_reserve < 0) { 5157 ret = -ENOSPC; 5158 goto out_uncharge_cgroup; 5159 } 5160 5161 /* 5162 * Check enough hugepages are available for the reservation. 5163 * Hand the pages back to the subpool if there are not 5164 */ 5165 ret = hugetlb_acct_memory(h, gbl_reserve); 5166 if (ret < 0) { 5167 goto out_put_pages; 5168 } 5169 5170 /* 5171 * Account for the reservations made. Shared mappings record regions 5172 * that have reservations as they are shared by multiple VMAs. 5173 * When the last VMA disappears, the region map says how much 5174 * the reservation was and the page cache tells how much of 5175 * the reservation was consumed. Private mappings are per-VMA and 5176 * only the consumed reservations are tracked. When the VMA 5177 * disappears, the original reservation is the VMA size and the 5178 * consumed reservations are stored in the map. 
Hence, nothing 5179 * else has to be done for private mappings here 5180 */ 5181 if (!vma || vma->vm_flags & VM_MAYSHARE) { 5182 add = region_add(resv_map, from, to, regions_needed, h, h_cg); 5183 5184 if (unlikely(add < 0)) { 5185 hugetlb_acct_memory(h, -gbl_reserve); 5186 goto out_put_pages; 5187 } else if (unlikely(chg > add)) { 5188 /* 5189 * pages in this range were added to the reserve 5190 * map between region_chg and region_add. This 5191 * indicates a race with alloc_huge_page. Adjust 5192 * the subpool and reserve counts modified above 5193 * based on the difference. 5194 */ 5195 long rsv_adjust; 5196 5197 hugetlb_cgroup_uncharge_cgroup_rsvd( 5198 hstate_index(h), 5199 (chg - add) * pages_per_huge_page(h), h_cg); 5200 5201 rsv_adjust = hugepage_subpool_put_pages(spool, 5202 chg - add); 5203 hugetlb_acct_memory(h, -rsv_adjust); 5204 } 5205 } 5206 return 0; 5207 out_put_pages: 5208 /* put back original number of pages, chg */ 5209 (void)hugepage_subpool_put_pages(spool, chg); 5210 out_uncharge_cgroup: 5211 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 5212 chg * pages_per_huge_page(h), h_cg); 5213 out_err: 5214 if (!vma || vma->vm_flags & VM_MAYSHARE) 5215 /* Only call region_abort if the region_chg succeeded but the 5216 * region_add failed or didn't run. 5217 */ 5218 if (chg >= 0 && add < 0) 5219 region_abort(resv_map, from, to, regions_needed); 5220 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 5221 kref_put(&resv_map->refs, resv_map_release); 5222 return ret; 5223 } 5224 5225 long hugetlb_unreserve_pages(struct inode *inode, long start, long end, 5226 long freed) 5227 { 5228 struct hstate *h = hstate_inode(inode); 5229 struct resv_map *resv_map = inode_resv_map(inode); 5230 long chg = 0; 5231 struct hugepage_subpool *spool = subpool_inode(inode); 5232 long gbl_reserve; 5233 5234 /* 5235 * Since this routine can be called in the evict inode path for all 5236 * hugetlbfs inodes, resv_map could be NULL. 5237 */ 5238 if (resv_map) { 5239 chg = region_del(resv_map, start, end); 5240 /* 5241 * region_del() can fail in the rare case where a region 5242 * must be split and another region descriptor can not be 5243 * allocated. If end == LONG_MAX, it will not fail. 5244 */ 5245 if (chg < 0) 5246 return chg; 5247 } 5248 5249 spin_lock(&inode->i_lock); 5250 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 5251 spin_unlock(&inode->i_lock); 5252 5253 /* 5254 * If the subpool has a minimum size, the number of global 5255 * reservations to be released may be adjusted. 5256 */ 5257 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 5258 hugetlb_acct_memory(h, -gbl_reserve); 5259 5260 return 0; 5261 } 5262 5263 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 5264 static unsigned long page_table_shareable(struct vm_area_struct *svma, 5265 struct vm_area_struct *vma, 5266 unsigned long addr, pgoff_t idx) 5267 { 5268 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 5269 svma->vm_start; 5270 unsigned long sbase = saddr & PUD_MASK; 5271 unsigned long s_end = sbase + PUD_SIZE; 5272 5273 /* Allow segments to share if only one is marked locked */ 5274 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; 5275 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; 5276 5277 /* 5278 * match the virtual addresses, permission and the alignment of the 5279 * page table page. 
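 *
 * Worked example (sketch; PUD_SIZE = 1 GiB, both mappings start at file
 * offset 0):
 *	svma: vm_start = 0x40000000, vm_end = 0x80000000, vm_pgoff = 0
 *	vma:  vm_start = 0xc0000000, addr = 0xc0000000, so idx = 0
 *	saddr = ((0 - 0) << PAGE_SHIFT) + 0x40000000 = 0x40000000
 *	sbase = 0x40000000, s_end = 0x80000000, which lies entirely inside
 *	svma; the pmd_index()es match and the flags agree, so 0x40000000 is
 *	returned and the caller can share svma's PMD page for that 1 GiB.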
5280 */ 5281 if (pmd_index(addr) != pmd_index(saddr) || 5282 vm_flags != svm_flags || 5283 sbase < svma->vm_start || svma->vm_end < s_end) 5284 return 0; 5285 5286 return saddr; 5287 } 5288 5289 static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) 5290 { 5291 unsigned long base = addr & PUD_MASK; 5292 unsigned long end = base + PUD_SIZE; 5293 5294 /* 5295 * check on proper vm_flags and page table alignment 5296 */ 5297 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) 5298 return true; 5299 return false; 5300 } 5301 5302 /* 5303 * Determine if start,end range within vma could be mapped by shared pmd. 5304 * If yes, adjust start and end to cover range associated with possible 5305 * shared pmd mappings. 5306 */ 5307 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 5308 unsigned long *start, unsigned long *end) 5309 { 5310 unsigned long check_addr; 5311 5312 if (!(vma->vm_flags & VM_MAYSHARE)) 5313 return; 5314 5315 for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { 5316 unsigned long a_start = check_addr & PUD_MASK; 5317 unsigned long a_end = a_start + PUD_SIZE; 5318 5319 /* 5320 * If sharing is possible, adjust start/end if necessary. 5321 */ 5322 if (range_in_vma(vma, a_start, a_end)) { 5323 if (a_start < *start) 5324 *start = a_start; 5325 if (a_end > *end) 5326 *end = a_end; 5327 } 5328 } 5329 } 5330 5331 /* 5332 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 5333 * and returns the corresponding pte. While this is not necessary for the 5334 * !shared pmd case because we can allocate the pmd later as well, it makes the 5335 * code much cleaner. 5336 * 5337 * This routine must be called with i_mmap_rwsem held in at least read mode. 5338 * For hugetlbfs, this prevents removal of any page table entries associated 5339 * with the address space. This is important as we are setting up sharing 5340 * based on existing page table entries (mappings). 5341 */ 5342 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 5343 { 5344 struct vm_area_struct *vma = find_vma(mm, addr); 5345 struct address_space *mapping = vma->vm_file->f_mapping; 5346 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 5347 vma->vm_pgoff; 5348 struct vm_area_struct *svma; 5349 unsigned long saddr; 5350 pte_t *spte = NULL; 5351 pte_t *pte; 5352 spinlock_t *ptl; 5353 5354 if (!vma_shareable(vma, addr)) 5355 return (pte_t *)pmd_alloc(mm, pud, addr); 5356 5357 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 5358 if (svma == vma) 5359 continue; 5360 5361 saddr = page_table_shareable(svma, vma, addr, idx); 5362 if (saddr) { 5363 spte = huge_pte_offset(svma->vm_mm, saddr, 5364 vma_mmu_pagesize(svma)); 5365 if (spte) { 5366 get_page(virt_to_page(spte)); 5367 break; 5368 } 5369 } 5370 } 5371 5372 if (!spte) 5373 goto out; 5374 5375 ptl = huge_pte_lock(hstate_vma(vma), mm, spte); 5376 if (pud_none(*pud)) { 5377 pud_populate(mm, pud, 5378 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 5379 mm_inc_nr_pmds(mm); 5380 } else { 5381 put_page(virt_to_page(spte)); 5382 } 5383 spin_unlock(ptl); 5384 out: 5385 pte = (pte_t *)pmd_alloc(mm, pud, addr); 5386 return pte; 5387 } 5388 5389 /* 5390 * unmap huge page backed by shared pte. 5391 * 5392 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 5393 * indicated by page_count > 1, unmap is achieved by clearing pud and 5394 * decrementing the ref count. If count == 1, the pte page is not shared. 
5395 * 5396 * Called with page table lock held and i_mmap_rwsem held in write mode. 5397 * 5398 * returns: 1 successfully unmapped a shared pte page 5399 * 0 the underlying pte page is not shared, or it is the last user 5400 */ 5401 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 5402 { 5403 pgd_t *pgd = pgd_offset(mm, *addr); 5404 p4d_t *p4d = p4d_offset(pgd, *addr); 5405 pud_t *pud = pud_offset(p4d, *addr); 5406 5407 BUG_ON(page_count(virt_to_page(ptep)) == 0); 5408 if (page_count(virt_to_page(ptep)) == 1) 5409 return 0; 5410 5411 pud_clear(pud); 5412 put_page(virt_to_page(ptep)); 5413 mm_dec_nr_pmds(mm); 5414 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 5415 return 1; 5416 } 5417 #define want_pmd_share() (1) 5418 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 5419 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 5420 { 5421 return NULL; 5422 } 5423 5424 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 5425 { 5426 return 0; 5427 } 5428 5429 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 5430 unsigned long *start, unsigned long *end) 5431 { 5432 } 5433 #define want_pmd_share() (0) 5434 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 5435 5436 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 5437 pte_t *huge_pte_alloc(struct mm_struct *mm, 5438 unsigned long addr, unsigned long sz) 5439 { 5440 pgd_t *pgd; 5441 p4d_t *p4d; 5442 pud_t *pud; 5443 pte_t *pte = NULL; 5444 5445 pgd = pgd_offset(mm, addr); 5446 p4d = p4d_alloc(mm, pgd, addr); 5447 if (!p4d) 5448 return NULL; 5449 pud = pud_alloc(mm, p4d, addr); 5450 if (pud) { 5451 if (sz == PUD_SIZE) { 5452 pte = (pte_t *)pud; 5453 } else { 5454 BUG_ON(sz != PMD_SIZE); 5455 if (want_pmd_share() && pud_none(*pud)) 5456 pte = huge_pmd_share(mm, addr, pud); 5457 else 5458 pte = (pte_t *)pmd_alloc(mm, pud, addr); 5459 } 5460 } 5461 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); 5462 5463 return pte; 5464 } 5465 5466 /* 5467 * huge_pte_offset() - Walk the page table to resolve the hugepage 5468 * entry at address @addr 5469 * 5470 * Return: Pointer to page table entry (PUD or PMD) for 5471 * address @addr, or NULL if a !p*d_present() entry is encountered and the 5472 * size @sz doesn't match the hugepage size at this level of the page 5473 * table. 5474 */ 5475 pte_t *huge_pte_offset(struct mm_struct *mm, 5476 unsigned long addr, unsigned long sz) 5477 { 5478 pgd_t *pgd; 5479 p4d_t *p4d; 5480 pud_t *pud; 5481 pmd_t *pmd; 5482 5483 pgd = pgd_offset(mm, addr); 5484 if (!pgd_present(*pgd)) 5485 return NULL; 5486 p4d = p4d_offset(pgd, addr); 5487 if (!p4d_present(*p4d)) 5488 return NULL; 5489 5490 pud = pud_offset(p4d, addr); 5491 if (sz == PUD_SIZE) 5492 /* must be pud huge, non-present or none */ 5493 return (pte_t *)pud; 5494 if (!pud_present(*pud)) 5495 return NULL; 5496 /* must have a valid entry and size to go further */ 5497 5498 pmd = pmd_offset(pud, addr); 5499 /* must be pmd huge, non-present or none */ 5500 return (pte_t *)pmd; 5501 } 5502 5503 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 5504 5505 /* 5506 * These functions are overwritable if your architecture needs its own 5507 * behavior. 
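 *
 * An architecture overrides one of them simply by supplying a non-weak
 * definition with the same prototype, e.g. (sketch):
 *
 *	struct page *follow_huge_pd(struct vm_area_struct *vma,
 *				    unsigned long address, hugepd_t hpd,
 *				    int flags, int pdshift)
 *	{
 *		... walk the architecture's hugepage directory format ...
 *	}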
5508 */ 5509 struct page * __weak 5510 follow_huge_addr(struct mm_struct *mm, unsigned long address, 5511 int write) 5512 { 5513 return ERR_PTR(-EINVAL); 5514 } 5515 5516 struct page * __weak 5517 follow_huge_pd(struct vm_area_struct *vma, 5518 unsigned long address, hugepd_t hpd, int flags, int pdshift) 5519 { 5520 WARN(1, "hugepd follow called with no support for hugepage directory format\n"); 5521 return NULL; 5522 } 5523 5524 struct page * __weak 5525 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 5526 pmd_t *pmd, int flags) 5527 { 5528 struct page *page = NULL; 5529 spinlock_t *ptl; 5530 pte_t pte; 5531 5532 /* FOLL_GET and FOLL_PIN are mutually exclusive. */ 5533 if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == 5534 (FOLL_PIN | FOLL_GET))) 5535 return NULL; 5536 5537 retry: 5538 ptl = pmd_lockptr(mm, pmd); 5539 spin_lock(ptl); 5540 /* 5541 * make sure that the address range covered by this pmd is not 5542 * unmapped from other threads. 5543 */ 5544 if (!pmd_huge(*pmd)) 5545 goto out; 5546 pte = huge_ptep_get((pte_t *)pmd); 5547 if (pte_present(pte)) { 5548 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 5549 /* 5550 * try_grab_page() should always succeed here, because: a) we 5551 * hold the pmd (ptl) lock, and b) we've just checked that the 5552 * huge pmd (head) page is present in the page tables. The ptl 5553 * prevents the head page and tail pages from being rearranged 5554 * in any way. So this page must be available at this point, 5555 * unless the page refcount overflowed: 5556 */ 5557 if (WARN_ON_ONCE(!try_grab_page(page, flags))) { 5558 page = NULL; 5559 goto out; 5560 } 5561 } else { 5562 if (is_hugetlb_entry_migration(pte)) { 5563 spin_unlock(ptl); 5564 __migration_entry_wait(mm, (pte_t *)pmd, ptl); 5565 goto retry; 5566 } 5567 /* 5568 * hwpoisoned entry is treated as no_page_table in 5569 * follow_page_mask(). 
bool isolate_huge_page(struct page *page, struct list_head *list)
{
	bool ret = true;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	spin_lock(&hugetlb_lock);
	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
		ret = false;
		goto unlock;
	}
	clear_page_huge_active(page);
	list_move_tail(&page->lru, list);
unlock:
	spin_unlock(&hugetlb_lock);
	return ret;
}

void putback_active_hugepage(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHead(page), page);
	spin_lock(&hugetlb_lock);
	set_page_huge_active(page);
	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
	spin_unlock(&hugetlb_lock);
	put_page(page);
}

void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
{
	struct hstate *h = page_hstate(oldpage);

	hugetlb_cgroup_migrate(oldpage, newpage);
	set_page_owner_migrate_reason(newpage, reason);

	/*
	 * Transfer the temporary state to the new huge page. This is the
	 * reverse of other transitions because the new page is going to
	 * be final while the old one will be freed, so the new page takes
	 * over the temporary status.
	 *
	 * Also note that we have to transfer the per-node surplus state
	 * here as well, otherwise the global surplus count will not match
	 * the per-node counts.
	 */
	if (PageHugeTemporary(newpage)) {
		int old_nid = page_to_nid(oldpage);
		int new_nid = page_to_nid(newpage);

		SetPageHugeTemporary(oldpage);
		ClearPageHugeTemporary(newpage);

		spin_lock(&hugetlb_lock);
		if (h->surplus_huge_pages_node[old_nid]) {
			h->surplus_huge_pages_node[old_nid]--;
			h->surplus_huge_pages_node[new_nid]++;
		}
		spin_unlock(&hugetlb_lock);
	}
}

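/*
 * Illustrative sketch (editor's addition): the expected pairing of
 * isolate_huge_page() and putback_active_hugepage() around hugepage
 * migration.  A caller first isolates the page onto a private list and,
 * if migration is aborted or fails, puts it back on its hstate's active
 * list.  The helper below is hypothetical and only demonstrates the
 * protocol; real callers live in places such as mm/migrate.c.
 */
static void __maybe_unused example_isolate_then_putback(struct page *hpage)
{
	LIST_HEAD(pagelist);

	/* Fails if the page is not an active hugepage or is being freed. */
	if (!isolate_huge_page(hpage, &pagelist))
		return;

	/*
	 * A real caller would hand &pagelist to migrate_pages() here.
	 * This sketch simply aborts, so the page goes straight back to
	 * the hugepage active list and the isolation reference is dropped.
	 */
	putback_active_hugepage(hpage);
}
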
#ifdef CONFIG_CMA
static unsigned long hugetlb_cma_size __initdata;
static bool cma_reserve_called __initdata;

static int __init cmdline_parse_hugetlb_cma(char *p)
{
	hugetlb_cma_size = memparse(p, &p);
	return 0;
}

early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);

void __init hugetlb_cma_reserve(int order)
{
	unsigned long size, reserved, per_node;
	int nid;

	cma_reserve_called = true;

	if (!hugetlb_cma_size)
		return;

	if (hugetlb_cma_size < (PAGE_SIZE << order)) {
		pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
			(PAGE_SIZE << order) / SZ_1M);
		return;
	}

	/*
	 * If a 3 GB area is requested on a machine with 4 NUMA nodes,
	 * allocate 1 GB on the first three nodes and ignore the last one.
	 */
	per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
	pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
		hugetlb_cma_size / SZ_1M, per_node / SZ_1M);

	reserved = 0;
	for_each_node_state(nid, N_ONLINE) {
		int res;

		size = min(per_node, hugetlb_cma_size - reserved);
		size = round_up(size, PAGE_SIZE << order);

		res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
						 0, false, "hugetlb",
						 &hugetlb_cma[nid], nid);
		if (res) {
			pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n",
				res, nid);
			continue;
		}

		reserved += size;
		pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
			size / SZ_1M, nid);

		if (reserved >= hugetlb_cma_size)
			break;
	}
}

void __init hugetlb_cma_check(void)
{
	if (!hugetlb_cma_size || cma_reserve_called)
		return;

	pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
}

#endif /* CONFIG_CMA */
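
/*
 * Worked example (editor's addition) of the sizing logic in
 * hugetlb_cma_reserve(), assuming "hugetlb_cma=3G" on a 4-node machine
 * with 1 GiB gigantic pages (PAGE_SIZE << order == 1 GiB):
 *
 *   per_node = DIV_ROUND_UP(3 GiB, 4) = 768 MiB
 *   node 0: size = min(768 MiB, 3 GiB - 0)     -> rounds up to 1 GiB, reserved = 1 GiB
 *   node 1: size = min(768 MiB, 3 GiB - 1 GiB) -> rounds up to 1 GiB, reserved = 2 GiB
 *   node 2: size = min(768 MiB, 3 GiB - 2 GiB) -> rounds up to 1 GiB, reserved = 3 GiB
 *   node 3: skipped, since reserved >= hugetlb_cma_size after node 2
 *
 * which matches the "1 GB on the first three nodes" behaviour described
 * in the comment inside hugetlb_cma_reserve().
 */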