// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>

#include <asm/page.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
#endif
static unsigned long hugetlb_cma_size __initdata;

/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool */
	if (free) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;

	return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock(&spool->lock);
	return ret;
}
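/*
 * Illustrative sketch (not part of the original source): a worked example of
 * how hugepage_subpool_get_pages() above and hugepage_subpool_put_pages()
 * below are meant to be paired by a caller.  The numbers are hypothetical and
 * only demonstrate the min/max accounting arithmetic.
 *
 *	Subpool created with max_hpages = 10, min_hpages = 4, so rsv_hpages
 *	starts at 4 and 4 pages are already charged against the global pool.
 *
 *	hugepage_subpool_get_pages(spool, 6)
 *		used_hpages: 0 -> 6 (within the maximum of 10)
 *		rsv_hpages:  4 -> 0
 *		returns 6 - 4 = 2, i.e. only 2 more pages must be charged
 *		against the global pool.
 *
 *	hugepage_subpool_put_pages(spool, 6)
 *		used_hpages: 6 -> 0, rsv_hpages: 0 -> 4 (capped at min_hpages)
 *		returns 2, i.e. 2 global reservations may be dropped; the
 *		other 4 stay reserved on behalf of the subpool minimum.
 */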
/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return delta;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
	 * quota reference, free it now.
	 */
	unlock_or_release_subpool(spool);

	return ret;
}

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

/* Helper that removes a struct file_region from the resv_map cache and returns
 * it for use.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
	struct file_region *nrg = NULL;

	VM_BUG_ON(resv->region_cache_count <= 0);

	resv->region_cache_count--;
	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
	VM_BUG_ON(!nrg);
	list_del(&nrg->link);

	nrg->from = from;
	nrg->to = to;

	return nrg;
}

static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
					      struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	nrg->reservation_counter = rg->reservation_counter;
	nrg->css = rg->css;
	if (rg->css)
		css_get(rg->css);
#endif
}

/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
						struct hstate *h,
						struct resv_map *resv,
						struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg) {
		nrg->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		nrg->css = &h_cg->css;
		if (!resv->pages_per_hpage)
			resv->pages_per_hpage = pages_per_huge_page(h);
		/* pages_per_hpage should be the same for all entries in
		 * a resv_map.
		 */
		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
	} else {
		nrg->reservation_counter = NULL;
		nrg->css = NULL;
	}
#endif
}

static bool has_same_uncharge_info(struct file_region *rg,
				   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
	return rg && org &&
	       rg->reservation_counter == org->reservation_counter &&
	       rg->css == org->css;

#else
	return true;
#endif
}

static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
	struct file_region *nrg = NULL, *prg = NULL;

	prg = list_prev_entry(rg, link);
	if (&prg->link != &resv->regions && prg->to == rg->from &&
	    has_same_uncharge_info(prg, rg)) {
		prg->to = rg->to;

		list_del(&rg->link);
		kfree(rg);

		coalesce_file_region(resv, prg);
		return;
	}

	nrg = list_next_entry(rg, link);
	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
	    has_same_uncharge_info(nrg, rg)) {
		nrg->from = rg->from;

		list_del(&rg->link);
		kfree(rg);

		coalesce_file_region(resv, nrg);
		return;
	}
}

/* Must be called with resv->lock held. Calling this with count_only == true
 * will count the number of pages to be added but will not modify the linked
 * list. If regions_needed != NULL and count_only == true, then regions_needed
 * will indicate the number of file_regions needed in the cache to carry out
 * adding the regions for this range.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
				     struct hugetlb_cgroup *h_cg,
				     struct hstate *h, long *regions_needed,
				     bool count_only)
{
	long add = 0;
	struct list_head *head = &resv->regions;
	long last_accounted_offset = f;
	struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;

	if (regions_needed)
		*regions_needed = 0;

	/* In this loop, we essentially handle an entry for the range
	 * [last_accounted_offset, rg->from), at every iteration, with some
	 * bounds checking.
	 */
	list_for_each_entry_safe(rg, trg, head, link) {
		/* Skip irrelevant regions that start before our range. */
		if (rg->from < f) {
			/* If this region ends after the last accounted offset,
			 * then we need to update last_accounted_offset.
			 */
			if (rg->to > last_accounted_offset)
				last_accounted_offset = rg->to;
			continue;
		}

		/* When we find a region that starts beyond our range, we've
		 * finished.
		 */
		if (rg->from > t)
			break;

		/* Add an entry for last_accounted_offset -> rg->from, and
		 * update last_accounted_offset.
		 */
		if (rg->from > last_accounted_offset) {
			add += rg->from - last_accounted_offset;
			if (!count_only) {
				nrg = get_file_region_entry_from_cache(
					resv, last_accounted_offset, rg->from);
				record_hugetlb_cgroup_uncharge_info(h_cg, h,
								    resv, nrg);
				list_add(&nrg->link, rg->link.prev);
				coalesce_file_region(resv, nrg);
			} else if (regions_needed)
				*regions_needed += 1;
		}

		last_accounted_offset = rg->to;
	}

	/* Handle the case where our range extends beyond
	 * last_accounted_offset.
	 */
	if (last_accounted_offset < t) {
		add += t - last_accounted_offset;
		if (!count_only) {
			nrg = get_file_region_entry_from_cache(
				resv, last_accounted_offset, t);
			record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
			list_add(&nrg->link, rg->link.prev);
			coalesce_file_region(resv, nrg);
		} else if (regions_needed)
			*regions_needed += 1;
	}

	VM_BUG_ON(add < 0);
	return add;
}

/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
 */
static int allocate_file_region_entries(struct resv_map *resv,
					int regions_needed)
	__must_hold(&resv->lock)
{
	struct list_head allocated_regions;
	int to_allocate = 0, i = 0;
	struct file_region *trg = NULL, *rg = NULL;

	VM_BUG_ON(regions_needed < 0);

	INIT_LIST_HEAD(&allocated_regions);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations plus regions_needed.
	 *
	 * This is a while loop because when we drop the lock, some other call
	 * to region_add or region_del may have consumed some region_entries,
	 * so we keep looping here until we finally have enough entries for
	 * (adds_in_progress + regions_needed).
	 */
	while (resv->region_cache_count <
	       (resv->adds_in_progress + regions_needed)) {
		to_allocate = resv->adds_in_progress + regions_needed -
			      resv->region_cache_count;

		/* At this point, we should have enough entries in the cache
		 * for all the existing adds_in_progress. We should only be
		 * needing to allocate for regions_needed.
		 */
		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

		spin_unlock(&resv->lock);
		for (i = 0; i < to_allocate; i++) {
			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
			if (!trg)
				goto out_of_memory;
			list_add(&trg->link, &allocated_regions);
		}

		spin_lock(&resv->lock);

		list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
			list_del(&rg->link);
			list_add(&rg->link, &resv->region_cache);
			resv->region_cache_count++;
		}
	}

	return 0;

out_of_memory:
	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
		list_del(&rg->link);
		kfree(rg);
	}
	return -ENOMEM;
}

/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del.  The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is
 * greater than or equal to zero.  If file_region entries needed to be
 * allocated for this operation and we were not able to allocate, it returns
 * -ENOMEM.  region_add of regions of length 1 never allocates file_regions
 * and cannot fail; region_chg will always allocate at least 1 entry and a
 * region_add for 1 page will only require at most 1 entry.
 */
static long region_add(struct resv_map *resv, long f, long t,
		       long in_regions_needed, struct hstate *h,
		       struct hugetlb_cgroup *h_cg)
{
	long add = 0, actual_regions_needed = 0;

	spin_lock(&resv->lock);
retry:

	/* Count how many regions are actually needed to execute this add. */
	add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
				 true);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * this add operation. Note that actual_regions_needed may be greater
	 * than in_regions_needed, as the resv_map may have been modified since
	 * the region_chg call. In this case, we need to make sure that we
	 * allocate extra entries, such that we have enough for all the
	 * existing adds_in_progress, plus the excess needed for this
	 * operation.
	 */
	if (actual_regions_needed > in_regions_needed &&
	    resv->region_cache_count <
		    resv->adds_in_progress +
			    (actual_regions_needed - in_regions_needed)) {
		/* region_add operation of range 1 should never need to
		 * allocate file_region entries.
		 */
		VM_BUG_ON(t - f <= 1);

		if (allocate_file_region_entries(
			    resv, actual_regions_needed - in_regions_needed)) {
			return -ENOMEM;
		}

		goto retry;
	}

	add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);

	resv->adds_in_progress -= in_regions_needed;

	spin_unlock(&resv->lock);
	VM_BUG_ON(add < 0);
	return add;
}

/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  A number of new file_region structures are added to the cache as
 * placeholders, for the subsequent region_add call to use.  At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress.  This value needs to be provided to a follow up call
 * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater or equal to
 * zero.  -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
		       long *out_regions_needed)
{
	long chg = 0;

	spin_lock(&resv->lock);

	/* Count how many hugepages in this range are NOT represented. */
	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
				       out_regions_needed, true);

	if (*out_regions_needed == 0)
		*out_regions_needed = 1;

	if (allocate_file_region_entries(resv, *out_regions_needed))
		return -ENOMEM;

	resv->adds_in_progress += *out_regions_needed;

	spin_unlock(&resv->lock);
	return chg;
}
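/*
 * Illustrative sketch (not part of the original source): the expected calling
 * protocol for region_chg(), region_add() and region_abort() (the latter is
 * defined below), as used by the reservation code later in this file.  Locking
 * and error handling of the surrounding code are omitted, and the condition
 * name is hypothetical.
 *
 *	long chg, regions_needed, add;
 *
 *	chg = region_chg(resv, from, to, &regions_needed);
 *	if (chg < 0)
 *		return -ENOMEM;		// cache entry allocation failed
 *
 *	// ... charge subpool / cgroup based on 'chg' ...
 *
 *	if (some_later_step_failed) {
 *		// undo the adds_in_progress bookkeeping
 *		region_abort(resv, from, to, regions_needed);
 *	} else {
 *		add = region_add(resv, from, to, regions_needed, h, h_cg);
 *		// 'add' may be less than 'chg' if a racing task already
 *		// added part of [from, to) to the reserve map.
 *	}
 */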
/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter.  regions_needed
 * is the value returned by the region_chg call; it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
			 long regions_needed)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress -= regions_needed;
	spin_unlock(&resv->lock);
}

/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	struct file_region *nrg = NULL;
	long del = 0;

retry:
	spin_lock(&resv->lock);
	list_for_each_entry_safe(rg, trg, head, link) {
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
			continue;

		if (rg->from >= t)
			break;

		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
						       struct file_region,
						       link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}

			if (!nrg) {
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;

			copy_hugetlb_cgroup_uncharge_info(nrg, rg);

			INIT_LIST_HEAD(&nrg->link);

			/* Original entry is trimmed */
			rg->to = f;

			hugetlb_cgroup_uncharge_file_region(
				resv, rg, nrg->to - nrg->from);

			list_add(&nrg->link, &rg->link);
			nrg = NULL;
			break;
		}

		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
			del += rg->to - rg->from;
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    rg->to - rg->from);
			list_del(&rg->link);
			kfree(rg);
			continue;
		}

		if (f <= rg->from) {		/* Trim beginning of region */
			del += t - rg->from;
			rg->from = t;

			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    t - rg->from);
		} else {			/* Trim end of region */
			del += rg->to - f;
			rg->to = f;

			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    rg->to - f);
		}
	}

	spin_unlock(&resv->lock);
	kfree(nrg);
	return del;
}

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was free'ed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;

	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
	if (rsv_adjust) {
		struct hstate *h = hstate_inode(inode);

		hugetlb_acct_memory(h, 1);
	}
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}
	spin_unlock(&resv->lock);

	return chg;
}

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}

pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);

/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as used by the page table entries.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->pagesize)
		return vma->vm_ops->pagesize(vma);
	return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have its future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held.  It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file, and this region map represents the backing file
 * pages which have ever had a reservation assigned; it persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}

static void
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
					  struct hugetlb_cgroup *h_cg,
					  struct hstate *h)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (!h_cg || !h) {
		resv_map->reservation_counter = NULL;
		resv_map->pages_per_hpage = 0;
		resv_map->css = NULL;
	} else {
		resv_map->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		resv_map->pages_per_hpage = pages_per_huge_page(h);
		resv_map->css = &h_cg->css;
	}
#endif
}

struct resv_map *resv_map_alloc(void)
{
	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);

	if (!resv_map || !rg) {
		kfree(resv_map);
		kfree(rg);
		return NULL;
	}

	kref_init(&resv_map->refs);
	spin_lock_init(&resv_map->lock);
	INIT_LIST_HEAD(&resv_map->regions);

	resv_map->adds_in_progress = 0;
	/*
	 * Initialize these to 0. On shared mappings, 0's here indicate these
	 * fields don't do cgroup accounting. On private mappings, these will be
	 * re-initialized to the proper values, to indicate that hugetlb cgroup
	 * reservations are to be un-charged from here.
	 */
	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);

	INIT_LIST_HEAD(&resv_map->region_cache);
	list_add(&rg->link, &resv_map->region_cache);
	resv_map->region_cache_count = 1;

	return resv_map;
}

void resv_map_release(struct kref *ref)
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
	struct list_head *head = &resv_map->region_cache;
	struct file_region *rg, *trg;

	/* Clear out any active regions before we release the map. */
	region_del(resv_map, 0, LONG_MAX);

	/* ... and any entries left in the cache */
	list_for_each_entry_safe(rg, trg, head, link) {
		list_del(&rg->link);
		kfree(rg);
	}

	VM_BUG_ON(resv_map->adds_in_progress);

	kfree(resv_map);
}
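/*
 * Illustrative sketch (not part of the original source): the typical life
 * cycle of a reserve map for a private mapping.  Surrounding setup and error
 * handling are omitted; resv_map_release() is invoked through the embedded
 * kref rather than called directly.
 *
 *	struct resv_map *resv = resv_map_alloc();
 *
 *	if (!resv)
 *		return -ENOMEM;
 *	// stash the map in vma->vm_private_data via set_vma_resv_map() and
 *	// mark the owner with set_vma_resv_flags(vma, HPAGE_RESV_OWNER)
 *	// (both helpers are defined below)
 *	...
 *	// when the last user goes away:
 *	kref_put(&resv->refs, resv_map_release);
 */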
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	/*
	 * At inode evict time, i_mapping may not point to the original
	 * address space within the inode.  This original address space
	 * contains the pointer to the resv_map.  So, always use the
	 * address space embedded within the inode.
	 * The VERY common case is inode->mapping == &inode->i_data but,
	 * this may not be true for device special inodes.
	 */
	return (struct resv_map *)(&inode->i_data)->private_data;
}

static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (vma->vm_flags & VM_MAYSHARE) {
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct inode *inode = mapping->host;

		return inode_resv_map(inode);

	} else {
		return (struct resv_map *)(get_vma_private_data(vma) &
							~HPAGE_RESV_MASK);
	}
}

static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, (get_vma_private_data(vma) &
				HPAGE_RESV_MASK) | (unsigned long)map);
}

static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}

static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

	return (get_vma_private_data(vma) & flag) != 0;
}

/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (!(vma->vm_flags & VM_MAYSHARE))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
	if (vma->vm_flags & VM_NORESERVE) {
		/*
		 * This address is already reserved by another process
		 * (chg == 0), so we should decrement the reserved count.
		 * Without decrementing, the reserve count remains after
		 * releasing the inode, because the allocated page will go
		 * into the page cache and is regarded as coming from the
		 * reserved pool when it is released.  Currently, we don't
		 * have any other solution to deal with this situation
		 * properly, so add a work-around here.
		 */
		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
			return true;
		else
			return false;
	}

	/* Shared mappings always use reserves */
	if (vma->vm_flags & VM_MAYSHARE) {
		/*
		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
		 * be a region map for all pages.  The only situation where
		 * there is no region map is if a hole was punched via
		 * fallocate.  In this case, there really are no reserves to
		 * use.  This situation is indicated if chg != 0.
		 */
		if (chg)
			return false;
		else
			return true;
	}

	/*
	 * Only the process that called mmap() has reserves for
	 * private mappings.
	 */
	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/*
		 * Like the shared case above, a hole punch or truncate
		 * could have been performed on the private mapping.
		 * Examine the value of chg to determine if reserves
		 * actually exist or were previously consumed.
		 * Very Subtle - The value of chg comes from a previous
		 * call to vma_needs_reserves().  The reserve map for
		 * private mappings has different (opposite) semantics
		 * than that of shared mappings.  vma_needs_reserves()
		 * has already taken this difference in semantics into
		 * account.  Therefore, the meaning of chg is the same
		 * as in the shared case above.  Code could easily be
		 * combined, but keeping it separate draws attention to
		 * subtle differences.
		 */
		if (chg)
			return false;
		else
			return true;
	}

	return false;
}

static void enqueue_huge_page(struct hstate *h, struct page *page)
{
	int nid = page_to_nid(page);
	list_move(&page->lru, &h->hugepage_freelists[nid]);
	h->free_huge_pages++;
	h->free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
	struct page *page;

	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
		if (!PageHWPoison(page))
			break;
	/*
	 * if 'non-isolated free hugepage' not found on the list,
	 * the allocation fails.
	 */
	if (&h->hugepage_freelists[nid] == &page->lru)
		return NULL;
	list_move(&page->lru, &h->hugepage_activelist);
	set_page_refcounted(page);
	h->free_huge_pages--;
	h->free_huge_pages_node[nid]--;
	return page;
}

static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
		nodemask_t *nmask)
{
	unsigned int cpuset_mems_cookie;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;
	int node = NUMA_NO_NODE;

	zonelist = node_zonelist(nid, gfp_mask);

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
		struct page *page;

		if (!cpuset_zone_allowed(zone, gfp_mask))
			continue;
		/*
		 * no need to ask again on the same node. Pool is node rather than
		 * zone aware
		 */
		if (zone_to_nid(zone) == node)
			continue;
		node = zone_to_nid(zone);

		page = dequeue_huge_page_node_exact(h, node);
		if (page)
			return page;
	}
	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;

	return NULL;
}

/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
	if (hugepage_movable_supported(h))
		return GFP_HIGHUSER_MOVABLE;
	else
		return GFP_HIGHUSER;
}

static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve,
				long chg)
{
	struct page *page;
	struct mempolicy *mpol;
	gfp_t gfp_mask;
	nodemask_t *nodemask;
	int nid;

	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves.  This check ensures that reservations are
	 * not "stolen".  The child may still get SIGKILLed.
	 */
	if (!vma_has_reserves(vma, chg) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	gfp_mask = htlb_alloc_mask(h);
	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
		SetPagePrivate(page);
		h->resv_huge_pages--;
	}

	mpol_cond_put(mpol);
	return page;

err:
	return NULL;
}

/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node_in(nid, *nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}

/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
static int hstate_next_node_to_alloc(struct hstate *h,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);

	return nid;
}

/*
 * helper for free_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);

	return nid;
}

#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
		nr_nodes--)

#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
		nr_nodes--)

#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
					unsigned int order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	atomic_set(compound_mapcount_ptr(page), 0);
	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);

	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		clear_compound_head(p);
		set_page_refcounted(p);
	}

	set_compound_order(page, 0);
	__ClearPageHead(page);
}

static void free_gigantic_page(struct page *page, unsigned int order)
{
	/*
	 * If the page isn't allocated using the cma allocator,
	 * cma_release() returns false.
	 */
#ifdef CONFIG_CMA
	if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
		return;
#endif

	free_contig_range(page_to_pfn(page), 1 << order);
}

#ifdef CONFIG_CONTIG_ALLOC
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
		int nid, nodemask_t *nodemask)
{
	unsigned long nr_pages = 1UL << huge_page_order(h);

#ifdef CONFIG_CMA
	{
		struct page *page;
		int node;

		for_each_node_mask(node, *nodemask) {
			if (!hugetlb_cma[node])
				continue;

			page = cma_alloc(hugetlb_cma[node], nr_pages,
					 huge_page_order(h), true);
			if (page)
				return page;
		}
	}
#endif

	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
					int nid, nodemask_t *nodemask)
{
	return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */

#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
					int nid, nodemask_t *nodemask)
{
	return NULL;
}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned int order) { }
#endif

static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
		return;

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_private |
				1 << PG_writeback);
	}
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
	set_page_refcounted(page);
	if (hstate_is_gigantic(h)) {
		/*
		 * Temporarily drop the hugetlb_lock, because
		 * we might block in free_gigantic_page().
		 */
		spin_unlock(&hugetlb_lock);
		destroy_compound_gigantic_page(page, huge_page_order(h));
		free_gigantic_page(page, huge_page_order(h));
		spin_lock(&hugetlb_lock);
	} else {
		__free_pages(page, huge_page_order(h));
	}
}

struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}

/*
 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
 * to hstate->hugepage_activelist.)
 *
 * This function can be called for tail pages, but never returns true for them.
 */
bool page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHuge(page), page);
	return PageHead(page) && PagePrivate(&page[1]);
}

/* never called for tail page */
static void set_page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	SetPagePrivate(&page[1]);
}

static void clear_page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	ClearPagePrivate(&page[1]);
}

/*
 * Internal hugetlb specific page flag. Do not use outside of the hugetlb
 * code.
 */
static inline bool PageHugeTemporary(struct page *page)
{
	if (!PageHuge(page))
		return false;

	return (unsigned long)page[2].mapping == -1U;
}

static inline void SetPageHugeTemporary(struct page *page)
{
	page[2].mapping = (void *)-1U;
}

static inline void ClearPageHugeTemporary(struct page *page)
{
	page[2].mapping = NULL;
}

static void __free_huge_page(struct page *page)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * compound page destructor.
	 */
	struct hstate *h = page_hstate(page);
	int nid = page_to_nid(page);
	struct hugepage_subpool *spool =
		(struct hugepage_subpool *)page_private(page);
	bool restore_reserve;

	VM_BUG_ON_PAGE(page_count(page), page);
	VM_BUG_ON_PAGE(page_mapcount(page), page);

	set_page_private(page, 0);
	page->mapping = NULL;
	restore_reserve = PagePrivate(page);
	ClearPagePrivate(page);

	/*
	 * If PagePrivate() was set on page, page allocation consumed a
	 * reservation.  If the page was associated with a subpool, there
	 * would have been a page reserved in the subpool before allocation
	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
	 * reservation, do not call hugepage_subpool_put_pages() as this will
	 * remove the reserved page from the subpool.
	 */
	if (!restore_reserve) {
		/*
		 * A return code of zero implies that the subpool will be
		 * under its minimum size if the reservation is not restored
		 * after page is free.  Therefore, force restore_reserve
		 * operation.
		 */
		if (hugepage_subpool_put_pages(spool, 1) == 0)
			restore_reserve = true;
	}

	spin_lock(&hugetlb_lock);
	clear_page_huge_active(page);
	hugetlb_cgroup_uncharge_page(hstate_index(h),
				     pages_per_huge_page(h), page);
	hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
					  pages_per_huge_page(h), page);
	if (restore_reserve)
		h->resv_huge_pages++;

	if (PageHugeTemporary(page)) {
		list_del(&page->lru);
		ClearPageHugeTemporary(page);
		update_and_free_page(h, page);
	} else if (h->surplus_huge_pages_node[nid]) {
		/* remove the page from active list */
		list_del(&page->lru);
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	} else {
		arch_clear_hugepage_flags(page);
		enqueue_huge_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
}

/*
 * As free_huge_page() can be called from a non-task context, we have
 * to defer the actual freeing in a workqueue to prevent potential
 * hugetlb_lock deadlock.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to
 * be freed and frees them one-by-one.  As the page->mapping pointer is
 * going to be cleared in __free_huge_page() anyway, it is reused as the
 * llist_node structure of a lockless linked list of huge pages to be freed.
 */
static LLIST_HEAD(hpage_freelist);

static void free_hpage_workfn(struct work_struct *work)
{
	struct llist_node *node;
	struct page *page;

	node = llist_del_all(&hpage_freelist);

	while (node) {
		page = container_of((struct address_space **)node,
				     struct page, mapping);
		node = node->next;
		__free_huge_page(page);
	}
}
static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

void free_huge_page(struct page *page)
{
	/*
	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
	 */
	if (!in_task()) {
		/*
		 * Only call schedule_work() if hpage_freelist was previously
		 * empty. Otherwise, schedule_work() had been called but the
		 * workfn hasn't retrieved the list yet.
		 */
		if (llist_add((struct llist_node *)&page->mapping,
			      &hpage_freelist))
			schedule_work(&free_hpage_work);
		return;
	}

	__free_huge_page(page);
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	INIT_LIST_HEAD(&page->lru);
	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
	spin_lock(&hugetlb_lock);
	set_hugetlb_cgroup(page, NULL);
	set_hugetlb_cgroup_rsvd(page, NULL);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
	spin_unlock(&hugetlb_lock);
}

static void prep_compound_gigantic_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	/* we rely on prep_new_huge_page to set the destructor */
	set_compound_order(page, order);
	__ClearPageReserved(page);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		/*
		 * For gigantic hugepages allocated through bootmem at
		 * boot, it's safer to be consistent with the not-gigantic
		 * hugepages and clear the PG_reserved bit from all tail pages
		 * too.  Otherwise drivers using get_user_pages() to access tail
		 * pages may get the reference counting wrong if they see
		 * PG_reserved set on a tail page (despite the head page not
		 * having PG_reserved set).  Enforcing this consistency between
		 * head and tail pages allows drivers to optimize away a check
		 * on the head page when they need to know if put_page() is
		 * needed after get_user_pages().
		 */
		__ClearPageReserved(p);
		set_page_count(p, 0);
		set_compound_head(p, page);
	}
	atomic_set(compound_mapcount_ptr(page), -1);

	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);
}

/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
int PageHuge(struct page *page)
{
	if (!PageCompound(page))
		return 0;

	page = compound_head(page);
	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);
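/*
 * Illustrative sketch (not part of the original source): the difference
 * between PageHuge() above and PageHeadHuge() below, for a hugetlb page made
 * up of head page 'head' and tail page 'tail' (i.e. head + 1):
 *
 *	PageHuge(head)     == 1	// resolves to the compound head first
 *	PageHuge(tail)     == 1
 *	PageHeadHuge(head) == 1
 *	PageHeadHuge(tail) == 0	// tail pages are never head pages
 */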
/*
 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
 * normal or transparent huge pages.
 */
int PageHeadHuge(struct page *page_head)
{
	if (!PageHead(page_head))
		return 0;

	return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
}

/*
 * Find address_space associated with hugetlbfs page.
 * Upon entry page is locked and page 'was' mapped although mapped state
 * could change.  If necessary, use anon_vma to find vma and associated
 * address space.  The returned mapping may be stale, but it can not be
 * invalid as page lock (which is held) is required to destroy mapping.
 */
static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
{
	struct anon_vma *anon_vma;
	pgoff_t pgoff_start, pgoff_end;
	struct anon_vma_chain *avc;
	struct address_space *mapping = page_mapping(hpage);

	/* Simple file based mapping */
	if (mapping)
		return mapping;

	/*
	 * Even anonymous hugetlbfs mappings are associated with an
	 * underlying hugetlbfs file (see hugetlb_file_setup in mmap
	 * code).  Find a vma associated with the anonymous vma, and
	 * use the file pointer to get address_space.
	 */
	anon_vma = page_lock_anon_vma_read(hpage);
	if (!anon_vma)
		return mapping;  /* NULL */

	/* Use first found vma */
	pgoff_start = page_to_pgoff(hpage);
	pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
					pgoff_start, pgoff_end) {
		struct vm_area_struct *vma = avc->vma;

		mapping = vma->vm_file->f_mapping;
		break;
	}

	anon_vma_unlock_read(anon_vma);
	return mapping;
}

/*
 * Find and lock address space (mapping) in write mode.
 *
 * Upon entry, the page is locked which allows us to find the mapping
 * even in the case of an anon page.  However, locking order dictates
 * the i_mmap_rwsem be acquired BEFORE the page lock.  This is hugetlbfs
 * specific.  So, we first try to lock the sema while still holding the
 * page lock.  If this works, great!  If not, then we need to drop the
 * page lock and then acquire i_mmap_rwsem and reacquire page lock.  Of
 * course, need to revalidate state along the way.
 */
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
{
	struct address_space *mapping, *mapping2;

	mapping = _get_hugetlb_page_mapping(hpage);
retry:
	if (!mapping)
		return mapping;

	/*
	 * If no contention, take lock and return
	 */
	if (i_mmap_trylock_write(mapping))
		return mapping;

	/*
	 * Must drop page lock and wait on mapping sema.
	 * Note: Once page lock is dropped, mapping could become invalid.
	 * As a hack, increase map count until we lock page again.
	 */
	atomic_inc(&hpage->_mapcount);
	unlock_page(hpage);
	i_mmap_lock_write(mapping);
	lock_page(hpage);
	atomic_add_negative(-1, &hpage->_mapcount);

	/* verify page is still mapped */
	if (!page_mapped(hpage)) {
		i_mmap_unlock_write(mapping);
		return NULL;
	}

	/*
	 * Get address space again and verify it is the same one
	 * we locked.  If not, drop lock and retry.
	 */
	mapping2 = _get_hugetlb_page_mapping(hpage);
	if (mapping2 != mapping) {
		i_mmap_unlock_write(mapping);
		mapping = mapping2;
		goto retry;
	}

	return mapping;
}

pgoff_t __basepage_index(struct page *page)
{
	struct page *page_head = compound_head(page);
	pgoff_t index = page_index(page_head);
	unsigned long compound_idx;

	if (!PageHuge(page_head))
		return page_index(page);

	if (compound_order(page_head) >= MAX_ORDER)
		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
	else
		compound_idx = page - page_head;

	return (index << compound_order(page_head)) + compound_idx;
}

static struct page *alloc_buddy_huge_page(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask,
		nodemask_t *node_alloc_noretry)
{
	int order = huge_page_order(h);
	struct page *page;
	bool alloc_try_hard = true;

	/*
	 * By default we always try hard to allocate the page with
	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
	 * a loop (to adjust global huge page counts) and previous allocation
	 * failed, do not continue to try hard on the same node.  Use the
	 * node_alloc_noretry bitmap to manage this state information.
	 */
	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
		alloc_try_hard = false;
	gfp_mask |= __GFP_COMP|__GFP_NOWARN;
	if (alloc_try_hard)
		gfp_mask |= __GFP_RETRY_MAYFAIL;
	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();
	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
	if (page)
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	/*
	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page,
	 * this indicates an overall state change.  Clear bit so that we
	 * resume normal 'try hard' allocations.
	 */
	if (node_alloc_noretry && page && !alloc_try_hard)
		node_clear(nid, *node_alloc_noretry);

	/*
	 * If we tried hard to get a page but failed, set bit so that
	 * subsequent attempts will not try as hard until there is an
	 * overall state change.
	 */
	if (node_alloc_noretry && !page && alloc_try_hard)
		node_set(nid, *node_alloc_noretry);

	return page;
}

/*
 * Common helper to allocate a fresh hugetlb page. All specific allocators
 * should use this function to get new hugetlb pages.
 */
static struct page *alloc_fresh_huge_page(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask,
		nodemask_t *node_alloc_noretry)
{
	struct page *page;

	if (hstate_is_gigantic(h))
		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
	else
		page = alloc_buddy_huge_page(h, gfp_mask,
				nid, nmask, node_alloc_noretry);
	if (!page)
		return NULL;

	if (hstate_is_gigantic(h))
		prep_compound_gigantic_page(page, huge_page_order(h));
	prep_new_huge_page(h, page, page_to_nid(page));

	return page;
}
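/*
 * Illustrative sketch (not part of the original source): how a caller that
 * grows the pool (for example the nr_hugepages writer later in this file)
 * typically drives alloc_pool_huge_page() below with a per-node "no retry"
 * bitmap, so that a node which just failed is not hammered with further
 * __GFP_RETRY_MAYFAIL attempts.  hugetlb_lock handling and the 'target'
 * variable are simplified/hypothetical here.
 *
 *	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
 *
 *	if (node_alloc_noretry)
 *		nodes_clear(*node_alloc_noretry);
 *	while (h->nr_huge_pages < target) {
 *		if (!alloc_pool_huge_page(h, &node_states[N_MEMORY],
 *					  node_alloc_noretry))
 *			break;		// allocation failed on every node
 *		cond_resched();
 *	}
 *	NODEMASK_FREE(node_alloc_noretry);
 */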
/*
 * Allocates a fresh page to the hugetlb allocator pool in a node-interleaved
 * manner.
 */
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
				nodemask_t *node_alloc_noretry)
{
	struct page *page;
	int nr_nodes, node;
	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
						node_alloc_noretry);
		if (page)
			break;
	}

	if (!page)
		return 0;

	put_page(page); /* free it into the hugepage allocator */

	return 1;
}

/*
 * Free huge page from pool from next node to free.
 * Attempt to keep persistent huge pages more or less
 * balanced over allowed nodes.
 * Called with hugetlb_lock locked.
 */
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
							 bool acct_surplus)
{
	int nr_nodes, node;
	int ret = 0;

	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
		/*
		 * If we're returning unused surplus pages, only examine
		 * nodes with surplus pages.
		 */
		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
		    !list_empty(&h->hugepage_freelists[node])) {
			struct page *page =
				list_entry(h->hugepage_freelists[node].next,
					  struct page, lru);
			list_del(&page->lru);
			h->free_huge_pages--;
			h->free_huge_pages_node[node]--;
			if (acct_surplus) {
				h->surplus_huge_pages--;
				h->surplus_huge_pages_node[node]--;
			}
			update_and_free_page(h, page);
			ret = 1;
			break;
		}
	}

	return ret;
}

/*
 * Dissolve a given free hugepage into free buddy pages. This function does
 * nothing for in-use hugepages and non-hugepages.
 * This function returns values like below:
 *
 *  -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
 *          (allocated or reserved.)
 *       0: successfully dissolved free hugepages or the page is not a
 *          hugepage (considered as already dissolved)
 */
int dissolve_free_huge_page(struct page *page)
{
	int rc = -EBUSY;

	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!PageHuge(page))
		return 0;

	spin_lock(&hugetlb_lock);
	if (!PageHuge(page)) {
		rc = 0;
		goto out;
	}

	if (!page_count(page)) {
		struct page *head = compound_head(page);
		struct hstate *h = page_hstate(head);
		int nid = page_to_nid(head);
		if (h->free_huge_pages - h->resv_huge_pages == 0)
			goto out;
		/*
		 * Move PageHWPoison flag from head page to the raw error page,
		 * which makes any subpages rather than the error page reusable.
		 */
		if (PageHWPoison(head) && page != head) {
			SetPageHWPoison(page);
			ClearPageHWPoison(head);
		}
		list_del(&head->lru);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
		h->max_huge_pages--;
		update_and_free_page(h, head);
		rc = 0;
	}
out:
	spin_unlock(&hugetlb_lock);
	return rc;
}

/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_huge_page() returns with an error, all
 * free hugepages that were dissolved before that error are lost.
1882 */ 1883 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 1884 { 1885 unsigned long pfn; 1886 struct page *page; 1887 int rc = 0; 1888 1889 if (!hugepages_supported()) 1890 return rc; 1891 1892 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { 1893 page = pfn_to_page(pfn); 1894 rc = dissolve_free_huge_page(page); 1895 if (rc) 1896 break; 1897 } 1898 1899 return rc; 1900 } 1901 1902 /* 1903 * Allocates a fresh surplus page from the page allocator. 1904 */ 1905 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, 1906 int nid, nodemask_t *nmask) 1907 { 1908 struct page *page = NULL; 1909 1910 if (hstate_is_gigantic(h)) 1911 return NULL; 1912 1913 spin_lock(&hugetlb_lock); 1914 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) 1915 goto out_unlock; 1916 spin_unlock(&hugetlb_lock); 1917 1918 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 1919 if (!page) 1920 return NULL; 1921 1922 spin_lock(&hugetlb_lock); 1923 /* 1924 * We could have raced with the pool size change. 1925 * Double check that and simply deallocate the new page 1926 * if we would end up overcommiting the surpluses. Abuse 1927 * temporary page to workaround the nasty free_huge_page 1928 * codeflow 1929 */ 1930 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 1931 SetPageHugeTemporary(page); 1932 spin_unlock(&hugetlb_lock); 1933 put_page(page); 1934 return NULL; 1935 } else { 1936 h->surplus_huge_pages++; 1937 h->surplus_huge_pages_node[page_to_nid(page)]++; 1938 } 1939 1940 out_unlock: 1941 spin_unlock(&hugetlb_lock); 1942 1943 return page; 1944 } 1945 1946 struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 1947 int nid, nodemask_t *nmask) 1948 { 1949 struct page *page; 1950 1951 if (hstate_is_gigantic(h)) 1952 return NULL; 1953 1954 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 1955 if (!page) 1956 return NULL; 1957 1958 /* 1959 * We do not account these pages as surplus because they are only 1960 * temporary and will be released properly on the last reference 1961 */ 1962 SetPageHugeTemporary(page); 1963 1964 return page; 1965 } 1966 1967 /* 1968 * Use the VMA's mpolicy to allocate a huge page from the buddy. 
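 * The node and nodemask are derived from the VMA's mempolicy via huge_node(),
 * the page itself comes from alloc_surplus_huge_page(), and the mempolicy
 * reference is dropped again with mpol_cond_put().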
1969 */ 1970 static 1971 struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, 1972 struct vm_area_struct *vma, unsigned long addr) 1973 { 1974 struct page *page; 1975 struct mempolicy *mpol; 1976 gfp_t gfp_mask = htlb_alloc_mask(h); 1977 int nid; 1978 nodemask_t *nodemask; 1979 1980 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 1981 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); 1982 mpol_cond_put(mpol); 1983 1984 return page; 1985 } 1986 1987 /* page migration callback function */ 1988 struct page *alloc_huge_page_node(struct hstate *h, int nid) 1989 { 1990 gfp_t gfp_mask = htlb_alloc_mask(h); 1991 struct page *page = NULL; 1992 1993 if (nid != NUMA_NO_NODE) 1994 gfp_mask |= __GFP_THISNODE; 1995 1996 spin_lock(&hugetlb_lock); 1997 if (h->free_huge_pages - h->resv_huge_pages > 0) 1998 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL); 1999 spin_unlock(&hugetlb_lock); 2000 2001 if (!page) 2002 page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); 2003 2004 return page; 2005 } 2006 2007 /* page migration callback function */ 2008 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, 2009 nodemask_t *nmask) 2010 { 2011 gfp_t gfp_mask = htlb_alloc_mask(h); 2012 2013 spin_lock(&hugetlb_lock); 2014 if (h->free_huge_pages - h->resv_huge_pages > 0) { 2015 struct page *page; 2016 2017 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); 2018 if (page) { 2019 spin_unlock(&hugetlb_lock); 2020 return page; 2021 } 2022 } 2023 spin_unlock(&hugetlb_lock); 2024 2025 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); 2026 } 2027 2028 /* mempolicy aware migration callback */ 2029 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, 2030 unsigned long address) 2031 { 2032 struct mempolicy *mpol; 2033 nodemask_t *nodemask; 2034 struct page *page; 2035 gfp_t gfp_mask; 2036 int node; 2037 2038 gfp_mask = htlb_alloc_mask(h); 2039 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 2040 page = alloc_huge_page_nodemask(h, node, nodemask); 2041 mpol_cond_put(mpol); 2042 2043 return page; 2044 } 2045 2046 /* 2047 * Increase the hugetlb pool such that it can accommodate a reservation 2048 * of size 'delta'. 2049 */ 2050 static int gather_surplus_pages(struct hstate *h, int delta) 2051 __must_hold(&hugetlb_lock) 2052 { 2053 struct list_head surplus_list; 2054 struct page *page, *tmp; 2055 int ret, i; 2056 int needed, allocated; 2057 bool alloc_ok = true; 2058 2059 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 2060 if (needed <= 0) { 2061 h->resv_huge_pages += delta; 2062 return 0; 2063 } 2064 2065 allocated = 0; 2066 INIT_LIST_HEAD(&surplus_list); 2067 2068 ret = -ENOMEM; 2069 retry: 2070 spin_unlock(&hugetlb_lock); 2071 for (i = 0; i < needed; i++) { 2072 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), 2073 NUMA_NO_NODE, NULL); 2074 if (!page) { 2075 alloc_ok = false; 2076 break; 2077 } 2078 list_add(&page->lru, &surplus_list); 2079 cond_resched(); 2080 } 2081 allocated += i; 2082 2083 /* 2084 * After retaking hugetlb_lock, we need to recalculate 'needed' 2085 * because either resv_huge_pages or free_huge_pages may have changed. 2086 */ 2087 spin_lock(&hugetlb_lock); 2088 needed = (h->resv_huge_pages + delta) - 2089 (h->free_huge_pages + allocated); 2090 if (needed > 0) { 2091 if (alloc_ok) 2092 goto retry; 2093 /* 2094 * We were not able to allocate enough pages to 2095 * satisfy the entire reservation so we free what 2096 * we've allocated so far. 
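 * Note that ret is still -ENOMEM at this point; the pages collected on
 * surplus_list are handed back to the buddy allocator below, after
 * hugetlb_lock has been dropped.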
2097 */ 2098 goto free; 2099 } 2100 /* 2101 * The surplus_list now contains _at_least_ the number of extra pages 2102 * needed to accommodate the reservation. Add the appropriate number 2103 * of pages to the hugetlb pool and free the extras back to the buddy 2104 * allocator. Commit the entire reservation here to prevent another 2105 * process from stealing the pages as they are added to the pool but 2106 * before they are reserved. 2107 */ 2108 needed += allocated; 2109 h->resv_huge_pages += delta; 2110 ret = 0; 2111 2112 /* Free the needed pages to the hugetlb pool */ 2113 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 2114 if ((--needed) < 0) 2115 break; 2116 /* 2117 * This page is now managed by the hugetlb allocator and has 2118 * no users -- drop the buddy allocator's reference. 2119 */ 2120 put_page_testzero(page); 2121 VM_BUG_ON_PAGE(page_count(page), page); 2122 enqueue_huge_page(h, page); 2123 } 2124 free: 2125 spin_unlock(&hugetlb_lock); 2126 2127 /* Free unnecessary surplus pages to the buddy allocator */ 2128 list_for_each_entry_safe(page, tmp, &surplus_list, lru) 2129 put_page(page); 2130 spin_lock(&hugetlb_lock); 2131 2132 return ret; 2133 } 2134 2135 /* 2136 * This routine has two main purposes: 2137 * 1) Decrement the reservation count (resv_huge_pages) by the value passed 2138 * in unused_resv_pages. This corresponds to the prior adjustments made 2139 * to the associated reservation map. 2140 * 2) Free any unused surplus pages that may have been allocated to satisfy 2141 * the reservation. As many as unused_resv_pages may be freed. 2142 * 2143 * Called with hugetlb_lock held. However, the lock could be dropped (and 2144 * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, 2145 * we must make sure nobody else can claim pages we are in the process of 2146 * freeing. Do this by ensuring resv_huge_page always is greater than the 2147 * number of huge pages we plan to free when dropping the lock. 2148 */ 2149 static void return_unused_surplus_pages(struct hstate *h, 2150 unsigned long unused_resv_pages) 2151 { 2152 unsigned long nr_pages; 2153 2154 /* Cannot return gigantic pages currently */ 2155 if (hstate_is_gigantic(h)) 2156 goto out; 2157 2158 /* 2159 * Part (or even all) of the reservation could have been backed 2160 * by pre-allocated pages. Only free surplus pages. 2161 */ 2162 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 2163 2164 /* 2165 * We want to release as many surplus pages as possible, spread 2166 * evenly across all nodes with memory. Iterate across these nodes 2167 * until we can no longer free unreserved surplus pages. This occurs 2168 * when the nodes with surplus pages have no free pages. 2169 * free_pool_huge_page() will balance the the freed pages across the 2170 * on-line nodes with memory and will handle the hstate accounting. 2171 * 2172 * Note that we decrement resv_huge_pages as we free the pages. If 2173 * we drop the lock, resv_huge_pages will still be sufficiently large 2174 * to cover subsequent pages we may free. 2175 */ 2176 while (nr_pages--) { 2177 h->resv_huge_pages--; 2178 unused_resv_pages--; 2179 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) 2180 goto out; 2181 cond_resched_lock(&hugetlb_lock); 2182 } 2183 2184 out: 2185 /* Fully uncommit the reservation */ 2186 h->resv_huge_pages -= unused_resv_pages; 2187 } 2188 2189 2190 /* 2191 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation 2192 * are used by the huge page allocation routines to manage reservations. 
2193 * 2194 * vma_needs_reservation is called to determine if the huge page at addr 2195 * within the vma has an associated reservation. If a reservation is 2196 * needed, the value 1 is returned. The caller is then responsible for 2197 * managing the global reservation and subpool usage counts. After 2198 * the huge page has been allocated, vma_commit_reservation is called 2199 * to add the page to the reservation map. If the page allocation fails, 2200 * the reservation must be ended instead of committed. vma_end_reservation 2201 * is called in such cases. 2202 * 2203 * In the normal case, vma_commit_reservation returns the same value 2204 * as the preceding vma_needs_reservation call. The only time this 2205 * is not the case is if a reserve map was changed between calls. It 2206 * is the responsibility of the caller to notice the difference and 2207 * take appropriate action. 2208 * 2209 * vma_add_reservation is used in error paths where a reservation must 2210 * be restored when a newly allocated huge page must be freed. It is 2211 * to be called after calling vma_needs_reservation to determine if a 2212 * reservation exists. 2213 */ 2214 enum vma_resv_mode { 2215 VMA_NEEDS_RESV, 2216 VMA_COMMIT_RESV, 2217 VMA_END_RESV, 2218 VMA_ADD_RESV, 2219 }; 2220 static long __vma_reservation_common(struct hstate *h, 2221 struct vm_area_struct *vma, unsigned long addr, 2222 enum vma_resv_mode mode) 2223 { 2224 struct resv_map *resv; 2225 pgoff_t idx; 2226 long ret; 2227 long dummy_out_regions_needed; 2228 2229 resv = vma_resv_map(vma); 2230 if (!resv) 2231 return 1; 2232 2233 idx = vma_hugecache_offset(h, vma, addr); 2234 switch (mode) { 2235 case VMA_NEEDS_RESV: 2236 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 2237 /* We assume that vma_reservation_* routines always operate on 2238 * 1 page, and that adding to resv map a 1 page entry can only 2239 * ever require 1 region. 2240 */ 2241 VM_BUG_ON(dummy_out_regions_needed != 1); 2242 break; 2243 case VMA_COMMIT_RESV: 2244 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2245 /* region_add calls of range 1 should never fail. */ 2246 VM_BUG_ON(ret < 0); 2247 break; 2248 case VMA_END_RESV: 2249 region_abort(resv, idx, idx + 1, 1); 2250 ret = 0; 2251 break; 2252 case VMA_ADD_RESV: 2253 if (vma->vm_flags & VM_MAYSHARE) { 2254 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2255 /* region_add calls of range 1 should never fail. */ 2256 VM_BUG_ON(ret < 0); 2257 } else { 2258 region_abort(resv, idx, idx + 1, 1); 2259 ret = region_del(resv, idx, idx + 1); 2260 } 2261 break; 2262 default: 2263 BUG(); 2264 } 2265 2266 if (vma->vm_flags & VM_MAYSHARE) 2267 return ret; 2268 else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { 2269 /* 2270 * In most cases, reserves always exist for private mappings. 2271 * However, a file associated with mapping could have been 2272 * hole punched or truncated after reserves were consumed. 2273 * As subsequent fault on such a range will not use reserves. 2274 * Subtle - The reserve map for private mappings has the 2275 * opposite meaning than that of shared mappings. If NO 2276 * entry is in the reserve map, it means a reservation exists. 2277 * If an entry exists in the reserve map, it means the 2278 * reservation has already been consumed. As a result, the 2279 * return value of this routine is the opposite of the 2280 * value returned from reserve map manipulation routines above. 2281 */ 2282 if (ret) 2283 return 0; 2284 else 2285 return 1; 2286 } 2287 else 2288 return ret < 0 ? 
ret : 0; 2289 } 2290 2291 static long vma_needs_reservation(struct hstate *h, 2292 struct vm_area_struct *vma, unsigned long addr) 2293 { 2294 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 2295 } 2296 2297 static long vma_commit_reservation(struct hstate *h, 2298 struct vm_area_struct *vma, unsigned long addr) 2299 { 2300 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 2301 } 2302 2303 static void vma_end_reservation(struct hstate *h, 2304 struct vm_area_struct *vma, unsigned long addr) 2305 { 2306 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 2307 } 2308 2309 static long vma_add_reservation(struct hstate *h, 2310 struct vm_area_struct *vma, unsigned long addr) 2311 { 2312 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 2313 } 2314 2315 /* 2316 * This routine is called to restore a reservation on error paths. In the 2317 * specific error paths, a huge page was allocated (via alloc_huge_page) 2318 * and is about to be freed. If a reservation for the page existed, 2319 * alloc_huge_page would have consumed the reservation and set PagePrivate 2320 * in the newly allocated page. When the page is freed via free_huge_page, 2321 * the global reservation count will be incremented if PagePrivate is set. 2322 * However, free_huge_page can not adjust the reserve map. Adjust the 2323 * reserve map here to be consistent with global reserve count adjustments 2324 * to be made by free_huge_page. 2325 */ 2326 static void restore_reserve_on_error(struct hstate *h, 2327 struct vm_area_struct *vma, unsigned long address, 2328 struct page *page) 2329 { 2330 if (unlikely(PagePrivate(page))) { 2331 long rc = vma_needs_reservation(h, vma, address); 2332 2333 if (unlikely(rc < 0)) { 2334 /* 2335 * Rare out of memory condition in reserve map 2336 * manipulation. Clear PagePrivate so that 2337 * global reserve count will not be incremented 2338 * by free_huge_page. This will make it appear 2339 * as though the reservation for this page was 2340 * consumed. This may prevent the task from 2341 * faulting in the page at a later time. This 2342 * is better than inconsistent global huge page 2343 * accounting of reserve counts. 2344 */ 2345 ClearPagePrivate(page); 2346 } else if (rc) { 2347 rc = vma_add_reservation(h, vma, address); 2348 if (unlikely(rc < 0)) 2349 /* 2350 * See above comment about rare out of 2351 * memory condition. 2352 */ 2353 ClearPagePrivate(page); 2354 } else 2355 vma_end_reservation(h, vma, address); 2356 } 2357 } 2358 2359 struct page *alloc_huge_page(struct vm_area_struct *vma, 2360 unsigned long addr, int avoid_reserve) 2361 { 2362 struct hugepage_subpool *spool = subpool_vma(vma); 2363 struct hstate *h = hstate_vma(vma); 2364 struct page *page; 2365 long map_chg, map_commit; 2366 long gbl_chg; 2367 int ret, idx; 2368 struct hugetlb_cgroup *h_cg; 2369 bool deferred_reserve; 2370 2371 idx = hstate_index(h); 2372 /* 2373 * Examine the region/reserve map to determine if the process 2374 * has a reservation for the page to be allocated. A return 2375 * code of zero indicates a reservation exists (no change). 2376 */ 2377 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 2378 if (map_chg < 0) 2379 return ERR_PTR(-ENOMEM); 2380 2381 /* 2382 * Processes that did not create the mapping will have no 2383 * reserves as indicated by the region/reserve map. Check 2384 * that the allocation will not exceed the subpool limit. 2385 * Allocations for MAP_NORESERVE mappings also need to be 2386 * checked against any subpool limit. 
2387 */ 2388 if (map_chg || avoid_reserve) { 2389 gbl_chg = hugepage_subpool_get_pages(spool, 1); 2390 if (gbl_chg < 0) { 2391 vma_end_reservation(h, vma, addr); 2392 return ERR_PTR(-ENOSPC); 2393 } 2394 2395 /* 2396 * Even though there was no reservation in the region/reserve 2397 * map, there could be reservations associated with the 2398 * subpool that can be used. This would be indicated if the 2399 * return value of hugepage_subpool_get_pages() is zero. 2400 * However, if avoid_reserve is specified we still avoid even 2401 * the subpool reservations. 2402 */ 2403 if (avoid_reserve) 2404 gbl_chg = 1; 2405 } 2406 2407 /* If this allocation is not consuming a reservation, charge it now. 2408 */ 2409 deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); 2410 if (deferred_reserve) { 2411 ret = hugetlb_cgroup_charge_cgroup_rsvd( 2412 idx, pages_per_huge_page(h), &h_cg); 2413 if (ret) 2414 goto out_subpool_put; 2415 } 2416 2417 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 2418 if (ret) 2419 goto out_uncharge_cgroup_reservation; 2420 2421 spin_lock(&hugetlb_lock); 2422 /* 2423 * gbl_chg is passed to indicate whether or not a page must be taken 2424 * from the global free pool (global change). gbl_chg == 0 indicates 2425 * a reservation exists for the allocation. 2426 */ 2427 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); 2428 if (!page) { 2429 spin_unlock(&hugetlb_lock); 2430 page = alloc_buddy_huge_page_with_mpol(h, vma, addr); 2431 if (!page) 2432 goto out_uncharge_cgroup; 2433 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 2434 SetPagePrivate(page); 2435 h->resv_huge_pages--; 2436 } 2437 spin_lock(&hugetlb_lock); 2438 list_move(&page->lru, &h->hugepage_activelist); 2439 /* Fall through */ 2440 } 2441 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 2442 /* If allocation is not consuming a reservation, also store the 2443 * hugetlb_cgroup pointer on the page. 2444 */ 2445 if (deferred_reserve) { 2446 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 2447 h_cg, page); 2448 } 2449 2450 spin_unlock(&hugetlb_lock); 2451 2452 set_page_private(page, (unsigned long)spool); 2453 2454 map_commit = vma_commit_reservation(h, vma, addr); 2455 if (unlikely(map_chg > map_commit)) { 2456 /* 2457 * The page was added to the reservation map between 2458 * vma_needs_reservation and vma_commit_reservation. 2459 * This indicates a race with hugetlb_reserve_pages. 2460 * Adjust for the subpool count incremented above AND 2461 * in hugetlb_reserve_pages for the same page. Also, 2462 * the reservation count added in hugetlb_reserve_pages 2463 * no longer applies.
2464 */ 2465 long rsv_adjust; 2466 2467 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 2468 hugetlb_acct_memory(h, -rsv_adjust); 2469 } 2470 return page; 2471 2472 out_uncharge_cgroup: 2473 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 2474 out_uncharge_cgroup_reservation: 2475 if (deferred_reserve) 2476 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 2477 h_cg); 2478 out_subpool_put: 2479 if (map_chg || avoid_reserve) 2480 hugepage_subpool_put_pages(spool, 1); 2481 vma_end_reservation(h, vma, addr); 2482 return ERR_PTR(-ENOSPC); 2483 } 2484 2485 int alloc_bootmem_huge_page(struct hstate *h) 2486 __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); 2487 int __alloc_bootmem_huge_page(struct hstate *h) 2488 { 2489 struct huge_bootmem_page *m; 2490 int nr_nodes, node; 2491 2492 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 2493 void *addr; 2494 2495 addr = memblock_alloc_try_nid_raw( 2496 huge_page_size(h), huge_page_size(h), 2497 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); 2498 if (addr) { 2499 /* 2500 * Use the beginning of the huge page to store the 2501 * huge_bootmem_page struct (until gather_bootmem 2502 * puts them into the mem_map). 2503 */ 2504 m = addr; 2505 goto found; 2506 } 2507 } 2508 return 0; 2509 2510 found: 2511 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); 2512 /* Put them into a private list first because mem_map is not up yet */ 2513 INIT_LIST_HEAD(&m->list); 2514 list_add(&m->list, &huge_boot_pages); 2515 m->hstate = h; 2516 return 1; 2517 } 2518 2519 static void __init prep_compound_huge_page(struct page *page, 2520 unsigned int order) 2521 { 2522 if (unlikely(order > (MAX_ORDER - 1))) 2523 prep_compound_gigantic_page(page, order); 2524 else 2525 prep_compound_page(page, order); 2526 } 2527 2528 /* Put bootmem huge pages into the standard lists after mem_map is up */ 2529 static void __init gather_bootmem_prealloc(void) 2530 { 2531 struct huge_bootmem_page *m; 2532 2533 list_for_each_entry(m, &huge_boot_pages, list) { 2534 struct page *page = virt_to_page(m); 2535 struct hstate *h = m->hstate; 2536 2537 WARN_ON(page_count(page) != 1); 2538 prep_compound_huge_page(page, h->order); 2539 WARN_ON(PageReserved(page)); 2540 prep_new_huge_page(h, page, page_to_nid(page)); 2541 put_page(page); /* free it into the hugepage allocator */ 2542 2543 /* 2544 * If we had gigantic hugepages allocated at boot time, we need 2545 * to restore the 'stolen' pages to totalram_pages in order to 2546 * fix confusing memory reports from free(1) and another 2547 * side-effects, like CommitLimit going negative. 2548 */ 2549 if (hstate_is_gigantic(h)) 2550 adjust_managed_page_count(page, 1 << h->order); 2551 cond_resched(); 2552 } 2553 } 2554 2555 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 2556 { 2557 unsigned long i; 2558 nodemask_t *node_alloc_noretry; 2559 2560 if (!hstate_is_gigantic(h)) { 2561 /* 2562 * Bit mask controlling how hard we retry per-node allocations. 2563 * Ignore errors as lower level routines can deal with 2564 * node_alloc_noretry == NULL. If this kmalloc fails at boot 2565 * time, we are likely in bigger trouble. 
2566 */ 2567 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 2568 GFP_KERNEL); 2569 } else { 2570 /* allocations done at boot time */ 2571 node_alloc_noretry = NULL; 2572 } 2573 2574 /* bit mask controlling how hard we retry per-node allocations */ 2575 if (node_alloc_noretry) 2576 nodes_clear(*node_alloc_noretry); 2577 2578 for (i = 0; i < h->max_huge_pages; ++i) { 2579 if (hstate_is_gigantic(h)) { 2580 if (hugetlb_cma_size) { 2581 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); 2582 break; 2583 } 2584 if (!alloc_bootmem_huge_page(h)) 2585 break; 2586 } else if (!alloc_pool_huge_page(h, 2587 &node_states[N_MEMORY], 2588 node_alloc_noretry)) 2589 break; 2590 cond_resched(); 2591 } 2592 if (i < h->max_huge_pages) { 2593 char buf[32]; 2594 2595 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 2596 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", 2597 h->max_huge_pages, buf, i); 2598 h->max_huge_pages = i; 2599 } 2600 2601 kfree(node_alloc_noretry); 2602 } 2603 2604 static void __init hugetlb_init_hstates(void) 2605 { 2606 struct hstate *h; 2607 2608 for_each_hstate(h) { 2609 if (minimum_order > huge_page_order(h)) 2610 minimum_order = huge_page_order(h); 2611 2612 /* oversize hugepages were init'ed in early boot */ 2613 if (!hstate_is_gigantic(h)) 2614 hugetlb_hstate_alloc_pages(h); 2615 } 2616 VM_BUG_ON(minimum_order == UINT_MAX); 2617 } 2618 2619 static void __init report_hugepages(void) 2620 { 2621 struct hstate *h; 2622 2623 for_each_hstate(h) { 2624 char buf[32]; 2625 2626 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 2627 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", 2628 buf, h->free_huge_pages); 2629 } 2630 } 2631 2632 #ifdef CONFIG_HIGHMEM 2633 static void try_to_free_low(struct hstate *h, unsigned long count, 2634 nodemask_t *nodes_allowed) 2635 { 2636 int i; 2637 2638 if (hstate_is_gigantic(h)) 2639 return; 2640 2641 for_each_node_mask(i, *nodes_allowed) { 2642 struct page *page, *next; 2643 struct list_head *freel = &h->hugepage_freelists[i]; 2644 list_for_each_entry_safe(page, next, freel, lru) { 2645 if (count >= h->nr_huge_pages) 2646 return; 2647 if (PageHighMem(page)) 2648 continue; 2649 list_del(&page->lru); 2650 update_and_free_page(h, page); 2651 h->free_huge_pages--; 2652 h->free_huge_pages_node[page_to_nid(page)]--; 2653 } 2654 } 2655 } 2656 #else 2657 static inline void try_to_free_low(struct hstate *h, unsigned long count, 2658 nodemask_t *nodes_allowed) 2659 { 2660 } 2661 #endif 2662 2663 /* 2664 * Increment or decrement surplus_huge_pages. Keep node-specific counters 2665 * balanced by operating on them in a round-robin fashion. 2666 * Returns 1 if an adjustment was made. 
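 *
 * delta must be +1 or -1, and hugetlb_lock is held by the only caller,
 * set_max_huge_pages(). A typical use is draining surplus pages while
 * growing the pool:
 *
 *	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
 *		if (!adjust_pool_surplus(h, nodes_allowed, -1))
 *			break;
 *	}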
2667 */ 2668 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 2669 int delta) 2670 { 2671 int nr_nodes, node; 2672 2673 VM_BUG_ON(delta != -1 && delta != 1); 2674 2675 if (delta < 0) { 2676 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 2677 if (h->surplus_huge_pages_node[node]) 2678 goto found; 2679 } 2680 } else { 2681 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 2682 if (h->surplus_huge_pages_node[node] < 2683 h->nr_huge_pages_node[node]) 2684 goto found; 2685 } 2686 } 2687 return 0; 2688 2689 found: 2690 h->surplus_huge_pages += delta; 2691 h->surplus_huge_pages_node[node] += delta; 2692 return 1; 2693 } 2694 2695 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 2696 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 2697 nodemask_t *nodes_allowed) 2698 { 2699 unsigned long min_count, ret; 2700 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 2701 2702 /* 2703 * Bit mask controlling how hard we retry per-node allocations. 2704 * If we can not allocate the bit mask, do not attempt to allocate 2705 * the requested huge pages. 2706 */ 2707 if (node_alloc_noretry) 2708 nodes_clear(*node_alloc_noretry); 2709 else 2710 return -ENOMEM; 2711 2712 spin_lock(&hugetlb_lock); 2713 2714 /* 2715 * Check for a node specific request. 2716 * Changing node specific huge page count may require a corresponding 2717 * change to the global count. In any case, the passed node mask 2718 * (nodes_allowed) will restrict alloc/free to the specified node. 2719 */ 2720 if (nid != NUMA_NO_NODE) { 2721 unsigned long old_count = count; 2722 2723 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 2724 /* 2725 * User may have specified a large count value which caused the 2726 * above calculation to overflow. In this case, they wanted 2727 * to allocate as many huge pages as possible. Set count to 2728 * largest possible value to align with their intention. 2729 */ 2730 if (count < old_count) 2731 count = ULONG_MAX; 2732 } 2733 2734 /* 2735 * Gigantic pages runtime allocation depend on the capability for large 2736 * page range allocation. 2737 * If the system does not provide this feature, return an error when 2738 * the user tries to allocate gigantic pages but let the user free the 2739 * boottime allocated gigantic pages. 2740 */ 2741 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 2742 if (count > persistent_huge_pages(h)) { 2743 spin_unlock(&hugetlb_lock); 2744 NODEMASK_FREE(node_alloc_noretry); 2745 return -EINVAL; 2746 } 2747 /* Fall through to decrease pool */ 2748 } 2749 2750 /* 2751 * Increase the pool size 2752 * First take pages out of surplus state. Then make up the 2753 * remaining difference by allocating fresh huge pages. 2754 * 2755 * We might race with alloc_surplus_huge_page() here and be unable 2756 * to convert a surplus huge page to a normal huge page. That is 2757 * not critical, though, it just means the overall size of the 2758 * pool might be one hugepage larger than it needs to be, but 2759 * within all the constraints specified by the sysctls. 2760 */ 2761 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 2762 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 2763 break; 2764 } 2765 2766 while (count > persistent_huge_pages(h)) { 2767 /* 2768 * If this allocation races such that we no longer need the 2769 * page, free_huge_page will handle it by freeing the page 2770 * and reducing the surplus. 
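 * hugetlb_lock is dropped across the allocation below because
 * alloc_pool_huge_page() ends up in the page allocator and may sleep.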
2771 */ 2772 spin_unlock(&hugetlb_lock); 2773 2774 /* yield cpu to avoid soft lockup */ 2775 cond_resched(); 2776 2777 ret = alloc_pool_huge_page(h, nodes_allowed, 2778 node_alloc_noretry); 2779 spin_lock(&hugetlb_lock); 2780 if (!ret) 2781 goto out; 2782 2783 /* Bail for signals. Probably ctrl-c from user */ 2784 if (signal_pending(current)) 2785 goto out; 2786 } 2787 2788 /* 2789 * Decrease the pool size 2790 * First return free pages to the buddy allocator (being careful 2791 * to keep enough around to satisfy reservations). Then place 2792 * pages into surplus state as needed so the pool will shrink 2793 * to the desired size as pages become free. 2794 * 2795 * By placing pages into the surplus state independent of the 2796 * overcommit value, we are allowing the surplus pool size to 2797 * exceed overcommit. There are few sane options here. Since 2798 * alloc_surplus_huge_page() is checking the global counter, 2799 * though, we'll note that we're not allowed to exceed surplus 2800 * and won't grow the pool anywhere else. Not until one of the 2801 * sysctls are changed, or the surplus pages go out of use. 2802 */ 2803 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 2804 min_count = max(count, min_count); 2805 try_to_free_low(h, min_count, nodes_allowed); 2806 while (min_count < persistent_huge_pages(h)) { 2807 if (!free_pool_huge_page(h, nodes_allowed, 0)) 2808 break; 2809 cond_resched_lock(&hugetlb_lock); 2810 } 2811 while (count < persistent_huge_pages(h)) { 2812 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 2813 break; 2814 } 2815 out: 2816 h->max_huge_pages = persistent_huge_pages(h); 2817 spin_unlock(&hugetlb_lock); 2818 2819 NODEMASK_FREE(node_alloc_noretry); 2820 2821 return 0; 2822 } 2823 2824 #define HSTATE_ATTR_RO(_name) \ 2825 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 2826 2827 #define HSTATE_ATTR(_name) \ 2828 static struct kobj_attribute _name##_attr = \ 2829 __ATTR(_name, 0644, _name##_show, _name##_store) 2830 2831 static struct kobject *hugepages_kobj; 2832 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 2833 2834 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 2835 2836 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 2837 { 2838 int i; 2839 2840 for (i = 0; i < HUGE_MAX_HSTATE; i++) 2841 if (hstate_kobjs[i] == kobj) { 2842 if (nidp) 2843 *nidp = NUMA_NO_NODE; 2844 return &hstates[i]; 2845 } 2846 2847 return kobj_to_node_hstate(kobj, nidp); 2848 } 2849 2850 static ssize_t nr_hugepages_show_common(struct kobject *kobj, 2851 struct kobj_attribute *attr, char *buf) 2852 { 2853 struct hstate *h; 2854 unsigned long nr_huge_pages; 2855 int nid; 2856 2857 h = kobj_to_hstate(kobj, &nid); 2858 if (nid == NUMA_NO_NODE) 2859 nr_huge_pages = h->nr_huge_pages; 2860 else 2861 nr_huge_pages = h->nr_huge_pages_node[nid]; 2862 2863 return sprintf(buf, "%lu\n", nr_huge_pages); 2864 } 2865 2866 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 2867 struct hstate *h, int nid, 2868 unsigned long count, size_t len) 2869 { 2870 int err; 2871 nodemask_t nodes_allowed, *n_mask; 2872 2873 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 2874 return -EINVAL; 2875 2876 if (nid == NUMA_NO_NODE) { 2877 /* 2878 * global hstate attribute 2879 */ 2880 if (!(obey_mempolicy && 2881 init_nodemask_of_mempolicy(&nodes_allowed))) 2882 n_mask = &node_states[N_MEMORY]; 2883 else 2884 n_mask = &nodes_allowed; 2885 } else { 2886 /* 2887 * Node specific request. 
count adjustment happens in 2888 * set_max_huge_pages() after acquiring hugetlb_lock. 2889 */ 2890 init_nodemask_of_node(&nodes_allowed, nid); 2891 n_mask = &nodes_allowed; 2892 } 2893 2894 err = set_max_huge_pages(h, count, nid, n_mask); 2895 2896 return err ? err : len; 2897 } 2898 2899 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 2900 struct kobject *kobj, const char *buf, 2901 size_t len) 2902 { 2903 struct hstate *h; 2904 unsigned long count; 2905 int nid; 2906 int err; 2907 2908 err = kstrtoul(buf, 10, &count); 2909 if (err) 2910 return err; 2911 2912 h = kobj_to_hstate(kobj, &nid); 2913 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 2914 } 2915 2916 static ssize_t nr_hugepages_show(struct kobject *kobj, 2917 struct kobj_attribute *attr, char *buf) 2918 { 2919 return nr_hugepages_show_common(kobj, attr, buf); 2920 } 2921 2922 static ssize_t nr_hugepages_store(struct kobject *kobj, 2923 struct kobj_attribute *attr, const char *buf, size_t len) 2924 { 2925 return nr_hugepages_store_common(false, kobj, buf, len); 2926 } 2927 HSTATE_ATTR(nr_hugepages); 2928 2929 #ifdef CONFIG_NUMA 2930 2931 /* 2932 * hstate attribute for optionally mempolicy-based constraint on persistent 2933 * huge page alloc/free. 2934 */ 2935 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 2936 struct kobj_attribute *attr, char *buf) 2937 { 2938 return nr_hugepages_show_common(kobj, attr, buf); 2939 } 2940 2941 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 2942 struct kobj_attribute *attr, const char *buf, size_t len) 2943 { 2944 return nr_hugepages_store_common(true, kobj, buf, len); 2945 } 2946 HSTATE_ATTR(nr_hugepages_mempolicy); 2947 #endif 2948 2949 2950 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 2951 struct kobj_attribute *attr, char *buf) 2952 { 2953 struct hstate *h = kobj_to_hstate(kobj, NULL); 2954 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 2955 } 2956 2957 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 2958 struct kobj_attribute *attr, const char *buf, size_t count) 2959 { 2960 int err; 2961 unsigned long input; 2962 struct hstate *h = kobj_to_hstate(kobj, NULL); 2963 2964 if (hstate_is_gigantic(h)) 2965 return -EINVAL; 2966 2967 err = kstrtoul(buf, 10, &input); 2968 if (err) 2969 return err; 2970 2971 spin_lock(&hugetlb_lock); 2972 h->nr_overcommit_huge_pages = input; 2973 spin_unlock(&hugetlb_lock); 2974 2975 return count; 2976 } 2977 HSTATE_ATTR(nr_overcommit_hugepages); 2978 2979 static ssize_t free_hugepages_show(struct kobject *kobj, 2980 struct kobj_attribute *attr, char *buf) 2981 { 2982 struct hstate *h; 2983 unsigned long free_huge_pages; 2984 int nid; 2985 2986 h = kobj_to_hstate(kobj, &nid); 2987 if (nid == NUMA_NO_NODE) 2988 free_huge_pages = h->free_huge_pages; 2989 else 2990 free_huge_pages = h->free_huge_pages_node[nid]; 2991 2992 return sprintf(buf, "%lu\n", free_huge_pages); 2993 } 2994 HSTATE_ATTR_RO(free_hugepages); 2995 2996 static ssize_t resv_hugepages_show(struct kobject *kobj, 2997 struct kobj_attribute *attr, char *buf) 2998 { 2999 struct hstate *h = kobj_to_hstate(kobj, NULL); 3000 return sprintf(buf, "%lu\n", h->resv_huge_pages); 3001 } 3002 HSTATE_ATTR_RO(resv_hugepages); 3003 3004 static ssize_t surplus_hugepages_show(struct kobject *kobj, 3005 struct kobj_attribute *attr, char *buf) 3006 { 3007 struct hstate *h; 3008 unsigned long surplus_huge_pages; 3009 int nid; 3010 3011 h = kobj_to_hstate(kobj, &nid); 3012 if (nid == NUMA_NO_NODE) 3013 
surplus_huge_pages = h->surplus_huge_pages; 3014 else 3015 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 3016 3017 return sprintf(buf, "%lu\n", surplus_huge_pages); 3018 } 3019 HSTATE_ATTR_RO(surplus_hugepages); 3020 3021 static struct attribute *hstate_attrs[] = { 3022 &nr_hugepages_attr.attr, 3023 &nr_overcommit_hugepages_attr.attr, 3024 &free_hugepages_attr.attr, 3025 &resv_hugepages_attr.attr, 3026 &surplus_hugepages_attr.attr, 3027 #ifdef CONFIG_NUMA 3028 &nr_hugepages_mempolicy_attr.attr, 3029 #endif 3030 NULL, 3031 }; 3032 3033 static const struct attribute_group hstate_attr_group = { 3034 .attrs = hstate_attrs, 3035 }; 3036 3037 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 3038 struct kobject **hstate_kobjs, 3039 const struct attribute_group *hstate_attr_group) 3040 { 3041 int retval; 3042 int hi = hstate_index(h); 3043 3044 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 3045 if (!hstate_kobjs[hi]) 3046 return -ENOMEM; 3047 3048 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 3049 if (retval) 3050 kobject_put(hstate_kobjs[hi]); 3051 3052 return retval; 3053 } 3054 3055 static void __init hugetlb_sysfs_init(void) 3056 { 3057 struct hstate *h; 3058 int err; 3059 3060 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 3061 if (!hugepages_kobj) 3062 return; 3063 3064 for_each_hstate(h) { 3065 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 3066 hstate_kobjs, &hstate_attr_group); 3067 if (err) 3068 pr_err("HugeTLB: Unable to add hstate %s", h->name); 3069 } 3070 } 3071 3072 #ifdef CONFIG_NUMA 3073 3074 /* 3075 * node_hstate/s - associate per node hstate attributes, via their kobjects, 3076 * with node devices in node_devices[] using a parallel array. The array 3077 * index of a node device or _hstate == node id. 3078 * This is here to avoid any static dependency of the node device driver, in 3079 * the base kernel, on the hugetlb module. 3080 */ 3081 struct node_hstate { 3082 struct kobject *hugepages_kobj; 3083 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3084 }; 3085 static struct node_hstate node_hstates[MAX_NUMNODES]; 3086 3087 /* 3088 * A subset of global hstate attributes for node devices 3089 */ 3090 static struct attribute *per_node_hstate_attrs[] = { 3091 &nr_hugepages_attr.attr, 3092 &free_hugepages_attr.attr, 3093 &surplus_hugepages_attr.attr, 3094 NULL, 3095 }; 3096 3097 static const struct attribute_group per_node_hstate_attr_group = { 3098 .attrs = per_node_hstate_attrs, 3099 }; 3100 3101 /* 3102 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 3103 * Returns node id via non-NULL nidp. 3104 */ 3105 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3106 { 3107 int nid; 3108 3109 for (nid = 0; nid < nr_node_ids; nid++) { 3110 struct node_hstate *nhs = &node_hstates[nid]; 3111 int i; 3112 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3113 if (nhs->hstate_kobjs[i] == kobj) { 3114 if (nidp) 3115 *nidp = nid; 3116 return &hstates[i]; 3117 } 3118 } 3119 3120 BUG(); 3121 return NULL; 3122 } 3123 3124 /* 3125 * Unregister hstate attributes from a single node device. 3126 * No-op if no hstate attributes attached. 
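 * This is the teardown counterpart of hugetlb_register_node() below; both are
 * handed to the node driver via register_hugetlbfs_with_node() so that node
 * hotplug events keep the sysfs attributes in sync.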
3127 */ 3128 static void hugetlb_unregister_node(struct node *node) 3129 { 3130 struct hstate *h; 3131 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3132 3133 if (!nhs->hugepages_kobj) 3134 return; /* no hstate attributes */ 3135 3136 for_each_hstate(h) { 3137 int idx = hstate_index(h); 3138 if (nhs->hstate_kobjs[idx]) { 3139 kobject_put(nhs->hstate_kobjs[idx]); 3140 nhs->hstate_kobjs[idx] = NULL; 3141 } 3142 } 3143 3144 kobject_put(nhs->hugepages_kobj); 3145 nhs->hugepages_kobj = NULL; 3146 } 3147 3148 3149 /* 3150 * Register hstate attributes for a single node device. 3151 * No-op if attributes already registered. 3152 */ 3153 static void hugetlb_register_node(struct node *node) 3154 { 3155 struct hstate *h; 3156 struct node_hstate *nhs = &node_hstates[node->dev.id]; 3157 int err; 3158 3159 if (nhs->hugepages_kobj) 3160 return; /* already allocated */ 3161 3162 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 3163 &node->dev.kobj); 3164 if (!nhs->hugepages_kobj) 3165 return; 3166 3167 for_each_hstate(h) { 3168 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 3169 nhs->hstate_kobjs, 3170 &per_node_hstate_attr_group); 3171 if (err) { 3172 pr_err("HugeTLB: Unable to add hstate %s for node %d\n", 3173 h->name, node->dev.id); 3174 hugetlb_unregister_node(node); 3175 break; 3176 } 3177 } 3178 } 3179 3180 /* 3181 * hugetlb init time: register hstate attributes for all registered node 3182 * devices of nodes that have memory. All on-line nodes should have 3183 * registered their associated device by this time. 3184 */ 3185 static void __init hugetlb_register_all_nodes(void) 3186 { 3187 int nid; 3188 3189 for_each_node_state(nid, N_MEMORY) { 3190 struct node *node = node_devices[nid]; 3191 if (node->dev.id == nid) 3192 hugetlb_register_node(node); 3193 } 3194 3195 /* 3196 * Let the node device driver know we're here so it can 3197 * [un]register hstate attributes on node hotplug. 3198 */ 3199 register_hugetlbfs_with_node(hugetlb_register_node, 3200 hugetlb_unregister_node); 3201 } 3202 #else /* !CONFIG_NUMA */ 3203 3204 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 3205 { 3206 BUG(); 3207 if (nidp) 3208 *nidp = -1; 3209 return NULL; 3210 } 3211 3212 static void hugetlb_register_all_nodes(void) { } 3213 3214 #endif 3215 3216 static int __init hugetlb_init(void) 3217 { 3218 int i; 3219 3220 if (!hugepages_supported()) { 3221 if (hugetlb_max_hstate || default_hstate_max_huge_pages) 3222 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); 3223 return 0; 3224 } 3225 3226 /* 3227 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some 3228 * architectures depend on setup being done here. 3229 */ 3230 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 3231 if (!parsed_default_hugepagesz) { 3232 /* 3233 * If we did not parse a default huge page size, set 3234 * default_hstate_idx to HPAGE_SIZE hstate. And, if the 3235 * number of huge pages for this default size was implicitly 3236 * specified, set that here as well. 3237 * Note that the implicit setting will overwrite an explicit 3238 * setting. A warning will be printed in this case. 
3239 */ 3240 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); 3241 if (default_hstate_max_huge_pages) { 3242 if (default_hstate.max_huge_pages) { 3243 char buf[32]; 3244 3245 string_get_size(huge_page_size(&default_hstate), 3246 1, STRING_UNITS_2, buf, 32); 3247 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", 3248 default_hstate.max_huge_pages, buf); 3249 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", 3250 default_hstate_max_huge_pages); 3251 } 3252 default_hstate.max_huge_pages = 3253 default_hstate_max_huge_pages; 3254 } 3255 } 3256 3257 hugetlb_cma_check(); 3258 hugetlb_init_hstates(); 3259 gather_bootmem_prealloc(); 3260 report_hugepages(); 3261 3262 hugetlb_sysfs_init(); 3263 hugetlb_register_all_nodes(); 3264 hugetlb_cgroup_file_init(); 3265 3266 #ifdef CONFIG_SMP 3267 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 3268 #else 3269 num_fault_mutexes = 1; 3270 #endif 3271 hugetlb_fault_mutex_table = 3272 kmalloc_array(num_fault_mutexes, sizeof(struct mutex), 3273 GFP_KERNEL); 3274 BUG_ON(!hugetlb_fault_mutex_table); 3275 3276 for (i = 0; i < num_fault_mutexes; i++) 3277 mutex_init(&hugetlb_fault_mutex_table[i]); 3278 return 0; 3279 } 3280 subsys_initcall(hugetlb_init); 3281 3282 /* Overwritten by architectures with more huge page sizes */ 3283 bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) 3284 { 3285 return size == HPAGE_SIZE; 3286 } 3287 3288 void __init hugetlb_add_hstate(unsigned int order) 3289 { 3290 struct hstate *h; 3291 unsigned long i; 3292 3293 if (size_to_hstate(PAGE_SIZE << order)) { 3294 return; 3295 } 3296 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 3297 BUG_ON(order == 0); 3298 h = &hstates[hugetlb_max_hstate++]; 3299 h->order = order; 3300 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 3301 h->nr_huge_pages = 0; 3302 h->free_huge_pages = 0; 3303 for (i = 0; i < MAX_NUMNODES; ++i) 3304 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 3305 INIT_LIST_HEAD(&h->hugepage_activelist); 3306 h->next_nid_to_alloc = first_memory_node; 3307 h->next_nid_to_free = first_memory_node; 3308 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 3309 huge_page_size(h)/1024); 3310 3311 parsed_hstate = h; 3312 } 3313 3314 /* 3315 * hugepages command line processing 3316 * hugepages normally follows a valid hugepagsz or default_hugepagsz 3317 * specification. If not, ignore the hugepages value. hugepages can also 3318 * be the first huge page command line option in which case it implicitly 3319 * specifies the number of huge pages for the default size. 3320 */ 3321 static int __init hugepages_setup(char *s) 3322 { 3323 unsigned long *mhp; 3324 static unsigned long *last_mhp; 3325 3326 if (!parsed_valid_hugepagesz) { 3327 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); 3328 parsed_valid_hugepagesz = true; 3329 return 0; 3330 } 3331 3332 /* 3333 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter 3334 * yet, so this hugepages= parameter goes to the "default hstate". 3335 * Otherwise, it goes with the previously parsed hugepagesz or 3336 * default_hugepagesz. 
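 * For example, booting with "hugepages=512" alone sizes the default hstate,
 * while "hugepagesz=1G hugepages=4" (on an architecture that supports 1G
 * pages) applies the count to the 1G hstate.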
3337 */ 3338 else if (!hugetlb_max_hstate) 3339 mhp = &default_hstate_max_huge_pages; 3340 else 3341 mhp = &parsed_hstate->max_huge_pages; 3342 3343 if (mhp == last_mhp) { 3344 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); 3345 return 0; 3346 } 3347 3348 if (sscanf(s, "%lu", mhp) <= 0) 3349 *mhp = 0; 3350 3351 /* 3352 * Global state is always initialized later in hugetlb_init. 3353 * But we need to allocate >= MAX_ORDER hstates here early to still 3354 * use the bootmem allocator. 3355 */ 3356 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) 3357 hugetlb_hstate_alloc_pages(parsed_hstate); 3358 3359 last_mhp = mhp; 3360 3361 return 1; 3362 } 3363 __setup("hugepages=", hugepages_setup); 3364 3365 /* 3366 * hugepagesz command line processing 3367 * A specific huge page size can only be specified once with hugepagesz. 3368 * hugepagesz is followed by hugepages on the command line. The global 3369 * variable 'parsed_valid_hugepagesz' is used to determine if prior 3370 * hugepagesz argument was valid. 3371 */ 3372 static int __init hugepagesz_setup(char *s) 3373 { 3374 unsigned long size; 3375 struct hstate *h; 3376 3377 parsed_valid_hugepagesz = false; 3378 size = (unsigned long)memparse(s, NULL); 3379 3380 if (!arch_hugetlb_valid_size(size)) { 3381 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); 3382 return 0; 3383 } 3384 3385 h = size_to_hstate(size); 3386 if (h) { 3387 /* 3388 * hstate for this size already exists. This is normally 3389 * an error, but is allowed if the existing hstate is the 3390 * default hstate. More specifically, it is only allowed if 3391 * the number of huge pages for the default hstate was not 3392 * previously specified. 3393 */ 3394 if (!parsed_default_hugepagesz || h != &default_hstate || 3395 default_hstate.max_huge_pages) { 3396 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); 3397 return 0; 3398 } 3399 3400 /* 3401 * No need to call hugetlb_add_hstate() as hstate already 3402 * exists. But, do set parsed_hstate so that a following 3403 * hugepages= parameter will be applied to this hstate. 3404 */ 3405 parsed_hstate = h; 3406 parsed_valid_hugepagesz = true; 3407 return 1; 3408 } 3409 3410 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 3411 parsed_valid_hugepagesz = true; 3412 return 1; 3413 } 3414 __setup("hugepagesz=", hugepagesz_setup); 3415 3416 /* 3417 * default_hugepagesz command line input 3418 * Only one instance of default_hugepagesz allowed on command line. 3419 */ 3420 static int __init default_hugepagesz_setup(char *s) 3421 { 3422 unsigned long size; 3423 3424 parsed_valid_hugepagesz = false; 3425 if (parsed_default_hugepagesz) { 3426 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); 3427 return 0; 3428 } 3429 3430 size = (unsigned long)memparse(s, NULL); 3431 3432 if (!arch_hugetlb_valid_size(size)) { 3433 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); 3434 return 0; 3435 } 3436 3437 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 3438 parsed_valid_hugepagesz = true; 3439 parsed_default_hugepagesz = true; 3440 default_hstate_idx = hstate_index(size_to_hstate(size)); 3441 3442 /* 3443 * The number of default huge pages (for this size) could have been 3444 * specified as the first hugetlb parameter: hugepages=X. If so, 3445 * then default_hstate_max_huge_pages is set. If the default huge 3446 * page size is gigantic (>= MAX_ORDER), then the pages must be 3447 * allocated here from bootmem allocator. 
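 * This covers command lines such as "hugepages=16 default_hugepagesz=1G",
 * where the count was seen before the size was known.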
3448 */ 3449 if (default_hstate_max_huge_pages) { 3450 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 3451 if (hstate_is_gigantic(&default_hstate)) 3452 hugetlb_hstate_alloc_pages(&default_hstate); 3453 default_hstate_max_huge_pages = 0; 3454 } 3455 3456 return 1; 3457 } 3458 __setup("default_hugepagesz=", default_hugepagesz_setup); 3459 3460 static unsigned int cpuset_mems_nr(unsigned int *array) 3461 { 3462 int node; 3463 unsigned int nr = 0; 3464 3465 for_each_node_mask(node, cpuset_current_mems_allowed) 3466 nr += array[node]; 3467 3468 return nr; 3469 } 3470 3471 #ifdef CONFIG_SYSCTL 3472 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 3473 struct ctl_table *table, int write, 3474 void *buffer, size_t *length, loff_t *ppos) 3475 { 3476 struct hstate *h = &default_hstate; 3477 unsigned long tmp = h->max_huge_pages; 3478 int ret; 3479 3480 if (!hugepages_supported()) 3481 return -EOPNOTSUPP; 3482 3483 table->data = &tmp; 3484 table->maxlen = sizeof(unsigned long); 3485 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 3486 if (ret) 3487 goto out; 3488 3489 if (write) 3490 ret = __nr_hugepages_store_common(obey_mempolicy, h, 3491 NUMA_NO_NODE, tmp, *length); 3492 out: 3493 return ret; 3494 } 3495 3496 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 3497 void *buffer, size_t *length, loff_t *ppos) 3498 { 3499 3500 return hugetlb_sysctl_handler_common(false, table, write, 3501 buffer, length, ppos); 3502 } 3503 3504 #ifdef CONFIG_NUMA 3505 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 3506 void *buffer, size_t *length, loff_t *ppos) 3507 { 3508 return hugetlb_sysctl_handler_common(true, table, write, 3509 buffer, length, ppos); 3510 } 3511 #endif /* CONFIG_NUMA */ 3512 3513 int hugetlb_overcommit_handler(struct ctl_table *table, int write, 3514 void *buffer, size_t *length, loff_t *ppos) 3515 { 3516 struct hstate *h = &default_hstate; 3517 unsigned long tmp; 3518 int ret; 3519 3520 if (!hugepages_supported()) 3521 return -EOPNOTSUPP; 3522 3523 tmp = h->nr_overcommit_huge_pages; 3524 3525 if (write && hstate_is_gigantic(h)) 3526 return -EINVAL; 3527 3528 table->data = &tmp; 3529 table->maxlen = sizeof(unsigned long); 3530 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 3531 if (ret) 3532 goto out; 3533 3534 if (write) { 3535 spin_lock(&hugetlb_lock); 3536 h->nr_overcommit_huge_pages = tmp; 3537 spin_unlock(&hugetlb_lock); 3538 } 3539 out: 3540 return ret; 3541 } 3542 3543 #endif /* CONFIG_SYSCTL */ 3544 3545 void hugetlb_report_meminfo(struct seq_file *m) 3546 { 3547 struct hstate *h; 3548 unsigned long total = 0; 3549 3550 if (!hugepages_supported()) 3551 return; 3552 3553 for_each_hstate(h) { 3554 unsigned long count = h->nr_huge_pages; 3555 3556 total += (PAGE_SIZE << huge_page_order(h)) * count; 3557 3558 if (h == &default_hstate) 3559 seq_printf(m, 3560 "HugePages_Total: %5lu\n" 3561 "HugePages_Free: %5lu\n" 3562 "HugePages_Rsvd: %5lu\n" 3563 "HugePages_Surp: %5lu\n" 3564 "Hugepagesize: %8lu kB\n", 3565 count, 3566 h->free_huge_pages, 3567 h->resv_huge_pages, 3568 h->surplus_huge_pages, 3569 (PAGE_SIZE << huge_page_order(h)) / 1024); 3570 } 3571 3572 seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); 3573 } 3574 3575 int hugetlb_report_node_meminfo(int nid, char *buf) 3576 { 3577 struct hstate *h = &default_hstate; 3578 if (!hugepages_supported()) 3579 return 0; 3580 return sprintf(buf, 3581 "Node %d HugePages_Total: %5u\n" 3582 "Node %d HugePages_Free: %5u\n" 3583 "Node %d 
HugePages_Surp: %5u\n", 3584 nid, h->nr_huge_pages_node[nid], 3585 nid, h->free_huge_pages_node[nid], 3586 nid, h->surplus_huge_pages_node[nid]); 3587 } 3588 3589 void hugetlb_show_meminfo(void) 3590 { 3591 struct hstate *h; 3592 int nid; 3593 3594 if (!hugepages_supported()) 3595 return; 3596 3597 for_each_node_state(nid, N_MEMORY) 3598 for_each_hstate(h) 3599 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 3600 nid, 3601 h->nr_huge_pages_node[nid], 3602 h->free_huge_pages_node[nid], 3603 h->surplus_huge_pages_node[nid], 3604 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 3605 } 3606 3607 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) 3608 { 3609 seq_printf(m, "HugetlbPages:\t%8lu kB\n", 3610 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); 3611 } 3612 3613 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ 3614 unsigned long hugetlb_total_pages(void) 3615 { 3616 struct hstate *h; 3617 unsigned long nr_total_pages = 0; 3618 3619 for_each_hstate(h) 3620 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 3621 return nr_total_pages; 3622 } 3623 3624 static int hugetlb_acct_memory(struct hstate *h, long delta) 3625 { 3626 int ret = -ENOMEM; 3627 3628 spin_lock(&hugetlb_lock); 3629 /* 3630 * When cpuset is configured, it breaks the strict hugetlb page 3631 * reservation as the accounting is done on a global variable. Such 3632 * reservation is completely rubbish in the presence of cpuset because 3633 * the reservation is not checked against page availability for the 3634 * current cpuset. An application can still potentially be OOM-killed 3635 * by the kernel if there are not enough free hugetlb pages in the 3636 * cpuset that the task is in. Enforcing strict accounting with cpuset 3637 * is almost impossible (or too ugly) because cpuset is so fluid that 3638 * tasks or memory nodes can be dynamically moved between cpusets. 3639 * 3640 * The change of semantics for shared hugetlb mapping with cpuset is 3641 * undesirable. However, in order to preserve some of the semantics, 3642 * we fall back to checking against current free page availability as 3643 * a best attempt and hopefully minimize the impact of changing 3644 * semantics that cpuset has. 3645 */ 3646 if (delta > 0) { 3647 if (gather_surplus_pages(h, delta) < 0) 3648 goto out; 3649 3650 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { 3651 return_unused_surplus_pages(h, delta); 3652 goto out; 3653 } 3654 } 3655 3656 ret = 0; 3657 if (delta < 0) 3658 return_unused_surplus_pages(h, (unsigned long) -delta); 3659 3660 out: 3661 spin_unlock(&hugetlb_lock); 3662 return ret; 3663 } 3664 3665 static void hugetlb_vm_op_open(struct vm_area_struct *vma) 3666 { 3667 struct resv_map *resv = vma_resv_map(vma); 3668 3669 /* 3670 * This new VMA should share its sibling's reservation map if present. 3671 * The VMA will only ever have a valid reservation map pointer where 3672 * it is being copied for another still existing VMA. As that VMA 3673 * has a reference to the reservation map it cannot disappear until 3674 * after this open call completes. It is therefore safe to take a 3675 * new reference here without additional locking.
3676 */ 3677 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3678 kref_get(&resv->refs); 3679 } 3680 3681 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 3682 { 3683 struct hstate *h = hstate_vma(vma); 3684 struct resv_map *resv = vma_resv_map(vma); 3685 struct hugepage_subpool *spool = subpool_vma(vma); 3686 unsigned long reserve, start, end; 3687 long gbl_reserve; 3688 3689 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3690 return; 3691 3692 start = vma_hugecache_offset(h, vma, vma->vm_start); 3693 end = vma_hugecache_offset(h, vma, vma->vm_end); 3694 3695 reserve = (end - start) - region_count(resv, start, end); 3696 hugetlb_cgroup_uncharge_counter(resv, start, end); 3697 if (reserve) { 3698 /* 3699 * Decrement reserve counts. The global reserve count may be 3700 * adjusted if the subpool has a minimum size. 3701 */ 3702 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 3703 hugetlb_acct_memory(h, -gbl_reserve); 3704 } 3705 3706 kref_put(&resv->refs, resv_map_release); 3707 } 3708 3709 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) 3710 { 3711 if (addr & ~(huge_page_mask(hstate_vma(vma)))) 3712 return -EINVAL; 3713 return 0; 3714 } 3715 3716 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) 3717 { 3718 struct hstate *hstate = hstate_vma(vma); 3719 3720 return 1UL << huge_page_shift(hstate); 3721 } 3722 3723 /* 3724 * We cannot handle pagefaults against hugetlb pages at all. They cause 3725 * handle_mm_fault() to try to instantiate regular-sized pages in the 3726 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get 3727 * this far. 3728 */ 3729 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) 3730 { 3731 BUG(); 3732 return 0; 3733 } 3734 3735 /* 3736 * When a new function is introduced to vm_operations_struct and added 3737 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 3738 * This is because, under the System V memory model, mappings created via 3739 * shmget/shmat with "huge page" specified are backed by hugetlbfs files, 3740 * and their original vm_ops are overwritten with shm_vm_ops.
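 * (shm_vm_ops is defined in ipc/shm.c and delegates to the vm_ops installed
 * here for the operations it implements, such as .fault and .pagesize.)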
3741 */ 3742 const struct vm_operations_struct hugetlb_vm_ops = { 3743 .fault = hugetlb_vm_op_fault, 3744 .open = hugetlb_vm_op_open, 3745 .close = hugetlb_vm_op_close, 3746 .split = hugetlb_vm_op_split, 3747 .pagesize = hugetlb_vm_op_pagesize, 3748 }; 3749 3750 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 3751 int writable) 3752 { 3753 pte_t entry; 3754 3755 if (writable) { 3756 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 3757 vma->vm_page_prot))); 3758 } else { 3759 entry = huge_pte_wrprotect(mk_huge_pte(page, 3760 vma->vm_page_prot)); 3761 } 3762 entry = pte_mkyoung(entry); 3763 entry = pte_mkhuge(entry); 3764 entry = arch_make_huge_pte(entry, vma, page, writable); 3765 3766 return entry; 3767 } 3768 3769 static void set_huge_ptep_writable(struct vm_area_struct *vma, 3770 unsigned long address, pte_t *ptep) 3771 { 3772 pte_t entry; 3773 3774 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 3775 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 3776 update_mmu_cache(vma, address, ptep); 3777 } 3778 3779 bool is_hugetlb_entry_migration(pte_t pte) 3780 { 3781 swp_entry_t swp; 3782 3783 if (huge_pte_none(pte) || pte_present(pte)) 3784 return false; 3785 swp = pte_to_swp_entry(pte); 3786 if (non_swap_entry(swp) && is_migration_entry(swp)) 3787 return true; 3788 else 3789 return false; 3790 } 3791 3792 static int is_hugetlb_entry_hwpoisoned(pte_t pte) 3793 { 3794 swp_entry_t swp; 3795 3796 if (huge_pte_none(pte) || pte_present(pte)) 3797 return 0; 3798 swp = pte_to_swp_entry(pte); 3799 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) 3800 return 1; 3801 else 3802 return 0; 3803 } 3804 3805 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 3806 struct vm_area_struct *vma) 3807 { 3808 pte_t *src_pte, *dst_pte, entry, dst_entry; 3809 struct page *ptepage; 3810 unsigned long addr; 3811 int cow; 3812 struct hstate *h = hstate_vma(vma); 3813 unsigned long sz = huge_page_size(h); 3814 struct address_space *mapping = vma->vm_file->f_mapping; 3815 struct mmu_notifier_range range; 3816 int ret = 0; 3817 3818 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 3819 3820 if (cow) { 3821 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, 3822 vma->vm_start, 3823 vma->vm_end); 3824 mmu_notifier_invalidate_range_start(&range); 3825 } else { 3826 /* 3827 * For shared mappings i_mmap_rwsem must be held to call 3828 * huge_pte_alloc, otherwise the returned ptep could go 3829 * away if part of a shared pmd and another thread calls 3830 * huge_pmd_unshare. 3831 */ 3832 i_mmap_lock_read(mapping); 3833 } 3834 3835 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 3836 spinlock_t *src_ptl, *dst_ptl; 3837 src_pte = huge_pte_offset(src, addr, sz); 3838 if (!src_pte) 3839 continue; 3840 dst_pte = huge_pte_alloc(dst, addr, sz); 3841 if (!dst_pte) { 3842 ret = -ENOMEM; 3843 break; 3844 } 3845 3846 /* 3847 * If the pagetables are shared don't copy or take references. 3848 * dst_pte == src_pte is the common case of src/dest sharing. 3849 * 3850 * However, src could have 'unshared' and dst shares with 3851 * another vma. If dst_pte !none, this implies sharing. 3852 * Check here before taking page table lock, and once again 3853 * after taking the lock below. 
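 *
 * (When a PMD is shared, both mm's PUD entries reference the same PMD
 * page, so huge_pte_offset()/huge_pte_alloc() return the very same
 * pointer for src and dst; copying in that case would double-count the
 * rmap and page references.)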
3854 */ 3855 dst_entry = huge_ptep_get(dst_pte); 3856 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) 3857 continue; 3858 3859 dst_ptl = huge_pte_lock(h, dst, dst_pte); 3860 src_ptl = huge_pte_lockptr(h, src, src_pte); 3861 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 3862 entry = huge_ptep_get(src_pte); 3863 dst_entry = huge_ptep_get(dst_pte); 3864 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { 3865 /* 3866 * Skip if src entry none. Also, skip in the 3867 * unlikely case dst entry !none as this implies 3868 * sharing with another vma. 3869 */ 3870 ; 3871 } else if (unlikely(is_hugetlb_entry_migration(entry) || 3872 is_hugetlb_entry_hwpoisoned(entry))) { 3873 swp_entry_t swp_entry = pte_to_swp_entry(entry); 3874 3875 if (is_write_migration_entry(swp_entry) && cow) { 3876 /* 3877 * COW mappings require pages in both 3878 * parent and child to be set to read. 3879 */ 3880 make_migration_entry_read(&swp_entry); 3881 entry = swp_entry_to_pte(swp_entry); 3882 set_huge_swap_pte_at(src, addr, src_pte, 3883 entry, sz); 3884 } 3885 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); 3886 } else { 3887 if (cow) { 3888 /* 3889 * No need to notify as we are downgrading page 3890 * table protection not changing it to point 3891 * to a new page. 3892 * 3893 * See Documentation/vm/mmu_notifier.rst 3894 */ 3895 huge_ptep_set_wrprotect(src, addr, src_pte); 3896 } 3897 entry = huge_ptep_get(src_pte); 3898 ptepage = pte_page(entry); 3899 get_page(ptepage); 3900 page_dup_rmap(ptepage, true); 3901 set_huge_pte_at(dst, addr, dst_pte, entry); 3902 hugetlb_count_add(pages_per_huge_page(h), dst); 3903 } 3904 spin_unlock(src_ptl); 3905 spin_unlock(dst_ptl); 3906 } 3907 3908 if (cow) 3909 mmu_notifier_invalidate_range_end(&range); 3910 else 3911 i_mmap_unlock_read(mapping); 3912 3913 return ret; 3914 } 3915 3916 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 3917 unsigned long start, unsigned long end, 3918 struct page *ref_page) 3919 { 3920 struct mm_struct *mm = vma->vm_mm; 3921 unsigned long address; 3922 pte_t *ptep; 3923 pte_t pte; 3924 spinlock_t *ptl; 3925 struct page *page; 3926 struct hstate *h = hstate_vma(vma); 3927 unsigned long sz = huge_page_size(h); 3928 struct mmu_notifier_range range; 3929 3930 WARN_ON(!is_vm_hugetlb_page(vma)); 3931 BUG_ON(start & ~huge_page_mask(h)); 3932 BUG_ON(end & ~huge_page_mask(h)); 3933 3934 /* 3935 * This is a hugetlb vma, all the pte entries should point 3936 * to huge page. 3937 */ 3938 tlb_change_page_size(tlb, sz); 3939 tlb_start_vma(tlb, vma); 3940 3941 /* 3942 * If sharing possible, alert mmu notifiers of worst case. 3943 */ 3944 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, 3945 end); 3946 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 3947 mmu_notifier_invalidate_range_start(&range); 3948 address = start; 3949 for (; address < end; address += sz) { 3950 ptep = huge_pte_offset(mm, address, sz); 3951 if (!ptep) 3952 continue; 3953 3954 ptl = huge_pte_lock(h, mm, ptep); 3955 if (huge_pmd_unshare(mm, &address, ptep)) { 3956 spin_unlock(ptl); 3957 /* 3958 * We just unmapped a page of PMDs by clearing a PUD. 3959 * The caller's TLB flush range should cover this area. 3960 */ 3961 continue; 3962 } 3963 3964 pte = huge_ptep_get(ptep); 3965 if (huge_pte_none(pte)) { 3966 spin_unlock(ptl); 3967 continue; 3968 } 3969 3970 /* 3971 * Migrating hugepage or HWPoisoned hugepage is already 3972 * unmapped and its refcount is dropped, so just clear pte here. 
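 *
 * (huge_pte_clear() is passed sz because some architectures, arm64 with
 * contiguous PTEs for example, back a single huge page with several
 * consecutive entries that all need to be cleared.)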
3973 */ 3974 if (unlikely(!pte_present(pte))) { 3975 huge_pte_clear(mm, address, ptep, sz); 3976 spin_unlock(ptl); 3977 continue; 3978 } 3979 3980 page = pte_page(pte); 3981 /* 3982 * If a reference page is supplied, it is because a specific 3983 * page is being unmapped, not a range. Ensure the page we 3984 * are about to unmap is the actual page of interest. 3985 */ 3986 if (ref_page) { 3987 if (page != ref_page) { 3988 spin_unlock(ptl); 3989 continue; 3990 } 3991 /* 3992 * Mark the VMA as having unmapped its page so that 3993 * future faults in this VMA will fail rather than 3994 * looking like data was lost 3995 */ 3996 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 3997 } 3998 3999 pte = huge_ptep_get_and_clear(mm, address, ptep); 4000 tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 4001 if (huge_pte_dirty(pte)) 4002 set_page_dirty(page); 4003 4004 hugetlb_count_sub(pages_per_huge_page(h), mm); 4005 page_remove_rmap(page, true); 4006 4007 spin_unlock(ptl); 4008 tlb_remove_page_size(tlb, page, huge_page_size(h)); 4009 /* 4010 * Bail out after unmapping reference page if supplied 4011 */ 4012 if (ref_page) 4013 break; 4014 } 4015 mmu_notifier_invalidate_range_end(&range); 4016 tlb_end_vma(tlb, vma); 4017 } 4018 4019 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 4020 struct vm_area_struct *vma, unsigned long start, 4021 unsigned long end, struct page *ref_page) 4022 { 4023 __unmap_hugepage_range(tlb, vma, start, end, ref_page); 4024 4025 /* 4026 * Clear this flag so that x86's huge_pmd_share page_table_shareable 4027 * test will fail on a vma being torn down, and not grab a page table 4028 * on its way out. We're lucky that the flag has such an appropriate 4029 * name, and can in fact be safely cleared here. We could clear it 4030 * before the __unmap_hugepage_range above, but all that's necessary 4031 * is to clear it before releasing the i_mmap_rwsem. This works 4032 * because in the context this is called, the VMA is about to be 4033 * destroyed and the i_mmap_rwsem is held. 4034 */ 4035 vma->vm_flags &= ~VM_MAYSHARE; 4036 } 4037 4038 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 4039 unsigned long end, struct page *ref_page) 4040 { 4041 struct mm_struct *mm; 4042 struct mmu_gather tlb; 4043 unsigned long tlb_start = start; 4044 unsigned long tlb_end = end; 4045 4046 /* 4047 * If shared PMDs were possibly used within this vma range, adjust 4048 * start/end for worst case tlb flushing. 4049 * Note that we can not be sure if PMDs are shared until we try to 4050 * unmap pages. However, we want to make sure TLB flushing covers 4051 * the largest possible range. 4052 */ 4053 adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); 4054 4055 mm = vma->vm_mm; 4056 4057 tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); 4058 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 4059 tlb_finish_mmu(&tlb, tlb_start, tlb_end); 4060 } 4061 4062 /* 4063 * This is called when the original mapper is failing to COW a MAP_PRIVATE 4064 * mappping it owns the reserve page for. The intention is to unmap the page 4065 * from other VMAs and let the children be SIGKILLed if they are faulting the 4066 * same region. 
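 *
 * (Typical sequence: a parent creates a MAP_PRIVATE hugetlb mapping and
 * owns its reservation, forks, and later writes to a page the child still
 * shares. If the COW allocation fails because the pool is exhausted, the
 * parent reclaims its own page by unmapping it from the children here; a
 * child that later faults on that range takes a fatal fault instead of
 * silently seeing zero-filled data.)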
4067 */ 4068 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 4069 struct page *page, unsigned long address) 4070 { 4071 struct hstate *h = hstate_vma(vma); 4072 struct vm_area_struct *iter_vma; 4073 struct address_space *mapping; 4074 pgoff_t pgoff; 4075 4076 /* 4077 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 4078 * from page cache lookup which is in HPAGE_SIZE units. 4079 */ 4080 address = address & huge_page_mask(h); 4081 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 4082 vma->vm_pgoff; 4083 mapping = vma->vm_file->f_mapping; 4084 4085 /* 4086 * Take the mapping lock for the duration of the table walk. As 4087 * this mapping should be shared between all the VMAs, 4088 * __unmap_hugepage_range() is called as the lock is already held 4089 */ 4090 i_mmap_lock_write(mapping); 4091 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 4092 /* Do not unmap the current VMA */ 4093 if (iter_vma == vma) 4094 continue; 4095 4096 /* 4097 * Shared VMAs have their own reserves and do not affect 4098 * MAP_PRIVATE accounting but it is possible that a shared 4099 * VMA is using the same page so check and skip such VMAs. 4100 */ 4101 if (iter_vma->vm_flags & VM_MAYSHARE) 4102 continue; 4103 4104 /* 4105 * Unmap the page from other VMAs without their own reserves. 4106 * They get marked to be SIGKILLed if they fault in these 4107 * areas. This is because a future no-page fault on this VMA 4108 * could insert a zeroed page instead of the data existing 4109 * from the time of fork. This would look like data corruption 4110 */ 4111 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 4112 unmap_hugepage_range(iter_vma, address, 4113 address + huge_page_size(h), page); 4114 } 4115 i_mmap_unlock_write(mapping); 4116 } 4117 4118 /* 4119 * Hugetlb_cow() should be called with page lock of the original hugepage held. 4120 * Called with hugetlb_instantiation_mutex held and pte_page locked so we 4121 * cannot race with other handlers or page migration. 4122 * Keep the pte_same checks anyway to make transition from the mutex easier. 4123 */ 4124 static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 4125 unsigned long address, pte_t *ptep, 4126 struct page *pagecache_page, spinlock_t *ptl) 4127 { 4128 pte_t pte; 4129 struct hstate *h = hstate_vma(vma); 4130 struct page *old_page, *new_page; 4131 int outside_reserve = 0; 4132 vm_fault_t ret = 0; 4133 unsigned long haddr = address & huge_page_mask(h); 4134 struct mmu_notifier_range range; 4135 4136 pte = huge_ptep_get(ptep); 4137 old_page = pte_page(pte); 4138 4139 retry_avoidcopy: 4140 /* If no-one else is actually using this page, avoid the copy 4141 * and just make the page writable */ 4142 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 4143 page_move_anon_rmap(old_page, vma); 4144 set_huge_ptep_writable(vma, haddr, ptep); 4145 return 0; 4146 } 4147 4148 /* 4149 * If the process that created a MAP_PRIVATE mapping is about to 4150 * perform a COW due to a shared page count, attempt to satisfy 4151 * the allocation without using the existing reserves. The pagecache 4152 * page is used to determine if the reserve at this address was 4153 * consumed or not. If reserves were used, a partial faulted mapping 4154 * at the time of fork() could consume its reserves on COW instead 4155 * of the full address range. 
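 *
 * (In short: old_page != pagecache_page means the page being COWed is
 * already a private copy whose allocation consumed this address's
 * reservation; letting the COW dip into the reserves again would steal a
 * page reserved for a not-yet-faulted address elsewhere in the mapping,
 * so the allocation below is attempted outside the reserves instead.)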
4156 */ 4157 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 4158 old_page != pagecache_page) 4159 outside_reserve = 1; 4160 4161 get_page(old_page); 4162 4163 /* 4164 * Drop page table lock as buddy allocator may be called. It will 4165 * be acquired again before returning to the caller, as expected. 4166 */ 4167 spin_unlock(ptl); 4168 new_page = alloc_huge_page(vma, haddr, outside_reserve); 4169 4170 if (IS_ERR(new_page)) { 4171 /* 4172 * If a process owning a MAP_PRIVATE mapping fails to COW, 4173 * it is due to references held by a child and an insufficient 4174 * huge page pool. To guarantee the original mappers 4175 * reliability, unmap the page from child processes. The child 4176 * may get SIGKILLed if it later faults. 4177 */ 4178 if (outside_reserve) { 4179 put_page(old_page); 4180 BUG_ON(huge_pte_none(pte)); 4181 unmap_ref_private(mm, vma, old_page, haddr); 4182 BUG_ON(huge_pte_none(pte)); 4183 spin_lock(ptl); 4184 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4185 if (likely(ptep && 4186 pte_same(huge_ptep_get(ptep), pte))) 4187 goto retry_avoidcopy; 4188 /* 4189 * race occurs while re-acquiring page table 4190 * lock, and our job is done. 4191 */ 4192 return 0; 4193 } 4194 4195 ret = vmf_error(PTR_ERR(new_page)); 4196 goto out_release_old; 4197 } 4198 4199 /* 4200 * When the original hugepage is shared one, it does not have 4201 * anon_vma prepared. 4202 */ 4203 if (unlikely(anon_vma_prepare(vma))) { 4204 ret = VM_FAULT_OOM; 4205 goto out_release_all; 4206 } 4207 4208 copy_user_huge_page(new_page, old_page, address, vma, 4209 pages_per_huge_page(h)); 4210 __SetPageUptodate(new_page); 4211 4212 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, 4213 haddr + huge_page_size(h)); 4214 mmu_notifier_invalidate_range_start(&range); 4215 4216 /* 4217 * Retake the page table lock to check for racing updates 4218 * before the page tables are altered 4219 */ 4220 spin_lock(ptl); 4221 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4222 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 4223 ClearPagePrivate(new_page); 4224 4225 /* Break COW */ 4226 huge_ptep_clear_flush(vma, haddr, ptep); 4227 mmu_notifier_invalidate_range(mm, range.start, range.end); 4228 set_huge_pte_at(mm, haddr, ptep, 4229 make_huge_pte(vma, new_page, 1)); 4230 page_remove_rmap(old_page, true); 4231 hugepage_add_new_anon_rmap(new_page, vma, haddr); 4232 set_page_huge_active(new_page); 4233 /* Make the old page be freed below */ 4234 new_page = old_page; 4235 } 4236 spin_unlock(ptl); 4237 mmu_notifier_invalidate_range_end(&range); 4238 out_release_all: 4239 restore_reserve_on_error(h, vma, haddr, new_page); 4240 put_page(new_page); 4241 out_release_old: 4242 put_page(old_page); 4243 4244 spin_lock(ptl); /* Caller expects lock to be held */ 4245 return ret; 4246 } 4247 4248 /* Return the pagecache page at a given address within a VMA */ 4249 static struct page *hugetlbfs_pagecache_page(struct hstate *h, 4250 struct vm_area_struct *vma, unsigned long address) 4251 { 4252 struct address_space *mapping; 4253 pgoff_t idx; 4254 4255 mapping = vma->vm_file->f_mapping; 4256 idx = vma_hugecache_offset(h, vma, address); 4257 4258 return find_lock_page(mapping, idx); 4259 } 4260 4261 /* 4262 * Return whether there is a pagecache page to back given address within VMA. 4263 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
4264 */ 4265 static bool hugetlbfs_pagecache_present(struct hstate *h, 4266 struct vm_area_struct *vma, unsigned long address) 4267 { 4268 struct address_space *mapping; 4269 pgoff_t idx; 4270 struct page *page; 4271 4272 mapping = vma->vm_file->f_mapping; 4273 idx = vma_hugecache_offset(h, vma, address); 4274 4275 page = find_get_page(mapping, idx); 4276 if (page) 4277 put_page(page); 4278 return page != NULL; 4279 } 4280 4281 int huge_add_to_page_cache(struct page *page, struct address_space *mapping, 4282 pgoff_t idx) 4283 { 4284 struct inode *inode = mapping->host; 4285 struct hstate *h = hstate_inode(inode); 4286 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 4287 4288 if (err) 4289 return err; 4290 ClearPagePrivate(page); 4291 4292 /* 4293 * set page dirty so that it will not be removed from cache/file 4294 * by non-hugetlbfs specific code paths. 4295 */ 4296 set_page_dirty(page); 4297 4298 spin_lock(&inode->i_lock); 4299 inode->i_blocks += blocks_per_huge_page(h); 4300 spin_unlock(&inode->i_lock); 4301 return 0; 4302 } 4303 4304 static vm_fault_t hugetlb_no_page(struct mm_struct *mm, 4305 struct vm_area_struct *vma, 4306 struct address_space *mapping, pgoff_t idx, 4307 unsigned long address, pte_t *ptep, unsigned int flags) 4308 { 4309 struct hstate *h = hstate_vma(vma); 4310 vm_fault_t ret = VM_FAULT_SIGBUS; 4311 int anon_rmap = 0; 4312 unsigned long size; 4313 struct page *page; 4314 pte_t new_pte; 4315 spinlock_t *ptl; 4316 unsigned long haddr = address & huge_page_mask(h); 4317 bool new_page = false; 4318 4319 /* 4320 * Currently, we are forced to kill the process in the event the 4321 * original mapper has unmapped pages from the child due to a failed 4322 * COW. Warn that such a situation has occurred as it may not be obvious 4323 */ 4324 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 4325 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 4326 current->pid); 4327 return ret; 4328 } 4329 4330 /* 4331 * We can not race with truncation due to holding i_mmap_rwsem. 4332 * i_size is modified when holding i_mmap_rwsem, so check here 4333 * once for faults beyond end of file. 4334 */ 4335 size = i_size_read(mapping->host) >> huge_page_shift(h); 4336 if (idx >= size) 4337 goto out; 4338 4339 retry: 4340 page = find_lock_page(mapping, idx); 4341 if (!page) { 4342 /* 4343 * Check for page in userfault range 4344 */ 4345 if (userfaultfd_missing(vma)) { 4346 u32 hash; 4347 struct vm_fault vmf = { 4348 .vma = vma, 4349 .address = haddr, 4350 .flags = flags, 4351 /* 4352 * Hard to debug if it ends up being 4353 * used by a callee that assumes 4354 * something about the other 4355 * uninitialized fields... same as in 4356 * memory.c 4357 */ 4358 }; 4359 4360 /* 4361 * hugetlb_fault_mutex and i_mmap_rwsem must be 4362 * dropped before handling userfault. Reacquire 4363 * after handling fault to make calling code simpler. 4364 */ 4365 hash = hugetlb_fault_mutex_hash(mapping, idx); 4366 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4367 i_mmap_unlock_read(mapping); 4368 ret = handle_userfault(&vmf, VM_UFFD_MISSING); 4369 i_mmap_lock_read(mapping); 4370 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4371 goto out; 4372 } 4373 4374 page = alloc_huge_page(vma, haddr, 0); 4375 if (IS_ERR(page)) { 4376 /* 4377 * Returning error will result in faulting task being 4378 * sent SIGBUS. The hugetlb fault mutex prevents two 4379 * tasks from racing to fault in the same page which 4380 * could result in false unable to allocate errors. 
4381 * Page migration does not take the fault mutex, but 4382 * does a clear then write of pte's under page table 4383 * lock. Page fault code could race with migration, 4384 * notice the clear pte and try to allocate a page 4385 * here. Before returning error, get ptl and make 4386 * sure there really is no pte entry. 4387 */ 4388 ptl = huge_pte_lock(h, mm, ptep); 4389 if (!huge_pte_none(huge_ptep_get(ptep))) { 4390 ret = 0; 4391 spin_unlock(ptl); 4392 goto out; 4393 } 4394 spin_unlock(ptl); 4395 ret = vmf_error(PTR_ERR(page)); 4396 goto out; 4397 } 4398 clear_huge_page(page, address, pages_per_huge_page(h)); 4399 __SetPageUptodate(page); 4400 new_page = true; 4401 4402 if (vma->vm_flags & VM_MAYSHARE) { 4403 int err = huge_add_to_page_cache(page, mapping, idx); 4404 if (err) { 4405 put_page(page); 4406 if (err == -EEXIST) 4407 goto retry; 4408 goto out; 4409 } 4410 } else { 4411 lock_page(page); 4412 if (unlikely(anon_vma_prepare(vma))) { 4413 ret = VM_FAULT_OOM; 4414 goto backout_unlocked; 4415 } 4416 anon_rmap = 1; 4417 } 4418 } else { 4419 /* 4420 * If memory error occurs between mmap() and fault, some process 4421 * don't have hwpoisoned swap entry for errored virtual address. 4422 * So we need to block hugepage fault by PG_hwpoison bit check. 4423 */ 4424 if (unlikely(PageHWPoison(page))) { 4425 ret = VM_FAULT_HWPOISON | 4426 VM_FAULT_SET_HINDEX(hstate_index(h)); 4427 goto backout_unlocked; 4428 } 4429 } 4430 4431 /* 4432 * If we are going to COW a private mapping later, we examine the 4433 * pending reservations for this page now. This will ensure that 4434 * any allocations necessary to record that reservation occur outside 4435 * the spinlock. 4436 */ 4437 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 4438 if (vma_needs_reservation(h, vma, haddr) < 0) { 4439 ret = VM_FAULT_OOM; 4440 goto backout_unlocked; 4441 } 4442 /* Just decrements count, does not deallocate */ 4443 vma_end_reservation(h, vma, haddr); 4444 } 4445 4446 ptl = huge_pte_lock(h, mm, ptep); 4447 ret = 0; 4448 if (!huge_pte_none(huge_ptep_get(ptep))) 4449 goto backout; 4450 4451 if (anon_rmap) { 4452 ClearPagePrivate(page); 4453 hugepage_add_new_anon_rmap(page, vma, haddr); 4454 } else 4455 page_dup_rmap(page, true); 4456 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 4457 && (vma->vm_flags & VM_SHARED))); 4458 set_huge_pte_at(mm, haddr, ptep, new_pte); 4459 4460 hugetlb_count_add(pages_per_huge_page(h), mm); 4461 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 4462 /* Optimization, do the COW without a second fault */ 4463 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); 4464 } 4465 4466 spin_unlock(ptl); 4467 4468 /* 4469 * Only make newly allocated pages active. Existing pages found 4470 * in the pagecache could be !page_huge_active() if they have been 4471 * isolated for migration. 
4472 */ 4473 if (new_page) 4474 set_page_huge_active(page); 4475 4476 unlock_page(page); 4477 out: 4478 return ret; 4479 4480 backout: 4481 spin_unlock(ptl); 4482 backout_unlocked: 4483 unlock_page(page); 4484 restore_reserve_on_error(h, vma, haddr, page); 4485 put_page(page); 4486 goto out; 4487 } 4488 4489 #ifdef CONFIG_SMP 4490 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 4491 { 4492 unsigned long key[2]; 4493 u32 hash; 4494 4495 key[0] = (unsigned long) mapping; 4496 key[1] = idx; 4497 4498 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); 4499 4500 return hash & (num_fault_mutexes - 1); 4501 } 4502 #else 4503 /* 4504 * For uniprocesor systems we always use a single mutex, so just 4505 * return 0 and avoid the hashing overhead. 4506 */ 4507 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 4508 { 4509 return 0; 4510 } 4511 #endif 4512 4513 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 4514 unsigned long address, unsigned int flags) 4515 { 4516 pte_t *ptep, entry; 4517 spinlock_t *ptl; 4518 vm_fault_t ret; 4519 u32 hash; 4520 pgoff_t idx; 4521 struct page *page = NULL; 4522 struct page *pagecache_page = NULL; 4523 struct hstate *h = hstate_vma(vma); 4524 struct address_space *mapping; 4525 int need_wait_lock = 0; 4526 unsigned long haddr = address & huge_page_mask(h); 4527 4528 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 4529 if (ptep) { 4530 /* 4531 * Since we hold no locks, ptep could be stale. That is 4532 * OK as we are only making decisions based on content and 4533 * not actually modifying content here. 4534 */ 4535 entry = huge_ptep_get(ptep); 4536 if (unlikely(is_hugetlb_entry_migration(entry))) { 4537 migration_entry_wait_huge(vma, mm, ptep); 4538 return 0; 4539 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 4540 return VM_FAULT_HWPOISON_LARGE | 4541 VM_FAULT_SET_HINDEX(hstate_index(h)); 4542 } else { 4543 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); 4544 if (!ptep) 4545 return VM_FAULT_OOM; 4546 } 4547 4548 /* 4549 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold 4550 * until finished with ptep. This serves two purposes: 4551 * 1) It prevents huge_pmd_unshare from being called elsewhere 4552 * and making the ptep no longer valid. 4553 * 2) It synchronizes us with i_size modifications during truncation. 4554 * 4555 * ptep could have already be assigned via huge_pte_offset. That 4556 * is OK, as huge_pte_alloc will return the same value unless 4557 * something has changed. 4558 */ 4559 mapping = vma->vm_file->f_mapping; 4560 i_mmap_lock_read(mapping); 4561 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); 4562 if (!ptep) { 4563 i_mmap_unlock_read(mapping); 4564 return VM_FAULT_OOM; 4565 } 4566 4567 /* 4568 * Serialize hugepage allocation and instantiation, so that we don't 4569 * get spurious allocation failures if two CPUs race to instantiate 4570 * the same page in the page cache. 4571 */ 4572 idx = vma_hugecache_offset(h, vma, haddr); 4573 hash = hugetlb_fault_mutex_hash(mapping, idx); 4574 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4575 4576 entry = huge_ptep_get(ptep); 4577 if (huge_pte_none(entry)) { 4578 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 4579 goto out_mutex; 4580 } 4581 4582 ret = 0; 4583 4584 /* 4585 * entry could be a migration/hwpoison entry at this point, so this 4586 * check prevents the kernel from going below assuming that we have 4587 * an active hugepage in pagecache. 
This goto expects the 2nd page 4588 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will 4589 * properly handle it. 4590 */ 4591 if (!pte_present(entry)) 4592 goto out_mutex; 4593 4594 /* 4595 * If we are going to COW the mapping later, we examine the pending 4596 * reservations for this page now. This will ensure that any 4597 * allocations necessary to record that reservation occur outside the 4598 * spinlock. For private mappings, we also lookup the pagecache 4599 * page now as it is used to determine if a reservation has been 4600 * consumed. 4601 */ 4602 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 4603 if (vma_needs_reservation(h, vma, haddr) < 0) { 4604 ret = VM_FAULT_OOM; 4605 goto out_mutex; 4606 } 4607 /* Just decrements count, does not deallocate */ 4608 vma_end_reservation(h, vma, haddr); 4609 4610 if (!(vma->vm_flags & VM_MAYSHARE)) 4611 pagecache_page = hugetlbfs_pagecache_page(h, 4612 vma, haddr); 4613 } 4614 4615 ptl = huge_pte_lock(h, mm, ptep); 4616 4617 /* Check for a racing update before calling hugetlb_cow */ 4618 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 4619 goto out_ptl; 4620 4621 /* 4622 * hugetlb_cow() requires page locks of pte_page(entry) and 4623 * pagecache_page, so here we need take the former one 4624 * when page != pagecache_page or !pagecache_page. 4625 */ 4626 page = pte_page(entry); 4627 if (page != pagecache_page) 4628 if (!trylock_page(page)) { 4629 need_wait_lock = 1; 4630 goto out_ptl; 4631 } 4632 4633 get_page(page); 4634 4635 if (flags & FAULT_FLAG_WRITE) { 4636 if (!huge_pte_write(entry)) { 4637 ret = hugetlb_cow(mm, vma, address, ptep, 4638 pagecache_page, ptl); 4639 goto out_put_page; 4640 } 4641 entry = huge_pte_mkdirty(entry); 4642 } 4643 entry = pte_mkyoung(entry); 4644 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, 4645 flags & FAULT_FLAG_WRITE)) 4646 update_mmu_cache(vma, haddr, ptep); 4647 out_put_page: 4648 if (page != pagecache_page) 4649 unlock_page(page); 4650 put_page(page); 4651 out_ptl: 4652 spin_unlock(ptl); 4653 4654 if (pagecache_page) { 4655 unlock_page(pagecache_page); 4656 put_page(pagecache_page); 4657 } 4658 out_mutex: 4659 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4660 i_mmap_unlock_read(mapping); 4661 /* 4662 * Generally it's safe to hold refcount during waiting page lock. But 4663 * here we just wait to defer the next page fault to avoid busy loop and 4664 * the page is not used after unlocked before returning from the current 4665 * page fault. So we are safe from accessing freed page, even if we wait 4666 * here without taking refcount. 4667 */ 4668 if (need_wait_lock) 4669 wait_on_page_locked(page); 4670 return ret; 4671 } 4672 4673 /* 4674 * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with 4675 * modifications for huge pages. 
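 *
 * (Rough sketch of the userspace side, assuming a hugetlbfs area
 * registered with UFFDIO_REGISTER_MODE_MISSING: the monitor fills a
 * struct uffdio_copy with huge-page-aligned dst/src/len and calls
 * ioctl(uffd, UFFDIO_COPY, &copy); that request reaches this function
 * via __mcopy_atomic_hugetlb().)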
4676 */ 4677 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, 4678 pte_t *dst_pte, 4679 struct vm_area_struct *dst_vma, 4680 unsigned long dst_addr, 4681 unsigned long src_addr, 4682 struct page **pagep) 4683 { 4684 struct address_space *mapping; 4685 pgoff_t idx; 4686 unsigned long size; 4687 int vm_shared = dst_vma->vm_flags & VM_SHARED; 4688 struct hstate *h = hstate_vma(dst_vma); 4689 pte_t _dst_pte; 4690 spinlock_t *ptl; 4691 int ret; 4692 struct page *page; 4693 4694 if (!*pagep) { 4695 ret = -ENOMEM; 4696 page = alloc_huge_page(dst_vma, dst_addr, 0); 4697 if (IS_ERR(page)) 4698 goto out; 4699 4700 ret = copy_huge_page_from_user(page, 4701 (const void __user *) src_addr, 4702 pages_per_huge_page(h), false); 4703 4704 /* fallback to copy_from_user outside mmap_lock */ 4705 if (unlikely(ret)) { 4706 ret = -ENOENT; 4707 *pagep = page; 4708 /* don't free the page */ 4709 goto out; 4710 } 4711 } else { 4712 page = *pagep; 4713 *pagep = NULL; 4714 } 4715 4716 /* 4717 * The memory barrier inside __SetPageUptodate makes sure that 4718 * preceding stores to the page contents become visible before 4719 * the set_pte_at() write. 4720 */ 4721 __SetPageUptodate(page); 4722 4723 mapping = dst_vma->vm_file->f_mapping; 4724 idx = vma_hugecache_offset(h, dst_vma, dst_addr); 4725 4726 /* 4727 * If shared, add to page cache 4728 */ 4729 if (vm_shared) { 4730 size = i_size_read(mapping->host) >> huge_page_shift(h); 4731 ret = -EFAULT; 4732 if (idx >= size) 4733 goto out_release_nounlock; 4734 4735 /* 4736 * Serialization between remove_inode_hugepages() and 4737 * huge_add_to_page_cache() below happens through the 4738 * hugetlb_fault_mutex_table that here must be hold by 4739 * the caller. 4740 */ 4741 ret = huge_add_to_page_cache(page, mapping, idx); 4742 if (ret) 4743 goto out_release_nounlock; 4744 } 4745 4746 ptl = huge_pte_lockptr(h, dst_mm, dst_pte); 4747 spin_lock(ptl); 4748 4749 /* 4750 * Recheck the i_size after holding PT lock to make sure not 4751 * to leave any page mapped (as page_mapped()) beyond the end 4752 * of the i_size (remove_inode_hugepages() is strict about 4753 * enforcing that). If we bail out here, we'll also leave a 4754 * page in the radix tree in the vm_shared case beyond the end 4755 * of the i_size, but remove_inode_hugepages() will take care 4756 * of it as soon as we drop the hugetlb_fault_mutex_table. 
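 *
 * (size below is i_size expressed in huge pages: for a 64 MiB hugetlbfs
 * file backed by 2 MiB pages, size == 32, so any idx >= 32 is beyond EOF
 * and fails with -EFAULT.)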
4757 */ 4758 size = i_size_read(mapping->host) >> huge_page_shift(h); 4759 ret = -EFAULT; 4760 if (idx >= size) 4761 goto out_release_unlock; 4762 4763 ret = -EEXIST; 4764 if (!huge_pte_none(huge_ptep_get(dst_pte))) 4765 goto out_release_unlock; 4766 4767 if (vm_shared) { 4768 page_dup_rmap(page, true); 4769 } else { 4770 ClearPagePrivate(page); 4771 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); 4772 } 4773 4774 _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); 4775 if (dst_vma->vm_flags & VM_WRITE) 4776 _dst_pte = huge_pte_mkdirty(_dst_pte); 4777 _dst_pte = pte_mkyoung(_dst_pte); 4778 4779 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 4780 4781 (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, 4782 dst_vma->vm_flags & VM_WRITE); 4783 hugetlb_count_add(pages_per_huge_page(h), dst_mm); 4784 4785 /* No need to invalidate - it was non-present before */ 4786 update_mmu_cache(dst_vma, dst_addr, dst_pte); 4787 4788 spin_unlock(ptl); 4789 set_page_huge_active(page); 4790 if (vm_shared) 4791 unlock_page(page); 4792 ret = 0; 4793 out: 4794 return ret; 4795 out_release_unlock: 4796 spin_unlock(ptl); 4797 if (vm_shared) 4798 unlock_page(page); 4799 out_release_nounlock: 4800 put_page(page); 4801 goto out; 4802 } 4803 4804 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 4805 struct page **pages, struct vm_area_struct **vmas, 4806 unsigned long *position, unsigned long *nr_pages, 4807 long i, unsigned int flags, int *locked) 4808 { 4809 unsigned long pfn_offset; 4810 unsigned long vaddr = *position; 4811 unsigned long remainder = *nr_pages; 4812 struct hstate *h = hstate_vma(vma); 4813 int err = -EFAULT; 4814 4815 while (vaddr < vma->vm_end && remainder) { 4816 pte_t *pte; 4817 spinlock_t *ptl = NULL; 4818 int absent; 4819 struct page *page; 4820 4821 /* 4822 * If we have a pending SIGKILL, don't keep faulting pages and 4823 * potentially allocating memory. 4824 */ 4825 if (fatal_signal_pending(current)) { 4826 remainder = 0; 4827 break; 4828 } 4829 4830 /* 4831 * Some archs (sparc64, sh*) have multiple pte_ts to 4832 * each hugepage. We have to make sure we get the 4833 * first, for the page indexing below to work. 4834 * 4835 * Note that page table lock is not held when pte is null. 4836 */ 4837 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), 4838 huge_page_size(h)); 4839 if (pte) 4840 ptl = huge_pte_lock(h, mm, pte); 4841 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 4842 4843 /* 4844 * When coredumping, it suits get_dump_page if we just return 4845 * an error where there's an empty slot with no huge pagecache 4846 * to back it. This way, we avoid allocating a hugepage, and 4847 * the sparse dumpfile avoids allocating disk blocks, but its 4848 * huge holes still show up with zeroes where they need to be. 4849 */ 4850 if (absent && (flags & FOLL_DUMP) && 4851 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 4852 if (pte) 4853 spin_unlock(ptl); 4854 remainder = 0; 4855 break; 4856 } 4857 4858 /* 4859 * We need call hugetlb_fault for both hugepages under migration 4860 * (in which case hugetlb_fault waits for the migration,) and 4861 * hwpoisoned hugepages (in which case we need to prevent the 4862 * caller from accessing to them.) In order to do this, we use 4863 * here is_swap_pte instead of is_hugetlb_entry_migration and 4864 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 4865 * both cases, and because we can't follow correct pages 4866 * directly from any kind of swap entries. 
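 *
 * (hugetlb pages are never swapped out, so a non-present, non-none entry
 * here can only be a migration or hwpoison entry, which is exactly what
 * is_swap_pte() covers.)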
4867 */ 4868 if (absent || is_swap_pte(huge_ptep_get(pte)) || 4869 ((flags & FOLL_WRITE) && 4870 !huge_pte_write(huge_ptep_get(pte)))) { 4871 vm_fault_t ret; 4872 unsigned int fault_flags = 0; 4873 4874 if (pte) 4875 spin_unlock(ptl); 4876 if (flags & FOLL_WRITE) 4877 fault_flags |= FAULT_FLAG_WRITE; 4878 if (locked) 4879 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 4880 FAULT_FLAG_KILLABLE; 4881 if (flags & FOLL_NOWAIT) 4882 fault_flags |= FAULT_FLAG_ALLOW_RETRY | 4883 FAULT_FLAG_RETRY_NOWAIT; 4884 if (flags & FOLL_TRIED) { 4885 /* 4886 * Note: FAULT_FLAG_ALLOW_RETRY and 4887 * FAULT_FLAG_TRIED can co-exist 4888 */ 4889 fault_flags |= FAULT_FLAG_TRIED; 4890 } 4891 ret = hugetlb_fault(mm, vma, vaddr, fault_flags); 4892 if (ret & VM_FAULT_ERROR) { 4893 err = vm_fault_to_errno(ret, flags); 4894 remainder = 0; 4895 break; 4896 } 4897 if (ret & VM_FAULT_RETRY) { 4898 if (locked && 4899 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) 4900 *locked = 0; 4901 *nr_pages = 0; 4902 /* 4903 * VM_FAULT_RETRY must not return an 4904 * error, it will return zero 4905 * instead. 4906 * 4907 * No need to update "position" as the 4908 * caller will not check it after 4909 * *nr_pages is set to 0. 4910 */ 4911 return i; 4912 } 4913 continue; 4914 } 4915 4916 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 4917 page = pte_page(huge_ptep_get(pte)); 4918 4919 /* 4920 * If subpage information not requested, update counters 4921 * and skip the same_page loop below. 4922 */ 4923 if (!pages && !vmas && !pfn_offset && 4924 (vaddr + huge_page_size(h) < vma->vm_end) && 4925 (remainder >= pages_per_huge_page(h))) { 4926 vaddr += huge_page_size(h); 4927 remainder -= pages_per_huge_page(h); 4928 i += pages_per_huge_page(h); 4929 spin_unlock(ptl); 4930 continue; 4931 } 4932 4933 same_page: 4934 if (pages) { 4935 pages[i] = mem_map_offset(page, pfn_offset); 4936 /* 4937 * try_grab_page() should always succeed here, because: 4938 * a) we hold the ptl lock, and b) we've just checked 4939 * that the huge page is present in the page tables. If 4940 * the huge page is present, then the tail pages must 4941 * also be present. The ptl prevents the head page and 4942 * tail pages from being rearranged in any way. So this 4943 * page must be available at this point, unless the page 4944 * refcount overflowed: 4945 */ 4946 if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) { 4947 spin_unlock(ptl); 4948 remainder = 0; 4949 err = -ENOMEM; 4950 break; 4951 } 4952 } 4953 4954 if (vmas) 4955 vmas[i] = vma; 4956 4957 vaddr += PAGE_SIZE; 4958 ++pfn_offset; 4959 --remainder; 4960 ++i; 4961 if (vaddr < vma->vm_end && remainder && 4962 pfn_offset < pages_per_huge_page(h)) { 4963 /* 4964 * We use pfn_offset to avoid touching the pageframes 4965 * of this compound page. 4966 */ 4967 goto same_page; 4968 } 4969 spin_unlock(ptl); 4970 } 4971 *nr_pages = remainder; 4972 /* 4973 * setting position is actually required only if remainder is 4974 * not zero but it's faster not to add a "if (remainder)" 4975 * branch. 4976 */ 4977 *position = vaddr; 4978 4979 return i ? i : err; 4980 } 4981 4982 #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE 4983 /* 4984 * ARCHes with special requirements for evicting HUGETLB backing TLB entries can 4985 * implement this. 
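 *
 * (Typically this is an MMU that wants page-size-aware invalidation for
 * huge mappings; everyone else gets the plain flush_tlb_range() fallback
 * defined below.)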
4986 */ 4987 #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) 4988 #endif 4989 4990 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 4991 unsigned long address, unsigned long end, pgprot_t newprot) 4992 { 4993 struct mm_struct *mm = vma->vm_mm; 4994 unsigned long start = address; 4995 pte_t *ptep; 4996 pte_t pte; 4997 struct hstate *h = hstate_vma(vma); 4998 unsigned long pages = 0; 4999 bool shared_pmd = false; 5000 struct mmu_notifier_range range; 5001 5002 /* 5003 * In the case of shared PMDs, the area to flush could be beyond 5004 * start/end. Set range.start/range.end to cover the maximum possible 5005 * range if PMD sharing is possible. 5006 */ 5007 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 5008 0, vma, mm, start, end); 5009 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 5010 5011 BUG_ON(address >= end); 5012 flush_cache_range(vma, range.start, range.end); 5013 5014 mmu_notifier_invalidate_range_start(&range); 5015 i_mmap_lock_write(vma->vm_file->f_mapping); 5016 for (; address < end; address += huge_page_size(h)) { 5017 spinlock_t *ptl; 5018 ptep = huge_pte_offset(mm, address, huge_page_size(h)); 5019 if (!ptep) 5020 continue; 5021 ptl = huge_pte_lock(h, mm, ptep); 5022 if (huge_pmd_unshare(mm, &address, ptep)) { 5023 pages++; 5024 spin_unlock(ptl); 5025 shared_pmd = true; 5026 continue; 5027 } 5028 pte = huge_ptep_get(ptep); 5029 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 5030 spin_unlock(ptl); 5031 continue; 5032 } 5033 if (unlikely(is_hugetlb_entry_migration(pte))) { 5034 swp_entry_t entry = pte_to_swp_entry(pte); 5035 5036 if (is_write_migration_entry(entry)) { 5037 pte_t newpte; 5038 5039 make_migration_entry_read(&entry); 5040 newpte = swp_entry_to_pte(entry); 5041 set_huge_swap_pte_at(mm, address, ptep, 5042 newpte, huge_page_size(h)); 5043 pages++; 5044 } 5045 spin_unlock(ptl); 5046 continue; 5047 } 5048 if (!huge_pte_none(pte)) { 5049 pte_t old_pte; 5050 5051 old_pte = huge_ptep_modify_prot_start(vma, address, ptep); 5052 pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); 5053 pte = arch_make_huge_pte(pte, vma, NULL, 0); 5054 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); 5055 pages++; 5056 } 5057 spin_unlock(ptl); 5058 } 5059 /* 5060 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 5061 * may have cleared our pud entry and done put_page on the page table: 5062 * once we release i_mmap_rwsem, another task can do the final put_page 5063 * and that page table be reused and filled with junk. If we actually 5064 * did unshare a page of pmds, flush the range corresponding to the pud. 5065 */ 5066 if (shared_pmd) 5067 flush_hugetlb_tlb_range(vma, range.start, range.end); 5068 else 5069 flush_hugetlb_tlb_range(vma, start, end); 5070 /* 5071 * No need to call mmu_notifier_invalidate_range() we are downgrading 5072 * page table protection not changing it to point to a new page. 
5073 * 5074 * See Documentation/vm/mmu_notifier.rst 5075 */ 5076 i_mmap_unlock_write(vma->vm_file->f_mapping); 5077 mmu_notifier_invalidate_range_end(&range); 5078 5079 return pages << h->order; 5080 } 5081 5082 int hugetlb_reserve_pages(struct inode *inode, 5083 long from, long to, 5084 struct vm_area_struct *vma, 5085 vm_flags_t vm_flags) 5086 { 5087 long ret, chg, add = -1; 5088 struct hstate *h = hstate_inode(inode); 5089 struct hugepage_subpool *spool = subpool_inode(inode); 5090 struct resv_map *resv_map; 5091 struct hugetlb_cgroup *h_cg = NULL; 5092 long gbl_reserve, regions_needed = 0; 5093 5094 /* This should never happen */ 5095 if (from > to) { 5096 VM_WARN(1, "%s called with a negative range\n", __func__); 5097 return -EINVAL; 5098 } 5099 5100 /* 5101 * Only apply hugepage reservation if asked. At fault time, an 5102 * attempt will be made for VM_NORESERVE to allocate a page 5103 * without using reserves 5104 */ 5105 if (vm_flags & VM_NORESERVE) 5106 return 0; 5107 5108 /* 5109 * Shared mappings base their reservation on the number of pages that 5110 * are already allocated on behalf of the file. Private mappings need 5111 * to reserve the full area even if read-only as mprotect() may be 5112 * called to make the mapping read-write. Assume !vma is a shm mapping 5113 */ 5114 if (!vma || vma->vm_flags & VM_MAYSHARE) { 5115 /* 5116 * resv_map can not be NULL as hugetlb_reserve_pages is only 5117 * called for inodes for which resv_maps were created (see 5118 * hugetlbfs_get_inode). 5119 */ 5120 resv_map = inode_resv_map(inode); 5121 5122 chg = region_chg(resv_map, from, to, ®ions_needed); 5123 5124 } else { 5125 /* Private mapping. */ 5126 resv_map = resv_map_alloc(); 5127 if (!resv_map) 5128 return -ENOMEM; 5129 5130 chg = to - from; 5131 5132 set_vma_resv_map(vma, resv_map); 5133 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 5134 } 5135 5136 if (chg < 0) { 5137 ret = chg; 5138 goto out_err; 5139 } 5140 5141 ret = hugetlb_cgroup_charge_cgroup_rsvd( 5142 hstate_index(h), chg * pages_per_huge_page(h), &h_cg); 5143 5144 if (ret < 0) { 5145 ret = -ENOMEM; 5146 goto out_err; 5147 } 5148 5149 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 5150 /* For private mappings, the hugetlb_cgroup uncharge info hangs 5151 * of the resv_map. 5152 */ 5153 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 5154 } 5155 5156 /* 5157 * There must be enough pages in the subpool for the mapping. If 5158 * the subpool has a minimum size, there may be some global 5159 * reservations already in place (gbl_reserve). 5160 */ 5161 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 5162 if (gbl_reserve < 0) { 5163 ret = -ENOSPC; 5164 goto out_uncharge_cgroup; 5165 } 5166 5167 /* 5168 * Check enough hugepages are available for the reservation. 5169 * Hand the pages back to the subpool if there are not 5170 */ 5171 ret = hugetlb_acct_memory(h, gbl_reserve); 5172 if (ret < 0) { 5173 goto out_put_pages; 5174 } 5175 5176 /* 5177 * Account for the reservations made. Shared mappings record regions 5178 * that have reservations as they are shared by multiple VMAs. 5179 * When the last VMA disappears, the region map says how much 5180 * the reservation was and the page cache tells how much of 5181 * the reservation was consumed. Private mappings are per-VMA and 5182 * only the consumed reservations are tracked. When the VMA 5183 * disappears, the original reservation is the VMA size and the 5184 * consumed reservations are stored in the map. 
Hence, nothing 5185 * else has to be done for private mappings here 5186 */ 5187 if (!vma || vma->vm_flags & VM_MAYSHARE) { 5188 add = region_add(resv_map, from, to, regions_needed, h, h_cg); 5189 5190 if (unlikely(add < 0)) { 5191 hugetlb_acct_memory(h, -gbl_reserve); 5192 goto out_put_pages; 5193 } else if (unlikely(chg > add)) { 5194 /* 5195 * pages in this range were added to the reserve 5196 * map between region_chg and region_add. This 5197 * indicates a race with alloc_huge_page. Adjust 5198 * the subpool and reserve counts modified above 5199 * based on the difference. 5200 */ 5201 long rsv_adjust; 5202 5203 hugetlb_cgroup_uncharge_cgroup_rsvd( 5204 hstate_index(h), 5205 (chg - add) * pages_per_huge_page(h), h_cg); 5206 5207 rsv_adjust = hugepage_subpool_put_pages(spool, 5208 chg - add); 5209 hugetlb_acct_memory(h, -rsv_adjust); 5210 } 5211 } 5212 return 0; 5213 out_put_pages: 5214 /* put back original number of pages, chg */ 5215 (void)hugepage_subpool_put_pages(spool, chg); 5216 out_uncharge_cgroup: 5217 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 5218 chg * pages_per_huge_page(h), h_cg); 5219 out_err: 5220 if (!vma || vma->vm_flags & VM_MAYSHARE) 5221 /* Only call region_abort if the region_chg succeeded but the 5222 * region_add failed or didn't run. 5223 */ 5224 if (chg >= 0 && add < 0) 5225 region_abort(resv_map, from, to, regions_needed); 5226 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 5227 kref_put(&resv_map->refs, resv_map_release); 5228 return ret; 5229 } 5230 5231 long hugetlb_unreserve_pages(struct inode *inode, long start, long end, 5232 long freed) 5233 { 5234 struct hstate *h = hstate_inode(inode); 5235 struct resv_map *resv_map = inode_resv_map(inode); 5236 long chg = 0; 5237 struct hugepage_subpool *spool = subpool_inode(inode); 5238 long gbl_reserve; 5239 5240 /* 5241 * Since this routine can be called in the evict inode path for all 5242 * hugetlbfs inodes, resv_map could be NULL. 5243 */ 5244 if (resv_map) { 5245 chg = region_del(resv_map, start, end); 5246 /* 5247 * region_del() can fail in the rare case where a region 5248 * must be split and another region descriptor can not be 5249 * allocated. If end == LONG_MAX, it will not fail. 5250 */ 5251 if (chg < 0) 5252 return chg; 5253 } 5254 5255 spin_lock(&inode->i_lock); 5256 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 5257 spin_unlock(&inode->i_lock); 5258 5259 /* 5260 * If the subpool has a minimum size, the number of global 5261 * reservations to be released may be adjusted. 5262 */ 5263 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 5264 hugetlb_acct_memory(h, -gbl_reserve); 5265 5266 return 0; 5267 } 5268 5269 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 5270 static unsigned long page_table_shareable(struct vm_area_struct *svma, 5271 struct vm_area_struct *vma, 5272 unsigned long addr, pgoff_t idx) 5273 { 5274 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 5275 svma->vm_start; 5276 unsigned long sbase = saddr & PUD_MASK; 5277 unsigned long s_end = sbase + PUD_SIZE; 5278 5279 /* Allow segments to share if only one is marked locked */ 5280 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; 5281 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; 5282 5283 /* 5284 * match the virtual addresses, permission and the alignment of the 5285 * page table page. 
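 *
 * (Worked example for x86-64: a PMD page maps 512 * 2 MiB = 1 GiB, i.e.
 * PUD_SIZE, of address space, so a sibling VMA is only usable when both
 * VMAs fully cover the surrounding 1 GiB-aligned range, map the same file
 * pages within it at the same offsets, and agree on the flags compared
 * below.)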
5286 */ 5287 if (pmd_index(addr) != pmd_index(saddr) || 5288 vm_flags != svm_flags || 5289 sbase < svma->vm_start || svma->vm_end < s_end) 5290 return 0; 5291 5292 return saddr; 5293 } 5294 5295 static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) 5296 { 5297 unsigned long base = addr & PUD_MASK; 5298 unsigned long end = base + PUD_SIZE; 5299 5300 /* 5301 * check on proper vm_flags and page table alignment 5302 */ 5303 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) 5304 return true; 5305 return false; 5306 } 5307 5308 /* 5309 * Determine if start,end range within vma could be mapped by shared pmd. 5310 * If yes, adjust start and end to cover range associated with possible 5311 * shared pmd mappings. 5312 */ 5313 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 5314 unsigned long *start, unsigned long *end) 5315 { 5316 unsigned long check_addr; 5317 5318 if (!(vma->vm_flags & VM_MAYSHARE)) 5319 return; 5320 5321 for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { 5322 unsigned long a_start = check_addr & PUD_MASK; 5323 unsigned long a_end = a_start + PUD_SIZE; 5324 5325 /* 5326 * If sharing is possible, adjust start/end if necessary. 5327 */ 5328 if (range_in_vma(vma, a_start, a_end)) { 5329 if (a_start < *start) 5330 *start = a_start; 5331 if (a_end > *end) 5332 *end = a_end; 5333 } 5334 } 5335 } 5336 5337 /* 5338 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 5339 * and returns the corresponding pte. While this is not necessary for the 5340 * !shared pmd case because we can allocate the pmd later as well, it makes the 5341 * code much cleaner. 5342 * 5343 * This routine must be called with i_mmap_rwsem held in at least read mode. 5344 * For hugetlbfs, this prevents removal of any page table entries associated 5345 * with the address space. This is important as we are setting up sharing 5346 * based on existing page table entries (mappings). 5347 */ 5348 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 5349 { 5350 struct vm_area_struct *vma = find_vma(mm, addr); 5351 struct address_space *mapping = vma->vm_file->f_mapping; 5352 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 5353 vma->vm_pgoff; 5354 struct vm_area_struct *svma; 5355 unsigned long saddr; 5356 pte_t *spte = NULL; 5357 pte_t *pte; 5358 spinlock_t *ptl; 5359 5360 if (!vma_shareable(vma, addr)) 5361 return (pte_t *)pmd_alloc(mm, pud, addr); 5362 5363 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 5364 if (svma == vma) 5365 continue; 5366 5367 saddr = page_table_shareable(svma, vma, addr, idx); 5368 if (saddr) { 5369 spte = huge_pte_offset(svma->vm_mm, saddr, 5370 vma_mmu_pagesize(svma)); 5371 if (spte) { 5372 get_page(virt_to_page(spte)); 5373 break; 5374 } 5375 } 5376 } 5377 5378 if (!spte) 5379 goto out; 5380 5381 ptl = huge_pte_lock(hstate_vma(vma), mm, spte); 5382 if (pud_none(*pud)) { 5383 pud_populate(mm, pud, 5384 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 5385 mm_inc_nr_pmds(mm); 5386 } else { 5387 put_page(virt_to_page(spte)); 5388 } 5389 spin_unlock(ptl); 5390 out: 5391 pte = (pte_t *)pmd_alloc(mm, pud, addr); 5392 return pte; 5393 } 5394 5395 /* 5396 * unmap huge page backed by shared pte. 5397 * 5398 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 5399 * indicated by page_count > 1, unmap is achieved by clearing pud and 5400 * decrementing the ref count. If count == 1, the pte page is not shared. 
5401 * 5402 * Called with page table lock held and i_mmap_rwsem held in write mode. 5403 * 5404 * returns: 1 successfully unmapped a shared pte page 5405 * 0 the underlying pte page is not shared, or it is the last user 5406 */ 5407 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 5408 { 5409 pgd_t *pgd = pgd_offset(mm, *addr); 5410 p4d_t *p4d = p4d_offset(pgd, *addr); 5411 pud_t *pud = pud_offset(p4d, *addr); 5412 5413 BUG_ON(page_count(virt_to_page(ptep)) == 0); 5414 if (page_count(virt_to_page(ptep)) == 1) 5415 return 0; 5416 5417 pud_clear(pud); 5418 put_page(virt_to_page(ptep)); 5419 mm_dec_nr_pmds(mm); 5420 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 5421 return 1; 5422 } 5423 #define want_pmd_share() (1) 5424 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 5425 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 5426 { 5427 return NULL; 5428 } 5429 5430 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 5431 { 5432 return 0; 5433 } 5434 5435 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 5436 unsigned long *start, unsigned long *end) 5437 { 5438 } 5439 #define want_pmd_share() (0) 5440 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 5441 5442 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 5443 pte_t *huge_pte_alloc(struct mm_struct *mm, 5444 unsigned long addr, unsigned long sz) 5445 { 5446 pgd_t *pgd; 5447 p4d_t *p4d; 5448 pud_t *pud; 5449 pte_t *pte = NULL; 5450 5451 pgd = pgd_offset(mm, addr); 5452 p4d = p4d_alloc(mm, pgd, addr); 5453 if (!p4d) 5454 return NULL; 5455 pud = pud_alloc(mm, p4d, addr); 5456 if (pud) { 5457 if (sz == PUD_SIZE) { 5458 pte = (pte_t *)pud; 5459 } else { 5460 BUG_ON(sz != PMD_SIZE); 5461 if (want_pmd_share() && pud_none(*pud)) 5462 pte = huge_pmd_share(mm, addr, pud); 5463 else 5464 pte = (pte_t *)pmd_alloc(mm, pud, addr); 5465 } 5466 } 5467 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); 5468 5469 return pte; 5470 } 5471 5472 /* 5473 * huge_pte_offset() - Walk the page table to resolve the hugepage 5474 * entry at address @addr 5475 * 5476 * Return: Pointer to page table entry (PUD or PMD) for 5477 * address @addr, or NULL if a !p*d_present() entry is encountered and the 5478 * size @sz doesn't match the hugepage size at this level of the page 5479 * table. 5480 */ 5481 pte_t *huge_pte_offset(struct mm_struct *mm, 5482 unsigned long addr, unsigned long sz) 5483 { 5484 pgd_t *pgd; 5485 p4d_t *p4d; 5486 pud_t *pud; 5487 pmd_t *pmd; 5488 5489 pgd = pgd_offset(mm, addr); 5490 if (!pgd_present(*pgd)) 5491 return NULL; 5492 p4d = p4d_offset(pgd, addr); 5493 if (!p4d_present(*p4d)) 5494 return NULL; 5495 5496 pud = pud_offset(p4d, addr); 5497 if (sz == PUD_SIZE) 5498 /* must be pud huge, non-present or none */ 5499 return (pte_t *)pud; 5500 if (!pud_present(*pud)) 5501 return NULL; 5502 /* must have a valid entry and size to go further */ 5503 5504 pmd = pmd_offset(pud, addr); 5505 /* must be pmd huge, non-present or none */ 5506 return (pte_t *)pmd; 5507 } 5508 5509 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 5510 5511 /* 5512 * These functions are overwritable if your architecture needs its own 5513 * behavior. 
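 *
 * (They are declared __weak below; an architecture overrides one simply
 * by providing an ordinary, non-weak definition of the same symbol, which
 * the linker then prefers.)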
5514 */ 5515 struct page * __weak 5516 follow_huge_addr(struct mm_struct *mm, unsigned long address, 5517 int write) 5518 { 5519 return ERR_PTR(-EINVAL); 5520 } 5521 5522 struct page * __weak 5523 follow_huge_pd(struct vm_area_struct *vma, 5524 unsigned long address, hugepd_t hpd, int flags, int pdshift) 5525 { 5526 WARN(1, "hugepd follow called with no support for hugepage directory format\n"); 5527 return NULL; 5528 } 5529 5530 struct page * __weak 5531 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 5532 pmd_t *pmd, int flags) 5533 { 5534 struct page *page = NULL; 5535 spinlock_t *ptl; 5536 pte_t pte; 5537 5538 /* FOLL_GET and FOLL_PIN are mutually exclusive. */ 5539 if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == 5540 (FOLL_PIN | FOLL_GET))) 5541 return NULL; 5542 5543 retry: 5544 ptl = pmd_lockptr(mm, pmd); 5545 spin_lock(ptl); 5546 /* 5547 * make sure that the address range covered by this pmd is not 5548 * unmapped from other threads. 5549 */ 5550 if (!pmd_huge(*pmd)) 5551 goto out; 5552 pte = huge_ptep_get((pte_t *)pmd); 5553 if (pte_present(pte)) { 5554 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 5555 /* 5556 * try_grab_page() should always succeed here, because: a) we 5557 * hold the pmd (ptl) lock, and b) we've just checked that the 5558 * huge pmd (head) page is present in the page tables. The ptl 5559 * prevents the head page and tail pages from being rearranged 5560 * in any way. So this page must be available at this point, 5561 * unless the page refcount overflowed: 5562 */ 5563 if (WARN_ON_ONCE(!try_grab_page(page, flags))) { 5564 page = NULL; 5565 goto out; 5566 } 5567 } else { 5568 if (is_hugetlb_entry_migration(pte)) { 5569 spin_unlock(ptl); 5570 __migration_entry_wait(mm, (pte_t *)pmd, ptl); 5571 goto retry; 5572 } 5573 /* 5574 * hwpoisoned entry is treated as no_page_table in 5575 * follow_page_mask(). 
		 */
	}
out:
	spin_unlock(ptl);
	return page;
}

struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
		pud_t *pud, int flags)
{
	if (flags & (FOLL_GET | FOLL_PIN))
		return NULL;

	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}

struct page * __weak
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
	if (flags & (FOLL_GET | FOLL_PIN))
		return NULL;

	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
}

bool isolate_huge_page(struct page *page, struct list_head *list)
{
	bool ret = true;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	spin_lock(&hugetlb_lock);
	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
		ret = false;
		goto unlock;
	}
	clear_page_huge_active(page);
	list_move_tail(&page->lru, list);
unlock:
	spin_unlock(&hugetlb_lock);
	return ret;
}

void putback_active_hugepage(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHead(page), page);
	spin_lock(&hugetlb_lock);
	set_page_huge_active(page);
	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
	spin_unlock(&hugetlb_lock);
	put_page(page);
}

void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
{
	struct hstate *h = page_hstate(oldpage);

	hugetlb_cgroup_migrate(oldpage, newpage);
	set_page_owner_migrate_reason(newpage, reason);

	/*
	 * Transfer the temporary state of the new huge page. This is the
	 * reverse of other transitions because the new page is going to
	 * be final while the old one will be freed, so it takes over the
	 * temporary status.
	 *
	 * Also note that we have to transfer the per-node surplus state
	 * here as well, otherwise the global surplus count will not match
	 * the per-node counts.
	 */
	if (PageHugeTemporary(newpage)) {
		int old_nid = page_to_nid(oldpage);
		int new_nid = page_to_nid(newpage);

		SetPageHugeTemporary(oldpage);
		ClearPageHugeTemporary(newpage);

		spin_lock(&hugetlb_lock);
		if (h->surplus_huge_pages_node[old_nid]) {
			h->surplus_huge_pages_node[old_nid]--;
			h->surplus_huge_pages_node[new_nid]++;
		}
		spin_unlock(&hugetlb_lock);
	}
}
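
/*
 * Illustrative sketch (not part of the upstream file): the three helpers
 * above exist for hugetlb page migration. A typical caller isolates an
 * active huge page onto a private list, hands the list to migrate_pages()
 * (which calls move_hugetlb_state() for each huge page it successfully
 * moves), and puts back whatever could not be migrated:
 *
 *	LIST_HEAD(pagelist);
 *
 *	if (isolate_huge_page(page, &pagelist)) {
 *		if (migrate_pages(&pagelist, new_page_fn, NULL, 0,
 *				  MIGRATE_SYNC, MR_MEMORY_FAILURE))
 *			putback_movable_pages(&pagelist);
 *	}
 *
 * putback_movable_pages() ends up in putback_active_hugepage() for hugetlb
 * pages. new_page_fn stands in for whatever allocation callback the caller
 * passes to migrate_pages(); it is assumed context, not defined here.
 */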

#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;

static int __init cmdline_parse_hugetlb_cma(char *p)
{
	hugetlb_cma_size = memparse(p, &p);
	return 0;
}

early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);

void __init hugetlb_cma_reserve(int order)
{
	unsigned long size, reserved, per_node;
	int nid;

	cma_reserve_called = true;

	if (!hugetlb_cma_size)
		return;

	if (hugetlb_cma_size < (PAGE_SIZE << order)) {
		pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
			(PAGE_SIZE << order) / SZ_1M);
		return;
	}

	/*
	 * If a 3 GB area is requested on a machine with 4 NUMA nodes,
	 * allocate 1 GB on the first three nodes and ignore the last one.
	 */
	per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
	pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
		hugetlb_cma_size / SZ_1M, per_node / SZ_1M);

	reserved = 0;
	for_each_node_state(nid, N_ONLINE) {
		int res;

		size = min(per_node, hugetlb_cma_size - reserved);
		size = round_up(size, PAGE_SIZE << order);

		res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
						 0, false, "hugetlb",
						 &hugetlb_cma[nid], nid);
		if (res) {
			pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n",
				res, nid);
			continue;
		}

		reserved += size;
		pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
			size / SZ_1M, nid);

		if (reserved >= hugetlb_cma_size)
			break;
	}
}

void __init hugetlb_cma_check(void)
{
	if (!hugetlb_cma_size || cma_reserve_called)
		return;

	pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
}

#endif /* CONFIG_CMA */
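
/*
 * Illustrative sketch (not part of the upstream file): hugetlb_cma_reserve()
 * only does something when the admin boots with "hugetlb_cma=<size>" (for
 * example "hugetlb_cma=4G", parsed by the early_param above) and the
 * architecture calls it early in boot with the order of its gigantic pages.
 * A hypothetical arch hook might look like:
 *
 *	void __init arch_reserve_hugetlb_cma(void)
 *	{
 *		// arm64 and x86, for example, pass the 1 GiB page order
 *		hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
 *	}
 *
 * arch_reserve_hugetlb_cma() is an assumed name used for illustration only.
 * The per-node areas reserved here are later handed to cma_alloc() by the
 * gigantic page allocator earlier in this file, and hugetlb_cma_check()
 * warns when "hugetlb_cma=" was given but no architecture code ever called
 * hugetlb_cma_reserve().
 */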