1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Generic hugetlb support. 4 * (C) Nadia Yvette Chambers, April 2004 5 */ 6 #include <linux/list.h> 7 #include <linux/init.h> 8 #include <linux/mm.h> 9 #include <linux/seq_file.h> 10 #include <linux/sysctl.h> 11 #include <linux/highmem.h> 12 #include <linux/mmu_notifier.h> 13 #include <linux/nodemask.h> 14 #include <linux/pagemap.h> 15 #include <linux/mempolicy.h> 16 #include <linux/compiler.h> 17 #include <linux/cpuset.h> 18 #include <linux/mutex.h> 19 #include <linux/memblock.h> 20 #include <linux/sysfs.h> 21 #include <linux/slab.h> 22 #include <linux/sched/mm.h> 23 #include <linux/mmdebug.h> 24 #include <linux/sched/signal.h> 25 #include <linux/rmap.h> 26 #include <linux/string_helpers.h> 27 #include <linux/swap.h> 28 #include <linux/swapops.h> 29 #include <linux/jhash.h> 30 #include <linux/numa.h> 31 #include <linux/llist.h> 32 #include <linux/cma.h> 33 #include <linux/migrate.h> 34 #include <linux/nospec.h> 35 #include <linux/delayacct.h> 36 #include <linux/memory.h> 37 #include <linux/mm_inline.h> 38 39 #include <asm/page.h> 40 #include <asm/pgalloc.h> 41 #include <asm/tlb.h> 42 43 #include <linux/io.h> 44 #include <linux/hugetlb.h> 45 #include <linux/hugetlb_cgroup.h> 46 #include <linux/node.h> 47 #include <linux/page_owner.h> 48 #include "internal.h" 49 #include "hugetlb_vmemmap.h" 50 51 int hugetlb_max_hstate __read_mostly; 52 unsigned int default_hstate_idx; 53 struct hstate hstates[HUGE_MAX_HSTATE]; 54 55 #ifdef CONFIG_CMA 56 static struct cma *hugetlb_cma[MAX_NUMNODES]; 57 static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; 58 static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) 59 { 60 return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, 61 1 << order); 62 } 63 #else 64 static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) 65 { 66 return false; 67 } 68 #endif 69 static unsigned long hugetlb_cma_size __initdata; 70 71 __initdata LIST_HEAD(huge_boot_pages); 72 73 /* for command line parsing */ 74 static struct hstate * __initdata parsed_hstate; 75 static unsigned long __initdata default_hstate_max_huge_pages; 76 static bool __initdata parsed_valid_hugepagesz = true; 77 static bool __initdata parsed_default_hugepagesz; 78 static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; 79 80 /* 81 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, 82 * free_huge_pages, and surplus_huge_pages. 83 */ 84 DEFINE_SPINLOCK(hugetlb_lock); 85 86 /* 87 * Serializes faults on the same logical page. This is used to 88 * prevent spurious OOMs when the hugepage pool is fully utilized. 
89 */ 90 static int num_fault_mutexes; 91 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; 92 93 /* Forward declaration */ 94 static int hugetlb_acct_memory(struct hstate *h, long delta); 95 static void hugetlb_vma_lock_free(struct vm_area_struct *vma); 96 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); 97 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); 98 static void hugetlb_unshare_pmds(struct vm_area_struct *vma, 99 unsigned long start, unsigned long end); 100 static struct resv_map *vma_resv_map(struct vm_area_struct *vma); 101 102 static inline bool subpool_is_free(struct hugepage_subpool *spool) 103 { 104 if (spool->count) 105 return false; 106 if (spool->max_hpages != -1) 107 return spool->used_hpages == 0; 108 if (spool->min_hpages != -1) 109 return spool->rsv_hpages == spool->min_hpages; 110 111 return true; 112 } 113 114 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, 115 unsigned long irq_flags) 116 { 117 spin_unlock_irqrestore(&spool->lock, irq_flags); 118 119 /* If no pages are used, and no other handles to the subpool 120 * remain, give up any reservations based on minimum size and 121 * free the subpool */ 122 if (subpool_is_free(spool)) { 123 if (spool->min_hpages != -1) 124 hugetlb_acct_memory(spool->hstate, 125 -spool->min_hpages); 126 kfree(spool); 127 } 128 } 129 130 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, 131 long min_hpages) 132 { 133 struct hugepage_subpool *spool; 134 135 spool = kzalloc(sizeof(*spool), GFP_KERNEL); 136 if (!spool) 137 return NULL; 138 139 spin_lock_init(&spool->lock); 140 spool->count = 1; 141 spool->max_hpages = max_hpages; 142 spool->hstate = h; 143 spool->min_hpages = min_hpages; 144 145 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { 146 kfree(spool); 147 return NULL; 148 } 149 spool->rsv_hpages = min_hpages; 150 151 return spool; 152 } 153 154 void hugepage_put_subpool(struct hugepage_subpool *spool) 155 { 156 unsigned long flags; 157 158 spin_lock_irqsave(&spool->lock, flags); 159 BUG_ON(!spool->count); 160 spool->count--; 161 unlock_or_release_subpool(spool, flags); 162 } 163 164 /* 165 * Subpool accounting for allocating and reserving pages. 166 * Return -ENOMEM if there are not enough resources to satisfy the 167 * request. Otherwise, return the number of pages by which the 168 * global pools must be adjusted (upward). The returned value may 169 * only be different than the passed value (delta) in the case where 170 * a subpool minimum size must be maintained. 171 */ 172 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, 173 long delta) 174 { 175 long ret = delta; 176 177 if (!spool) 178 return ret; 179 180 spin_lock_irq(&spool->lock); 181 182 if (spool->max_hpages != -1) { /* maximum size accounting */ 183 if ((spool->used_hpages + delta) <= spool->max_hpages) 184 spool->used_hpages += delta; 185 else { 186 ret = -ENOMEM; 187 goto unlock_ret; 188 } 189 } 190 191 /* minimum size accounting */ 192 if (spool->min_hpages != -1 && spool->rsv_hpages) { 193 if (delta > spool->rsv_hpages) { 194 /* 195 * Asking for more reserves than those already taken on 196 * behalf of subpool. Return difference. 
197 */ 198 ret = delta - spool->rsv_hpages; 199 spool->rsv_hpages = 0; 200 } else { 201 ret = 0; /* reserves already accounted for */ 202 spool->rsv_hpages -= delta; 203 } 204 } 205 206 unlock_ret: 207 spin_unlock_irq(&spool->lock); 208 return ret; 209 } 210 211 /* 212 * Subpool accounting for freeing and unreserving pages. 213 * Return the number of global page reservations that must be dropped. 214 * The return value may only be different than the passed value (delta) 215 * in the case where a subpool minimum size must be maintained. 216 */ 217 static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, 218 long delta) 219 { 220 long ret = delta; 221 unsigned long flags; 222 223 if (!spool) 224 return delta; 225 226 spin_lock_irqsave(&spool->lock, flags); 227 228 if (spool->max_hpages != -1) /* maximum size accounting */ 229 spool->used_hpages -= delta; 230 231 /* minimum size accounting */ 232 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { 233 if (spool->rsv_hpages + delta <= spool->min_hpages) 234 ret = 0; 235 else 236 ret = spool->rsv_hpages + delta - spool->min_hpages; 237 238 spool->rsv_hpages += delta; 239 if (spool->rsv_hpages > spool->min_hpages) 240 spool->rsv_hpages = spool->min_hpages; 241 } 242 243 /* 244 * If hugetlbfs_put_super couldn't free spool due to an outstanding 245 * quota reference, free it now. 246 */ 247 unlock_or_release_subpool(spool, flags); 248 249 return ret; 250 } 251 252 static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 253 { 254 return HUGETLBFS_SB(inode->i_sb)->spool; 255 } 256 257 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 258 { 259 return subpool_inode(file_inode(vma->vm_file)); 260 } 261 262 /* 263 * hugetlb vma_lock helper routines 264 */ 265 void hugetlb_vma_lock_read(struct vm_area_struct *vma) 266 { 267 if (__vma_shareable_lock(vma)) { 268 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 269 270 down_read(&vma_lock->rw_sema); 271 } else if (__vma_private_lock(vma)) { 272 struct resv_map *resv_map = vma_resv_map(vma); 273 274 down_read(&resv_map->rw_sema); 275 } 276 } 277 278 void hugetlb_vma_unlock_read(struct vm_area_struct *vma) 279 { 280 if (__vma_shareable_lock(vma)) { 281 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 282 283 up_read(&vma_lock->rw_sema); 284 } else if (__vma_private_lock(vma)) { 285 struct resv_map *resv_map = vma_resv_map(vma); 286 287 up_read(&resv_map->rw_sema); 288 } 289 } 290 291 void hugetlb_vma_lock_write(struct vm_area_struct *vma) 292 { 293 if (__vma_shareable_lock(vma)) { 294 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 295 296 down_write(&vma_lock->rw_sema); 297 } else if (__vma_private_lock(vma)) { 298 struct resv_map *resv_map = vma_resv_map(vma); 299 300 down_write(&resv_map->rw_sema); 301 } 302 } 303 304 void hugetlb_vma_unlock_write(struct vm_area_struct *vma) 305 { 306 if (__vma_shareable_lock(vma)) { 307 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 308 309 up_write(&vma_lock->rw_sema); 310 } else if (__vma_private_lock(vma)) { 311 struct resv_map *resv_map = vma_resv_map(vma); 312 313 up_write(&resv_map->rw_sema); 314 } 315 } 316 317 int hugetlb_vma_trylock_write(struct vm_area_struct *vma) 318 { 319 320 if (__vma_shareable_lock(vma)) { 321 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 322 323 return down_write_trylock(&vma_lock->rw_sema); 324 } else if (__vma_private_lock(vma)) { 325 struct resv_map *resv_map = vma_resv_map(vma); 326 327 return 
down_write_trylock(&resv_map->rw_sema); 328 } 329 330 return 1; 331 } 332 333 void hugetlb_vma_assert_locked(struct vm_area_struct *vma) 334 { 335 if (__vma_shareable_lock(vma)) { 336 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 337 338 lockdep_assert_held(&vma_lock->rw_sema); 339 } else if (__vma_private_lock(vma)) { 340 struct resv_map *resv_map = vma_resv_map(vma); 341 342 lockdep_assert_held(&resv_map->rw_sema); 343 } 344 } 345 346 void hugetlb_vma_lock_release(struct kref *kref) 347 { 348 struct hugetlb_vma_lock *vma_lock = container_of(kref, 349 struct hugetlb_vma_lock, refs); 350 351 kfree(vma_lock); 352 } 353 354 static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) 355 { 356 struct vm_area_struct *vma = vma_lock->vma; 357 358 /* 359 * vma_lock structure may or not be released as a result of put, 360 * it certainly will no longer be attached to vma so clear pointer. 361 * Semaphore synchronizes access to vma_lock->vma field. 362 */ 363 vma_lock->vma = NULL; 364 vma->vm_private_data = NULL; 365 up_write(&vma_lock->rw_sema); 366 kref_put(&vma_lock->refs, hugetlb_vma_lock_release); 367 } 368 369 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) 370 { 371 if (__vma_shareable_lock(vma)) { 372 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 373 374 __hugetlb_vma_unlock_write_put(vma_lock); 375 } else if (__vma_private_lock(vma)) { 376 struct resv_map *resv_map = vma_resv_map(vma); 377 378 /* no free for anon vmas, but still need to unlock */ 379 up_write(&resv_map->rw_sema); 380 } 381 } 382 383 static void hugetlb_vma_lock_free(struct vm_area_struct *vma) 384 { 385 /* 386 * Only present in sharable vmas. 387 */ 388 if (!vma || !__vma_shareable_lock(vma)) 389 return; 390 391 if (vma->vm_private_data) { 392 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 393 394 down_write(&vma_lock->rw_sema); 395 __hugetlb_vma_unlock_write_put(vma_lock); 396 } 397 } 398 399 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) 400 { 401 struct hugetlb_vma_lock *vma_lock; 402 403 /* Only establish in (flags) sharable vmas */ 404 if (!vma || !(vma->vm_flags & VM_MAYSHARE)) 405 return; 406 407 /* Should never get here with non-NULL vm_private_data */ 408 if (vma->vm_private_data) 409 return; 410 411 vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); 412 if (!vma_lock) { 413 /* 414 * If we can not allocate structure, then vma can not 415 * participate in pmd sharing. This is only a possible 416 * performance enhancement and memory saving issue. 417 * However, the lock is also used to synchronize page 418 * faults with truncation. If the lock is not present, 419 * unlikely races could leave pages in a file past i_size 420 * until the file is removed. Warn in the unlikely case of 421 * allocation failure. 422 */ 423 pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); 424 return; 425 } 426 427 kref_init(&vma_lock->refs); 428 init_rwsem(&vma_lock->rw_sema); 429 vma_lock->vma = vma; 430 vma->vm_private_data = vma_lock; 431 } 432 433 /* Helper that removes a struct file_region from the resv_map cache and returns 434 * it for use. 
435 */ 436 static struct file_region * 437 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) 438 { 439 struct file_region *nrg; 440 441 VM_BUG_ON(resv->region_cache_count <= 0); 442 443 resv->region_cache_count--; 444 nrg = list_first_entry(&resv->region_cache, struct file_region, link); 445 list_del(&nrg->link); 446 447 nrg->from = from; 448 nrg->to = to; 449 450 return nrg; 451 } 452 453 static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, 454 struct file_region *rg) 455 { 456 #ifdef CONFIG_CGROUP_HUGETLB 457 nrg->reservation_counter = rg->reservation_counter; 458 nrg->css = rg->css; 459 if (rg->css) 460 css_get(rg->css); 461 #endif 462 } 463 464 /* Helper that records hugetlb_cgroup uncharge info. */ 465 static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, 466 struct hstate *h, 467 struct resv_map *resv, 468 struct file_region *nrg) 469 { 470 #ifdef CONFIG_CGROUP_HUGETLB 471 if (h_cg) { 472 nrg->reservation_counter = 473 &h_cg->rsvd_hugepage[hstate_index(h)]; 474 nrg->css = &h_cg->css; 475 /* 476 * The caller will hold exactly one h_cg->css reference for the 477 * whole contiguous reservation region. But this area might be 478 * scattered when there are already some file_regions reside in 479 * it. As a result, many file_regions may share only one css 480 * reference. In order to ensure that one file_region must hold 481 * exactly one h_cg->css reference, we should do css_get for 482 * each file_region and leave the reference held by caller 483 * untouched. 484 */ 485 css_get(&h_cg->css); 486 if (!resv->pages_per_hpage) 487 resv->pages_per_hpage = pages_per_huge_page(h); 488 /* pages_per_hpage should be the same for all entries in 489 * a resv_map. 490 */ 491 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); 492 } else { 493 nrg->reservation_counter = NULL; 494 nrg->css = NULL; 495 } 496 #endif 497 } 498 499 static void put_uncharge_info(struct file_region *rg) 500 { 501 #ifdef CONFIG_CGROUP_HUGETLB 502 if (rg->css) 503 css_put(rg->css); 504 #endif 505 } 506 507 static bool has_same_uncharge_info(struct file_region *rg, 508 struct file_region *org) 509 { 510 #ifdef CONFIG_CGROUP_HUGETLB 511 return rg->reservation_counter == org->reservation_counter && 512 rg->css == org->css; 513 514 #else 515 return true; 516 #endif 517 } 518 519 static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) 520 { 521 struct file_region *nrg, *prg; 522 523 prg = list_prev_entry(rg, link); 524 if (&prg->link != &resv->regions && prg->to == rg->from && 525 has_same_uncharge_info(prg, rg)) { 526 prg->to = rg->to; 527 528 list_del(&rg->link); 529 put_uncharge_info(rg); 530 kfree(rg); 531 532 rg = prg; 533 } 534 535 nrg = list_next_entry(rg, link); 536 if (&nrg->link != &resv->regions && nrg->from == rg->to && 537 has_same_uncharge_info(nrg, rg)) { 538 nrg->from = rg->from; 539 540 list_del(&rg->link); 541 put_uncharge_info(rg); 542 kfree(rg); 543 } 544 } 545 546 static inline long 547 hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, 548 long to, struct hstate *h, struct hugetlb_cgroup *cg, 549 long *regions_needed) 550 { 551 struct file_region *nrg; 552 553 if (!regions_needed) { 554 nrg = get_file_region_entry_from_cache(map, from, to); 555 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); 556 list_add(&nrg->link, rg); 557 coalesce_file_region(map, nrg); 558 } else 559 *regions_needed += 1; 560 561 return to - from; 562 } 563 564 /* 565 * Must be called with resv->lock held. 
566 * 567 * Calling this with regions_needed != NULL will count the number of pages 568 * to be added but will not modify the linked list. And regions_needed will 569 * indicate the number of file_regions needed in the cache to carry out to add 570 * the regions for this range. 571 */ 572 static long add_reservation_in_range(struct resv_map *resv, long f, long t, 573 struct hugetlb_cgroup *h_cg, 574 struct hstate *h, long *regions_needed) 575 { 576 long add = 0; 577 struct list_head *head = &resv->regions; 578 long last_accounted_offset = f; 579 struct file_region *iter, *trg = NULL; 580 struct list_head *rg = NULL; 581 582 if (regions_needed) 583 *regions_needed = 0; 584 585 /* In this loop, we essentially handle an entry for the range 586 * [last_accounted_offset, iter->from), at every iteration, with some 587 * bounds checking. 588 */ 589 list_for_each_entry_safe(iter, trg, head, link) { 590 /* Skip irrelevant regions that start before our range. */ 591 if (iter->from < f) { 592 /* If this region ends after the last accounted offset, 593 * then we need to update last_accounted_offset. 594 */ 595 if (iter->to > last_accounted_offset) 596 last_accounted_offset = iter->to; 597 continue; 598 } 599 600 /* When we find a region that starts beyond our range, we've 601 * finished. 602 */ 603 if (iter->from >= t) { 604 rg = iter->link.prev; 605 break; 606 } 607 608 /* Add an entry for last_accounted_offset -> iter->from, and 609 * update last_accounted_offset. 610 */ 611 if (iter->from > last_accounted_offset) 612 add += hugetlb_resv_map_add(resv, iter->link.prev, 613 last_accounted_offset, 614 iter->from, h, h_cg, 615 regions_needed); 616 617 last_accounted_offset = iter->to; 618 } 619 620 /* Handle the case where our range extends beyond 621 * last_accounted_offset. 622 */ 623 if (!rg) 624 rg = head->prev; 625 if (last_accounted_offset < t) 626 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, 627 t, h, h_cg, regions_needed); 628 629 return add; 630 } 631 632 /* Must be called with resv->lock acquired. Will drop lock to allocate entries. 633 */ 634 static int allocate_file_region_entries(struct resv_map *resv, 635 int regions_needed) 636 __must_hold(&resv->lock) 637 { 638 LIST_HEAD(allocated_regions); 639 int to_allocate = 0, i = 0; 640 struct file_region *trg = NULL, *rg = NULL; 641 642 VM_BUG_ON(regions_needed < 0); 643 644 /* 645 * Check for sufficient descriptors in the cache to accommodate 646 * the number of in progress add operations plus regions_needed. 647 * 648 * This is a while loop because when we drop the lock, some other call 649 * to region_add or region_del may have consumed some region_entries, 650 * so we keep looping here until we finally have enough entries for 651 * (adds_in_progress + regions_needed). 652 */ 653 while (resv->region_cache_count < 654 (resv->adds_in_progress + regions_needed)) { 655 to_allocate = resv->adds_in_progress + regions_needed - 656 resv->region_cache_count; 657 658 /* At this point, we should have enough entries in the cache 659 * for all the existing adds_in_progress. We should only be 660 * needing to allocate for regions_needed. 
661 */ 662 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); 663 664 spin_unlock(&resv->lock); 665 for (i = 0; i < to_allocate; i++) { 666 trg = kmalloc(sizeof(*trg), GFP_KERNEL); 667 if (!trg) 668 goto out_of_memory; 669 list_add(&trg->link, &allocated_regions); 670 } 671 672 spin_lock(&resv->lock); 673 674 list_splice(&allocated_regions, &resv->region_cache); 675 resv->region_cache_count += to_allocate; 676 } 677 678 return 0; 679 680 out_of_memory: 681 list_for_each_entry_safe(rg, trg, &allocated_regions, link) { 682 list_del(&rg->link); 683 kfree(rg); 684 } 685 return -ENOMEM; 686 } 687 688 /* 689 * Add the huge page range represented by [f, t) to the reserve 690 * map. Regions will be taken from the cache to fill in this range. 691 * Sufficient regions should exist in the cache due to the previous 692 * call to region_chg with the same range, but in some cases the cache will not 693 * have sufficient entries due to races with other code doing region_add or 694 * region_del. The extra needed entries will be allocated. 695 * 696 * regions_needed is the out value provided by a previous call to region_chg. 697 * 698 * Return the number of new huge pages added to the map. This number is greater 699 * than or equal to zero. If file_region entries needed to be allocated for 700 * this operation and we were not able to allocate, it returns -ENOMEM. 701 * region_add of regions of length 1 never allocate file_regions and cannot 702 * fail; region_chg will always allocate at least 1 entry and a region_add for 703 * 1 page will only require at most 1 entry. 704 */ 705 static long region_add(struct resv_map *resv, long f, long t, 706 long in_regions_needed, struct hstate *h, 707 struct hugetlb_cgroup *h_cg) 708 { 709 long add = 0, actual_regions_needed = 0; 710 711 spin_lock(&resv->lock); 712 retry: 713 714 /* Count how many regions are actually needed to execute this add. */ 715 add_reservation_in_range(resv, f, t, NULL, NULL, 716 &actual_regions_needed); 717 718 /* 719 * Check for sufficient descriptors in the cache to accommodate 720 * this add operation. Note that actual_regions_needed may be greater 721 * than in_regions_needed, as the resv_map may have been modified since 722 * the region_chg call. In this case, we need to make sure that we 723 * allocate extra entries, such that we have enough for all the 724 * existing adds_in_progress, plus the excess needed for this 725 * operation. 726 */ 727 if (actual_regions_needed > in_regions_needed && 728 resv->region_cache_count < 729 resv->adds_in_progress + 730 (actual_regions_needed - in_regions_needed)) { 731 /* region_add operation of range 1 should never need to 732 * allocate file_region entries. 733 */ 734 VM_BUG_ON(t - f <= 1); 735 736 if (allocate_file_region_entries( 737 resv, actual_regions_needed - in_regions_needed)) { 738 return -ENOMEM; 739 } 740 741 goto retry; 742 } 743 744 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); 745 746 resv->adds_in_progress -= in_regions_needed; 747 748 spin_unlock(&resv->lock); 749 return add; 750 } 751 752 /* 753 * Examine the existing reserve map and determine how many 754 * huge pages in the specified range [f, t) are NOT currently 755 * represented. This routine is called before a subsequent 756 * call to region_add that will actually modify the reserve 757 * map to add the specified range [f, t). region_chg does 758 * not change the number of huge pages represented by the 759 * map. 
A number of new file_region structures is added to the cache as a 760 * placeholder, for the subsequent region_add call to use. At least 1 761 * file_region structure is added. 762 * 763 * out_regions_needed is the number of regions added to the 764 * resv->adds_in_progress. This value needs to be provided to a follow up call 765 * to region_add or region_abort for proper accounting. 766 * 767 * Returns the number of huge pages that need to be added to the existing 768 * reservation map for the range [f, t). This number is greater or equal to 769 * zero. -ENOMEM is returned if a new file_region structure or cache entry 770 * is needed and can not be allocated. 771 */ 772 static long region_chg(struct resv_map *resv, long f, long t, 773 long *out_regions_needed) 774 { 775 long chg = 0; 776 777 spin_lock(&resv->lock); 778 779 /* Count how many hugepages in this range are NOT represented. */ 780 chg = add_reservation_in_range(resv, f, t, NULL, NULL, 781 out_regions_needed); 782 783 if (*out_regions_needed == 0) 784 *out_regions_needed = 1; 785 786 if (allocate_file_region_entries(resv, *out_regions_needed)) 787 return -ENOMEM; 788 789 resv->adds_in_progress += *out_regions_needed; 790 791 spin_unlock(&resv->lock); 792 return chg; 793 } 794 795 /* 796 * Abort the in progress add operation. The adds_in_progress field 797 * of the resv_map keeps track of the operations in progress between 798 * calls to region_chg and region_add. Operations are sometimes 799 * aborted after the call to region_chg. In such cases, region_abort 800 * is called to decrement the adds_in_progress counter. regions_needed 801 * is the value returned by the region_chg call, it is used to decrement 802 * the adds_in_progress counter. 803 * 804 * NOTE: The range arguments [f, t) are not needed or used in this 805 * routine. They are kept to make reading the calling code easier as 806 * arguments will match the associated region_chg call. 807 */ 808 static void region_abort(struct resv_map *resv, long f, long t, 809 long regions_needed) 810 { 811 spin_lock(&resv->lock); 812 VM_BUG_ON(!resv->region_cache_count); 813 resv->adds_in_progress -= regions_needed; 814 spin_unlock(&resv->lock); 815 } 816 817 /* 818 * Delete the specified range [f, t) from the reserve map. If the 819 * t parameter is LONG_MAX, this indicates that ALL regions after f 820 * should be deleted. Locate the regions which intersect [f, t) 821 * and either trim, delete or split the existing regions. 822 * 823 * Returns the number of huge pages deleted from the reserve map. 824 * In the normal case, the return value is zero or more. In the 825 * case where a region must be split, a new region descriptor must 826 * be allocated. If the allocation fails, -ENOMEM will be returned. 827 * NOTE: If the parameter t == LONG_MAX, then we will never split 828 * a region and possibly return -ENOMEM. Callers specifying 829 * t == LONG_MAX do not need to check for -ENOMEM error. 830 */ 831 static long region_del(struct resv_map *resv, long f, long t) 832 { 833 struct list_head *head = &resv->regions; 834 struct file_region *rg, *trg; 835 struct file_region *nrg = NULL; 836 long del = 0; 837 838 retry: 839 spin_lock(&resv->lock); 840 list_for_each_entry_safe(rg, trg, head, link) { 841 /* 842 * Skip regions before the range to be deleted. file_region 843 * ranges are normally of the form [from, to). However, there 844 * may be a "placeholder" entry in the map which is of the form 845 * (from, to) with from == to. 
Check for placeholder entries 846 * at the beginning of the range to be deleted. 847 */ 848 if (rg->to <= f && (rg->to != rg->from || rg->to != f)) 849 continue; 850 851 if (rg->from >= t) 852 break; 853 854 if (f > rg->from && t < rg->to) { /* Must split region */ 855 /* 856 * Check for an entry in the cache before dropping 857 * lock and attempting allocation. 858 */ 859 if (!nrg && 860 resv->region_cache_count > resv->adds_in_progress) { 861 nrg = list_first_entry(&resv->region_cache, 862 struct file_region, 863 link); 864 list_del(&nrg->link); 865 resv->region_cache_count--; 866 } 867 868 if (!nrg) { 869 spin_unlock(&resv->lock); 870 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 871 if (!nrg) 872 return -ENOMEM; 873 goto retry; 874 } 875 876 del += t - f; 877 hugetlb_cgroup_uncharge_file_region( 878 resv, rg, t - f, false); 879 880 /* New entry for end of split region */ 881 nrg->from = t; 882 nrg->to = rg->to; 883 884 copy_hugetlb_cgroup_uncharge_info(nrg, rg); 885 886 INIT_LIST_HEAD(&nrg->link); 887 888 /* Original entry is trimmed */ 889 rg->to = f; 890 891 list_add(&nrg->link, &rg->link); 892 nrg = NULL; 893 break; 894 } 895 896 if (f <= rg->from && t >= rg->to) { /* Remove entire region */ 897 del += rg->to - rg->from; 898 hugetlb_cgroup_uncharge_file_region(resv, rg, 899 rg->to - rg->from, true); 900 list_del(&rg->link); 901 kfree(rg); 902 continue; 903 } 904 905 if (f <= rg->from) { /* Trim beginning of region */ 906 hugetlb_cgroup_uncharge_file_region(resv, rg, 907 t - rg->from, false); 908 909 del += t - rg->from; 910 rg->from = t; 911 } else { /* Trim end of region */ 912 hugetlb_cgroup_uncharge_file_region(resv, rg, 913 rg->to - f, false); 914 915 del += rg->to - f; 916 rg->to = f; 917 } 918 } 919 920 spin_unlock(&resv->lock); 921 kfree(nrg); 922 return del; 923 } 924 925 /* 926 * A rare out of memory error was encountered which prevented removal of 927 * the reserve map region for a page. The huge page itself was free'ed 928 * and removed from the page cache. This routine will adjust the subpool 929 * usage count, and the global reserve count if needed. By incrementing 930 * these counts, the reserve map entry which could not be deleted will 931 * appear as a "reserved" entry instead of simply dangling with incorrect 932 * counts. 933 */ 934 void hugetlb_fix_reserve_counts(struct inode *inode) 935 { 936 struct hugepage_subpool *spool = subpool_inode(inode); 937 long rsv_adjust; 938 bool reserved = false; 939 940 rsv_adjust = hugepage_subpool_get_pages(spool, 1); 941 if (rsv_adjust > 0) { 942 struct hstate *h = hstate_inode(inode); 943 944 if (!hugetlb_acct_memory(h, 1)) 945 reserved = true; 946 } else if (!rsv_adjust) { 947 reserved = true; 948 } 949 950 if (!reserved) 951 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); 952 } 953 954 /* 955 * Count and return the number of huge pages in the reserve map 956 * that intersect with the range [f, t). 957 */ 958 static long region_count(struct resv_map *resv, long f, long t) 959 { 960 struct list_head *head = &resv->regions; 961 struct file_region *rg; 962 long chg = 0; 963 964 spin_lock(&resv->lock); 965 /* Locate each segment we overlap with, and count that overlap. 
*/ 966 list_for_each_entry(rg, head, link) { 967 long seg_from; 968 long seg_to; 969 970 if (rg->to <= f) 971 continue; 972 if (rg->from >= t) 973 break; 974 975 seg_from = max(rg->from, f); 976 seg_to = min(rg->to, t); 977 978 chg += seg_to - seg_from; 979 } 980 spin_unlock(&resv->lock); 981 982 return chg; 983 } 984 985 /* 986 * Convert the address within this vma to the page offset within 987 * the mapping, in pagecache page units; huge pages here. 988 */ 989 static pgoff_t vma_hugecache_offset(struct hstate *h, 990 struct vm_area_struct *vma, unsigned long address) 991 { 992 return ((address - vma->vm_start) >> huge_page_shift(h)) + 993 (vma->vm_pgoff >> huge_page_order(h)); 994 } 995 996 pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 997 unsigned long address) 998 { 999 return vma_hugecache_offset(hstate_vma(vma), vma, address); 1000 } 1001 EXPORT_SYMBOL_GPL(linear_hugepage_index); 1002 1003 /** 1004 * vma_kernel_pagesize - Page size granularity for this VMA. 1005 * @vma: The user mapping. 1006 * 1007 * Folios in this VMA will be aligned to, and at least the size of the 1008 * number of bytes returned by this function. 1009 * 1010 * Return: The default size of the folios allocated when backing a VMA. 1011 */ 1012 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 1013 { 1014 if (vma->vm_ops && vma->vm_ops->pagesize) 1015 return vma->vm_ops->pagesize(vma); 1016 return PAGE_SIZE; 1017 } 1018 EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 1019 1020 /* 1021 * Return the page size being used by the MMU to back a VMA. In the majority 1022 * of cases, the page size used by the kernel matches the MMU size. On 1023 * architectures where it differs, an architecture-specific 'strong' 1024 * version of this symbol is required. 1025 */ 1026 __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 1027 { 1028 return vma_kernel_pagesize(vma); 1029 } 1030 1031 /* 1032 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 1033 * bits of the reservation map pointer, which are always clear due to 1034 * alignment. 1035 */ 1036 #define HPAGE_RESV_OWNER (1UL << 0) 1037 #define HPAGE_RESV_UNMAPPED (1UL << 1) 1038 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) 1039 1040 /* 1041 * These helpers are used to track how many pages are reserved for 1042 * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 1043 * is guaranteed to have their future faults succeed. 1044 * 1045 * With the exception of hugetlb_dup_vma_private() which is called at fork(), 1046 * the reserve counters are updated with the hugetlb_lock held. It is safe 1047 * to reset the VMA at fork() time as it is not in use yet and there is no 1048 * chance of the global counters getting corrupted as a result of the values. 1049 * 1050 * The private mapping reservation is represented in a subtly different 1051 * manner to a shared mapping. A shared mapping has a region map associated 1052 * with the underlying file, this region map represents the backing file 1053 * pages which have ever had a reservation assigned which this persists even 1054 * after the page is instantiated. A private mapping has a region map 1055 * associated with the original mmap which is attached to all VMAs which 1056 * reference it, this region map represents those offsets which have consumed 1057 * reservation ie. where pages have been instantiated. 
1058 */ 1059 static unsigned long get_vma_private_data(struct vm_area_struct *vma) 1060 { 1061 return (unsigned long)vma->vm_private_data; 1062 } 1063 1064 static void set_vma_private_data(struct vm_area_struct *vma, 1065 unsigned long value) 1066 { 1067 vma->vm_private_data = (void *)value; 1068 } 1069 1070 static void 1071 resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, 1072 struct hugetlb_cgroup *h_cg, 1073 struct hstate *h) 1074 { 1075 #ifdef CONFIG_CGROUP_HUGETLB 1076 if (!h_cg || !h) { 1077 resv_map->reservation_counter = NULL; 1078 resv_map->pages_per_hpage = 0; 1079 resv_map->css = NULL; 1080 } else { 1081 resv_map->reservation_counter = 1082 &h_cg->rsvd_hugepage[hstate_index(h)]; 1083 resv_map->pages_per_hpage = pages_per_huge_page(h); 1084 resv_map->css = &h_cg->css; 1085 } 1086 #endif 1087 } 1088 1089 struct resv_map *resv_map_alloc(void) 1090 { 1091 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 1092 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); 1093 1094 if (!resv_map || !rg) { 1095 kfree(resv_map); 1096 kfree(rg); 1097 return NULL; 1098 } 1099 1100 kref_init(&resv_map->refs); 1101 spin_lock_init(&resv_map->lock); 1102 INIT_LIST_HEAD(&resv_map->regions); 1103 init_rwsem(&resv_map->rw_sema); 1104 1105 resv_map->adds_in_progress = 0; 1106 /* 1107 * Initialize these to 0. On shared mappings, 0's here indicate these 1108 * fields don't do cgroup accounting. On private mappings, these will be 1109 * re-initialized to the proper values, to indicate that hugetlb cgroup 1110 * reservations are to be un-charged from here. 1111 */ 1112 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); 1113 1114 INIT_LIST_HEAD(&resv_map->region_cache); 1115 list_add(&rg->link, &resv_map->region_cache); 1116 resv_map->region_cache_count = 1; 1117 1118 return resv_map; 1119 } 1120 1121 void resv_map_release(struct kref *ref) 1122 { 1123 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 1124 struct list_head *head = &resv_map->region_cache; 1125 struct file_region *rg, *trg; 1126 1127 /* Clear out any active regions before we release the map. */ 1128 region_del(resv_map, 0, LONG_MAX); 1129 1130 /* ... and any entries left in the cache */ 1131 list_for_each_entry_safe(rg, trg, head, link) { 1132 list_del(&rg->link); 1133 kfree(rg); 1134 } 1135 1136 VM_BUG_ON(resv_map->adds_in_progress); 1137 1138 kfree(resv_map); 1139 } 1140 1141 static inline struct resv_map *inode_resv_map(struct inode *inode) 1142 { 1143 /* 1144 * At inode evict time, i_mapping may not point to the original 1145 * address space within the inode. This original address space 1146 * contains the pointer to the resv_map. So, always use the 1147 * address space embedded within the inode. 1148 * The VERY common case is inode->mapping == &inode->i_data but, 1149 * this may not be true for device special inodes. 
1150 */ 1151 return (struct resv_map *)(&inode->i_data)->private_data; 1152 } 1153 1154 static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 1155 { 1156 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1157 if (vma->vm_flags & VM_MAYSHARE) { 1158 struct address_space *mapping = vma->vm_file->f_mapping; 1159 struct inode *inode = mapping->host; 1160 1161 return inode_resv_map(inode); 1162 1163 } else { 1164 return (struct resv_map *)(get_vma_private_data(vma) & 1165 ~HPAGE_RESV_MASK); 1166 } 1167 } 1168 1169 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 1170 { 1171 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1172 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 1173 1174 set_vma_private_data(vma, (unsigned long)map); 1175 } 1176 1177 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 1178 { 1179 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1180 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 1181 1182 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 1183 } 1184 1185 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 1186 { 1187 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1188 1189 return (get_vma_private_data(vma) & flag) != 0; 1190 } 1191 1192 bool __vma_private_lock(struct vm_area_struct *vma) 1193 { 1194 return !(vma->vm_flags & VM_MAYSHARE) && 1195 get_vma_private_data(vma) & ~HPAGE_RESV_MASK && 1196 is_vma_resv_set(vma, HPAGE_RESV_OWNER); 1197 } 1198 1199 void hugetlb_dup_vma_private(struct vm_area_struct *vma) 1200 { 1201 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1202 /* 1203 * Clear vm_private_data 1204 * - For shared mappings this is a per-vma semaphore that may be 1205 * allocated in a subsequent call to hugetlb_vm_op_open. 1206 * Before clearing, make sure pointer is not associated with vma 1207 * as this will leak the structure. This is the case when called 1208 * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already 1209 * been called to allocate a new structure. 1210 * - For MAP_PRIVATE mappings, this is the reserve map which does 1211 * not apply to children. Faults generated by the children are 1212 * not guaranteed to succeed, even if read-only. 1213 */ 1214 if (vma->vm_flags & VM_MAYSHARE) { 1215 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 1216 1217 if (vma_lock && vma_lock->vma != vma) 1218 vma->vm_private_data = NULL; 1219 } else 1220 vma->vm_private_data = NULL; 1221 } 1222 1223 /* 1224 * Reset and decrement one ref on hugepage private reservation. 1225 * Called with mm->mmap_lock writer semaphore held. 1226 * This function should be only used by move_vma() and operate on 1227 * same sized vma. It should never come here with last ref on the 1228 * reservation. 1229 */ 1230 void clear_vma_resv_huge_pages(struct vm_area_struct *vma) 1231 { 1232 /* 1233 * Clear the old hugetlb private page reservation. 1234 * It has already been transferred to new_vma. 1235 * 1236 * During a mremap() operation of a hugetlb vma we call move_vma() 1237 * which copies vma into new_vma and unmaps vma. After the copy 1238 * operation both new_vma and vma share a reference to the resv_map 1239 * struct, and at that point vma is about to be unmapped. We don't 1240 * want to return the reservation to the pool at unmap of vma because 1241 * the reservation still lives on in new_vma, so simply decrement the 1242 * ref here and remove the resv_map reference from this vma. 
1243 */ 1244 struct resv_map *reservations = vma_resv_map(vma); 1245 1246 if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1247 resv_map_put_hugetlb_cgroup_uncharge_info(reservations); 1248 kref_put(&reservations->refs, resv_map_release); 1249 } 1250 1251 hugetlb_dup_vma_private(vma); 1252 } 1253 1254 /* Returns true if the VMA has associated reserve pages */ 1255 static bool vma_has_reserves(struct vm_area_struct *vma, long chg) 1256 { 1257 if (vma->vm_flags & VM_NORESERVE) { 1258 /* 1259 * This address is already reserved by other process(chg == 0), 1260 * so, we should decrement reserved count. Without decrementing, 1261 * reserve count remains after releasing inode, because this 1262 * allocated page will go into page cache and is regarded as 1263 * coming from reserved pool in releasing step. Currently, we 1264 * don't have any other solution to deal with this situation 1265 * properly, so add work-around here. 1266 */ 1267 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 1268 return true; 1269 else 1270 return false; 1271 } 1272 1273 /* Shared mappings always use reserves */ 1274 if (vma->vm_flags & VM_MAYSHARE) { 1275 /* 1276 * We know VM_NORESERVE is not set. Therefore, there SHOULD 1277 * be a region map for all pages. The only situation where 1278 * there is no region map is if a hole was punched via 1279 * fallocate. In this case, there really are no reserves to 1280 * use. This situation is indicated if chg != 0. 1281 */ 1282 if (chg) 1283 return false; 1284 else 1285 return true; 1286 } 1287 1288 /* 1289 * Only the process that called mmap() has reserves for 1290 * private mappings. 1291 */ 1292 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1293 /* 1294 * Like the shared case above, a hole punch or truncate 1295 * could have been performed on the private mapping. 1296 * Examine the value of chg to determine if reserves 1297 * actually exist or were previously consumed. 1298 * Very Subtle - The value of chg comes from a previous 1299 * call to vma_needs_reserves(). The reserve map for 1300 * private mappings has different (opposite) semantics 1301 * than that of shared mappings. vma_needs_reserves() 1302 * has already taken this difference in semantics into 1303 * account. Therefore, the meaning of chg is the same 1304 * as in the shared case above. Code could easily be 1305 * combined, but keeping it separate draws attention to 1306 * subtle differences. 
1307 */ 1308 if (chg) 1309 return false; 1310 else 1311 return true; 1312 } 1313 1314 return false; 1315 } 1316 1317 static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) 1318 { 1319 int nid = folio_nid(folio); 1320 1321 lockdep_assert_held(&hugetlb_lock); 1322 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 1323 1324 list_move(&folio->lru, &h->hugepage_freelists[nid]); 1325 h->free_huge_pages++; 1326 h->free_huge_pages_node[nid]++; 1327 folio_set_hugetlb_freed(folio); 1328 } 1329 1330 static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, 1331 int nid) 1332 { 1333 struct folio *folio; 1334 bool pin = !!(current->flags & PF_MEMALLOC_PIN); 1335 1336 lockdep_assert_held(&hugetlb_lock); 1337 list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { 1338 if (pin && !folio_is_longterm_pinnable(folio)) 1339 continue; 1340 1341 if (folio_test_hwpoison(folio)) 1342 continue; 1343 1344 list_move(&folio->lru, &h->hugepage_activelist); 1345 folio_ref_unfreeze(folio, 1); 1346 folio_clear_hugetlb_freed(folio); 1347 h->free_huge_pages--; 1348 h->free_huge_pages_node[nid]--; 1349 return folio; 1350 } 1351 1352 return NULL; 1353 } 1354 1355 static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, 1356 int nid, nodemask_t *nmask) 1357 { 1358 unsigned int cpuset_mems_cookie; 1359 struct zonelist *zonelist; 1360 struct zone *zone; 1361 struct zoneref *z; 1362 int node = NUMA_NO_NODE; 1363 1364 zonelist = node_zonelist(nid, gfp_mask); 1365 1366 retry_cpuset: 1367 cpuset_mems_cookie = read_mems_allowed_begin(); 1368 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { 1369 struct folio *folio; 1370 1371 if (!cpuset_zone_allowed(zone, gfp_mask)) 1372 continue; 1373 /* 1374 * no need to ask again on the same node. Pool is node rather than 1375 * zone aware 1376 */ 1377 if (zone_to_nid(zone) == node) 1378 continue; 1379 node = zone_to_nid(zone); 1380 1381 folio = dequeue_hugetlb_folio_node_exact(h, node); 1382 if (folio) 1383 return folio; 1384 } 1385 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) 1386 goto retry_cpuset; 1387 1388 return NULL; 1389 } 1390 1391 static unsigned long available_huge_pages(struct hstate *h) 1392 { 1393 return h->free_huge_pages - h->resv_huge_pages; 1394 } 1395 1396 static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, 1397 struct vm_area_struct *vma, 1398 unsigned long address, int avoid_reserve, 1399 long chg) 1400 { 1401 struct folio *folio = NULL; 1402 struct mempolicy *mpol; 1403 gfp_t gfp_mask; 1404 nodemask_t *nodemask; 1405 int nid; 1406 1407 /* 1408 * A child process with MAP_PRIVATE mappings created by their parent 1409 * have no page reserves. This check ensures that reservations are 1410 * not "stolen". 
The child may still get SIGKILLed 1411 */ 1412 if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) 1413 goto err; 1414 1415 /* If reserves cannot be used, ensure enough pages are in the pool */ 1416 if (avoid_reserve && !available_huge_pages(h)) 1417 goto err; 1418 1419 gfp_mask = htlb_alloc_mask(h); 1420 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 1421 1422 if (mpol_is_preferred_many(mpol)) { 1423 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, 1424 nid, nodemask); 1425 1426 /* Fallback to all nodes if page==NULL */ 1427 nodemask = NULL; 1428 } 1429 1430 if (!folio) 1431 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, 1432 nid, nodemask); 1433 1434 if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { 1435 folio_set_hugetlb_restore_reserve(folio); 1436 h->resv_huge_pages--; 1437 } 1438 1439 mpol_cond_put(mpol); 1440 return folio; 1441 1442 err: 1443 return NULL; 1444 } 1445 1446 /* 1447 * common helper functions for hstate_next_node_to_{alloc|free}. 1448 * We may have allocated or freed a huge page based on a different 1449 * nodes_allowed previously, so h->next_node_to_{alloc|free} might 1450 * be outside of *nodes_allowed. Ensure that we use an allowed 1451 * node for alloc or free. 1452 */ 1453 static int next_node_allowed(int nid, nodemask_t *nodes_allowed) 1454 { 1455 nid = next_node_in(nid, *nodes_allowed); 1456 VM_BUG_ON(nid >= MAX_NUMNODES); 1457 1458 return nid; 1459 } 1460 1461 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 1462 { 1463 if (!node_isset(nid, *nodes_allowed)) 1464 nid = next_node_allowed(nid, nodes_allowed); 1465 return nid; 1466 } 1467 1468 /* 1469 * returns the previously saved node ["this node"] from which to 1470 * allocate a persistent huge page for the pool and advance the 1471 * next node from which to allocate, handling wrap at end of node 1472 * mask. 1473 */ 1474 static int hstate_next_node_to_alloc(struct hstate *h, 1475 nodemask_t *nodes_allowed) 1476 { 1477 int nid; 1478 1479 VM_BUG_ON(!nodes_allowed); 1480 1481 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 1482 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 1483 1484 return nid; 1485 } 1486 1487 /* 1488 * helper for remove_pool_huge_page() - return the previously saved 1489 * node ["this node"] from which to free a huge page. Advance the 1490 * next node id whether or not we find a free huge page to free so 1491 * that the next attempt to free addresses the next node. 
1492 */ 1493 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 1494 { 1495 int nid; 1496 1497 VM_BUG_ON(!nodes_allowed); 1498 1499 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 1500 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 1501 1502 return nid; 1503 } 1504 1505 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 1506 for (nr_nodes = nodes_weight(*mask); \ 1507 nr_nodes > 0 && \ 1508 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 1509 nr_nodes--) 1510 1511 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 1512 for (nr_nodes = nodes_weight(*mask); \ 1513 nr_nodes > 0 && \ 1514 ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 1515 nr_nodes--) 1516 1517 /* used to demote non-gigantic_huge pages as well */ 1518 static void __destroy_compound_gigantic_folio(struct folio *folio, 1519 unsigned int order, bool demote) 1520 { 1521 int i; 1522 int nr_pages = 1 << order; 1523 struct page *p; 1524 1525 atomic_set(&folio->_entire_mapcount, 0); 1526 atomic_set(&folio->_nr_pages_mapped, 0); 1527 atomic_set(&folio->_pincount, 0); 1528 1529 for (i = 1; i < nr_pages; i++) { 1530 p = folio_page(folio, i); 1531 p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; 1532 p->mapping = NULL; 1533 clear_compound_head(p); 1534 if (!demote) 1535 set_page_refcounted(p); 1536 } 1537 1538 __folio_clear_head(folio); 1539 } 1540 1541 static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, 1542 unsigned int order) 1543 { 1544 __destroy_compound_gigantic_folio(folio, order, true); 1545 } 1546 1547 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 1548 static void destroy_compound_gigantic_folio(struct folio *folio, 1549 unsigned int order) 1550 { 1551 __destroy_compound_gigantic_folio(folio, order, false); 1552 } 1553 1554 static void free_gigantic_folio(struct folio *folio, unsigned int order) 1555 { 1556 /* 1557 * If the page isn't allocated using the cma allocator, 1558 * cma_release() returns false. 1559 */ 1560 #ifdef CONFIG_CMA 1561 int nid = folio_nid(folio); 1562 1563 if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) 1564 return; 1565 #endif 1566 1567 free_contig_range(folio_pfn(folio), 1 << order); 1568 } 1569 1570 #ifdef CONFIG_CONTIG_ALLOC 1571 static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, 1572 int nid, nodemask_t *nodemask) 1573 { 1574 struct page *page; 1575 unsigned long nr_pages = pages_per_huge_page(h); 1576 if (nid == NUMA_NO_NODE) 1577 nid = numa_mem_id(); 1578 1579 #ifdef CONFIG_CMA 1580 { 1581 int node; 1582 1583 if (hugetlb_cma[nid]) { 1584 page = cma_alloc(hugetlb_cma[nid], nr_pages, 1585 huge_page_order(h), true); 1586 if (page) 1587 return page_folio(page); 1588 } 1589 1590 if (!(gfp_mask & __GFP_THISNODE)) { 1591 for_each_node_mask(node, *nodemask) { 1592 if (node == nid || !hugetlb_cma[node]) 1593 continue; 1594 1595 page = cma_alloc(hugetlb_cma[node], nr_pages, 1596 huge_page_order(h), true); 1597 if (page) 1598 return page_folio(page); 1599 } 1600 } 1601 } 1602 #endif 1603 1604 page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); 1605 return page ? 
page_folio(page) : NULL; 1606 } 1607 1608 #else /* !CONFIG_CONTIG_ALLOC */ 1609 static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, 1610 int nid, nodemask_t *nodemask) 1611 { 1612 return NULL; 1613 } 1614 #endif /* CONFIG_CONTIG_ALLOC */ 1615 1616 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ 1617 static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, 1618 int nid, nodemask_t *nodemask) 1619 { 1620 return NULL; 1621 } 1622 static inline void free_gigantic_folio(struct folio *folio, 1623 unsigned int order) { } 1624 static inline void destroy_compound_gigantic_folio(struct folio *folio, 1625 unsigned int order) { } 1626 #endif 1627 1628 static inline void __clear_hugetlb_destructor(struct hstate *h, 1629 struct folio *folio) 1630 { 1631 lockdep_assert_held(&hugetlb_lock); 1632 1633 __folio_clear_hugetlb(folio); 1634 } 1635 1636 /* 1637 * Remove hugetlb folio from lists. 1638 * If vmemmap exists for the folio, update dtor so that the folio appears 1639 * as just a compound page. Otherwise, wait until after allocating vmemmap 1640 * to update dtor. 1641 * 1642 * A reference is held on the folio, except in the case of demote. 1643 * 1644 * Must be called with hugetlb lock held. 1645 */ 1646 static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, 1647 bool adjust_surplus, 1648 bool demote) 1649 { 1650 int nid = folio_nid(folio); 1651 1652 VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); 1653 VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); 1654 1655 lockdep_assert_held(&hugetlb_lock); 1656 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1657 return; 1658 1659 list_del(&folio->lru); 1660 1661 if (folio_test_hugetlb_freed(folio)) { 1662 h->free_huge_pages--; 1663 h->free_huge_pages_node[nid]--; 1664 } 1665 if (adjust_surplus) { 1666 h->surplus_huge_pages--; 1667 h->surplus_huge_pages_node[nid]--; 1668 } 1669 1670 /* 1671 * We can only clear the hugetlb destructor after allocating vmemmap 1672 * pages. Otherwise, someone (memory error handling) may try to write 1673 * to tail struct pages. 1674 */ 1675 if (!folio_test_hugetlb_vmemmap_optimized(folio)) 1676 __clear_hugetlb_destructor(h, folio); 1677 1678 /* 1679 * In the case of demote we do not ref count the page as it will soon 1680 * be turned into a page of smaller size. 
1681 */ 1682 if (!demote) 1683 folio_ref_unfreeze(folio, 1); 1684 1685 h->nr_huge_pages--; 1686 h->nr_huge_pages_node[nid]--; 1687 } 1688 1689 static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, 1690 bool adjust_surplus) 1691 { 1692 __remove_hugetlb_folio(h, folio, adjust_surplus, false); 1693 } 1694 1695 static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, 1696 bool adjust_surplus) 1697 { 1698 __remove_hugetlb_folio(h, folio, adjust_surplus, true); 1699 } 1700 1701 static void add_hugetlb_folio(struct hstate *h, struct folio *folio, 1702 bool adjust_surplus) 1703 { 1704 int zeroed; 1705 int nid = folio_nid(folio); 1706 1707 VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); 1708 1709 lockdep_assert_held(&hugetlb_lock); 1710 1711 INIT_LIST_HEAD(&folio->lru); 1712 h->nr_huge_pages++; 1713 h->nr_huge_pages_node[nid]++; 1714 1715 if (adjust_surplus) { 1716 h->surplus_huge_pages++; 1717 h->surplus_huge_pages_node[nid]++; 1718 } 1719 1720 __folio_set_hugetlb(folio); 1721 folio_change_private(folio, NULL); 1722 /* 1723 * We have to set hugetlb_vmemmap_optimized again as above 1724 * folio_change_private(folio, NULL) cleared it. 1725 */ 1726 folio_set_hugetlb_vmemmap_optimized(folio); 1727 1728 /* 1729 * This folio is about to be managed by the hugetlb allocator and 1730 * should have no users. Drop our reference, and check for others 1731 * just in case. 1732 */ 1733 zeroed = folio_put_testzero(folio); 1734 if (unlikely(!zeroed)) 1735 /* 1736 * It is VERY unlikely soneone else has taken a ref 1737 * on the folio. In this case, we simply return as 1738 * free_huge_folio() will be called when this other ref 1739 * is dropped. 1740 */ 1741 return; 1742 1743 arch_clear_hugepage_flags(&folio->page); 1744 enqueue_hugetlb_folio(h, folio); 1745 } 1746 1747 static void __update_and_free_hugetlb_folio(struct hstate *h, 1748 struct folio *folio) 1749 { 1750 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1751 return; 1752 1753 /* 1754 * If we don't know which subpages are hwpoisoned, we can't free 1755 * the hugepage, so it's leaked intentionally. 1756 */ 1757 if (folio_test_hugetlb_raw_hwp_unreliable(folio)) 1758 return; 1759 1760 if (hugetlb_vmemmap_restore(h, &folio->page)) { 1761 spin_lock_irq(&hugetlb_lock); 1762 /* 1763 * If we cannot allocate vmemmap pages, just refuse to free the 1764 * page and put the page back on the hugetlb free list and treat 1765 * as a surplus page. 1766 */ 1767 add_hugetlb_folio(h, folio, true); 1768 spin_unlock_irq(&hugetlb_lock); 1769 return; 1770 } 1771 1772 /* 1773 * If vmemmap pages were allocated above, then we need to clear the 1774 * hugetlb destructor under the hugetlb lock. 1775 */ 1776 if (folio_test_hugetlb(folio)) { 1777 spin_lock_irq(&hugetlb_lock); 1778 __clear_hugetlb_destructor(h, folio); 1779 spin_unlock_irq(&hugetlb_lock); 1780 } 1781 1782 /* 1783 * Move PageHWPoison flag from head page to the raw error pages, 1784 * which makes any healthy subpages reusable. 1785 */ 1786 if (unlikely(folio_test_hwpoison(folio))) 1787 folio_clear_hugetlb_hwpoison(folio); 1788 1789 /* 1790 * Non-gigantic pages demoted from CMA allocated gigantic pages 1791 * need to be given back to CMA in free_gigantic_folio. 
1792 */ 1793 if (hstate_is_gigantic(h) || 1794 hugetlb_cma_folio(folio, huge_page_order(h))) { 1795 destroy_compound_gigantic_folio(folio, huge_page_order(h)); 1796 free_gigantic_folio(folio, huge_page_order(h)); 1797 } else { 1798 __free_pages(&folio->page, huge_page_order(h)); 1799 } 1800 } 1801 1802 /* 1803 * As update_and_free_hugetlb_folio() can be called under any context, so we cannot 1804 * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the 1805 * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate 1806 * the vmemmap pages. 1807 * 1808 * free_hpage_workfn() locklessly retrieves the linked list of pages to be 1809 * freed and frees them one-by-one. As the page->mapping pointer is going 1810 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node 1811 * structure of a lockless linked list of huge pages to be freed. 1812 */ 1813 static LLIST_HEAD(hpage_freelist); 1814 1815 static void free_hpage_workfn(struct work_struct *work) 1816 { 1817 struct llist_node *node; 1818 1819 node = llist_del_all(&hpage_freelist); 1820 1821 while (node) { 1822 struct page *page; 1823 struct hstate *h; 1824 1825 page = container_of((struct address_space **)node, 1826 struct page, mapping); 1827 node = node->next; 1828 page->mapping = NULL; 1829 /* 1830 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in 1831 * folio_hstate() is going to trigger because a previous call to 1832 * remove_hugetlb_folio() will clear the hugetlb bit, so do 1833 * not use folio_hstate() directly. 1834 */ 1835 h = size_to_hstate(page_size(page)); 1836 1837 __update_and_free_hugetlb_folio(h, page_folio(page)); 1838 1839 cond_resched(); 1840 } 1841 } 1842 static DECLARE_WORK(free_hpage_work, free_hpage_workfn); 1843 1844 static inline void flush_free_hpage_work(struct hstate *h) 1845 { 1846 if (hugetlb_vmemmap_optimizable(h)) 1847 flush_work(&free_hpage_work); 1848 } 1849 1850 static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, 1851 bool atomic) 1852 { 1853 if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { 1854 __update_and_free_hugetlb_folio(h, folio); 1855 return; 1856 } 1857 1858 /* 1859 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. 1860 * 1861 * Only call schedule_work() if hpage_freelist is previously 1862 * empty. Otherwise, schedule_work() had been called but the workfn 1863 * hasn't retrieved the list yet. 1864 */ 1865 if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) 1866 schedule_work(&free_hpage_work); 1867 } 1868 1869 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) 1870 { 1871 struct page *page, *t_page; 1872 struct folio *folio; 1873 1874 list_for_each_entry_safe(page, t_page, list, lru) { 1875 folio = page_folio(page); 1876 update_and_free_hugetlb_folio(h, folio, false); 1877 cond_resched(); 1878 } 1879 } 1880 1881 struct hstate *size_to_hstate(unsigned long size) 1882 { 1883 struct hstate *h; 1884 1885 for_each_hstate(h) { 1886 if (huge_page_size(h) == size) 1887 return h; 1888 } 1889 return NULL; 1890 } 1891 1892 void free_huge_folio(struct folio *folio) 1893 { 1894 /* 1895 * Can't pass hstate in here because it is called from the 1896 * compound page destructor. 
1897 */ 1898 struct hstate *h = folio_hstate(folio); 1899 int nid = folio_nid(folio); 1900 struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); 1901 bool restore_reserve; 1902 unsigned long flags; 1903 1904 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 1905 VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); 1906 1907 hugetlb_set_folio_subpool(folio, NULL); 1908 if (folio_test_anon(folio)) 1909 __ClearPageAnonExclusive(&folio->page); 1910 folio->mapping = NULL; 1911 restore_reserve = folio_test_hugetlb_restore_reserve(folio); 1912 folio_clear_hugetlb_restore_reserve(folio); 1913 1914 /* 1915 * If HPageRestoreReserve was set on page, page allocation consumed a 1916 * reservation. If the page was associated with a subpool, there 1917 * would have been a page reserved in the subpool before allocation 1918 * via hugepage_subpool_get_pages(). Since we are 'restoring' the 1919 * reservation, do not call hugepage_subpool_put_pages() as this will 1920 * remove the reserved page from the subpool. 1921 */ 1922 if (!restore_reserve) { 1923 /* 1924 * A return code of zero implies that the subpool will be 1925 * under its minimum size if the reservation is not restored 1926 * after page is free. Therefore, force restore_reserve 1927 * operation. 1928 */ 1929 if (hugepage_subpool_put_pages(spool, 1) == 0) 1930 restore_reserve = true; 1931 } 1932 1933 spin_lock_irqsave(&hugetlb_lock, flags); 1934 folio_clear_hugetlb_migratable(folio); 1935 hugetlb_cgroup_uncharge_folio(hstate_index(h), 1936 pages_per_huge_page(h), folio); 1937 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), 1938 pages_per_huge_page(h), folio); 1939 if (restore_reserve) 1940 h->resv_huge_pages++; 1941 1942 if (folio_test_hugetlb_temporary(folio)) { 1943 remove_hugetlb_folio(h, folio, false); 1944 spin_unlock_irqrestore(&hugetlb_lock, flags); 1945 update_and_free_hugetlb_folio(h, folio, true); 1946 } else if (h->surplus_huge_pages_node[nid]) { 1947 /* remove the page from active list */ 1948 remove_hugetlb_folio(h, folio, true); 1949 spin_unlock_irqrestore(&hugetlb_lock, flags); 1950 update_and_free_hugetlb_folio(h, folio, true); 1951 } else { 1952 arch_clear_hugepage_flags(&folio->page); 1953 enqueue_hugetlb_folio(h, folio); 1954 spin_unlock_irqrestore(&hugetlb_lock, flags); 1955 } 1956 } 1957 1958 /* 1959 * Must be called with the hugetlb lock held 1960 */ 1961 static void __prep_account_new_huge_page(struct hstate *h, int nid) 1962 { 1963 lockdep_assert_held(&hugetlb_lock); 1964 h->nr_huge_pages++; 1965 h->nr_huge_pages_node[nid]++; 1966 } 1967 1968 static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) 1969 { 1970 hugetlb_vmemmap_optimize(h, &folio->page); 1971 INIT_LIST_HEAD(&folio->lru); 1972 __folio_set_hugetlb(folio); 1973 hugetlb_set_folio_subpool(folio, NULL); 1974 set_hugetlb_cgroup(folio, NULL); 1975 set_hugetlb_cgroup_rsvd(folio, NULL); 1976 } 1977 1978 static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) 1979 { 1980 __prep_new_hugetlb_folio(h, folio); 1981 spin_lock_irq(&hugetlb_lock); 1982 __prep_account_new_huge_page(h, nid); 1983 spin_unlock_irq(&hugetlb_lock); 1984 } 1985 1986 static bool __prep_compound_gigantic_folio(struct folio *folio, 1987 unsigned int order, bool demote) 1988 { 1989 int i, j; 1990 int nr_pages = 1 << order; 1991 struct page *p; 1992 1993 __folio_clear_reserved(folio); 1994 for (i = 0; i < nr_pages; i++) { 1995 p = folio_page(folio, i); 1996 1997 /* 1998 * For gigantic hugepages allocated through bootmem at 1999 * boot, it's safer to be 
consistent with the not-gigantic 2000 * hugepages and clear the PG_reserved bit from all tail pages 2001 * too. Otherwise drivers using get_user_pages() to access tail 2002 * pages may get the reference counting wrong if they see 2003 * PG_reserved set on a tail page (despite the head page not 2004 * having PG_reserved set). Enforcing this consistency between 2005 * head and tail pages allows drivers to optimize away a check 2006 * on the head page when they need know if put_page() is needed 2007 * after get_user_pages(). 2008 */ 2009 if (i != 0) /* head page cleared above */ 2010 __ClearPageReserved(p); 2011 /* 2012 * Subtle and very unlikely 2013 * 2014 * Gigantic 'page allocators' such as memblock or cma will 2015 * return a set of pages with each page ref counted. We need 2016 * to turn this set of pages into a compound page with tail 2017 * page ref counts set to zero. Code such as speculative page 2018 * cache adding could take a ref on a 'to be' tail page. 2019 * We need to respect any increased ref count, and only set 2020 * the ref count to zero if count is currently 1. If count 2021 * is not 1, we return an error. An error return indicates 2022 * the set of pages can not be converted to a gigantic page. 2023 * The caller who allocated the pages should then discard the 2024 * pages using the appropriate free interface. 2025 * 2026 * In the case of demote, the ref count will be zero. 2027 */ 2028 if (!demote) { 2029 if (!page_ref_freeze(p, 1)) { 2030 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); 2031 goto out_error; 2032 } 2033 } else { 2034 VM_BUG_ON_PAGE(page_count(p), p); 2035 } 2036 if (i != 0) 2037 set_compound_head(p, &folio->page); 2038 } 2039 __folio_set_head(folio); 2040 /* we rely on prep_new_hugetlb_folio to set the destructor */ 2041 folio_set_order(folio, order); 2042 atomic_set(&folio->_entire_mapcount, -1); 2043 atomic_set(&folio->_nr_pages_mapped, 0); 2044 atomic_set(&folio->_pincount, 0); 2045 return true; 2046 2047 out_error: 2048 /* undo page modifications made above */ 2049 for (j = 0; j < i; j++) { 2050 p = folio_page(folio, j); 2051 if (j != 0) 2052 clear_compound_head(p); 2053 set_page_refcounted(p); 2054 } 2055 /* need to clear PG_reserved on remaining tail pages */ 2056 for (; j < nr_pages; j++) { 2057 p = folio_page(folio, j); 2058 __ClearPageReserved(p); 2059 } 2060 return false; 2061 } 2062 2063 static bool prep_compound_gigantic_folio(struct folio *folio, 2064 unsigned int order) 2065 { 2066 return __prep_compound_gigantic_folio(folio, order, false); 2067 } 2068 2069 static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, 2070 unsigned int order) 2071 { 2072 return __prep_compound_gigantic_folio(folio, order, true); 2073 } 2074 2075 /* 2076 * Find and lock address space (mapping) in write mode. 2077 * 2078 * Upon entry, the page is locked which means that page_mapping() is 2079 * stable. Due to locking order, we can only trylock_write. If we can 2080 * not get the lock, simply return NULL to caller. 
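 *
 * An illustrative caller pattern (a sketch, not a caller in this file):
 *
 *	mapping = hugetlb_page_mapping_lock_write(hpage);
 *	if (!mapping)
 *		return;		(bail out or retry as appropriate)
 *	... walk the i_mmap tree under the write lock ...
 *	i_mmap_unlock_write(mapping);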
2081 */ 2082 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) 2083 { 2084 struct address_space *mapping = page_mapping(hpage); 2085 2086 if (!mapping) 2087 return mapping; 2088 2089 if (i_mmap_trylock_write(mapping)) 2090 return mapping; 2091 2092 return NULL; 2093 } 2094 2095 pgoff_t hugetlb_basepage_index(struct page *page) 2096 { 2097 struct page *page_head = compound_head(page); 2098 pgoff_t index = page_index(page_head); 2099 unsigned long compound_idx; 2100 2101 if (compound_order(page_head) > MAX_ORDER) 2102 compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 2103 else 2104 compound_idx = page - page_head; 2105 2106 return (index << compound_order(page_head)) + compound_idx; 2107 } 2108 2109 static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, 2110 gfp_t gfp_mask, int nid, nodemask_t *nmask, 2111 nodemask_t *node_alloc_noretry) 2112 { 2113 int order = huge_page_order(h); 2114 struct page *page; 2115 bool alloc_try_hard = true; 2116 bool retry = true; 2117 2118 /* 2119 * By default we always try hard to allocate the page with 2120 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in 2121 * a loop (to adjust global huge page counts) and previous allocation 2122 * failed, do not continue to try hard on the same node. Use the 2123 * node_alloc_noretry bitmap to manage this state information. 2124 */ 2125 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) 2126 alloc_try_hard = false; 2127 gfp_mask |= __GFP_COMP|__GFP_NOWARN; 2128 if (alloc_try_hard) 2129 gfp_mask |= __GFP_RETRY_MAYFAIL; 2130 if (nid == NUMA_NO_NODE) 2131 nid = numa_mem_id(); 2132 retry: 2133 page = __alloc_pages(gfp_mask, order, nid, nmask); 2134 2135 /* Freeze head page */ 2136 if (page && !page_ref_freeze(page, 1)) { 2137 __free_pages(page, order); 2138 if (retry) { /* retry once */ 2139 retry = false; 2140 goto retry; 2141 } 2142 /* WOW! twice in a row. */ 2143 pr_warn("HugeTLB head page unexpected inflated ref count\n"); 2144 page = NULL; 2145 } 2146 2147 /* 2148 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this 2149 * indicates an overall state change. Clear bit so that we resume 2150 * normal 'try hard' allocations. 2151 */ 2152 if (node_alloc_noretry && page && !alloc_try_hard) 2153 node_clear(nid, *node_alloc_noretry); 2154 2155 /* 2156 * If we tried hard to get a page but failed, set bit so that 2157 * subsequent attempts will not try as hard until there is an 2158 * overall state change. 2159 */ 2160 if (node_alloc_noretry && !page && alloc_try_hard) 2161 node_set(nid, *node_alloc_noretry); 2162 2163 if (!page) { 2164 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 2165 return NULL; 2166 } 2167 2168 __count_vm_event(HTLB_BUDDY_PGALLOC); 2169 return page_folio(page); 2170 } 2171 2172 /* 2173 * Common helper to allocate a fresh hugetlb page. All specific allocators 2174 * should use this function to get new hugetlb pages 2175 * 2176 * Note that returned page is 'frozen': ref count of head page and all tail 2177 * pages is zero. 
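 *
 * Callers therefore either hand the folio straight back to the pool, as
 * alloc_pool_huge_page() below does:
 *
 *	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
 *					  nodes_allowed, node_alloc_noretry);
 *	if (folio)
 *		free_huge_folio(folio);
 *
 * or they unfreeze the reference before handing the page out directly, as
 * alloc_migrate_hugetlb_folio() does with folio_ref_unfreeze(folio, 1).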
2178 */ 2179 static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, 2180 gfp_t gfp_mask, int nid, nodemask_t *nmask, 2181 nodemask_t *node_alloc_noretry) 2182 { 2183 struct folio *folio; 2184 bool retry = false; 2185 2186 retry: 2187 if (hstate_is_gigantic(h)) 2188 folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); 2189 else 2190 folio = alloc_buddy_hugetlb_folio(h, gfp_mask, 2191 nid, nmask, node_alloc_noretry); 2192 if (!folio) 2193 return NULL; 2194 if (hstate_is_gigantic(h)) { 2195 if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { 2196 /* 2197 * Rare failure to convert pages to compound page. 2198 * Free pages and try again - ONCE! 2199 */ 2200 free_gigantic_folio(folio, huge_page_order(h)); 2201 if (!retry) { 2202 retry = true; 2203 goto retry; 2204 } 2205 return NULL; 2206 } 2207 } 2208 prep_new_hugetlb_folio(h, folio, folio_nid(folio)); 2209 2210 return folio; 2211 } 2212 2213 /* 2214 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved 2215 * manner. 2216 */ 2217 static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 2218 nodemask_t *node_alloc_noretry) 2219 { 2220 struct folio *folio; 2221 int nr_nodes, node; 2222 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 2223 2224 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 2225 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, 2226 nodes_allowed, node_alloc_noretry); 2227 if (folio) { 2228 free_huge_folio(folio); /* free it into the hugepage allocator */ 2229 return 1; 2230 } 2231 } 2232 2233 return 0; 2234 } 2235 2236 /* 2237 * Remove huge page from pool from next node to free. Attempt to keep 2238 * persistent huge pages more or less balanced over allowed nodes. 2239 * This routine only 'removes' the hugetlb page. The caller must make 2240 * an additional call to free the page to low level allocators. 2241 * Called with hugetlb_lock locked. 2242 */ 2243 static struct page *remove_pool_huge_page(struct hstate *h, 2244 nodemask_t *nodes_allowed, 2245 bool acct_surplus) 2246 { 2247 int nr_nodes, node; 2248 struct page *page = NULL; 2249 struct folio *folio; 2250 2251 lockdep_assert_held(&hugetlb_lock); 2252 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 2253 /* 2254 * If we're returning unused surplus pages, only examine 2255 * nodes with surplus pages. 2256 */ 2257 if ((!acct_surplus || h->surplus_huge_pages_node[node]) && 2258 !list_empty(&h->hugepage_freelists[node])) { 2259 page = list_entry(h->hugepage_freelists[node].next, 2260 struct page, lru); 2261 folio = page_folio(page); 2262 remove_hugetlb_folio(h, folio, acct_surplus); 2263 break; 2264 } 2265 } 2266 2267 return page; 2268 } 2269 2270 /* 2271 * Dissolve a given free hugepage into free buddy pages. This function does 2272 * nothing for in-use hugepages and non-hugepages. 2273 * This function returns values like below: 2274 * 2275 * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages 2276 * when the system is under memory pressure and the feature of 2277 * freeing unused vmemmap pages associated with each hugetlb page 2278 * is enabled. 2279 * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use 2280 * (allocated or reserved.) 
2281 * 0: successfully dissolved free hugepages or the page is not a 2282 * hugepage (considered as already dissolved) 2283 */ 2284 int dissolve_free_huge_page(struct page *page) 2285 { 2286 int rc = -EBUSY; 2287 struct folio *folio = page_folio(page); 2288 2289 retry: 2290 /* Don't disrupt the normal path by needlessly holding hugetlb_lock */ 2291 if (!folio_test_hugetlb(folio)) 2292 return 0; 2293 2294 spin_lock_irq(&hugetlb_lock); 2295 if (!folio_test_hugetlb(folio)) { 2296 rc = 0; 2297 goto out; 2298 } 2299 2300 if (!folio_ref_count(folio)) { 2301 struct hstate *h = folio_hstate(folio); 2302 if (!available_huge_pages(h)) 2303 goto out; 2304 2305 /* 2306 * We should make sure that the page is already on the free list 2307 * when it is dissolved. 2308 */ 2309 if (unlikely(!folio_test_hugetlb_freed(folio))) { 2310 spin_unlock_irq(&hugetlb_lock); 2311 cond_resched(); 2312 2313 /* 2314 * Theoretically, we should return -EBUSY when we 2315 * encounter this race. In fact, we have a chance 2316 * to successfully dissolve the page if we do a 2317 * retry, because the race window is quite small. 2318 * If we seize this opportunity, it is an optimization 2319 * for increasing the success rate of dissolving the page. 2320 */ 2321 goto retry; 2322 } 2323 2324 remove_hugetlb_folio(h, folio, false); 2325 h->max_huge_pages--; 2326 spin_unlock_irq(&hugetlb_lock); 2327 2328 /* 2329 * Normally update_and_free_hugetlb_folio will allocate required vmemmap 2330 * before freeing the page. update_and_free_hugetlb_folio will fail to 2331 * free the page if it can not allocate required vmemmap. We 2332 * need to adjust max_huge_pages if the page is not freed. 2333 * Attempt to allocate vmemmap here so that we can take 2334 * appropriate action on failure. 2335 */ 2336 rc = hugetlb_vmemmap_restore(h, &folio->page); 2337 if (!rc) { 2338 update_and_free_hugetlb_folio(h, folio, false); 2339 } else { 2340 spin_lock_irq(&hugetlb_lock); 2341 add_hugetlb_folio(h, folio, false); 2342 h->max_huge_pages++; 2343 spin_unlock_irq(&hugetlb_lock); 2344 } 2345 2346 return rc; 2347 } 2348 out: 2349 spin_unlock_irq(&hugetlb_lock); 2350 return rc; 2351 } 2352 2353 /* 2354 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to 2355 * make specified memory blocks removable from the system. 2356 * Note that this will dissolve a free gigantic hugepage completely, if any 2357 * part of it lies within the given range. 2358 * Also note that if dissolve_free_huge_page() returns with an error, all 2359 * free hugepages that were dissolved before that error are lost. 2360 */ 2361 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 2362 { 2363 unsigned long pfn; 2364 struct page *page; 2365 int rc = 0; 2366 unsigned int order; 2367 struct hstate *h; 2368 2369 if (!hugepages_supported()) 2370 return rc; 2371 2372 order = huge_page_order(&default_hstate); 2373 for_each_hstate(h) 2374 order = min(order, huge_page_order(h)); 2375 2376 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { 2377 page = pfn_to_page(pfn); 2378 rc = dissolve_free_huge_page(page); 2379 if (rc) 2380 break; 2381 } 2382 2383 return rc; 2384 } 2385 2386 /* 2387 * Allocates a fresh surplus page from the page allocator.
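 * The page only stays allocated if, with hugetlb_lock retaken, surplus_huge_pages
 * is still below nr_overcommit_huge_pages; otherwise it is marked temporary and
 * handed straight back through free_huge_folio().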
2388 */ 2389 static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, 2390 gfp_t gfp_mask, int nid, nodemask_t *nmask) 2391 { 2392 struct folio *folio = NULL; 2393 2394 if (hstate_is_gigantic(h)) 2395 return NULL; 2396 2397 spin_lock_irq(&hugetlb_lock); 2398 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) 2399 goto out_unlock; 2400 spin_unlock_irq(&hugetlb_lock); 2401 2402 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); 2403 if (!folio) 2404 return NULL; 2405 2406 spin_lock_irq(&hugetlb_lock); 2407 /* 2408 * We could have raced with the pool size change. 2409 * Double check that and simply deallocate the new page 2410 * if we would end up overcommitting the surpluses. Abuse 2411 * the temporary page to work around the nasty free_huge_folio 2412 * code flow. 2413 */ 2414 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 2415 folio_set_hugetlb_temporary(folio); 2416 spin_unlock_irq(&hugetlb_lock); 2417 free_huge_folio(folio); 2418 return NULL; 2419 } 2420 2421 h->surplus_huge_pages++; 2422 h->surplus_huge_pages_node[folio_nid(folio)]++; 2423 2424 out_unlock: 2425 spin_unlock_irq(&hugetlb_lock); 2426 2427 return folio; 2428 } 2429 2430 static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, 2431 int nid, nodemask_t *nmask) 2432 { 2433 struct folio *folio; 2434 2435 if (hstate_is_gigantic(h)) 2436 return NULL; 2437 2438 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); 2439 if (!folio) 2440 return NULL; 2441 2442 /* fresh huge pages are frozen */ 2443 folio_ref_unfreeze(folio, 1); 2444 /* 2445 * We do not account these pages as surplus because they are only 2446 * temporary and will be released properly on the last reference 2447 */ 2448 folio_set_hugetlb_temporary(folio); 2449 2450 return folio; 2451 } 2452 2453 /* 2454 * Use the VMA's mpolicy to allocate a huge page from the buddy.
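 * For MPOL_PREFERRED_MANY the first attempt is made without direct reclaim
 * and is restricted to the preferred nodes; if it fails, the nodemask is
 * dropped so the second attempt may fall back to any allowed node.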
2455 */ 2456 static 2457 struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, 2458 struct vm_area_struct *vma, unsigned long addr) 2459 { 2460 struct folio *folio = NULL; 2461 struct mempolicy *mpol; 2462 gfp_t gfp_mask = htlb_alloc_mask(h); 2463 int nid; 2464 nodemask_t *nodemask; 2465 2466 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 2467 if (mpol_is_preferred_many(mpol)) { 2468 gfp_t gfp = gfp_mask | __GFP_NOWARN; 2469 2470 gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2471 folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); 2472 2473 /* Fallback to all nodes if page==NULL */ 2474 nodemask = NULL; 2475 } 2476 2477 if (!folio) 2478 folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); 2479 mpol_cond_put(mpol); 2480 return folio; 2481 } 2482 2483 /* folio migration callback function */ 2484 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, 2485 nodemask_t *nmask, gfp_t gfp_mask) 2486 { 2487 spin_lock_irq(&hugetlb_lock); 2488 if (available_huge_pages(h)) { 2489 struct folio *folio; 2490 2491 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, 2492 preferred_nid, nmask); 2493 if (folio) { 2494 spin_unlock_irq(&hugetlb_lock); 2495 return folio; 2496 } 2497 } 2498 spin_unlock_irq(&hugetlb_lock); 2499 2500 return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); 2501 } 2502 2503 /* mempolicy aware migration callback */ 2504 struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, 2505 unsigned long address) 2506 { 2507 struct mempolicy *mpol; 2508 nodemask_t *nodemask; 2509 struct folio *folio; 2510 gfp_t gfp_mask; 2511 int node; 2512 2513 gfp_mask = htlb_alloc_mask(h); 2514 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 2515 folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); 2516 mpol_cond_put(mpol); 2517 2518 return folio; 2519 } 2520 2521 static nodemask_t *policy_mbind_nodemask(gfp_t gfp) 2522 { 2523 #ifdef CONFIG_NUMA 2524 struct mempolicy *mpol = get_task_policy(current); 2525 2526 /* 2527 * Only enforce MPOL_BIND policy which overlaps with cpuset policy 2528 * (from policy_nodemask) specifically for hugetlb case 2529 */ 2530 if (mpol->mode == MPOL_BIND && 2531 (apply_policy_zone(mpol, gfp_zone(gfp)) && 2532 cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) 2533 return &mpol->nodes; 2534 #endif 2535 return NULL; 2536 } 2537 2538 /* 2539 * Increase the hugetlb pool such that it can accommodate a reservation 2540 * of size 'delta'. 
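 * For example, with resv_huge_pages == 10, free_huge_pages == 4 and
 * delta == 3, needed is (10 + 3) - 4 = 9, so nine surplus pages have to be
 * allocated and enqueued before the reservation can be committed.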
2541 */ 2542 static int gather_surplus_pages(struct hstate *h, long delta) 2543 __must_hold(&hugetlb_lock) 2544 { 2545 LIST_HEAD(surplus_list); 2546 struct folio *folio, *tmp; 2547 int ret; 2548 long i; 2549 long needed, allocated; 2550 bool alloc_ok = true; 2551 int node; 2552 nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); 2553 2554 lockdep_assert_held(&hugetlb_lock); 2555 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 2556 if (needed <= 0) { 2557 h->resv_huge_pages += delta; 2558 return 0; 2559 } 2560 2561 allocated = 0; 2562 2563 ret = -ENOMEM; 2564 retry: 2565 spin_unlock_irq(&hugetlb_lock); 2566 for (i = 0; i < needed; i++) { 2567 folio = NULL; 2568 for_each_node_mask(node, cpuset_current_mems_allowed) { 2569 if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) { 2570 folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), 2571 node, NULL); 2572 if (folio) 2573 break; 2574 } 2575 } 2576 if (!folio) { 2577 alloc_ok = false; 2578 break; 2579 } 2580 list_add(&folio->lru, &surplus_list); 2581 cond_resched(); 2582 } 2583 allocated += i; 2584 2585 /* 2586 * After retaking hugetlb_lock, we need to recalculate 'needed' 2587 * because either resv_huge_pages or free_huge_pages may have changed. 2588 */ 2589 spin_lock_irq(&hugetlb_lock); 2590 needed = (h->resv_huge_pages + delta) - 2591 (h->free_huge_pages + allocated); 2592 if (needed > 0) { 2593 if (alloc_ok) 2594 goto retry; 2595 /* 2596 * We were not able to allocate enough pages to 2597 * satisfy the entire reservation so we free what 2598 * we've allocated so far. 2599 */ 2600 goto free; 2601 } 2602 /* 2603 * The surplus_list now contains _at_least_ the number of extra pages 2604 * needed to accommodate the reservation. Add the appropriate number 2605 * of pages to the hugetlb pool and free the extras back to the buddy 2606 * allocator. Commit the entire reservation here to prevent another 2607 * process from stealing the pages as they are added to the pool but 2608 * before they are reserved. 2609 */ 2610 needed += allocated; 2611 h->resv_huge_pages += delta; 2612 ret = 0; 2613 2614 /* Free the needed pages to the hugetlb pool */ 2615 list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { 2616 if ((--needed) < 0) 2617 break; 2618 /* Add the page to the hugetlb allocator */ 2619 enqueue_hugetlb_folio(h, folio); 2620 } 2621 free: 2622 spin_unlock_irq(&hugetlb_lock); 2623 2624 /* 2625 * Free unnecessary surplus pages to the buddy allocator. 2626 * Pages have no ref count, call free_huge_folio directly. 2627 */ 2628 list_for_each_entry_safe(folio, tmp, &surplus_list, lru) 2629 free_huge_folio(folio); 2630 spin_lock_irq(&hugetlb_lock); 2631 2632 return ret; 2633 } 2634 2635 /* 2636 * This routine has two main purposes: 2637 * 1) Decrement the reservation count (resv_huge_pages) by the value passed 2638 * in unused_resv_pages. This corresponds to the prior adjustments made 2639 * to the associated reservation map. 2640 * 2) Free any unused surplus pages that may have been allocated to satisfy 2641 * the reservation. As many as unused_resv_pages may be freed. 
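 *
 * Only pages that are actually surplus are freed: with unused_resv_pages == 5
 * but surplus_huge_pages == 2, for example, min(5, 2) = 2 pages go back to
 * the buddy allocator and the rest of the reservation is simply uncommitted.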
2642 */ 2643 static void return_unused_surplus_pages(struct hstate *h, 2644 unsigned long unused_resv_pages) 2645 { 2646 unsigned long nr_pages; 2647 struct page *page; 2648 LIST_HEAD(page_list); 2649 2650 lockdep_assert_held(&hugetlb_lock); 2651 /* Uncommit the reservation */ 2652 h->resv_huge_pages -= unused_resv_pages; 2653 2654 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 2655 goto out; 2656 2657 /* 2658 * Part (or even all) of the reservation could have been backed 2659 * by pre-allocated pages. Only free surplus pages. 2660 */ 2661 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 2662 2663 /* 2664 * We want to release as many surplus pages as possible, spread 2665 * evenly across all nodes with memory. Iterate across these nodes 2666 * until we can no longer free unreserved surplus pages. This occurs 2667 * when the nodes with surplus pages have no free pages. 2668 * remove_pool_huge_page() will balance the freed pages across the 2669 * on-line nodes with memory and will handle the hstate accounting. 2670 */ 2671 while (nr_pages--) { 2672 page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); 2673 if (!page) 2674 goto out; 2675 2676 list_add(&page->lru, &page_list); 2677 } 2678 2679 out: 2680 spin_unlock_irq(&hugetlb_lock); 2681 update_and_free_pages_bulk(h, &page_list); 2682 spin_lock_irq(&hugetlb_lock); 2683 } 2684 2685 2686 /* 2687 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation 2688 * are used by the huge page allocation routines to manage reservations. 2689 * 2690 * vma_needs_reservation is called to determine if the huge page at addr 2691 * within the vma has an associated reservation. If a reservation is 2692 * needed, the value 1 is returned. The caller is then responsible for 2693 * managing the global reservation and subpool usage counts. After 2694 * the huge page has been allocated, vma_commit_reservation is called 2695 * to add the page to the reservation map. If the page allocation fails, 2696 * the reservation must be ended instead of committed. vma_end_reservation 2697 * is called in such cases. 2698 * 2699 * In the normal case, vma_commit_reservation returns the same value 2700 * as the preceding vma_needs_reservation call. The only time this 2701 * is not the case is if a reserve map was changed between calls. It 2702 * is the responsibility of the caller to notice the difference and 2703 * take appropriate action. 2704 * 2705 * vma_add_reservation is used in error paths where a reservation must 2706 * be restored when a newly allocated huge page must be freed. It is 2707 * to be called after calling vma_needs_reservation to determine if a 2708 * reservation exists. 2709 * 2710 * vma_del_reservation is used in error paths where an entry in the reserve 2711 * map was created during huge page allocation and must be removed. It is to 2712 * be called after calling vma_needs_reservation to determine if a reservation 2713 * exists. 
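 *
 * Taken together, the usual allocation sequence (see alloc_hugetlb_folio()
 * below) is:
 *
 *	ret = vma_needs_reservation(h, vma, addr);
 *	... allocate the huge page ...
 *	vma_commit_reservation(h, vma, addr);	on success
 *	vma_end_reservation(h, vma, addr);	on failure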
2714 */ 2715 enum vma_resv_mode { 2716 VMA_NEEDS_RESV, 2717 VMA_COMMIT_RESV, 2718 VMA_END_RESV, 2719 VMA_ADD_RESV, 2720 VMA_DEL_RESV, 2721 }; 2722 static long __vma_reservation_common(struct hstate *h, 2723 struct vm_area_struct *vma, unsigned long addr, 2724 enum vma_resv_mode mode) 2725 { 2726 struct resv_map *resv; 2727 pgoff_t idx; 2728 long ret; 2729 long dummy_out_regions_needed; 2730 2731 resv = vma_resv_map(vma); 2732 if (!resv) 2733 return 1; 2734 2735 idx = vma_hugecache_offset(h, vma, addr); 2736 switch (mode) { 2737 case VMA_NEEDS_RESV: 2738 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 2739 /* We assume that vma_reservation_* routines always operate on 2740 * 1 page, and that adding to resv map a 1 page entry can only 2741 * ever require 1 region. 2742 */ 2743 VM_BUG_ON(dummy_out_regions_needed != 1); 2744 break; 2745 case VMA_COMMIT_RESV: 2746 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2747 /* region_add calls of range 1 should never fail. */ 2748 VM_BUG_ON(ret < 0); 2749 break; 2750 case VMA_END_RESV: 2751 region_abort(resv, idx, idx + 1, 1); 2752 ret = 0; 2753 break; 2754 case VMA_ADD_RESV: 2755 if (vma->vm_flags & VM_MAYSHARE) { 2756 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2757 /* region_add calls of range 1 should never fail. */ 2758 VM_BUG_ON(ret < 0); 2759 } else { 2760 region_abort(resv, idx, idx + 1, 1); 2761 ret = region_del(resv, idx, idx + 1); 2762 } 2763 break; 2764 case VMA_DEL_RESV: 2765 if (vma->vm_flags & VM_MAYSHARE) { 2766 region_abort(resv, idx, idx + 1, 1); 2767 ret = region_del(resv, idx, idx + 1); 2768 } else { 2769 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2770 /* region_add calls of range 1 should never fail. */ 2771 VM_BUG_ON(ret < 0); 2772 } 2773 break; 2774 default: 2775 BUG(); 2776 } 2777 2778 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) 2779 return ret; 2780 /* 2781 * We know private mapping must have HPAGE_RESV_OWNER set. 2782 * 2783 * In most cases, reserves always exist for private mappings. 2784 * However, a file associated with mapping could have been 2785 * hole punched or truncated after reserves were consumed. 2786 * As subsequent fault on such a range will not use reserves. 2787 * Subtle - The reserve map for private mappings has the 2788 * opposite meaning than that of shared mappings. If NO 2789 * entry is in the reserve map, it means a reservation exists. 2790 * If an entry exists in the reserve map, it means the 2791 * reservation has already been consumed. As a result, the 2792 * return value of this routine is the opposite of the 2793 * value returned from reserve map manipulation routines above. 
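 *
 * For VMA_NEEDS_RESV on a private mapping, for example, region_chg()
 * returning 1 (no entry in the map) is turned into 0 below ("a reservation
 * exists"), while a return of 0 (entry already present) becomes 1 ("the
 * reservation was consumed, a new page must be charged").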
2794 */ 2795 if (ret > 0) 2796 return 0; 2797 if (ret == 0) 2798 return 1; 2799 return ret; 2800 } 2801 2802 static long vma_needs_reservation(struct hstate *h, 2803 struct vm_area_struct *vma, unsigned long addr) 2804 { 2805 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 2806 } 2807 2808 static long vma_commit_reservation(struct hstate *h, 2809 struct vm_area_struct *vma, unsigned long addr) 2810 { 2811 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 2812 } 2813 2814 static void vma_end_reservation(struct hstate *h, 2815 struct vm_area_struct *vma, unsigned long addr) 2816 { 2817 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 2818 } 2819 2820 static long vma_add_reservation(struct hstate *h, 2821 struct vm_area_struct *vma, unsigned long addr) 2822 { 2823 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 2824 } 2825 2826 static long vma_del_reservation(struct hstate *h, 2827 struct vm_area_struct *vma, unsigned long addr) 2828 { 2829 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); 2830 } 2831 2832 /* 2833 * This routine is called to restore reservation information on error paths. 2834 * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), 2835 * and the hugetlb mutex should remain held when calling this routine. 2836 * 2837 * It handles two specific cases: 2838 * 1) A reservation was in place and the folio consumed the reservation. 2839 * hugetlb_restore_reserve is set in the folio. 2840 * 2) No reservation was in place for the page, so hugetlb_restore_reserve is 2841 * not set. However, alloc_hugetlb_folio always updates the reserve map. 2842 * 2843 * In case 1, free_huge_folio later in the error path will increment the 2844 * global reserve count. But, free_huge_folio does not have enough context 2845 * to adjust the reservation map. This case deals primarily with private 2846 * mappings. Adjust the reserve map here to be consistent with global 2847 * reserve count adjustments to be made by free_huge_folio. Make sure the 2848 * reserve map indicates there is a reservation present. 2849 * 2850 * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. 2851 */ 2852 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, 2853 unsigned long address, struct folio *folio) 2854 { 2855 long rc = vma_needs_reservation(h, vma, address); 2856 2857 if (folio_test_hugetlb_restore_reserve(folio)) { 2858 if (unlikely(rc < 0)) 2859 /* 2860 * Rare out of memory condition in reserve map 2861 * manipulation. Clear hugetlb_restore_reserve so 2862 * that global reserve count will not be incremented 2863 * by free_huge_folio. This will make it appear 2864 * as though the reservation for this folio was 2865 * consumed. This may prevent the task from 2866 * faulting in the folio at a later time. This 2867 * is better than inconsistent global huge page 2868 * accounting of reserve counts. 2869 */ 2870 folio_clear_hugetlb_restore_reserve(folio); 2871 else if (rc) 2872 (void)vma_add_reservation(h, vma, address); 2873 else 2874 vma_end_reservation(h, vma, address); 2875 } else { 2876 if (!rc) { 2877 /* 2878 * This indicates there is an entry in the reserve map 2879 * not added by alloc_hugetlb_folio. We know it was added 2880 * before the alloc_hugetlb_folio call, otherwise 2881 * hugetlb_restore_reserve would be set on the folio. 2882 * Remove the entry so that a subsequent allocation 2883 * does not consume a reservation. 
2884 */ 2885 rc = vma_del_reservation(h, vma, address); 2886 if (rc < 0) 2887 /* 2888 * VERY rare out of memory condition. Since 2889 * we can not delete the entry, set 2890 * hugetlb_restore_reserve so that the reserve 2891 * count will be incremented when the folio 2892 * is freed. This reserve will be consumed 2893 * on a subsequent allocation. 2894 */ 2895 folio_set_hugetlb_restore_reserve(folio); 2896 } else if (rc < 0) { 2897 /* 2898 * Rare out of memory condition from 2899 * vma_needs_reservation call. Memory allocation is 2900 * only attempted if a new entry is needed. Therefore, 2901 * this implies there is not an entry in the 2902 * reserve map. 2903 * 2904 * For shared mappings, no entry in the map indicates 2905 * no reservation. We are done. 2906 */ 2907 if (!(vma->vm_flags & VM_MAYSHARE)) 2908 /* 2909 * For private mappings, no entry indicates 2910 * a reservation is present. Since we can 2911 * not add an entry, set hugetlb_restore_reserve 2912 * on the folio so reserve count will be 2913 * incremented when freed. This reserve will 2914 * be consumed on a subsequent allocation. 2915 */ 2916 folio_set_hugetlb_restore_reserve(folio); 2917 } else 2918 /* 2919 * No reservation present, do nothing 2920 */ 2921 vma_end_reservation(h, vma, address); 2922 } 2923 } 2924 2925 /* 2926 * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve 2927 * the old one 2928 * @h: struct hstate old page belongs to 2929 * @old_folio: Old folio to dissolve 2930 * @list: List to isolate the page in case we need to 2931 * Returns 0 on success, otherwise negated error. 2932 */ 2933 static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, 2934 struct folio *old_folio, struct list_head *list) 2935 { 2936 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 2937 int nid = folio_nid(old_folio); 2938 struct folio *new_folio; 2939 int ret = 0; 2940 2941 /* 2942 * Before dissolving the folio, we need to allocate a new one for the 2943 * pool to remain stable. Here, we allocate the folio and 'prep' it 2944 * by doing everything but actually updating counters and adding to 2945 * the pool. This simplifies and let us do most of the processing 2946 * under the lock. 2947 */ 2948 new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); 2949 if (!new_folio) 2950 return -ENOMEM; 2951 __prep_new_hugetlb_folio(h, new_folio); 2952 2953 retry: 2954 spin_lock_irq(&hugetlb_lock); 2955 if (!folio_test_hugetlb(old_folio)) { 2956 /* 2957 * Freed from under us. Drop new_folio too. 2958 */ 2959 goto free_new; 2960 } else if (folio_ref_count(old_folio)) { 2961 bool isolated; 2962 2963 /* 2964 * Someone has grabbed the folio, try to isolate it here. 2965 * Fail with -EBUSY if not possible. 2966 */ 2967 spin_unlock_irq(&hugetlb_lock); 2968 isolated = isolate_hugetlb(old_folio, list); 2969 ret = isolated ? 0 : -EBUSY; 2970 spin_lock_irq(&hugetlb_lock); 2971 goto free_new; 2972 } else if (!folio_test_hugetlb_freed(old_folio)) { 2973 /* 2974 * Folio's refcount is 0 but it has not been enqueued in the 2975 * freelist yet. Race window is small, so we can succeed here if 2976 * we retry. 2977 */ 2978 spin_unlock_irq(&hugetlb_lock); 2979 cond_resched(); 2980 goto retry; 2981 } else { 2982 /* 2983 * Ok, old_folio is still a genuine free hugepage. Remove it from 2984 * the freelist and decrease the counters. These will be 2985 * incremented again when calling __prep_account_new_huge_page() 2986 * and enqueue_hugetlb_folio() for new_folio. 
The counters will 2987 * remain stable since this happens under the lock. 2988 */ 2989 remove_hugetlb_folio(h, old_folio, false); 2990 2991 /* 2992 * Ref count on new_folio is already zero as it was dropped 2993 * earlier. It can be directly added to the pool free list. 2994 */ 2995 __prep_account_new_huge_page(h, nid); 2996 enqueue_hugetlb_folio(h, new_folio); 2997 2998 /* 2999 * Folio has been replaced, we can safely free the old one. 3000 */ 3001 spin_unlock_irq(&hugetlb_lock); 3002 update_and_free_hugetlb_folio(h, old_folio, false); 3003 } 3004 3005 return ret; 3006 3007 free_new: 3008 spin_unlock_irq(&hugetlb_lock); 3009 /* Folio has a zero ref count, but needs a ref to be freed */ 3010 folio_ref_unfreeze(new_folio, 1); 3011 update_and_free_hugetlb_folio(h, new_folio, false); 3012 3013 return ret; 3014 } 3015 3016 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) 3017 { 3018 struct hstate *h; 3019 struct folio *folio = page_folio(page); 3020 int ret = -EBUSY; 3021 3022 /* 3023 * The page might have been dissolved from under our feet, so make sure 3024 * to carefully check the state under the lock. 3025 * Return success when racing as if we dissolved the page ourselves. 3026 */ 3027 spin_lock_irq(&hugetlb_lock); 3028 if (folio_test_hugetlb(folio)) { 3029 h = folio_hstate(folio); 3030 } else { 3031 spin_unlock_irq(&hugetlb_lock); 3032 return 0; 3033 } 3034 spin_unlock_irq(&hugetlb_lock); 3035 3036 /* 3037 * Fence off gigantic pages as there is a cyclic dependency between 3038 * alloc_contig_range and them. Return -ENOMEM as this has the effect 3039 * of bailing out right away without further retrying. 3040 */ 3041 if (hstate_is_gigantic(h)) 3042 return -ENOMEM; 3043 3044 if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) 3045 ret = 0; 3046 else if (!folio_ref_count(folio)) 3047 ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); 3048 3049 return ret; 3050 } 3051 3052 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, 3053 unsigned long addr, int avoid_reserve) 3054 { 3055 struct hugepage_subpool *spool = subpool_vma(vma); 3056 struct hstate *h = hstate_vma(vma); 3057 struct folio *folio; 3058 long map_chg, map_commit; 3059 long gbl_chg; 3060 int ret, idx; 3061 struct hugetlb_cgroup *h_cg = NULL; 3062 bool deferred_reserve; 3063 3064 idx = hstate_index(h); 3065 /* 3066 * Examine the region/reserve map to determine if the process 3067 * has a reservation for the page to be allocated. A return 3068 * code of zero indicates a reservation exists (no change). 3069 */ 3070 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 3071 if (map_chg < 0) 3072 return ERR_PTR(-ENOMEM); 3073 3074 /* 3075 * Processes that did not create the mapping will have no 3076 * reserves as indicated by the region/reserve map. Check 3077 * that the allocation will not exceed the subpool limit. 3078 * Allocations for MAP_NORESERVE mappings also need to be 3079 * checked against any subpool limit. 3080 */ 3081 if (map_chg || avoid_reserve) { 3082 gbl_chg = hugepage_subpool_get_pages(spool, 1); 3083 if (gbl_chg < 0) { 3084 vma_end_reservation(h, vma, addr); 3085 return ERR_PTR(-ENOSPC); 3086 } 3087 3088 /* 3089 * Even though there was no reservation in the region/reserve 3090 * map, there could be reservations associated with the 3091 * subpool that can be used. This would be indicated if the 3092 * return value of hugepage_subpool_get_pages() is zero. 3093 * However, if avoid_reserve is specified we still avoid even 3094 * the subpool reservations. 
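 *
 * To summarize: map_chg == 0 leaves gbl_chg at 0, so a reserved page may be
 * dequeued; map_chg != 0 with the charge absorbed by the subpool's minimum
 * reservation also ends up with gbl_chg == 0; otherwise (including
 * avoid_reserve) gbl_chg is 1 and the page has to come from the global free
 * pool or be freshly allocated.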
3095 */ 3096 if (avoid_reserve) 3097 gbl_chg = 1; 3098 } 3099 3100 /* If this allocation is not consuming a reservation, charge it now. 3101 */ 3102 deferred_reserve = map_chg || avoid_reserve; 3103 if (deferred_reserve) { 3104 ret = hugetlb_cgroup_charge_cgroup_rsvd( 3105 idx, pages_per_huge_page(h), &h_cg); 3106 if (ret) 3107 goto out_subpool_put; 3108 } 3109 3110 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 3111 if (ret) 3112 goto out_uncharge_cgroup_reservation; 3113 3114 spin_lock_irq(&hugetlb_lock); 3115 /* 3116 * gbl_chg is passed to indicate whether or not a page must be taken 3117 * from the global free pool (global change). gbl_chg == 0 indicates 3118 * a reservation exists for the allocation. 3119 */ 3120 folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); 3121 if (!folio) { 3122 spin_unlock_irq(&hugetlb_lock); 3123 folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); 3124 if (!folio) 3125 goto out_uncharge_cgroup; 3126 spin_lock_irq(&hugetlb_lock); 3127 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 3128 folio_set_hugetlb_restore_reserve(folio); 3129 h->resv_huge_pages--; 3130 } 3131 list_add(&folio->lru, &h->hugepage_activelist); 3132 folio_ref_unfreeze(folio, 1); 3133 /* Fall through */ 3134 } 3135 3136 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); 3137 /* If allocation is not consuming a reservation, also store the 3138 * hugetlb_cgroup pointer on the page. 3139 */ 3140 if (deferred_reserve) { 3141 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 3142 h_cg, folio); 3143 } 3144 3145 spin_unlock_irq(&hugetlb_lock); 3146 3147 hugetlb_set_folio_subpool(folio, spool); 3148 3149 map_commit = vma_commit_reservation(h, vma, addr); 3150 if (unlikely(map_chg > map_commit)) { 3151 /* 3152 * The page was added to the reservation map between 3153 * vma_needs_reservation and vma_commit_reservation. 3154 * This indicates a race with hugetlb_reserve_pages. 3155 * Adjust for the subpool count incremented above AND 3156 * in hugetlb_reserve_pages for the same page. Also, 3157 * the reservation count added in hugetlb_reserve_pages 3158 * no longer applies.
3159 */ 3160 long rsv_adjust; 3161 3162 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 3163 hugetlb_acct_memory(h, -rsv_adjust); 3164 if (deferred_reserve) { 3165 spin_lock_irq(&hugetlb_lock); 3166 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), 3167 pages_per_huge_page(h), folio); 3168 spin_unlock_irq(&hugetlb_lock); 3169 } 3170 } 3171 return folio; 3172 3173 out_uncharge_cgroup: 3174 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 3175 out_uncharge_cgroup_reservation: 3176 if (deferred_reserve) 3177 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 3178 h_cg); 3179 out_subpool_put: 3180 if (map_chg || avoid_reserve) 3181 hugepage_subpool_put_pages(spool, 1); 3182 vma_end_reservation(h, vma, addr); 3183 return ERR_PTR(-ENOSPC); 3184 } 3185 3186 int alloc_bootmem_huge_page(struct hstate *h, int nid) 3187 __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); 3188 int __alloc_bootmem_huge_page(struct hstate *h, int nid) 3189 { 3190 struct huge_bootmem_page *m = NULL; /* initialize for clang */ 3191 int nr_nodes, node; 3192 3193 /* do node specific alloc */ 3194 if (nid != NUMA_NO_NODE) { 3195 m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), 3196 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 3197 if (!m) 3198 return 0; 3199 goto found; 3200 } 3201 /* allocate from next node when distributing huge pages */ 3202 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 3203 m = memblock_alloc_try_nid_raw( 3204 huge_page_size(h), huge_page_size(h), 3205 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); 3206 /* 3207 * Use the beginning of the huge page to store the 3208 * huge_bootmem_page struct (until gather_bootmem 3209 * puts them into the mem_map). 3210 */ 3211 if (!m) 3212 return 0; 3213 goto found; 3214 } 3215 3216 found: 3217 /* Put them into a private list first because mem_map is not up yet */ 3218 INIT_LIST_HEAD(&m->list); 3219 list_add(&m->list, &huge_boot_pages); 3220 m->hstate = h; 3221 return 1; 3222 } 3223 3224 /* 3225 * Put bootmem huge pages into the standard lists after mem_map is up. 3226 * Note: This only applies to gigantic (order > MAX_ORDER) pages. 3227 */ 3228 static void __init gather_bootmem_prealloc(void) 3229 { 3230 struct huge_bootmem_page *m; 3231 3232 list_for_each_entry(m, &huge_boot_pages, list) { 3233 struct page *page = virt_to_page(m); 3234 struct folio *folio = page_folio(page); 3235 struct hstate *h = m->hstate; 3236 3237 VM_BUG_ON(!hstate_is_gigantic(h)); 3238 WARN_ON(folio_ref_count(folio) != 1); 3239 if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { 3240 WARN_ON(folio_test_reserved(folio)); 3241 prep_new_hugetlb_folio(h, folio, folio_nid(folio)); 3242 free_huge_folio(folio); /* add to the hugepage allocator */ 3243 } else { 3244 /* VERY unlikely inflated ref count on a tail page */ 3245 free_gigantic_folio(folio, huge_page_order(h)); 3246 } 3247 3248 /* 3249 * We need to restore the 'stolen' pages to totalram_pages 3250 * in order to fix confusing memory reports from free(1) and 3251 * other side-effects, like CommitLimit going negative. 
3252 */ 3253 adjust_managed_page_count(page, pages_per_huge_page(h)); 3254 cond_resched(); 3255 } 3256 } 3257 static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) 3258 { 3259 unsigned long i; 3260 char buf[32]; 3261 3262 for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { 3263 if (hstate_is_gigantic(h)) { 3264 if (!alloc_bootmem_huge_page(h, nid)) 3265 break; 3266 } else { 3267 struct folio *folio; 3268 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 3269 3270 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, 3271 &node_states[N_MEMORY], NULL); 3272 if (!folio) 3273 break; 3274 free_huge_folio(folio); /* free it into the hugepage allocator */ 3275 } 3276 cond_resched(); 3277 } 3278 if (i == h->max_huge_pages_node[nid]) 3279 return; 3280 3281 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 3282 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", 3283 h->max_huge_pages_node[nid], buf, nid, i); 3284 h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); 3285 h->max_huge_pages_node[nid] = i; 3286 } 3287 3288 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 3289 { 3290 unsigned long i; 3291 nodemask_t *node_alloc_noretry; 3292 bool node_specific_alloc = false; 3293 3294 /* skip gigantic hugepages allocation if hugetlb_cma enabled */ 3295 if (hstate_is_gigantic(h) && hugetlb_cma_size) { 3296 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); 3297 return; 3298 } 3299 3300 /* do node specific alloc */ 3301 for_each_online_node(i) { 3302 if (h->max_huge_pages_node[i] > 0) { 3303 hugetlb_hstate_alloc_pages_onenode(h, i); 3304 node_specific_alloc = true; 3305 } 3306 } 3307 3308 if (node_specific_alloc) 3309 return; 3310 3311 /* below will do all node balanced alloc */ 3312 if (!hstate_is_gigantic(h)) { 3313 /* 3314 * Bit mask controlling how hard we retry per-node allocations. 3315 * Ignore errors as lower level routines can deal with 3316 * node_alloc_noretry == NULL. If this kmalloc fails at boot 3317 * time, we are likely in bigger trouble. 3318 */ 3319 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 3320 GFP_KERNEL); 3321 } else { 3322 /* allocations done at boot time */ 3323 node_alloc_noretry = NULL; 3324 } 3325 3326 /* bit mask controlling how hard we retry per-node allocations */ 3327 if (node_alloc_noretry) 3328 nodes_clear(*node_alloc_noretry); 3329 3330 for (i = 0; i < h->max_huge_pages; ++i) { 3331 if (hstate_is_gigantic(h)) { 3332 if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) 3333 break; 3334 } else if (!alloc_pool_huge_page(h, 3335 &node_states[N_MEMORY], 3336 node_alloc_noretry)) 3337 break; 3338 cond_resched(); 3339 } 3340 if (i < h->max_huge_pages) { 3341 char buf[32]; 3342 3343 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 3344 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", 3345 h->max_huge_pages, buf, i); 3346 h->max_huge_pages = i; 3347 } 3348 kfree(node_alloc_noretry); 3349 } 3350 3351 static void __init hugetlb_init_hstates(void) 3352 { 3353 struct hstate *h, *h2; 3354 3355 for_each_hstate(h) { 3356 /* oversize hugepages were init'ed in early boot */ 3357 if (!hstate_is_gigantic(h)) 3358 hugetlb_hstate_alloc_pages(h); 3359 3360 /* 3361 * Set demote order for each hstate. Note that 3362 * h->demote_order is initially 0. 3363 * - We can not demote gigantic pages if runtime freeing 3364 * is not supported, so skip this. 
3365 * - If CMA allocation is possible, we can not demote 3366 * HUGETLB_PAGE_ORDER or smaller size pages. 3367 */ 3368 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 3369 continue; 3370 if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) 3371 continue; 3372 for_each_hstate(h2) { 3373 if (h2 == h) 3374 continue; 3375 if (h2->order < h->order && 3376 h2->order > h->demote_order) 3377 h->demote_order = h2->order; 3378 } 3379 } 3380 } 3381 3382 static void __init report_hugepages(void) 3383 { 3384 struct hstate *h; 3385 3386 for_each_hstate(h) { 3387 char buf[32]; 3388 3389 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 3390 pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", 3391 buf, h->free_huge_pages); 3392 pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", 3393 hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); 3394 } 3395 } 3396 3397 #ifdef CONFIG_HIGHMEM 3398 static void try_to_free_low(struct hstate *h, unsigned long count, 3399 nodemask_t *nodes_allowed) 3400 { 3401 int i; 3402 LIST_HEAD(page_list); 3403 3404 lockdep_assert_held(&hugetlb_lock); 3405 if (hstate_is_gigantic(h)) 3406 return; 3407 3408 /* 3409 * Collect pages to be freed on a list, and free after dropping lock 3410 */ 3411 for_each_node_mask(i, *nodes_allowed) { 3412 struct page *page, *next; 3413 struct list_head *freel = &h->hugepage_freelists[i]; 3414 list_for_each_entry_safe(page, next, freel, lru) { 3415 if (count >= h->nr_huge_pages) 3416 goto out; 3417 if (PageHighMem(page)) 3418 continue; 3419 remove_hugetlb_folio(h, page_folio(page), false); 3420 list_add(&page->lru, &page_list); 3421 } 3422 } 3423 3424 out: 3425 spin_unlock_irq(&hugetlb_lock); 3426 update_and_free_pages_bulk(h, &page_list); 3427 spin_lock_irq(&hugetlb_lock); 3428 } 3429 #else 3430 static inline void try_to_free_low(struct hstate *h, unsigned long count, 3431 nodemask_t *nodes_allowed) 3432 { 3433 } 3434 #endif 3435 3436 /* 3437 * Increment or decrement surplus_huge_pages. Keep node-specific counters 3438 * balanced by operating on them in a round-robin fashion. 3439 * Returns 1 if an adjustment was made. 3440 */ 3441 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 3442 int delta) 3443 { 3444 int nr_nodes, node; 3445 3446 lockdep_assert_held(&hugetlb_lock); 3447 VM_BUG_ON(delta != -1 && delta != 1); 3448 3449 if (delta < 0) { 3450 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 3451 if (h->surplus_huge_pages_node[node]) 3452 goto found; 3453 } 3454 } else { 3455 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 3456 if (h->surplus_huge_pages_node[node] < 3457 h->nr_huge_pages_node[node]) 3458 goto found; 3459 } 3460 } 3461 return 0; 3462 3463 found: 3464 h->surplus_huge_pages += delta; 3465 h->surplus_huge_pages_node[node] += delta; 3466 return 1; 3467 } 3468 3469 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 3470 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 3471 nodemask_t *nodes_allowed) 3472 { 3473 unsigned long min_count, ret; 3474 struct page *page; 3475 LIST_HEAD(page_list); 3476 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 3477 3478 /* 3479 * Bit mask controlling how hard we retry per-node allocations. 3480 * If we can not allocate the bit mask, do not attempt to allocate 3481 * the requested huge pages. 
3482 */ 3483 if (node_alloc_noretry) 3484 nodes_clear(*node_alloc_noretry); 3485 else 3486 return -ENOMEM; 3487 3488 /* 3489 * resize_lock mutex prevents concurrent adjustments to number of 3490 * pages in hstate via the proc/sysfs interfaces. 3491 */ 3492 mutex_lock(&h->resize_lock); 3493 flush_free_hpage_work(h); 3494 spin_lock_irq(&hugetlb_lock); 3495 3496 /* 3497 * Check for a node specific request. 3498 * Changing node specific huge page count may require a corresponding 3499 * change to the global count. In any case, the passed node mask 3500 * (nodes_allowed) will restrict alloc/free to the specified node. 3501 */ 3502 if (nid != NUMA_NO_NODE) { 3503 unsigned long old_count = count; 3504 3505 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 3506 /* 3507 * User may have specified a large count value which caused the 3508 * above calculation to overflow. In this case, they wanted 3509 * to allocate as many huge pages as possible. Set count to 3510 * largest possible value to align with their intention. 3511 */ 3512 if (count < old_count) 3513 count = ULONG_MAX; 3514 } 3515 3516 /* 3517 * Gigantic pages runtime allocation depend on the capability for large 3518 * page range allocation. 3519 * If the system does not provide this feature, return an error when 3520 * the user tries to allocate gigantic pages but let the user free the 3521 * boottime allocated gigantic pages. 3522 */ 3523 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 3524 if (count > persistent_huge_pages(h)) { 3525 spin_unlock_irq(&hugetlb_lock); 3526 mutex_unlock(&h->resize_lock); 3527 NODEMASK_FREE(node_alloc_noretry); 3528 return -EINVAL; 3529 } 3530 /* Fall through to decrease pool */ 3531 } 3532 3533 /* 3534 * Increase the pool size 3535 * First take pages out of surplus state. Then make up the 3536 * remaining difference by allocating fresh huge pages. 3537 * 3538 * We might race with alloc_surplus_hugetlb_folio() here and be unable 3539 * to convert a surplus huge page to a normal huge page. That is 3540 * not critical, though, it just means the overall size of the 3541 * pool might be one hugepage larger than it needs to be, but 3542 * within all the constraints specified by the sysctls. 3543 */ 3544 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 3545 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 3546 break; 3547 } 3548 3549 while (count > persistent_huge_pages(h)) { 3550 /* 3551 * If this allocation races such that we no longer need the 3552 * page, free_huge_folio will handle it by freeing the page 3553 * and reducing the surplus. 3554 */ 3555 spin_unlock_irq(&hugetlb_lock); 3556 3557 /* yield cpu to avoid soft lockup */ 3558 cond_resched(); 3559 3560 ret = alloc_pool_huge_page(h, nodes_allowed, 3561 node_alloc_noretry); 3562 spin_lock_irq(&hugetlb_lock); 3563 if (!ret) 3564 goto out; 3565 3566 /* Bail for signals. Probably ctrl-c from user */ 3567 if (signal_pending(current)) 3568 goto out; 3569 } 3570 3571 /* 3572 * Decrease the pool size 3573 * First return free pages to the buddy allocator (being careful 3574 * to keep enough around to satisfy reservations). Then place 3575 * pages into surplus state as needed so the pool will shrink 3576 * to the desired size as pages become free. 3577 * 3578 * By placing pages into the surplus state independent of the 3579 * overcommit value, we are allowing the surplus pool size to 3580 * exceed overcommit. There are few sane options here. 
Since 3581 * alloc_surplus_hugetlb_folio() is checking the global counter, 3582 * though, we'll note that we're not allowed to exceed surplus 3583 * and won't grow the pool anywhere else. Not until one of the 3584 * sysctls are changed, or the surplus pages go out of use. 3585 */ 3586 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 3587 min_count = max(count, min_count); 3588 try_to_free_low(h, min_count, nodes_allowed); 3589 3590 /* 3591 * Collect pages to be removed on list without dropping lock 3592 */ 3593 while (min_count < persistent_huge_pages(h)) { 3594 page = remove_pool_huge_page(h, nodes_allowed, 0); 3595 if (!page) 3596 break; 3597 3598 list_add(&page->lru, &page_list); 3599 } 3600 /* free the pages after dropping lock */ 3601 spin_unlock_irq(&hugetlb_lock); 3602 update_and_free_pages_bulk(h, &page_list); 3603 flush_free_hpage_work(h); 3604 spin_lock_irq(&hugetlb_lock); 3605 3606 while (count < persistent_huge_pages(h)) { 3607 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 3608 break; 3609 } 3610 out: 3611 h->max_huge_pages = persistent_huge_pages(h); 3612 spin_unlock_irq(&hugetlb_lock); 3613 mutex_unlock(&h->resize_lock); 3614 3615 NODEMASK_FREE(node_alloc_noretry); 3616 3617 return 0; 3618 } 3619 3620 static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) 3621 { 3622 int i, nid = folio_nid(folio); 3623 struct hstate *target_hstate; 3624 struct page *subpage; 3625 struct folio *inner_folio; 3626 int rc = 0; 3627 3628 target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); 3629 3630 remove_hugetlb_folio_for_demote(h, folio, false); 3631 spin_unlock_irq(&hugetlb_lock); 3632 3633 rc = hugetlb_vmemmap_restore(h, &folio->page); 3634 if (rc) { 3635 /* Allocation of vmemmmap failed, we can not demote folio */ 3636 spin_lock_irq(&hugetlb_lock); 3637 folio_ref_unfreeze(folio, 1); 3638 add_hugetlb_folio(h, folio, false); 3639 return rc; 3640 } 3641 3642 /* 3643 * Use destroy_compound_hugetlb_folio_for_demote for all huge page 3644 * sizes as it will not ref count folios. 3645 */ 3646 destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); 3647 3648 /* 3649 * Taking target hstate mutex synchronizes with set_max_huge_pages. 3650 * Without the mutex, pages added to target hstate could be marked 3651 * as surplus. 3652 * 3653 * Note that we already hold h->resize_lock. To prevent deadlock, 3654 * use the convention of always taking larger size hstate mutex first. 3655 */ 3656 mutex_lock(&target_hstate->resize_lock); 3657 for (i = 0; i < pages_per_huge_page(h); 3658 i += pages_per_huge_page(target_hstate)) { 3659 subpage = folio_page(folio, i); 3660 inner_folio = page_folio(subpage); 3661 if (hstate_is_gigantic(target_hstate)) 3662 prep_compound_gigantic_folio_for_demote(inner_folio, 3663 target_hstate->order); 3664 else 3665 prep_compound_page(subpage, target_hstate->order); 3666 folio_change_private(inner_folio, NULL); 3667 prep_new_hugetlb_folio(target_hstate, inner_folio, nid); 3668 free_huge_folio(inner_folio); 3669 } 3670 mutex_unlock(&target_hstate->resize_lock); 3671 3672 spin_lock_irq(&hugetlb_lock); 3673 3674 /* 3675 * Not absolutely necessary, but for consistency update max_huge_pages 3676 * based on pool changes for the demoted page. 
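 * For example, demoting one 1 GB page into 2 MB pages lowers max_huge_pages
 * of the 1 GB hstate by one and raises that of the 2 MB hstate by
 * pages_per_huge_page(h) / pages_per_huge_page(target_hstate), i.e. by 512.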
3677 */ 3678 h->max_huge_pages--; 3679 target_hstate->max_huge_pages += 3680 pages_per_huge_page(h) / pages_per_huge_page(target_hstate); 3681 3682 return rc; 3683 } 3684 3685 static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 3686 __must_hold(&hugetlb_lock) 3687 { 3688 int nr_nodes, node; 3689 struct folio *folio; 3690 3691 lockdep_assert_held(&hugetlb_lock); 3692 3693 /* We should never get here if no demote order */ 3694 if (!h->demote_order) { 3695 pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); 3696 return -EINVAL; /* internal error */ 3697 } 3698 3699 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 3700 list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { 3701 if (folio_test_hwpoison(folio)) 3702 continue; 3703 return demote_free_hugetlb_folio(h, folio); 3704 } 3705 } 3706 3707 /* 3708 * Only way to get here is if all pages on free lists are poisoned. 3709 * Return -EBUSY so that caller will not retry. 3710 */ 3711 return -EBUSY; 3712 } 3713 3714 #define HSTATE_ATTR_RO(_name) \ 3715 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 3716 3717 #define HSTATE_ATTR_WO(_name) \ 3718 static struct kobj_attribute _name##_attr = __ATTR_WO(_name) 3719 3720 #define HSTATE_ATTR(_name) \ 3721 static struct kobj_attribute _name##_attr = __ATTR_RW(_name) 3722 3723 static struct kobject *hugepages_kobj; 3724 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 3725 3726 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 3727 3728 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 3729 { 3730 int i; 3731 3732 for (i = 0; i < HUGE_MAX_HSTATE; i++) 3733 if (hstate_kobjs[i] == kobj) { 3734 if (nidp) 3735 *nidp = NUMA_NO_NODE; 3736 return &hstates[i]; 3737 } 3738 3739 return kobj_to_node_hstate(kobj, nidp); 3740 } 3741 3742 static ssize_t nr_hugepages_show_common(struct kobject *kobj, 3743 struct kobj_attribute *attr, char *buf) 3744 { 3745 struct hstate *h; 3746 unsigned long nr_huge_pages; 3747 int nid; 3748 3749 h = kobj_to_hstate(kobj, &nid); 3750 if (nid == NUMA_NO_NODE) 3751 nr_huge_pages = h->nr_huge_pages; 3752 else 3753 nr_huge_pages = h->nr_huge_pages_node[nid]; 3754 3755 return sysfs_emit(buf, "%lu\n", nr_huge_pages); 3756 } 3757 3758 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 3759 struct hstate *h, int nid, 3760 unsigned long count, size_t len) 3761 { 3762 int err; 3763 nodemask_t nodes_allowed, *n_mask; 3764 3765 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 3766 return -EINVAL; 3767 3768 if (nid == NUMA_NO_NODE) { 3769 /* 3770 * global hstate attribute 3771 */ 3772 if (!(obey_mempolicy && 3773 init_nodemask_of_mempolicy(&nodes_allowed))) 3774 n_mask = &node_states[N_MEMORY]; 3775 else 3776 n_mask = &nodes_allowed; 3777 } else { 3778 /* 3779 * Node specific request. count adjustment happens in 3780 * set_max_huge_pages() after acquiring hugetlb_lock. 3781 */ 3782 init_nodemask_of_node(&nodes_allowed, nid); 3783 n_mask = &nodes_allowed; 3784 } 3785 3786 err = set_max_huge_pages(h, count, nid, n_mask); 3787 3788 return err ? 
err : len; 3789 } 3790 3791 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 3792 struct kobject *kobj, const char *buf, 3793 size_t len) 3794 { 3795 struct hstate *h; 3796 unsigned long count; 3797 int nid; 3798 int err; 3799 3800 err = kstrtoul(buf, 10, &count); 3801 if (err) 3802 return err; 3803 3804 h = kobj_to_hstate(kobj, &nid); 3805 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 3806 } 3807 3808 static ssize_t nr_hugepages_show(struct kobject *kobj, 3809 struct kobj_attribute *attr, char *buf) 3810 { 3811 return nr_hugepages_show_common(kobj, attr, buf); 3812 } 3813 3814 static ssize_t nr_hugepages_store(struct kobject *kobj, 3815 struct kobj_attribute *attr, const char *buf, size_t len) 3816 { 3817 return nr_hugepages_store_common(false, kobj, buf, len); 3818 } 3819 HSTATE_ATTR(nr_hugepages); 3820 3821 #ifdef CONFIG_NUMA 3822 3823 /* 3824 * hstate attribute for optionally mempolicy-based constraint on persistent 3825 * huge page alloc/free. 3826 */ 3827 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 3828 struct kobj_attribute *attr, 3829 char *buf) 3830 { 3831 return nr_hugepages_show_common(kobj, attr, buf); 3832 } 3833 3834 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 3835 struct kobj_attribute *attr, const char *buf, size_t len) 3836 { 3837 return nr_hugepages_store_common(true, kobj, buf, len); 3838 } 3839 HSTATE_ATTR(nr_hugepages_mempolicy); 3840 #endif 3841 3842 3843 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 3844 struct kobj_attribute *attr, char *buf) 3845 { 3846 struct hstate *h = kobj_to_hstate(kobj, NULL); 3847 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); 3848 } 3849 3850 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 3851 struct kobj_attribute *attr, const char *buf, size_t count) 3852 { 3853 int err; 3854 unsigned long input; 3855 struct hstate *h = kobj_to_hstate(kobj, NULL); 3856 3857 if (hstate_is_gigantic(h)) 3858 return -EINVAL; 3859 3860 err = kstrtoul(buf, 10, &input); 3861 if (err) 3862 return err; 3863 3864 spin_lock_irq(&hugetlb_lock); 3865 h->nr_overcommit_huge_pages = input; 3866 spin_unlock_irq(&hugetlb_lock); 3867 3868 return count; 3869 } 3870 HSTATE_ATTR(nr_overcommit_hugepages); 3871 3872 static ssize_t free_hugepages_show(struct kobject *kobj, 3873 struct kobj_attribute *attr, char *buf) 3874 { 3875 struct hstate *h; 3876 unsigned long free_huge_pages; 3877 int nid; 3878 3879 h = kobj_to_hstate(kobj, &nid); 3880 if (nid == NUMA_NO_NODE) 3881 free_huge_pages = h->free_huge_pages; 3882 else 3883 free_huge_pages = h->free_huge_pages_node[nid]; 3884 3885 return sysfs_emit(buf, "%lu\n", free_huge_pages); 3886 } 3887 HSTATE_ATTR_RO(free_hugepages); 3888 3889 static ssize_t resv_hugepages_show(struct kobject *kobj, 3890 struct kobj_attribute *attr, char *buf) 3891 { 3892 struct hstate *h = kobj_to_hstate(kobj, NULL); 3893 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); 3894 } 3895 HSTATE_ATTR_RO(resv_hugepages); 3896 3897 static ssize_t surplus_hugepages_show(struct kobject *kobj, 3898 struct kobj_attribute *attr, char *buf) 3899 { 3900 struct hstate *h; 3901 unsigned long surplus_huge_pages; 3902 int nid; 3903 3904 h = kobj_to_hstate(kobj, &nid); 3905 if (nid == NUMA_NO_NODE) 3906 surplus_huge_pages = h->surplus_huge_pages; 3907 else 3908 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 3909 3910 return sysfs_emit(buf, "%lu\n", surplus_huge_pages); 3911 } 3912 HSTATE_ATTR_RO(surplus_hugepages); 3913 3914 static 
ssize_t demote_store(struct kobject *kobj, 3915 struct kobj_attribute *attr, const char *buf, size_t len) 3916 { 3917 unsigned long nr_demote; 3918 unsigned long nr_available; 3919 nodemask_t nodes_allowed, *n_mask; 3920 struct hstate *h; 3921 int err; 3922 int nid; 3923 3924 err = kstrtoul(buf, 10, &nr_demote); 3925 if (err) 3926 return err; 3927 h = kobj_to_hstate(kobj, &nid); 3928 3929 if (nid != NUMA_NO_NODE) { 3930 init_nodemask_of_node(&nodes_allowed, nid); 3931 n_mask = &nodes_allowed; 3932 } else { 3933 n_mask = &node_states[N_MEMORY]; 3934 } 3935 3936 /* Synchronize with other sysfs operations modifying huge pages */ 3937 mutex_lock(&h->resize_lock); 3938 spin_lock_irq(&hugetlb_lock); 3939 3940 while (nr_demote) { 3941 /* 3942 * Check for available pages to demote each time thorough the 3943 * loop as demote_pool_huge_page will drop hugetlb_lock. 3944 */ 3945 if (nid != NUMA_NO_NODE) 3946 nr_available = h->free_huge_pages_node[nid]; 3947 else 3948 nr_available = h->free_huge_pages; 3949 nr_available -= h->resv_huge_pages; 3950 if (!nr_available) 3951 break; 3952 3953 err = demote_pool_huge_page(h, n_mask); 3954 if (err) 3955 break; 3956 3957 nr_demote--; 3958 } 3959 3960 spin_unlock_irq(&hugetlb_lock); 3961 mutex_unlock(&h->resize_lock); 3962 3963 if (err) 3964 return err; 3965 return len; 3966 } 3967 HSTATE_ATTR_WO(demote); 3968 3969 static ssize_t demote_size_show(struct kobject *kobj, 3970 struct kobj_attribute *attr, char *buf) 3971 { 3972 struct hstate *h = kobj_to_hstate(kobj, NULL); 3973 unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; 3974 3975 return sysfs_emit(buf, "%lukB\n", demote_size); 3976 } 3977 3978 static ssize_t demote_size_store(struct kobject *kobj, 3979 struct kobj_attribute *attr, 3980 const char *buf, size_t count) 3981 { 3982 struct hstate *h, *demote_hstate; 3983 unsigned long demote_size; 3984 unsigned int demote_order; 3985 3986 demote_size = (unsigned long)memparse(buf, NULL); 3987 3988 demote_hstate = size_to_hstate(demote_size); 3989 if (!demote_hstate) 3990 return -EINVAL; 3991 demote_order = demote_hstate->order; 3992 if (demote_order < HUGETLB_PAGE_ORDER) 3993 return -EINVAL; 3994 3995 /* demote order must be smaller than hstate order */ 3996 h = kobj_to_hstate(kobj, NULL); 3997 if (demote_order >= h->order) 3998 return -EINVAL; 3999 4000 /* resize_lock synchronizes access to demote size and writes */ 4001 mutex_lock(&h->resize_lock); 4002 h->demote_order = demote_order; 4003 mutex_unlock(&h->resize_lock); 4004 4005 return count; 4006 } 4007 HSTATE_ATTR(demote_size); 4008 4009 static struct attribute *hstate_attrs[] = { 4010 &nr_hugepages_attr.attr, 4011 &nr_overcommit_hugepages_attr.attr, 4012 &free_hugepages_attr.attr, 4013 &resv_hugepages_attr.attr, 4014 &surplus_hugepages_attr.attr, 4015 #ifdef CONFIG_NUMA 4016 &nr_hugepages_mempolicy_attr.attr, 4017 #endif 4018 NULL, 4019 }; 4020 4021 static const struct attribute_group hstate_attr_group = { 4022 .attrs = hstate_attrs, 4023 }; 4024 4025 static struct attribute *hstate_demote_attrs[] = { 4026 &demote_size_attr.attr, 4027 &demote_attr.attr, 4028 NULL, 4029 }; 4030 4031 static const struct attribute_group hstate_demote_attr_group = { 4032 .attrs = hstate_demote_attrs, 4033 }; 4034 4035 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 4036 struct kobject **hstate_kobjs, 4037 const struct attribute_group *hstate_attr_group) 4038 { 4039 int retval; 4040 int hi = hstate_index(h); 4041 4042 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 
4043 if (!hstate_kobjs[hi]) 4044 return -ENOMEM; 4045 4046 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 4047 if (retval) { 4048 kobject_put(hstate_kobjs[hi]); 4049 hstate_kobjs[hi] = NULL; 4050 return retval; 4051 } 4052 4053 if (h->demote_order) { 4054 retval = sysfs_create_group(hstate_kobjs[hi], 4055 &hstate_demote_attr_group); 4056 if (retval) { 4057 pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); 4058 sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); 4059 kobject_put(hstate_kobjs[hi]); 4060 hstate_kobjs[hi] = NULL; 4061 return retval; 4062 } 4063 } 4064 4065 return 0; 4066 } 4067 4068 #ifdef CONFIG_NUMA 4069 static bool hugetlb_sysfs_initialized __ro_after_init; 4070 4071 /* 4072 * node_hstate/s - associate per node hstate attributes, via their kobjects, 4073 * with node devices in node_devices[] using a parallel array. The array 4074 * index of a node device or _hstate == node id. 4075 * This is here to avoid any static dependency of the node device driver, in 4076 * the base kernel, on the hugetlb module. 4077 */ 4078 struct node_hstate { 4079 struct kobject *hugepages_kobj; 4080 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 4081 }; 4082 static struct node_hstate node_hstates[MAX_NUMNODES]; 4083 4084 /* 4085 * A subset of global hstate attributes for node devices 4086 */ 4087 static struct attribute *per_node_hstate_attrs[] = { 4088 &nr_hugepages_attr.attr, 4089 &free_hugepages_attr.attr, 4090 &surplus_hugepages_attr.attr, 4091 NULL, 4092 }; 4093 4094 static const struct attribute_group per_node_hstate_attr_group = { 4095 .attrs = per_node_hstate_attrs, 4096 }; 4097 4098 /* 4099 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 4100 * Returns node id via non-NULL nidp. 4101 */ 4102 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 4103 { 4104 int nid; 4105 4106 for (nid = 0; nid < nr_node_ids; nid++) { 4107 struct node_hstate *nhs = &node_hstates[nid]; 4108 int i; 4109 for (i = 0; i < HUGE_MAX_HSTATE; i++) 4110 if (nhs->hstate_kobjs[i] == kobj) { 4111 if (nidp) 4112 *nidp = nid; 4113 return &hstates[i]; 4114 } 4115 } 4116 4117 BUG(); 4118 return NULL; 4119 } 4120 4121 /* 4122 * Unregister hstate attributes from a single node device. 4123 * No-op if no hstate attributes attached. 4124 */ 4125 void hugetlb_unregister_node(struct node *node) 4126 { 4127 struct hstate *h; 4128 struct node_hstate *nhs = &node_hstates[node->dev.id]; 4129 4130 if (!nhs->hugepages_kobj) 4131 return; /* no hstate attributes */ 4132 4133 for_each_hstate(h) { 4134 int idx = hstate_index(h); 4135 struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; 4136 4137 if (!hstate_kobj) 4138 continue; 4139 if (h->demote_order) 4140 sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); 4141 sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); 4142 kobject_put(hstate_kobj); 4143 nhs->hstate_kobjs[idx] = NULL; 4144 } 4145 4146 kobject_put(nhs->hugepages_kobj); 4147 nhs->hugepages_kobj = NULL; 4148 } 4149 4150 4151 /* 4152 * Register hstate attributes for a single node device. 4153 * No-op if attributes already registered. 
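 *
 * Illustrative resulting layout (standard sysfs paths): each node gains
 * /sys/devices/system/node/nodeN/hugepages/hugepages-<size>kB/ exposing
 * the per-node subset of attributes (nr_hugepages, free_hugepages,
 * surplus_hugepages) from per_node_hstate_attrs above.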
4154 */ 4155 void hugetlb_register_node(struct node *node) 4156 { 4157 struct hstate *h; 4158 struct node_hstate *nhs = &node_hstates[node->dev.id]; 4159 int err; 4160 4161 if (!hugetlb_sysfs_initialized) 4162 return; 4163 4164 if (nhs->hugepages_kobj) 4165 return; /* already allocated */ 4166 4167 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 4168 &node->dev.kobj); 4169 if (!nhs->hugepages_kobj) 4170 return; 4171 4172 for_each_hstate(h) { 4173 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 4174 nhs->hstate_kobjs, 4175 &per_node_hstate_attr_group); 4176 if (err) { 4177 pr_err("HugeTLB: Unable to add hstate %s for node %d\n", 4178 h->name, node->dev.id); 4179 hugetlb_unregister_node(node); 4180 break; 4181 } 4182 } 4183 } 4184 4185 /* 4186 * hugetlb init time: register hstate attributes for all registered node 4187 * devices of nodes that have memory. All on-line nodes should have 4188 * registered their associated device by this time. 4189 */ 4190 static void __init hugetlb_register_all_nodes(void) 4191 { 4192 int nid; 4193 4194 for_each_online_node(nid) 4195 hugetlb_register_node(node_devices[nid]); 4196 } 4197 #else /* !CONFIG_NUMA */ 4198 4199 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 4200 { 4201 BUG(); 4202 if (nidp) 4203 *nidp = -1; 4204 return NULL; 4205 } 4206 4207 static void hugetlb_register_all_nodes(void) { } 4208 4209 #endif 4210 4211 #ifdef CONFIG_CMA 4212 static void __init hugetlb_cma_check(void); 4213 #else 4214 static inline __init void hugetlb_cma_check(void) 4215 { 4216 } 4217 #endif 4218 4219 static void __init hugetlb_sysfs_init(void) 4220 { 4221 struct hstate *h; 4222 int err; 4223 4224 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 4225 if (!hugepages_kobj) 4226 return; 4227 4228 for_each_hstate(h) { 4229 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 4230 hstate_kobjs, &hstate_attr_group); 4231 if (err) 4232 pr_err("HugeTLB: Unable to add hstate %s", h->name); 4233 } 4234 4235 #ifdef CONFIG_NUMA 4236 hugetlb_sysfs_initialized = true; 4237 #endif 4238 hugetlb_register_all_nodes(); 4239 } 4240 4241 #ifdef CONFIG_SYSCTL 4242 static void hugetlb_sysctl_init(void); 4243 #else 4244 static inline void hugetlb_sysctl_init(void) { } 4245 #endif 4246 4247 static int __init hugetlb_init(void) 4248 { 4249 int i; 4250 4251 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < 4252 __NR_HPAGEFLAGS); 4253 4254 if (!hugepages_supported()) { 4255 if (hugetlb_max_hstate || default_hstate_max_huge_pages) 4256 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); 4257 return 0; 4258 } 4259 4260 /* 4261 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some 4262 * architectures depend on setup being done here. 4263 */ 4264 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 4265 if (!parsed_default_hugepagesz) { 4266 /* 4267 * If we did not parse a default huge page size, set 4268 * default_hstate_idx to HPAGE_SIZE hstate. And, if the 4269 * number of huge pages for this default size was implicitly 4270 * specified, set that here as well. 4271 * Note that the implicit setting will overwrite an explicit 4272 * setting. A warning will be printed in this case. 
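 * Illustrative example: booting with only "hugepages=16" (no hugepagesz=
 * or default_hugepagesz=) lands here and requests 16 huge pages of the
 * architecture's HPAGE_SIZE.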
4273 */ 4274 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); 4275 if (default_hstate_max_huge_pages) { 4276 if (default_hstate.max_huge_pages) { 4277 char buf[32]; 4278 4279 string_get_size(huge_page_size(&default_hstate), 4280 1, STRING_UNITS_2, buf, 32); 4281 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", 4282 default_hstate.max_huge_pages, buf); 4283 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", 4284 default_hstate_max_huge_pages); 4285 } 4286 default_hstate.max_huge_pages = 4287 default_hstate_max_huge_pages; 4288 4289 for_each_online_node(i) 4290 default_hstate.max_huge_pages_node[i] = 4291 default_hugepages_in_node[i]; 4292 } 4293 } 4294 4295 hugetlb_cma_check(); 4296 hugetlb_init_hstates(); 4297 gather_bootmem_prealloc(); 4298 report_hugepages(); 4299 4300 hugetlb_sysfs_init(); 4301 hugetlb_cgroup_file_init(); 4302 hugetlb_sysctl_init(); 4303 4304 #ifdef CONFIG_SMP 4305 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 4306 #else 4307 num_fault_mutexes = 1; 4308 #endif 4309 hugetlb_fault_mutex_table = 4310 kmalloc_array(num_fault_mutexes, sizeof(struct mutex), 4311 GFP_KERNEL); 4312 BUG_ON(!hugetlb_fault_mutex_table); 4313 4314 for (i = 0; i < num_fault_mutexes; i++) 4315 mutex_init(&hugetlb_fault_mutex_table[i]); 4316 return 0; 4317 } 4318 subsys_initcall(hugetlb_init); 4319 4320 /* Overwritten by architectures with more huge page sizes */ 4321 bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) 4322 { 4323 return size == HPAGE_SIZE; 4324 } 4325 4326 void __init hugetlb_add_hstate(unsigned int order) 4327 { 4328 struct hstate *h; 4329 unsigned long i; 4330 4331 if (size_to_hstate(PAGE_SIZE << order)) { 4332 return; 4333 } 4334 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 4335 BUG_ON(order == 0); 4336 h = &hstates[hugetlb_max_hstate++]; 4337 __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); 4338 h->order = order; 4339 h->mask = ~(huge_page_size(h) - 1); 4340 for (i = 0; i < MAX_NUMNODES; ++i) 4341 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 4342 INIT_LIST_HEAD(&h->hugepage_activelist); 4343 h->next_nid_to_alloc = first_memory_node; 4344 h->next_nid_to_free = first_memory_node; 4345 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 4346 huge_page_size(h)/SZ_1K); 4347 4348 parsed_hstate = h; 4349 } 4350 4351 bool __init __weak hugetlb_node_alloc_supported(void) 4352 { 4353 return true; 4354 } 4355 4356 static void __init hugepages_clear_pages_in_node(void) 4357 { 4358 if (!hugetlb_max_hstate) { 4359 default_hstate_max_huge_pages = 0; 4360 memset(default_hugepages_in_node, 0, 4361 sizeof(default_hugepages_in_node)); 4362 } else { 4363 parsed_hstate->max_huge_pages = 0; 4364 memset(parsed_hstate->max_huge_pages_node, 0, 4365 sizeof(parsed_hstate->max_huge_pages_node)); 4366 } 4367 } 4368 4369 /* 4370 * hugepages command line processing 4371 * hugepages normally follows a valid hugepagesz or default_hugepagesz 4372 * specification. If not, ignore the hugepages value. hugepages can also 4373 * be the first huge page command line option in which case it implicitly 4374 * specifies the number of huge pages for the default size.
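 *
 * Illustrative forms handled here (see
 * Documentation/admin-guide/kernel-parameters.txt for the authoritative
 * syntax):
 *   hugepages=512                pages of the preceding/default size
 *   hugepagesz=1G hugepages=4    4 gigantic pages
 *   hugepages=0:2,1:4            per-node counts for nodes 0 and 1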
4375 */ 4376 static int __init hugepages_setup(char *s) 4377 { 4378 unsigned long *mhp; 4379 static unsigned long *last_mhp; 4380 int node = NUMA_NO_NODE; 4381 int count; 4382 unsigned long tmp; 4383 char *p = s; 4384 4385 if (!parsed_valid_hugepagesz) { 4386 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); 4387 parsed_valid_hugepagesz = true; 4388 return 1; 4389 } 4390 4391 /* 4392 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter 4393 * yet, so this hugepages= parameter goes to the "default hstate". 4394 * Otherwise, it goes with the previously parsed hugepagesz or 4395 * default_hugepagesz. 4396 */ 4397 else if (!hugetlb_max_hstate) 4398 mhp = &default_hstate_max_huge_pages; 4399 else 4400 mhp = &parsed_hstate->max_huge_pages; 4401 4402 if (mhp == last_mhp) { 4403 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); 4404 return 1; 4405 } 4406 4407 while (*p) { 4408 count = 0; 4409 if (sscanf(p, "%lu%n", &tmp, &count) != 1) 4410 goto invalid; 4411 /* Parameter is node format */ 4412 if (p[count] == ':') { 4413 if (!hugetlb_node_alloc_supported()) { 4414 pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); 4415 return 1; 4416 } 4417 if (tmp >= MAX_NUMNODES || !node_online(tmp)) 4418 goto invalid; 4419 node = array_index_nospec(tmp, MAX_NUMNODES); 4420 p += count + 1; 4421 /* Parse hugepages */ 4422 if (sscanf(p, "%lu%n", &tmp, &count) != 1) 4423 goto invalid; 4424 if (!hugetlb_max_hstate) 4425 default_hugepages_in_node[node] = tmp; 4426 else 4427 parsed_hstate->max_huge_pages_node[node] = tmp; 4428 *mhp += tmp; 4429 /* Go to parse next node*/ 4430 if (p[count] == ',') 4431 p += count + 1; 4432 else 4433 break; 4434 } else { 4435 if (p != s) 4436 goto invalid; 4437 *mhp = tmp; 4438 break; 4439 } 4440 } 4441 4442 /* 4443 * Global state is always initialized later in hugetlb_init. 4444 * But we need to allocate gigantic hstates here early to still 4445 * use the bootmem allocator. 4446 */ 4447 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) 4448 hugetlb_hstate_alloc_pages(parsed_hstate); 4449 4450 last_mhp = mhp; 4451 4452 return 1; 4453 4454 invalid: 4455 pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); 4456 hugepages_clear_pages_in_node(); 4457 return 1; 4458 } 4459 __setup("hugepages=", hugepages_setup); 4460 4461 /* 4462 * hugepagesz command line processing 4463 * A specific huge page size can only be specified once with hugepagesz. 4464 * hugepagesz is followed by hugepages on the command line. The global 4465 * variable 'parsed_valid_hugepagesz' is used to determine if prior 4466 * hugepagesz argument was valid. 4467 */ 4468 static int __init hugepagesz_setup(char *s) 4469 { 4470 unsigned long size; 4471 struct hstate *h; 4472 4473 parsed_valid_hugepagesz = false; 4474 size = (unsigned long)memparse(s, NULL); 4475 4476 if (!arch_hugetlb_valid_size(size)) { 4477 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); 4478 return 1; 4479 } 4480 4481 h = size_to_hstate(size); 4482 if (h) { 4483 /* 4484 * hstate for this size already exists. This is normally 4485 * an error, but is allowed if the existing hstate is the 4486 * default hstate. More specifically, it is only allowed if 4487 * the number of huge pages for the default hstate was not 4488 * previously specified. 
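 * Illustrative example: "default_hugepagesz=2M hugepagesz=2M hugepages=8"
 * is accepted, whereas repeating hugepagesz= for a size whose page count
 * was already given is warned about and ignored.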
4489 */ 4490 if (!parsed_default_hugepagesz || h != &default_hstate || 4491 default_hstate.max_huge_pages) { 4492 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); 4493 return 1; 4494 } 4495 4496 /* 4497 * No need to call hugetlb_add_hstate() as hstate already 4498 * exists. But, do set parsed_hstate so that a following 4499 * hugepages= parameter will be applied to this hstate. 4500 */ 4501 parsed_hstate = h; 4502 parsed_valid_hugepagesz = true; 4503 return 1; 4504 } 4505 4506 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 4507 parsed_valid_hugepagesz = true; 4508 return 1; 4509 } 4510 __setup("hugepagesz=", hugepagesz_setup); 4511 4512 /* 4513 * default_hugepagesz command line input 4514 * Only one instance of default_hugepagesz allowed on command line. 4515 */ 4516 static int __init default_hugepagesz_setup(char *s) 4517 { 4518 unsigned long size; 4519 int i; 4520 4521 parsed_valid_hugepagesz = false; 4522 if (parsed_default_hugepagesz) { 4523 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); 4524 return 1; 4525 } 4526 4527 size = (unsigned long)memparse(s, NULL); 4528 4529 if (!arch_hugetlb_valid_size(size)) { 4530 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); 4531 return 1; 4532 } 4533 4534 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 4535 parsed_valid_hugepagesz = true; 4536 parsed_default_hugepagesz = true; 4537 default_hstate_idx = hstate_index(size_to_hstate(size)); 4538 4539 /* 4540 * The number of default huge pages (for this size) could have been 4541 * specified as the first hugetlb parameter: hugepages=X. If so, 4542 * then default_hstate_max_huge_pages is set. If the default huge 4543 * page size is gigantic (> MAX_ORDER), then the pages must be 4544 * allocated here from bootmem allocator. 4545 */ 4546 if (default_hstate_max_huge_pages) { 4547 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 4548 for_each_online_node(i) 4549 default_hstate.max_huge_pages_node[i] = 4550 default_hugepages_in_node[i]; 4551 if (hstate_is_gigantic(&default_hstate)) 4552 hugetlb_hstate_alloc_pages(&default_hstate); 4553 default_hstate_max_huge_pages = 0; 4554 } 4555 4556 return 1; 4557 } 4558 __setup("default_hugepagesz=", default_hugepagesz_setup); 4559 4560 static unsigned int allowed_mems_nr(struct hstate *h) 4561 { 4562 int node; 4563 unsigned int nr = 0; 4564 nodemask_t *mbind_nodemask; 4565 unsigned int *array = h->free_huge_pages_node; 4566 gfp_t gfp_mask = htlb_alloc_mask(h); 4567 4568 mbind_nodemask = policy_mbind_nodemask(gfp_mask); 4569 for_each_node_mask(node, cpuset_current_mems_allowed) { 4570 if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) 4571 nr += array[node]; 4572 } 4573 4574 return nr; 4575 } 4576 4577 #ifdef CONFIG_SYSCTL 4578 static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, 4579 void *buffer, size_t *length, 4580 loff_t *ppos, unsigned long *out) 4581 { 4582 struct ctl_table dup_table; 4583 4584 /* 4585 * In order to avoid races with __do_proc_doulongvec_minmax(), we 4586 * can duplicate the @table and alter the duplicate of it. 
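 * (The shared entries in hugetlb_table below have .data == NULL; the
 * on-stack duplicate points .data at the caller's local value instead,
 * so concurrent writers never update a shared buffer.)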
4587 */ 4588 dup_table = *table; 4589 dup_table.data = out; 4590 4591 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); 4592 } 4593 4594 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 4595 struct ctl_table *table, int write, 4596 void *buffer, size_t *length, loff_t *ppos) 4597 { 4598 struct hstate *h = &default_hstate; 4599 unsigned long tmp = h->max_huge_pages; 4600 int ret; 4601 4602 if (!hugepages_supported()) 4603 return -EOPNOTSUPP; 4604 4605 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 4606 &tmp); 4607 if (ret) 4608 goto out; 4609 4610 if (write) 4611 ret = __nr_hugepages_store_common(obey_mempolicy, h, 4612 NUMA_NO_NODE, tmp, *length); 4613 out: 4614 return ret; 4615 } 4616 4617 static int hugetlb_sysctl_handler(struct ctl_table *table, int write, 4618 void *buffer, size_t *length, loff_t *ppos) 4619 { 4620 4621 return hugetlb_sysctl_handler_common(false, table, write, 4622 buffer, length, ppos); 4623 } 4624 4625 #ifdef CONFIG_NUMA 4626 static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 4627 void *buffer, size_t *length, loff_t *ppos) 4628 { 4629 return hugetlb_sysctl_handler_common(true, table, write, 4630 buffer, length, ppos); 4631 } 4632 #endif /* CONFIG_NUMA */ 4633 4634 static int hugetlb_overcommit_handler(struct ctl_table *table, int write, 4635 void *buffer, size_t *length, loff_t *ppos) 4636 { 4637 struct hstate *h = &default_hstate; 4638 unsigned long tmp; 4639 int ret; 4640 4641 if (!hugepages_supported()) 4642 return -EOPNOTSUPP; 4643 4644 tmp = h->nr_overcommit_huge_pages; 4645 4646 if (write && hstate_is_gigantic(h)) 4647 return -EINVAL; 4648 4649 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 4650 &tmp); 4651 if (ret) 4652 goto out; 4653 4654 if (write) { 4655 spin_lock_irq(&hugetlb_lock); 4656 h->nr_overcommit_huge_pages = tmp; 4657 spin_unlock_irq(&hugetlb_lock); 4658 } 4659 out: 4660 return ret; 4661 } 4662 4663 static struct ctl_table hugetlb_table[] = { 4664 { 4665 .procname = "nr_hugepages", 4666 .data = NULL, 4667 .maxlen = sizeof(unsigned long), 4668 .mode = 0644, 4669 .proc_handler = hugetlb_sysctl_handler, 4670 }, 4671 #ifdef CONFIG_NUMA 4672 { 4673 .procname = "nr_hugepages_mempolicy", 4674 .data = NULL, 4675 .maxlen = sizeof(unsigned long), 4676 .mode = 0644, 4677 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 4678 }, 4679 #endif 4680 { 4681 .procname = "hugetlb_shm_group", 4682 .data = &sysctl_hugetlb_shm_group, 4683 .maxlen = sizeof(gid_t), 4684 .mode = 0644, 4685 .proc_handler = proc_dointvec, 4686 }, 4687 { 4688 .procname = "nr_overcommit_hugepages", 4689 .data = NULL, 4690 .maxlen = sizeof(unsigned long), 4691 .mode = 0644, 4692 .proc_handler = hugetlb_overcommit_handler, 4693 }, 4694 { } 4695 }; 4696 4697 static void hugetlb_sysctl_init(void) 4698 { 4699 register_sysctl_init("vm", hugetlb_table); 4700 } 4701 #endif /* CONFIG_SYSCTL */ 4702 4703 void hugetlb_report_meminfo(struct seq_file *m) 4704 { 4705 struct hstate *h; 4706 unsigned long total = 0; 4707 4708 if (!hugepages_supported()) 4709 return; 4710 4711 for_each_hstate(h) { 4712 unsigned long count = h->nr_huge_pages; 4713 4714 total += huge_page_size(h) * count; 4715 4716 if (h == &default_hstate) 4717 seq_printf(m, 4718 "HugePages_Total: %5lu\n" 4719 "HugePages_Free: %5lu\n" 4720 "HugePages_Rsvd: %5lu\n" 4721 "HugePages_Surp: %5lu\n" 4722 "Hugepagesize: %8lu kB\n", 4723 count, 4724 h->free_huge_pages, 4725 h->resv_huge_pages, 4726 h->surplus_huge_pages, 4727 
huge_page_size(h) / SZ_1K); 4728 } 4729 4730 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); 4731 } 4732 4733 int hugetlb_report_node_meminfo(char *buf, int len, int nid) 4734 { 4735 struct hstate *h = &default_hstate; 4736 4737 if (!hugepages_supported()) 4738 return 0; 4739 4740 return sysfs_emit_at(buf, len, 4741 "Node %d HugePages_Total: %5u\n" 4742 "Node %d HugePages_Free: %5u\n" 4743 "Node %d HugePages_Surp: %5u\n", 4744 nid, h->nr_huge_pages_node[nid], 4745 nid, h->free_huge_pages_node[nid], 4746 nid, h->surplus_huge_pages_node[nid]); 4747 } 4748 4749 void hugetlb_show_meminfo_node(int nid) 4750 { 4751 struct hstate *h; 4752 4753 if (!hugepages_supported()) 4754 return; 4755 4756 for_each_hstate(h) 4757 printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 4758 nid, 4759 h->nr_huge_pages_node[nid], 4760 h->free_huge_pages_node[nid], 4761 h->surplus_huge_pages_node[nid], 4762 huge_page_size(h) / SZ_1K); 4763 } 4764 4765 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) 4766 { 4767 seq_printf(m, "HugetlbPages:\t%8lu kB\n", 4768 K(atomic_long_read(&mm->hugetlb_usage))); 4769 } 4770 4771 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 4772 unsigned long hugetlb_total_pages(void) 4773 { 4774 struct hstate *h; 4775 unsigned long nr_total_pages = 0; 4776 4777 for_each_hstate(h) 4778 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 4779 return nr_total_pages; 4780 } 4781 4782 static int hugetlb_acct_memory(struct hstate *h, long delta) 4783 { 4784 int ret = -ENOMEM; 4785 4786 if (!delta) 4787 return 0; 4788 4789 spin_lock_irq(&hugetlb_lock); 4790 /* 4791 * When cpuset is configured, it breaks the strict hugetlb page 4792 * reservation as the accounting is done on a global variable. Such 4793 * reservation is completely rubbish in the presence of cpuset because 4794 * the reservation is not checked against page availability for the 4795 * current cpuset. Application can still potentially OOM'ed by kernel 4796 * with lack of free htlb page in cpuset that the task is in. 4797 * Attempt to enforce strict accounting with cpuset is almost 4798 * impossible (or too ugly) because cpuset is too fluid that 4799 * task or memory node can be dynamically moved between cpusets. 4800 * 4801 * The change of semantics for shared hugetlb mapping with cpuset is 4802 * undesirable. However, in order to preserve some of the semantics, 4803 * we fall back to check against current free page availability as 4804 * a best attempt and hopefully to minimize the impact of changing 4805 * semantics that cpuset has. 4806 * 4807 * Apart from cpuset, we also have memory policy mechanism that 4808 * also determines from which node the kernel will allocate memory 4809 * in a NUMA system. So similar to cpuset, we also should consider 4810 * the memory policy of the current task. Similar to the description 4811 * above. 4812 */ 4813 if (delta > 0) { 4814 if (gather_surplus_pages(h, delta) < 0) 4815 goto out; 4816 4817 if (delta > allowed_mems_nr(h)) { 4818 return_unused_surplus_pages(h, delta); 4819 goto out; 4820 } 4821 } 4822 4823 ret = 0; 4824 if (delta < 0) 4825 return_unused_surplus_pages(h, (unsigned long) -delta); 4826 4827 out: 4828 spin_unlock_irq(&hugetlb_lock); 4829 return ret; 4830 } 4831 4832 static void hugetlb_vm_op_open(struct vm_area_struct *vma) 4833 { 4834 struct resv_map *resv = vma_resv_map(vma); 4835 4836 /* 4837 * HPAGE_RESV_OWNER indicates a private mapping. 
4838 * This new VMA should share its sibling's reservation map if present. 4839 * The VMA will only ever have a valid reservation map pointer where 4840 * it is being copied for another still existing VMA. As that VMA 4841 * has a reference to the reservation map it cannot disappear until 4842 * after this open call completes. It is therefore safe to take a 4843 * new reference here without additional locking. 4844 */ 4845 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 4846 resv_map_dup_hugetlb_cgroup_uncharge_info(resv); 4847 kref_get(&resv->refs); 4848 } 4849 4850 /* 4851 * vma_lock structure for sharable mappings is vma specific. 4852 * Clear old pointer (if copied via vm_area_dup) and allocate 4853 * new structure. Before clearing, make sure vma_lock is not 4854 * for this vma. 4855 */ 4856 if (vma->vm_flags & VM_MAYSHARE) { 4857 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 4858 4859 if (vma_lock) { 4860 if (vma_lock->vma != vma) { 4861 vma->vm_private_data = NULL; 4862 hugetlb_vma_lock_alloc(vma); 4863 } else 4864 pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); 4865 } else 4866 hugetlb_vma_lock_alloc(vma); 4867 } 4868 } 4869 4870 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 4871 { 4872 struct hstate *h = hstate_vma(vma); 4873 struct resv_map *resv; 4874 struct hugepage_subpool *spool = subpool_vma(vma); 4875 unsigned long reserve, start, end; 4876 long gbl_reserve; 4877 4878 hugetlb_vma_lock_free(vma); 4879 4880 resv = vma_resv_map(vma); 4881 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 4882 return; 4883 4884 start = vma_hugecache_offset(h, vma, vma->vm_start); 4885 end = vma_hugecache_offset(h, vma, vma->vm_end); 4886 4887 reserve = (end - start) - region_count(resv, start, end); 4888 hugetlb_cgroup_uncharge_counter(resv, start, end); 4889 if (reserve) { 4890 /* 4891 * Decrement reserve counts. The global reserve count may be 4892 * adjusted if the subpool has a minimum size. 4893 */ 4894 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 4895 hugetlb_acct_memory(h, -gbl_reserve); 4896 } 4897 4898 kref_put(&resv->refs, resv_map_release); 4899 } 4900 4901 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) 4902 { 4903 if (addr & ~(huge_page_mask(hstate_vma(vma)))) 4904 return -EINVAL; 4905 4906 /* 4907 * PMD sharing is only possible for PUD_SIZE-aligned address ranges 4908 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this 4909 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. 4910 */ 4911 if (addr & ~PUD_MASK) { 4912 /* 4913 * hugetlb_vm_op_split is called right before we attempt to 4914 * split the VMA. We will need to unshare PMDs in the old and 4915 * new VMAs, so let's unshare before we split. 4916 */ 4917 unsigned long floor = addr & PUD_MASK; 4918 unsigned long ceil = floor + PUD_SIZE; 4919 4920 if (floor >= vma->vm_start && ceil <= vma->vm_end) 4921 hugetlb_unshare_pmds(vma, floor, ceil); 4922 } 4923 4924 return 0; 4925 } 4926 4927 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) 4928 { 4929 return huge_page_size(hstate_vma(vma)); 4930 } 4931 4932 /* 4933 * We cannot handle pagefaults against hugetlb pages at all. They cause 4934 * handle_mm_fault() to try to instantiate regular-sized pages in the 4935 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get 4936 * this far.
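 * In practice hugetlb faults never reach this handler: handle_mm_fault()
 * dispatches hugetlb VMAs to hugetlb_fault() before the ->fault path is
 * used, so this callback only serves as a guard.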
4937 */ 4938 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) 4939 { 4940 BUG(); 4941 return 0; 4942 } 4943 4944 /* 4945 * When a new function is introduced to vm_operations_struct and added 4946 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 4947 * This is because under System V memory model, mappings created via 4948 * shmget/shmat with "huge page" specified are backed by hugetlbfs files, 4949 * their original vm_ops are overwritten with shm_vm_ops. 4950 */ 4951 const struct vm_operations_struct hugetlb_vm_ops = { 4952 .fault = hugetlb_vm_op_fault, 4953 .open = hugetlb_vm_op_open, 4954 .close = hugetlb_vm_op_close, 4955 .may_split = hugetlb_vm_op_split, 4956 .pagesize = hugetlb_vm_op_pagesize, 4957 }; 4958 4959 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 4960 int writable) 4961 { 4962 pte_t entry; 4963 unsigned int shift = huge_page_shift(hstate_vma(vma)); 4964 4965 if (writable) { 4966 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 4967 vma->vm_page_prot))); 4968 } else { 4969 entry = huge_pte_wrprotect(mk_huge_pte(page, 4970 vma->vm_page_prot)); 4971 } 4972 entry = pte_mkyoung(entry); 4973 entry = arch_make_huge_pte(entry, shift, vma->vm_flags); 4974 4975 return entry; 4976 } 4977 4978 static void set_huge_ptep_writable(struct vm_area_struct *vma, 4979 unsigned long address, pte_t *ptep) 4980 { 4981 pte_t entry; 4982 4983 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 4984 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 4985 update_mmu_cache(vma, address, ptep); 4986 } 4987 4988 bool is_hugetlb_entry_migration(pte_t pte) 4989 { 4990 swp_entry_t swp; 4991 4992 if (huge_pte_none(pte) || pte_present(pte)) 4993 return false; 4994 swp = pte_to_swp_entry(pte); 4995 if (is_migration_entry(swp)) 4996 return true; 4997 else 4998 return false; 4999 } 5000 5001 static bool is_hugetlb_entry_hwpoisoned(pte_t pte) 5002 { 5003 swp_entry_t swp; 5004 5005 if (huge_pte_none(pte) || pte_present(pte)) 5006 return false; 5007 swp = pte_to_swp_entry(pte); 5008 if (is_hwpoison_entry(swp)) 5009 return true; 5010 else 5011 return false; 5012 } 5013 5014 static void 5015 hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, 5016 struct folio *new_folio, pte_t old, unsigned long sz) 5017 { 5018 pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); 5019 5020 __folio_mark_uptodate(new_folio); 5021 hugepage_add_new_anon_rmap(new_folio, vma, addr); 5022 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) 5023 newpte = huge_pte_mkuffd_wp(newpte); 5024 set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); 5025 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); 5026 folio_set_hugetlb_migratable(new_folio); 5027 } 5028 5029 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 5030 struct vm_area_struct *dst_vma, 5031 struct vm_area_struct *src_vma) 5032 { 5033 pte_t *src_pte, *dst_pte, entry; 5034 struct folio *pte_folio; 5035 unsigned long addr; 5036 bool cow = is_cow_mapping(src_vma->vm_flags); 5037 struct hstate *h = hstate_vma(src_vma); 5038 unsigned long sz = huge_page_size(h); 5039 unsigned long npages = pages_per_huge_page(h); 5040 struct mmu_notifier_range range; 5041 unsigned long last_addr_mask; 5042 int ret = 0; 5043 5044 if (cow) { 5045 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, 5046 src_vma->vm_start, 5047 src_vma->vm_end); 5048 mmu_notifier_invalidate_range_start(&range); 5049 vma_assert_write_locked(src_vma); 5050 
raw_write_seqcount_begin(&src->write_protect_seq); 5051 } else { 5052 /* 5053 * For shared mappings the vma lock must be held before 5054 * calling hugetlb_walk() in the src vma. Otherwise, the 5055 * returned ptep could go away if part of a shared pmd and 5056 * another thread calls huge_pmd_unshare. 5057 */ 5058 hugetlb_vma_lock_read(src_vma); 5059 } 5060 5061 last_addr_mask = hugetlb_mask_last_page(h); 5062 for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { 5063 spinlock_t *src_ptl, *dst_ptl; 5064 src_pte = hugetlb_walk(src_vma, addr, sz); 5065 if (!src_pte) { 5066 addr |= last_addr_mask; 5067 continue; 5068 } 5069 dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); 5070 if (!dst_pte) { 5071 ret = -ENOMEM; 5072 break; 5073 } 5074 5075 /* 5076 * If the pagetables are shared don't copy or take references. 5077 * 5078 * dst_pte == src_pte is the common case of src/dest sharing. 5079 * However, src could have 'unshared' and dst shares with 5080 * another vma. So page_count of ptep page is checked instead 5081 * to reliably determine whether pte is shared. 5082 */ 5083 if (page_count(virt_to_page(dst_pte)) > 1) { 5084 addr |= last_addr_mask; 5085 continue; 5086 } 5087 5088 dst_ptl = huge_pte_lock(h, dst, dst_pte); 5089 src_ptl = huge_pte_lockptr(h, src, src_pte); 5090 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 5091 entry = huge_ptep_get(src_pte); 5092 again: 5093 if (huge_pte_none(entry)) { 5094 /* 5095 * Skip if src entry none. 5096 */ 5097 ; 5098 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { 5099 if (!userfaultfd_wp(dst_vma)) 5100 entry = huge_pte_clear_uffd_wp(entry); 5101 set_huge_pte_at(dst, addr, dst_pte, entry, sz); 5102 } else if (unlikely(is_hugetlb_entry_migration(entry))) { 5103 swp_entry_t swp_entry = pte_to_swp_entry(entry); 5104 bool uffd_wp = pte_swp_uffd_wp(entry); 5105 5106 if (!is_readable_migration_entry(swp_entry) && cow) { 5107 /* 5108 * COW mappings require pages in both 5109 * parent and child to be set to read. 5110 */ 5111 swp_entry = make_readable_migration_entry( 5112 swp_offset(swp_entry)); 5113 entry = swp_entry_to_pte(swp_entry); 5114 if (userfaultfd_wp(src_vma) && uffd_wp) 5115 entry = pte_swp_mkuffd_wp(entry); 5116 set_huge_pte_at(src, addr, src_pte, entry, sz); 5117 } 5118 if (!userfaultfd_wp(dst_vma)) 5119 entry = huge_pte_clear_uffd_wp(entry); 5120 set_huge_pte_at(dst, addr, dst_pte, entry, sz); 5121 } else if (unlikely(is_pte_marker(entry))) { 5122 pte_marker marker = copy_pte_marker( 5123 pte_to_swp_entry(entry), dst_vma); 5124 5125 if (marker) 5126 set_huge_pte_at(dst, addr, dst_pte, 5127 make_pte_marker(marker), sz); 5128 } else { 5129 entry = huge_ptep_get(src_pte); 5130 pte_folio = page_folio(pte_page(entry)); 5131 folio_get(pte_folio); 5132 5133 /* 5134 * Failing to duplicate the anon rmap is a rare case 5135 * where we see pinned hugetlb pages while they're 5136 * prone to COW. We need to do the COW earlier during 5137 * fork. 5138 * 5139 * When pre-allocating the page or copying data, we 5140 * need to be without the pgtable locks since we could 5141 * sleep during the process. 
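 * That is why both page table locks are dropped around the allocation
 * and copy below, and the source PTE is re-checked with pte_same()
 * before installing the new folio; if it changed, we retry via "goto again".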
5142 */ 5143 if (!folio_test_anon(pte_folio)) { 5144 page_dup_file_rmap(&pte_folio->page, true); 5145 } else if (page_try_dup_anon_rmap(&pte_folio->page, 5146 true, src_vma)) { 5147 pte_t src_pte_old = entry; 5148 struct folio *new_folio; 5149 5150 spin_unlock(src_ptl); 5151 spin_unlock(dst_ptl); 5152 /* Do not use reserve as it's private owned */ 5153 new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); 5154 if (IS_ERR(new_folio)) { 5155 folio_put(pte_folio); 5156 ret = PTR_ERR(new_folio); 5157 break; 5158 } 5159 ret = copy_user_large_folio(new_folio, 5160 pte_folio, 5161 addr, dst_vma); 5162 folio_put(pte_folio); 5163 if (ret) { 5164 folio_put(new_folio); 5165 break; 5166 } 5167 5168 /* Install the new hugetlb folio if src pte stable */ 5169 dst_ptl = huge_pte_lock(h, dst, dst_pte); 5170 src_ptl = huge_pte_lockptr(h, src, src_pte); 5171 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 5172 entry = huge_ptep_get(src_pte); 5173 if (!pte_same(src_pte_old, entry)) { 5174 restore_reserve_on_error(h, dst_vma, addr, 5175 new_folio); 5176 folio_put(new_folio); 5177 /* huge_ptep of dst_pte won't change as in child */ 5178 goto again; 5179 } 5180 hugetlb_install_folio(dst_vma, dst_pte, addr, 5181 new_folio, src_pte_old, sz); 5182 spin_unlock(src_ptl); 5183 spin_unlock(dst_ptl); 5184 continue; 5185 } 5186 5187 if (cow) { 5188 /* 5189 * No need to notify as we are downgrading page 5190 * table protection not changing it to point 5191 * to a new page. 5192 * 5193 * See Documentation/mm/mmu_notifier.rst 5194 */ 5195 huge_ptep_set_wrprotect(src, addr, src_pte); 5196 entry = huge_pte_wrprotect(entry); 5197 } 5198 5199 if (!userfaultfd_wp(dst_vma)) 5200 entry = huge_pte_clear_uffd_wp(entry); 5201 5202 set_huge_pte_at(dst, addr, dst_pte, entry, sz); 5203 hugetlb_count_add(npages, dst); 5204 } 5205 spin_unlock(src_ptl); 5206 spin_unlock(dst_ptl); 5207 } 5208 5209 if (cow) { 5210 raw_write_seqcount_end(&src->write_protect_seq); 5211 mmu_notifier_invalidate_range_end(&range); 5212 } else { 5213 hugetlb_vma_unlock_read(src_vma); 5214 } 5215 5216 return ret; 5217 } 5218 5219 static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, 5220 unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, 5221 unsigned long sz) 5222 { 5223 struct hstate *h = hstate_vma(vma); 5224 struct mm_struct *mm = vma->vm_mm; 5225 spinlock_t *src_ptl, *dst_ptl; 5226 pte_t pte; 5227 5228 dst_ptl = huge_pte_lock(h, mm, dst_pte); 5229 src_ptl = huge_pte_lockptr(h, mm, src_pte); 5230 5231 /* 5232 * We don't have to worry about the ordering of src and dst ptlocks 5233 * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock. 
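 * Note that src_ptl and dst_ptl may be the very same lock (for example
 * when both entries live in the same page table page), which is why the
 * code below only takes and releases src_ptl when it differs from dst_ptl.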
5234 */ 5235 if (src_ptl != dst_ptl) 5236 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 5237 5238 pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); 5239 set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); 5240 5241 if (src_ptl != dst_ptl) 5242 spin_unlock(src_ptl); 5243 spin_unlock(dst_ptl); 5244 } 5245 5246 int move_hugetlb_page_tables(struct vm_area_struct *vma, 5247 struct vm_area_struct *new_vma, 5248 unsigned long old_addr, unsigned long new_addr, 5249 unsigned long len) 5250 { 5251 struct hstate *h = hstate_vma(vma); 5252 struct address_space *mapping = vma->vm_file->f_mapping; 5253 unsigned long sz = huge_page_size(h); 5254 struct mm_struct *mm = vma->vm_mm; 5255 unsigned long old_end = old_addr + len; 5256 unsigned long last_addr_mask; 5257 pte_t *src_pte, *dst_pte; 5258 struct mmu_notifier_range range; 5259 bool shared_pmd = false; 5260 5261 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, 5262 old_end); 5263 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 5264 /* 5265 * In case of shared PMDs, we should cover the maximum possible 5266 * range. 5267 */ 5268 flush_cache_range(vma, range.start, range.end); 5269 5270 mmu_notifier_invalidate_range_start(&range); 5271 last_addr_mask = hugetlb_mask_last_page(h); 5272 /* Prevent race with file truncation */ 5273 hugetlb_vma_lock_write(vma); 5274 i_mmap_lock_write(mapping); 5275 for (; old_addr < old_end; old_addr += sz, new_addr += sz) { 5276 src_pte = hugetlb_walk(vma, old_addr, sz); 5277 if (!src_pte) { 5278 old_addr |= last_addr_mask; 5279 new_addr |= last_addr_mask; 5280 continue; 5281 } 5282 if (huge_pte_none(huge_ptep_get(src_pte))) 5283 continue; 5284 5285 if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { 5286 shared_pmd = true; 5287 old_addr |= last_addr_mask; 5288 new_addr |= last_addr_mask; 5289 continue; 5290 } 5291 5292 dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); 5293 if (!dst_pte) 5294 break; 5295 5296 move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); 5297 } 5298 5299 if (shared_pmd) 5300 flush_hugetlb_tlb_range(vma, range.start, range.end); 5301 else 5302 flush_hugetlb_tlb_range(vma, old_end - len, old_end); 5303 mmu_notifier_invalidate_range_end(&range); 5304 i_mmap_unlock_write(mapping); 5305 hugetlb_vma_unlock_write(vma); 5306 5307 return len + old_addr - old_end; 5308 } 5309 5310 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 5311 unsigned long start, unsigned long end, 5312 struct page *ref_page, zap_flags_t zap_flags) 5313 { 5314 struct mm_struct *mm = vma->vm_mm; 5315 unsigned long address; 5316 pte_t *ptep; 5317 pte_t pte; 5318 spinlock_t *ptl; 5319 struct page *page; 5320 struct hstate *h = hstate_vma(vma); 5321 unsigned long sz = huge_page_size(h); 5322 unsigned long last_addr_mask; 5323 bool force_flush = false; 5324 5325 WARN_ON(!is_vm_hugetlb_page(vma)); 5326 BUG_ON(start & ~huge_page_mask(h)); 5327 BUG_ON(end & ~huge_page_mask(h)); 5328 5329 /* 5330 * This is a hugetlb vma, all the pte entries should point 5331 * to huge page. 
5332 */ 5333 tlb_change_page_size(tlb, sz); 5334 tlb_start_vma(tlb, vma); 5335 5336 last_addr_mask = hugetlb_mask_last_page(h); 5337 address = start; 5338 for (; address < end; address += sz) { 5339 ptep = hugetlb_walk(vma, address, sz); 5340 if (!ptep) { 5341 address |= last_addr_mask; 5342 continue; 5343 } 5344 5345 ptl = huge_pte_lock(h, mm, ptep); 5346 if (huge_pmd_unshare(mm, vma, address, ptep)) { 5347 spin_unlock(ptl); 5348 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); 5349 force_flush = true; 5350 address |= last_addr_mask; 5351 continue; 5352 } 5353 5354 pte = huge_ptep_get(ptep); 5355 if (huge_pte_none(pte)) { 5356 spin_unlock(ptl); 5357 continue; 5358 } 5359 5360 /* 5361 * Migrating hugepage or HWPoisoned hugepage is already 5362 * unmapped and its refcount is dropped, so just clear pte here. 5363 */ 5364 if (unlikely(!pte_present(pte))) { 5365 /* 5366 * If the pte was wr-protected by uffd-wp in any of the 5367 * swap forms, meanwhile the caller does not want to 5368 * drop the uffd-wp bit in this zap, then replace the 5369 * pte with a marker. 5370 */ 5371 if (pte_swp_uffd_wp_any(pte) && 5372 !(zap_flags & ZAP_FLAG_DROP_MARKER)) 5373 set_huge_pte_at(mm, address, ptep, 5374 make_pte_marker(PTE_MARKER_UFFD_WP), 5375 sz); 5376 else 5377 huge_pte_clear(mm, address, ptep, sz); 5378 spin_unlock(ptl); 5379 continue; 5380 } 5381 5382 page = pte_page(pte); 5383 /* 5384 * If a reference page is supplied, it is because a specific 5385 * page is being unmapped, not a range. Ensure the page we 5386 * are about to unmap is the actual page of interest. 5387 */ 5388 if (ref_page) { 5389 if (page != ref_page) { 5390 spin_unlock(ptl); 5391 continue; 5392 } 5393 /* 5394 * Mark the VMA as having unmapped its page so that 5395 * future faults in this VMA will fail rather than 5396 * looking like data was lost 5397 */ 5398 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 5399 } 5400 5401 pte = huge_ptep_get_and_clear(mm, address, ptep); 5402 tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 5403 if (huge_pte_dirty(pte)) 5404 set_page_dirty(page); 5405 /* Leave a uffd-wp pte marker if needed */ 5406 if (huge_pte_uffd_wp(pte) && 5407 !(zap_flags & ZAP_FLAG_DROP_MARKER)) 5408 set_huge_pte_at(mm, address, ptep, 5409 make_pte_marker(PTE_MARKER_UFFD_WP), 5410 sz); 5411 hugetlb_count_sub(pages_per_huge_page(h), mm); 5412 page_remove_rmap(page, vma, true); 5413 5414 spin_unlock(ptl); 5415 tlb_remove_page_size(tlb, page, huge_page_size(h)); 5416 /* 5417 * Bail out after unmapping reference page if supplied 5418 */ 5419 if (ref_page) 5420 break; 5421 } 5422 tlb_end_vma(tlb, vma); 5423 5424 /* 5425 * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We 5426 * could defer the flush until now, since by holding i_mmap_rwsem we 5427 * guaranteed that the last refernece would not be dropped. But we must 5428 * do the flushing before we return, as otherwise i_mmap_rwsem will be 5429 * dropped and the last reference to the shared PMDs page might be 5430 * dropped as well. 5431 * 5432 * In theory we could defer the freeing of the PMD pages as well, but 5433 * huge_pmd_unshare() relies on the exact page_count for the PMD page to 5434 * detect sharing, so we cannot defer the release of the page either. 5435 * Instead, do flush now. 
5436 */ 5437 if (force_flush) 5438 tlb_flush_mmu_tlbonly(tlb); 5439 } 5440 5441 void __hugetlb_zap_begin(struct vm_area_struct *vma, 5442 unsigned long *start, unsigned long *end) 5443 { 5444 if (!vma->vm_file) /* hugetlbfs_file_mmap error */ 5445 return; 5446 5447 adjust_range_if_pmd_sharing_possible(vma, start, end); 5448 hugetlb_vma_lock_write(vma); 5449 if (vma->vm_file) 5450 i_mmap_lock_write(vma->vm_file->f_mapping); 5451 } 5452 5453 void __hugetlb_zap_end(struct vm_area_struct *vma, 5454 struct zap_details *details) 5455 { 5456 zap_flags_t zap_flags = details ? details->zap_flags : 0; 5457 5458 if (!vma->vm_file) /* hugetlbfs_file_mmap error */ 5459 return; 5460 5461 if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ 5462 /* 5463 * Unlock and free the vma lock before releasing i_mmap_rwsem. 5464 * When the vma_lock is freed, this makes the vma ineligible 5465 * for pmd sharing. And, i_mmap_rwsem is required to set up 5466 * pmd sharing. This is important as page tables for this 5467 * unmapped range will be asynchrously deleted. If the page 5468 * tables are shared, there will be issues when accessed by 5469 * someone else. 5470 */ 5471 __hugetlb_vma_unlock_write_free(vma); 5472 } else { 5473 hugetlb_vma_unlock_write(vma); 5474 } 5475 5476 if (vma->vm_file) 5477 i_mmap_unlock_write(vma->vm_file->f_mapping); 5478 } 5479 5480 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 5481 unsigned long end, struct page *ref_page, 5482 zap_flags_t zap_flags) 5483 { 5484 struct mmu_notifier_range range; 5485 struct mmu_gather tlb; 5486 5487 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 5488 start, end); 5489 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 5490 mmu_notifier_invalidate_range_start(&range); 5491 tlb_gather_mmu(&tlb, vma->vm_mm); 5492 5493 __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); 5494 5495 mmu_notifier_invalidate_range_end(&range); 5496 tlb_finish_mmu(&tlb); 5497 } 5498 5499 /* 5500 * This is called when the original mapper is failing to COW a MAP_PRIVATE 5501 * mapping it owns the reserve page for. The intention is to unmap the page 5502 * from other VMAs and let the children be SIGKILLed if they are faulting the 5503 * same region. 5504 */ 5505 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 5506 struct page *page, unsigned long address) 5507 { 5508 struct hstate *h = hstate_vma(vma); 5509 struct vm_area_struct *iter_vma; 5510 struct address_space *mapping; 5511 pgoff_t pgoff; 5512 5513 /* 5514 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 5515 * from page cache lookup which is in HPAGE_SIZE units. 5516 */ 5517 address = address & huge_page_mask(h); 5518 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 5519 vma->vm_pgoff; 5520 mapping = vma->vm_file->f_mapping; 5521 5522 /* 5523 * Take the mapping lock for the duration of the table walk. As 5524 * this mapping should be shared between all the VMAs, 5525 * __unmap_hugepage_range() is called as the lock is already held 5526 */ 5527 i_mmap_lock_write(mapping); 5528 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 5529 /* Do not unmap the current VMA */ 5530 if (iter_vma == vma) 5531 continue; 5532 5533 /* 5534 * Shared VMAs have their own reserves and do not affect 5535 * MAP_PRIVATE accounting but it is possible that a shared 5536 * VMA is using the same page so check and skip such VMAs. 
5537 */ 5538 if (iter_vma->vm_flags & VM_MAYSHARE) 5539 continue; 5540 5541 /* 5542 * Unmap the page from other VMAs without their own reserves. 5543 * They get marked to be SIGKILLed if they fault in these 5544 * areas. This is because a future no-page fault on this VMA 5545 * could insert a zeroed page instead of the data existing 5546 * from the time of fork. This would look like data corruption 5547 */ 5548 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 5549 unmap_hugepage_range(iter_vma, address, 5550 address + huge_page_size(h), page, 0); 5551 } 5552 i_mmap_unlock_write(mapping); 5553 } 5554 5555 /* 5556 * hugetlb_wp() should be called with page lock of the original hugepage held. 5557 * Called with hugetlb_fault_mutex_table held and pte_page locked so we 5558 * cannot race with other handlers or page migration. 5559 * Keep the pte_same checks anyway to make transition from the mutex easier. 5560 */ 5561 static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, 5562 unsigned long address, pte_t *ptep, unsigned int flags, 5563 struct folio *pagecache_folio, spinlock_t *ptl) 5564 { 5565 const bool unshare = flags & FAULT_FLAG_UNSHARE; 5566 pte_t pte = huge_ptep_get(ptep); 5567 struct hstate *h = hstate_vma(vma); 5568 struct folio *old_folio; 5569 struct folio *new_folio; 5570 int outside_reserve = 0; 5571 vm_fault_t ret = 0; 5572 unsigned long haddr = address & huge_page_mask(h); 5573 struct mmu_notifier_range range; 5574 5575 /* 5576 * Never handle CoW for uffd-wp protected pages. It should be only 5577 * handled when the uffd-wp protection is removed. 5578 * 5579 * Note that only the CoW optimization path (in hugetlb_no_page()) 5580 * can trigger this, because hugetlb_fault() will always resolve 5581 * uffd-wp bit first. 5582 */ 5583 if (!unshare && huge_pte_uffd_wp(pte)) 5584 return 0; 5585 5586 /* 5587 * hugetlb does not support FOLL_FORCE-style write faults that keep the 5588 * PTE mapped R/O such as maybe_mkwrite() would do. 5589 */ 5590 if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) 5591 return VM_FAULT_SIGSEGV; 5592 5593 /* Let's take out MAP_SHARED mappings first. */ 5594 if (vma->vm_flags & VM_MAYSHARE) { 5595 set_huge_ptep_writable(vma, haddr, ptep); 5596 return 0; 5597 } 5598 5599 old_folio = page_folio(pte_page(pte)); 5600 5601 delayacct_wpcopy_start(); 5602 5603 retry_avoidcopy: 5604 /* 5605 * If no-one else is actually using this page, we're the exclusive 5606 * owner and can reuse this page. 5607 */ 5608 if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { 5609 if (!PageAnonExclusive(&old_folio->page)) 5610 page_move_anon_rmap(&old_folio->page, vma); 5611 if (likely(!unshare)) 5612 set_huge_ptep_writable(vma, haddr, ptep); 5613 5614 delayacct_wpcopy_end(); 5615 return 0; 5616 } 5617 VM_BUG_ON_PAGE(folio_test_anon(old_folio) && 5618 PageAnonExclusive(&old_folio->page), &old_folio->page); 5619 5620 /* 5621 * If the process that created a MAP_PRIVATE mapping is about to 5622 * perform a COW due to a shared page count, attempt to satisfy 5623 * the allocation without using the existing reserves. The pagecache 5624 * page is used to determine if the reserve at this address was 5625 * consumed or not. If reserves were used, a partial faulted mapping 5626 * at the time of fork() could consume its reserves on COW instead 5627 * of the full address range. 
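 * outside_reserve is passed to alloc_hugetlb_folio() below (cf. the
 * "Do not use reserve" call in copy_hugetlb_page_range()), so the copy
 * is allocated without consuming this VMA's remaining reservation.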
	 */
	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
			old_folio != pagecache_folio)
		outside_reserve = 1;

	folio_get(old_folio);

	/*
	 * Drop page table lock as buddy allocator may be called. It will
	 * be acquired again before returning to the caller, as expected.
	 */
	spin_unlock(ptl);
	new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);

	if (IS_ERR(new_folio)) {
		/*
		 * If a process owning a MAP_PRIVATE mapping fails to COW,
		 * it is due to references held by a child and an insufficient
		 * huge page pool. To guarantee the original mapper's
		 * reliability, unmap the page from child processes. The child
		 * may get SIGKILLed if it later faults.
		 */
		if (outside_reserve) {
			struct address_space *mapping = vma->vm_file->f_mapping;
			pgoff_t idx;
			u32 hash;

			folio_put(old_folio);
			/*
			 * Drop hugetlb_fault_mutex and vma_lock before
			 * unmapping.  Unmapping needs to hold vma_lock
			 * in write mode.  Dropping vma_lock in read mode
			 * here is OK as COW mappings do not interact with
			 * PMD sharing.
			 *
			 * Reacquire both after unmap operation.
			 */
			idx = vma_hugecache_offset(h, vma, haddr);
			hash = hugetlb_fault_mutex_hash(mapping, idx);
			hugetlb_vma_unlock_read(vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);

			unmap_ref_private(mm, vma, &old_folio->page, haddr);

			mutex_lock(&hugetlb_fault_mutex_table[hash]);
			hugetlb_vma_lock_read(vma);
			spin_lock(ptl);
			ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
			if (likely(ptep &&
				   pte_same(huge_ptep_get(ptep), pte)))
				goto retry_avoidcopy;
			/*
			 * A race occurred while re-acquiring the page table
			 * lock, and our job is done.
			 */
			delayacct_wpcopy_end();
			return 0;
		}

		ret = vmf_error(PTR_ERR(new_folio));
		goto out_release_old;
	}

	/*
	 * When the original hugepage is a shared one, it does not have
	 * anon_vma prepared.
5694 */ 5695 if (unlikely(anon_vma_prepare(vma))) { 5696 ret = VM_FAULT_OOM; 5697 goto out_release_all; 5698 } 5699 5700 if (copy_user_large_folio(new_folio, old_folio, address, vma)) { 5701 ret = VM_FAULT_HWPOISON_LARGE; 5702 goto out_release_all; 5703 } 5704 __folio_mark_uptodate(new_folio); 5705 5706 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, 5707 haddr + huge_page_size(h)); 5708 mmu_notifier_invalidate_range_start(&range); 5709 5710 /* 5711 * Retake the page table lock to check for racing updates 5712 * before the page tables are altered 5713 */ 5714 spin_lock(ptl); 5715 ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); 5716 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 5717 pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); 5718 5719 /* Break COW or unshare */ 5720 huge_ptep_clear_flush(vma, haddr, ptep); 5721 page_remove_rmap(&old_folio->page, vma, true); 5722 hugepage_add_new_anon_rmap(new_folio, vma, haddr); 5723 if (huge_pte_uffd_wp(pte)) 5724 newpte = huge_pte_mkuffd_wp(newpte); 5725 set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); 5726 folio_set_hugetlb_migratable(new_folio); 5727 /* Make the old page be freed below */ 5728 new_folio = old_folio; 5729 } 5730 spin_unlock(ptl); 5731 mmu_notifier_invalidate_range_end(&range); 5732 out_release_all: 5733 /* 5734 * No restore in case of successful pagetable update (Break COW or 5735 * unshare) 5736 */ 5737 if (new_folio != old_folio) 5738 restore_reserve_on_error(h, vma, haddr, new_folio); 5739 folio_put(new_folio); 5740 out_release_old: 5741 folio_put(old_folio); 5742 5743 spin_lock(ptl); /* Caller expects lock to be held */ 5744 5745 delayacct_wpcopy_end(); 5746 return ret; 5747 } 5748 5749 /* 5750 * Return whether there is a pagecache page to back given address within VMA. 5751 */ 5752 static bool hugetlbfs_pagecache_present(struct hstate *h, 5753 struct vm_area_struct *vma, unsigned long address) 5754 { 5755 struct address_space *mapping = vma->vm_file->f_mapping; 5756 pgoff_t idx = vma_hugecache_offset(h, vma, address); 5757 struct folio *folio; 5758 5759 folio = filemap_get_folio(mapping, idx); 5760 if (IS_ERR(folio)) 5761 return false; 5762 folio_put(folio); 5763 return true; 5764 } 5765 5766 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, 5767 pgoff_t idx) 5768 { 5769 struct inode *inode = mapping->host; 5770 struct hstate *h = hstate_inode(inode); 5771 int err; 5772 5773 __folio_set_locked(folio); 5774 err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); 5775 5776 if (unlikely(err)) { 5777 __folio_clear_locked(folio); 5778 return err; 5779 } 5780 folio_clear_hugetlb_restore_reserve(folio); 5781 5782 /* 5783 * mark folio dirty so that it will not be removed from cache/file 5784 * by non-hugetlbfs specific code paths. 5785 */ 5786 folio_mark_dirty(folio); 5787 5788 spin_lock(&inode->i_lock); 5789 inode->i_blocks += blocks_per_huge_page(h); 5790 spin_unlock(&inode->i_lock); 5791 return 0; 5792 } 5793 5794 static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, 5795 struct address_space *mapping, 5796 pgoff_t idx, 5797 unsigned int flags, 5798 unsigned long haddr, 5799 unsigned long addr, 5800 unsigned long reason) 5801 { 5802 u32 hash; 5803 struct vm_fault vmf = { 5804 .vma = vma, 5805 .address = haddr, 5806 .real_address = addr, 5807 .flags = flags, 5808 5809 /* 5810 * Hard to debug if it ends up being 5811 * used by a callee that assumes 5812 * something about the other 5813 * uninitialized fields... 
same as in 5814 * memory.c 5815 */ 5816 }; 5817 5818 /* 5819 * vma_lock and hugetlb_fault_mutex must be dropped before handling 5820 * userfault. Also mmap_lock could be dropped due to handling 5821 * userfault, any vma operation should be careful from here. 5822 */ 5823 hugetlb_vma_unlock_read(vma); 5824 hash = hugetlb_fault_mutex_hash(mapping, idx); 5825 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 5826 return handle_userfault(&vmf, reason); 5827 } 5828 5829 /* 5830 * Recheck pte with pgtable lock. Returns true if pte didn't change, or 5831 * false if pte changed or is changing. 5832 */ 5833 static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, 5834 pte_t *ptep, pte_t old_pte) 5835 { 5836 spinlock_t *ptl; 5837 bool same; 5838 5839 ptl = huge_pte_lock(h, mm, ptep); 5840 same = pte_same(huge_ptep_get(ptep), old_pte); 5841 spin_unlock(ptl); 5842 5843 return same; 5844 } 5845 5846 static vm_fault_t hugetlb_no_page(struct mm_struct *mm, 5847 struct vm_area_struct *vma, 5848 struct address_space *mapping, pgoff_t idx, 5849 unsigned long address, pte_t *ptep, 5850 pte_t old_pte, unsigned int flags) 5851 { 5852 struct hstate *h = hstate_vma(vma); 5853 vm_fault_t ret = VM_FAULT_SIGBUS; 5854 int anon_rmap = 0; 5855 unsigned long size; 5856 struct folio *folio; 5857 pte_t new_pte; 5858 spinlock_t *ptl; 5859 unsigned long haddr = address & huge_page_mask(h); 5860 bool new_folio, new_pagecache_folio = false; 5861 u32 hash = hugetlb_fault_mutex_hash(mapping, idx); 5862 5863 /* 5864 * Currently, we are forced to kill the process in the event the 5865 * original mapper has unmapped pages from the child due to a failed 5866 * COW/unsharing. Warn that such a situation has occurred as it may not 5867 * be obvious. 5868 */ 5869 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 5870 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 5871 current->pid); 5872 goto out; 5873 } 5874 5875 /* 5876 * Use page lock to guard against racing truncation 5877 * before we get page_table_lock. 5878 */ 5879 new_folio = false; 5880 folio = filemap_lock_folio(mapping, idx); 5881 if (IS_ERR(folio)) { 5882 size = i_size_read(mapping->host) >> huge_page_shift(h); 5883 if (idx >= size) 5884 goto out; 5885 /* Check for page in userfault range */ 5886 if (userfaultfd_missing(vma)) { 5887 /* 5888 * Since hugetlb_no_page() was examining pte 5889 * without pgtable lock, we need to re-test under 5890 * lock because the pte may not be stable and could 5891 * have changed from under us. Try to detect 5892 * either changed or during-changing ptes and retry 5893 * properly when needed. 5894 * 5895 * Note that userfaultfd is actually fine with 5896 * false positives (e.g. caused by pte changed), 5897 * but not wrong logical events (e.g. caused by 5898 * reading a pte during changing). The latter can 5899 * confuse the userspace, so the strictness is very 5900 * much preferred. E.g., MISSING event should 5901 * never happen on the page after UFFDIO_COPY has 5902 * correctly installed the page and returned. 5903 */ 5904 if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { 5905 ret = 0; 5906 goto out; 5907 } 5908 5909 return hugetlb_handle_userfault(vma, mapping, idx, flags, 5910 haddr, address, 5911 VM_UFFD_MISSING); 5912 } 5913 5914 folio = alloc_hugetlb_folio(vma, haddr, 0); 5915 if (IS_ERR(folio)) { 5916 /* 5917 * Returning error will result in faulting task being 5918 * sent SIGBUS. 
The hugetlb fault mutex prevents two 5919 * tasks from racing to fault in the same page which 5920 * could result in false unable to allocate errors. 5921 * Page migration does not take the fault mutex, but 5922 * does a clear then write of pte's under page table 5923 * lock. Page fault code could race with migration, 5924 * notice the clear pte and try to allocate a page 5925 * here. Before returning error, get ptl and make 5926 * sure there really is no pte entry. 5927 */ 5928 if (hugetlb_pte_stable(h, mm, ptep, old_pte)) 5929 ret = vmf_error(PTR_ERR(folio)); 5930 else 5931 ret = 0; 5932 goto out; 5933 } 5934 clear_huge_page(&folio->page, address, pages_per_huge_page(h)); 5935 __folio_mark_uptodate(folio); 5936 new_folio = true; 5937 5938 if (vma->vm_flags & VM_MAYSHARE) { 5939 int err = hugetlb_add_to_page_cache(folio, mapping, idx); 5940 if (err) { 5941 /* 5942 * err can't be -EEXIST which implies someone 5943 * else consumed the reservation since hugetlb 5944 * fault mutex is held when add a hugetlb page 5945 * to the page cache. So it's safe to call 5946 * restore_reserve_on_error() here. 5947 */ 5948 restore_reserve_on_error(h, vma, haddr, folio); 5949 folio_put(folio); 5950 goto out; 5951 } 5952 new_pagecache_folio = true; 5953 } else { 5954 folio_lock(folio); 5955 if (unlikely(anon_vma_prepare(vma))) { 5956 ret = VM_FAULT_OOM; 5957 goto backout_unlocked; 5958 } 5959 anon_rmap = 1; 5960 } 5961 } else { 5962 /* 5963 * If memory error occurs between mmap() and fault, some process 5964 * don't have hwpoisoned swap entry for errored virtual address. 5965 * So we need to block hugepage fault by PG_hwpoison bit check. 5966 */ 5967 if (unlikely(folio_test_hwpoison(folio))) { 5968 ret = VM_FAULT_HWPOISON_LARGE | 5969 VM_FAULT_SET_HINDEX(hstate_index(h)); 5970 goto backout_unlocked; 5971 } 5972 5973 /* Check for page in userfault range. */ 5974 if (userfaultfd_minor(vma)) { 5975 folio_unlock(folio); 5976 folio_put(folio); 5977 /* See comment in userfaultfd_missing() block above */ 5978 if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { 5979 ret = 0; 5980 goto out; 5981 } 5982 return hugetlb_handle_userfault(vma, mapping, idx, flags, 5983 haddr, address, 5984 VM_UFFD_MINOR); 5985 } 5986 } 5987 5988 /* 5989 * If we are going to COW a private mapping later, we examine the 5990 * pending reservations for this page now. This will ensure that 5991 * any allocations necessary to record that reservation occur outside 5992 * the spinlock. 5993 */ 5994 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 5995 if (vma_needs_reservation(h, vma, haddr) < 0) { 5996 ret = VM_FAULT_OOM; 5997 goto backout_unlocked; 5998 } 5999 /* Just decrements count, does not deallocate */ 6000 vma_end_reservation(h, vma, haddr); 6001 } 6002 6003 ptl = huge_pte_lock(h, mm, ptep); 6004 ret = 0; 6005 /* If pte changed from under us, retry */ 6006 if (!pte_same(huge_ptep_get(ptep), old_pte)) 6007 goto backout; 6008 6009 if (anon_rmap) 6010 hugepage_add_new_anon_rmap(folio, vma, haddr); 6011 else 6012 page_dup_file_rmap(&folio->page, true); 6013 new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) 6014 && (vma->vm_flags & VM_SHARED))); 6015 /* 6016 * If this pte was previously wr-protected, keep it wr-protected even 6017 * if populated. 
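	 *
	 * (old_pte may be a UFFD_WP pte marker that userfaultfd installed on a
	 * previously-none pte; the check below carries that write-protection
	 * over into the freshly installed mapping so it is not silently lost.)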
6018 */ 6019 if (unlikely(pte_marker_uffd_wp(old_pte))) 6020 new_pte = huge_pte_mkuffd_wp(new_pte); 6021 set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); 6022 6023 hugetlb_count_add(pages_per_huge_page(h), mm); 6024 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 6025 /* Optimization, do the COW without a second fault */ 6026 ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); 6027 } 6028 6029 spin_unlock(ptl); 6030 6031 /* 6032 * Only set hugetlb_migratable in newly allocated pages. Existing pages 6033 * found in the pagecache may not have hugetlb_migratable if they have 6034 * been isolated for migration. 6035 */ 6036 if (new_folio) 6037 folio_set_hugetlb_migratable(folio); 6038 6039 folio_unlock(folio); 6040 out: 6041 hugetlb_vma_unlock_read(vma); 6042 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 6043 return ret; 6044 6045 backout: 6046 spin_unlock(ptl); 6047 backout_unlocked: 6048 if (new_folio && !new_pagecache_folio) 6049 restore_reserve_on_error(h, vma, haddr, folio); 6050 6051 folio_unlock(folio); 6052 folio_put(folio); 6053 goto out; 6054 } 6055 6056 #ifdef CONFIG_SMP 6057 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 6058 { 6059 unsigned long key[2]; 6060 u32 hash; 6061 6062 key[0] = (unsigned long) mapping; 6063 key[1] = idx; 6064 6065 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); 6066 6067 return hash & (num_fault_mutexes - 1); 6068 } 6069 #else 6070 /* 6071 * For uniprocessor systems we always use a single mutex, so just 6072 * return 0 and avoid the hashing overhead. 6073 */ 6074 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 6075 { 6076 return 0; 6077 } 6078 #endif 6079 6080 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 6081 unsigned long address, unsigned int flags) 6082 { 6083 pte_t *ptep, entry; 6084 spinlock_t *ptl; 6085 vm_fault_t ret; 6086 u32 hash; 6087 pgoff_t idx; 6088 struct folio *folio = NULL; 6089 struct folio *pagecache_folio = NULL; 6090 struct hstate *h = hstate_vma(vma); 6091 struct address_space *mapping; 6092 int need_wait_lock = 0; 6093 unsigned long haddr = address & huge_page_mask(h); 6094 6095 /* TODO: Handle faults under the VMA lock */ 6096 if (flags & FAULT_FLAG_VMA_LOCK) { 6097 vma_end_read(vma); 6098 return VM_FAULT_RETRY; 6099 } 6100 6101 /* 6102 * Serialize hugepage allocation and instantiation, so that we don't 6103 * get spurious allocation failures if two CPUs race to instantiate 6104 * the same page in the page cache. 6105 */ 6106 mapping = vma->vm_file->f_mapping; 6107 idx = vma_hugecache_offset(h, vma, haddr); 6108 hash = hugetlb_fault_mutex_hash(mapping, idx); 6109 mutex_lock(&hugetlb_fault_mutex_table[hash]); 6110 6111 /* 6112 * Acquire vma lock before calling huge_pte_alloc and hold 6113 * until finished with ptep. This prevents huge_pmd_unshare from 6114 * being called elsewhere and making the ptep no longer valid. 
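	 *
	 * Roughly, the lock ordering on this path is: hugetlb fault mutex
	 * first, then the vma lock in read mode, then the page table lock
	 * (ptl) once the pte has been looked up; they are dropped in the
	 * reverse order, either below or inside hugetlb_no_page()/hugetlb_wp().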
	 */
	hugetlb_vma_lock_read(vma);
	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
	if (!ptep) {
		hugetlb_vma_unlock_read(vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		return VM_FAULT_OOM;
	}

	entry = huge_ptep_get(ptep);
	if (huge_pte_none_mostly(entry)) {
		if (is_pte_marker(entry)) {
			pte_marker marker =
				pte_marker_get(pte_to_swp_entry(entry));

			if (marker & PTE_MARKER_POISONED) {
				ret = VM_FAULT_HWPOISON_LARGE;
				goto out_mutex;
			}
		}

		/*
		 * Other PTE markers should be handled the same way as none PTE.
		 *
		 * hugetlb_no_page will drop vma lock and hugetlb fault
		 * mutex internally, which makes us return immediately.
		 */
		return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
				       entry, flags);
	}

	ret = 0;

	/*
	 * entry could be a migration/hwpoison entry at this point, so this
	 * check prevents the kernel from going below assuming that we have
	 * an active hugepage in pagecache.  This goto expects the 2nd page
	 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
	 * properly handle it.
	 */
	if (!pte_present(entry)) {
		if (unlikely(is_hugetlb_entry_migration(entry))) {
			/*
			 * Release the hugetlb fault lock now, but retain
			 * the vma lock, because it is needed to guard the
			 * huge_pte_lockptr() later in
			 * migration_entry_wait_huge().  The vma lock will
			 * be released there.
			 */
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			migration_entry_wait_huge(vma, ptep);
			return 0;
		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
			ret = VM_FAULT_HWPOISON_LARGE |
			      VM_FAULT_SET_HINDEX(hstate_index(h));
		goto out_mutex;
	}

	/*
	 * If we are going to COW/unshare the mapping later, we examine the
	 * pending reservations for this page now.  This will ensure that any
	 * allocations necessary to record that reservation occur outside the
	 * spinlock.  Also lookup the pagecache page now as it is used to
	 * determine if a reservation has been consumed.
6179 */ 6180 if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && 6181 !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { 6182 if (vma_needs_reservation(h, vma, haddr) < 0) { 6183 ret = VM_FAULT_OOM; 6184 goto out_mutex; 6185 } 6186 /* Just decrements count, does not deallocate */ 6187 vma_end_reservation(h, vma, haddr); 6188 6189 pagecache_folio = filemap_lock_folio(mapping, idx); 6190 if (IS_ERR(pagecache_folio)) 6191 pagecache_folio = NULL; 6192 } 6193 6194 ptl = huge_pte_lock(h, mm, ptep); 6195 6196 /* Check for a racing update before calling hugetlb_wp() */ 6197 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 6198 goto out_ptl; 6199 6200 /* Handle userfault-wp first, before trying to lock more pages */ 6201 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && 6202 (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 6203 struct vm_fault vmf = { 6204 .vma = vma, 6205 .address = haddr, 6206 .real_address = address, 6207 .flags = flags, 6208 }; 6209 6210 spin_unlock(ptl); 6211 if (pagecache_folio) { 6212 folio_unlock(pagecache_folio); 6213 folio_put(pagecache_folio); 6214 } 6215 hugetlb_vma_unlock_read(vma); 6216 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 6217 return handle_userfault(&vmf, VM_UFFD_WP); 6218 } 6219 6220 /* 6221 * hugetlb_wp() requires page locks of pte_page(entry) and 6222 * pagecache_folio, so here we need take the former one 6223 * when folio != pagecache_folio or !pagecache_folio. 6224 */ 6225 folio = page_folio(pte_page(entry)); 6226 if (folio != pagecache_folio) 6227 if (!folio_trylock(folio)) { 6228 need_wait_lock = 1; 6229 goto out_ptl; 6230 } 6231 6232 folio_get(folio); 6233 6234 if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { 6235 if (!huge_pte_write(entry)) { 6236 ret = hugetlb_wp(mm, vma, address, ptep, flags, 6237 pagecache_folio, ptl); 6238 goto out_put_page; 6239 } else if (likely(flags & FAULT_FLAG_WRITE)) { 6240 entry = huge_pte_mkdirty(entry); 6241 } 6242 } 6243 entry = pte_mkyoung(entry); 6244 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, 6245 flags & FAULT_FLAG_WRITE)) 6246 update_mmu_cache(vma, haddr, ptep); 6247 out_put_page: 6248 if (folio != pagecache_folio) 6249 folio_unlock(folio); 6250 folio_put(folio); 6251 out_ptl: 6252 spin_unlock(ptl); 6253 6254 if (pagecache_folio) { 6255 folio_unlock(pagecache_folio); 6256 folio_put(pagecache_folio); 6257 } 6258 out_mutex: 6259 hugetlb_vma_unlock_read(vma); 6260 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 6261 /* 6262 * Generally it's safe to hold refcount during waiting page lock. But 6263 * here we just wait to defer the next page fault to avoid busy loop and 6264 * the page is not used after unlocked before returning from the current 6265 * page fault. So we are safe from accessing freed page, even if we wait 6266 * here without taking refcount. 6267 */ 6268 if (need_wait_lock) 6269 folio_wait_locked(folio); 6270 return ret; 6271 } 6272 6273 #ifdef CONFIG_USERFAULTFD 6274 /* 6275 * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte 6276 * with modifications for hugetlb pages. 
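 *
 * Roughly three cases are handled below, mirroring the uffd_flags_t mode:
 * MFILL_ATOMIC_POISON installs a poison pte marker, MFILL_ATOMIC_CONTINUE
 * maps the folio that is already present in the page cache, and the copy
 * case allocates a hugetlb folio and copies from the user buffer, falling
 * back to a temporary folio (via *foliop) when the copy must be retried
 * outside mmap_lock.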
6277 */ 6278 int hugetlb_mfill_atomic_pte(pte_t *dst_pte, 6279 struct vm_area_struct *dst_vma, 6280 unsigned long dst_addr, 6281 unsigned long src_addr, 6282 uffd_flags_t flags, 6283 struct folio **foliop) 6284 { 6285 struct mm_struct *dst_mm = dst_vma->vm_mm; 6286 bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE); 6287 bool wp_enabled = (flags & MFILL_ATOMIC_WP); 6288 struct hstate *h = hstate_vma(dst_vma); 6289 struct address_space *mapping = dst_vma->vm_file->f_mapping; 6290 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); 6291 unsigned long size; 6292 int vm_shared = dst_vma->vm_flags & VM_SHARED; 6293 pte_t _dst_pte; 6294 spinlock_t *ptl; 6295 int ret = -ENOMEM; 6296 struct folio *folio; 6297 int writable; 6298 bool folio_in_pagecache = false; 6299 6300 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { 6301 ptl = huge_pte_lock(h, dst_mm, dst_pte); 6302 6303 /* Don't overwrite any existing PTEs (even markers) */ 6304 if (!huge_pte_none(huge_ptep_get(dst_pte))) { 6305 spin_unlock(ptl); 6306 return -EEXIST; 6307 } 6308 6309 _dst_pte = make_pte_marker(PTE_MARKER_POISONED); 6310 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, 6311 huge_page_size(h)); 6312 6313 /* No need to invalidate - it was non-present before */ 6314 update_mmu_cache(dst_vma, dst_addr, dst_pte); 6315 6316 spin_unlock(ptl); 6317 return 0; 6318 } 6319 6320 if (is_continue) { 6321 ret = -EFAULT; 6322 folio = filemap_lock_folio(mapping, idx); 6323 if (IS_ERR(folio)) 6324 goto out; 6325 folio_in_pagecache = true; 6326 } else if (!*foliop) { 6327 /* If a folio already exists, then it's UFFDIO_COPY for 6328 * a non-missing case. Return -EEXIST. 6329 */ 6330 if (vm_shared && 6331 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 6332 ret = -EEXIST; 6333 goto out; 6334 } 6335 6336 folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); 6337 if (IS_ERR(folio)) { 6338 ret = -ENOMEM; 6339 goto out; 6340 } 6341 6342 ret = copy_folio_from_user(folio, (const void __user *) src_addr, 6343 false); 6344 6345 /* fallback to copy_from_user outside mmap_lock */ 6346 if (unlikely(ret)) { 6347 ret = -ENOENT; 6348 /* Free the allocated folio which may have 6349 * consumed a reservation. 6350 */ 6351 restore_reserve_on_error(h, dst_vma, dst_addr, folio); 6352 folio_put(folio); 6353 6354 /* Allocate a temporary folio to hold the copied 6355 * contents. 6356 */ 6357 folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); 6358 if (!folio) { 6359 ret = -ENOMEM; 6360 goto out; 6361 } 6362 *foliop = folio; 6363 /* Set the outparam foliop and return to the caller to 6364 * copy the contents outside the lock. Don't free the 6365 * folio. 6366 */ 6367 goto out; 6368 } 6369 } else { 6370 if (vm_shared && 6371 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 6372 folio_put(*foliop); 6373 ret = -EEXIST; 6374 *foliop = NULL; 6375 goto out; 6376 } 6377 6378 folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); 6379 if (IS_ERR(folio)) { 6380 folio_put(*foliop); 6381 ret = -ENOMEM; 6382 *foliop = NULL; 6383 goto out; 6384 } 6385 ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); 6386 folio_put(*foliop); 6387 *foliop = NULL; 6388 if (ret) { 6389 folio_put(folio); 6390 goto out; 6391 } 6392 } 6393 6394 /* 6395 * The memory barrier inside __folio_mark_uptodate makes sure that 6396 * preceding stores to the page contents become visible before 6397 * the set_pte_at() write. 6398 */ 6399 __folio_mark_uptodate(folio); 6400 6401 /* Add shared, newly allocated pages to the page cache. 
*/ 6402 if (vm_shared && !is_continue) { 6403 size = i_size_read(mapping->host) >> huge_page_shift(h); 6404 ret = -EFAULT; 6405 if (idx >= size) 6406 goto out_release_nounlock; 6407 6408 /* 6409 * Serialization between remove_inode_hugepages() and 6410 * hugetlb_add_to_page_cache() below happens through the 6411 * hugetlb_fault_mutex_table that here must be hold by 6412 * the caller. 6413 */ 6414 ret = hugetlb_add_to_page_cache(folio, mapping, idx); 6415 if (ret) 6416 goto out_release_nounlock; 6417 folio_in_pagecache = true; 6418 } 6419 6420 ptl = huge_pte_lock(h, dst_mm, dst_pte); 6421 6422 ret = -EIO; 6423 if (folio_test_hwpoison(folio)) 6424 goto out_release_unlock; 6425 6426 /* 6427 * We allow to overwrite a pte marker: consider when both MISSING|WP 6428 * registered, we firstly wr-protect a none pte which has no page cache 6429 * page backing it, then access the page. 6430 */ 6431 ret = -EEXIST; 6432 if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) 6433 goto out_release_unlock; 6434 6435 if (folio_in_pagecache) 6436 page_dup_file_rmap(&folio->page, true); 6437 else 6438 hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); 6439 6440 /* 6441 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY 6442 * with wp flag set, don't set pte write bit. 6443 */ 6444 if (wp_enabled || (is_continue && !vm_shared)) 6445 writable = 0; 6446 else 6447 writable = dst_vma->vm_flags & VM_WRITE; 6448 6449 _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); 6450 /* 6451 * Always mark UFFDIO_COPY page dirty; note that this may not be 6452 * extremely important for hugetlbfs for now since swapping is not 6453 * supported, but we should still be clear in that this page cannot be 6454 * thrown away at will, even if write bit not set. 6455 */ 6456 _dst_pte = huge_pte_mkdirty(_dst_pte); 6457 _dst_pte = pte_mkyoung(_dst_pte); 6458 6459 if (wp_enabled) 6460 _dst_pte = huge_pte_mkuffd_wp(_dst_pte); 6461 6462 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h)); 6463 6464 hugetlb_count_add(pages_per_huge_page(h), dst_mm); 6465 6466 /* No need to invalidate - it was non-present before */ 6467 update_mmu_cache(dst_vma, dst_addr, dst_pte); 6468 6469 spin_unlock(ptl); 6470 if (!is_continue) 6471 folio_set_hugetlb_migratable(folio); 6472 if (vm_shared || is_continue) 6473 folio_unlock(folio); 6474 ret = 0; 6475 out: 6476 return ret; 6477 out_release_unlock: 6478 spin_unlock(ptl); 6479 if (vm_shared || is_continue) 6480 folio_unlock(folio); 6481 out_release_nounlock: 6482 if (!folio_in_pagecache) 6483 restore_reserve_on_error(h, dst_vma, dst_addr, folio); 6484 folio_put(folio); 6485 goto out; 6486 } 6487 #endif /* CONFIG_USERFAULTFD */ 6488 6489 struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, 6490 unsigned long address, unsigned int flags, 6491 unsigned int *page_mask) 6492 { 6493 struct hstate *h = hstate_vma(vma); 6494 struct mm_struct *mm = vma->vm_mm; 6495 unsigned long haddr = address & huge_page_mask(h); 6496 struct page *page = NULL; 6497 spinlock_t *ptl; 6498 pte_t *pte, entry; 6499 int ret; 6500 6501 hugetlb_vma_lock_read(vma); 6502 pte = hugetlb_walk(vma, haddr, huge_page_size(h)); 6503 if (!pte) 6504 goto out_unlock; 6505 6506 ptl = huge_pte_lock(h, mm, pte); 6507 entry = huge_ptep_get(pte); 6508 if (pte_present(entry)) { 6509 page = pte_page(entry); 6510 6511 if (!huge_pte_write(entry)) { 6512 if (flags & FOLL_WRITE) { 6513 page = NULL; 6514 goto out; 6515 } 6516 6517 if (gup_must_unshare(vma, flags, page)) { 6518 /* Tell the caller to do unsharing */ 
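				 * by returning ERR_PTR(-EMLINK); GUP is then
				 * expected to retry this access with
				 * FAULT_FLAG_UNSHARE so the page is unshared
				 * before it can be pinned.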
6519 page = ERR_PTR(-EMLINK); 6520 goto out; 6521 } 6522 } 6523 6524 page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); 6525 6526 /* 6527 * Note that page may be a sub-page, and with vmemmap 6528 * optimizations the page struct may be read only. 6529 * try_grab_page() will increase the ref count on the 6530 * head page, so this will be OK. 6531 * 6532 * try_grab_page() should always be able to get the page here, 6533 * because we hold the ptl lock and have verified pte_present(). 6534 */ 6535 ret = try_grab_folio(page_folio(page), 1, flags); 6536 6537 if (WARN_ON_ONCE(ret)) { 6538 page = ERR_PTR(ret); 6539 goto out; 6540 } 6541 6542 *page_mask = (1U << huge_page_order(h)) - 1; 6543 } 6544 out: 6545 spin_unlock(ptl); 6546 out_unlock: 6547 hugetlb_vma_unlock_read(vma); 6548 6549 /* 6550 * Fixup retval for dump requests: if pagecache doesn't exist, 6551 * don't try to allocate a new page but just skip it. 6552 */ 6553 if (!page && (flags & FOLL_DUMP) && 6554 !hugetlbfs_pagecache_present(h, vma, address)) 6555 page = ERR_PTR(-EFAULT); 6556 6557 return page; 6558 } 6559 6560 long hugetlb_change_protection(struct vm_area_struct *vma, 6561 unsigned long address, unsigned long end, 6562 pgprot_t newprot, unsigned long cp_flags) 6563 { 6564 struct mm_struct *mm = vma->vm_mm; 6565 unsigned long start = address; 6566 pte_t *ptep; 6567 pte_t pte; 6568 struct hstate *h = hstate_vma(vma); 6569 long pages = 0, psize = huge_page_size(h); 6570 bool shared_pmd = false; 6571 struct mmu_notifier_range range; 6572 unsigned long last_addr_mask; 6573 bool uffd_wp = cp_flags & MM_CP_UFFD_WP; 6574 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; 6575 6576 /* 6577 * In the case of shared PMDs, the area to flush could be beyond 6578 * start/end. Set range.start/range.end to cover the maximum possible 6579 * range if PMD sharing is possible. 6580 */ 6581 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 6582 0, mm, start, end); 6583 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 6584 6585 BUG_ON(address >= end); 6586 flush_cache_range(vma, range.start, range.end); 6587 6588 mmu_notifier_invalidate_range_start(&range); 6589 hugetlb_vma_lock_write(vma); 6590 i_mmap_lock_write(vma->vm_file->f_mapping); 6591 last_addr_mask = hugetlb_mask_last_page(h); 6592 for (; address < end; address += psize) { 6593 spinlock_t *ptl; 6594 ptep = hugetlb_walk(vma, address, psize); 6595 if (!ptep) { 6596 if (!uffd_wp) { 6597 address |= last_addr_mask; 6598 continue; 6599 } 6600 /* 6601 * Userfaultfd wr-protect requires pgtable 6602 * pre-allocations to install pte markers. 6603 */ 6604 ptep = huge_pte_alloc(mm, vma, address, psize); 6605 if (!ptep) { 6606 pages = -ENOMEM; 6607 break; 6608 } 6609 } 6610 ptl = huge_pte_lock(h, mm, ptep); 6611 if (huge_pmd_unshare(mm, vma, address, ptep)) { 6612 /* 6613 * When uffd-wp is enabled on the vma, unshare 6614 * shouldn't happen at all. Warn about it if it 6615 * happened due to some reason. 6616 */ 6617 WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); 6618 pages++; 6619 spin_unlock(ptl); 6620 shared_pmd = true; 6621 address |= last_addr_mask; 6622 continue; 6623 } 6624 pte = huge_ptep_get(ptep); 6625 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 6626 /* Nothing to do. 
			 */
		} else if (unlikely(is_hugetlb_entry_migration(pte))) {
			swp_entry_t entry = pte_to_swp_entry(pte);
			struct page *page = pfn_swap_entry_to_page(entry);
			pte_t newpte = pte;

			if (is_writable_migration_entry(entry)) {
				if (PageAnon(page))
					entry = make_readable_exclusive_migration_entry(
								swp_offset(entry));
				else
					entry = make_readable_migration_entry(
								swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				pages++;
			}

			if (uffd_wp)
				newpte = pte_swp_mkuffd_wp(newpte);
			else if (uffd_wp_resolve)
				newpte = pte_swp_clear_uffd_wp(newpte);
			if (!pte_same(pte, newpte))
				set_huge_pte_at(mm, address, ptep, newpte, psize);
		} else if (unlikely(is_pte_marker(pte))) {
			/*
			 * Do nothing on a poison marker; page is
			 * corrupted, permissions do not apply.  Here
			 * pte_marker_uffd_wp()==true implies !poison
			 * because they're mutually exclusive.
			 */
			if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
				/* Safe to modify directly (non-present->none). */
				huge_pte_clear(mm, address, ptep, psize);
		} else if (!huge_pte_none(pte)) {
			pte_t old_pte;
			unsigned int shift = huge_page_shift(hstate_vma(vma));

			old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
			pte = huge_pte_modify(old_pte, newprot);
			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
			if (uffd_wp)
				pte = huge_pte_mkuffd_wp(pte);
			else if (uffd_wp_resolve)
				pte = huge_pte_clear_uffd_wp(pte);
			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
			pages++;
		} else {
			/* None pte */
			if (unlikely(uffd_wp))
				/* Safe to modify directly (none->non-present). */
				set_huge_pte_at(mm, address, ptep,
						make_pte_marker(PTE_MARKER_UFFD_WP),
						psize);
		}
		spin_unlock(ptl);
	}
	/*
	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
	 * may have cleared our pud entry and done put_page on the page table:
	 * once we release i_mmap_rwsem, another task can do the final put_page
	 * and that page table be reused and filled with junk.  If we actually
	 * did unshare a page of pmds, flush the range corresponding to the pud.
	 */
	if (shared_pmd)
		flush_hugetlb_tlb_range(vma, range.start, range.end);
	else
		flush_hugetlb_tlb_range(vma, start, end);
	/*
	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), as we
	 * are downgrading page table protection, not changing it to point to a
	 * new page.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	hugetlb_vma_unlock_write(vma);
	mmu_notifier_invalidate_range_end(&range);

	return pages > 0 ? (pages << h->order) : pages;
}

/* Return true if reservation was successful, false otherwise.
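 * ('from' and 'to' are offsets in units of huge pages within the mapping,
 * so 'to - from' is the number of huge pages to reserve; callers such as
 * hugetlbfs convert byte lengths with huge_page_shift() before calling in.)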
*/ 6708 bool hugetlb_reserve_pages(struct inode *inode, 6709 long from, long to, 6710 struct vm_area_struct *vma, 6711 vm_flags_t vm_flags) 6712 { 6713 long chg = -1, add = -1; 6714 struct hstate *h = hstate_inode(inode); 6715 struct hugepage_subpool *spool = subpool_inode(inode); 6716 struct resv_map *resv_map; 6717 struct hugetlb_cgroup *h_cg = NULL; 6718 long gbl_reserve, regions_needed = 0; 6719 6720 /* This should never happen */ 6721 if (from > to) { 6722 VM_WARN(1, "%s called with a negative range\n", __func__); 6723 return false; 6724 } 6725 6726 /* 6727 * vma specific semaphore used for pmd sharing and fault/truncation 6728 * synchronization 6729 */ 6730 hugetlb_vma_lock_alloc(vma); 6731 6732 /* 6733 * Only apply hugepage reservation if asked. At fault time, an 6734 * attempt will be made for VM_NORESERVE to allocate a page 6735 * without using reserves 6736 */ 6737 if (vm_flags & VM_NORESERVE) 6738 return true; 6739 6740 /* 6741 * Shared mappings base their reservation on the number of pages that 6742 * are already allocated on behalf of the file. Private mappings need 6743 * to reserve the full area even if read-only as mprotect() may be 6744 * called to make the mapping read-write. Assume !vma is a shm mapping 6745 */ 6746 if (!vma || vma->vm_flags & VM_MAYSHARE) { 6747 /* 6748 * resv_map can not be NULL as hugetlb_reserve_pages is only 6749 * called for inodes for which resv_maps were created (see 6750 * hugetlbfs_get_inode). 6751 */ 6752 resv_map = inode_resv_map(inode); 6753 6754 chg = region_chg(resv_map, from, to, ®ions_needed); 6755 } else { 6756 /* Private mapping. */ 6757 resv_map = resv_map_alloc(); 6758 if (!resv_map) 6759 goto out_err; 6760 6761 chg = to - from; 6762 6763 set_vma_resv_map(vma, resv_map); 6764 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 6765 } 6766 6767 if (chg < 0) 6768 goto out_err; 6769 6770 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), 6771 chg * pages_per_huge_page(h), &h_cg) < 0) 6772 goto out_err; 6773 6774 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 6775 /* For private mappings, the hugetlb_cgroup uncharge info hangs 6776 * of the resv_map. 6777 */ 6778 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 6779 } 6780 6781 /* 6782 * There must be enough pages in the subpool for the mapping. If 6783 * the subpool has a minimum size, there may be some global 6784 * reservations already in place (gbl_reserve). 6785 */ 6786 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 6787 if (gbl_reserve < 0) 6788 goto out_uncharge_cgroup; 6789 6790 /* 6791 * Check enough hugepages are available for the reservation. 6792 * Hand the pages back to the subpool if there are not 6793 */ 6794 if (hugetlb_acct_memory(h, gbl_reserve) < 0) 6795 goto out_put_pages; 6796 6797 /* 6798 * Account for the reservations made. Shared mappings record regions 6799 * that have reservations as they are shared by multiple VMAs. 6800 * When the last VMA disappears, the region map says how much 6801 * the reservation was and the page cache tells how much of 6802 * the reservation was consumed. Private mappings are per-VMA and 6803 * only the consumed reservations are tracked. When the VMA 6804 * disappears, the original reservation is the VMA size and the 6805 * consumed reservations are stored in the map. 
Hence, nothing 6806 * else has to be done for private mappings here 6807 */ 6808 if (!vma || vma->vm_flags & VM_MAYSHARE) { 6809 add = region_add(resv_map, from, to, regions_needed, h, h_cg); 6810 6811 if (unlikely(add < 0)) { 6812 hugetlb_acct_memory(h, -gbl_reserve); 6813 goto out_put_pages; 6814 } else if (unlikely(chg > add)) { 6815 /* 6816 * pages in this range were added to the reserve 6817 * map between region_chg and region_add. This 6818 * indicates a race with alloc_hugetlb_folio. Adjust 6819 * the subpool and reserve counts modified above 6820 * based on the difference. 6821 */ 6822 long rsv_adjust; 6823 6824 /* 6825 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the 6826 * reference to h_cg->css. See comment below for detail. 6827 */ 6828 hugetlb_cgroup_uncharge_cgroup_rsvd( 6829 hstate_index(h), 6830 (chg - add) * pages_per_huge_page(h), h_cg); 6831 6832 rsv_adjust = hugepage_subpool_put_pages(spool, 6833 chg - add); 6834 hugetlb_acct_memory(h, -rsv_adjust); 6835 } else if (h_cg) { 6836 /* 6837 * The file_regions will hold their own reference to 6838 * h_cg->css. So we should release the reference held 6839 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are 6840 * done. 6841 */ 6842 hugetlb_cgroup_put_rsvd_cgroup(h_cg); 6843 } 6844 } 6845 return true; 6846 6847 out_put_pages: 6848 /* put back original number of pages, chg */ 6849 (void)hugepage_subpool_put_pages(spool, chg); 6850 out_uncharge_cgroup: 6851 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 6852 chg * pages_per_huge_page(h), h_cg); 6853 out_err: 6854 hugetlb_vma_lock_free(vma); 6855 if (!vma || vma->vm_flags & VM_MAYSHARE) 6856 /* Only call region_abort if the region_chg succeeded but the 6857 * region_add failed or didn't run. 6858 */ 6859 if (chg >= 0 && add < 0) 6860 region_abort(resv_map, from, to, regions_needed); 6861 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 6862 kref_put(&resv_map->refs, resv_map_release); 6863 set_vma_resv_map(vma, NULL); 6864 } 6865 return false; 6866 } 6867 6868 long hugetlb_unreserve_pages(struct inode *inode, long start, long end, 6869 long freed) 6870 { 6871 struct hstate *h = hstate_inode(inode); 6872 struct resv_map *resv_map = inode_resv_map(inode); 6873 long chg = 0; 6874 struct hugepage_subpool *spool = subpool_inode(inode); 6875 long gbl_reserve; 6876 6877 /* 6878 * Since this routine can be called in the evict inode path for all 6879 * hugetlbfs inodes, resv_map could be NULL. 6880 */ 6881 if (resv_map) { 6882 chg = region_del(resv_map, start, end); 6883 /* 6884 * region_del() can fail in the rare case where a region 6885 * must be split and another region descriptor can not be 6886 * allocated. If end == LONG_MAX, it will not fail. 6887 */ 6888 if (chg < 0) 6889 return chg; 6890 } 6891 6892 spin_lock(&inode->i_lock); 6893 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 6894 spin_unlock(&inode->i_lock); 6895 6896 /* 6897 * If the subpool has a minimum size, the number of global 6898 * reservations to be released may be adjusted. 6899 * 6900 * Note that !resv_map implies freed == 0. So (chg - freed) 6901 * won't go negative. 
6902 */ 6903 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 6904 hugetlb_acct_memory(h, -gbl_reserve); 6905 6906 return 0; 6907 } 6908 6909 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 6910 static unsigned long page_table_shareable(struct vm_area_struct *svma, 6911 struct vm_area_struct *vma, 6912 unsigned long addr, pgoff_t idx) 6913 { 6914 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 6915 svma->vm_start; 6916 unsigned long sbase = saddr & PUD_MASK; 6917 unsigned long s_end = sbase + PUD_SIZE; 6918 6919 /* Allow segments to share if only one is marked locked */ 6920 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; 6921 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; 6922 6923 /* 6924 * match the virtual addresses, permission and the alignment of the 6925 * page table page. 6926 * 6927 * Also, vma_lock (vm_private_data) is required for sharing. 6928 */ 6929 if (pmd_index(addr) != pmd_index(saddr) || 6930 vm_flags != svm_flags || 6931 !range_in_vma(svma, sbase, s_end) || 6932 !svma->vm_private_data) 6933 return 0; 6934 6935 return saddr; 6936 } 6937 6938 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) 6939 { 6940 unsigned long start = addr & PUD_MASK; 6941 unsigned long end = start + PUD_SIZE; 6942 6943 #ifdef CONFIG_USERFAULTFD 6944 if (uffd_disable_huge_pmd_share(vma)) 6945 return false; 6946 #endif 6947 /* 6948 * check on proper vm_flags and page table alignment 6949 */ 6950 if (!(vma->vm_flags & VM_MAYSHARE)) 6951 return false; 6952 if (!vma->vm_private_data) /* vma lock required for sharing */ 6953 return false; 6954 if (!range_in_vma(vma, start, end)) 6955 return false; 6956 return true; 6957 } 6958 6959 /* 6960 * Determine if start,end range within vma could be mapped by shared pmd. 6961 * If yes, adjust start and end to cover range associated with possible 6962 * shared pmd mappings. 6963 */ 6964 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 6965 unsigned long *start, unsigned long *end) 6966 { 6967 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), 6968 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); 6969 6970 /* 6971 * vma needs to span at least one aligned PUD size, and the range 6972 * must be at least partially within in. 6973 */ 6974 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || 6975 (*end <= v_start) || (*start >= v_end)) 6976 return; 6977 6978 /* Extend the range to be PUD aligned for a worst case scenario */ 6979 if (*start > v_start) 6980 *start = ALIGN_DOWN(*start, PUD_SIZE); 6981 6982 if (*end < v_end) 6983 *end = ALIGN(*end, PUD_SIZE); 6984 } 6985 6986 /* 6987 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 6988 * and returns the corresponding pte. While this is not necessary for the 6989 * !shared pmd case because we can allocate the pmd later as well, it makes the 6990 * code much cleaner. pmd allocation is essential for the shared case because 6991 * pud has to be populated inside the same i_mmap_rwsem section - otherwise 6992 * racing tasks could either miss the sharing (see huge_pte_offset) or select a 6993 * bad pmd for sharing. 
6994 */ 6995 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, 6996 unsigned long addr, pud_t *pud) 6997 { 6998 struct address_space *mapping = vma->vm_file->f_mapping; 6999 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 7000 vma->vm_pgoff; 7001 struct vm_area_struct *svma; 7002 unsigned long saddr; 7003 pte_t *spte = NULL; 7004 pte_t *pte; 7005 7006 i_mmap_lock_read(mapping); 7007 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 7008 if (svma == vma) 7009 continue; 7010 7011 saddr = page_table_shareable(svma, vma, addr, idx); 7012 if (saddr) { 7013 spte = hugetlb_walk(svma, saddr, 7014 vma_mmu_pagesize(svma)); 7015 if (spte) { 7016 get_page(virt_to_page(spte)); 7017 break; 7018 } 7019 } 7020 } 7021 7022 if (!spte) 7023 goto out; 7024 7025 spin_lock(&mm->page_table_lock); 7026 if (pud_none(*pud)) { 7027 pud_populate(mm, pud, 7028 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 7029 mm_inc_nr_pmds(mm); 7030 } else { 7031 put_page(virt_to_page(spte)); 7032 } 7033 spin_unlock(&mm->page_table_lock); 7034 out: 7035 pte = (pte_t *)pmd_alloc(mm, pud, addr); 7036 i_mmap_unlock_read(mapping); 7037 return pte; 7038 } 7039 7040 /* 7041 * unmap huge page backed by shared pte. 7042 * 7043 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 7044 * indicated by page_count > 1, unmap is achieved by clearing pud and 7045 * decrementing the ref count. If count == 1, the pte page is not shared. 7046 * 7047 * Called with page table lock held. 7048 * 7049 * returns: 1 successfully unmapped a shared pte page 7050 * 0 the underlying pte page is not shared, or it is the last user 7051 */ 7052 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 7053 unsigned long addr, pte_t *ptep) 7054 { 7055 pgd_t *pgd = pgd_offset(mm, addr); 7056 p4d_t *p4d = p4d_offset(pgd, addr); 7057 pud_t *pud = pud_offset(p4d, addr); 7058 7059 i_mmap_assert_write_locked(vma->vm_file->f_mapping); 7060 hugetlb_vma_assert_locked(vma); 7061 BUG_ON(page_count(virt_to_page(ptep)) == 0); 7062 if (page_count(virt_to_page(ptep)) == 1) 7063 return 0; 7064 7065 pud_clear(pud); 7066 put_page(virt_to_page(ptep)); 7067 mm_dec_nr_pmds(mm); 7068 return 1; 7069 } 7070 7071 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 7072 7073 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, 7074 unsigned long addr, pud_t *pud) 7075 { 7076 return NULL; 7077 } 7078 7079 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 7080 unsigned long addr, pte_t *ptep) 7081 { 7082 return 0; 7083 } 7084 7085 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 7086 unsigned long *start, unsigned long *end) 7087 { 7088 } 7089 7090 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) 7091 { 7092 return false; 7093 } 7094 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 7095 7096 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 7097 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 7098 unsigned long addr, unsigned long sz) 7099 { 7100 pgd_t *pgd; 7101 p4d_t *p4d; 7102 pud_t *pud; 7103 pte_t *pte = NULL; 7104 7105 pgd = pgd_offset(mm, addr); 7106 p4d = p4d_alloc(mm, pgd, addr); 7107 if (!p4d) 7108 return NULL; 7109 pud = pud_alloc(mm, p4d, addr); 7110 if (pud) { 7111 if (sz == PUD_SIZE) { 7112 pte = (pte_t *)pud; 7113 } else { 7114 BUG_ON(sz != PMD_SIZE); 7115 if (want_pmd_share(vma, addr) && pud_none(*pud)) 7116 pte = huge_pmd_share(mm, vma, addr, pud); 7117 else 7118 pte = (pte_t *)pmd_alloc(mm, pud, addr); 7119 } 7120 } 
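	/*
	 * Any entry already installed at this address must be either
	 * non-present or a huge (leaf) entry; a present non-huge entry here
	 * would indicate corrupted page tables for this hugetlb mapping,
	 * hence the BUG_ON() below.
	 */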
7121 7122 if (pte) { 7123 pte_t pteval = ptep_get_lockless(pte); 7124 7125 BUG_ON(pte_present(pteval) && !pte_huge(pteval)); 7126 } 7127 7128 return pte; 7129 } 7130 7131 /* 7132 * huge_pte_offset() - Walk the page table to resolve the hugepage 7133 * entry at address @addr 7134 * 7135 * Return: Pointer to page table entry (PUD or PMD) for 7136 * address @addr, or NULL if a !p*d_present() entry is encountered and the 7137 * size @sz doesn't match the hugepage size at this level of the page 7138 * table. 7139 */ 7140 pte_t *huge_pte_offset(struct mm_struct *mm, 7141 unsigned long addr, unsigned long sz) 7142 { 7143 pgd_t *pgd; 7144 p4d_t *p4d; 7145 pud_t *pud; 7146 pmd_t *pmd; 7147 7148 pgd = pgd_offset(mm, addr); 7149 if (!pgd_present(*pgd)) 7150 return NULL; 7151 p4d = p4d_offset(pgd, addr); 7152 if (!p4d_present(*p4d)) 7153 return NULL; 7154 7155 pud = pud_offset(p4d, addr); 7156 if (sz == PUD_SIZE) 7157 /* must be pud huge, non-present or none */ 7158 return (pte_t *)pud; 7159 if (!pud_present(*pud)) 7160 return NULL; 7161 /* must have a valid entry and size to go further */ 7162 7163 pmd = pmd_offset(pud, addr); 7164 /* must be pmd huge, non-present or none */ 7165 return (pte_t *)pmd; 7166 } 7167 7168 /* 7169 * Return a mask that can be used to update an address to the last huge 7170 * page in a page table page mapping size. Used to skip non-present 7171 * page table entries when linearly scanning address ranges. Architectures 7172 * with unique huge page to page table relationships can define their own 7173 * version of this routine. 7174 */ 7175 unsigned long hugetlb_mask_last_page(struct hstate *h) 7176 { 7177 unsigned long hp_size = huge_page_size(h); 7178 7179 if (hp_size == PUD_SIZE) 7180 return P4D_SIZE - PUD_SIZE; 7181 else if (hp_size == PMD_SIZE) 7182 return PUD_SIZE - PMD_SIZE; 7183 else 7184 return 0UL; 7185 } 7186 7187 #else 7188 7189 /* See description above. Architectures can provide their own version. */ 7190 __weak unsigned long hugetlb_mask_last_page(struct hstate *h) 7191 { 7192 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 7193 if (huge_page_size(h) == PMD_SIZE) 7194 return PUD_SIZE - PMD_SIZE; 7195 #endif 7196 return 0UL; 7197 } 7198 7199 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 7200 7201 /* 7202 * These functions are overwritable if your architecture needs its own 7203 * behavior. 
7204 */ 7205 bool isolate_hugetlb(struct folio *folio, struct list_head *list) 7206 { 7207 bool ret = true; 7208 7209 spin_lock_irq(&hugetlb_lock); 7210 if (!folio_test_hugetlb(folio) || 7211 !folio_test_hugetlb_migratable(folio) || 7212 !folio_try_get(folio)) { 7213 ret = false; 7214 goto unlock; 7215 } 7216 folio_clear_hugetlb_migratable(folio); 7217 list_move_tail(&folio->lru, list); 7218 unlock: 7219 spin_unlock_irq(&hugetlb_lock); 7220 return ret; 7221 } 7222 7223 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) 7224 { 7225 int ret = 0; 7226 7227 *hugetlb = false; 7228 spin_lock_irq(&hugetlb_lock); 7229 if (folio_test_hugetlb(folio)) { 7230 *hugetlb = true; 7231 if (folio_test_hugetlb_freed(folio)) 7232 ret = 0; 7233 else if (folio_test_hugetlb_migratable(folio) || unpoison) 7234 ret = folio_try_get(folio); 7235 else 7236 ret = -EBUSY; 7237 } 7238 spin_unlock_irq(&hugetlb_lock); 7239 return ret; 7240 } 7241 7242 int get_huge_page_for_hwpoison(unsigned long pfn, int flags, 7243 bool *migratable_cleared) 7244 { 7245 int ret; 7246 7247 spin_lock_irq(&hugetlb_lock); 7248 ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); 7249 spin_unlock_irq(&hugetlb_lock); 7250 return ret; 7251 } 7252 7253 void folio_putback_active_hugetlb(struct folio *folio) 7254 { 7255 spin_lock_irq(&hugetlb_lock); 7256 folio_set_hugetlb_migratable(folio); 7257 list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); 7258 spin_unlock_irq(&hugetlb_lock); 7259 folio_put(folio); 7260 } 7261 7262 void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) 7263 { 7264 struct hstate *h = folio_hstate(old_folio); 7265 7266 hugetlb_cgroup_migrate(old_folio, new_folio); 7267 set_page_owner_migrate_reason(&new_folio->page, reason); 7268 7269 /* 7270 * transfer temporary state of the new hugetlb folio. This is 7271 * reverse to other transitions because the newpage is going to 7272 * be final while the old one will be freed so it takes over 7273 * the temporary status. 7274 * 7275 * Also note that we have to transfer the per-node surplus state 7276 * here as well otherwise the global surplus count will not match 7277 * the per-node's. 7278 */ 7279 if (folio_test_hugetlb_temporary(new_folio)) { 7280 int old_nid = folio_nid(old_folio); 7281 int new_nid = folio_nid(new_folio); 7282 7283 folio_set_hugetlb_temporary(old_folio); 7284 folio_clear_hugetlb_temporary(new_folio); 7285 7286 7287 /* 7288 * There is no need to transfer the per-node surplus state 7289 * when we do not cross the node. 7290 */ 7291 if (new_nid == old_nid) 7292 return; 7293 spin_lock_irq(&hugetlb_lock); 7294 if (h->surplus_huge_pages_node[old_nid]) { 7295 h->surplus_huge_pages_node[old_nid]--; 7296 h->surplus_huge_pages_node[new_nid]++; 7297 } 7298 spin_unlock_irq(&hugetlb_lock); 7299 } 7300 } 7301 7302 static void hugetlb_unshare_pmds(struct vm_area_struct *vma, 7303 unsigned long start, 7304 unsigned long end) 7305 { 7306 struct hstate *h = hstate_vma(vma); 7307 unsigned long sz = huge_page_size(h); 7308 struct mm_struct *mm = vma->vm_mm; 7309 struct mmu_notifier_range range; 7310 unsigned long address; 7311 spinlock_t *ptl; 7312 pte_t *ptep; 7313 7314 if (!(vma->vm_flags & VM_MAYSHARE)) 7315 return; 7316 7317 if (start >= end) 7318 return; 7319 7320 flush_cache_range(vma, start, end); 7321 /* 7322 * No need to call adjust_range_if_pmd_sharing_possible(), because 7323 * we have already done the PUD_SIZE alignment. 
7324 */ 7325 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 7326 start, end); 7327 mmu_notifier_invalidate_range_start(&range); 7328 hugetlb_vma_lock_write(vma); 7329 i_mmap_lock_write(vma->vm_file->f_mapping); 7330 for (address = start; address < end; address += PUD_SIZE) { 7331 ptep = hugetlb_walk(vma, address, sz); 7332 if (!ptep) 7333 continue; 7334 ptl = huge_pte_lock(h, mm, ptep); 7335 huge_pmd_unshare(mm, vma, address, ptep); 7336 spin_unlock(ptl); 7337 } 7338 flush_hugetlb_tlb_range(vma, start, end); 7339 i_mmap_unlock_write(vma->vm_file->f_mapping); 7340 hugetlb_vma_unlock_write(vma); 7341 /* 7342 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see 7343 * Documentation/mm/mmu_notifier.rst. 7344 */ 7345 mmu_notifier_invalidate_range_end(&range); 7346 } 7347 7348 /* 7349 * This function will unconditionally remove all the shared pmd pgtable entries 7350 * within the specific vma for a hugetlbfs memory range. 7351 */ 7352 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) 7353 { 7354 hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), 7355 ALIGN_DOWN(vma->vm_end, PUD_SIZE)); 7356 } 7357 7358 #ifdef CONFIG_CMA 7359 static bool cma_reserve_called __initdata; 7360 7361 static int __init cmdline_parse_hugetlb_cma(char *p) 7362 { 7363 int nid, count = 0; 7364 unsigned long tmp; 7365 char *s = p; 7366 7367 while (*s) { 7368 if (sscanf(s, "%lu%n", &tmp, &count) != 1) 7369 break; 7370 7371 if (s[count] == ':') { 7372 if (tmp >= MAX_NUMNODES) 7373 break; 7374 nid = array_index_nospec(tmp, MAX_NUMNODES); 7375 7376 s += count + 1; 7377 tmp = memparse(s, &s); 7378 hugetlb_cma_size_in_node[nid] = tmp; 7379 hugetlb_cma_size += tmp; 7380 7381 /* 7382 * Skip the separator if have one, otherwise 7383 * break the parsing. 7384 */ 7385 if (*s == ',') 7386 s++; 7387 else 7388 break; 7389 } else { 7390 hugetlb_cma_size = memparse(p, &p); 7391 break; 7392 } 7393 } 7394 7395 return 0; 7396 } 7397 7398 early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); 7399 7400 void __init hugetlb_cma_reserve(int order) 7401 { 7402 unsigned long size, reserved, per_node; 7403 bool node_specific_cma_alloc = false; 7404 int nid; 7405 7406 cma_reserve_called = true; 7407 7408 if (!hugetlb_cma_size) 7409 return; 7410 7411 for (nid = 0; nid < MAX_NUMNODES; nid++) { 7412 if (hugetlb_cma_size_in_node[nid] == 0) 7413 continue; 7414 7415 if (!node_online(nid)) { 7416 pr_warn("hugetlb_cma: invalid node %d specified\n", nid); 7417 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; 7418 hugetlb_cma_size_in_node[nid] = 0; 7419 continue; 7420 } 7421 7422 if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { 7423 pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", 7424 nid, (PAGE_SIZE << order) / SZ_1M); 7425 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; 7426 hugetlb_cma_size_in_node[nid] = 0; 7427 } else { 7428 node_specific_cma_alloc = true; 7429 } 7430 } 7431 7432 /* Validate the CMA size again in case some invalid nodes specified. */ 7433 if (!hugetlb_cma_size) 7434 return; 7435 7436 if (hugetlb_cma_size < (PAGE_SIZE << order)) { 7437 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", 7438 (PAGE_SIZE << order) / SZ_1M); 7439 hugetlb_cma_size = 0; 7440 return; 7441 } 7442 7443 if (!node_specific_cma_alloc) { 7444 /* 7445 * If 3 GB area is requested on a machine with 4 numa nodes, 7446 * let's allocate 1 GB on first three nodes and ignore the last one. 
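		 *
		 * For example, assuming 1 GiB gigantic pages: per_node =
		 * DIV_ROUND_UP(3 GiB, 4) = 768 MiB, which round_up() below
		 * raises to 1 GiB per node, and the loop stops once 'reserved'
		 * reaches the requested 3 GiB, i.e. after the third node.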
7447 */ 7448 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); 7449 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", 7450 hugetlb_cma_size / SZ_1M, per_node / SZ_1M); 7451 } 7452 7453 reserved = 0; 7454 for_each_online_node(nid) { 7455 int res; 7456 char name[CMA_MAX_NAME]; 7457 7458 if (node_specific_cma_alloc) { 7459 if (hugetlb_cma_size_in_node[nid] == 0) 7460 continue; 7461 7462 size = hugetlb_cma_size_in_node[nid]; 7463 } else { 7464 size = min(per_node, hugetlb_cma_size - reserved); 7465 } 7466 7467 size = round_up(size, PAGE_SIZE << order); 7468 7469 snprintf(name, sizeof(name), "hugetlb%d", nid); 7470 /* 7471 * Note that 'order per bit' is based on smallest size that 7472 * may be returned to CMA allocator in the case of 7473 * huge page demotion. 7474 */ 7475 res = cma_declare_contiguous_nid(0, size, 0, 7476 PAGE_SIZE << HUGETLB_PAGE_ORDER, 7477 HUGETLB_PAGE_ORDER, false, name, 7478 &hugetlb_cma[nid], nid); 7479 if (res) { 7480 pr_warn("hugetlb_cma: reservation failed: err %d, node %d", 7481 res, nid); 7482 continue; 7483 } 7484 7485 reserved += size; 7486 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", 7487 size / SZ_1M, nid); 7488 7489 if (reserved >= hugetlb_cma_size) 7490 break; 7491 } 7492 7493 if (!reserved) 7494 /* 7495 * hugetlb_cma_size is used to determine if allocations from 7496 * cma are possible. Set to zero if no cma regions are set up. 7497 */ 7498 hugetlb_cma_size = 0; 7499 } 7500 7501 static void __init hugetlb_cma_check(void) 7502 { 7503 if (!hugetlb_cma_size || cma_reserve_called) 7504 return; 7505 7506 pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); 7507 } 7508 7509 #endif /* CONFIG_CMA */ 7510