// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	if (vma->vm_file)
		return NULL;

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags on a region of a vma, splitting it or merging it as
 * necessary. Must be called with mmap_lock held for writing;
 * Caller should ensure anon_name stability by raising its refcount even when
 * anon_name belongs to a valid vma because this function might free that vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	pgoff_t pgoff;

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, anon_name);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, start, 1);
		if (error)
			return error;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, end, 0);
		if (error)
			return error;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;
	if (!vma->vm_file) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
				 unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations. Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					   file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it. Split it if we are the only owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page to accelerate its reclaim.
		 * The VM can't reclaim the page unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&init_user_ns,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent a swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is a THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we can't clear
			 * its PG_dirty flag.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at and tlb_remove_tlb_entry, so for
			 * portability, re-install the pte as old and clean
			 * after clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Application no longer needs these pages. If the pages are dirty,
 * it's OK to just throw them away. The app will be more careful about
 * data it wants to keep. Be sure to free swap resources too. The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them. There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
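/*
 * Illustrative userspace sketch (not kernel code, assuming a private
 * anonymous mapping): after MADV_DONTNEED the old contents are gone and
 * the next touch of the range faults in fresh zero-filled pages.
 *
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	memset(buf, 0xaa, len);
 *	madvise(buf, len, MADV_DONTNEED);
 *	assert(buf[0] == 0);	-- zero-fill on next access
 */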
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long tmp_end;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/*
		 * We might have temporarily dropped the lock. For example,
		 * our VMA might have been split.
		 */
		if (!vma || start >= vma->vm_end) {
			vma = find_vma(mm, start);
			if (!vma || start < vma->vm_start)
				return -ENOMEM;
		}

		tmp_end = min_t(unsigned long, end, vma->vm_end);
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_vma_page_range(vma, start, tmp_end, write,
					       &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM:
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

/*
 * Apply a madvise behavior to a region of a vma. madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return madvise_populate(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED);
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
		return true;
	default:
		return false;
	}
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range. Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file)
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
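/*
 * Illustrative userspace sketch (not kernel code): on kernels built with
 * CONFIG_ANON_VMA_NAME, madvise_set_anon_name() is reached through prctl(),
 * e.g. to label a private anonymous mapping in /proc/<pid>/maps:
 *
 *	prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
 *	      (unsigned long)addr, len, (unsigned long)"my-heap");
 */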
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area. The idea is to help the kernel
 * use appropriate read-ahead and caching techniques. The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters. This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required.
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VMPFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
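/*
 * Illustrative userspace sketch (not kernel code): hint the expected access
 * pattern for a read-only file mapping and ask for readahead.
 *
 *	void *map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(map, len, MADV_SEQUENTIAL);
 *	madvise(map, len, MADV_WILLNEED);
 */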
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	blk_start_plug(&plug);
	error = madvise_walk_vmas(mm, start, end, behavior,
			madvise_vma_behavior);
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (!capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
				 iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	if (ret == 0)
		ret = total_len - iov_iter_count(&iter);

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}
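/*
 * Illustrative userspace sketch (not kernel code): give the kernel a reclaim
 * hint for a range in another process. This requires CAP_SYS_NICE plus
 * PTRACE_MODE_READ access to the target, as checked above.
 *
 *	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
 *	struct iovec vec = { .iov_base = addr, .iov_len = len };
 *	syscall(__NR_process_madvise, pidfd, &vec, 1, MADV_COLD, 0);
 */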