// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
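	 *
	 * For userspace callers this means madvise() can transiently fail
	 * with EAGAIN and may simply be retried. A minimal, hedged sketch
	 * of such a caller (illustration only, not part of this file;
	 * "addr" and "length" are assumed caller-provided values):
	 *
	 *	while (madvise(addr, length, MADV_MERGEABLE) == -1 &&
	 *	       errno == EAGAIN)
	 *		;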
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
	return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it. Split it if we are the only owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page to accelerate its reclaim.
		 * The VM cannot reclaim the page unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the page's recent reference history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could open for writing if it
	 * tried; otherwise we'd be including shared non-exclusive mappings,
	 * which opens a side channel.
	 */
	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is a THP and
		 * is owned only by this process, split it and
		 * deactivate all of its pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we cannot clear
			 * its PG_dirty bit.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so
			 * for portability, clear the pte first and then
			 * re-install it marked old and clean.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, range.start, range.end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, range.start, range.end);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
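 *
 * As a rough, hedged userspace sketch of the two behaviours handled
 * below (illustration only, not part of the kernel build; "pool" and
 * "POOL_SIZE" are made-up names for an allocator's anonymous region):
 *
 *	// tell the kernel the contents are disposable; the mapping
 *	// itself stays valid and refaults as zero-filled pages
 *	if (madvise(pool, POOL_SIZE, MADV_DONTNEED) == -1)
 *		perror("madvise(MADV_DONTNEED)");
 *
 *	// or, cheaper when there is no memory pressure: mark the pages
 *	// lazily freeable, reclaimed only if memory gets tight
 *	if (madvise(pool, POOL_SIZE, MADV_FREE) == -1)
 *		perror("madvise(MADV_FREE)");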
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released, the concurrent operation does not
			 * leave madvise() with an undefined result.
			 * There may be an adjacent next vma that we'll
			 * walk next. userfaultfd_remove() will generate
			 * an UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	struct zone *zone;
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			/*
			 * Drop the page reference taken by get_user_pages_fast(). In
			 * the absence of MF_COUNT_INCREASED the memory_failure()
			 * routine is responsible for pinning the page to prevent it
			 * from being released back to the page allocator.
			 */
			put_page(page);
			ret = memory_failure(pfn, 0);
		}

		if (ret)
			return ret;
	}

	/* Ensure that all poisoned pages are removed from per-cpu lists */
	for_each_populated_zone(zone)
		drain_all_pages(zone);

	return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
		return true;
	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VMPFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
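/*
 * A minimal, hedged userspace sketch of the call described above
 * (illustration only, not part of the kernel build).  It maps a file
 * read-only and hints that it will be read sequentially and soon;
 * "fd" and "st.st_size" are assumed to come from open() and fstat().
 *
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
 *	if (buf == MAP_FAILED)
 *		return -1;
 *	if (madvise(buf, st.st_size, MADV_SEQUENTIAL) == -1)
 *		perror("madvise(MADV_SEQUENTIAL)");
 *	if (madvise(buf, st.st_size, MADV_WILLNEED) == -1)
 *		perror("madvise(MADV_WILLNEED)");
 */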
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return error;

	if (!PAGE_ALIGNED(start))
		return error;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct pid *pid;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	pid = pidfd_get_pid(pidfd, &f_flags);
	if (IS_ERR(pid)) {
		ret = PTR_ERR(pid);
		goto free_iov;
	}

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		ret = -ESRCH;
		goto put_pid;
	}

	if (task->mm != current->mm &&
			!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	if (ret == 0)
		ret = total_len - iov_iter_count(&iter);

	mmput(mm);
release_task:
	put_task_struct(task);
put_pid:
	put_pid(pid);
free_iov:
	kfree(iov);
out:
	return ret;
}
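
/*
 * A hedged userspace sketch of the process_madvise() call above
 * (illustration only, not part of the kernel build).  It asks the
 * kernel to page out one range of a target process; "target_pid",
 * "addr" and "length" are assumed inputs, the raw syscall() is used
 * because a libc wrapper may not exist, and the SYS_pidfd_open and
 * SYS_process_madvise macros are assumed to be provided by recent
 * <sys/syscall.h> headers.
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
 *	struct iovec iov = { .iov_base = addr, .iov_len = length };
 *	ssize_t n = syscall(SYS_process_madvise, pidfd, &iov, 1,
 *			    MADV_PAGEOUT, 0);
 *	// n is the number of bytes advised on success, or -1 on error
 */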