1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/madvise.c 4 * 5 * Copyright (C) 1999 Linus Torvalds 6 * Copyright (C) 2002 Christoph Hellwig 7 */ 8 9 #include <linux/mman.h> 10 #include <linux/pagemap.h> 11 #include <linux/syscalls.h> 12 #include <linux/mempolicy.h> 13 #include <linux/page-isolation.h> 14 #include <linux/page_idle.h> 15 #include <linux/userfaultfd_k.h> 16 #include <linux/hugetlb.h> 17 #include <linux/falloc.h> 18 #include <linux/fadvise.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/uio.h> 22 #include <linux/ksm.h> 23 #include <linux/fs.h> 24 #include <linux/file.h> 25 #include <linux/blkdev.h> 26 #include <linux/backing-dev.h> 27 #include <linux/pagewalk.h> 28 #include <linux/swap.h> 29 #include <linux/swapops.h> 30 #include <linux/shmem_fs.h> 31 #include <linux/mmu_notifier.h> 32 33 #include <asm/tlb.h> 34 35 #include "internal.h" 36 37 struct madvise_walk_private { 38 struct mmu_gather *tlb; 39 bool pageout; 40 }; 41 42 /* 43 * Any behaviour which results in changes to the vma->vm_flags needs to 44 * take mmap_lock for writing. Others, which simply traverse vmas, need 45 * to only take it for reading. 46 */ 47 static int madvise_need_mmap_write(int behavior) 48 { 49 switch (behavior) { 50 case MADV_REMOVE: 51 case MADV_WILLNEED: 52 case MADV_DONTNEED: 53 case MADV_COLD: 54 case MADV_PAGEOUT: 55 case MADV_FREE: 56 return 0; 57 default: 58 /* be safe, default to 1. list exceptions explicitly */ 59 return 1; 60 } 61 } 62 63 /* 64 * We can potentially split a vm area into separate 65 * areas, each area with its own behavior. 
66 */ 67 static long madvise_behavior(struct vm_area_struct *vma, 68 struct vm_area_struct **prev, 69 unsigned long start, unsigned long end, int behavior) 70 { 71 struct mm_struct *mm = vma->vm_mm; 72 int error = 0; 73 pgoff_t pgoff; 74 unsigned long new_flags = vma->vm_flags; 75 76 switch (behavior) { 77 case MADV_NORMAL: 78 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 79 break; 80 case MADV_SEQUENTIAL: 81 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 82 break; 83 case MADV_RANDOM: 84 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 85 break; 86 case MADV_DONTFORK: 87 new_flags |= VM_DONTCOPY; 88 break; 89 case MADV_DOFORK: 90 if (vma->vm_flags & VM_IO) { 91 error = -EINVAL; 92 goto out; 93 } 94 new_flags &= ~VM_DONTCOPY; 95 break; 96 case MADV_WIPEONFORK: 97 /* MADV_WIPEONFORK is only supported on anonymous memory. */ 98 if (vma->vm_file || vma->vm_flags & VM_SHARED) { 99 error = -EINVAL; 100 goto out; 101 } 102 new_flags |= VM_WIPEONFORK; 103 break; 104 case MADV_KEEPONFORK: 105 new_flags &= ~VM_WIPEONFORK; 106 break; 107 case MADV_DONTDUMP: 108 new_flags |= VM_DONTDUMP; 109 break; 110 case MADV_DODUMP: 111 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { 112 error = -EINVAL; 113 goto out; 114 } 115 new_flags &= ~VM_DONTDUMP; 116 break; 117 case MADV_MERGEABLE: 118 case MADV_UNMERGEABLE: 119 error = ksm_madvise(vma, start, end, behavior, &new_flags); 120 if (error) 121 goto out_convert_errno; 122 break; 123 case MADV_HUGEPAGE: 124 case MADV_NOHUGEPAGE: 125 error = hugepage_madvise(vma, &new_flags, behavior); 126 if (error) 127 goto out_convert_errno; 128 break; 129 } 130 131 if (new_flags == vma->vm_flags) { 132 *prev = vma; 133 goto out; 134 } 135 136 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 137 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, 138 vma->vm_file, pgoff, vma_policy(vma), 139 vma->vm_userfaultfd_ctx); 140 if (*prev) { 141 vma = *prev; 142 goto success; 143 } 144 145 *prev = 
vma; 146 147 if (start != vma->vm_start) { 148 if (unlikely(mm->map_count >= sysctl_max_map_count)) { 149 error = -ENOMEM; 150 goto out; 151 } 152 error = __split_vma(mm, vma, start, 1); 153 if (error) 154 goto out_convert_errno; 155 } 156 157 if (end != vma->vm_end) { 158 if (unlikely(mm->map_count >= sysctl_max_map_count)) { 159 error = -ENOMEM; 160 goto out; 161 } 162 error = __split_vma(mm, vma, end, 0); 163 if (error) 164 goto out_convert_errno; 165 } 166 167 success: 168 /* 169 * vm_flags is protected by the mmap_lock held in write mode. 170 */ 171 vma->vm_flags = new_flags; 172 173 out_convert_errno: 174 /* 175 * madvise() returns EAGAIN if kernel resources, such as 176 * slab, are temporarily unavailable. 177 */ 178 if (error == -ENOMEM) 179 error = -EAGAIN; 180 out: 181 return error; 182 } 183 184 #ifdef CONFIG_SWAP 185 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, 186 unsigned long end, struct mm_walk *walk) 187 { 188 pte_t *orig_pte; 189 struct vm_area_struct *vma = walk->private; 190 unsigned long index; 191 192 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 193 return 0; 194 195 for (index = start; index != end; index += PAGE_SIZE) { 196 pte_t pte; 197 swp_entry_t entry; 198 struct page *page; 199 spinlock_t *ptl; 200 201 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); 202 pte = *(orig_pte + ((index - start) / PAGE_SIZE)); 203 pte_unmap_unlock(orig_pte, ptl); 204 205 if (pte_present(pte) || pte_none(pte)) 206 continue; 207 entry = pte_to_swp_entry(pte); 208 if (unlikely(non_swap_entry(entry))) 209 continue; 210 211 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, 212 vma, index, false); 213 if (page) 214 put_page(page); 215 } 216 217 return 0; 218 } 219 220 static const struct mm_walk_ops swapin_walk_ops = { 221 .pmd_entry = swapin_walk_pmd_entry, 222 }; 223 224 static void force_shm_swapin_readahead(struct vm_area_struct *vma, 225 unsigned long start, unsigned long end, 226 struct address_space *mapping) 227 { 
228 XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); 229 pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1); 230 struct page *page; 231 232 rcu_read_lock(); 233 xas_for_each(&xas, page, end_index) { 234 swp_entry_t swap; 235 236 if (!xa_is_value(page)) 237 continue; 238 xas_pause(&xas); 239 rcu_read_unlock(); 240 241 swap = radix_to_swp_entry(page); 242 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, 243 NULL, 0, false); 244 if (page) 245 put_page(page); 246 247 rcu_read_lock(); 248 } 249 rcu_read_unlock(); 250 251 lru_add_drain(); /* Push any new pages onto the LRU now */ 252 } 253 #endif /* CONFIG_SWAP */ 254 255 /* 256 * Schedule all required I/O operations. Do not wait for completion. 257 */ 258 static long madvise_willneed(struct vm_area_struct *vma, 259 struct vm_area_struct **prev, 260 unsigned long start, unsigned long end) 261 { 262 struct mm_struct *mm = vma->vm_mm; 263 struct file *file = vma->vm_file; 264 loff_t offset; 265 266 *prev = vma; 267 #ifdef CONFIG_SWAP 268 if (!file) { 269 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); 270 lru_add_drain(); /* Push any new pages onto the LRU now */ 271 return 0; 272 } 273 274 if (shmem_mapping(file->f_mapping)) { 275 force_shm_swapin_readahead(vma, start, end, 276 file->f_mapping); 277 return 0; 278 } 279 #else 280 if (!file) 281 return -EBADF; 282 #endif 283 284 if (IS_DAX(file_inode(file))) { 285 /* no bad return value, but ignore advice */ 286 return 0; 287 } 288 289 /* 290 * Filesystem's fadvise may need to take various locks. We need to 291 * explicitly grab a reference because the vma (and hence the 292 * vma's reference to the file) can go away as soon as we drop 293 * mmap_lock. 
294 */ 295 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 296 get_file(file); 297 offset = (loff_t)(start - vma->vm_start) 298 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 299 mmap_read_unlock(mm); 300 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 301 fput(file); 302 mmap_read_lock(mm); 303 return 0; 304 } 305 306 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, 307 unsigned long addr, unsigned long end, 308 struct mm_walk *walk) 309 { 310 struct madvise_walk_private *private = walk->private; 311 struct mmu_gather *tlb = private->tlb; 312 bool pageout = private->pageout; 313 struct mm_struct *mm = tlb->mm; 314 struct vm_area_struct *vma = walk->vma; 315 pte_t *orig_pte, *pte, ptent; 316 spinlock_t *ptl; 317 struct page *page = NULL; 318 LIST_HEAD(page_list); 319 320 if (fatal_signal_pending(current)) 321 return -EINTR; 322 323 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 324 if (pmd_trans_huge(*pmd)) { 325 pmd_t orig_pmd; 326 unsigned long next = pmd_addr_end(addr, end); 327 328 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 329 ptl = pmd_trans_huge_lock(pmd, vma); 330 if (!ptl) 331 return 0; 332 333 orig_pmd = *pmd; 334 if (is_huge_zero_pmd(orig_pmd)) 335 goto huge_unlock; 336 337 if (unlikely(!pmd_present(orig_pmd))) { 338 VM_BUG_ON(thp_migration_supported() && 339 !is_pmd_migration_entry(orig_pmd)); 340 goto huge_unlock; 341 } 342 343 page = pmd_page(orig_pmd); 344 345 /* Do not interfere with other mappings of this page */ 346 if (page_mapcount(page) != 1) 347 goto huge_unlock; 348 349 if (next - addr != HPAGE_PMD_SIZE) { 350 int err; 351 352 get_page(page); 353 spin_unlock(ptl); 354 lock_page(page); 355 err = split_huge_page(page); 356 unlock_page(page); 357 put_page(page); 358 if (!err) 359 goto regular_page; 360 return 0; 361 } 362 363 if (pmd_young(orig_pmd)) { 364 pmdp_invalidate(vma, addr, pmd); 365 orig_pmd = pmd_mkold(orig_pmd); 366 367 set_pmd_at(mm, addr, pmd, orig_pmd); 368 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 369 } 370 371 
ClearPageReferenced(page); 372 test_and_clear_page_young(page); 373 if (pageout) { 374 if (!isolate_lru_page(page)) { 375 if (PageUnevictable(page)) 376 putback_lru_page(page); 377 else 378 list_add(&page->lru, &page_list); 379 } 380 } else 381 deactivate_page(page); 382 huge_unlock: 383 spin_unlock(ptl); 384 if (pageout) 385 reclaim_pages(&page_list); 386 return 0; 387 } 388 389 regular_page: 390 if (pmd_trans_unstable(pmd)) 391 return 0; 392 #endif 393 tlb_change_page_size(tlb, PAGE_SIZE); 394 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 395 flush_tlb_batched_pending(mm); 396 arch_enter_lazy_mmu_mode(); 397 for (; addr < end; pte++, addr += PAGE_SIZE) { 398 ptent = *pte; 399 400 if (pte_none(ptent)) 401 continue; 402 403 if (!pte_present(ptent)) 404 continue; 405 406 page = vm_normal_page(vma, addr, ptent); 407 if (!page) 408 continue; 409 410 /* 411 * Creating a THP page is expensive so split it only if we 412 * are sure it's worth. Split it if we are only owner. 413 */ 414 if (PageTransCompound(page)) { 415 if (page_mapcount(page) != 1) 416 break; 417 get_page(page); 418 if (!trylock_page(page)) { 419 put_page(page); 420 break; 421 } 422 pte_unmap_unlock(orig_pte, ptl); 423 if (split_huge_page(page)) { 424 unlock_page(page); 425 put_page(page); 426 pte_offset_map_lock(mm, pmd, addr, &ptl); 427 break; 428 } 429 unlock_page(page); 430 put_page(page); 431 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 432 pte--; 433 addr -= PAGE_SIZE; 434 continue; 435 } 436 437 /* Do not interfere with other mappings of this page */ 438 if (page_mapcount(page) != 1) 439 continue; 440 441 VM_BUG_ON_PAGE(PageTransCompound(page), page); 442 443 if (pte_young(ptent)) { 444 ptent = ptep_get_and_clear_full(mm, addr, pte, 445 tlb->fullmm); 446 ptent = pte_mkold(ptent); 447 set_pte_at(mm, addr, pte, ptent); 448 tlb_remove_tlb_entry(tlb, pte, addr); 449 } 450 451 /* 452 * We are deactivating a page for accelerating reclaiming. 
453 * VM couldn't reclaim the page unless we clear PG_young. 454 * As a side effect, it makes confuse idle-page tracking 455 * because they will miss recent referenced history. 456 */ 457 ClearPageReferenced(page); 458 test_and_clear_page_young(page); 459 if (pageout) { 460 if (!isolate_lru_page(page)) { 461 if (PageUnevictable(page)) 462 putback_lru_page(page); 463 else 464 list_add(&page->lru, &page_list); 465 } 466 } else 467 deactivate_page(page); 468 } 469 470 arch_leave_lazy_mmu_mode(); 471 pte_unmap_unlock(orig_pte, ptl); 472 if (pageout) 473 reclaim_pages(&page_list); 474 cond_resched(); 475 476 return 0; 477 } 478 479 static const struct mm_walk_ops cold_walk_ops = { 480 .pmd_entry = madvise_cold_or_pageout_pte_range, 481 }; 482 483 static void madvise_cold_page_range(struct mmu_gather *tlb, 484 struct vm_area_struct *vma, 485 unsigned long addr, unsigned long end) 486 { 487 struct madvise_walk_private walk_private = { 488 .pageout = false, 489 .tlb = tlb, 490 }; 491 492 tlb_start_vma(tlb, vma); 493 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); 494 tlb_end_vma(tlb, vma); 495 } 496 497 static long madvise_cold(struct vm_area_struct *vma, 498 struct vm_area_struct **prev, 499 unsigned long start_addr, unsigned long end_addr) 500 { 501 struct mm_struct *mm = vma->vm_mm; 502 struct mmu_gather tlb; 503 504 *prev = vma; 505 if (!can_madv_lru_vma(vma)) 506 return -EINVAL; 507 508 lru_add_drain(); 509 tlb_gather_mmu(&tlb, mm); 510 madvise_cold_page_range(&tlb, vma, start_addr, end_addr); 511 tlb_finish_mmu(&tlb); 512 513 return 0; 514 } 515 516 static void madvise_pageout_page_range(struct mmu_gather *tlb, 517 struct vm_area_struct *vma, 518 unsigned long addr, unsigned long end) 519 { 520 struct madvise_walk_private walk_private = { 521 .pageout = true, 522 .tlb = tlb, 523 }; 524 525 tlb_start_vma(tlb, vma); 526 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); 527 tlb_end_vma(tlb, vma); 528 } 529 530 static inline 
bool can_do_pageout(struct vm_area_struct *vma) 531 { 532 if (vma_is_anonymous(vma)) 533 return true; 534 if (!vma->vm_file) 535 return false; 536 /* 537 * paging out pagecache only for non-anonymous mappings that correspond 538 * to the files the calling process could (if tried) open for writing; 539 * otherwise we'd be including shared non-exclusive mappings, which 540 * opens a side channel. 541 */ 542 return inode_owner_or_capable(file_inode(vma->vm_file)) || 543 inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; 544 } 545 546 static long madvise_pageout(struct vm_area_struct *vma, 547 struct vm_area_struct **prev, 548 unsigned long start_addr, unsigned long end_addr) 549 { 550 struct mm_struct *mm = vma->vm_mm; 551 struct mmu_gather tlb; 552 553 *prev = vma; 554 if (!can_madv_lru_vma(vma)) 555 return -EINVAL; 556 557 if (!can_do_pageout(vma)) 558 return 0; 559 560 lru_add_drain(); 561 tlb_gather_mmu(&tlb, mm); 562 madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); 563 tlb_finish_mmu(&tlb); 564 565 return 0; 566 } 567 568 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, 569 unsigned long end, struct mm_walk *walk) 570 571 { 572 struct mmu_gather *tlb = walk->private; 573 struct mm_struct *mm = tlb->mm; 574 struct vm_area_struct *vma = walk->vma; 575 spinlock_t *ptl; 576 pte_t *orig_pte, *pte, ptent; 577 struct page *page; 578 int nr_swap = 0; 579 unsigned long next; 580 581 next = pmd_addr_end(addr, end); 582 if (pmd_trans_huge(*pmd)) 583 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) 584 goto next; 585 586 if (pmd_trans_unstable(pmd)) 587 return 0; 588 589 tlb_change_page_size(tlb, PAGE_SIZE); 590 orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 591 flush_tlb_batched_pending(mm); 592 arch_enter_lazy_mmu_mode(); 593 for (; addr != end; pte++, addr += PAGE_SIZE) { 594 ptent = *pte; 595 596 if (pte_none(ptent)) 597 continue; 598 /* 599 * If the pte has swp_entry, just clear page table to 600 * prevent swap-in 
which is more expensive rather than 601 * (page allocation + zeroing). 602 */ 603 if (!pte_present(ptent)) { 604 swp_entry_t entry; 605 606 entry = pte_to_swp_entry(ptent); 607 if (non_swap_entry(entry)) 608 continue; 609 nr_swap--; 610 free_swap_and_cache(entry); 611 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 612 continue; 613 } 614 615 page = vm_normal_page(vma, addr, ptent); 616 if (!page) 617 continue; 618 619 /* 620 * If pmd isn't transhuge but the page is THP and 621 * is owned by only this process, split it and 622 * deactivate all pages. 623 */ 624 if (PageTransCompound(page)) { 625 if (page_mapcount(page) != 1) 626 goto out; 627 get_page(page); 628 if (!trylock_page(page)) { 629 put_page(page); 630 goto out; 631 } 632 pte_unmap_unlock(orig_pte, ptl); 633 if (split_huge_page(page)) { 634 unlock_page(page); 635 put_page(page); 636 pte_offset_map_lock(mm, pmd, addr, &ptl); 637 goto out; 638 } 639 unlock_page(page); 640 put_page(page); 641 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 642 pte--; 643 addr -= PAGE_SIZE; 644 continue; 645 } 646 647 VM_BUG_ON_PAGE(PageTransCompound(page), page); 648 649 if (PageSwapCache(page) || PageDirty(page)) { 650 if (!trylock_page(page)) 651 continue; 652 /* 653 * If page is shared with others, we couldn't clear 654 * PG_dirty of the page. 655 */ 656 if (page_mapcount(page) != 1) { 657 unlock_page(page); 658 continue; 659 } 660 661 if (PageSwapCache(page) && !try_to_free_swap(page)) { 662 unlock_page(page); 663 continue; 664 } 665 666 ClearPageDirty(page); 667 unlock_page(page); 668 } 669 670 if (pte_young(ptent) || pte_dirty(ptent)) { 671 /* 672 * Some of architecture(ex, PPC) don't update TLB 673 * with set_pte_at and tlb_remove_tlb_entry so for 674 * the portability, remap the pte with old|clean 675 * after pte clearing. 
676 */ 677 ptent = ptep_get_and_clear_full(mm, addr, pte, 678 tlb->fullmm); 679 680 ptent = pte_mkold(ptent); 681 ptent = pte_mkclean(ptent); 682 set_pte_at(mm, addr, pte, ptent); 683 tlb_remove_tlb_entry(tlb, pte, addr); 684 } 685 mark_page_lazyfree(page); 686 } 687 out: 688 if (nr_swap) { 689 if (current->mm == mm) 690 sync_mm_rss(mm); 691 692 add_mm_counter(mm, MM_SWAPENTS, nr_swap); 693 } 694 arch_leave_lazy_mmu_mode(); 695 pte_unmap_unlock(orig_pte, ptl); 696 cond_resched(); 697 next: 698 return 0; 699 } 700 701 static const struct mm_walk_ops madvise_free_walk_ops = { 702 .pmd_entry = madvise_free_pte_range, 703 }; 704 705 static int madvise_free_single_vma(struct vm_area_struct *vma, 706 unsigned long start_addr, unsigned long end_addr) 707 { 708 struct mm_struct *mm = vma->vm_mm; 709 struct mmu_notifier_range range; 710 struct mmu_gather tlb; 711 712 /* MADV_FREE works for only anon vma at the moment */ 713 if (!vma_is_anonymous(vma)) 714 return -EINVAL; 715 716 range.start = max(vma->vm_start, start_addr); 717 if (range.start >= vma->vm_end) 718 return -EINVAL; 719 range.end = min(vma->vm_end, end_addr); 720 if (range.end <= vma->vm_start) 721 return -EINVAL; 722 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 723 range.start, range.end); 724 725 lru_add_drain(); 726 tlb_gather_mmu(&tlb, mm); 727 update_hiwater_rss(mm); 728 729 mmu_notifier_invalidate_range_start(&range); 730 tlb_start_vma(&tlb, vma); 731 walk_page_range(vma->vm_mm, range.start, range.end, 732 &madvise_free_walk_ops, &tlb); 733 tlb_end_vma(&tlb, vma); 734 mmu_notifier_invalidate_range_end(&range); 735 tlb_finish_mmu(&tlb); 736 737 return 0; 738 } 739 740 /* 741 * Application no longer needs these pages. If the pages are dirty, 742 * it's OK to just throw them away. The app will be more careful about 743 * data it wants to keep. Be sure to free swap resources too. 
The 744 * zap_page_range call sets things up for shrink_active_list to actually free 745 * these pages later if no one else has touched them in the meantime, 746 * although we could add these pages to a global reuse list for 747 * shrink_active_list to pick up before reclaiming other pages. 748 * 749 * NB: This interface discards data rather than pushes it out to swap, 750 * as some implementations do. This has performance implications for 751 * applications like large transactional databases which want to discard 752 * pages in anonymous maps after committing to backing store the data 753 * that was kept in them. There is no reason to write this data out to 754 * the swap area if the application is discarding it. 755 * 756 * An interface that causes the system to free clean pages and flush 757 * dirty pages is already available as msync(MS_INVALIDATE). 758 */ 759 static long madvise_dontneed_single_vma(struct vm_area_struct *vma, 760 unsigned long start, unsigned long end) 761 { 762 zap_page_range(vma, start, end - start); 763 return 0; 764 } 765 766 static long madvise_dontneed_free(struct vm_area_struct *vma, 767 struct vm_area_struct **prev, 768 unsigned long start, unsigned long end, 769 int behavior) 770 { 771 struct mm_struct *mm = vma->vm_mm; 772 773 *prev = vma; 774 if (!can_madv_lru_vma(vma)) 775 return -EINVAL; 776 777 if (!userfaultfd_remove(vma, start, end)) { 778 *prev = NULL; /* mmap_lock has been dropped, prev is stale */ 779 780 mmap_read_lock(mm); 781 vma = find_vma(mm, start); 782 if (!vma) 783 return -ENOMEM; 784 if (start < vma->vm_start) { 785 /* 786 * This "vma" under revalidation is the one 787 * with the lowest vma->vm_start where start 788 * is also < vma->vm_end. If start < 789 * vma->vm_start it means an hole materialized 790 * in the user address space within the 791 * virtual range passed to MADV_DONTNEED 792 * or MADV_FREE. 
793 */ 794 return -ENOMEM; 795 } 796 if (!can_madv_lru_vma(vma)) 797 return -EINVAL; 798 if (end > vma->vm_end) { 799 /* 800 * Don't fail if end > vma->vm_end. If the old 801 * vma was splitted while the mmap_lock was 802 * released the effect of the concurrent 803 * operation may not cause madvise() to 804 * have an undefined result. There may be an 805 * adjacent next vma that we'll walk 806 * next. userfaultfd_remove() will generate an 807 * UFFD_EVENT_REMOVE repetition on the 808 * end-vma->vm_end range, but the manager can 809 * handle a repetition fine. 810 */ 811 end = vma->vm_end; 812 } 813 VM_WARN_ON(start >= end); 814 } 815 816 if (behavior == MADV_DONTNEED) 817 return madvise_dontneed_single_vma(vma, start, end); 818 else if (behavior == MADV_FREE) 819 return madvise_free_single_vma(vma, start, end); 820 else 821 return -EINVAL; 822 } 823 824 /* 825 * Application wants to free up the pages and associated backing store. 826 * This is effectively punching a hole into the middle of a file. 827 */ 828 static long madvise_remove(struct vm_area_struct *vma, 829 struct vm_area_struct **prev, 830 unsigned long start, unsigned long end) 831 { 832 loff_t offset; 833 int error; 834 struct file *f; 835 struct mm_struct *mm = vma->vm_mm; 836 837 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 838 839 if (vma->vm_flags & VM_LOCKED) 840 return -EINVAL; 841 842 f = vma->vm_file; 843 844 if (!f || !f->f_mapping || !f->f_mapping->host) { 845 return -EINVAL; 846 } 847 848 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 849 return -EACCES; 850 851 offset = (loff_t)(start - vma->vm_start) 852 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 853 854 /* 855 * Filesystem's fallocate may need to take i_mutex. We need to 856 * explicitly grab a reference because the vma (and hence the 857 * vma's reference to the file) can go away as soon as we drop 858 * mmap_lock. 
859 */ 860 get_file(f); 861 if (userfaultfd_remove(vma, start, end)) { 862 /* mmap_lock was not released by userfaultfd_remove() */ 863 mmap_read_unlock(mm); 864 } 865 error = vfs_fallocate(f, 866 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 867 offset, end - start); 868 fput(f); 869 mmap_read_lock(mm); 870 return error; 871 } 872 873 #ifdef CONFIG_MEMORY_FAILURE 874 /* 875 * Error injection support for memory error handling. 876 */ 877 static int madvise_inject_error(int behavior, 878 unsigned long start, unsigned long end) 879 { 880 unsigned long size; 881 882 if (!capable(CAP_SYS_ADMIN)) 883 return -EPERM; 884 885 886 for (; start < end; start += size) { 887 unsigned long pfn; 888 struct page *page; 889 int ret; 890 891 ret = get_user_pages_fast(start, 1, 0, &page); 892 if (ret != 1) 893 return ret; 894 pfn = page_to_pfn(page); 895 896 /* 897 * When soft offlining hugepages, after migrating the page 898 * we dissolve it, therefore in the second loop "page" will 899 * no longer be a compound page. 
900 */ 901 size = page_size(compound_head(page)); 902 903 if (behavior == MADV_SOFT_OFFLINE) { 904 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 905 pfn, start); 906 ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 907 } else { 908 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 909 pfn, start); 910 ret = memory_failure(pfn, MF_COUNT_INCREASED); 911 } 912 913 if (ret) 914 return ret; 915 } 916 917 return 0; 918 } 919 #endif 920 921 static long 922 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 923 unsigned long start, unsigned long end, int behavior) 924 { 925 switch (behavior) { 926 case MADV_REMOVE: 927 return madvise_remove(vma, prev, start, end); 928 case MADV_WILLNEED: 929 return madvise_willneed(vma, prev, start, end); 930 case MADV_COLD: 931 return madvise_cold(vma, prev, start, end); 932 case MADV_PAGEOUT: 933 return madvise_pageout(vma, prev, start, end); 934 case MADV_FREE: 935 case MADV_DONTNEED: 936 return madvise_dontneed_free(vma, prev, start, end, behavior); 937 default: 938 return madvise_behavior(vma, prev, start, end, behavior); 939 } 940 } 941 942 static bool 943 madvise_behavior_valid(int behavior) 944 { 945 switch (behavior) { 946 case MADV_DOFORK: 947 case MADV_DONTFORK: 948 case MADV_NORMAL: 949 case MADV_SEQUENTIAL: 950 case MADV_RANDOM: 951 case MADV_REMOVE: 952 case MADV_WILLNEED: 953 case MADV_DONTNEED: 954 case MADV_FREE: 955 case MADV_COLD: 956 case MADV_PAGEOUT: 957 #ifdef CONFIG_KSM 958 case MADV_MERGEABLE: 959 case MADV_UNMERGEABLE: 960 #endif 961 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 962 case MADV_HUGEPAGE: 963 case MADV_NOHUGEPAGE: 964 #endif 965 case MADV_DONTDUMP: 966 case MADV_DODUMP: 967 case MADV_WIPEONFORK: 968 case MADV_KEEPONFORK: 969 #ifdef CONFIG_MEMORY_FAILURE 970 case MADV_SOFT_OFFLINE: 971 case MADV_HWPOISON: 972 #endif 973 return true; 974 975 default: 976 return false; 977 } 978 } 979 980 static bool 981 process_madvise_behavior_valid(int 
behavior) 982 { 983 switch (behavior) { 984 case MADV_COLD: 985 case MADV_PAGEOUT: 986 return true; 987 default: 988 return false; 989 } 990 } 991 992 /* 993 * The madvise(2) system call. 994 * 995 * Applications can use madvise() to advise the kernel how it should 996 * handle paging I/O in this VM area. The idea is to help the kernel 997 * use appropriate read-ahead and caching techniques. The information 998 * provided is advisory only, and can be safely disregarded by the 999 * kernel without affecting the correct operation of the application. 1000 * 1001 * behavior values: 1002 * MADV_NORMAL - the default behavior is to read clusters. This 1003 * results in some read-ahead and read-behind. 1004 * MADV_RANDOM - the system should read the minimum amount of data 1005 * on any access, since it is unlikely that the appli- 1006 * cation will need more than what it asks for. 1007 * MADV_SEQUENTIAL - pages in the given range will probably be accessed 1008 * once, so they can be aggressively read ahead, and 1009 * can be freed soon after they are accessed. 1010 * MADV_WILLNEED - the application is notifying the system to read 1011 * some pages ahead. 1012 * MADV_DONTNEED - the application is finished with the given range, 1013 * so the kernel can free resources associated with it. 1014 * MADV_FREE - the application marks pages in the given range as lazy free, 1015 * where actual purges are postponed until memory pressure happens. 1016 * MADV_REMOVE - the application wants to free up the given range of 1017 * pages and associated backing store. 1018 * MADV_DONTFORK - omit this area from child's address space when forking: 1019 * typically, to avoid COWing pages pinned by get_user_pages(). 1020 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 1021 * MADV_WIPEONFORK - present the child process with zero-filled memory in this 1022 * range after a fork. 
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
1055 * -EBADF - map exists, but area maps something that isn't a file. 1056 * -EAGAIN - a kernel resource was temporarily unavailable. 1057 */ 1058 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 1059 { 1060 unsigned long end, tmp; 1061 struct vm_area_struct *vma, *prev; 1062 int unmapped_error = 0; 1063 int error = -EINVAL; 1064 int write; 1065 size_t len; 1066 struct blk_plug plug; 1067 1068 start = untagged_addr(start); 1069 1070 if (!madvise_behavior_valid(behavior)) 1071 return error; 1072 1073 if (!PAGE_ALIGNED(start)) 1074 return error; 1075 len = PAGE_ALIGN(len_in); 1076 1077 /* Check to see whether len was rounded up from small -ve to zero */ 1078 if (len_in && !len) 1079 return error; 1080 1081 end = start + len; 1082 if (end < start) 1083 return error; 1084 1085 error = 0; 1086 if (end == start) 1087 return error; 1088 1089 #ifdef CONFIG_MEMORY_FAILURE 1090 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 1091 return madvise_inject_error(behavior, start, start + len_in); 1092 #endif 1093 1094 write = madvise_need_mmap_write(behavior); 1095 if (write) { 1096 if (mmap_write_lock_killable(mm)) 1097 return -EINTR; 1098 } else { 1099 mmap_read_lock(mm); 1100 } 1101 1102 /* 1103 * If the interval [start,end) covers some unmapped address 1104 * ranges, just ignore them, but return -ENOMEM at the end. 1105 * - different from the way of handling in mlock etc. 1106 */ 1107 vma = find_vma_prev(mm, start, &prev); 1108 if (vma && start > vma->vm_start) 1109 prev = vma; 1110 1111 blk_start_plug(&plug); 1112 for (;;) { 1113 /* Still start < end. */ 1114 error = -ENOMEM; 1115 if (!vma) 1116 goto out; 1117 1118 /* Here start < (end|vma->vm_end). 
*/ 1119 if (start < vma->vm_start) { 1120 unmapped_error = -ENOMEM; 1121 start = vma->vm_start; 1122 if (start >= end) 1123 goto out; 1124 } 1125 1126 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 1127 tmp = vma->vm_end; 1128 if (end < tmp) 1129 tmp = end; 1130 1131 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 1132 error = madvise_vma(vma, &prev, start, tmp, behavior); 1133 if (error) 1134 goto out; 1135 start = tmp; 1136 if (prev && start < prev->vm_end) 1137 start = prev->vm_end; 1138 error = unmapped_error; 1139 if (start >= end) 1140 goto out; 1141 if (prev) 1142 vma = prev->vm_next; 1143 else /* madvise_remove dropped mmap_lock */ 1144 vma = find_vma(mm, start); 1145 } 1146 out: 1147 blk_finish_plug(&plug); 1148 if (write) 1149 mmap_write_unlock(mm); 1150 else 1151 mmap_read_unlock(mm); 1152 1153 return error; 1154 } 1155 1156 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 1157 { 1158 return do_madvise(current->mm, start, len_in, behavior); 1159 } 1160 1161 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 1162 size_t, vlen, int, behavior, unsigned int, flags) 1163 { 1164 ssize_t ret; 1165 struct iovec iovstack[UIO_FASTIOV], iovec; 1166 struct iovec *iov = iovstack; 1167 struct iov_iter iter; 1168 struct pid *pid; 1169 struct task_struct *task; 1170 struct mm_struct *mm; 1171 size_t total_len; 1172 unsigned int f_flags; 1173 1174 if (flags != 0) { 1175 ret = -EINVAL; 1176 goto out; 1177 } 1178 1179 ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 1180 if (ret < 0) 1181 goto out; 1182 1183 pid = pidfd_get_pid(pidfd, &f_flags); 1184 if (IS_ERR(pid)) { 1185 ret = PTR_ERR(pid); 1186 goto free_iov; 1187 } 1188 1189 task = get_pid_task(pid, PIDTYPE_PID); 1190 if (!task) { 1191 ret = -ESRCH; 1192 goto put_pid; 1193 } 1194 1195 if (!process_madvise_behavior_valid(behavior)) { 1196 ret = -EINVAL; 1197 goto release_task; 1198 } 1199 1200 mm = mm_access(task, 
PTRACE_MODE_ATTACH_FSCREDS); 1201 if (IS_ERR_OR_NULL(mm)) { 1202 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; 1203 goto release_task; 1204 } 1205 1206 total_len = iov_iter_count(&iter); 1207 1208 while (iov_iter_count(&iter)) { 1209 iovec = iov_iter_iovec(&iter); 1210 ret = do_madvise(mm, (unsigned long)iovec.iov_base, 1211 iovec.iov_len, behavior); 1212 if (ret < 0) 1213 break; 1214 iov_iter_advance(&iter, iovec.iov_len); 1215 } 1216 1217 if (ret == 0) 1218 ret = total_len - iov_iter_count(&iter); 1219 1220 mmput(mm); 1221 release_task: 1222 put_task_struct(task); 1223 put_pid: 1224 put_pid(pid); 1225 free_iov: 1226 kfree(iov); 1227 out: 1228 return ret; 1229 } 1230