// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <linux/sched/mm.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
        struct mmu_gather *tlb;
        bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_FREE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct *mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_WIPEONFORK:
                /* MADV_WIPEONFORK is only supported on anonymous memory. */
                if (vma->vm_file || vma->vm_flags & VM_SHARED) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags |= VM_WIPEONFORK;
                break;
        case MADV_KEEPONFORK:
                new_flags &= ~VM_WIPEONFORK;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out_convert_errno;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out_convert_errno;
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
                          vma->vm_userfaultfd_ctx);
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

        if (start != vma->vm_start) {
                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
                        error = -ENOMEM;
                        goto out;
                }
                error = __split_vma(mm, vma, start, 1);
                if (error)
                        goto out_convert_errno;
        }

        if (end != vma->vm_end) {
                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
                        error = -ENOMEM;
                        goto out;
                }
                error = __split_vma(mm, vma, end, 0);
                if (error)
                        goto out_convert_errno;
        }

success:
        /*
         * vm_flags is protected by the mmap_lock held in write mode.
         */
        vma->vm_flags = new_flags;

out_convert_errno:
        /*
         * madvise() returns EAGAIN if kernel resources, such as
         * slab, are temporarily unavailable.
         */
        if (error == -ENOMEM)
                error = -EAGAIN;
out:
        return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
                                 unsigned long end, struct mm_walk *walk)
{
        pte_t *orig_pte;
        struct vm_area_struct *vma = walk->private;
        unsigned long index;

        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                return 0;

        for (index = start; index != end; index += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
                struct page *page;
                spinlock_t *ptl;

                orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
                pte_unmap_unlock(orig_pte, ptl);

                if (pte_present(pte) || pte_none(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
                        continue;

                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                             vma, index, false);
                if (page)
                        put_page(page);
        }

        return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
        .pmd_entry = swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
                                       unsigned long start, unsigned long end,
                                       struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
        pgoff_t end_index = end / PAGE_SIZE;
        struct page *page;

        rcu_read_lock();
        xas_for_each(&xas, page, end_index) {
                swp_entry_t swap;

                if (!xa_is_value(page))
                        continue;
                xas_pause(&xas);
                rcu_read_unlock();

                swap = radix_to_swp_entry(page);
                page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
                                             NULL, 0, false);
                if (page)
                        put_page(page);

                rcu_read_lock();
        }
        rcu_read_unlock();

        lru_add_drain();        /* Push any new pages onto the LRU now */
}
#endif  /* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct file *file = vma->vm_file;
        loff_t offset;

        *prev = vma;
#ifdef CONFIG_SWAP
        if (!file) {
                walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
                lru_add_drain(); /* Push any new pages onto the LRU now */
                return 0;
        }

        if (shmem_mapping(file->f_mapping)) {
                force_shm_swapin_readahead(vma, start, end,
                                           file->f_mapping);
                return 0;
        }
#else
        if (!file)
                return -EBADF;
#endif

        if (IS_DAX(file_inode(file))) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        /*
         * Filesystem's fadvise may need to take various locks.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.
         */
        *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
        get_file(file);
        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
        mmap_read_unlock(current->mm);
        vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
        fput(file);
        mmap_read_lock(current->mm);
        return 0;
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                                             unsigned long addr, unsigned long end,
                                             struct mm_walk *walk)
{
        struct madvise_walk_private *private = walk->private;
        struct mmu_gather *tlb = private->tlb;
        bool pageout = private->pageout;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        pte_t *orig_pte, *pte, ptent;
        spinlock_t *ptl;
        struct page *page = NULL;
        LIST_HEAD(page_list);

        if (fatal_signal_pending(current))
                return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmd)) {
                pmd_t orig_pmd;
                unsigned long next = pmd_addr_end(addr, end);

                tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
                ptl = pmd_trans_huge_lock(pmd, vma);
                if (!ptl)
                        return 0;

                orig_pmd = *pmd;
                if (is_huge_zero_pmd(orig_pmd))
                        goto huge_unlock;

                if (unlikely(!pmd_present(orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                  !is_pmd_migration_entry(orig_pmd));
                        goto huge_unlock;
                }

                page = pmd_page(orig_pmd);

                /* Do not interfere with other mappings of this page */
                if (page_mapcount(page) != 1)
                        goto huge_unlock;

                if (next - addr != HPAGE_PMD_SIZE) {
                        int err;

                        get_page(page);
                        spin_unlock(ptl);
                        lock_page(page);
                        err = split_huge_page(page);
                        unlock_page(page);
                        put_page(page);
                        if (!err)
                                goto regular_page;
                        return 0;
                }

                if (pmd_young(orig_pmd)) {
                        pmdp_invalidate(vma, addr, pmd);
                        orig_pmd = pmd_mkold(orig_pmd);

                        set_pmd_at(mm, addr, pmd, orig_pmd);
                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                }

                ClearPageReferenced(page);
                test_and_clear_page_young(page);
                if (pageout) {
                        if (!isolate_lru_page(page)) {
                                if (PageUnevictable(page))
                                        putback_lru_page(page);
                                else
                                        list_add(&page->lru, &page_list);
                        }
                } else
                        deactivate_page(page);
huge_unlock:
                spin_unlock(ptl);
                if (pageout)
                        reclaim_pages(&page_list);
                return 0;
        }

regular_page:
        if (pmd_trans_unstable(pmd))
                return 0;
#endif
        tlb_change_page_size(tlb, PAGE_SIZE);
        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr < end; pte++, addr += PAGE_SIZE) {
                ptent = *pte;

                if (pte_none(ptent))
                        continue;

                if (!pte_present(ptent))
                        continue;

                page = vm_normal_page(vma, addr, ptent);
                if (!page)
                        continue;

                /*
                 * Splitting a THP page is expensive, so split it only if we
                 * are sure it's worth it, i.e. only if we are the sole
                 * owner.
                 */
                if (PageTransCompound(page)) {
                        if (page_mapcount(page) != 1)
                                break;
                        get_page(page);
                        if (!trylock_page(page)) {
                                put_page(page);
                                break;
                        }
                        pte_unmap_unlock(orig_pte, ptl);
                        if (split_huge_page(page)) {
                                unlock_page(page);
                                put_page(page);
                                pte_offset_map_lock(mm, pmd, addr, &ptl);
                                break;
                        }
                        unlock_page(page);
                        put_page(page);
                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }

                /* Do not interfere with other mappings of this page */
                if (page_mapcount(page) != 1)
                        continue;

                VM_BUG_ON_PAGE(PageTransCompound(page), page);

                if (pte_young(ptent)) {
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        ptent = pte_mkold(ptent);
                        set_pte_at(mm, addr, pte, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }

                /*
                 * We are deactivating a page to accelerate its reclaim.
                 * The VM cannot reclaim the page unless we clear PG_young.
                 * As a side effect, this confuses idle-page tracking,
                 * which will miss the recent reference history.
                 */
                ClearPageReferenced(page);
                test_and_clear_page_young(page);
                if (pageout) {
                        if (!isolate_lru_page(page)) {
                                if (PageUnevictable(page))
                                        putback_lru_page(page);
                                else
                                        list_add(&page->lru, &page_list);
                        }
                } else
                        deactivate_page(page);
        }

        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_pte, ptl);
        if (pageout)
                reclaim_pages(&page_list);
        cond_resched();

        return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
        .pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
                                    struct vm_area_struct *vma,
                                    unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = false,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

static long madvise_cold(struct vm_area_struct *vma,
                         struct vm_area_struct **prev,
                         unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
        madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
        tlb_finish_mmu(&tlb, start_addr, end_addr);

        return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
                                       struct vm_area_struct *vma,
                                       unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = true,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
        if (vma_is_anonymous(vma))
                return true;
        if (!vma->vm_file)
                return false;
        /*
         * Page out pagecache only for non-anonymous mappings that correspond
         * to files the calling process could (if it tried) open for writing;
         * otherwise we'd be including shared non-exclusive mappings, which
         * opens a side channel.
         */
        return inode_owner_or_capable(file_inode(vma->vm_file)) ||
                inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
                            struct vm_area_struct **prev,
                            unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        if (!can_do_pageout(vma))
                return 0;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
        madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
        tlb_finish_mmu(&tlb, start_addr, end_addr);

        return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                  unsigned long end, struct mm_walk *walk)
{
        struct mmu_gather *tlb = walk->private;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *orig_pte, *pte, ptent;
        struct page *page;
        int nr_swap = 0;
        unsigned long next;

        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
                        goto next;

        if (pmd_trans_unstable(pmd))
                return 0;

        tlb_change_page_size(tlb, PAGE_SIZE);
        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = *pte;

                if (pte_none(ptent))
                        continue;
                /*
                 * If the pte has a swp_entry, just clear the page table to
                 * prevent swap-in, which is more expensive than
                 * (page allocation + zeroing).
                 */
                if (!pte_present(ptent)) {
                        swp_entry_t entry;

                        entry = pte_to_swp_entry(ptent);
                        if (non_swap_entry(entry))
                                continue;
                        nr_swap--;
                        free_swap_and_cache(entry);
                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        continue;
                }

                page = vm_normal_page(vma, addr, ptent);
                if (!page)
                        continue;

                /*
                 * If pmd isn't transhuge but the page is THP and
                 * is owned by only this process, split it and
                 * deactivate all pages.
                 */
                if (PageTransCompound(page)) {
                        if (page_mapcount(page) != 1)
                                goto out;
                        get_page(page);
                        if (!trylock_page(page)) {
                                put_page(page);
                                goto out;
                        }
                        pte_unmap_unlock(orig_pte, ptl);
                        if (split_huge_page(page)) {
                                unlock_page(page);
                                put_page(page);
                                pte_offset_map_lock(mm, pmd, addr, &ptl);
                                goto out;
                        }
                        unlock_page(page);
                        put_page(page);
                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }

                VM_BUG_ON_PAGE(PageTransCompound(page), page);

                if (PageSwapCache(page) || PageDirty(page)) {
                        if (!trylock_page(page))
                                continue;
                        /*
                         * If the page is shared with others, we cannot clear
                         * PG_dirty of the page.
                         */
                        if (page_mapcount(page) != 1) {
                                unlock_page(page);
                                continue;
                        }

                        if (PageSwapCache(page) && !try_to_free_swap(page)) {
                                unlock_page(page);
                                continue;
                        }

                        ClearPageDirty(page);
                        unlock_page(page);
                }

                if (pte_young(ptent) || pte_dirty(ptent)) {
                        /*
                         * Some architectures (e.g. PPC) don't update the TLB
                         * with set_pte_at() and tlb_remove_tlb_entry(), so
                         * for portability, remap the pte as old|clean after
                         * clearing it.
                         */
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);

                        ptent = pte_mkold(ptent);
                        ptent = pte_mkclean(ptent);
                        set_pte_at(mm, addr, pte, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }
                mark_page_lazyfree(page);
        }
out:
        if (nr_swap) {
                if (current->mm == mm)
                        sync_mm_rss(mm);

                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
        }
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_pte, ptl);
        cond_resched();
next:
        return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
        .pmd_entry = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
                                   unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        /* MADV_FREE works for only anon vma at the moment */
        if (!vma_is_anonymous(vma))
                return -EINVAL;

        range.start = max(vma->vm_start, start_addr);
        if (range.start >= vma->vm_end)
                return -EINVAL;
        range.end = min(vma->vm_end, end_addr);
        if (range.end <= vma->vm_start)
                return -EINVAL;
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                range.start, range.end);

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, range.start, range.end);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(&range);
        tlb_start_vma(&tlb, vma);
        walk_page_range(vma->vm_mm, range.start, range.end,
                        &madvise_free_walk_ops, &tlb);
        tlb_end_vma(&tlb, vma);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb, range.start, range.end);

        return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
{
        zap_page_range(vma, start, end - start);
        return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
                                  struct vm_area_struct **prev,
                                  unsigned long start, unsigned long end,
                                  int behavior)
{
        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        if (!userfaultfd_remove(vma, start, end)) {
                *prev = NULL; /* mmap_lock has been dropped, prev is stale */

                mmap_read_lock(current->mm);
                vma = find_vma(current->mm, start);
                if (!vma)
                        return -ENOMEM;
                if (start < vma->vm_start) {
                        /*
                         * This "vma" under revalidation is the one
                         * with the lowest vma->vm_start where start
                         * is also < vma->vm_end.  If start <
                         * vma->vm_start it means a hole materialized
                         * in the user address space within the
                         * virtual range passed to MADV_DONTNEED
                         * or MADV_FREE.
                         */
                        return -ENOMEM;
                }
                if (!can_madv_lru_vma(vma))
                        return -EINVAL;
                if (end > vma->vm_end) {
                        /*
                         * Don't fail if end > vma->vm_end.  If the old
                         * vma was split while the mmap_lock was
                         * released, the concurrent operation must not
                         * leave madvise() with an undefined result.
                         * There may be an adjacent next vma that we'll
                         * walk next.  userfaultfd_remove() will generate
                         * an UFFD_EVENT_REMOVE repetition on the
                         * end-vma->vm_end range, but the manager can
                         * handle a repetition fine.
                         */
                        end = vma->vm_end;
                }
                VM_WARN_ON(start >= end);
        }

        if (behavior == MADV_DONTNEED)
                return madvise_dontneed_single_vma(vma, start, end);
        else if (behavior == MADV_FREE)
                return madvise_free_single_vma(vma, start, end);
        else
                return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
                           struct vm_area_struct **prev,
                           unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;

        *prev = NULL;   /* tell sys_madvise we drop mmap_lock */

        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host) {
                return -EINVAL;
        }

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_mutex.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.
         */
        get_file(f);
        if (userfaultfd_remove(vma, start, end)) {
                /* mmap_lock was not released by userfaultfd_remove() */
                mmap_read_unlock(current->mm);
        }
        error = vfs_fallocate(f,
                              FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                              offset, end - start);
        fput(f);
        mmap_read_lock(current->mm);
        return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
                                unsigned long start, unsigned long end)
{
        struct page *page;
        struct zone *zone;
        unsigned long size;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        for (; start < end; start += size) {
                unsigned long pfn;
                int ret;

                ret = get_user_pages_fast(start, 1, 0, &page);
                if (ret != 1)
                        return ret;
                pfn = page_to_pfn(page);

                /*
                 * When soft offlining hugepages, after migrating the page
                 * we dissolve it, therefore in the second loop "page" will
                 * no longer be a compound page.
                 */
                size = page_size(compound_head(page));

                if (PageHWPoison(page)) {
                        put_page(page);
                        continue;
                }

                if (behavior == MADV_SOFT_OFFLINE) {
                        pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
                                pfn, start);

                        ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
                        if (ret)
                                return ret;
                        continue;
                }

                pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
                        pfn, start);

                /*
                 * Drop the page reference taken by get_user_pages_fast(). In
                 * the absence of MF_COUNT_INCREASED the memory_failure()
                 * routine is responsible for pinning the page to prevent it
                 * from being released back to the page allocator.
                 */
                put_page(page);
                ret = memory_failure(pfn, 0);
                if (ret)
                        return ret;
        }

        /* Ensure that all poisoned pages are removed from per-cpu lists */
        for_each_populated_zone(zone)
                drain_all_pages(zone);

        return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
            unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_COLD:
                return madvise_cold(vma, prev, start, end);
        case MADV_PAGEOUT:
                return madvise_pageout(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

static bool
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_FREE:
        case MADV_COLD:
        case MADV_PAGEOUT:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
#endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
        case MADV_WIPEONFORK:
        case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
        case MADV_SOFT_OFFLINE:
        case MADV_HWPOISON:
#endif
                return true;

        default:
                return false;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the
 *              application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *              where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *              pages and associated backing store.
 *  MADV_DONTFORK - omit this area from the child's address space when forking:
 *              typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *              range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger the memory error handler as if the given memory range
 *              were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *              this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *              huge pages in the future. Existing pages might be coalesced and
 *              new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *              transparent huge pages so the existing pages will not be
 *              coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *              from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or application
 *              is attempting to release locked or shared pages,
 *              or the specified address range includes file, Huge TLB,
 *              MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
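 *
 * Illustrative usage (a userspace sketch, not part of this file's kernel
 * code): a caller that is done with a page-aligned anonymous mapping can
 * drop its contents instead of letting them be written to swap.  "buf"
 * and "len" are assumed to come from an earlier anonymous mmap():
 *
 *      #include <sys/mman.h>
 *
 *      if (madvise(buf, len, MADV_DONTNEED) == -1)
 *              perror("madvise");
 *
 * Subsequent accesses to the range see zero-filled pages, per the
 * MADV_DONTNEED description above.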
 */
int do_madvise(unsigned long start, size_t len_in, int behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;
        struct blk_plug plug;

        start = untagged_addr(start);

        if (!madvise_behavior_valid(behavior))
                return error;

        if (!PAGE_ALIGNED(start))
                return error;
        len = PAGE_ALIGN(len_in);

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len_in && !len)
                return error;

        end = start + len;
        if (end < start)
                return error;

        error = 0;
        if (end == start)
                return error;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_inject_error(behavior, start, start + len_in);
#endif

        write = madvise_need_mmap_write(behavior);
        if (write) {
                if (mmap_write_lock_killable(current->mm))
                        return -EINTR;

                /*
                 * We may have stolen the mm from another process
                 * that is undergoing core dumping.
                 *
                 * Right now that's io_uring, in the future it may
                 * be remote process management and not "current"
                 * at all.
                 *
                 * We need to fix core dumping to not do this,
                 * but for now we have the mmget_still_valid()
                 * model.
                 */
                if (!mmget_still_valid(current->mm)) {
                        mmap_write_unlock(current->mm);
                        return -EINTR;
                }
        } else {
                mmap_read_lock(current->mm);
        }

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        blk_start_plug(&plug);
        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_lock */
                        vma = find_vma(current->mm, start);
        }
out:
        blk_finish_plug(&plug);
        if (write)
                mmap_write_unlock(current->mm);
        else
                mmap_read_unlock(current->mm);

        return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        return do_madvise(start, len_in, behavior);
}