// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	if (vma->vm_file)
		return NULL;

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */

/*
 * Update the vm_flags on a region of a vma, splitting it or merging it as
 * necessary. Must be called with mmap_lock held for writing;
 * the caller should ensure anon_name stability by raising its refcount even
 * when anon_name belongs to a valid vma, because this function might free
 * that vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	pgoff_t pgoff;

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, anon_name);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, start, 1);
		if (error)
			return error;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, end, 0);
		if (error)
			return error;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;
	if (!vma->vm_file) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
				 unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					   file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it. Split it only if we are the sole
		 * owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page to accelerate its reclaim.
		 * The VM cannot reclaim the page unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
				    struct vm_area_struct *vma,
				    unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

static long madvise_cold(struct vm_area_struct *vma,
			 struct vm_area_struct **prev,
			 unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
				       struct vm_area_struct *vma,
				       unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline
bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * Page out pagecache only for non-anonymous mappings that correspond
	 * to files the calling process could (if it tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&init_user_ns,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			    struct vm_area_struct **prev,
			    unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				  unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent a swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is THP and
		 * is owned only by this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we cannot clear
			 * PG_dirty of the page.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so
			 * for portability, remap the pte as old and clean
			 * after clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
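 *
 * Illustrative userspace sketch (not part of this file; "arena" and
 * "arena_size" are placeholder names): a user-space allocator can return a
 * freed arena to the kernel while keeping the mapping itself in place:
 *
 *	madvise(arena, arena_size, MADV_DONTNEED);
 *
 * For a private anonymous mapping, the next touch of that range faults in
 * fresh zero-filled pages.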
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	*end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
	return true;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released, the concurrent operation must not
			 * cause madvise() to have an undefined
			 * result. There may be an adjacent next vma
			 * that we'll walk next. userfaultfd_remove()
			 * will generate an UFFD_EVENT_REMOVE
			 * repetition on the end-vma->vm_end range,
			 * but the manager can handle a repetition
			 * fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long tmp_end;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/*
		 * We might have temporarily dropped the lock. For example,
		 * our VMA might have been split.
		 */
		if (!vma || start >= vma->vm_end) {
			vma = vma_lookup(mm, start);
			if (!vma)
				return -ENOMEM;
		}

		tmp_end = min_t(unsigned long, end, vma->vm_end);
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_vma_page_range(vma, start, tmp_end, write,
					       &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM:
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
			   struct vm_area_struct **prev,
			   unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
			      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

/*
 * Apply a madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return madvise_populate(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
		return true;
	default:
		return false;
	}
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file)
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
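 *
 * Illustrative userspace sketch (not part of the kernel sources; "buf" and
 * "len" are placeholder names): advising the kernel that a private anonymous
 * scratch buffer is unlikely to be needed soon:
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	...
 *	if (madvise(buf, len, MADV_COLD) != 0)
 *		perror("madvise");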
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	blk_start_plug(&plug);
	error = madvise_walk_vmas(mm, start, end, behavior,
			madvise_vma_behavior);
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (!capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	ret = (total_len - iov_iter_count(&iter)) ? : ret;

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}