1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/madvise.c 4 * 5 * Copyright (C) 1999 Linus Torvalds 6 * Copyright (C) 2002 Christoph Hellwig 7 */ 8 9 #include <linux/mman.h> 10 #include <linux/pagemap.h> 11 #include <linux/syscalls.h> 12 #include <linux/mempolicy.h> 13 #include <linux/page-isolation.h> 14 #include <linux/page_idle.h> 15 #include <linux/userfaultfd_k.h> 16 #include <linux/hugetlb.h> 17 #include <linux/falloc.h> 18 #include <linux/fadvise.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/mm_inline.h> 22 #include <linux/string.h> 23 #include <linux/uio.h> 24 #include <linux/ksm.h> 25 #include <linux/fs.h> 26 #include <linux/file.h> 27 #include <linux/blkdev.h> 28 #include <linux/backing-dev.h> 29 #include <linux/pagewalk.h> 30 #include <linux/swap.h> 31 #include <linux/swapops.h> 32 #include <linux/shmem_fs.h> 33 #include <linux/mmu_notifier.h> 34 35 #include <asm/tlb.h> 36 37 #include "internal.h" 38 #include "swap.h" 39 40 struct madvise_walk_private { 41 struct mmu_gather *tlb; 42 bool pageout; 43 }; 44 45 /* 46 * Any behaviour which results in changes to the vma->vm_flags needs to 47 * take mmap_lock for writing. Others, which simply traverse vmas, need 48 * to only take it for reading. 49 */ 50 static int madvise_need_mmap_write(int behavior) 51 { 52 switch (behavior) { 53 case MADV_REMOVE: 54 case MADV_WILLNEED: 55 case MADV_DONTNEED: 56 case MADV_DONTNEED_LOCKED: 57 case MADV_COLD: 58 case MADV_PAGEOUT: 59 case MADV_FREE: 60 case MADV_POPULATE_READ: 61 case MADV_POPULATE_WRITE: 62 case MADV_COLLAPSE: 63 return 0; 64 default: 65 /* be safe, default to 1. 
list exceptions explicitly */ 66 return 1; 67 } 68 } 69 70 #ifdef CONFIG_ANON_VMA_NAME 71 struct anon_vma_name *anon_vma_name_alloc(const char *name) 72 { 73 struct anon_vma_name *anon_name; 74 size_t count; 75 76 /* Add 1 for NUL terminator at the end of the anon_name->name */ 77 count = strlen(name) + 1; 78 anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); 79 if (anon_name) { 80 kref_init(&anon_name->kref); 81 memcpy(anon_name->name, name, count); 82 } 83 84 return anon_name; 85 } 86 87 void anon_vma_name_free(struct kref *kref) 88 { 89 struct anon_vma_name *anon_name = 90 container_of(kref, struct anon_vma_name, kref); 91 kfree(anon_name); 92 } 93 94 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) 95 { 96 mmap_assert_locked(vma->vm_mm); 97 98 return vma->anon_name; 99 } 100 101 /* mmap_lock should be write-locked */ 102 static int replace_anon_vma_name(struct vm_area_struct *vma, 103 struct anon_vma_name *anon_name) 104 { 105 struct anon_vma_name *orig_name = anon_vma_name(vma); 106 107 if (!anon_name) { 108 vma->anon_name = NULL; 109 anon_vma_name_put(orig_name); 110 return 0; 111 } 112 113 if (anon_vma_name_eq(orig_name, anon_name)) 114 return 0; 115 116 vma->anon_name = anon_vma_name_reuse(anon_name); 117 anon_vma_name_put(orig_name); 118 119 return 0; 120 } 121 #else /* CONFIG_ANON_VMA_NAME */ 122 static int replace_anon_vma_name(struct vm_area_struct *vma, 123 struct anon_vma_name *anon_name) 124 { 125 if (anon_name) 126 return -EINVAL; 127 128 return 0; 129 } 130 #endif /* CONFIG_ANON_VMA_NAME */ 131 /* 132 * Update the vm_flags on region of a vma, splitting it or merging it as 133 * necessary. Must be called with mmap_lock held for writing; 134 * Caller should ensure anon_name stability by raising its refcount even when 135 * anon_name belongs to a valid vma because this function might free that vma. 
 *
 * @prev is passed to vma_merge() as the merge candidate and, on return, points
 * at the vma that carries the new attributes (or at @vma on the no-op path).
 * Returns 0 on success, otherwise the error from split_vma() or
 * replace_anon_vma_name().
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	pgoff_t pgoff;
	VMA_ITERATOR(vmi, mm, start);

	/* Nothing to change: neither the flags nor the name differ. */
	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	/* Page offset of @start, needed by vma_merge(). */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(&vmi, mm, *prev, start, end, new_flags,
			  vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, anon_name);
	if (*prev) {
		/* Merged: continue with the merged vma. */
		vma = *prev;
		goto success;
	}

	*prev = vma;

	/* No merge: carve [start, end) out of @vma if it is larger. */
	if (start != vma->vm_start) {
		error = split_vma(&vmi, vma, start, 1);
		if (error)
			return error;
	}

	if (end != vma->vm_end) {
		error = split_vma(&vmi, vma, end, 0);
		if (error)
			return error;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vm_flags_reset(vma, new_flags);
	/* Names are only applied to non-file vmas and anonymous shmem. */
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
/*
 * pmd_entry callback of swapin_walk_ops (used by madvise_willneed() for vmas
 * without a file): start an asynchronous swap read for every real swap entry
 * found in [start, end). walk->private is the vma. Always returns 0.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;

		/*
		 * ptep is NULL on the first pass and whenever the PTE lock
		 * was dropped below; then map and lock the PTE for addr.
		 * Otherwise the post-increment steps to the next PTE.
		 */
		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		/* Drop the PTE lock before issuing the read. */
		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, addr, false, &splug);
		/* Only the read is wanted; drop the returned page reference. */
		if (page)
			put_page(page);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

/* Page-walk operations for MADV_WILLNEED on vmas without a backing file. */
static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
};

/*
 * MADV_WILLNEED for shmem: scan @mapping's page cache over the index range
 * backing [start, end) of @vma and start an asynchronous swap read for each
 * entry that is stored as a swap value.
 */
static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct page *page;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		/* Skip entries that are not value entries. */
		if (!xa_is_value(page))
			continue;
		entry = radix_to_swp_entry(page);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		/* User address in @vma that corresponds to this index. */
		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		/* Pause the iteration and leave the RCU section for the read. */
		xas_pause(&xas);
		rcu_read_unlock();

		page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					     vma, addr, false, &splug);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 *
 * With CONFIG_SWAP, vmas without a file and shmem mappings get swap-in reads
 * started; without CONFIG_SWAP a vma with no file yields -EBADF. DAX files
 * are ignored. Any other file is handed to vfs_fadvise() with mmap_lock
 * dropped, in which case *prev is set to NULL. Returns 0 in all other cases.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	/* File offset of @start; computed while the vma is still stable. */
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

/*
 * May the caller page out page cache belonging to @vma's file?
 * Always false for vmas without a file.
 */
static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

/*
 * pmd_entry callback shared by MADV_COLD and MADV_PAGEOUT (cold_walk_ops).
 * walk->private is a struct madvise_walk_private; its ->pageout flag selects
 * between deactivating folios (false) and isolating them from the LRU and
 * passing them to reclaim_pages() (true).
 *
 * Only folios with a mapcount of exactly one are processed. When pageout is
 * requested on a non-anonymous vma for which can_do_file_pageout() fails,
 * only anonymous folios are processed.
 *
 * Returns -EINTR if a fatal signal is pending, 0 otherwise.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;

	if (fatal_signal_pending(current))
		return -EINTR;

	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pfn_folio(pmd_pfn(orig_pmd));

		/* Do not interfere with other mappings of this folio */
		if (folio_mapcount(folio) != 1)
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		/*
		 * The range covers only part of the huge page: split the
		 * folio (with ptl released and the folio locked) and, on
		 * success, handle the range PTE by PTE below.
		 */
		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		/* Clear the young bit in the huge pmd. */
		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				/* Unevictable folios go straight back to the LRU. */
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * Creating a THP page is expensive so split it only if we
		 * are sure it's worth. Split it if we are only owner.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_mapcount(folio) != 1)
				break;
			if (pageout_anon_only_filter && !folio_test_anon(folio))
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			/* Release the PTE lock around the split. */
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			/*
			 * Step back one slot so that the loop increment lands
			 * on the same addr again and re-examines it.
			 */
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/*
		 * Do not interfere with other mappings of this folio and
		 * non-LRU folio.
		 */
		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

		/* Clear the young bit in the pte. */
		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a folio to accelerate its reclaim.
		 * The VM cannot reclaim the folio unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking
		 * because it will miss recently referenced history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	/* start_pte is NULL if the loop ended with the PTE lock released. */
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}

/* Page-walk operations shared by MADV_COLD and MADV_PAGEOUT. */
static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

/* Run the cold walk (pageout == false) over [addr, end) of @vma. */
static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

/* MADV_COLD/MADV_PAGEOUT are refused on locked, PFN-mapped and hugetlb vmas. */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

/*
 * MADV_COLD on [start_addr, end_addr) of @vma.
 * Returns -EINVAL if can_madv_lru_vma() rejects the vma, 0 otherwise.
 */
static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

/* Run the cold walk in pageout mode (pageout == true) over [addr, end). */
static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

/*
 * MADV_PAGEOUT on [start_addr, end_addr) of @vma.
 * Returns -EINVAL if can_madv_lru_vma() rejects the vma, 0 otherwise
 * (including when the vma is skipped by the check below).
 */
static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out if even this process is neither
	 * owner nor write capable of the file. We allow private file mappings
	 * further to pageout dirty anon pages.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * pmd_entry callback for MADV_FREE (madvise_free_walk_ops). walk->private is
 * the mmu_gather. Swap entries in the range are freed and their PTEs cleared;
 * present folios mapped exactly once are made clean and old and marked
 * lazyfree. Always returns 0.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	/* Counts freed swap entries downwards, so it is zero or negative. */
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				nr_swap--;
				free_swap_and_cache(entry);
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If pmd isn't transhuge but the folio is large and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_mapcount(folio) != 1)
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			/* Release the PTE lock around the split. */
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			/*
			 * Step back one slot so that the loop increment lands
			 * on the same addr again and re-examines it.
			 */
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If folio is shared with others, we mustn't clear
			 * the folio's dirty flag.
			 */
			if (folio_mapcount(folio) != 1) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some of architecture(ex, PPC) don't update TLB
			 * with set_pte_at and tlb_remove_tlb_entry so for
			 * the portability, remap the pte with old|clean
			 * after pte clearing.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		folio_mark_lazyfree(folio);
	}

	/* nr_swap is negative here, so this lowers MM_SWAPENTS. */
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	/* start_pte is NULL if the loop ended with the PTE lock released. */
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}

/* Page-walk operations for MADV_FREE. */
static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
};

/*
 * MADV_FREE on the part of [start_addr, end_addr) that lies inside @vma.
 * Returns -EINVAL for non-anonymous vmas or when the range does not
 * intersect the vma, 0 otherwise.
 */
static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	/* Clamp the request to the vma. */
	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.
The 805 * zap_page_range_single call sets things up for shrink_active_list to actually 806 * free these pages later if no one else has touched them in the meantime, 807 * although we could add these pages to a global reuse list for 808 * shrink_active_list to pick up before reclaiming other pages. 809 * 810 * NB: This interface discards data rather than pushes it out to swap, 811 * as some implementations do. This has performance implications for 812 * applications like large transactional databases which want to discard 813 * pages in anonymous maps after committing to backing store the data 814 * that was kept in them. There is no reason to write this data out to 815 * the swap area if the application is discarding it. 816 * 817 * An interface that causes the system to free clean pages and flush 818 * dirty pages is already available as msync(MS_INVALIDATE). 819 */ 820 static long madvise_dontneed_single_vma(struct vm_area_struct *vma, 821 unsigned long start, unsigned long end) 822 { 823 zap_page_range_single(vma, start, end - start, NULL); 824 return 0; 825 } 826 827 static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, 828 unsigned long start, 829 unsigned long *end, 830 int behavior) 831 { 832 if (!is_vm_hugetlb_page(vma)) { 833 unsigned int forbidden = VM_PFNMAP; 834 835 if (behavior != MADV_DONTNEED_LOCKED) 836 forbidden |= VM_LOCKED; 837 838 return !(vma->vm_flags & forbidden); 839 } 840 841 if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) 842 return false; 843 if (start & ~huge_page_mask(hstate_vma(vma))) 844 return false; 845 846 /* 847 * Madvise callers expect the length to be rounded up to PAGE_SIZE 848 * boundaries, and may be unaware that this VMA uses huge pages. 849 * Avoid unexpected data loss by rounding down the number of 850 * huge pages freed. 
851 */ 852 *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); 853 854 return true; 855 } 856 857 static long madvise_dontneed_free(struct vm_area_struct *vma, 858 struct vm_area_struct **prev, 859 unsigned long start, unsigned long end, 860 int behavior) 861 { 862 struct mm_struct *mm = vma->vm_mm; 863 864 *prev = vma; 865 if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) 866 return -EINVAL; 867 868 if (start == end) 869 return 0; 870 871 if (!userfaultfd_remove(vma, start, end)) { 872 *prev = NULL; /* mmap_lock has been dropped, prev is stale */ 873 874 mmap_read_lock(mm); 875 vma = vma_lookup(mm, start); 876 if (!vma) 877 return -ENOMEM; 878 /* 879 * Potential end adjustment for hugetlb vma is OK as 880 * the check below keeps end within vma. 881 */ 882 if (!madvise_dontneed_free_valid_vma(vma, start, &end, 883 behavior)) 884 return -EINVAL; 885 if (end > vma->vm_end) { 886 /* 887 * Don't fail if end > vma->vm_end. If the old 888 * vma was split while the mmap_lock was 889 * released the effect of the concurrent 890 * operation may not cause madvise() to 891 * have an undefined result. There may be an 892 * adjacent next vma that we'll walk 893 * next. userfaultfd_remove() will generate an 894 * UFFD_EVENT_REMOVE repetition on the 895 * end-vma->vm_end range, but the manager can 896 * handle a repetition fine. 
897 */ 898 end = vma->vm_end; 899 } 900 VM_WARN_ON(start >= end); 901 } 902 903 if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) 904 return madvise_dontneed_single_vma(vma, start, end); 905 else if (behavior == MADV_FREE) 906 return madvise_free_single_vma(vma, start, end); 907 else 908 return -EINVAL; 909 } 910 911 static long madvise_populate(struct vm_area_struct *vma, 912 struct vm_area_struct **prev, 913 unsigned long start, unsigned long end, 914 int behavior) 915 { 916 const bool write = behavior == MADV_POPULATE_WRITE; 917 struct mm_struct *mm = vma->vm_mm; 918 unsigned long tmp_end; 919 int locked = 1; 920 long pages; 921 922 *prev = vma; 923 924 while (start < end) { 925 /* 926 * We might have temporarily dropped the lock. For example, 927 * our VMA might have been split. 928 */ 929 if (!vma || start >= vma->vm_end) { 930 vma = vma_lookup(mm, start); 931 if (!vma) 932 return -ENOMEM; 933 } 934 935 tmp_end = min_t(unsigned long, end, vma->vm_end); 936 /* Populate (prefault) page tables readable/writable. */ 937 pages = faultin_vma_page_range(vma, start, tmp_end, write, 938 &locked); 939 if (!locked) { 940 mmap_read_lock(mm); 941 locked = 1; 942 *prev = NULL; 943 vma = NULL; 944 } 945 if (pages < 0) { 946 switch (pages) { 947 case -EINTR: 948 return -EINTR; 949 case -EINVAL: /* Incompatible mappings / permissions. */ 950 return -EINVAL; 951 case -EHWPOISON: 952 return -EHWPOISON; 953 case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ 954 return -EFAULT; 955 default: 956 pr_warn_once("%s: unhandled return value: %ld\n", 957 __func__, pages); 958 fallthrough; 959 case -ENOMEM: 960 return -ENOMEM; 961 } 962 } 963 start += pages * PAGE_SIZE; 964 } 965 return 0; 966 } 967 968 /* 969 * Application wants to free up the pages and associated backing store. 970 * This is effectively punching a hole into the middle of a file. 
971 */ 972 static long madvise_remove(struct vm_area_struct *vma, 973 struct vm_area_struct **prev, 974 unsigned long start, unsigned long end) 975 { 976 loff_t offset; 977 int error; 978 struct file *f; 979 struct mm_struct *mm = vma->vm_mm; 980 981 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 982 983 if (vma->vm_flags & VM_LOCKED) 984 return -EINVAL; 985 986 f = vma->vm_file; 987 988 if (!f || !f->f_mapping || !f->f_mapping->host) { 989 return -EINVAL; 990 } 991 992 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 993 return -EACCES; 994 995 offset = (loff_t)(start - vma->vm_start) 996 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 997 998 /* 999 * Filesystem's fallocate may need to take i_rwsem. We need to 1000 * explicitly grab a reference because the vma (and hence the 1001 * vma's reference to the file) can go away as soon as we drop 1002 * mmap_lock. 1003 */ 1004 get_file(f); 1005 if (userfaultfd_remove(vma, start, end)) { 1006 /* mmap_lock was not released by userfaultfd_remove() */ 1007 mmap_read_unlock(mm); 1008 } 1009 error = vfs_fallocate(f, 1010 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1011 offset, end - start); 1012 fput(f); 1013 mmap_read_lock(mm); 1014 return error; 1015 } 1016 1017 /* 1018 * Apply an madvise behavior to a region of a vma. madvise_update_vma 1019 * will handle splitting a vm area into separate areas, each area with its own 1020 * behavior. 
1021 */ 1022 static int madvise_vma_behavior(struct vm_area_struct *vma, 1023 struct vm_area_struct **prev, 1024 unsigned long start, unsigned long end, 1025 unsigned long behavior) 1026 { 1027 int error; 1028 struct anon_vma_name *anon_name; 1029 unsigned long new_flags = vma->vm_flags; 1030 1031 switch (behavior) { 1032 case MADV_REMOVE: 1033 return madvise_remove(vma, prev, start, end); 1034 case MADV_WILLNEED: 1035 return madvise_willneed(vma, prev, start, end); 1036 case MADV_COLD: 1037 return madvise_cold(vma, prev, start, end); 1038 case MADV_PAGEOUT: 1039 return madvise_pageout(vma, prev, start, end); 1040 case MADV_FREE: 1041 case MADV_DONTNEED: 1042 case MADV_DONTNEED_LOCKED: 1043 return madvise_dontneed_free(vma, prev, start, end, behavior); 1044 case MADV_POPULATE_READ: 1045 case MADV_POPULATE_WRITE: 1046 return madvise_populate(vma, prev, start, end, behavior); 1047 case MADV_NORMAL: 1048 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 1049 break; 1050 case MADV_SEQUENTIAL: 1051 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 1052 break; 1053 case MADV_RANDOM: 1054 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 1055 break; 1056 case MADV_DONTFORK: 1057 new_flags |= VM_DONTCOPY; 1058 break; 1059 case MADV_DOFORK: 1060 if (vma->vm_flags & VM_IO) 1061 return -EINVAL; 1062 new_flags &= ~VM_DONTCOPY; 1063 break; 1064 case MADV_WIPEONFORK: 1065 /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ 1066 if (vma->vm_file || vma->vm_flags & VM_SHARED) 1067 return -EINVAL; 1068 new_flags |= VM_WIPEONFORK; 1069 break; 1070 case MADV_KEEPONFORK: 1071 new_flags &= ~VM_WIPEONFORK; 1072 break; 1073 case MADV_DONTDUMP: 1074 new_flags |= VM_DONTDUMP; 1075 break; 1076 case MADV_DODUMP: 1077 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) 1078 return -EINVAL; 1079 new_flags &= ~VM_DONTDUMP; 1080 break; 1081 case MADV_MERGEABLE: 1082 case MADV_UNMERGEABLE: 1083 error = ksm_madvise(vma, start, end, behavior, &new_flags); 1084 if (error) 1085 goto out; 1086 break; 1087 case MADV_HUGEPAGE: 1088 case MADV_NOHUGEPAGE: 1089 error = hugepage_madvise(vma, &new_flags, behavior); 1090 if (error) 1091 goto out; 1092 break; 1093 case MADV_COLLAPSE: 1094 return madvise_collapse(vma, prev, start, end); 1095 } 1096 1097 anon_name = anon_vma_name(vma); 1098 anon_vma_name_get(anon_name); 1099 error = madvise_update_vma(vma, prev, start, end, new_flags, 1100 anon_name); 1101 anon_vma_name_put(anon_name); 1102 1103 out: 1104 /* 1105 * madvise() returns EAGAIN if kernel resources, such as 1106 * slab, are temporarily unavailable. 1107 */ 1108 if (error == -ENOMEM) 1109 error = -EAGAIN; 1110 return error; 1111 } 1112 1113 #ifdef CONFIG_MEMORY_FAILURE 1114 /* 1115 * Error injection support for memory error handling. 1116 */ 1117 static int madvise_inject_error(int behavior, 1118 unsigned long start, unsigned long end) 1119 { 1120 unsigned long size; 1121 1122 if (!capable(CAP_SYS_ADMIN)) 1123 return -EPERM; 1124 1125 1126 for (; start < end; start += size) { 1127 unsigned long pfn; 1128 struct page *page; 1129 int ret; 1130 1131 ret = get_user_pages_fast(start, 1, 0, &page); 1132 if (ret != 1) 1133 return ret; 1134 pfn = page_to_pfn(page); 1135 1136 /* 1137 * When soft offlining hugepages, after migrating the page 1138 * we dissolve it, therefore in the second loop "page" will 1139 * no longer be a compound page. 
1140 */ 1141 size = page_size(compound_head(page)); 1142 1143 if (behavior == MADV_SOFT_OFFLINE) { 1144 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 1145 pfn, start); 1146 ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 1147 } else { 1148 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 1149 pfn, start); 1150 ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED); 1151 if (ret == -EOPNOTSUPP) 1152 ret = 0; 1153 } 1154 1155 if (ret) 1156 return ret; 1157 } 1158 1159 return 0; 1160 } 1161 #endif 1162 1163 static bool 1164 madvise_behavior_valid(int behavior) 1165 { 1166 switch (behavior) { 1167 case MADV_DOFORK: 1168 case MADV_DONTFORK: 1169 case MADV_NORMAL: 1170 case MADV_SEQUENTIAL: 1171 case MADV_RANDOM: 1172 case MADV_REMOVE: 1173 case MADV_WILLNEED: 1174 case MADV_DONTNEED: 1175 case MADV_DONTNEED_LOCKED: 1176 case MADV_FREE: 1177 case MADV_COLD: 1178 case MADV_PAGEOUT: 1179 case MADV_POPULATE_READ: 1180 case MADV_POPULATE_WRITE: 1181 #ifdef CONFIG_KSM 1182 case MADV_MERGEABLE: 1183 case MADV_UNMERGEABLE: 1184 #endif 1185 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1186 case MADV_HUGEPAGE: 1187 case MADV_NOHUGEPAGE: 1188 case MADV_COLLAPSE: 1189 #endif 1190 case MADV_DONTDUMP: 1191 case MADV_DODUMP: 1192 case MADV_WIPEONFORK: 1193 case MADV_KEEPONFORK: 1194 #ifdef CONFIG_MEMORY_FAILURE 1195 case MADV_SOFT_OFFLINE: 1196 case MADV_HWPOISON: 1197 #endif 1198 return true; 1199 1200 default: 1201 return false; 1202 } 1203 } 1204 1205 static bool process_madvise_behavior_valid(int behavior) 1206 { 1207 switch (behavior) { 1208 case MADV_COLD: 1209 case MADV_PAGEOUT: 1210 case MADV_WILLNEED: 1211 case MADV_COLLAPSE: 1212 return true; 1213 default: 1214 return false; 1215 } 1216 } 1217 1218 /* 1219 * Walk the vmas in range [start,end), and call the visit function on each one. 
1220 * The visit function will get start and end parameters that cover the overlap 1221 * between the current vma and the original range. Any unmapped regions in the 1222 * original range will result in this function returning -ENOMEM while still 1223 * calling the visit function on all of the existing vmas in the range. 1224 * Must be called with the mmap_lock held for reading or writing. 1225 */ 1226 static 1227 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, 1228 unsigned long end, unsigned long arg, 1229 int (*visit)(struct vm_area_struct *vma, 1230 struct vm_area_struct **prev, unsigned long start, 1231 unsigned long end, unsigned long arg)) 1232 { 1233 struct vm_area_struct *vma; 1234 struct vm_area_struct *prev; 1235 unsigned long tmp; 1236 int unmapped_error = 0; 1237 1238 /* 1239 * If the interval [start,end) covers some unmapped address 1240 * ranges, just ignore them, but return -ENOMEM at the end. 1241 * - different from the way of handling in mlock etc. 1242 */ 1243 vma = find_vma_prev(mm, start, &prev); 1244 if (vma && start > vma->vm_start) 1245 prev = vma; 1246 1247 for (;;) { 1248 int error; 1249 1250 /* Still start < end. */ 1251 if (!vma) 1252 return -ENOMEM; 1253 1254 /* Here start < (end|vma->vm_end). */ 1255 if (start < vma->vm_start) { 1256 unmapped_error = -ENOMEM; 1257 start = vma->vm_start; 1258 if (start >= end) 1259 break; 1260 } 1261 1262 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 1263 tmp = vma->vm_end; 1264 if (end < tmp) 1265 tmp = end; 1266 1267 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). 
*/ 1268 error = visit(vma, &prev, start, tmp, arg); 1269 if (error) 1270 return error; 1271 start = tmp; 1272 if (prev && start < prev->vm_end) 1273 start = prev->vm_end; 1274 if (start >= end) 1275 break; 1276 if (prev) 1277 vma = find_vma(mm, prev->vm_end); 1278 else /* madvise_remove dropped mmap_lock */ 1279 vma = find_vma(mm, start); 1280 } 1281 1282 return unmapped_error; 1283 } 1284 1285 #ifdef CONFIG_ANON_VMA_NAME 1286 static int madvise_vma_anon_name(struct vm_area_struct *vma, 1287 struct vm_area_struct **prev, 1288 unsigned long start, unsigned long end, 1289 unsigned long anon_name) 1290 { 1291 int error; 1292 1293 /* Only anonymous mappings can be named */ 1294 if (vma->vm_file && !vma_is_anon_shmem(vma)) 1295 return -EBADF; 1296 1297 error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, 1298 (struct anon_vma_name *)anon_name); 1299 1300 /* 1301 * madvise() returns EAGAIN if kernel resources, such as 1302 * slab, are temporarily unavailable. 1303 */ 1304 if (error == -ENOMEM) 1305 error = -EAGAIN; 1306 return error; 1307 } 1308 1309 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, 1310 unsigned long len_in, struct anon_vma_name *anon_name) 1311 { 1312 unsigned long end; 1313 unsigned long len; 1314 1315 if (start & ~PAGE_MASK) 1316 return -EINVAL; 1317 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 1318 1319 /* Check to see whether len was rounded up from small -ve to zero */ 1320 if (len_in && !len) 1321 return -EINVAL; 1322 1323 end = start + len; 1324 if (end < start) 1325 return -EINVAL; 1326 1327 if (end == start) 1328 return 0; 1329 1330 return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, 1331 madvise_vma_anon_name); 1332 } 1333 #endif /* CONFIG_ANON_VMA_NAME */ 1334 /* 1335 * The madvise(2) system call. 1336 * 1337 * Applications can use madvise() to advise the kernel how it should 1338 * handle paging I/O in this VM area. 
The idea is to help the kernel 1339 * use appropriate read-ahead and caching techniques. The information 1340 * provided is advisory only, and can be safely disregarded by the 1341 * kernel without affecting the correct operation of the application. 1342 * 1343 * behavior values: 1344 * MADV_NORMAL - the default behavior is to read clusters. This 1345 * results in some read-ahead and read-behind. 1346 * MADV_RANDOM - the system should read the minimum amount of data 1347 * on any access, since it is unlikely that the appli- 1348 * cation will need more than what it asks for. 1349 * MADV_SEQUENTIAL - pages in the given range will probably be accessed 1350 * once, so they can be aggressively read ahead, and 1351 * can be freed soon after they are accessed. 1352 * MADV_WILLNEED - the application is notifying the system to read 1353 * some pages ahead. 1354 * MADV_DONTNEED - the application is finished with the given range, 1355 * so the kernel can free resources associated with it. 1356 * MADV_FREE - the application marks pages in the given range as lazy free, 1357 * where actual purges are postponed until memory pressure happens. 1358 * MADV_REMOVE - the application wants to free up the given range of 1359 * pages and associated backing store. 1360 * MADV_DONTFORK - omit this area from child's address space when forking: 1361 * typically, to avoid COWing pages pinned by get_user_pages(). 1362 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 1363 * MADV_WIPEONFORK - present the child process with zero-filled memory in this 1364 * range after a fork. 1365 * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK 1366 * MADV_HWPOISON - trigger memory error handler as if the given memory range 1367 * were corrupted by unrecoverable hardware memory failure. 1368 * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. 
1369 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in 1370 * this area with pages of identical content from other such areas. 1371 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. 1372 * MADV_HUGEPAGE - the application wants to back the given range by transparent 1373 * huge pages in the future. Existing pages might be coalesced and 1374 * new pages might be allocated as THP. 1375 * MADV_NOHUGEPAGE - mark the given range as not worth being backed by 1376 * transparent huge pages so the existing pages will not be 1377 * coalesced into THP and new pages will not be allocated as THP. 1378 * MADV_COLLAPSE - synchronously coalesce pages into new THP. 1379 * MADV_DONTDUMP - the application wants to prevent pages in the given range 1380 * from being included in its core dump. 1381 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 1382 * MADV_COLD - the application is not expected to use this memory soon, 1383 * deactivate pages in this range so that they can be reclaimed 1384 * easily if memory pressure happens. 1385 * MADV_PAGEOUT - the application is not expected to use this memory soon, 1386 * page out the pages in this range immediately. 1387 * MADV_POPULATE_READ - populate (prefault) page tables readable by 1388 * triggering read faults if required 1389 * MADV_POPULATE_WRITE - populate (prefault) page tables writable by 1390 * triggering write faults if required 1391 * 1392 * return values: 1393 * zero - success 1394 * -EINVAL - start + len < 0, start is not page-aligned, 1395 * "behavior" is not a valid value, or application 1396 * is attempting to release locked or shared pages, 1397 * or the specified address range includes file, Huge TLB, 1398 * MAP_SHARED or VMPFNMAP range. 1399 * -ENOMEM - addresses in the specified range are not currently 1400 * mapped, or are outside the AS of the process. 1401 * -EIO - an I/O error occurred while paging in data. 
1402 * -EBADF - map exists, but area maps something that isn't a file. 1403 * -EAGAIN - a kernel resource was temporarily unavailable. 1404 */ 1405 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 1406 { 1407 unsigned long end; 1408 int error; 1409 int write; 1410 size_t len; 1411 struct blk_plug plug; 1412 1413 if (!madvise_behavior_valid(behavior)) 1414 return -EINVAL; 1415 1416 if (!PAGE_ALIGNED(start)) 1417 return -EINVAL; 1418 len = PAGE_ALIGN(len_in); 1419 1420 /* Check to see whether len was rounded up from small -ve to zero */ 1421 if (len_in && !len) 1422 return -EINVAL; 1423 1424 end = start + len; 1425 if (end < start) 1426 return -EINVAL; 1427 1428 if (end == start) 1429 return 0; 1430 1431 #ifdef CONFIG_MEMORY_FAILURE 1432 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 1433 return madvise_inject_error(behavior, start, start + len_in); 1434 #endif 1435 1436 write = madvise_need_mmap_write(behavior); 1437 if (write) { 1438 if (mmap_write_lock_killable(mm)) 1439 return -EINTR; 1440 } else { 1441 mmap_read_lock(mm); 1442 } 1443 1444 start = untagged_addr_remote(mm, start); 1445 end = start + len; 1446 1447 blk_start_plug(&plug); 1448 error = madvise_walk_vmas(mm, start, end, behavior, 1449 madvise_vma_behavior); 1450 blk_finish_plug(&plug); 1451 if (write) 1452 mmap_write_unlock(mm); 1453 else 1454 mmap_read_unlock(mm); 1455 1456 return error; 1457 } 1458 1459 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 1460 { 1461 return do_madvise(current->mm, start, len_in, behavior); 1462 } 1463 1464 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 1465 size_t, vlen, int, behavior, unsigned int, flags) 1466 { 1467 ssize_t ret; 1468 struct iovec iovstack[UIO_FASTIOV]; 1469 struct iovec *iov = iovstack; 1470 struct iov_iter iter; 1471 struct task_struct *task; 1472 struct mm_struct *mm; 1473 size_t total_len; 1474 unsigned int f_flags; 1475 1476 if 
(flags != 0) { 1477 ret = -EINVAL; 1478 goto out; 1479 } 1480 1481 ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 1482 if (ret < 0) 1483 goto out; 1484 1485 task = pidfd_get_task(pidfd, &f_flags); 1486 if (IS_ERR(task)) { 1487 ret = PTR_ERR(task); 1488 goto free_iov; 1489 } 1490 1491 if (!process_madvise_behavior_valid(behavior)) { 1492 ret = -EINVAL; 1493 goto release_task; 1494 } 1495 1496 /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 1497 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 1498 if (IS_ERR_OR_NULL(mm)) { 1499 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; 1500 goto release_task; 1501 } 1502 1503 /* 1504 * Require CAP_SYS_NICE for influencing process performance. Note that 1505 * only non-destructive hints are currently supported. 1506 */ 1507 if (!capable(CAP_SYS_NICE)) { 1508 ret = -EPERM; 1509 goto release_mm; 1510 } 1511 1512 total_len = iov_iter_count(&iter); 1513 1514 while (iov_iter_count(&iter)) { 1515 ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), 1516 iter_iov_len(&iter), behavior); 1517 if (ret < 0) 1518 break; 1519 iov_iter_advance(&iter, iter_iov_len(&iter)); 1520 } 1521 1522 ret = (total_len - iov_iter_count(&iter)) ? : ret; 1523 1524 release_mm: 1525 mmput(mm); 1526 release_task: 1527 put_task_struct(task); 1528 free_iov: 1529 kfree(iov); 1530 out: 1531 return ret; 1532 } 1533