// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	}
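
	/*
	 * If the advice did not change vm_flags there is nothing to do.
	 * Otherwise, try to merge the affected range with an adjacent VMA
	 * that already carries the new flags; if that fails, split off
	 * [start, end) so that only the advised range is updated.
	 */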
	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;
out:
	return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	pgoff_t index;
	struct page *page;
	swp_entry_t swap;

	for (; start < end; start += PAGE_SIZE) {
		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

		page = find_get_entry(mapping, index);
		if (!xa_is_value(page)) {
			if (page)
				put_page(page);
			continue;
		}
		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false);
		if (page)
			put_page(page);
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */
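
/*
 * MADV_WILLNEED has three cases below: anonymous memory is prefetched by
 * walking its page tables and starting asynchronous swap-in, shmem-backed
 * memory is prefetched through its mapping's swap entries, and regular
 * files are handed to the filesystem via vfs_fadvise(POSIX_FADV_WILLNEED).
 */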

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					   file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
	get_file(file);
	up_read(&current->mm->mmap_sem);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	down_read(&current->mm->mmap_sem);
	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent a swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is a THP owned
		 * only by this process, split it and deactivate all of its
		 * pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we can't clear
			 * its PG_dirty bit.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so
			 * for portability clear the pte and then re-map it
			 * as old and clean.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}
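
/*
 * Page-table walk callbacks for MADV_FREE; the mmu_gather used for TLB
 * batching is passed to madvise_free_pte_range() as walk->private.
 */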
static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works only for anonymous VMAs at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, range.start, range.end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, range.start, range.end);

	return 0;
}
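
/*
 * Note that MADV_FREE only marks pages as lazily freeable (see
 * mark_page_lazyfree() above): clean anonymous pages in the range may be
 * discarded outright under memory pressure instead of being swapped out,
 * but writing to a page before then cancels the effect for that page.
 */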

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	*prev = vma;
	if (!can_madv_dontneed_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_sem has been dropped, prev is stale */

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end.  If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_dontneed_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end.  If the old
			 * vma was split while mmap_sem was released,
			 * the concurrent operation does not leave
			 * madvise() with an undefined result.  There
			 * may be an adjacent next vma that we'll walk
			 * next.  userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_sem was not released by userfaultfd_remove() */
		up_read(&current->mm->mmap_sem);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	down_read(&current->mm->mmap_sem);
	return error;
}
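
/*
 * madvise_willneed(), madvise_remove() and (via userfaultfd_remove())
 * madvise_dontneed_free() may drop mmap_sem; they signal this by setting
 * *prev to NULL so that sys_madvise() revalidates the vma with find_vma()
 * before continuing.
 */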

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	struct page *page;
	struct zone *zone;
	unsigned int order;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += PAGE_SIZE << order) {
		unsigned long pfn;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page, and order will be 0.
		 */
		order = compound_order(compound_head(page));

		if (PageHWPoison(page)) {
			put_page(page);
			continue;
		}

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
					pfn, start);

			ret = soft_offline_page(page, MF_COUNT_INCREASED);
			if (ret)
				return ret;
			continue;
		}

		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				pfn, start);

		/*
		 * Drop the page reference taken by get_user_pages_fast().  In
		 * the absence of MF_COUNT_INCREASED the memory_failure()
		 * routine is responsible for pinning the page to prevent it
		 * from being released back to the page allocator.
		 */
		put_page(page);
		ret = memory_failure(pfn, 0);
		if (ret)
			return ret;
	}

	/* Ensure that all poisoned pages are removed from per-cpu lists */
	for_each_populated_zone(zone)
		drain_all_pages(zone);

	return 0;
}
#endif	/* CONFIG_MEMORY_FAILURE */
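
/*
 * Apply one behavior to a single VMA.  Behaviors with a dedicated
 * implementation are dispatched here; everything else only updates
 * vm_flags and is handled by madvise_behavior().
 */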
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future.  Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VMPFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return error;

	if (start & ~PAGE_MASK)
		return error;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (down_write_killable(&current->mm->mmap_sem))
			return -EINTR;
	} else {
		down_read(&current->mm->mmap_sem);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end). */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}
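
/*
 * Illustrative userspace sketch (not part of this file's build): how an
 * application typically invokes the syscall implemented above.  Assumes
 * <sys/mman.h> and a private anonymous mapping; error handling is minimal.
 *
 *	size_t len = 16 * 4096;
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	memset(buf, 0xaa, len);
 *	if (madvise(buf, len, MADV_DONTNEED))	// pages dropped; reads return zeroes
 *		perror("madvise");
 *
 *	madvise(buf, len, MADV_FREE);		// or: free lazily under memory pressure
 */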