// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
	return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry	= swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	pgoff_t index;
	struct page *page;
	swp_entry_t swap;

	for (; start < end; start += PAGE_SIZE) {
		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

		page = find_get_entry(mapping, index);
		if (!xa_is_value(page)) {
			if (page)
				put_page(page);
			continue;
		}
		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false);
		if (page)
			put_page(page);
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					   file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
	get_file(file);
	up_read(&current->mm->mmap_sem);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	down_read(&current->mm->mmap_sem);
	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has a swp_entry, just clear the page table to
		 * prevent swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we cannot clear
			 * PG_dirty of the page.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g., PPC) don't update the TLB
			 * with set_pte_at and tlb_remove_tlb_entry, so for
			 * portability, remap the pte as old|clean after
			 * clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry	= madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, range.start, range.end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, range.start, range.end);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	*prev = vma;
	if (!can_madv_dontneed_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_sem has been dropped, prev is stale */

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_dontneed_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_sem was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_sem was not released by userfaultfd_remove() */
		up_read(&current->mm->mmap_sem);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	down_read(&current->mm->mmap_sem);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	struct page *page;
	struct zone *zone;
	unsigned int order;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += PAGE_SIZE << order) {
		unsigned long pfn;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page, and order will be 0.
		 */
		order = compound_order(compound_head(page));

		if (PageHWPoison(page)) {
			put_page(page);
			continue;
		}

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
					pfn, start);

			ret = soft_offline_page(page, MF_COUNT_INCREASED);
			if (ret)
				return ret;
			continue;
		}

		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				pfn, start);

		/*
		 * Drop the page reference taken by get_user_pages_fast(). In
		 * the absence of MF_COUNT_INCREASED the memory_failure()
		 * routine is responsible for pinning the page to prevent it
		 * from being released back to the page allocator.
		 */
		put_page(page);
		ret = memory_failure(pfn, 0);
		if (ret)
			return ret;
	}

	/* Ensure that all poisoned pages are removed from per-cpu lists */
	for_each_populated_zone(zone)
		drain_all_pages(zone);

	return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from the child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger the memory error handler as if the given memory
 *		range were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return error;

	if (start & ~PAGE_MASK)
		return error;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (down_write_killable(&current->mm->mmap_sem))
			return -EINTR;
	} else {
		down_read(&current->mm->mmap_sem);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end). */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}
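
/*
 * Illustrative userspace sketch of how this syscall is typically driven,
 * kept under "#if 0" so it has no effect on the kernel build. The buffer
 * size and the choice of MADV_WILLNEED followed by MADV_DONTNEED are
 * arbitrary example values, not anything prescribed by this file.
 */
#if 0	/* userspace usage example (sketch) */
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t len = 16 * 4096;		/* arbitrary, page-aligned length */
	char *buf;

	/* Anonymous private mapping; madvise() operates on whole pages. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	buf[0] = 'x';			/* fault in at least one page */

	/* Hint that the range will be accessed soon (triggers readahead). */
	if (madvise(buf, len, MADV_WILLNEED))
		perror("madvise(MADV_WILLNEED)");

	/*
	 * Done with the contents: allow immediate reclaim. Subsequent reads
	 * of this anonymous range observe zero-filled pages.
	 */
	if (madvise(buf, len, MADV_DONTNEED))
		perror("madvise(MADV_DONTNEED)");

	munmap(buf, len);
	return EXIT_SUCCESS;
}
#endif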