/*
 * linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;
out:
	return error;
}
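
/*
 * For MADV_WILLNEED on anonymous and shmem mappings we cannot rely on the
 * page cache readahead path, so the helpers below walk the range and start
 * asynchronous reads of any swapped-out pages into the swap cache. The
 * extra page reference taken by read_swap_cache_async() is dropped right
 * away; the goal is only to have the pages resident before they are
 * faulted in.
 */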

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static void force_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	struct mm_walk walk = {
		.mm = vma->vm_mm,
		.pmd_entry = swapin_walk_pmd_entry,
		.private = vma,
	};

	walk_page_range(start, end, &walk);

	lru_add_drain();	/* Push any new pages onto the LRU now */
}

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	pgoff_t index;
	struct page *page;
	swp_entry_t swap;

	for (; start < end; start += PAGE_SIZE) {
		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

		page = find_get_entry(mapping, index);
		if (!radix_tree_exceptional_entry(page)) {
			if (page)
				put_page(page);
			continue;
		}
		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations. Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

#ifdef CONFIG_SWAP
	if (!file) {
		*prev = vma;
		force_swapin_readahead(vma, start, end);
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		*prev = vma;
		force_shm_swapin_readahead(vma, start, end,
						file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}
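
/*
 * MADV_FREE pte walker: drop swap entries for ptes that are already
 * swapped out, split THPs that only this process maps, clear PG_dirty on
 * pages mapped only here, and mark the pages lazy-free. Young/dirty ptes
 * are remapped old and clean so reclaim can later discard the pages
 * instead of swapping them out, unless they are dirtied again first.
 */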
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has a swp_entry, just clear the page table
		 * entry to prevent swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = _vm_normal_page(vma, addr, ptent, true);
		if (!page)
			continue;

		/*
		 * If pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we cannot clear
			 * PG_dirty of the page.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so
			 * for portability remap the pte as old and clean
			 * after clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static void madvise_free_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct mm_walk free_walk = {
		.pmd_entry = madvise_free_pte_range,
		.mm = vma->vm_mm,
		.private = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(addr, end, &free_walk);
	tlb_end_vma(tlb, vma);
}

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	unsigned long start, end;
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	start = max(vma->vm_start, start_addr);
	if (start >= vma->vm_end)
		return -EINVAL;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(mm, start, end);
	madvise_free_page_range(&tlb, vma, start, end);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);

	return 0;
}

/*
 * Application no longer needs these pages. If the pages are dirty,
 * it's OK to just throw them away. The app will be more careful about
 * data it wants to keep. Be sure to free swap resources too. The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them. There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
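/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * difference between the two behaviours dispatched in
 * madvise_dontneed_free() below, based on the documented madvise(2)
 * semantics:
 *
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	memset(p, 0xff, len);
 *
 *	madvise(p, len, MADV_DONTNEED);	// data gone immediately; the next
 *					// read faults in zero-filled pages
 *
 *	madvise(p, len, MADV_FREE);	// pages only become reclaimable;
 *					// writing to them again before
 *					// reclaim runs cancels the advice
 */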
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	*prev = vma;
	if (!can_madv_dontneed_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_sem has been dropped, prev is stale */

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_dontneed_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_sem was
			 * released, the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_sem was not released by userfaultfd_remove() */
		up_read(&current->mm->mmap_sem);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	down_read(&current->mm->mmap_sem);
	return error;
}
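
/*
 * Note: as the vfs_fallocate() call above shows, MADV_REMOVE on a shared,
 * writable file mapping is implemented as a hole punch on the backing
 * file, so for the mapped range it is roughly equivalent to the userspace
 * call (sketch, not part of the kernel build):
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  offset, length);
 */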

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	struct page *page;
	struct zone *zone;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += PAGE_SIZE <<
				compound_order(compound_head(page))) {
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;

		if (PageHWPoison(page)) {
			put_page(page);
			continue;
		}

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
						page_to_pfn(page), start);

			ret = soft_offline_page(page, MF_COUNT_INCREASED);
			if (ret)
				return ret;
			continue;
		}
		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
					page_to_pfn(page), start);

		ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
		if (ret)
			return ret;
	}

	/* Ensure that all poisoned pages are removed from per-cpu lists */
	for_each_populated_zone(zone)
		drain_all_pages(zone);

	return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area. The idea is to help the kernel
 * use appropriate read-ahead and caching techniques. The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters. This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
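/*
 * Minimal userspace usage sketch (illustrative only, not part of the
 * kernel build; file name and length are made up, error handling omitted):
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *
 *	int fd = open("data.bin", O_RDONLY);
 *	size_t len = 64 * 1024 * 1024;
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *
 *	madvise(p, len, MADV_SEQUENTIAL);	// the range will be scanned linearly
 *	madvise(p, len, MADV_WILLNEED);		// start readahead for it now
 *
 * Whatever the advice, the mapping keeps working even if the kernel
 * chooses to ignore it.
 */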
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return error;

	if (start & ~PAGE_MASK)
		return error;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (down_write_killable(&current->mm->mmap_sem))
			return -EINTR;
	} else {
		down_read(&current->mm->mmap_sem);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}