/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto out;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out:
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}
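
/*
 * Illustrative userspace sketch (not part of this file; mapping size and
 * flags are arbitrary example choices): advising only the middle page of a
 * three-page anonymous mapping takes the split_vma() path above, leaving
 * three VMAs of which only the middle one carries VM_DONTCOPY and is
 * therefore omitted from a child's address space on fork.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	long pg = sysconf(_SC_PAGESIZE);
 *	char *buf = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (buf != MAP_FAILED)
 *		madvise(buf + pg, pg, MADV_DONTFORK);
 */
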
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							     vma, index);
		if (page)
			page_cache_release(page);
	}

	return 0;
}

static void force_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	struct mm_walk walk = {
		.mm = vma->vm_mm,
		.pmd_entry = swapin_walk_pmd_entry,
		.private = vma,
	};

	walk_page_range(start, end, &walk);

	lru_add_drain();	/* Push any new pages onto the LRU now */
}

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	pgoff_t index;
	struct page *page;
	swp_entry_t swap;

	for (; start < end; start += PAGE_SIZE) {
		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

		page = find_get_entry(mapping, index);
		if (!radix_tree_exceptional_entry(page)) {
			if (page)
				page_cache_release(page);
			continue;
		}
		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							     NULL, 0);
		if (page)
			page_cache_release(page);
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

#ifdef CONFIG_SWAP
	if (!file) {
		*prev = vma;
		force_swapin_readahead(vma, start, end);
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		*prev = vma;
		force_shm_swapin_readahead(vma, start, end,
						file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}
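
/*
 * Illustrative userspace sketch (not part of this file; the file name and
 * length are assumptions): on a regular file-backed mapping, MADV_WILLNEED
 * only schedules readahead through force_page_cache_readahead() above and
 * returns without waiting for the I/O, so later faults on the range are
 * more likely to hit the page cache.
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	int fd = open("data.bin", O_RDONLY);
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	if (fd >= 0 && p != MAP_FAILED)
 *		madvise(p, len, MADV_WILLNEED);
 */
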
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	*prev = vma;
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

	zap_page_range(vma, start, end - start, NULL);
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	get_file(f);
	up_read(&current->mm->mmap_sem);
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	down_read(&current->mm->mmap_sem);
	return error;
}
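
/*
 * Illustrative userspace sketch (not part of this file; release_block() is a
 * hypothetical allocator helper): a custom allocator that keeps a freed block
 * mapped can tell the kernel to drop the backing pages with MADV_DONTNEED;
 * the range stays mapped and private anonymous pages read back as zeroes on
 * the next touch.  For a shared, writable file mapping, MADV_REMOVE instead
 * punches a hole through vfs_fallocate() as above, and fails with -EINVAL on
 * locked or hugetlb mappings.
 *
 *	#include <sys/mman.h>
 *
 *	static void release_block(void *block, size_t size)
 *	{
 *		madvise(block, size, MADV_DONTNEED);
 *	}
 */
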
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
	struct page *p;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	for (; start < end; start += PAGE_SIZE <<
				compound_order(compound_head(p))) {
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &p);
		if (ret != 1)
			return ret;

		if (PageHWPoison(p)) {
			put_page(p);
			continue;
		}
		if (bhv == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining page %#lx at %#lx\n",
				page_to_pfn(p), start);
			ret = soft_offline_page(p, MF_COUNT_INCREASED);
			if (ret)
				return ret;
			continue;
		}
		pr_info("Injecting memory failure for page %#lx at %#lx\n",
			page_to_pfn(p), start);
		/* Ignore return value for now */
		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
	}
	return 0;
}
#endif
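
/*
 * Illustrative userspace sketch (not part of this file): with
 * CONFIG_MEMORY_FAILURE and CAP_SYS_ADMIN, a test program can inject an
 * error into one of its own pages; the page is written first so a real page
 * (not the shared zero page) backs it.  Note that MADV_HWPOISON and
 * MADV_SOFT_OFFLINE are intercepted before madvise_behavior_valid() in the
 * syscall below, so they never reach madvise_vma().
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	long pg = sysconf(_SC_PAGESIZE);
 *	char *p = mmap(NULL, pg, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (p != MAP_FAILED) {
 *		p[0] = 1;
 *		madvise(p, pg, MADV_HWPOISON);
 *	}
 */
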
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_DONTNEED:
		return madvise_dontneed(vma, prev, start, end);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static int
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
		return 1;

	default:
		return 0;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_hwpoison(behavior, start, start+len_in);
#endif
	if (!madvise_behavior_valid(behavior))
		return error;

	if (start & ~PAGE_MASK)
		return error;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

	write = madvise_need_mmap_write(behavior);
	if (write)
		down_write(&current->mm->mmap_sem);
	else
		down_read(&current->mm->mmap_sem);

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}
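
/*
 * Illustrative userspace sketch (not part of this file; addr and length
 * stand for an existing mapping) of the argument checking above: start must
 * be page-aligned, len_in is rounded up to a multiple of the page size, a
 * zero length returns success without doing anything, and if the range
 * contains unmapped holes the advice is still applied to the mapped parts
 * but -ENOMEM is returned at the end.
 *
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *
 *	if (madvise(addr, length, MADV_SEQUENTIAL) != 0)
 *		perror("madvise");
 */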