/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct * mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_NODUMP;
		break;
	case MADV_DODUMP:
		new_flags &= ~VM_NODUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
				vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto out;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out:
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}
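
/*
 * Illustrative sketch (userspace, not part of this file): calling madvise()
 * on a sub-range of a mapping with a flag-changing behavior exercises the
 * vma_merge()/split_vma() paths above.  Advising only the middle page of a
 * three-page anonymous mapping leaves three separate VMAs behind.  The
 * "page_size" variable is assumed to come from sysconf(_SC_PAGESIZE);
 * error handling is omitted.
 *
 *	char *p = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(p + page_size, page_size, MADV_RANDOM);
 */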

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
			     struct vm_area_struct ** prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

	if (!file)
		return -EBADF;

	if (file->f_mapping->a_ops->get_xip_mem) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
			     struct vm_area_struct ** prev,
			     unsigned long start, unsigned long end)
{
	*prev = vma;
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
		struct zap_details details = {
			.nonlinear_vma = vma,
			.last_index = ULONG_MAX,
		};
		zap_page_range(vma, start, end - start, &details);
	} else
		zap_page_range(vma, start, end - start, NULL);
	return 0;
}
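
/*
 * Illustrative sketch (userspace, not part of this file): because the pages
 * are discarded rather than written to swap, MADV_DONTNEED is what a heap
 * allocator would typically use to hand freed memory back while keeping the
 * mapping itself in place; the next touch faults in fresh zero-filled pages.
 * "chunk" and "chunk_size" are assumed to describe a page-aligned free run;
 * error handling is omitted.
 *
 *	madvise(chunk, chunk_size, MADV_DONTNEED);
 */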

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	struct address_space *mapping;
	loff_t offset, endoff;
	int error;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
		return -EINVAL;

	if (!vma->vm_file || !vma->vm_file->f_mapping
		|| !vma->vm_file->f_mapping->host) {
			return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	mapping = vma->vm_file->f_mapping;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	endoff = (loff_t)(end - vma->vm_start - 1)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/* vmtruncate_range needs to take i_mutex */
	up_read(&current->mm->mmap_sem);
	error = vmtruncate_range(mapping->host, offset, endoff);
	down_read(&current->mm->mmap_sem);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
	int ret = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	for (; start < end; start += PAGE_SIZE) {
		struct page *p;
		int ret = get_user_pages_fast(start, 1, 0, &p);
		if (ret != 1)
			return ret;
		if (bhv == MADV_SOFT_OFFLINE) {
			printk(KERN_INFO "Soft offlining page %lx at %lx\n",
				page_to_pfn(p), start);
			ret = soft_offline_page(p, MF_COUNT_INCREASED);
			if (ret)
				break;
			continue;
		}
		printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
			page_to_pfn(p), start);
		/* Ignore return value for now */
		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
	}
	return ret;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_DONTNEED:
		return madvise_dontneed(vma, prev, start, end);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static int
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
		return 1;

	default:
		return 0;
	}
}
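
/*
 * Illustrative sketch (userspace, not part of this file): MADV_REMOVE, as
 * handled by madvise_remove() above, punches a hole in a writable shared
 * mapping of a tmpfs file, releasing both the pages and their backing
 * store.  "fd" is assumed to be an open tmpfs file of at least "len"
 * bytes; error handling is omitted.
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	madvise(p, len, MADV_REMOVE);
 */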

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
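/*
 * Illustrative sketch (userspace, not part of this file): a typical call
 * sequence for a file that is scanned once from start to finish.
 * MADV_SEQUENTIAL asks for aggressive readahead, MADV_WILLNEED starts I/O
 * for the range, and MADV_DONTNEED drops the pages once the scan is done.
 * "fd" and "len" are assumed to describe an already-open file; error
 * handling is omitted.
 *
 *	void *buf = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(buf, len, MADV_SEQUENTIAL);
 *	madvise(buf, len, MADV_WILLNEED);
 *	... read through buf ...
 *	madvise(buf, len, MADV_DONTNEED);
 *	munmap(buf, len);
 */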
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct * vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_hwpoison(behavior, start, start+len_in);
#endif
	if (!madvise_behavior_valid(behavior))
		return error;

	write = madvise_need_mmap_write(behavior);
	if (write)
		down_write(&current->mm->mmap_sem);
	else
		down_read(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
		goto out;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		goto out;

	end = start + len;
	if (end < start)
		goto out;

	error = 0;
	if (end == start)
		goto out;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}