/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct * mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
				vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto out;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out:
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

/*
 * Schedule all required I/O operations. Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
			     struct vm_area_struct ** prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

	if (!file)
		return -EBADF;

	if (file->f_mapping->a_ops->get_xip_mem) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
			     struct vm_area_struct ** prev,
			     unsigned long start, unsigned long end)
{
	*prev = vma;
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
		struct zap_details details = {
			.nonlinear_vma = vma,
			.last_index = ULONG_MAX,
		};
		zap_page_range(vma, start, end - start, &details);
	} else
		zap_page_range(vma, start, end - start, NULL);
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	struct address_space *mapping;
	loff_t offset, endoff;
	int error;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
		return -EINVAL;

	if (!vma->vm_file || !vma->vm_file->f_mapping
		|| !vma->vm_file->f_mapping->host) {
			return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	mapping = vma->vm_file->f_mapping;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	endoff = (loff_t)(end - vma->vm_start - 1)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/* vmtruncate_range needs to take i_mutex */
	up_read(&current->mm->mmap_sem);
	error = vmtruncate_range(mapping->host, offset, endoff);
	down_read(&current->mm->mmap_sem);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
	int ret = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	for (; start < end; start += PAGE_SIZE) {
		struct page *p;
		int ret = get_user_pages_fast(start, 1, 0, &p);
		if (ret != 1)
			return ret;
		if (bhv == MADV_SOFT_OFFLINE) {
			printk(KERN_INFO "Soft offlining page %lx at %lx\n",
				page_to_pfn(p), start);
			ret = soft_offline_page(p, MF_COUNT_INCREASED);
			if (ret)
				break;
			continue;
		}
		printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
			page_to_pfn(p), start);
		/* Ignore return value for now */
		__memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
	}
	return ret;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_DONTNEED:
		return madvise_dontneed(vma, prev, start, end);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static int
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
		return 1;

	default:
		return 0;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct * vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_hwpoison(behavior, start, start+len_in);
#endif
	if (!madvise_behavior_valid(behavior))
		return error;

	write = madvise_need_mmap_write(behavior);
	if (write)
		down_write(&current->mm->mmap_sem);
	else
		down_read(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
		goto out;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		goto out;

	end = start + len;
	if (end < start)
		goto out;

	error = 0;
	if (end == start)
		goto out;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}
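
/*
 * Illustrative userspace sketch (not part of this file): one way an
 * application might use the madvise(2) interface documented above.
 * The input file name and the single-pass access pattern below are
 * hypothetical; this only demonstrates the behavior values, it is not
 * a recommended pattern for every workload.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct stat st;
 *		int fd = open("data.bin", O_RDONLY);	// hypothetical input
 *
 *		if (fd < 0 || fstat(fd, &st) != 0)
 *			return 1;
 *
 *		void *buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
 *		if (buf == MAP_FAILED)
 *			return 1;
 *
 *		// One sequential pass: allow aggressive read-ahead and early
 *		// reclaim of pages once they have been read.
 *		if (madvise(buf, st.st_size, MADV_SEQUENTIAL) != 0)
 *			perror("madvise");
 *
 *		// ... stream through buf once ...
 *
 *		// The range is no longer needed; let the kernel free it.
 *		madvise(buf, st.st_size, MADV_DONTNEED);
 *
 *		munmap(buf, st.st_size);
 *		close(fd);
 *		return 0;
 *	}
 */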