1 /* 2 * linux/mm/madvise.c 3 * 4 * Copyright (C) 1999 Linus Torvalds 5 * Copyright (C) 2002 Christoph Hellwig 6 */ 7 8 #include <linux/mman.h> 9 #include <linux/pagemap.h> 10 #include <linux/syscalls.h> 11 #include <linux/mempolicy.h> 12 #include <linux/hugetlb.h> 13 #include <linux/sched.h> 14 #include <linux/ksm.h> 15 16 /* 17 * Any behaviour which results in changes to the vma->vm_flags needs to 18 * take mmap_sem for writing. Others, which simply traverse vmas, need 19 * to only take it for reading. 20 */ 21 static int madvise_need_mmap_write(int behavior) 22 { 23 switch (behavior) { 24 case MADV_REMOVE: 25 case MADV_WILLNEED: 26 case MADV_DONTNEED: 27 return 0; 28 default: 29 /* be safe, default to 1. list exceptions explicitly */ 30 return 1; 31 } 32 } 33 34 /* 35 * We can potentially split a vm area into separate 36 * areas, each area with its own behavior. 37 */ 38 static long madvise_behavior(struct vm_area_struct * vma, 39 struct vm_area_struct **prev, 40 unsigned long start, unsigned long end, int behavior) 41 { 42 struct mm_struct * mm = vma->vm_mm; 43 int error = 0; 44 pgoff_t pgoff; 45 unsigned long new_flags = vma->vm_flags; 46 47 switch (behavior) { 48 case MADV_NORMAL: 49 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 50 break; 51 case MADV_SEQUENTIAL: 52 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 53 break; 54 case MADV_RANDOM: 55 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 56 break; 57 case MADV_DONTFORK: 58 new_flags |= VM_DONTCOPY; 59 break; 60 case MADV_DOFORK: 61 if (vma->vm_flags & VM_IO) { 62 error = -EINVAL; 63 goto out; 64 } 65 new_flags &= ~VM_DONTCOPY; 66 break; 67 case MADV_MERGEABLE: 68 case MADV_UNMERGEABLE: 69 error = ksm_madvise(vma, start, end, behavior, &new_flags); 70 if (error) 71 goto out; 72 break; 73 } 74 75 if (new_flags == vma->vm_flags) { 76 *prev = vma; 77 goto out; 78 } 79 80 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 81 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, 82 vma->vm_file, pgoff, vma_policy(vma)); 83 if (*prev) { 84 vma = *prev; 85 goto success; 86 } 87 88 *prev = vma; 89 90 if (start != vma->vm_start) { 91 error = split_vma(mm, vma, start, 1); 92 if (error) 93 goto out; 94 } 95 96 if (end != vma->vm_end) { 97 error = split_vma(mm, vma, end, 0); 98 if (error) 99 goto out; 100 } 101 102 success: 103 /* 104 * vm_flags is protected by the mmap_sem held in write mode. 105 */ 106 vma->vm_flags = new_flags; 107 108 out: 109 if (error == -ENOMEM) 110 error = -EAGAIN; 111 return error; 112 } 113 114 /* 115 * Schedule all required I/O operations. Do not wait for completion. 116 */ 117 static long madvise_willneed(struct vm_area_struct * vma, 118 struct vm_area_struct ** prev, 119 unsigned long start, unsigned long end) 120 { 121 struct file *file = vma->vm_file; 122 123 if (!file) 124 return -EBADF; 125 126 if (file->f_mapping->a_ops->get_xip_mem) { 127 /* no bad return value, but ignore advice */ 128 return 0; 129 } 130 131 *prev = vma; 132 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 133 if (end > vma->vm_end) 134 end = vma->vm_end; 135 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 136 137 force_page_cache_readahead(file->f_mapping, file, start, end - start); 138 return 0; 139 } 140 141 /* 142 * Application no longer needs these pages. If the pages are dirty, 143 * it's OK to just throw them away. The app will be more careful about 144 * data it wants to keep. Be sure to free swap resources too. The 145 * zap_page_range call sets things up for shrink_active_list to actually free 146 * these pages later if no one else has touched them in the meantime, 147 * although we could add these pages to a global reuse list for 148 * shrink_active_list to pick up before reclaiming other pages. 149 * 150 * NB: This interface discards data rather than pushes it out to swap, 151 * as some implementations do. This has performance implications for 152 * applications like large transactional databases which want to discard 153 * pages in anonymous maps after committing to backing store the data 154 * that was kept in them. There is no reason to write this data out to 155 * the swap area if the application is discarding it. 156 * 157 * An interface that causes the system to free clean pages and flush 158 * dirty pages is already available as msync(MS_INVALIDATE). 159 */ 160 static long madvise_dontneed(struct vm_area_struct * vma, 161 struct vm_area_struct ** prev, 162 unsigned long start, unsigned long end) 163 { 164 *prev = vma; 165 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) 166 return -EINVAL; 167 168 if (unlikely(vma->vm_flags & VM_NONLINEAR)) { 169 struct zap_details details = { 170 .nonlinear_vma = vma, 171 .last_index = ULONG_MAX, 172 }; 173 zap_page_range(vma, start, end - start, &details); 174 } else 175 zap_page_range(vma, start, end - start, NULL); 176 return 0; 177 } 178 179 /* 180 * Application wants to free up the pages and associated backing store. 181 * This is effectively punching a hole into the middle of a file. 182 * 183 * NOTE: Currently, only shmfs/tmpfs is supported for this operation. 184 * Other filesystems return -ENOSYS. 185 */ 186 static long madvise_remove(struct vm_area_struct *vma, 187 struct vm_area_struct **prev, 188 unsigned long start, unsigned long end) 189 { 190 struct address_space *mapping; 191 loff_t offset, endoff; 192 int error; 193 194 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 195 196 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 197 return -EINVAL; 198 199 if (!vma->vm_file || !vma->vm_file->f_mapping 200 || !vma->vm_file->f_mapping->host) { 201 return -EINVAL; 202 } 203 204 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 205 return -EACCES; 206 207 mapping = vma->vm_file->f_mapping; 208 209 offset = (loff_t)(start - vma->vm_start) 210 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 211 endoff = (loff_t)(end - vma->vm_start - 1) 212 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 213 214 /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 215 up_read(¤t->mm->mmap_sem); 216 error = vmtruncate_range(mapping->host, offset, endoff); 217 down_read(¤t->mm->mmap_sem); 218 return error; 219 } 220 221 #ifdef CONFIG_MEMORY_FAILURE 222 /* 223 * Error injection support for memory error handling. 224 */ 225 static int madvise_hwpoison(unsigned long start, unsigned long end) 226 { 227 int ret = 0; 228 229 if (!capable(CAP_SYS_ADMIN)) 230 return -EPERM; 231 for (; start < end; start += PAGE_SIZE) { 232 struct page *p; 233 int ret = get_user_pages(current, current->mm, start, 1, 234 0, 0, &p, NULL); 235 if (ret != 1) 236 return ret; 237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 238 page_to_pfn(p), start); 239 /* Ignore return value for now */ 240 __memory_failure(page_to_pfn(p), 0, 1); 241 put_page(p); 242 } 243 return ret; 244 } 245 #endif 246 247 static long 248 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 249 unsigned long start, unsigned long end, int behavior) 250 { 251 switch (behavior) { 252 case MADV_REMOVE: 253 return madvise_remove(vma, prev, start, end); 254 case MADV_WILLNEED: 255 return madvise_willneed(vma, prev, start, end); 256 case MADV_DONTNEED: 257 return madvise_dontneed(vma, prev, start, end); 258 default: 259 return madvise_behavior(vma, prev, start, end, behavior); 260 } 261 } 262 263 static int 264 madvise_behavior_valid(int behavior) 265 { 266 switch (behavior) { 267 case MADV_DOFORK: 268 case MADV_DONTFORK: 269 case MADV_NORMAL: 270 case MADV_SEQUENTIAL: 271 case MADV_RANDOM: 272 case MADV_REMOVE: 273 case MADV_WILLNEED: 274 case MADV_DONTNEED: 275 #ifdef CONFIG_KSM 276 case MADV_MERGEABLE: 277 case MADV_UNMERGEABLE: 278 #endif 279 return 1; 280 281 default: 282 return 0; 283 } 284 } 285 286 /* 287 * The madvise(2) system call. 288 * 289 * Applications can use madvise() to advise the kernel how it should 290 * handle paging I/O in this VM area. The idea is to help the kernel 291 * use appropriate read-ahead and caching techniques. The information 292 * provided is advisory only, and can be safely disregarded by the 293 * kernel without affecting the correct operation of the application. 294 * 295 * behavior values: 296 * MADV_NORMAL - the default behavior is to read clusters. This 297 * results in some read-ahead and read-behind. 298 * MADV_RANDOM - the system should read the minimum amount of data 299 * on any access, since it is unlikely that the appli- 300 * cation will need more than what it asks for. 301 * MADV_SEQUENTIAL - pages in the given range will probably be accessed 302 * once, so they can be aggressively read ahead, and 303 * can be freed soon after they are accessed. 304 * MADV_WILLNEED - the application is notifying the system to read 305 * some pages ahead. 306 * MADV_DONTNEED - the application is finished with the given range, 307 * so the kernel can free resources associated with it. 308 * MADV_REMOVE - the application wants to free up the given range of 309 * pages and associated backing store. 310 * MADV_DONTFORK - omit this area from child's address space when forking: 311 * typically, to avoid COWing pages pinned by get_user_pages(). 312 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 313 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in 314 * this area with pages of identical content from other such areas. 315 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. 316 * 317 * return values: 318 * zero - success 319 * -EINVAL - start + len < 0, start is not page-aligned, 320 * "behavior" is not a valid value, or application 321 * is attempting to release locked or shared pages. 322 * -ENOMEM - addresses in the specified range are not currently 323 * mapped, or are outside the AS of the process. 324 * -EIO - an I/O error occurred while paging in data. 325 * -EBADF - map exists, but area maps something that isn't a file. 326 * -EAGAIN - a kernel resource was temporarily unavailable. 327 */ 328 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 329 { 330 unsigned long end, tmp; 331 struct vm_area_struct * vma, *prev; 332 int unmapped_error = 0; 333 int error = -EINVAL; 334 int write; 335 size_t len; 336 337 #ifdef CONFIG_MEMORY_FAILURE 338 if (behavior == MADV_HWPOISON) 339 return madvise_hwpoison(start, start+len_in); 340 #endif 341 if (!madvise_behavior_valid(behavior)) 342 return error; 343 344 write = madvise_need_mmap_write(behavior); 345 if (write) 346 down_write(¤t->mm->mmap_sem); 347 else 348 down_read(¤t->mm->mmap_sem); 349 350 if (start & ~PAGE_MASK) 351 goto out; 352 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 353 354 /* Check to see whether len was rounded up from small -ve to zero */ 355 if (len_in && !len) 356 goto out; 357 358 end = start + len; 359 if (end < start) 360 goto out; 361 362 error = 0; 363 if (end == start) 364 goto out; 365 366 /* 367 * If the interval [start,end) covers some unmapped address 368 * ranges, just ignore them, but return -ENOMEM at the end. 369 * - different from the way of handling in mlock etc. 370 */ 371 vma = find_vma_prev(current->mm, start, &prev); 372 if (vma && start > vma->vm_start) 373 prev = vma; 374 375 for (;;) { 376 /* Still start < end. */ 377 error = -ENOMEM; 378 if (!vma) 379 goto out; 380 381 /* Here start < (end|vma->vm_end). */ 382 if (start < vma->vm_start) { 383 unmapped_error = -ENOMEM; 384 start = vma->vm_start; 385 if (start >= end) 386 goto out; 387 } 388 389 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 390 tmp = vma->vm_end; 391 if (end < tmp) 392 tmp = end; 393 394 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 395 error = madvise_vma(vma, &prev, start, tmp, behavior); 396 if (error) 397 goto out; 398 start = tmp; 399 if (prev && start < prev->vm_end) 400 start = prev->vm_end; 401 error = unmapped_error; 402 if (start >= end) 403 goto out; 404 if (prev) 405 vma = prev->vm_next; 406 else /* madvise_remove dropped mmap_sem */ 407 vma = find_vma(current->mm, start); 408 } 409 out: 410 if (write) 411 up_write(¤t->mm->mmap_sem); 412 else 413 up_read(¤t->mm->mmap_sem); 414 415 return error; 416 } 417