xref: /openbmc/linux/mm/madvise.c (revision 63dc02bd)
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_NODUMP;
		break;
	case MADV_DODUMP:
		new_flags &= ~VM_NODUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
				vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto out;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out:
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}
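
/*
 * Worked example (illustrative): if a vma spans [0x10000, 0x50000) and a
 * flag-changing behavior is applied to [0x20000, 0x30000), and vma_merge()
 * cannot absorb the range into a neighbour, the vma is split twice: first
 * at 0x20000, then at 0x30000.  Only the middle vma [0x20000, 0x30000)
 * receives new_flags; its neighbours keep the old flags.
 */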

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

	if (!file)
		return -EBADF;

	if (file->f_mapping->a_ops->get_xip_mem) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}
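
/*
 * Illustrative userspace sketch (assumes a file-backed mapping of an open
 * descriptor fd that is about to be scanned):
 *
 *	#include <sys/mman.h>
 *
 *	char *buf = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	if (buf != MAP_FAILED)
 *		madvise(buf, len, MADV_WILLNEED);
 *
 * The call above schedules readahead for the whole mapping and returns
 * without waiting for the I/O to complete.
 */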

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	*prev = vma;
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
		struct zap_details details = {
			.nonlinear_vma = vma,
			.last_index = ULONG_MAX,
		};
		zap_page_range(vma, start, end - start, &details);
	} else
		zap_page_range(vma, start, end - start, NULL);
	return 0;
}
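
/*
 * Illustrative userspace sketch (assumes an anonymous arena that a custom
 * allocator wants to hand back without unmapping it):
 *
 *	#include <sys/mman.h>
 *
 *	madvise(arena, arena_size, MADV_DONTNEED);
 *
 * The pages are discarded immediately; the next touch of the range faults
 * in fresh zero-filled pages (or re-reads the backing file for a private
 * file-backed mapping), so the virtual range itself stays usable.
 */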

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	struct address_space *mapping;
	loff_t offset, endoff;
	int error;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
		return -EINVAL;

	if (!vma->vm_file || !vma->vm_file->f_mapping ||
	    !vma->vm_file->f_mapping->host)
		return -EINVAL;

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	mapping = vma->vm_file->f_mapping;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	endoff = (loff_t)(end - vma->vm_start - 1)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/* vmtruncate_range needs to take i_mutex */
	up_read(&current->mm->mmap_sem);
	error = vmtruncate_range(mapping->host, offset, endoff);
	down_read(&current->mm->mmap_sem);
	return error;
}
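
/*
 * Illustrative userspace sketch (assumes a writable shared mapping of a
 * tmpfs file, since only shmfs/tmpfs supports this operation as noted
 * above):
 *
 *	#include <sys/mman.h>
 *
 *	madvise(map + hole_off, hole_len, MADV_REMOVE);
 *
 * On success both the pages and the backing store for the given range are
 * released, as if a hole had been punched into the file.
 */
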
233 
234 #ifdef CONFIG_MEMORY_FAILURE
235 /*
236  * Error injection support for memory error handling.
237  */
238 static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
239 {
240 	int ret = 0;
241 
242 	if (!capable(CAP_SYS_ADMIN))
243 		return -EPERM;
244 	for (; start < end; start += PAGE_SIZE) {
245 		struct page *p;
246 		int ret = get_user_pages_fast(start, 1, 0, &p);
247 		if (ret != 1)
248 			return ret;
249 		if (bhv == MADV_SOFT_OFFLINE) {
250 			printk(KERN_INFO "Soft offlining page %lx at %lx\n",
251 				page_to_pfn(p), start);
252 			ret = soft_offline_page(p, MF_COUNT_INCREASED);
253 			if (ret)
254 				break;
255 			continue;
256 		}
257 		printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
258 		       page_to_pfn(p), start);
259 		/* Ignore return value for now */
260 		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
261 	}
262 	return ret;
263 }
264 #endif
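
/*
 * Illustrative test-harness sketch (MADV_HWPOISON exists for error
 * injection testing and requires CAP_SYS_ADMIN):
 *
 *	#include <sys/mman.h>
 *
 *	char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 1;
 *	madvise(p, pagesize, MADV_HWPOISON);
 *
 * Touching the page first makes it resident; after the call, a subsequent
 * access to the page is expected to raise SIGBUS, exercising the
 * memory-failure handling paths.
 */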

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_DONTNEED:
		return madvise_dontneed(vma, prev, start, end);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static int
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
		return 1;

	default:
		return 0;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the
 *		application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range with
 *		transparent huge pages where possible.
 *  MADV_NOHUGEPAGE - cancel MADV_HUGEPAGE: do not back the given range
 *		with transparent huge pages.
 *  MADV_DONTDUMP - exclude the given range from a core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: include the given range in a core
 *		dump again.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
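
/*
 * Illustrative userspace sketch: a process that will stream through a large
 * file-backed mapping exactly once might advise:
 *
 *	#include <sys/mman.h>
 *
 *	if (madvise(addr, length, MADV_SEQUENTIAL) != 0)
 *		perror("madvise");
 *
 * Since the advice is best-effort, a failure here can affect performance
 * but never correctness.
 */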
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_hwpoison(behavior, start, start + len_in);
#endif
	if (!madvise_behavior_valid(behavior))
		return error;

	write = madvise_need_mmap_write(behavior);
	if (write)
		down_write(&current->mm->mmap_sem);
	else
		down_read(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
		goto out;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		goto out;

	end = start + len;
	if (end < start)
		goto out;

	error = 0;
	if (end == start)
		goto out;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}
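
/*
 * Illustrative consequence of the unmapped-gap handling above: if
 * [start, end) covers two vmas with an unmapped hole between them, the
 * advice is applied to both mapped pieces and the call still returns
 * -ENOMEM because of the hole.  Callers therefore must not assume that
 * -ENOMEM means no advice was applied at all.
 */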