xref: /openbmc/linux/mm/madvise.c (revision e868d61272caa648214046a096e5a6bfc068dc8c)
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/hugetlb.h>

/*
 * Any behaviour which results in changes to vma->vm_flags needs to
 * take mmap_sem for writing.  Others, which simply traverse vmas, need
 * only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
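	/*
	 * These behaviours operate on the pages or the backing store only
	 * and never modify vma->vm_flags, so holding mmap_sem for reading
	 * is sufficient.
	 */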
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct * mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	int new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		new_flags &= ~VM_DONTCOPY;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
				vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto out;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out:
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

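/*
 * Illustrative userspace sketch (not part of this file) of the splitting
 * done by madvise_behavior() above: advising only the middle page of a
 * three-page mapping gives that page VM_RAND_READ while its neighbours
 * keep the default flags, so the single VMA is split into three.
 * Assumes a 4096-byte page size; error handling omitted.
 *
 *	char *p = mmap(NULL, 3 * 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(p + 4096, 4096, MADV_RANDOM);
 *	// /proc/self/maps now lists three VMAs covering [p, p + 3*4096)
 */
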
/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
			     struct vm_area_struct ** prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

	if (!file)
		return -EBADF;

	if (file->f_mapping->a_ops->get_xip_page) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping,
			file, start, max_sane_readahead(end - start));
	return 0;
}

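/*
 * Illustrative userspace sketch (not part of this file): a reader that
 * knows it will soon walk a file-backed mapping can ask for the readahead
 * above to be started early.  "fd" and "len" are assumed to be supplied
 * by the caller; error handling omitted.
 *
 *	void *buf = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
 *	madvise(buf, len, MADV_WILLNEED);	// I/O is queued, call returns
 *	// ... later accesses to buf are likely to hit the page cache ...
 */
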
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for refill_inactive to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * refill_inactive to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushing it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing the data that was kept in
 * them to backing store.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
			     struct vm_area_struct ** prev,
			     unsigned long start, unsigned long end)
{
	*prev = vma;
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
		struct zap_details details = {
			.nonlinear_vma = vma,
			.last_index = ULONG_MAX,
		};
		zap_page_range(vma, start, end - start, &details);
	} else
		zap_page_range(vma, start, end - start, NULL);
	return 0;
}

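/*
 * Illustrative userspace sketch (not part of this file): for a private
 * anonymous mapping, MADV_DONTNEED simply discards the contents, so a
 * later read faults in a fresh zero page instead of going to swap.
 * Assumes a 4096-byte page size; error handling omitted.
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 42;
 *	madvise(p, 4096, MADV_DONTNEED);
 *	assert(p[0] == 0);	// old data is gone, page refaults as zero
 */
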
/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	struct address_space *mapping;
	loff_t offset, endoff;
	int error;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
		return -EINVAL;

	if (!vma->vm_file || !vma->vm_file->f_mapping
		|| !vma->vm_file->f_mapping->host) {
			return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	mapping = vma->vm_file->f_mapping;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	endoff = (loff_t)(end - vma->vm_start - 1)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
	up_read(&current->mm->mmap_sem);
	error = vmtruncate_range(mapping->host, offset, endoff);
	down_read(&current->mm->mmap_sem);
	return error;
}

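/*
 * Illustrative userspace sketch (not part of this file): punching a hole
 * through a shared, writable mapping of a tmpfs file.  "fd" is assumed to
 * refer to a tmpfs file at least two pages long; on other filesystems the
 * call fails, as noted above.  Assumes a 4096-byte page size.
 *
 *	char *p = mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *	madvise(p, 4096, MADV_REMOVE);	// first page reads back as zeroes
 *					// and its backing store is freed
 */
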
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	long error;

	switch (behavior) {
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			break;
		}
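		/* fall through */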
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
		error = madvise_behavior(vma, prev, start, end, behavior);
		break;
	case MADV_REMOVE:
		error = madvise_remove(vma, prev, start, end);
		break;

	case MADV_WILLNEED:
		error = madvise_willneed(vma, prev, start, end);
		break;

	case MADV_DONTNEED:
		error = madvise_dontneed(vma, prev, start, end);
		break;

	default:
		error = -EINVAL;
		break;
	}
	return error;
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the
 *		application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct * vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	size_t len;

	if (madvise_need_mmap_write(behavior))
		down_write(&current->mm->mmap_sem);
	else
		down_read(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
		goto out;
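	/*
	 * Round len_in up to a whole number of pages.  With 4096-byte
	 * pages, for example, ~PAGE_MASK is 4095, so len_in == 1 becomes
	 * len == 4096, while a small negative value cast to size_t wraps
	 * around and the masked result is 0, which the check below rejects.
	 */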
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		goto out;

	end = start + len;
	if (end < start)
		goto out;

	error = 0;
	if (end == start)
		goto out;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * This differs from the way mlock etc. handle it.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	if (madvise_need_mmap_write(behavior))
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}
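
/*
 * Illustrative userspace sketch (not part of this file): a typical call
 * with error handling that matches the return values documented above.
 * "addr" and "length" are assumed to describe a page-aligned, mapped
 * range; <sys/mman.h> and <errno.h> are assumed to be included.
 *
 *	if (madvise(addr, length, MADV_SEQUENTIAL) == -1 &&
 *	    errno != EAGAIN)	// EAGAIN is transient and may be retried
 *		perror("madvise");
 */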