/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>

#include "internal.h"

int can_do_mlock(void)
{
	if (capable(CAP_IPC_LOCK))
		return 1;
	if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
		return 1;
	return 0;
}
EXPORT_SYMBOL(can_do_mlock);
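
/*
 * Usage note: callers without CAP_IPC_LOCK may only mlock when
 * RLIMIT_MEMLOCK is non-zero.  A minimal userspace sketch of the usual
 * precondition check (illustrative only; assumes a POSIX userland with the
 * glibc wrappers, and that raising the soft limit up to the hard limit is
 * acceptable):
 *
 *	#include <sys/mman.h>
 *	#include <sys/resource.h>
 *
 *	static int lock_with_limit(void *addr, size_t len)
 *	{
 *		struct rlimit rl;
 *
 *		if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0 && rl.rlim_cur < len) {
 *			rl.rlim_cur = len < rl.rlim_max ? len : rl.rlim_max;
 *			setrlimit(RLIMIT_MEMLOCK, &rl);
 *		}
 *		return mlock(addr, len);
 *	}
 */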

/*
 * Mlocked pages are marked with PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 *
 * When lazy mlocking via vmscan, it is important to ensure that the
 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
 * may have mlocked a page that is being munlocked. So lazy mlock must take
 * the mmap_sem for read, and verify that the vma really is locked
 * (see mm/rmap.c).
 */
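
/*
 * On the vmscan/rmap side the check described above looks roughly like the
 * sketch below (see mm/rmap.c for the real code; this only illustrates the
 * locking rule, it is not a copy of that code):
 *
 *	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
 *		if (vma->vm_flags & VM_LOCKED)
 *			mlock_vma_page(page);
 *		up_read(&vma->vm_mm->mmap_sem);
 *	}
 *
 * If the trylock fails, the munlock side may be in the middle of clearing
 * VM_LOCKED, so vmscan leaves the page alone for now.
 */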

/*
 *  LRU accounting for clear_page_mlock()
 */
void __clear_page_mlock(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	if (!page->mapping) {	/* truncated ? */
		return;
	}

	dec_zone_page_state(page, NR_MLOCK);
	count_vm_event(UNEVICTABLE_PGCLEARED);
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race.  The page has already been moved to
		 * the evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_event(UNEVICTABLE_PGSTRANDED);
	}
}

/*
 * Mark page as mlocked if not already.
 * If page on LRU, isolate and putback to move to unevictable list.
 */
void mlock_vma_page(struct page *page)
{
	BUG_ON(!PageLocked(page));

	if (!TestSetPageMlocked(page)) {
		inc_zone_page_state(page, NR_MLOCK);
		count_vm_event(UNEVICTABLE_PGMLOCKED);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}

/*
 * Called from the munlock()/munmap() path with the page supposedly on the LRU.
 *
 * Note:  unlike mlock_vma_page(), we can't just clear the PageMlocked
 * [in try_to_munlock()] and then attempt to isolate the page.  We must
 * isolate the page to keep others from messing with its unevictable
 * and mlocked state while trying to munlock.  However, we pre-clear the
 * mlocked state anyway as we might lose the isolation race and we might
 * not get another chance to clear PageMlocked.  If we successfully
 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
 * mapping the page, it will restore the PageMlocked state, unless the page
 * is mapped in a non-linear vma.  So, we go ahead and SetPageMlocked(),
 * perhaps redundantly.
 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
 * vmas, we'll detect this in vmscan -- via try_to_munlock() or
 * try_to_unmap() -- either of which will restore the PageMlocked state by
 * calling mlock_vma_page() above, if it can grab the vma's mmap_sem.
 */
static void munlock_vma_page(struct page *page)
{
	BUG_ON(!PageLocked(page));

	if (TestClearPageMlocked(page)) {
		dec_zone_page_state(page, NR_MLOCK);
		if (!isolate_lru_page(page)) {
			int ret = try_to_munlock(page);
			/*
			 * did try_to_munlock() succeed or punt?
			 */
			if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
				count_vm_event(UNEVICTABLE_PGMUNLOCKED);

			putback_lru_page(page);
		} else {
			/*
			 * We lost the race.  Let try_to_unmap() deal
			 * with it.  At least we get the page state and
			 * mlock stats right.  However, the page is still on
			 * the unevictable list.  We'll fix that up when
			 * the page is eventually freed or we scan the
			 * unevictable list.
			 */
			if (PageUnevictable(page))
				count_vm_event(UNEVICTABLE_PGSTRANDED);
			else
				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
		}
	}
}

/**
 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @mlock: 0 indicates munlock, otherwise mlock.
 *
 * If @mlock == 0, unlock an mlocked range;
 * else mlock the range of pages.  This takes care of making the pages
 * present, too.
 *
 * return 0 on success, negative error code on error.
 *
 * vma->vm_mm->mmap_sem must be held for at least read.
 */
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
				   unsigned long start, unsigned long end,
				   int mlock)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = start;
	struct page *pages[16]; /* 16 gives a reasonable batch */
	int nr_pages = (end - start) / PAGE_SIZE;
	int ret = 0;
	int gup_flags = 0;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(end   & ~PAGE_MASK);
	VM_BUG_ON(start < vma->vm_start);
	VM_BUG_ON(end   > vma->vm_end);
	VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
		  (atomic_read(&mm->mm_users) != 0));

	/*
	 * mlock:   don't populate pages if the vma has PROT_NONE permission.
	 * munlock: always munlock, even if the vma has PROT_NONE
	 *          permission or SIGKILL is pending.
	 */
	if (!mlock)
		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
			     GUP_FLAGS_IGNORE_SIGKILL;

	if (vma->vm_flags & VM_WRITE)
		gup_flags |= GUP_FLAGS_WRITE;

	while (nr_pages > 0) {
		int i;

		cond_resched();

		/*
		 * get_user_pages() makes pages present if we are
		 * setting mlock, and the extra reference count will
		 * prevent migration of the page.  However, the page
		 * may still be truncated out from under us.
		 */
		ret = __get_user_pages(current, mm, addr,
				min_t(int, nr_pages, ARRAY_SIZE(pages)),
				gup_flags, pages, NULL);
		/*
		 * This can happen for, e.g., VM_NONLINEAR regions before
		 * a page has been allocated and mapped at a given offset,
		 * or for addresses that map beyond end of a file.
		 * We'll mlock the pages if/when they get faulted in.
		 */
		if (ret < 0)
			break;
		if (ret == 0) {
			/*
			 * We know the vma is there, so the only time
			 * we cannot get a single page should be an
			 * error (ret < 0) case.
			 */
			WARN_ON(1);
			break;
		}

		lru_add_drain();	/* push cached pages to LRU */

		for (i = 0; i < ret; i++) {
			struct page *page = pages[i];

			lock_page(page);
			/*
			 * Because we lock page here and migration is blocked
			 * by the elevated reference, we need only check for
			 * page truncation (file-cache only).
			 */
			if (page->mapping) {
				if (mlock)
					mlock_vma_page(page);
				else
					munlock_vma_page(page);
			}
			unlock_page(page);
			put_page(page);		/* ref from get_user_pages() */

			/*
			 * here we assume that get_user_pages() has given us
			 * a list of virtually contiguous pages.
			 */
			addr += PAGE_SIZE;	/* for next get_user_pages() */
			nr_pages--;
		}
		ret = 0;
	}

	return ret;	/* count entire vma as locked_vm */
}
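
/*
 * For scale (hypothetical numbers, just to illustrate the batching): with
 * 4 KiB pages, mlocking a 1 MiB range gives nr_pages = 256, so the loop
 * above makes 16 calls to __get_user_pages(), each filling the 16-entry
 * pages[] batch, and marks every returned page with mlock_vma_page().
 */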

/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}
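
/*
 * From userspace this means that an unmapped address reports ENOMEM, while
 * "could not lock right now" reports EAGAIN, which is worth retrying.  A
 * minimal sketch of a caller that does so (illustrative only; assumes a
 * POSIX userland):
 *
 *	#include <errno.h>
 *	#include <sys/mman.h>
 *
 *	static int mlock_retry(void *addr, size_t len, int attempts)
 *	{
 *		while (attempts-- > 0) {
 *			if (mlock(addr, len) == 0)
 *				return 0;
 *			if (errno != EAGAIN)
 *				break;
 *		}
 *		return -1;
 *	}
 */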

/**
 * mlock_vma_pages_range() - mlock pages in specified vma range.
 * @vma:   the vma containing the specified address range
 * @start: starting address in @vma to mlock
 * @end:   end address [+1] in @vma to mlock
 *
 * For mmap()/mremap()/expansion of mlocked vma.
 *
 * return 0 on success for "normal" vmas.
 *
 * return number of pages [> 0] to be removed from locked_vm on success
 * of "special" vmas.
 */
long mlock_vma_pages_range(struct vm_area_struct *vma,
			unsigned long start, unsigned long end)
{
	int nr_pages = (end - start) / PAGE_SIZE;

	BUG_ON(!(vma->vm_flags & VM_LOCKED));

	/*
	 * filter unlockable vmas
	 */
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		goto no_mlock;

	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
			is_vm_hugetlb_page(vma) ||
			vma == get_gate_vma(current))) {

		__mlock_vma_pages_range(vma, start, end, 1);

		/* Hide errors from mmap() and other callers */
		return 0;
	}

	/*
	 * User mapped kernel pages or huge pages:
	 * make these pages present to populate the ptes, but
	 * fall through to reset VM_LOCKED--no need to unlock, and
	 * return nr_pages so these don't get counted against the task's
	 * locked limit.  Huge pages are already counted against the
	 * locked vm limit.
	 */
	make_pages_present(start, end);

no_mlock:
	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
	return nr_pages;		/* error or pages NOT mlocked */
}

/**
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma:   vma containing range to be munlock()ed.
 * @start: start address in @vma of the range
 * @end:   end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 *
 * We don't save and restore VM_LOCKED here because pages are
 * still on the LRU.  In the unmap path, pages might be scanned by reclaim
 * and re-mlocked by try_to_{munlock|unmap} before we unmap and
 * free them.  This will result in freeing mlocked pages.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end)
{
	vma->vm_flags &= ~VM_LOCKED;
	__mlock_vma_pages_range(vma, start, end, 0);
}

/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes via make_pages_present().
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, unsigned int newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = newflags & VM_LOCKED;

	if (newflags == vma->vm_flags ||
			(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;	/* don't set VM_LOCKED,  don't count */

	if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
			is_vm_hugetlb_page(vma) ||
			vma == get_gate_vma(current)) {
		if (lock)
			make_pages_present(start, end);
		goto out;	/* don't set VM_LOCKED,  don't count */
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED; __mlock_vma_pages_range will bring it back.
	 */
	vma->vm_flags = newflags;

	if (lock) {
		ret = __mlock_vma_pages_range(vma, start, end, 1);

		if (ret > 0) {
			mm->locked_vm -= ret;
			ret = 0;
		} else
			ret = __mlock_posix_error_return(ret); /* translate if needed */
	} else {
		__mlock_vma_pages_range(vma, start, end, 0);
	}

out:
	*prev = vma;
	return ret;
}
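
/*
 * A worked example of the split/merge bookkeeping above, with hypothetical
 * numbers and 4 KiB pages: take a vma covering [0x400000, 0x500000) with
 * vm_pgoff == 0x100, and an mlock() request for [0x420000, 0x430000).
 * Then
 *
 *	pgoff = 0x100 + ((0x420000 - 0x400000) >> PAGE_SHIFT) = 0x120
 *
 * If vma_merge() cannot merge, split_vma() is called at 0x420000 and again
 * at 0x430000, leaving a 16-page vma that receives VM_LOCKED, and
 * mm->locked_vm grows by 16 pages.
 */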

static int do_mlock(unsigned long start, size_t len, int on)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	len = PAGE_ALIGN(len);
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma_prev(current->mm, start, &prev);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned int newflags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		newflags = vma->vm_flags | VM_LOCKED;
		if (!on)
			newflags &= ~VM_LOCKED;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	if (!can_do_mlock())
		return -EPERM;

	lru_add_drain_all();	/* flush pagevec */

	down_write(&current->mm->mmap_sem);
	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
	start &= PAGE_MASK;

	locked = len >> PAGE_SHIFT;
	locked += current->mm->locked_vm;

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	lock_limit >>= PAGE_SHIFT;

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = do_mlock(start, len, 1);
	up_write(&current->mm->mmap_sem);
	return error;
}
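
/*
 * Note that the syscall rounds @start down and @len up to page boundaries,
 * so locking any byte of a page locks (and charges) the whole page.  A
 * minimal userspace sketch of that behaviour (illustrative only; assumes a
 * POSIX userland and enough RLIMIT_MEMLOCK for one page):
 *
 *	#include <stdlib.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		long page = sysconf(_SC_PAGESIZE);
 *		void *buf;
 *
 *		if (posix_memalign(&buf, page, 3 * page))
 *			return 1;
 *		if (mlock((char *)buf + page + 100, 1))
 *			return 1;
 *		munlock((char *)buf + page, page);
 *		free(buf);
 *		return 0;
 *	}
 *
 * Locking one byte of the middle page above pins that entire page; the
 * matching munlock() of the same page releases it.
 */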

SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
	start &= PAGE_MASK;
	ret = do_mlock(start, len, 0);
	up_write(&current->mm->mmap_sem);
	return ret;
}

static int do_mlockall(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;
	unsigned int def_flags = 0;

	if (flags & MCL_FUTURE)
		def_flags = VM_LOCKED;
	current->mm->def_flags = def_flags;
	if (flags == MCL_FUTURE)
		goto out;

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		unsigned int newflags;

		newflags = vma->vm_flags | VM_LOCKED;
		if (!(flags & MCL_CURRENT))
			newflags &= ~VM_LOCKED;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
	}
out:
	return 0;
}

SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret = -EINVAL;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
		goto out;

	ret = -EPERM;
	if (!can_do_mlock())
		goto out;

	lru_add_drain_all();	/* flush pagevec */

	down_write(&current->mm->mmap_sem);

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	lock_limit >>= PAGE_SHIFT;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = do_mlockall(flags);
	up_write(&current->mm->mmap_sem);
out:
	return ret;
}

SYSCALL_DEFINE0(munlockall)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	ret = do_mlockall(0);
	up_write(&current->mm->mmap_sem);
	return ret;
}
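
/*
 * Typical use from a latency-sensitive process, as a minimal userspace
 * sketch (illustrative only; assumes the glibc wrappers and either
 * CAP_IPC_LOCK or a sufficiently large RLIMIT_MEMLOCK):
 *
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
 *		perror("mlockall");
 *	... latency-critical work: present pages won't be reclaimed ...
 *	munlockall();
 *
 * MCL_FUTURE alone only sets mm->def_flags for future mappings; MCL_CURRENT
 * walks every existing vma through mlock_fixup() above.
 */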

/*
 * Objects with different lifetimes than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct user_struct *user)
{
	unsigned long lock_limit, locked;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	if (!allowed &&
	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
		goto out;
	get_uid(user);
	user->locked_shm += locked;
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

void user_shm_unlock(size_t size, struct user_struct *user)
{
	spin_lock(&shmlock_user_lock);
	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	spin_unlock(&shmlock_user_lock);
	free_uid(user);
}
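
/*
 * These two back the SysV SHM_LOCK path, where the locked size is charged
 * to the segment owner's user_struct rather than to a process.  A minimal
 * userspace sketch (illustrative only; error handling omitted):
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *
 *	int id = shmget(IPC_PRIVATE, 1 << 20, IPC_CREAT | 0600);
 *
 *	shmctl(id, SHM_LOCK, NULL);
 *	... the segment's pages are now unevictable ...
 *	shmctl(id, SHM_UNLOCK, NULL);
 *	shmctl(id, IPC_RMID, NULL);
 */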

int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
			  size_t size)
{
	unsigned long lim, vm, pgsz;
	int error = -ENOMEM;

	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;

	down_write(&mm->mmap_sem);

	lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
	vm   = mm->total_vm + pgsz;
	if (lim < vm)
		goto out;

	lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
	vm   = mm->locked_vm + pgsz;
	if (lim < vm)
		goto out;

	mm->total_vm  += pgsz;
	mm->locked_vm += pgsz;

	error = 0;
 out:
	up_write(&mm->mmap_sem);
	return error;
}
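
/*
 * Worked example of the accounting above, with hypothetical numbers and
 * 4 KiB pages: for size == 10000 bytes, pgsz = PAGE_ALIGN(10000) >>
 * PAGE_SHIFT = 12288 >> 12 = 3 pages.  The charge is refused with -ENOMEM
 * unless both total_vm + 3 fits under RLIMIT_AS and locked_vm + 3 fits
 * under RLIMIT_MEMLOCK (both limits expressed in pages here).
 */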

void refund_locked_memory(struct mm_struct *mm, size_t size)
{
	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;

	down_write(&mm->mmap_sem);

	mm->total_vm  -= pgsz;
	mm->locked_vm -= pgsz;

	up_write(&mm->mmap_sem);
}
650