xref: /openbmc/linux/mm/mprotect.c (revision dc6a81c3)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  mm/mprotect.c
4   *
5   *  (C) Copyright 1994 Linus Torvalds
6   *  (C) Copyright 2002 Christoph Hellwig
7   *
8   *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
9   *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
10   */
11  
12  #include <linux/pagewalk.h>
13  #include <linux/hugetlb.h>
14  #include <linux/shm.h>
15  #include <linux/mman.h>
16  #include <linux/fs.h>
17  #include <linux/highmem.h>
18  #include <linux/security.h>
19  #include <linux/mempolicy.h>
20  #include <linux/personality.h>
21  #include <linux/syscalls.h>
22  #include <linux/swap.h>
23  #include <linux/swapops.h>
24  #include <linux/mmu_notifier.h>
25  #include <linux/migrate.h>
26  #include <linux/perf_event.h>
27  #include <linux/pkeys.h>
28  #include <linux/ksm.h>
29  #include <linux/uaccess.h>
30  #include <linux/mm_inline.h>
31  #include <asm/pgtable.h>
32  #include <asm/cacheflush.h>
33  #include <asm/mmu_context.h>
34  #include <asm/tlbflush.h>
35  
36  #include "internal.h"
37  
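/*
 * change_pte_range - apply @newprot to the PTEs mapped below one pmd.
 *
 * Walks the PTEs in [@addr, @end) under the pte lock.  Present PTEs are
 * rewritten via the ptep_modify_prot_start()/commit() pair; for NUMA
 * hinting updates (@prot_numa) several classes of pages are skipped, see
 * the comments inside the loop.  Writable migration and device private
 * swap entries are downgraded to read-only so they cannot re-grant write
 * access once the entry is restored.
 *
 * Returns the number of entries that were updated.
 */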
38  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39  		unsigned long addr, unsigned long end, pgprot_t newprot,
40  		int dirty_accountable, int prot_numa)
41  {
42  	pte_t *pte, oldpte;
43  	spinlock_t *ptl;
44  	unsigned long pages = 0;
45  	int target_node = NUMA_NO_NODE;
46  
47  	/*
48  	 * Can be called with only the mmap_sem for reading by
49  	 * prot_numa, so we must check that the pmd isn't changing
50  	 * under us from pmd_none to pmd_trans_huge and/or the other
51  	 * way around.
52  	 */
53  	if (pmd_trans_unstable(pmd))
54  		return 0;
55  
56  	/*
57  	 * The pmd points to a regular pte so the pmd can't change
58  	 * from under us even if the mmap_sem is only held for
59  	 * reading.
60  	 */
61  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
62  
63  	/* Get target node for single-threaded private VMAs */
64  	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
65  	    atomic_read(&vma->vm_mm->mm_users) == 1)
66  		target_node = numa_node_id();
67  
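	/*
	 * If reclaim has a batched TLB flush pending for this mm, issue it
	 * now so we do not change PTEs while stale, possibly writable, TLB
	 * entries may still be cached on other CPUs.
	 */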
68  	flush_tlb_batched_pending(vma->vm_mm);
69  	arch_enter_lazy_mmu_mode();
70  	do {
71  		oldpte = *pte;
72  		if (pte_present(oldpte)) {
73  			pte_t ptent;
74  			bool preserve_write = prot_numa && pte_write(oldpte);
75  
76  			/*
77  			 * Avoid trapping faults against the zero or KSM
78  			 * pages. See similar comment in change_huge_pmd.
79  			 */
80  			if (prot_numa) {
81  				struct page *page;
82  
83  				/* Avoid TLB flush if possible */
84  				if (pte_protnone(oldpte))
85  					continue;
86  
87  				page = vm_normal_page(vma, addr, oldpte);
88  				if (!page || PageKsm(page))
89  					continue;
90  
91  				/* Also skip shared copy-on-write pages */
92  				if (is_cow_mapping(vma->vm_flags) &&
93  				    page_mapcount(page) != 1)
94  					continue;
95  
96  				/*
97  				 * While migration can move some dirty pages,
98  				 * it cannot move them all from MIGRATE_ASYNC
99  				 * context.
100  				 */
101  				if (page_is_file_cache(page) && PageDirty(page))
102  					continue;
103  
104  				/*
105  				 * Don't mess with PTEs if page is already on the node
106  				 * a single-threaded process is running on.
107  				 */
108  				if (target_node == page_to_nid(page))
109  					continue;
110  			}
111  
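			/*
			 * ptep_modify_prot_start() atomically clears the PTE
			 * and hands back its value, so hardware A/D bit
			 * updates cannot be lost while the new protection is
			 * computed; ptep_modify_prot_commit() below installs
			 * the result.
			 */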
112  			oldpte = ptep_modify_prot_start(vma, addr, pte);
113  			ptent = pte_modify(oldpte, newprot);
114  			if (preserve_write)
115  				ptent = pte_mk_savedwrite(ptent);
116  
117  			/* Avoid taking write faults for known dirty pages */
118  			if (dirty_accountable && pte_dirty(ptent) &&
119  					(pte_soft_dirty(ptent) ||
120  					 !(vma->vm_flags & VM_SOFTDIRTY))) {
121  				ptent = pte_mkwrite(ptent);
122  			}
123  			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
124  			pages++;
125  		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
126  			swp_entry_t entry = pte_to_swp_entry(oldpte);
127  
128  			if (is_write_migration_entry(entry)) {
129  				pte_t newpte;
130  				/*
131  				 * A protection check is difficult so
132  				 * just be safe and disable write
133  				 */
134  				make_migration_entry_read(&entry);
135  				newpte = swp_entry_to_pte(entry);
136  				if (pte_swp_soft_dirty(oldpte))
137  					newpte = pte_swp_mksoft_dirty(newpte);
138  				set_pte_at(vma->vm_mm, addr, pte, newpte);
139  
140  				pages++;
141  			}
142  
143  			if (is_write_device_private_entry(entry)) {
144  				pte_t newpte;
145  
146  				/*
147  				 * We do not preserve soft-dirtiness. See
148  				 * copy_one_pte() for explanation.
149  				 */
150  				make_device_private_entry_read(&entry);
151  				newpte = swp_entry_to_pte(entry);
152  				set_pte_at(vma->vm_mm, addr, pte, newpte);
153  
154  				pages++;
155  			}
156  		}
157  	} while (pte++, addr += PAGE_SIZE, addr != end);
158  	arch_leave_lazy_mmu_mode();
159  	pte_unmap_unlock(pte - 1, ptl);
160  
161  	return pages;
162  }
163  
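/*
 * change_pmd_range - apply @newprot to everything mapped below one pud.
 *
 * Transparent huge pmds are updated in place by change_huge_pmd() when the
 * range covers the whole huge page, otherwise they are split first.  The
 * mmu notifier start call is deferred until the first populated pmd is
 * found, so ranges with nothing mapped generate no notification at all.
 */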
164  static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
165  		pud_t *pud, unsigned long addr, unsigned long end,
166  		pgprot_t newprot, int dirty_accountable, int prot_numa)
167  {
168  	pmd_t *pmd;
169  	unsigned long next;
170  	unsigned long pages = 0;
171  	unsigned long nr_huge_updates = 0;
172  	struct mmu_notifier_range range;
173  
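	/*
	 * range.start doubles as a "notifier started" flag: it stays zero
	 * until a populated pmd is found and mmu_notifier_range_init() has
	 * been called for this walk.
	 */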
174  	range.start = 0;
175  
176  	pmd = pmd_offset(pud, addr);
177  	do {
178  		unsigned long this_pages;
179  
180  		next = pmd_addr_end(addr, end);
181  		if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
182  				&& pmd_none_or_clear_bad(pmd))
183  			goto next;
184  
185  		/* invoke the mmu notifier if the pmd is populated */
186  		if (!range.start) {
187  			mmu_notifier_range_init(&range,
188  				MMU_NOTIFY_PROTECTION_VMA, 0,
189  				vma, vma->vm_mm, addr, end);
190  			mmu_notifier_invalidate_range_start(&range);
191  		}
192  
193  		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
194  			if (next - addr != HPAGE_PMD_SIZE) {
195  				__split_huge_pmd(vma, pmd, addr, false, NULL);
196  			} else {
197  				int nr_ptes = change_huge_pmd(vma, pmd, addr,
198  						newprot, prot_numa);
199  
200  				if (nr_ptes) {
201  					if (nr_ptes == HPAGE_PMD_NR) {
202  						pages += HPAGE_PMD_NR;
203  						nr_huge_updates++;
204  					}
205  
206  					/* huge pmd was handled */
207  					goto next;
208  				}
209  			}
210  			/* fall through, the trans huge pmd just split */
211  		}
212  		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
213  				 dirty_accountable, prot_numa);
214  		pages += this_pages;
215  next:
216  		cond_resched();
217  	} while (pmd++, addr = next, addr != end);
218  
219  	if (range.start)
220  		mmu_notifier_invalidate_range_end(&range);
221  
222  	if (nr_huge_updates)
223  		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
224  	return pages;
225  }
226  
227  static inline unsigned long change_pud_range(struct vm_area_struct *vma,
228  		p4d_t *p4d, unsigned long addr, unsigned long end,
229  		pgprot_t newprot, int dirty_accountable, int prot_numa)
230  {
231  	pud_t *pud;
232  	unsigned long next;
233  	unsigned long pages = 0;
234  
235  	pud = pud_offset(p4d, addr);
236  	do {
237  		next = pud_addr_end(addr, end);
238  		if (pud_none_or_clear_bad(pud))
239  			continue;
240  		pages += change_pmd_range(vma, pud, addr, next, newprot,
241  				 dirty_accountable, prot_numa);
242  	} while (pud++, addr = next, addr != end);
243  
244  	return pages;
245  }
246  
247  static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
248  		pgd_t *pgd, unsigned long addr, unsigned long end,
249  		pgprot_t newprot, int dirty_accountable, int prot_numa)
250  {
251  	p4d_t *p4d;
252  	unsigned long next;
253  	unsigned long pages = 0;
254  
255  	p4d = p4d_offset(pgd, addr);
256  	do {
257  		next = p4d_addr_end(addr, end);
258  		if (p4d_none_or_clear_bad(p4d))
259  			continue;
260  		pages += change_pud_range(vma, p4d, addr, next, newprot,
261  				 dirty_accountable, prot_numa);
262  	} while (p4d++, addr = next, addr != end);
263  
264  	return pages;
265  }
266  
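/*
 * change_protection_range - walk the page tables for [@addr, @end) in @vma
 * and rewrite every present entry with @newprot.
 *
 * The TLB flush is deferred: inc_tlb_flush_pending() announces the pending
 * flush to concurrent code, and flush_tlb_range() is only issued when at
 * least one entry actually changed.
 */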
267  static unsigned long change_protection_range(struct vm_area_struct *vma,
268  		unsigned long addr, unsigned long end, pgprot_t newprot,
269  		int dirty_accountable, int prot_numa)
270  {
271  	struct mm_struct *mm = vma->vm_mm;
272  	pgd_t *pgd;
273  	unsigned long next;
274  	unsigned long start = addr;
275  	unsigned long pages = 0;
276  
277  	BUG_ON(addr >= end);
278  	pgd = pgd_offset(mm, addr);
279  	flush_cache_range(vma, addr, end);
280  	inc_tlb_flush_pending(mm);
281  	do {
282  		next = pgd_addr_end(addr, end);
283  		if (pgd_none_or_clear_bad(pgd))
284  			continue;
285  		pages += change_p4d_range(vma, pgd, addr, next, newprot,
286  				 dirty_accountable, prot_numa);
287  	} while (pgd++, addr = next, addr != end);
288  
289  	/* Only flush the TLB if we actually modified any entries: */
290  	if (pages)
291  		flush_tlb_range(vma, start, end);
292  	dec_tlb_flush_pending(mm);
293  
294  	return pages;
295  }
296  
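/*
 * change_protection - common entry point used by mprotect_fixup() and the
 * NUMA hinting code (change_prot_numa()).  Hugetlb VMAs take their own
 * path; everything else goes through the page table walk above.  Returns
 * the number of pages whose protection was changed.
 */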
297  unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
298  		       unsigned long end, pgprot_t newprot,
299  		       int dirty_accountable, int prot_numa)
300  {
301  	unsigned long pages;
302  
303  	if (is_vm_hugetlb_page(vma))
304  		pages = hugetlb_change_protection(vma, start, end, newprot);
305  	else
306  		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
307  
308  	return pages;
309  }
310  
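/*
 * The walk ops below implement the PROT_NONE PFN check in mprotect_fixup():
 * when a VM_PFNMAP/VM_MIXEDMAP mapping loses all of read/write/execute on an
 * architecture with arch_has_pfn_modify_check(), every mapped PFN must still
 * be permitted under the new protection, otherwise the whole mprotect() call
 * is rejected with -EACCES before any state has been touched.
 */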
311  static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
312  			       unsigned long next, struct mm_walk *walk)
313  {
314  	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
315  		0 : -EACCES;
316  }
317  
318  static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
319  				   unsigned long addr, unsigned long next,
320  				   struct mm_walk *walk)
321  {
322  	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
323  		0 : -EACCES;
324  }
325  
326  static int prot_none_test(unsigned long addr, unsigned long next,
327  			  struct mm_walk *walk)
328  {
329  	return 0;
330  }
331  
332  static const struct mm_walk_ops prot_none_walk_ops = {
333  	.pte_entry		= prot_none_pte_entry,
334  	.hugetlb_entry		= prot_none_hugetlb_entry,
335  	.test_walk		= prot_none_test,
336  };
337  
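/*
 * mprotect_fixup - switch the part of @vma covering [@start, @end) over to
 * @newflags.
 *
 * Takes care of commit accounting when a private mapping becomes writable,
 * tries to merge with the neighbouring VMAs, splits @vma when only part of
 * it is affected, and finally rewrites the page tables through
 * change_protection().  On success *@pprev points at the VMA that now
 * covers @start.
 */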
338  int
339  mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
340  	unsigned long start, unsigned long end, unsigned long newflags)
341  {
342  	struct mm_struct *mm = vma->vm_mm;
343  	unsigned long oldflags = vma->vm_flags;
344  	long nrpages = (end - start) >> PAGE_SHIFT;
345  	unsigned long charged = 0;
346  	pgoff_t pgoff;
347  	int error;
348  	int dirty_accountable = 0;
349  
350  	if (newflags == oldflags) {
351  		*pprev = vma;
352  		return 0;
353  	}
354  
355  	/*
356  	 * Do PROT_NONE PFN permission checks here when we can still
357  	 * bail out without undoing a lot of state. This is a rather
358  	 * uncommon case, so it doesn't need to be very optimized.
359  	 */
360  	if (arch_has_pfn_modify_check() &&
361  	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
362  	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
363  		pgprot_t new_pgprot = vm_get_page_prot(newflags);
364  
365  		error = walk_page_range(current->mm, start, end,
366  				&prot_none_walk_ops, &new_pgprot);
367  		if (error)
368  			return error;
369  	}
370  
371  	/*
372  	 * If we make a private mapping writable we increase our commit;
373  	 * but (without finer accounting) cannot reduce our commit if we
374   *  make it unwritable again. hugetlb mappings were accounted for
375   *  even if read-only, so there is no need to account for them here.
376  	 */
377  	if (newflags & VM_WRITE) {
378  		/* Check space limits when area turns into data. */
379  		if (!may_expand_vm(mm, newflags, nrpages) &&
380  				may_expand_vm(mm, oldflags, nrpages))
381  			return -ENOMEM;
382  		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
383  						VM_SHARED|VM_NORESERVE))) {
384  			charged = nrpages;
385  			if (security_vm_enough_memory_mm(mm, charged))
386  				return -ENOMEM;
387  			newflags |= VM_ACCOUNT;
388  		}
389  	}
390  
391  	/*
392  	 * First try to merge with previous and/or next vma.
393  	 */
394  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
395  	*pprev = vma_merge(mm, *pprev, start, end, newflags,
396  			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
397  			   vma->vm_userfaultfd_ctx);
398  	if (*pprev) {
399  		vma = *pprev;
400  		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
401  		goto success;
402  	}
403  
404  	*pprev = vma;
405  
406  	if (start != vma->vm_start) {
407  		error = split_vma(mm, vma, start, 1);
408  		if (error)
409  			goto fail;
410  	}
411  
412  	if (end != vma->vm_end) {
413  		error = split_vma(mm, vma, end, 0);
414  		if (error)
415  			goto fail;
416  	}
417  
418  success:
419  	/*
420  	 * vm_flags and vm_page_prot are protected by the mmap_sem
421  	 * held in write mode.
422  	 */
423  	vma->vm_flags = newflags;
424  	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
425  	vma_set_page_prot(vma);
426  
427  	change_protection(vma, start, end, vma->vm_page_prot,
428  			  dirty_accountable, 0);
429  
430  	/*
431  	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
432  	 * fault on access.
433  	 */
434  	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
435  			(newflags & VM_WRITE)) {
436  		populate_vma_page_range(vma, start, end, NULL);
437  	}
438  
439  	vm_stat_account(mm, oldflags, -nrpages);
440  	vm_stat_account(mm, newflags, nrpages);
441  	perf_event_mmap(vma);
442  	return 0;
443  
444  fail:
445  	vm_unacct_memory(charged);
446  	return error;
447  }
448  
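/*
 * do_mprotect_pkey - common implementation of mprotect(2) and
 * pkey_mprotect(2).  Validates the arguments, then walks every VMA that
 * intersects [start, start + len) and applies the new protection one VMA
 * at a time through mprotect_fixup().
 */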
449  /*
450   * pkey==-1 when doing a legacy mprotect()
451   */
452  static int do_mprotect_pkey(unsigned long start, size_t len,
453  		unsigned long prot, int pkey)
454  {
455  	unsigned long nstart, end, tmp, reqprot;
456  	struct vm_area_struct *vma, *prev;
457  	int error = -EINVAL;
458  	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
459  	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
460  				(prot & PROT_READ);
461  
462  	start = untagged_addr(start);
463  
464  	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
465  	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
466  		return -EINVAL;
467  
468  	if (start & ~PAGE_MASK)
469  		return -EINVAL;
470  	if (!len)
471  		return 0;
472  	len = PAGE_ALIGN(len);
473  	end = start + len;
474  	if (end <= start)
475  		return -ENOMEM;
476  	if (!arch_validate_prot(prot, start))
477  		return -EINVAL;
478  
479  	reqprot = prot;
480  
481  	if (down_write_killable(&current->mm->mmap_sem))
482  		return -EINTR;
483  
484  	/*
485  	 * If userspace did not allocate the pkey, do not let
486  	 * them use it here.
487  	 */
488  	error = -EINVAL;
489  	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
490  		goto out;
491  
492  	vma = find_vma(current->mm, start);
493  	error = -ENOMEM;
494  	if (!vma)
495  		goto out;
496  	prev = vma->vm_prev;
497  	if (unlikely(grows & PROT_GROWSDOWN)) {
498  		if (vma->vm_start >= end)
499  			goto out;
500  		start = vma->vm_start;
501  		error = -EINVAL;
502  		if (!(vma->vm_flags & VM_GROWSDOWN))
503  			goto out;
504  	} else {
505  		if (vma->vm_start > start)
506  			goto out;
507  		if (unlikely(grows & PROT_GROWSUP)) {
508  			end = vma->vm_end;
509  			error = -EINVAL;
510  			if (!(vma->vm_flags & VM_GROWSUP))
511  				goto out;
512  		}
513  	}
514  	if (start > vma->vm_start)
515  		prev = vma;
516  
517  	for (nstart = start ; ; ) {
518  		unsigned long mask_off_old_flags;
519  		unsigned long newflags;
520  		int new_vma_pkey;
521  
522  		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
523  
524  		/* Does the application expect PROT_READ to imply PROT_EXEC? */
525  		if (rier && (vma->vm_flags & VM_MAYEXEC))
526  			prot |= PROT_EXEC;
527  
528  		/*
529  		 * Each mprotect() call explicitly passes r/w/x permissions.
530  		 * If a permission is not passed to mprotect(), it must be
531  		 * cleared from the VMA.
532  		 */
533  		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
534  					VM_FLAGS_CLEAR;
535  
536  		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
537  		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
538  		newflags |= (vma->vm_flags & ~mask_off_old_flags);
539  
540  		/* newflags >> 4 shifts the VM_MAY% bits into the VM_% positions */
541  		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
542  			error = -EACCES;
543  			goto out;
544  		}
545  
546  		error = security_file_mprotect(vma, reqprot, prot);
547  		if (error)
548  			goto out;
549  
550  		tmp = vma->vm_end;
551  		if (tmp > end)
552  			tmp = end;
553  		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
554  		if (error)
555  			goto out;
556  		nstart = tmp;
557  
558  		if (nstart < prev->vm_end)
559  			nstart = prev->vm_end;
560  		if (nstart >= end)
561  			goto out;
562  
563  		vma = prev->vm_next;
564  		if (!vma || vma->vm_start != nstart) {
565  			error = -ENOMEM;
566  			goto out;
567  		}
568  		prot = reqprot;
569  	}
570  out:
571  	up_write(&current->mm->mmap_sem);
572  	return error;
573  }
574  
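/*
 * Illustrative userspace sketch (not part of this file, error handling
 * omitted, 4KiB page size assumed): map an anonymous page read-write,
 * initialise it, then drop the write permission.
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 1;
 *	mprotect(p, 4096, PROT_READ);
 *
 * A later store through p would then fault with SIGSEGV.
 */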
575  SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
576  		unsigned long, prot)
577  {
578  	return do_mprotect_pkey(start, len, prot, -1);
579  }
580  
581  #ifdef CONFIG_ARCH_HAS_PKEYS
582  
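/*
 * Illustrative userspace sketch of the pkey interface below (glibc
 * wrappers, error handling omitted): allocate a key that denies writes,
 * attach it to an existing mapping, and release it again later.
 *
 *	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
 *	pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);
 *	...
 *	pkey_free(pkey);
 *
 * The page table protection stays PROT_READ|PROT_WRITE; whether a thread
 * may actually write is decided by the key's access rights (e.g. PKRU on
 * x86).
 */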
583  SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
584  		unsigned long, prot, int, pkey)
585  {
586  	return do_mprotect_pkey(start, len, prot, pkey);
587  }
588  
589  SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
590  {
591  	int pkey;
592  	int ret;
593  
594  	/* No flags supported yet. */
595  	if (flags)
596  		return -EINVAL;
597  	/* check for unsupported init values */
598  	if (init_val & ~PKEY_ACCESS_MASK)
599  		return -EINVAL;
600  
601  	down_write(&current->mm->mmap_sem);
602  	pkey = mm_pkey_alloc(current->mm);
603  
604  	ret = -ENOSPC;
605  	if (pkey == -1)
606  		goto out;
607  
608  	ret = arch_set_user_pkey_access(current, pkey, init_val);
609  	if (ret) {
610  		mm_pkey_free(current->mm, pkey);
611  		goto out;
612  	}
613  	ret = pkey;
614  out:
615  	up_write(&current->mm->mmap_sem);
616  	return ret;
617  }
618  
619  SYSCALL_DEFINE1(pkey_free, int, pkey)
620  {
621  	int ret;
622  
623  	down_write(&current->mm->mmap_sem);
624  	ret = mm_pkey_free(current->mm, pkey);
625  	up_write(&current->mm->mmap_sem);
626  
627  	/*
628  	 * We could provide warnings or errors if any VMA still
629  	 * has the pkey set here.
630  	 */
631  	return ret;
632  }
633  
634  #endif /* CONFIG_ARCH_HAS_PKEYS */
635