// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"

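/*
 * Resolve the single VMA covering [dst_start, dst_start + len) in @dst_mm,
 * or return NULL if the range is not mapped, spans more than one VMA, or
 * the VMA is not registered with a userfaultfd context.
 */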
static __always_inline
struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
				    unsigned long dst_start,
				    unsigned long len)
{
	/*
	 * Make sure that the dst range is both valid and fully within a
	 * single existing vma.
	 */
	struct vm_area_struct *dst_vma;

	dst_vma = find_vma(dst_mm, dst_start);
	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
		return NULL;

	/*
	 * Check that the vma is registered in uffd; this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return NULL;

	return dst_vma;
}

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	bool page_in_cache = page_mapping(page);
	spinlock_t *ptl;
	struct folio *folio;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/*
	 * We allow overwriting a pte marker: consider the case where both
	 * MISSING|WP are registered; we first wr-protect a none pte which
	 * has no page cache page backing it, then access the page.
	 */
	if (!pte_none_mostly(ptep_get(dst_pte)))
		goto out_unlock;

	folio = page_folio(page);
	if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
		if (newly_allocated)
			folio_add_lru(folio);
		page_add_file_rmap(page, dst_vma, false);
	} else {
		page_add_new_anon_rmap(page, dst_vma, dst_addr);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(page));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

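/*
 * Handle a UFFDIO_COPY-style fill of a single PTE in a private/anonymous
 * mapping: allocate a folio, copy the source page from userspace with page
 * faults disabled, and install it.  If the copy would fault, return -ENOENT
 * with *foliop set, so the caller can redo the copy outside the mmap_lock
 * and retry with the pre-filled folio.
 */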
static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
				 struct vm_area_struct *dst_vma,
				 unsigned long dst_addr,
				 unsigned long src_addr,
				 uffd_flags_t flags,
				 struct folio **foliop)
{
	void *kaddr;
	int ret;
	struct folio *folio;

	if (!*foliop) {
		ret = -ENOMEM;
		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
					dst_addr, false);
		if (!folio)
			goto out;

		kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here.  Despite the
		 * mmap_lock being read-recursive, a deadlock is still
		 * possible if a writer has taken a lock.  For example:
		 *
		 * process A thread 1 takes read lock on own mmap_lock
		 * process A thread 2 calls mmap, blocks taking write lock
		 * process B thread 1 takes page fault, read lock on own mmap lock
		 * process B thread 2 calls mmap, blocks taking write lock
		 * process A thread 1 blocks taking read lock on process B
		 * process B thread 1 blocks taking read lock on process A
		 *
		 * Disable page faults to prevent potential deadlock
		 * and retry the copy outside the mmap_lock.
		 */
		pagefault_disable();
		ret = copy_from_user(kaddr, (const void __user *) src_addr,
				     PAGE_SIZE);
		pagefault_enable();
		kunmap_local(kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*foliop = folio;
			/* don't free the page */
			goto out;
		}

		flush_dcache_folio(folio);
	} else {
		folio = *foliop;
		*foliop = NULL;
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	ret = -ENOMEM;
	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	folio_put(folio);
	goto out;
}

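/*
 * Fallback for UFFDIO_ZEROPAGE when the architecture forbids use of the
 * shared zero page: allocate and map a freshly zeroed anonymous folio
 * instead.
 */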
static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
					 struct vm_area_struct *dst_vma,
					 unsigned long dst_addr)
{
	struct folio *folio;
	int ret = -ENOMEM;

	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
	if (!folio)
		return ret;

	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_put;

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * zeroing out the folio becomes visible before mapping the page
	 * using set_pte_at(). See do_anonymous_page().
	 */
	__folio_mark_uptodate(folio);

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, 0);
	if (ret)
		goto out_put;

	return 0;
out_put:
	folio_put(folio);
	return ret;
}

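/*
 * Handle UFFDIO_ZEROPAGE for a private/anonymous mapping by installing a
 * read-only special PTE pointing at the shared zero page (or by falling
 * back to a zeroed folio when the mm forbids the zero page).
 */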
static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	if (mm_forbids_zeropage(dst_vma->vm_mm))
		return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	struct page *page;
	int ret;

	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find the folio */
	if (ret == -ENOENT)
		ret = -EFAULT;
	if (ret)
		goto out;
	if (!folio) {
		ret = -EFAULT;
		goto out;
	}

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, false, flags);
	if (ret)
		goto out_release;

	folio_unlock(folio);
	ret = 0;
out:
	return ret;
out_release:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}

/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(*dst_pte))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

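/*
 * Walk, and if necessary allocate, the page-table levels down to the PMD
 * covering @address.  Returns NULL on allocation failure.
 */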
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't necessarily run this because the pmd was
	 * missing: the *pmd may already be established, and in turn it
	 * may also be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_lock held; it will release mmap_lock before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      atomic_t *mmap_changing,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist, as used
	 * by THP.  Since we cannot reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		mmap_read_unlock(dst_mm);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_dst_vma(dst_mm, dst_start, len);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		BUG_ON(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds.  The fault mutex
		 * prevents races with other faulting threads.
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
			err = -EEXIST;
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			mmap_read_unlock(dst_mm);
			BUG_ON(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			mmap_read_lock(dst_mm);
			/*
			 * If memory mappings are changing because of a
			 * non-cooperative operation (e.g. mremap) running in
			 * parallel, bail out and request the user to retry later.
			 */
			if (mmap_changing && atomic_read(mmap_changing)) {
				err = -EAGAIN;
				break;
			}

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    atomic_t *mmap_changing,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

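/*
 * Fill a single PTE for one of the mfill modes: CONTINUE and POISON are
 * handled directly, while COPY and ZEROPAGE are dispatched either to the
 * anonymous helpers for private mappings or to shmem_mfill_atomic_pte()
 * for shared (shmem) mappings.
 */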
static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						uffd_flags_t flags,
						struct folio **foliop)
{
	ssize_t err;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
						 dst_addr, flags);
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
					       dst_addr, flags);
	}

	/*
	 * The normal page fault path for a shmem mapping will invoke the
	 * fault, fill the hole in the file and COW it right away.  The
	 * result generates plain anonymous memory.  So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole.  For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
						    dst_addr, src_addr,
						    flags, foliop);
		else
			err = mfill_atomic_pte_zeropage(dst_pmd,
							dst_vma, dst_addr);
	} else {
		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     flags, foliop);
	}

	return err;
}

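/*
 * Common driver for the UFFDIO_COPY/ZEROPAGE/CONTINUE/POISON paths: take
 * the mmap_lock, validate the destination VMA, then fill one page at a
 * time.  Returns the number of bytes filled, or a negative error when
 * nothing could be filled.
 */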
static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    atomic_t *mmap_changing,
					    uffd_flags_t flags)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
retry:
	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of a non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later.
	 */
	err = -EAGAIN;
	if (mmap_changing && atomic_read(mmap_changing))
		goto out_unlock;

	/*
	 * Make sure that the dst range is both valid and fully within a
	 * single existing vma.
	 */
	err = -ENOENT;
	dst_vma = find_dst_vma(dst_mm, dst_start, len);
	if (!dst_vma)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * Validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to the appropriate routine.
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
					    len, mmap_changing, flags);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) &&
	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (!(dst_vma->vm_flags & VM_SHARED) &&
	    unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is THP don't override it and just be strict.
		 * (This includes the case where the PMD used to be THP and
		 * changed back to none after __pte_alloc().)
		 */
		if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
			     pmd_devmap(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_bad(dst_pmdval))) {
			err = -EFAULT;
			break;
		}
		/*
		 * For shmem mappings, khugepaged is allowed to remove page
		 * tables under us; pte_offset_map_lock() will deal with that.
		 */

		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
				       src_addr, flags, &folio);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *kaddr;

			mmap_read_unlock(dst_mm);
			BUG_ON(!folio);

			kaddr = kmap_local_folio(folio, 0);
			err = copy_from_user(kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap_local(kaddr);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			flush_dcache_folio(folio);
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

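/*
 * The non-static entry points below correspond to the UFFDIO_COPY,
 * UFFDIO_ZEROPAGE, UFFDIO_CONTINUE and UFFDIO_POISON ioctls: each one just
 * tags the request with its MFILL_ATOMIC_* mode and forwards it to
 * mfill_atomic().
 */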
ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
			  unsigned long src_start, unsigned long len,
			  atomic_t *mmap_changing, uffd_flags_t flags)
{
	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
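
/*
 * Rough sketch (not verbatim) of how the UFFDIO_COPY ioctl handler in
 * fs/userfaultfd.c is expected to reach mfill_atomic_copy():
 *
 *	uffd_flags_t flags = 0;
 *
 *	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
 *		flags |= MFILL_ATOMIC_WP;
 *	if (mmget_not_zero(ctx->mm)) {
 *		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst,
 *					uffdio_copy.src, uffdio_copy.len,
 *					&ctx->mmap_changing, flags);
 *		mmput(ctx->mm);
 *	}
 *
 * A positive return value is the number of bytes filled; userspace may
 * resubmit the remainder after a short count.
 */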

ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
			      unsigned long len, atomic_t *mmap_changing)
{
	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
			      unsigned long len, atomic_t *mmap_changing,
			      uffd_flags_t flags)
{
	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
			    unsigned long len, atomic_t *mmap_changing,
			    uffd_flags_t flags)
{
	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

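/*
 * Set or clear the uffd-wp protection bit on the PTEs covering
 * [start, start + len) within @dst_vma; the range must lie inside the VMA.
 * Returns a negative error code on failure.
 */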
long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
			"The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected by default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

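/*
 * Handle UFFDIO_WRITEPROTECT: walk every VMA intersecting
 * [start, start + len), require that each is registered with uffd-wp, and
 * apply uffd_wp_range() to the intersecting part.  Returns 0 on success or
 * a negative error code.
 */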
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
			unsigned long len, bool enable_wp,
			atomic_t *mmap_changing)
{
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of a non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later.
	 */
	err = -EAGAIN;
	if (mmap_changing && atomic_read(mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	mmap_read_unlock(dst_mm);
	return err;
}