xref: /openbmc/linux/mm/userfaultfd.c (revision 17bfcd6a)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  mm/userfaultfd.c
4   *
5   *  Copyright (C) 2015  Red Hat, Inc.
6   */
7  
8  #include <linux/mm.h>
9  #include <linux/sched/signal.h>
10  #include <linux/pagemap.h>
11  #include <linux/rmap.h>
12  #include <linux/swap.h>
13  #include <linux/swapops.h>
14  #include <linux/userfaultfd_k.h>
15  #include <linux/mmu_notifier.h>
16  #include <linux/hugetlb.h>
17  #include <linux/shmem_fs.h>
18  #include <asm/tlbflush.h>
19  #include <asm/tlb.h>
20  #include "internal.h"
21  
22  static __always_inline
23  struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
24  				    unsigned long dst_start,
25  				    unsigned long len)
26  {
27  	/*
28  	 * Make sure that the dst range is both valid and fully within a
29  	 * single existing vma.
30  	 */
31  	struct vm_area_struct *dst_vma;
32  
33  	dst_vma = find_vma(dst_mm, dst_start);
34  	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
35  		return NULL;
36  
37  	/*
38  	 * Check that the vma is registered in uffd; this is required to
39  	 * enforce the VM_MAYWRITE check done at uffd registration
40  	 * time.
41  	 */
42  	if (!dst_vma->vm_userfaultfd_ctx.ctx)
43  		return NULL;
44  
45  	return dst_vma;
46  }
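
/*
 * For example, if dst_start sits in a 16KB vma but dst_start + len runs
 * one page past vm_end, the lookup above fails even when an adjacent vma
 * covers the remainder: the whole destination range must lie inside one
 * vma, and that vma must have a userfaultfd context attached.
 */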
47  
48  /* Check if dst_addr is outside of the file's size. Must be called with ptl held. */
49  static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
50  				 unsigned long dst_addr)
51  {
52  	struct inode *inode;
53  	pgoff_t offset, max_off;
54  
55  	if (!dst_vma->vm_file)
56  		return false;
57  
58  	inode = dst_vma->vm_file->f_inode;
59  	offset = linear_page_index(dst_vma, dst_addr);
60  	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
61  	return offset >= max_off;
62  }
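
/*
 * Worked example: with PAGE_SIZE == 4096 and i_size_read(inode) == 5000,
 * max_off = DIV_ROUND_UP(5000, 4096) = 2, so only page offsets 0 and 1
 * are inside the file; a dst_addr whose linear_page_index() is 2 or more
 * makes the callers below back out with -EFAULT.
 */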
63  
64  /*
65   * Install PTEs to map dst_addr (within dst_vma) to page.
66   *
67   * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
68   * and anon, and for both shared and private VMAs.
69   */
70  int mfill_atomic_install_pte(pmd_t *dst_pmd,
71  			     struct vm_area_struct *dst_vma,
72  			     unsigned long dst_addr, struct page *page,
73  			     bool newly_allocated, uffd_flags_t flags)
74  {
75  	int ret;
76  	struct mm_struct *dst_mm = dst_vma->vm_mm;
77  	pte_t _dst_pte, *dst_pte;
78  	bool writable = dst_vma->vm_flags & VM_WRITE;
79  	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
80  	bool page_in_cache = page_mapping(page);
81  	spinlock_t *ptl;
82  	struct folio *folio;
83  
84  	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
85  	_dst_pte = pte_mkdirty(_dst_pte);
86  	if (page_in_cache && !vm_shared)
87  		writable = false;
88  	if (writable)
89  		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
90  	if (flags & MFILL_ATOMIC_WP)
91  		_dst_pte = pte_mkuffd_wp(_dst_pte);
92  
93  	ret = -EAGAIN;
94  	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
95  	if (!dst_pte)
96  		goto out;
97  
98  	if (mfill_file_over_size(dst_vma, dst_addr)) {
99  		ret = -EFAULT;
100  		goto out_unlock;
101  	}
102  
103  	ret = -EEXIST;
104  	/*
105  	 * We allow overwriting a pte marker: consider a range registered with
106  	 * both MISSING|WP, where we first wr-protect a none pte which has no
107  	 * page cache page backing it, and then access the page.
108  	 */
109  	if (!pte_none_mostly(ptep_get(dst_pte)))
110  		goto out_unlock;
111  
112  	folio = page_folio(page);
113  	if (page_in_cache) {
114  		/* Usually, cache pages are already added to LRU */
115  		if (newly_allocated)
116  			folio_add_lru(folio);
117  		page_add_file_rmap(page, dst_vma, false);
118  	} else {
119  		page_add_new_anon_rmap(page, dst_vma, dst_addr);
120  		folio_add_lru_vma(folio, dst_vma);
121  	}
122  
123  	/*
124  	 * Must happen after rmap, as mm_counter() checks mapping (via
125  	 * PageAnon()), which is set by __page_set_anon_rmap().
126  	 */
127  	inc_mm_counter(dst_mm, mm_counter(page));
128  
129  	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
130  
131  	/* No need to invalidate - it was non-present before */
132  	update_mmu_cache(dst_vma, dst_addr, dst_pte);
133  	ret = 0;
134  out_unlock:
135  	pte_unmap_unlock(dst_pte, ptl);
136  out:
137  	return ret;
138  }
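
/*
 * Write permission as installed by mfill_atomic_install_pte(), derived
 * from the checks above (the dirty bit is always set, and the uffd-wp
 * bit is added on top when MFILL_ATOMIC_WP is requested):
 *
 *	VM_WRITE  VM_SHARED  page in cache   pte writable
 *	   no         -           -              no
 *	   yes       yes          -              yes
 *	   yes        no          no (anon)      yes
 *	   yes        no          yes            no (a later write COWs)
 */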
139  
140  static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
141  				 struct vm_area_struct *dst_vma,
142  				 unsigned long dst_addr,
143  				 unsigned long src_addr,
144  				 uffd_flags_t flags,
145  				 struct folio **foliop)
146  {
147  	void *kaddr;
148  	int ret;
149  	struct folio *folio;
150  
151  	if (!*foliop) {
152  		ret = -ENOMEM;
153  		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
154  					dst_addr, false);
155  		if (!folio)
156  			goto out;
157  
158  		kaddr = kmap_local_folio(folio, 0);
159  		/*
160  		 * The read mmap_lock is held here.  Despite the
161  		 * mmap_lock being read-recursive, a deadlock is still
162  		 * possible if a writer has taken a lock.  For example:
163  		 *
164  		 * process A thread 1 takes read lock on own mmap_lock
165  		 * process A thread 2 calls mmap, blocks taking write lock
166  		 * process B thread 1 takes page fault, read lock on own mmap lock
167  		 * process B thread 2 calls mmap, blocks taking write lock
168  		 * process A thread 1 blocks taking read lock on process B
169  		 * process B thread 1 blocks taking read lock on process A
170  		 *
171  		 * Disable page faults to prevent potential deadlock
172  		 * and retry the copy outside the mmap_lock.
173  		 */
174  		pagefault_disable();
175  		ret = copy_from_user(kaddr, (const void __user *) src_addr,
176  				     PAGE_SIZE);
177  		pagefault_enable();
178  		kunmap_local(kaddr);
179  
180  		/* Fall back to copy_from_user() outside mmap_lock */
181  		if (unlikely(ret)) {
182  			ret = -ENOENT;
183  			*foliop = folio;
184  			/* don't free the page */
185  			goto out;
186  		}
187  
188  		flush_dcache_folio(folio);
189  	} else {
190  		folio = *foliop;
191  		*foliop = NULL;
192  	}
193  
194  	/*
195  	 * The memory barrier inside __folio_mark_uptodate makes sure that
196  	 * preceding stores to the page contents become visible before
197  	 * the set_pte_at() write.
198  	 */
199  	__folio_mark_uptodate(folio);
200  
201  	ret = -ENOMEM;
202  	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
203  		goto out_release;
204  
205  	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
206  				       &folio->page, true, flags);
207  	if (ret)
208  		goto out_release;
209  out:
210  	return ret;
211  out_release:
212  	folio_put(folio);
213  	goto out;
214  }
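
/*
 * Note the retry contract with the caller: if copy_from_user() faults
 * with page faults disabled, the freshly allocated folio is handed back
 * through *foliop and -ENOENT is returned.  mfill_atomic() then drops
 * mmap_lock, repeats the copy with faults enabled and calls back in with
 * *foliop still set, so the "else" branch above picks the folio up.
 */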
215  
216  static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
217  				     struct vm_area_struct *dst_vma,
218  				     unsigned long dst_addr)
219  {
220  	pte_t _dst_pte, *dst_pte;
221  	spinlock_t *ptl;
222  	int ret;
223  
224  	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
225  					 dst_vma->vm_page_prot));
226  	ret = -EAGAIN;
227  	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
228  	if (!dst_pte)
229  		goto out;
230  	if (mfill_file_over_size(dst_vma, dst_addr)) {
231  		ret = -EFAULT;
232  		goto out_unlock;
233  	}
234  	ret = -EEXIST;
235  	if (!pte_none(ptep_get(dst_pte)))
236  		goto out_unlock;
237  	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
238  	/* No need to invalidate - it was non-present before */
239  	update_mmu_cache(dst_vma, dst_addr, dst_pte);
240  	ret = 0;
241  out_unlock:
242  	pte_unmap_unlock(dst_pte, ptl);
243  out:
244  	return ret;
245  }
246  
247  /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
248  static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
249  				     struct vm_area_struct *dst_vma,
250  				     unsigned long dst_addr,
251  				     uffd_flags_t flags)
252  {
253  	struct inode *inode = file_inode(dst_vma->vm_file);
254  	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
255  	struct folio *folio;
256  	struct page *page;
257  	int ret;
258  
259  	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
260  	/* Our caller expects us to return -EFAULT if we failed to find folio */
261  	if (ret == -ENOENT)
262  		ret = -EFAULT;
263  	if (ret)
264  		goto out;
265  	if (!folio) {
266  		ret = -EFAULT;
267  		goto out;
268  	}
269  
270  	page = folio_file_page(folio, pgoff);
271  	if (PageHWPoison(page)) {
272  		ret = -EIO;
273  		goto out_release;
274  	}
275  
276  	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
277  				       page, false, flags);
278  	if (ret)
279  		goto out_release;
280  
281  	folio_unlock(folio);
282  	ret = 0;
283  out:
284  	return ret;
285  out_release:
286  	folio_unlock(folio);
287  	folio_put(folio);
288  	goto out;
289  }
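
/*
 * Illustrative userspace sketch (not part of this file) of the
 * UFFDIO_CONTINUE request served above.  Here uffd, addr and page_size
 * are placeholders for a userfaultfd registered over the shmem range
 * with UFFDIO_REGISTER_MODE_MINOR, a page-aligned address in that range
 * whose page cache has already been populated, and the system page size:
 *
 *	#include <sys/ioctl.h>
 *	#include <err.h>
 *	#include <linux/userfaultfd.h>
 *
 *	struct uffdio_continue cont = {
 *		.range = { .start = (unsigned long)addr, .len = page_size },
 *		.mode  = 0,
 *	};
 *	if (ioctl(uffd, UFFDIO_CONTINUE, &cont) == -1)
 *		err(1, "UFFDIO_CONTINUE");
 */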
290  
291  /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
292  static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
293  				   struct vm_area_struct *dst_vma,
294  				   unsigned long dst_addr,
295  				   uffd_flags_t flags)
296  {
297  	int ret;
298  	struct mm_struct *dst_mm = dst_vma->vm_mm;
299  	pte_t _dst_pte, *dst_pte;
300  	spinlock_t *ptl;
301  
302  	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
303  	ret = -EAGAIN;
304  	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
305  	if (!dst_pte)
306  		goto out;
307  
308  	if (mfill_file_over_size(dst_vma, dst_addr)) {
309  		ret = -EFAULT;
310  		goto out_unlock;
311  	}
312  
313  	ret = -EEXIST;
314  	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
315  	if (!pte_none(*dst_pte))
316  		goto out_unlock;
317  
318  	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
319  
320  	/* No need to invalidate - it was non-present before */
321  	update_mmu_cache(dst_vma, dst_addr, dst_pte);
322  	ret = 0;
323  out_unlock:
324  	pte_unmap_unlock(dst_pte, ptl);
325  out:
326  	return ret;
327  }
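
/*
 * Illustrative userspace sketch of the UFFDIO_POISON request served
 * above (a sketch only: field names follow the uffdio_poison UAPI, and
 * uffd, addr and page_size are placeholders for the userfaultfd, a
 * page-aligned address in the registered range and the page size):
 *
 *	#include <sys/ioctl.h>
 *	#include <err.h>
 *	#include <linux/userfaultfd.h>
 *
 *	struct uffdio_poison poison = {
 *		.range = { .start = (unsigned long)addr, .len = page_size },
 *		.mode  = 0,
 *	};
 *	if (ioctl(uffd, UFFDIO_POISON, &poison) == -1)
 *		err(1, "UFFDIO_POISON");
 *
 * Later accesses to the poisoned range then raise SIGBUS rather than
 * faulting back into the userfaultfd.
 */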
328  
329  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
330  {
331  	pgd_t *pgd;
332  	p4d_t *p4d;
333  	pud_t *pud;
334  
335  	pgd = pgd_offset(mm, address);
336  	p4d = p4d_alloc(mm, pgd, address);
337  	if (!p4d)
338  		return NULL;
339  	pud = pud_alloc(mm, p4d, address);
340  	if (!pud)
341  		return NULL;
342  	/*
343  	 * Note that we don't run this only because the pmd was
344  	 * missing; the *pmd may already be established and, in
345  	 * turn, it may also be a trans_huge_pmd.
346  	 */
347  	return pmd_alloc(mm, pud, address);
348  }
349  
350  #ifdef CONFIG_HUGETLB_PAGE
351  /*
352   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
353   * called with mmap_lock held; it will release mmap_lock before returning.
354   */
355  static __always_inline ssize_t mfill_atomic_hugetlb(
356  					      struct vm_area_struct *dst_vma,
357  					      unsigned long dst_start,
358  					      unsigned long src_start,
359  					      unsigned long len,
360  					      uffd_flags_t flags)
361  {
362  	struct mm_struct *dst_mm = dst_vma->vm_mm;
363  	int vm_shared = dst_vma->vm_flags & VM_SHARED;
364  	ssize_t err;
365  	pte_t *dst_pte;
366  	unsigned long src_addr, dst_addr;
367  	long copied;
368  	struct folio *folio;
369  	unsigned long vma_hpagesize;
370  	pgoff_t idx;
371  	u32 hash;
372  	struct address_space *mapping;
373  
374  	/*
375  	 * There is no default zero huge page for all huge page sizes as
376  	 * supported by hugetlb.  A PMD_SIZE zero huge page may exist, as used
377  	 * by THP.  Since we cannot reliably insert a zero page, this
378  	 * feature is not supported.
379  	 */
380  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
381  		mmap_read_unlock(dst_mm);
382  		return -EINVAL;
383  	}
384  
385  	src_addr = src_start;
386  	dst_addr = dst_start;
387  	copied = 0;
388  	folio = NULL;
389  	vma_hpagesize = vma_kernel_pagesize(dst_vma);
390  
391  	/*
392  	 * Validate alignment based on huge page size
393  	 */
394  	err = -EINVAL;
395  	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
396  		goto out_unlock;
397  
398  retry:
399  	/*
400  	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
401  	 * retry, dst_vma will be set to NULL and we must lookup again.
402  	 * retry, dst_vma will be set to NULL and we must look it up again.
403  	if (!dst_vma) {
404  		err = -ENOENT;
405  		dst_vma = find_dst_vma(dst_mm, dst_start, len);
406  		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
407  			goto out_unlock;
408  
409  		err = -EINVAL;
410  		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
411  			goto out_unlock;
412  
413  		vm_shared = dst_vma->vm_flags & VM_SHARED;
414  	}
415  
416  	/*
417  	 * If not shared, ensure the dst_vma has an anon_vma.
418  	 */
419  	err = -ENOMEM;
420  	if (!vm_shared) {
421  		if (unlikely(anon_vma_prepare(dst_vma)))
422  			goto out_unlock;
423  	}
424  
425  	while (src_addr < src_start + len) {
426  		BUG_ON(dst_addr >= dst_start + len);
427  
428  		/*
429  		 * Serialize via vma_lock and hugetlb_fault_mutex.
430  		 * vma_lock ensures the dst_pte remains valid even
431  		 * in the case of shared pmds.  The fault mutex prevents
432  		 * races with other faulting threads.
433  		 */
434  		idx = linear_page_index(dst_vma, dst_addr);
435  		mapping = dst_vma->vm_file->f_mapping;
436  		hash = hugetlb_fault_mutex_hash(mapping, idx);
437  		mutex_lock(&hugetlb_fault_mutex_table[hash]);
438  		hugetlb_vma_lock_read(dst_vma);
439  
440  		err = -ENOMEM;
441  		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
442  		if (!dst_pte) {
443  			hugetlb_vma_unlock_read(dst_vma);
444  			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
445  			goto out_unlock;
446  		}
447  
448  		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
449  		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
450  			err = -EEXIST;
451  			hugetlb_vma_unlock_read(dst_vma);
452  			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
453  			goto out_unlock;
454  		}
455  
456  		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
457  					       src_addr, flags, &folio);
458  
459  		hugetlb_vma_unlock_read(dst_vma);
460  		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
461  
462  		cond_resched();
463  
464  		if (unlikely(err == -ENOENT)) {
465  			mmap_read_unlock(dst_mm);
466  			BUG_ON(!folio);
467  
468  			err = copy_folio_from_user(folio,
469  						   (const void __user *)src_addr, true);
470  			if (unlikely(err)) {
471  				err = -EFAULT;
472  				goto out;
473  			}
474  			mmap_read_lock(dst_mm);
475  
476  			dst_vma = NULL;
477  			goto retry;
478  		} else
479  			BUG_ON(folio);
480  
481  		if (!err) {
482  			dst_addr += vma_hpagesize;
483  			src_addr += vma_hpagesize;
484  			copied += vma_hpagesize;
485  
486  			if (fatal_signal_pending(current))
487  				err = -EINTR;
488  		}
489  		if (err)
490  			break;
491  	}
492  
493  out_unlock:
494  	mmap_read_unlock(dst_mm);
495  out:
496  	if (folio)
497  		folio_put(folio);
498  	BUG_ON(copied < 0);
499  	BUG_ON(err > 0);
500  	BUG_ON(!copied && !err);
501  	return copied ? copied : err;
502  }
503  #else /* !CONFIG_HUGETLB_PAGE */
504  /* fail at build time if gcc attempts to use this */
505  extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
506  				    unsigned long dst_start,
507  				    unsigned long src_start,
508  				    unsigned long len,
509  				    uffd_flags_t flags);
510  #endif /* CONFIG_HUGETLB_PAGE */
511  
512  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
513  						struct vm_area_struct *dst_vma,
514  						unsigned long dst_addr,
515  						unsigned long src_addr,
516  						uffd_flags_t flags,
517  						struct folio **foliop)
518  {
519  	ssize_t err;
520  
521  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
522  		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
523  						 dst_addr, flags);
524  	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
525  		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
526  					       dst_addr, flags);
527  	}
528  
529  	/*
530  	 * The normal page fault path for a shmem mapping will invoke the
531  	 * fault, fill the hole in the file and COW it right away. The
532  	 * result generates plain anonymous memory. So when we are
533  	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
534  	 * generate anonymous memory directly without actually filling
535  	 * the hole. For the MAP_PRIVATE case the robustness check
536  	 * only happens in the pagetable (to verify it's still none)
537  	 * and not in the radix tree.
538  	 */
539  	if (!(dst_vma->vm_flags & VM_SHARED)) {
540  		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
541  			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
542  						    dst_addr, src_addr,
543  						    flags, foliop);
544  		else
545  			err = mfill_atomic_pte_zeropage(dst_pmd,
546  						 dst_vma, dst_addr);
547  	} else {
548  		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
549  					     dst_addr, src_addr,
550  					     flags, foliop);
551  	}
552  
553  	return err;
554  }
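
/*
 * Dispatch summary for mfill_atomic_pte(), as implemented above:
 *
 *	MFILL_ATOMIC_CONTINUE                -> mfill_atomic_pte_continue()
 *	MFILL_ATOMIC_POISON                  -> mfill_atomic_pte_poison()
 *	MFILL_ATOMIC_COPY,     !VM_SHARED    -> mfill_atomic_pte_copy()
 *	MFILL_ATOMIC_ZEROPAGE, !VM_SHARED    -> mfill_atomic_pte_zeropage()
 *	COPY or ZEROPAGE on VM_SHARED shmem  -> shmem_mfill_atomic_pte()
 */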
555  
556  static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
557  					    unsigned long dst_start,
558  					    unsigned long src_start,
559  					    unsigned long len,
560  					    atomic_t *mmap_changing,
561  					    uffd_flags_t flags)
562  {
563  	struct vm_area_struct *dst_vma;
564  	ssize_t err;
565  	pmd_t *dst_pmd;
566  	unsigned long src_addr, dst_addr;
567  	long copied;
568  	struct folio *folio;
569  
570  	/*
571  	 * Sanitize the command parameters:
572  	 */
573  	BUG_ON(dst_start & ~PAGE_MASK);
574  	BUG_ON(len & ~PAGE_MASK);
575  
576  	/* Does the address range wrap, or is the span zero-sized? */
577  	BUG_ON(src_start + len <= src_start);
578  	BUG_ON(dst_start + len <= dst_start);
579  
580  	src_addr = src_start;
581  	dst_addr = dst_start;
582  	copied = 0;
583  	folio = NULL;
584  retry:
585  	mmap_read_lock(dst_mm);
586  
587  	/*
588  	 * If memory mappings are changing because of a non-cooperative
589  	 * operation (e.g. mremap) running in parallel, bail out and
590  	 * request the user to retry later.
591  	 */
592  	err = -EAGAIN;
593  	if (mmap_changing && atomic_read(mmap_changing))
594  		goto out_unlock;
595  
596  	/*
597  	 * Make sure that the dst range is both valid and fully within a
598  	 * single existing vma registered with userfaultfd.
599  	 */
600  	err = -ENOENT;
601  	dst_vma = find_dst_vma(dst_mm, dst_start, len);
602  	if (!dst_vma)
603  		goto out_unlock;
604  
605  	err = -EINVAL;
606  	/*
607  	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
608  	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
609  	 */
610  	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
611  	    dst_vma->vm_flags & VM_SHARED))
612  		goto out_unlock;
613  
614  	/*
615  	 * validate 'mode' now that we know the dst_vma: don't allow
616  	 * a wrprotect copy if the userfaultfd didn't register as WP.
617  	 */
618  	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
619  		goto out_unlock;
620  
621  	/*
622  	 * If this is a HUGETLB vma, pass off to the appropriate routine.
623  	 */
624  	if (is_vm_hugetlb_page(dst_vma))
625  		return  mfill_atomic_hugetlb(dst_vma, dst_start,
626  					     src_start, len, flags);
627  
628  	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
629  		goto out_unlock;
630  	if (!vma_is_shmem(dst_vma) &&
631  	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
632  		goto out_unlock;
633  
634  	/*
635  	 * Ensure the dst_vma has an anon_vma or this page
636  	 * would get a NULL anon_vma when moved into the
637  	 * dst_vma.
638  	 */
639  	err = -ENOMEM;
640  	if (!(dst_vma->vm_flags & VM_SHARED) &&
641  	    unlikely(anon_vma_prepare(dst_vma)))
642  		goto out_unlock;
643  
644  	while (src_addr < src_start + len) {
645  		pmd_t dst_pmdval;
646  
647  		BUG_ON(dst_addr >= dst_start + len);
648  
649  		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
650  		if (unlikely(!dst_pmd)) {
651  			err = -ENOMEM;
652  			break;
653  		}
654  
655  		dst_pmdval = pmdp_get_lockless(dst_pmd);
656  		/*
657  		 * If the dst_pmd is mapped as a THP, don't
658  		 * override it and just be strict.
659  		 */
660  		if (unlikely(pmd_trans_huge(dst_pmdval))) {
661  			err = -EEXIST;
662  			break;
663  		}
664  		if (unlikely(pmd_none(dst_pmdval)) &&
665  		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
666  			err = -ENOMEM;
667  			break;
668  		}
669  		/* If a huge pmd materialized from under us, fail */
670  		if (unlikely(pmd_trans_huge(*dst_pmd))) {
671  			err = -EFAULT;
672  			break;
673  		}
674  
675  		BUG_ON(pmd_none(*dst_pmd));
676  		BUG_ON(pmd_trans_huge(*dst_pmd));
677  
678  		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
679  				       src_addr, flags, &folio);
680  		cond_resched();
681  
682  		if (unlikely(err == -ENOENT)) {
683  			void *kaddr;
684  
685  			mmap_read_unlock(dst_mm);
686  			BUG_ON(!folio);
687  
688  			kaddr = kmap_local_folio(folio, 0);
689  			err = copy_from_user(kaddr,
690  					     (const void __user *) src_addr,
691  					     PAGE_SIZE);
692  			kunmap_local(kaddr);
693  			if (unlikely(err)) {
694  				err = -EFAULT;
695  				goto out;
696  			}
697  			flush_dcache_folio(folio);
698  			goto retry;
699  		} else
700  			BUG_ON(folio);
701  
702  		if (!err) {
703  			dst_addr += PAGE_SIZE;
704  			src_addr += PAGE_SIZE;
705  			copied += PAGE_SIZE;
706  
707  			if (fatal_signal_pending(current))
708  				err = -EINTR;
709  		}
710  		if (err)
711  			break;
712  	}
713  
714  out_unlock:
715  	mmap_read_unlock(dst_mm);
716  out:
717  	if (folio)
718  		folio_put(folio);
719  	BUG_ON(copied < 0);
720  	BUG_ON(err > 0);
721  	BUG_ON(!copied && !err);
722  	return copied ? copied : err;
723  }
724  
725  ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
726  			  unsigned long src_start, unsigned long len,
727  			  atomic_t *mmap_changing, uffd_flags_t flags)
728  {
729  	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
730  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
731  }
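
/*
 * Minimal userspace sketch (illustrative only) of the UFFDIO_COPY
 * request that ends up in mfill_atomic_copy().  uffd, dst, src and
 * page_size are placeholders for a userfaultfd already registered over
 * the destination range, a page-aligned destination address, a source
 * buffer and the system page size:
 *
 *	#include <sys/ioctl.h>
 *	#include <err.h>
 *	#include <linux/userfaultfd.h>
 *
 *	struct uffdio_copy copy = {
 *		.dst  = (unsigned long)dst,
 *		.src  = (unsigned long)src,
 *		.len  = page_size,
 *		.mode = 0,
 *	};
 *	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
 *		err(1, "UFFDIO_COPY");
 *
 * Passing UFFDIO_COPY_MODE_WP in .mode corresponds to the MFILL_ATOMIC_WP
 * flag checked in mfill_atomic(), and on success copy.copy reports the
 * number of bytes copied.
 */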
732  
733  ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
734  			      unsigned long len, atomic_t *mmap_changing)
735  {
736  	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
737  			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
738  }
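
/*
 * The corresponding userspace request for mfill_atomic_zeropage() (again
 * a sketch; headers and placeholders as in the UFFDIO_COPY example above):
 *
 *	struct uffdio_zeropage zero = {
 *		.range = { .start = (unsigned long)dst, .len = page_size },
 *		.mode  = 0,
 *	};
 *	if (ioctl(uffd, UFFDIO_ZEROPAGE, &zero) == -1)
 *		err(1, "UFFDIO_ZEROPAGE");
 *
 * Note that this path is rejected with -EINVAL on hugetlb vmas, as
 * explained in mfill_atomic_hugetlb().
 */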
739  
740  ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
741  			      unsigned long len, atomic_t *mmap_changing,
742  			      uffd_flags_t flags)
743  {
744  	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
745  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
746  }
747  
748  ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
749  			    unsigned long len, atomic_t *mmap_changing,
750  			    uffd_flags_t flags)
751  {
752  	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
753  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
754  }
755  
756  long uffd_wp_range(struct vm_area_struct *dst_vma,
757  		   unsigned long start, unsigned long len, bool enable_wp)
758  {
759  	unsigned int mm_cp_flags;
760  	struct mmu_gather tlb;
761  	long ret;
762  
763  	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
764  			"The address range exceeds VMA boundary.\n");
765  	if (enable_wp)
766  		mm_cp_flags = MM_CP_UFFD_WP;
767  	else
768  		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
769  
770  	/*
771  	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
772  	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
773  	 * to be write-protected by default whenever protection changes.
774  	 * Try upgrading write permissions manually.
775  	 */
776  	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
777  		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
778  	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
779  	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
780  	tlb_finish_mmu(&tlb);
781  
782  	return ret;
783  }
784  
785  int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
786  			unsigned long len, bool enable_wp,
787  			atomic_t *mmap_changing)
788  {
789  	unsigned long end = start + len;
790  	unsigned long _start, _end;
791  	struct vm_area_struct *dst_vma;
792  	unsigned long page_mask;
793  	long err;
794  	VMA_ITERATOR(vmi, dst_mm, start);
795  
796  	/*
797  	 * Sanitize the command parameters:
798  	 */
799  	BUG_ON(start & ~PAGE_MASK);
800  	BUG_ON(len & ~PAGE_MASK);
801  
802  	/* Does the address range wrap, or is the span zero-sized? */
803  	BUG_ON(start + len <= start);
804  
805  	mmap_read_lock(dst_mm);
806  
807  	/*
808  	 * If memory mappings are changing because of a non-cooperative
809  	 * operation (e.g. mremap) running in parallel, bail out and
810  	 * request the user to retry later.
811  	 */
812  	err = -EAGAIN;
813  	if (mmap_changing && atomic_read(mmap_changing))
814  		goto out_unlock;
815  
816  	err = -ENOENT;
817  	for_each_vma_range(vmi, dst_vma, end) {
818  
819  		if (!userfaultfd_wp(dst_vma)) {
820  			err = -ENOENT;
821  			break;
822  		}
823  
824  		if (is_vm_hugetlb_page(dst_vma)) {
825  			err = -EINVAL;
826  			page_mask = vma_kernel_pagesize(dst_vma) - 1;
827  			if ((start & page_mask) || (len & page_mask))
828  				break;
829  		}
830  
831  		_start = max(dst_vma->vm_start, start);
832  		_end = min(dst_vma->vm_end, end);
833  
834  		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
835  
836  		/* Return 0 on success, <0 on failure */
837  		if (err < 0)
838  			break;
839  		err = 0;
840  	}
841  out_unlock:
842  	mmap_read_unlock(dst_mm);
843  	return err;
844  }
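
/*
 * Userspace reaches mwriteprotect_range() through UFFDIO_WRITEPROTECT.
 * A minimal sketch (illustrative only), where uffd is assumed to be
 * registered with UFFDIO_REGISTER_MODE_WP over [addr, addr + len):
 *
 *	#include <sys/ioctl.h>
 *	#include <err.h>
 *	#include <linux/userfaultfd.h>
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = (unsigned long)addr, .len = len },
 *		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1)
 *		err(1, "UFFDIO_WRITEPROTECT");
 *
 * Clearing UFFDIO_WRITEPROTECT_MODE_WP from .mode resolves the write
 * protection again, which is the enable_wp == false path above.
 */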
845