xref: /openbmc/linux/mm/gup.c (revision fed8b7e3)
1 #include <linux/kernel.h>
2 #include <linux/errno.h>
3 #include <linux/err.h>
4 #include <linux/spinlock.h>
5 
6 #include <linux/mm.h>
7 #include <linux/memremap.h>
8 #include <linux/pagemap.h>
9 #include <linux/rmap.h>
10 #include <linux/swap.h>
11 #include <linux/swapops.h>
12 
13 #include <linux/sched/signal.h>
14 #include <linux/rwsem.h>
15 #include <linux/hugetlb.h>
16 
17 #include <asm/mmu_context.h>
18 #include <asm/pgtable.h>
19 #include <asm/tlbflush.h>
20 
21 #include "internal.h"
22 
23 struct follow_page_context {
24 	struct dev_pagemap *pgmap;
25 	unsigned int page_mask;
26 };
27 
28 static struct page *no_page_table(struct vm_area_struct *vma,
29 		unsigned int flags)
30 {
31 	/*
32 	 * When core dumping an enormous anonymous area that nobody
33 	 * has touched so far, we don't want to allocate unnecessary pages or
34 	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
35 	 * then get_dump_page() will return NULL to leave a hole in the dump.
36 	 * But we can only make this optimization where a hole would surely
37 	 * be zero-filled if handle_mm_fault() actually did handle it.
38 	 */
39 	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
40 		return ERR_PTR(-EFAULT);
41 	return NULL;
42 }
43 
44 static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
45 		pte_t *pte, unsigned int flags)
46 {
47 	/* No page to get reference */
48 	if (flags & FOLL_GET)
49 		return -EFAULT;
50 
51 	if (flags & FOLL_TOUCH) {
52 		pte_t entry = *pte;
53 
54 		if (flags & FOLL_WRITE)
55 			entry = pte_mkdirty(entry);
56 		entry = pte_mkyoung(entry);
57 
58 		if (!pte_same(*pte, entry)) {
59 			set_pte_at(vma->vm_mm, address, pte, entry);
60 			update_mmu_cache(vma, address, pte);
61 		}
62 	}
63 
64 	/* Proper page table entry exists, but no corresponding struct page */
65 	return -EEXIST;
66 }
67 
68 /*
69  * FOLL_FORCE can write to even unwritable pte's, but only
70  * after we've gone through a COW cycle and they are dirty.
71  */
72 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
73 {
74 	return pte_write(pte) ||
75 		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
76 }
77 
78 static struct page *follow_page_pte(struct vm_area_struct *vma,
79 		unsigned long address, pmd_t *pmd, unsigned int flags,
80 		struct dev_pagemap **pgmap)
81 {
82 	struct mm_struct *mm = vma->vm_mm;
83 	struct page *page;
84 	spinlock_t *ptl;
85 	pte_t *ptep, pte;
86 
87 retry:
88 	if (unlikely(pmd_bad(*pmd)))
89 		return no_page_table(vma, flags);
90 
91 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
92 	pte = *ptep;
93 	if (!pte_present(pte)) {
94 		swp_entry_t entry;
95 		/*
96 		 * KSM's break_ksm() relies upon recognizing a ksm page
97 		 * even while it is being migrated, so for that case we
98 		 * need migration_entry_wait().
99 		 */
100 		if (likely(!(flags & FOLL_MIGRATION)))
101 			goto no_page;
102 		if (pte_none(pte))
103 			goto no_page;
104 		entry = pte_to_swp_entry(pte);
105 		if (!is_migration_entry(entry))
106 			goto no_page;
107 		pte_unmap_unlock(ptep, ptl);
108 		migration_entry_wait(mm, pmd, address);
109 		goto retry;
110 	}
111 	if ((flags & FOLL_NUMA) && pte_protnone(pte))
112 		goto no_page;
113 	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
114 		pte_unmap_unlock(ptep, ptl);
115 		return NULL;
116 	}
117 
118 	page = vm_normal_page(vma, address, pte);
119 	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
120 		/*
121 		 * Only return device mapping pages in the FOLL_GET case since
122 		 * they are only valid while holding the pgmap reference.
123 		 */
124 		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
125 		if (*pgmap)
126 			page = pte_page(pte);
127 		else
128 			goto no_page;
129 	} else if (unlikely(!page)) {
130 		if (flags & FOLL_DUMP) {
131 			/* Avoid special (like zero) pages in core dumps */
132 			page = ERR_PTR(-EFAULT);
133 			goto out;
134 		}
135 
136 		if (is_zero_pfn(pte_pfn(pte))) {
137 			page = pte_page(pte);
138 		} else {
139 			int ret;
140 
141 			ret = follow_pfn_pte(vma, address, ptep, flags);
142 			page = ERR_PTR(ret);
143 			goto out;
144 		}
145 	}
146 
147 	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
148 		int ret;
149 		get_page(page);
150 		pte_unmap_unlock(ptep, ptl);
151 		lock_page(page);
152 		ret = split_huge_page(page);
153 		unlock_page(page);
154 		put_page(page);
155 		if (ret)
156 			return ERR_PTR(ret);
157 		goto retry;
158 	}
159 
160 	if (flags & FOLL_GET)
161 		get_page(page);
162 	if (flags & FOLL_TOUCH) {
163 		if ((flags & FOLL_WRITE) &&
164 		    !pte_dirty(pte) && !PageDirty(page))
165 			set_page_dirty(page);
166 		/*
167 		 * pte_mkyoung() would be more correct here, but atomic care
168 		 * is needed to avoid losing the dirty bit: it is easier to use
169 		 * mark_page_accessed().
170 		 */
171 		mark_page_accessed(page);
172 	}
173 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
174 		/* Do not mlock pte-mapped THP */
175 		if (PageTransCompound(page))
176 			goto out;
177 
178 		/*
179 		 * The preliminary mapping check is mainly to avoid the
180 		 * pointless overhead of lock_page on the ZERO_PAGE
181 		 * which might bounce very badly if there is contention.
182 		 *
183 		 * If the page is already locked, we don't need to
184 		 * handle it now - vmscan will handle it later if and
185 		 * when it attempts to reclaim the page.
186 		 */
187 		if (page->mapping && trylock_page(page)) {
188 			lru_add_drain();  /* push cached pages to LRU */
189 			/*
190 			 * Because we lock page here, and migration is
191 			 * blocked by the pte's page reference, and we
192 			 * know the page is still mapped, we don't even
193 			 * need to check for file-cache page truncation.
194 			 */
195 			mlock_vma_page(page);
196 			unlock_page(page);
197 		}
198 	}
199 out:
200 	pte_unmap_unlock(ptep, ptl);
201 	return page;
202 no_page:
203 	pte_unmap_unlock(ptep, ptl);
204 	if (!pte_none(pte))
205 		return NULL;
206 	return no_page_table(vma, flags);
207 }
208 
209 static struct page *follow_pmd_mask(struct vm_area_struct *vma,
210 				    unsigned long address, pud_t *pudp,
211 				    unsigned int flags,
212 				    struct follow_page_context *ctx)
213 {
214 	pmd_t *pmd, pmdval;
215 	spinlock_t *ptl;
216 	struct page *page;
217 	struct mm_struct *mm = vma->vm_mm;
218 
219 	pmd = pmd_offset(pudp, address);
220 	/*
221 	 * The READ_ONCE() will stabilize the pmdval in a register or
222 	 * on the stack so that it will stop changing under the code.
223 	 */
224 	pmdval = READ_ONCE(*pmd);
225 	if (pmd_none(pmdval))
226 		return no_page_table(vma, flags);
227 	if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
228 		page = follow_huge_pmd(mm, address, pmd, flags);
229 		if (page)
230 			return page;
231 		return no_page_table(vma, flags);
232 	}
233 	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
234 		page = follow_huge_pd(vma, address,
235 				      __hugepd(pmd_val(pmdval)), flags,
236 				      PMD_SHIFT);
237 		if (page)
238 			return page;
239 		return no_page_table(vma, flags);
240 	}
241 retry:
242 	if (!pmd_present(pmdval)) {
243 		if (likely(!(flags & FOLL_MIGRATION)))
244 			return no_page_table(vma, flags);
245 		VM_BUG_ON(thp_migration_supported() &&
246 				  !is_pmd_migration_entry(pmdval));
247 		if (is_pmd_migration_entry(pmdval))
248 			pmd_migration_entry_wait(mm, pmd);
249 		pmdval = READ_ONCE(*pmd);
250 		/*
251 		 * MADV_DONTNEED may convert the pmd to null because
252 		 * mmap_sem is held in read mode
253 		 */
254 		if (pmd_none(pmdval))
255 			return no_page_table(vma, flags);
256 		goto retry;
257 	}
258 	if (pmd_devmap(pmdval)) {
259 		ptl = pmd_lock(mm, pmd);
260 		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
261 		spin_unlock(ptl);
262 		if (page)
263 			return page;
264 	}
265 	if (likely(!pmd_trans_huge(pmdval)))
266 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
267 
268 	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
269 		return no_page_table(vma, flags);
270 
271 retry_locked:
272 	ptl = pmd_lock(mm, pmd);
273 	if (unlikely(pmd_none(*pmd))) {
274 		spin_unlock(ptl);
275 		return no_page_table(vma, flags);
276 	}
277 	if (unlikely(!pmd_present(*pmd))) {
278 		spin_unlock(ptl);
279 		if (likely(!(flags & FOLL_MIGRATION)))
280 			return no_page_table(vma, flags);
281 		pmd_migration_entry_wait(mm, pmd);
282 		goto retry_locked;
283 	}
284 	if (unlikely(!pmd_trans_huge(*pmd))) {
285 		spin_unlock(ptl);
286 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
287 	}
288 	if (flags & FOLL_SPLIT) {
289 		int ret;
290 		page = pmd_page(*pmd);
291 		if (is_huge_zero_page(page)) {
292 			spin_unlock(ptl);
293 			ret = 0;
294 			split_huge_pmd(vma, pmd, address);
295 			if (pmd_trans_unstable(pmd))
296 				ret = -EBUSY;
297 		} else {
298 			get_page(page);
299 			spin_unlock(ptl);
300 			lock_page(page);
301 			ret = split_huge_page(page);
302 			unlock_page(page);
303 			put_page(page);
304 			if (pmd_none(*pmd))
305 				return no_page_table(vma, flags);
306 		}
307 
308 		return ret ? ERR_PTR(ret) :
309 			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
310 	}
311 	page = follow_trans_huge_pmd(vma, address, pmd, flags);
312 	spin_unlock(ptl);
313 	ctx->page_mask = HPAGE_PMD_NR - 1;
314 	return page;
315 }
316 
317 static struct page *follow_pud_mask(struct vm_area_struct *vma,
318 				    unsigned long address, p4d_t *p4dp,
319 				    unsigned int flags,
320 				    struct follow_page_context *ctx)
321 {
322 	pud_t *pud;
323 	spinlock_t *ptl;
324 	struct page *page;
325 	struct mm_struct *mm = vma->vm_mm;
326 
327 	pud = pud_offset(p4dp, address);
328 	if (pud_none(*pud))
329 		return no_page_table(vma, flags);
330 	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
331 		page = follow_huge_pud(mm, address, pud, flags);
332 		if (page)
333 			return page;
334 		return no_page_table(vma, flags);
335 	}
336 	if (is_hugepd(__hugepd(pud_val(*pud)))) {
337 		page = follow_huge_pd(vma, address,
338 				      __hugepd(pud_val(*pud)), flags,
339 				      PUD_SHIFT);
340 		if (page)
341 			return page;
342 		return no_page_table(vma, flags);
343 	}
344 	if (pud_devmap(*pud)) {
345 		ptl = pud_lock(mm, pud);
346 		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
347 		spin_unlock(ptl);
348 		if (page)
349 			return page;
350 	}
351 	if (unlikely(pud_bad(*pud)))
352 		return no_page_table(vma, flags);
353 
354 	return follow_pmd_mask(vma, address, pud, flags, ctx);
355 }
356 
357 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
358 				    unsigned long address, pgd_t *pgdp,
359 				    unsigned int flags,
360 				    struct follow_page_context *ctx)
361 {
362 	p4d_t *p4d;
363 	struct page *page;
364 
365 	p4d = p4d_offset(pgdp, address);
366 	if (p4d_none(*p4d))
367 		return no_page_table(vma, flags);
368 	BUILD_BUG_ON(p4d_huge(*p4d));
369 	if (unlikely(p4d_bad(*p4d)))
370 		return no_page_table(vma, flags);
371 
372 	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
373 		page = follow_huge_pd(vma, address,
374 				      __hugepd(p4d_val(*p4d)), flags,
375 				      P4D_SHIFT);
376 		if (page)
377 			return page;
378 		return no_page_table(vma, flags);
379 	}
380 	return follow_pud_mask(vma, address, p4d, flags, ctx);
381 }
382 
383 /**
384  * follow_page_mask - look up a page descriptor from a user-virtual address
385  * @vma: vm_area_struct mapping @address
386  * @address: virtual address to look up
387  * @flags: flags modifying lookup behaviour
388  * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
389  *       pointer to output page_mask
390  *
391  * @flags can have FOLL_ flags set, defined in <linux/mm.h>
392  *
393  * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
394  * the device's dev_pagemap metadata to avoid repeating expensive lookups.
395  *
396  * On output, the @ctx->page_mask is set according to the size of the page.
397  *
398  * Return: the mapped (struct page *), %NULL if no mapping exists, or
399  * an error pointer if there is a mapping to something not represented
400  * by a page descriptor (see also vm_normal_page()).
401  */
402 struct page *follow_page_mask(struct vm_area_struct *vma,
403 			      unsigned long address, unsigned int flags,
404 			      struct follow_page_context *ctx)
405 {
406 	pgd_t *pgd;
407 	struct page *page;
408 	struct mm_struct *mm = vma->vm_mm;
409 
410 	ctx->page_mask = 0;
411 
412 	/* make this handle hugepd */
413 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
414 	if (!IS_ERR(page)) {
415 		BUG_ON(flags & FOLL_GET);
416 		return page;
417 	}
418 
419 	pgd = pgd_offset(mm, address);
420 
421 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
422 		return no_page_table(vma, flags);
423 
424 	if (pgd_huge(*pgd)) {
425 		page = follow_huge_pgd(mm, address, pgd, flags);
426 		if (page)
427 			return page;
428 		return no_page_table(vma, flags);
429 	}
430 	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
431 		page = follow_huge_pd(vma, address,
432 				      __hugepd(pgd_val(*pgd)), flags,
433 				      PGDIR_SHIFT);
434 		if (page)
435 			return page;
436 		return no_page_table(vma, flags);
437 	}
438 
439 	return follow_p4d_mask(vma, address, pgd, flags, ctx);
440 }
441 
442 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
443 			 unsigned int foll_flags)
444 {
445 	struct follow_page_context ctx = { NULL };
446 	struct page *page;
447 
448 	page = follow_page_mask(vma, address, foll_flags, &ctx);
449 	if (ctx.pgmap)
450 		put_dev_pagemap(ctx.pgmap);
451 	return page;
452 }
453 
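/*
 * A minimal, illustrative sketch of how a caller holding mmap_sem for read
 * might handle the three possible results of follow_page() when passing
 * FOLL_GET: an error pointer (the pte maps something with no struct page),
 * NULL (no mapping; the caller may need to fault it in first), or a page
 * whose extra reference must be dropped with put_page().  use_page() below
 * is a hypothetical consumer.
 *
 *	down_read(&mm->mmap_sem);
 *	page = follow_page(vma, address, FOLL_GET);
 *	if (IS_ERR(page))
 *		err = PTR_ERR(page);
 *	else if (!page)
 *		err = -EFAULT;
 *	else {
 *		use_page(page);
 *		put_page(page);
 *	}
 *	up_read(&mm->mmap_sem);
 */
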
454 static int get_gate_page(struct mm_struct *mm, unsigned long address,
455 		unsigned int gup_flags, struct vm_area_struct **vma,
456 		struct page **page)
457 {
458 	pgd_t *pgd;
459 	p4d_t *p4d;
460 	pud_t *pud;
461 	pmd_t *pmd;
462 	pte_t *pte;
463 	int ret = -EFAULT;
464 
465 	/* user gate pages are read-only */
466 	if (gup_flags & FOLL_WRITE)
467 		return -EFAULT;
468 	if (address > TASK_SIZE)
469 		pgd = pgd_offset_k(address);
470 	else
471 		pgd = pgd_offset_gate(mm, address);
472 	BUG_ON(pgd_none(*pgd));
473 	p4d = p4d_offset(pgd, address);
474 	BUG_ON(p4d_none(*p4d));
475 	pud = pud_offset(p4d, address);
476 	BUG_ON(pud_none(*pud));
477 	pmd = pmd_offset(pud, address);
478 	if (!pmd_present(*pmd))
479 		return -EFAULT;
480 	VM_BUG_ON(pmd_trans_huge(*pmd));
481 	pte = pte_offset_map(pmd, address);
482 	if (pte_none(*pte))
483 		goto unmap;
484 	*vma = get_gate_vma(mm);
485 	if (!page)
486 		goto out;
487 	*page = vm_normal_page(*vma, address, *pte);
488 	if (!*page) {
489 		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
490 			goto unmap;
491 		*page = pte_page(*pte);
492 
493 		/*
494 		 * This should never happen (a device public page in the gate
495 		 * area).
496 		 */
497 		if (is_device_public_page(*page))
498 			goto unmap;
499 	}
500 	get_page(*page);
501 out:
502 	ret = 0;
503 unmap:
504 	pte_unmap(pte);
505 	return ret;
506 }
507 
508 /*
509  * mmap_sem must be held on entry.  If @nonblocking != NULL and
510  * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
511  * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
512  */
513 static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
514 		unsigned long address, unsigned int *flags, int *nonblocking)
515 {
516 	unsigned int fault_flags = 0;
517 	vm_fault_t ret;
518 
519 	/* mlock all present pages, but do not fault in new pages */
520 	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
521 		return -ENOENT;
522 	if (*flags & FOLL_WRITE)
523 		fault_flags |= FAULT_FLAG_WRITE;
524 	if (*flags & FOLL_REMOTE)
525 		fault_flags |= FAULT_FLAG_REMOTE;
526 	if (nonblocking)
527 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
528 	if (*flags & FOLL_NOWAIT)
529 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
530 	if (*flags & FOLL_TRIED) {
531 		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
532 		fault_flags |= FAULT_FLAG_TRIED;
533 	}
534 
535 	ret = handle_mm_fault(vma, address, fault_flags);
536 	if (ret & VM_FAULT_ERROR) {
537 		int err = vm_fault_to_errno(ret, *flags);
538 
539 		if (err)
540 			return err;
541 		BUG();
542 	}
543 
544 	if (tsk) {
545 		if (ret & VM_FAULT_MAJOR)
546 			tsk->maj_flt++;
547 		else
548 			tsk->min_flt++;
549 	}
550 
551 	if (ret & VM_FAULT_RETRY) {
552 		if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
553 			*nonblocking = 0;
554 		return -EBUSY;
555 	}
556 
557 	/*
558 	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
559 	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
560 	 * can thus safely do subsequent page lookups as if they were reads.
561 	 * But only do so when looping for pte_write is futile: in some cases
562 	 * userspace may also be wanting to write to the gotten user page,
563  * userspace may also want to write to the user page we just got, which a
564 	 * reCOWed by userspace write).
565 	 */
566 	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
567 		*flags |= FOLL_COW;
568 	return 0;
569 }
570 
571 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
572 {
573 	vm_flags_t vm_flags = vma->vm_flags;
574 	int write = (gup_flags & FOLL_WRITE);
575 	int foreign = (gup_flags & FOLL_REMOTE);
576 
577 	if (vm_flags & (VM_IO | VM_PFNMAP))
578 		return -EFAULT;
579 
580 	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
581 		return -EFAULT;
582 
583 	if (write) {
584 		if (!(vm_flags & VM_WRITE)) {
585 			if (!(gup_flags & FOLL_FORCE))
586 				return -EFAULT;
587 			/*
588 			 * We used to let the write,force case do COW in a
589 			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
590 			 * set a breakpoint in a read-only mapping of an
591 			 * executable, without corrupting the file (yet only
592 			 * when that file had been opened for writing!).
593 			 * Anon pages in shared mappings are surprising: now
594 			 * just reject it.
595 			 */
596 			if (!is_cow_mapping(vm_flags))
597 				return -EFAULT;
598 		}
599 	} else if (!(vm_flags & VM_READ)) {
600 		if (!(gup_flags & FOLL_FORCE))
601 			return -EFAULT;
602 		/*
603 		 * Is there actually any vma we can reach here which does not
604 		 * have VM_MAYREAD set?
605 		 */
606 		if (!(vm_flags & VM_MAYREAD))
607 			return -EFAULT;
608 	}
609 	/*
610 	 * gups are always data accesses, not instruction
611 	 * fetches, so execute=false here
612 	 */
613 	if (!arch_vma_access_permitted(vma, write, false, foreign))
614 		return -EFAULT;
615 	return 0;
616 }
617 
618 /**
619  * __get_user_pages() - pin user pages in memory
620  * @tsk:	task_struct of target task
621  * @mm:		mm_struct of target mm
622  * @start:	starting user address
623  * @nr_pages:	number of pages from start to pin
624  * @gup_flags:	flags modifying pin behaviour
625  * @pages:	array that receives pointers to the pages pinned.
626  *		Should be at least nr_pages long. Or NULL, if caller
627  *		only intends to ensure the pages are faulted in.
628  * @vmas:	array of pointers to vmas corresponding to each page.
629  *		Or NULL if the caller does not require them.
630  * @nonblocking: if non-NULL, cleared to 0 if mmap_sem was released while waiting for disk IO or mmap_sem contention (see below)
631  *
632  * Returns number of pages pinned. This may be fewer than the number
633  * requested. If nr_pages is 0 or negative, returns 0. If no pages
634  * were pinned, returns -errno. Each page returned must be released
635  * with a put_page() call when it is finished with. vmas will only
636  * remain valid while mmap_sem is held.
637  *
638  * Must be called with mmap_sem held.  It may be released.  See below.
639  *
640  * __get_user_pages walks a process's page tables and takes a reference to
641  * each struct page that each user address corresponds to at a given
642  * instant. That is, it takes the page that would be accessed if a user
643  * thread accesses the given user virtual address at that instant.
644  *
645  * This does not guarantee that the page exists in the user mappings when
646  * __get_user_pages returns, and there may even be a completely different
647  * page there in some cases (eg. if mmapped pagecache has been invalidated
648  * page there in some cases (e.g. if mmapped pagecache has been invalidated
649  * and subsequently re-faulted). However it does guarantee that the page
650  * contains data that was valid *at some point in time*. Typically, an IO
651  * or similar operation cannot guarantee anything stronger anyway because
652  * locks can't be held over the syscall boundary.
653  *
654  * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
655  * the page is written to, set_page_dirty (or set_page_dirty_lock, as
656  * appropriate) must be called after the page is finished with, and
657  * before put_page is called.
658  *
659  * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
660  * or mmap_sem contention, and if waiting is needed to pin all pages,
661  * *@nonblocking will be set to 0.  Further, if @gup_flags does not
662  * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
663  * this case.
664  *
665  * A caller using such a combination of @nonblocking and @gup_flags
666  * must therefore hold the mmap_sem for reading only, and recognize
667  * when it's been released.  Otherwise, it must be held for either
668  * reading or writing and will not be released.
669  *
670  * In most cases, get_user_pages or get_user_pages_fast should be used
671  * instead of __get_user_pages. __get_user_pages should be used only if
672  * you need some special @gup_flags.
673  */
674 static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
675 		unsigned long start, unsigned long nr_pages,
676 		unsigned int gup_flags, struct page **pages,
677 		struct vm_area_struct **vmas, int *nonblocking)
678 {
679 	long ret = 0, i = 0;
680 	struct vm_area_struct *vma = NULL;
681 	struct follow_page_context ctx = { NULL };
682 
683 	if (!nr_pages)
684 		return 0;
685 
686 	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
687 
688 	/*
689 	 * If FOLL_FORCE is set then do not force a full fault as the hinting
690 	 * fault information is unrelated to the reference behaviour of a task
691 	 * using the address space
692 	 */
693 	if (!(gup_flags & FOLL_FORCE))
694 		gup_flags |= FOLL_NUMA;
695 
696 	do {
697 		struct page *page;
698 		unsigned int foll_flags = gup_flags;
699 		unsigned int page_increm;
700 
701 		/* first iteration or cross vma bound */
702 		if (!vma || start >= vma->vm_end) {
703 			vma = find_extend_vma(mm, start);
704 			if (!vma && in_gate_area(mm, start)) {
705 				ret = get_gate_page(mm, start & PAGE_MASK,
706 						gup_flags, &vma,
707 						pages ? &pages[i] : NULL);
708 				if (ret)
709 					goto out;
710 				ctx.page_mask = 0;
711 				goto next_page;
712 			}
713 
714 			if (!vma || check_vma_flags(vma, gup_flags)) {
715 				ret = -EFAULT;
716 				goto out;
717 			}
718 			if (is_vm_hugetlb_page(vma)) {
719 				i = follow_hugetlb_page(mm, vma, pages, vmas,
720 						&start, &nr_pages, i,
721 						gup_flags, nonblocking);
722 				continue;
723 			}
724 		}
725 retry:
726 		/*
727 		 * If we have a pending SIGKILL, don't keep faulting pages and
728 		 * potentially allocating memory.
729 		 */
730 		if (unlikely(fatal_signal_pending(current))) {
731 			ret = -ERESTARTSYS;
732 			goto out;
733 		}
734 		cond_resched();
735 
736 		page = follow_page_mask(vma, start, foll_flags, &ctx);
737 		if (!page) {
738 			ret = faultin_page(tsk, vma, start, &foll_flags,
739 					nonblocking);
740 			switch (ret) {
741 			case 0:
742 				goto retry;
743 			case -EBUSY:
744 				ret = 0;
745 				/* FALLTHRU */
746 			case -EFAULT:
747 			case -ENOMEM:
748 			case -EHWPOISON:
749 				goto out;
750 			case -ENOENT:
751 				goto next_page;
752 			}
753 			BUG();
754 		} else if (PTR_ERR(page) == -EEXIST) {
755 			/*
756 			 * Proper page table entry exists, but no corresponding
757 			 * struct page.
758 			 */
759 			goto next_page;
760 		} else if (IS_ERR(page)) {
761 			ret = PTR_ERR(page);
762 			goto out;
763 		}
764 		if (pages) {
765 			pages[i] = page;
766 			flush_anon_page(vma, page, start);
767 			flush_dcache_page(page);
768 			ctx.page_mask = 0;
769 		}
770 next_page:
771 		if (vmas) {
772 			vmas[i] = vma;
773 			ctx.page_mask = 0;
774 		}
775 		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
776 		if (page_increm > nr_pages)
777 			page_increm = nr_pages;
778 		i += page_increm;
779 		start += page_increm * PAGE_SIZE;
780 		nr_pages -= page_increm;
781 	} while (nr_pages);
782 out:
783 	if (ctx.pgmap)
784 		put_dev_pagemap(ctx.pgmap);
785 	return i ? i : ret;
786 }
787 
788 static bool vma_permits_fault(struct vm_area_struct *vma,
789 			      unsigned int fault_flags)
790 {
791 	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
792 	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
793 	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
794 
795 	if (!(vm_flags & vma->vm_flags))
796 		return false;
797 
798 	/*
799 	 * The architecture might have a hardware protection
800 	 * mechanism other than read/write that can deny access.
801 	 *
802 	 * gup always represents data access, not instruction
803 	 * fetches, so execute=false here:
804 	 */
805 	if (!arch_vma_access_permitted(vma, write, false, foreign))
806 		return false;
807 
808 	return true;
809 }
810 
811 /*
812  * fixup_user_fault() - manually resolve a user page fault
813  * @tsk:	the task_struct to use for page fault accounting, or
814  *		NULL if faults are not to be recorded.
815  * @mm:		mm_struct of target mm
816  * @address:	user address
817  * @fault_flags:flags to pass down to handle_mm_fault()
818  * @unlocked:	did we unlock the mmap_sem while retrying, maybe NULL if caller
819  *		does not allow retry
820  *
821  * This is meant to be called in the specific scenario where for locking reasons
822  * we try to access user memory in atomic context (within a pagefault_disable()
823  * section), the access fails with -EFAULT, and we want to resolve the user
824  * fault before trying again.
825  *
826  * Typically this is meant to be used by the futex code.
827  *
828  * The main difference with get_user_pages() is that this function will
829  * unconditionally call handle_mm_fault() which will in turn perform all the
830  * necessary SW fixup of the dirty and young bits in the PTE, while
831  * get_user_pages() only guarantees to update these in the struct page.
832  *
833  * This is important for some architectures where those bits also gate the
834  * access permission to the page because they are maintained in software.  On
835  * such architectures, gup() will not be enough to make a subsequent access
836  * succeed.
837  *
838  * This function will not return with an unlocked mmap_sem. So it does not
839  * have the same semantics wrt the @mm->mmap_sem as filemap_fault() does.
840  */
841 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
842 		     unsigned long address, unsigned int fault_flags,
843 		     bool *unlocked)
844 {
845 	struct vm_area_struct *vma;
846 	vm_fault_t ret, major = 0;
847 
848 	if (unlocked)
849 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
850 
851 retry:
852 	vma = find_extend_vma(mm, address);
853 	if (!vma || address < vma->vm_start)
854 		return -EFAULT;
855 
856 	if (!vma_permits_fault(vma, fault_flags))
857 		return -EFAULT;
858 
859 	ret = handle_mm_fault(vma, address, fault_flags);
860 	major |= ret & VM_FAULT_MAJOR;
861 	if (ret & VM_FAULT_ERROR) {
862 		int err = vm_fault_to_errno(ret, 0);
863 
864 		if (err)
865 			return err;
866 		BUG();
867 	}
868 
869 	if (ret & VM_FAULT_RETRY) {
870 		down_read(&mm->mmap_sem);
871 		if (!(fault_flags & FAULT_FLAG_TRIED)) {
872 			*unlocked = true;
873 			fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
874 			fault_flags |= FAULT_FLAG_TRIED;
875 			goto retry;
876 		}
877 	}
878 
879 	if (tsk) {
880 		if (major)
881 			tsk->maj_flt++;
882 		else
883 			tsk->min_flt++;
884 	}
885 	return 0;
886 }
887 EXPORT_SYMBOL_GPL(fixup_user_fault);
888 
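/*
 * A minimal, illustrative sketch of the futex-style pattern described above:
 * try the access with page faults disabled and, on failure, resolve the
 * fault with fixup_user_fault() before retrying.  uaddr, val and the retry
 * label are hypothetical; passing NULL for @unlocked means the fault may not
 * drop mmap_sem for a retry.
 *
 *	pagefault_disable();
 *	err = __put_user(val, uaddr);
 *	pagefault_enable();
 *	if (err) {
 *		down_read(&mm->mmap_sem);
 *		err = fixup_user_fault(current, mm, (unsigned long)uaddr,
 *				       FAULT_FLAG_WRITE, NULL);
 *		up_read(&mm->mmap_sem);
 *		if (!err)
 *			goto retry;
 *	}
 */
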
889 static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
890 						struct mm_struct *mm,
891 						unsigned long start,
892 						unsigned long nr_pages,
893 						struct page **pages,
894 						struct vm_area_struct **vmas,
895 						int *locked,
896 						unsigned int flags)
897 {
898 	long ret, pages_done;
899 	bool lock_dropped;
900 
901 	if (locked) {
902 		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
903 		BUG_ON(vmas);
904 		/* check caller initialized locked */
905 		BUG_ON(*locked != 1);
906 	}
907 
908 	if (pages)
909 		flags |= FOLL_GET;
910 
911 	pages_done = 0;
912 	lock_dropped = false;
913 	for (;;) {
914 		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
915 				       vmas, locked);
916 		if (!locked)
917 			/* VM_FAULT_RETRY couldn't trigger, bypass */
918 			return ret;
919 
920 		/* VM_FAULT_RETRY cannot return errors */
921 		if (!*locked) {
922 			BUG_ON(ret < 0);
923 			BUG_ON(ret >= nr_pages);
924 		}
925 
926 		if (!pages)
927 			/* If it's a prefault don't insist harder */
928 			return ret;
929 
930 		if (ret > 0) {
931 			nr_pages -= ret;
932 			pages_done += ret;
933 			if (!nr_pages)
934 				break;
935 		}
936 		if (*locked) {
937 			/*
938 			 * VM_FAULT_RETRY didn't trigger or it was a
939 			 * FOLL_NOWAIT.
940 			 */
941 			if (!pages_done)
942 				pages_done = ret;
943 			break;
944 		}
945 		/* VM_FAULT_RETRY triggered, so seek to the faulting offset */
946 		pages += ret;
947 		start += ret << PAGE_SHIFT;
948 
949 		/*
950 		 * Repeat on the address that fired VM_FAULT_RETRY
951 		 * without FAULT_FLAG_ALLOW_RETRY but with
952 		 * FAULT_FLAG_TRIED.
953 		 */
954 		*locked = 1;
955 		lock_dropped = true;
956 		down_read(&mm->mmap_sem);
957 		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
958 				       pages, NULL, NULL);
959 		if (ret != 1) {
960 			BUG_ON(ret > 1);
961 			if (!pages_done)
962 				pages_done = ret;
963 			break;
964 		}
965 		nr_pages--;
966 		pages_done++;
967 		if (!nr_pages)
968 			break;
969 		pages++;
970 		start += PAGE_SIZE;
971 	}
972 	if (lock_dropped && *locked) {
973 		/*
974 		 * We must let the caller know we temporarily dropped the lock
975 		 * and so the critical section protected by it was lost.
976 		 */
977 		up_read(&mm->mmap_sem);
978 		*locked = 0;
979 	}
980 	return pages_done;
981 }
982 
983 /*
984  * We can leverage the VM_FAULT_RETRY functionality in the page fault
985  * paths better by using either get_user_pages_locked() or
986  * get_user_pages_unlocked().
987  *
988  * get_user_pages_locked() is suitable to replace the form:
989  *
990  *      down_read(&mm->mmap_sem);
991  *      do_something()
992  *      get_user_pages(tsk, mm, ..., pages, NULL);
993  *      up_read(&mm->mmap_sem);
994  *
995  *  to:
996  *
997  *      int locked = 1;
998  *      down_read(&mm->mmap_sem);
999  *      do_something()
1000  *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
1001  *      if (locked)
1002  *          up_read(&mm->mmap_sem);
1003  */
1004 long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1005 			   unsigned int gup_flags, struct page **pages,
1006 			   int *locked)
1007 {
1008 	return __get_user_pages_locked(current, current->mm, start, nr_pages,
1009 				       pages, NULL, locked,
1010 				       gup_flags | FOLL_TOUCH);
1011 }
1012 EXPORT_SYMBOL(get_user_pages_locked);
1013 
1014 /*
1015  * get_user_pages_unlocked() is suitable to replace the form:
1016  *
1017  *      down_read(&mm->mmap_sem);
1018  *      get_user_pages(tsk, mm, ..., pages, NULL);
1019  *      up_read(&mm->mmap_sem);
1020  *
1021  *  with:
1022  *
1023  *      get_user_pages_unlocked(tsk, mm, ..., pages);
1024  *
1025  * It is functionally equivalent to get_user_pages_fast, so
1026  * get_user_pages_fast should be used instead if specific gup_flags
1027  * (e.g. FOLL_FORCE) are not required.
1028  */
1029 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1030 			     struct page **pages, unsigned int gup_flags)
1031 {
1032 	struct mm_struct *mm = current->mm;
1033 	int locked = 1;
1034 	long ret;
1035 
1036 	down_read(&mm->mmap_sem);
1037 	ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
1038 				      &locked, gup_flags | FOLL_TOUCH);
1039 	if (locked)
1040 		up_read(&mm->mmap_sem);
1041 	return ret;
1042 }
1043 EXPORT_SYMBOL(get_user_pages_unlocked);
1044 
1045 /*
1046  * get_user_pages_remote() - pin user pages in memory
1047  * @tsk:	the task_struct to use for page fault accounting, or
1048  *		NULL if faults are not to be recorded.
1049  * @mm:		mm_struct of target mm
1050  * @start:	starting user address
1051  * @nr_pages:	number of pages from start to pin
1052  * @gup_flags:	flags modifying lookup behaviour
1053  * @pages:	array that receives pointers to the pages pinned.
1054  *		Should be at least nr_pages long. Or NULL, if caller
1055  *		only intends to ensure the pages are faulted in.
1056  * @vmas:	array of pointers to vmas corresponding to each page.
1057  *		Or NULL if the caller does not require them.
1058  * @locked:	pointer to lock flag indicating whether lock is held and
1059  *		subsequently whether VM_FAULT_RETRY functionality can be
1060  *		utilised. Lock must initially be held.
1061  *
1062  * Returns number of pages pinned. This may be fewer than the number
1063  * requested. If nr_pages is 0 or negative, returns 0. If no pages
1064  * were pinned, returns -errno. Each page returned must be released
1065  * with a put_page() call when it is finished with. vmas will only
1066  * remain valid while mmap_sem is held.
1067  *
1068  * Must be called with mmap_sem held for read or write.
1069  *
1070  * get_user_pages walks a process's page tables and takes a reference to
1071  * each struct page that each user address corresponds to at a given
1072  * instant. That is, it takes the page that would be accessed if a user
1073  * thread accesses the given user virtual address at that instant.
1074  *
1075  * This does not guarantee that the page exists in the user mappings when
1076  * get_user_pages returns, and there may even be a completely different
1077  * page there in some cases (eg. if mmapped pagecache has been invalidated
1078  * page there in some cases (e.g. if mmapped pagecache has been invalidated
1079  * and subsequently re-faulted). However it does guarantee that the page
1080  * contains data that was valid *at some point in time*. Typically, an IO
1081  * or similar operation cannot guarantee anything stronger anyway because
1082  * locks can't be held over the syscall boundary.
1083  *
1084  * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1085  * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1086  * be called after the page is finished with, and before put_page is called.
1087  *
1088  * get_user_pages is typically used for fewer-copy IO operations, to get a
1089  * handle on the memory by some means other than accesses via the user virtual
1090  * addresses. The pages may be submitted for DMA to devices or accessed via
1091  * their kernel linear mapping (via the kmap APIs). Care should be taken to
1092  * use the correct cache flushing APIs.
1093  *
1094  * See also get_user_pages_fast, for performance critical applications.
1095  *
1096  * get_user_pages should be phased out in favor of
1097  * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1098  * should use get_user_pages because it cannot pass
1099  * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1100  */
1101 long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1102 		unsigned long start, unsigned long nr_pages,
1103 		unsigned int gup_flags, struct page **pages,
1104 		struct vm_area_struct **vmas, int *locked)
1105 {
1106 	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1107 				       locked,
1108 				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1109 }
1110 EXPORT_SYMBOL(get_user_pages_remote);
1111 
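/*
 * A minimal, illustrative sketch of pinning one page of another process's
 * address space, ptrace-style.  tsk and mm (e.g. obtained via get_task_mm())
 * and access_page_contents() are hypothetical; @locked reports whether
 * mmap_sem is still held when the call returns.
 *
 *	int locked = 1;
 *
 *	down_read(&mm->mmap_sem);
 *	ret = get_user_pages_remote(tsk, mm, addr & PAGE_MASK, 1, FOLL_FORCE,
 *				    &page, NULL, &locked);
 *	if (locked)
 *		up_read(&mm->mmap_sem);
 *	if (ret == 1) {
 *		access_page_contents(page);
 *		put_page(page);
 *	}
 */
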
1112 /*
1113  * This is the same as get_user_pages_remote(), just with a
1114  * less-flexible calling convention where we assume that the task
1115  * and mm being operated on are the current task's and don't allow
1116  * passing of a locked parameter.  We also obviously don't pass
1117  * FOLL_REMOTE in here.
1118  */
1119 long get_user_pages(unsigned long start, unsigned long nr_pages,
1120 		unsigned int gup_flags, struct page **pages,
1121 		struct vm_area_struct **vmas)
1122 {
1123 	return __get_user_pages_locked(current, current->mm, start, nr_pages,
1124 				       pages, vmas, NULL,
1125 				       gup_flags | FOLL_TOUCH);
1126 }
1127 EXPORT_SYMBOL(get_user_pages);
1128 
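/*
 * A minimal, illustrative sketch of the pin/use/release lifecycle described
 * above for a caller that writes into the pages: pages dirtied through the
 * kernel mapping must be marked dirty before the reference is dropped.
 * npages, the pages array and fill_page() are hypothetical.
 *
 *	down_read(&current->mm->mmap_sem);
 *	ret = get_user_pages(start, npages, FOLL_WRITE, pages, NULL);
 *	up_read(&current->mm->mmap_sem);
 *
 *	for (i = 0; i < ret; i++) {
 *		fill_page(pages[i]);
 *		set_page_dirty_lock(pages[i]);
 *		put_page(pages[i]);
 *	}
 */
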
1129 #ifdef CONFIG_FS_DAX
1130 /*
1131  * This is the same as get_user_pages() in that it assumes we are
1132  * operating on the current task's mm, but it goes further to validate
1133  * that the vmas associated with the address range are suitable for
1134  * longterm elevated page reference counts. For example, filesystem-dax
1135  * mappings are subject to the lifetime enforced by the filesystem and
1136  * we need guarantees that longterm users like RDMA and V4L2 only
1137  * establish mappings that have a kernel enforced revocation mechanism.
1138  *
1139  * "longterm" == userspace controlled elevated page count lifetime.
1140  * Contrast this to iov_iter_get_pages() usages which are transient.
1141  */
1142 long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
1143 		unsigned int gup_flags, struct page **pages,
1144 		struct vm_area_struct **vmas_arg)
1145 {
1146 	struct vm_area_struct **vmas = vmas_arg;
1147 	struct vm_area_struct *vma_prev = NULL;
1148 	long rc, i;
1149 
1150 	if (!pages)
1151 		return -EINVAL;
1152 
1153 	if (!vmas) {
1154 		vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
1155 			       GFP_KERNEL);
1156 		if (!vmas)
1157 			return -ENOMEM;
1158 	}
1159 
1160 	rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1161 
1162 	for (i = 0; i < rc; i++) {
1163 		struct vm_area_struct *vma = vmas[i];
1164 
1165 		if (vma == vma_prev)
1166 			continue;
1167 
1168 		vma_prev = vma;
1169 
1170 		if (vma_is_fsdax(vma))
1171 			break;
1172 	}
1173 
1174 	/*
1175 	 * Either get_user_pages() failed, or the vma validation
1176 	 * succeeded, in either case we don't need to put_page() before
1177 	 * returning.
1178 	 */
1179 	if (i >= rc)
1180 		goto out;
1181 
1182 	for (i = 0; i < rc; i++)
1183 		put_page(pages[i]);
1184 	rc = -EOPNOTSUPP;
1185 out:
1186 	if (vmas != vmas_arg)
1187 		kfree(vmas);
1188 	return rc;
1189 }
1190 EXPORT_SYMBOL(get_user_pages_longterm);
1191 #endif /* CONFIG_FS_DAX */
1192 
1193 /**
1194  * populate_vma_page_range() -  populate a range of pages in the vma.
1195  * @vma:   target vma
1196  * @start: start address
1197  * @end:   end address
1198  * @nonblocking: if non-NULL, cleared to 0 when mmap_sem is released (see below)
1199  *
1200  * This takes care of mlocking the pages too if VM_LOCKED is set.
1201  *
1202  * return 0 on success, negative error code on error.
1203  * Returns the number of pages faulted in, or a negative error code on error.
1204  * vma->vm_mm->mmap_sem must be held.
1205  *
1206  * If @nonblocking is NULL, it may be held for read or write and will
1207  * be unperturbed.
1208  *
1209  * If @nonblocking is non-NULL, it must be held for read only and may be
1210  * released.  If it's released, *@nonblocking will be set to 0.
1211  */
1212 long populate_vma_page_range(struct vm_area_struct *vma,
1213 		unsigned long start, unsigned long end, int *nonblocking)
1214 {
1215 	struct mm_struct *mm = vma->vm_mm;
1216 	unsigned long nr_pages = (end - start) / PAGE_SIZE;
1217 	int gup_flags;
1218 
1219 	VM_BUG_ON(start & ~PAGE_MASK);
1220 	VM_BUG_ON(end   & ~PAGE_MASK);
1221 	VM_BUG_ON_VMA(start < vma->vm_start, vma);
1222 	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
1223 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1224 
1225 	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
1226 	if (vma->vm_flags & VM_LOCKONFAULT)
1227 		gup_flags &= ~FOLL_POPULATE;
1228 	/*
1229 	 * We want to touch writable mappings with a write fault in order
1230 	 * to break COW, except for shared mappings because these don't COW
1231 	 * and we would not want to dirty them for nothing.
1232 	 */
1233 	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1234 		gup_flags |= FOLL_WRITE;
1235 
1236 	/*
1237 	 * We want mlock to succeed for regions that have any permissions
1238 	 * other than PROT_NONE.
1239 	 */
1240 	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
1241 		gup_flags |= FOLL_FORCE;
1242 
1243 	/*
1244 	 * We made sure addr is within a VMA, so the following will
1245 	 * not result in a stack expansion that recurses back here.
1246 	 */
1247 	return __get_user_pages(current, mm, start, nr_pages, gup_flags,
1248 				NULL, NULL, nonblocking);
1249 }
1250 
1251 /*
1252  * __mm_populate - populate and/or mlock pages within a range of address space.
1253  *
1254  * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1255  * flags. VMAs must be already marked with the desired vm_flags, and
1256  * mmap_sem must not be held.
1257  */
1258 int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1259 {
1260 	struct mm_struct *mm = current->mm;
1261 	unsigned long end, nstart, nend;
1262 	struct vm_area_struct *vma = NULL;
1263 	int locked = 0;
1264 	long ret = 0;
1265 
1266 	end = start + len;
1267 
1268 	for (nstart = start; nstart < end; nstart = nend) {
1269 		/*
1270 		 * We want to fault in pages for [nstart; end) address range.
1271 		 * Find first corresponding VMA.
1272 		 */
1273 		if (!locked) {
1274 			locked = 1;
1275 			down_read(&mm->mmap_sem);
1276 			vma = find_vma(mm, nstart);
1277 		} else if (nstart >= vma->vm_end)
1278 			vma = vma->vm_next;
1279 		if (!vma || vma->vm_start >= end)
1280 			break;
1281 		/*
1282 		 * Set [nstart; nend) to intersection of desired address
1283 		 * range with the first VMA. Also, skip undesirable VMA types.
1284 		 */
1285 		nend = min(end, vma->vm_end);
1286 		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1287 			continue;
1288 		if (nstart < vma->vm_start)
1289 			nstart = vma->vm_start;
1290 		/*
1291 		 * Now fault in a range of pages. populate_vma_page_range()
1292 		 * double checks the vma flags, so that it won't mlock pages
1293 		 * if the vma was already munlocked.
1294 		 */
1295 		ret = populate_vma_page_range(vma, nstart, nend, &locked);
1296 		if (ret < 0) {
1297 			if (ignore_errors) {
1298 				ret = 0;
1299 				continue;	/* continue at next VMA */
1300 			}
1301 			break;
1302 		}
1303 		nend = nstart + ret * PAGE_SIZE;
1304 		ret = 0;
1305 	}
1306 	if (locked)
1307 		up_read(&mm->mmap_sem);
1308 	return ret;	/* 0 or negative error code */
1309 }
1310 
1311 /**
1312  * get_dump_page() - pin user page in memory while writing it to core dump
1313  * @addr: user address
1314  *
1315  * Returns struct page pointer of user page pinned for dump,
1316  * to be freed afterwards by put_page().
1317  *
1318  * Returns NULL on any kind of failure - a hole must then be inserted into
1319  * the corefile, to preserve alignment with its headers; and also returns
1320  * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1321  * allowing a hole to be left in the corefile to save diskspace.
1322  *
1323  * Called without mmap_sem, but after all other threads have been killed.
1324  */
1325 #ifdef CONFIG_ELF_CORE
1326 struct page *get_dump_page(unsigned long addr)
1327 {
1328 	struct vm_area_struct *vma;
1329 	struct page *page;
1330 
1331 	if (__get_user_pages(current, current->mm, addr, 1,
1332 			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1333 			     NULL) < 1)
1334 		return NULL;
1335 	flush_cache_page(vma, addr, page_to_pfn(page));
1336 	return page;
1337 }
1338 #endif /* CONFIG_ELF_CORE */
1339 
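/*
 * A minimal, illustrative sketch of how a coredump writer might use
 * get_dump_page() for each page-sized chunk of a vma, leaving a hole in the
 * core file whenever NULL is returned.  write_core_page() and
 * skip_core_page() are hypothetical stand-ins for the coredump output
 * helpers.
 *
 *	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
 *		struct page *page = get_dump_page(addr);
 *
 *		if (page) {
 *			write_core_page(page);
 *			put_page(page);
 *		} else {
 *			skip_core_page();
 *		}
 *	}
 */
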
1340 /*
1341  * Generic Fast GUP
1342  *
1343  * get_user_pages_fast attempts to pin user pages by walking the page
1344  * tables directly and avoids taking locks. Thus the walker needs to be
1345  * protected from page table pages being freed from under it, and should
1346  * block any THP splits.
1347  *
1348  * One way to achieve this is to have the walker disable interrupts, and
1349  * rely on IPIs from the TLB flushing code blocking before the page table
1350  * pages are freed. This is unsuitable for architectures that do not need
1351  * to broadcast an IPI when invalidating TLBs.
1352  *
1353  * Another way to achieve this is to batch up page table containing pages
1354  * belonging to more than one mm_user, then rcu_sched a callback to free those
1355  * pages. Disabling interrupts will allow the fast_gup walker to both block
1356  * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
1357  * (which is a relatively rare event). The code below adopts this strategy.
1358  *
1359  * Before activating this code, please be aware that the following assumptions
1360  * are currently made:
1361  *
1362  *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
1363  *  free pages containing page tables or TLB flushing requires IPI broadcast.
1364  *
1365  *  *) ptes can be read atomically by the architecture.
1366  *
1367  *  *) access_ok is sufficient to validate userspace address ranges.
1368  *
1369  * The last two assumptions can be relaxed by the addition of helper functions.
1370  *
1371  * This code is based heavily on the PowerPC implementation by Nick Piggin.
1372  */
1373 #ifdef CONFIG_HAVE_GENERIC_GUP
1374 
1375 #ifndef gup_get_pte
1376 /*
1377  * We assume that the PTE can be read atomically. If this is not the case for
1378  * your architecture, please provide the helper.
1379  */
1380 static inline pte_t gup_get_pte(pte_t *ptep)
1381 {
1382 	return READ_ONCE(*ptep);
1383 }
1384 #endif
1385 
1386 static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
1387 {
1388 	while ((*nr) - nr_start) {
1389 		struct page *page = pages[--(*nr)];
1390 
1391 		ClearPageReferenced(page);
1392 		put_page(page);
1393 	}
1394 }
1395 
1396 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
1397 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1398 			 int write, struct page **pages, int *nr)
1399 {
1400 	struct dev_pagemap *pgmap = NULL;
1401 	int nr_start = *nr, ret = 0;
1402 	pte_t *ptep, *ptem;
1403 
1404 	ptem = ptep = pte_offset_map(&pmd, addr);
1405 	do {
1406 		pte_t pte = gup_get_pte(ptep);
1407 		struct page *head, *page;
1408 
1409 		/*
1410 		 * Similar to the PMD case below, NUMA hinting must take slow
1411 		 * path using the pte_protnone check.
1412 		 */
1413 		if (pte_protnone(pte))
1414 			goto pte_unmap;
1415 
1416 		if (!pte_access_permitted(pte, write))
1417 			goto pte_unmap;
1418 
1419 		if (pte_devmap(pte)) {
1420 			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
1421 			if (unlikely(!pgmap)) {
1422 				undo_dev_pagemap(nr, nr_start, pages);
1423 				goto pte_unmap;
1424 			}
1425 		} else if (pte_special(pte))
1426 			goto pte_unmap;
1427 
1428 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1429 		page = pte_page(pte);
1430 		head = compound_head(page);
1431 
1432 		if (!page_cache_get_speculative(head))
1433 			goto pte_unmap;
1434 
1435 		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1436 			put_page(head);
1437 			goto pte_unmap;
1438 		}
1439 
1440 		VM_BUG_ON_PAGE(compound_head(page) != head, page);
1441 
1442 		SetPageReferenced(page);
1443 		pages[*nr] = page;
1444 		(*nr)++;
1445 
1446 	} while (ptep++, addr += PAGE_SIZE, addr != end);
1447 
1448 	ret = 1;
1449 
1450 pte_unmap:
1451 	if (pgmap)
1452 		put_dev_pagemap(pgmap);
1453 	pte_unmap(ptem);
1454 	return ret;
1455 }
1456 #else
1457 
1458 /*
1459  * If we can't determine whether or not a pte is special, then fail immediately
1460  * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
1461  * to be special.
1462  *
1463  * For a futex to be placed on a THP tail page, get_futex_key requires a
1464  * __get_user_pages_fast implementation that can pin pages. Thus it's still
1465  * useful to have gup_huge_pmd even if we can't operate on ptes.
1466  */
1467 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1468 			 int write, struct page **pages, int *nr)
1469 {
1470 	return 0;
1471 }
1472 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
1473 
1474 #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1475 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1476 		unsigned long end, struct page **pages, int *nr)
1477 {
1478 	int nr_start = *nr;
1479 	struct dev_pagemap *pgmap = NULL;
1480 
1481 	do {
1482 		struct page *page = pfn_to_page(pfn);
1483 
1484 		pgmap = get_dev_pagemap(pfn, pgmap);
1485 		if (unlikely(!pgmap)) {
1486 			undo_dev_pagemap(nr, nr_start, pages);
1487 			return 0;
1488 		}
1489 		SetPageReferenced(page);
1490 		pages[*nr] = page;
1491 		get_page(page);
1492 		(*nr)++;
1493 		pfn++;
1494 	} while (addr += PAGE_SIZE, addr != end);
1495 
1496 	if (pgmap)
1497 		put_dev_pagemap(pgmap);
1498 	return 1;
1499 }
1500 
1501 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1502 		unsigned long end, struct page **pages, int *nr)
1503 {
1504 	unsigned long fault_pfn;
1505 	int nr_start = *nr;
1506 
1507 	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1508 	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1509 		return 0;
1510 
1511 	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1512 		undo_dev_pagemap(nr, nr_start, pages);
1513 		return 0;
1514 	}
1515 	return 1;
1516 }
1517 
1518 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1519 		unsigned long end, struct page **pages, int *nr)
1520 {
1521 	unsigned long fault_pfn;
1522 	int nr_start = *nr;
1523 
1524 	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1525 	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1526 		return 0;
1527 
1528 	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1529 		undo_dev_pagemap(nr, nr_start, pages);
1530 		return 0;
1531 	}
1532 	return 1;
1533 }
1534 #else
1535 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1536 		unsigned long end, struct page **pages, int *nr)
1537 {
1538 	BUILD_BUG();
1539 	return 0;
1540 }
1541 
1542 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
1543 		unsigned long end, struct page **pages, int *nr)
1544 {
1545 	BUILD_BUG();
1546 	return 0;
1547 }
1548 #endif
1549 
1550 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1551 		unsigned long end, int write, struct page **pages, int *nr)
1552 {
1553 	struct page *head, *page;
1554 	int refs;
1555 
1556 	if (!pmd_access_permitted(orig, write))
1557 		return 0;
1558 
1559 	if (pmd_devmap(orig))
1560 		return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
1561 
1562 	refs = 0;
1563 	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1564 	do {
1565 		pages[*nr] = page;
1566 		(*nr)++;
1567 		page++;
1568 		refs++;
1569 	} while (addr += PAGE_SIZE, addr != end);
1570 
1571 	head = compound_head(pmd_page(orig));
1572 	if (!page_cache_add_speculative(head, refs)) {
1573 		*nr -= refs;
1574 		return 0;
1575 	}
1576 
1577 	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1578 		*nr -= refs;
1579 		while (refs--)
1580 			put_page(head);
1581 		return 0;
1582 	}
1583 
1584 	SetPageReferenced(head);
1585 	return 1;
1586 }
1587 
1588 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1589 		unsigned long end, int write, struct page **pages, int *nr)
1590 {
1591 	struct page *head, *page;
1592 	int refs;
1593 
1594 	if (!pud_access_permitted(orig, write))
1595 		return 0;
1596 
1597 	if (pud_devmap(orig))
1598 		return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
1599 
1600 	refs = 0;
1601 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1602 	do {
1603 		pages[*nr] = page;
1604 		(*nr)++;
1605 		page++;
1606 		refs++;
1607 	} while (addr += PAGE_SIZE, addr != end);
1608 
1609 	head = compound_head(pud_page(orig));
1610 	if (!page_cache_add_speculative(head, refs)) {
1611 		*nr -= refs;
1612 		return 0;
1613 	}
1614 
1615 	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1616 		*nr -= refs;
1617 		while (refs--)
1618 			put_page(head);
1619 		return 0;
1620 	}
1621 
1622 	SetPageReferenced(head);
1623 	return 1;
1624 }
1625 
1626 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1627 			unsigned long end, int write,
1628 			struct page **pages, int *nr)
1629 {
1630 	int refs;
1631 	struct page *head, *page;
1632 
1633 	if (!pgd_access_permitted(orig, write))
1634 		return 0;
1635 
1636 	BUILD_BUG_ON(pgd_devmap(orig));
1637 	refs = 0;
1638 	page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
1639 	do {
1640 		pages[*nr] = page;
1641 		(*nr)++;
1642 		page++;
1643 		refs++;
1644 	} while (addr += PAGE_SIZE, addr != end);
1645 
1646 	head = compound_head(pgd_page(orig));
1647 	if (!page_cache_add_speculative(head, refs)) {
1648 		*nr -= refs;
1649 		return 0;
1650 	}
1651 
1652 	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
1653 		*nr -= refs;
1654 		while (refs--)
1655 			put_page(head);
1656 		return 0;
1657 	}
1658 
1659 	SetPageReferenced(head);
1660 	return 1;
1661 }
1662 
1663 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1664 		int write, struct page **pages, int *nr)
1665 {
1666 	unsigned long next;
1667 	pmd_t *pmdp;
1668 
1669 	pmdp = pmd_offset(&pud, addr);
1670 	do {
1671 		pmd_t pmd = READ_ONCE(*pmdp);
1672 
1673 		next = pmd_addr_end(addr, end);
1674 		if (!pmd_present(pmd))
1675 			return 0;
1676 
1677 		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
1678 			/*
1679 			 * NUMA hinting faults need to be handled in the GUP
1680 			 * slowpath for accounting purposes and so that they
1681 			 * can be serialised against THP migration.
1682 			 */
1683 			if (pmd_protnone(pmd))
1684 				return 0;
1685 
1686 			if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
1687 				pages, nr))
1688 				return 0;
1689 
1690 		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
1691 			/*
1692 			 * architectures can have different formats for the
1693 			 * hugetlbfs pmd and the THP pmd
1694 			 */
1695 			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
1696 					 PMD_SHIFT, next, write, pages, nr))
1697 				return 0;
1698 		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
1699 			return 0;
1700 	} while (pmdp++, addr = next, addr != end);
1701 
1702 	return 1;
1703 }
1704 
1705 static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
1706 			 int write, struct page **pages, int *nr)
1707 {
1708 	unsigned long next;
1709 	pud_t *pudp;
1710 
1711 	pudp = pud_offset(&p4d, addr);
1712 	do {
1713 		pud_t pud = READ_ONCE(*pudp);
1714 
1715 		next = pud_addr_end(addr, end);
1716 		if (pud_none(pud))
1717 			return 0;
1718 		if (unlikely(pud_huge(pud))) {
1719 			if (!gup_huge_pud(pud, pudp, addr, next, write,
1720 					  pages, nr))
1721 				return 0;
1722 		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
1723 			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
1724 					 PUD_SHIFT, next, write, pages, nr))
1725 				return 0;
1726 		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
1727 			return 0;
1728 	} while (pudp++, addr = next, addr != end);
1729 
1730 	return 1;
1731 }
1732 
1733 static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
1734 			 int write, struct page **pages, int *nr)
1735 {
1736 	unsigned long next;
1737 	p4d_t *p4dp;
1738 
1739 	p4dp = p4d_offset(&pgd, addr);
1740 	do {
1741 		p4d_t p4d = READ_ONCE(*p4dp);
1742 
1743 		next = p4d_addr_end(addr, end);
1744 		if (p4d_none(p4d))
1745 			return 0;
1746 		BUILD_BUG_ON(p4d_huge(p4d));
1747 		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
1748 			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
1749 					 P4D_SHIFT, next, write, pages, nr))
1750 				return 0;
1751 		} else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
1752 			return 0;
1753 	} while (p4dp++, addr = next, addr != end);
1754 
1755 	return 1;
1756 }
1757 
1758 static void gup_pgd_range(unsigned long addr, unsigned long end,
1759 		int write, struct page **pages, int *nr)
1760 {
1761 	unsigned long next;
1762 	pgd_t *pgdp;
1763 
1764 	pgdp = pgd_offset(current->mm, addr);
1765 	do {
1766 		pgd_t pgd = READ_ONCE(*pgdp);
1767 
1768 		next = pgd_addr_end(addr, end);
1769 		if (pgd_none(pgd))
1770 			return;
1771 		if (unlikely(pgd_huge(pgd))) {
1772 			if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
1773 					  pages, nr))
1774 				return;
1775 		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
1776 			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
1777 					 PGDIR_SHIFT, next, write, pages, nr))
1778 				return;
1779 		} else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
1780 			return;
1781 	} while (pgdp++, addr = next, addr != end);
1782 }
1783 
1784 #ifndef gup_fast_permitted
1785 /*
1786  * Check if it's allowed to use __get_user_pages_fast() for the range, or
1787  * we need to fall back to the slow version:
1788  */
1789 bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
1790 {
1791 	unsigned long len, end;
1792 
1793 	len = (unsigned long) nr_pages << PAGE_SHIFT;
1794 	end = start + len;
1795 	return end >= start;
1796 }
1797 #endif
1798 
1799 /*
1800  * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
1801  * the regular GUP.
1802  * Note a difference with get_user_pages_fast: this always returns the
1803  * number of pages pinned, 0 if no pages were pinned.
1804  */
1805 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1806 			  struct page **pages)
1807 {
1808 	unsigned long len, end;
1809 	unsigned long flags;
1810 	int nr = 0;
1811 
1812 	start &= PAGE_MASK;
1813 	len = (unsigned long) nr_pages << PAGE_SHIFT;
1814 	end = start + len;
1815 
1816 	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1817 					(void __user *)start, len)))
1818 		return 0;
1819 
1820 	/*
1821 	 * Disable interrupts.  We use the nested form as we can already have
1822 	 * interrupts disabled by get_futex_key.
1823 	 *
1824 	 * With interrupts disabled, we block page table pages from being
1825 	 * freed from under us. See struct mmu_table_batch comments in
1826 	 * include/asm-generic/tlb.h for more details.
1827 	 *
1828 	 * We do not adopt an rcu_read_lock(.) here as we also want to
1829 	 * block IPIs that come from THPs splitting.
1830 	 */
1831 
1832 	if (gup_fast_permitted(start, nr_pages, write)) {
1833 		local_irq_save(flags);
1834 		gup_pgd_range(start, end, write, pages, &nr);
1835 		local_irq_restore(flags);
1836 	}
1837 
1838 	return nr;
1839 }
1840 
1841 /**
1842  * get_user_pages_fast() - pin user pages in memory
1843  * @start:	starting user address
1844  * @nr_pages:	number of pages from start to pin
1845  * @write:	whether pages will be written to
1846  * @pages:	array that receives pointers to the pages pinned.
1847  *		Should be at least nr_pages long.
1848  *
1849  * Attempt to pin user pages in memory without taking mm->mmap_sem.
1850  * If not successful, it will fall back to taking the lock and
1851  * calling get_user_pages().
1852  *
1853  * Returns number of pages pinned. This may be fewer than the number
1854  * requested. If nr_pages is 0 or negative, returns 0. If no pages
1855  * were pinned, returns -errno.
1856  */
1857 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1858 			struct page **pages)
1859 {
1860 	unsigned long addr, len, end;
1861 	int nr = 0, ret = 0;
1862 
1863 	start &= PAGE_MASK;
1864 	addr = start;
1865 	len = (unsigned long) nr_pages << PAGE_SHIFT;
1866 	end = start + len;
1867 
1868 	if (nr_pages <= 0)
1869 		return 0;
1870 
1871 	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1872 					(void __user *)start, len)))
1873 		return -EFAULT;
1874 
1875 	if (gup_fast_permitted(start, nr_pages, write)) {
1876 		local_irq_disable();
1877 		gup_pgd_range(addr, end, write, pages, &nr);
1878 		local_irq_enable();
1879 		ret = nr;
1880 	}
1881 
1882 	if (nr < nr_pages) {
1883 		/* Try to get the remaining pages with get_user_pages */
1884 		start += nr << PAGE_SHIFT;
1885 		pages += nr;
1886 
1887 		ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
1888 				write ? FOLL_WRITE : 0);
1889 
1890 		/* Have to be a bit careful with return values */
1891 		if (nr > 0) {
1892 			if (ret < 0)
1893 				ret = nr;
1894 			else
1895 				ret += nr;
1896 		}
1897 	}
1898 
1899 	return ret;
1900 }
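
/*
 * A minimal, illustrative sketch of pinning a user buffer for a device
 * transfer with get_user_pages_fast() and releasing it afterwards.  buf,
 * len, the pages array and do_dma() are hypothetical; do_dma() is assumed
 * to write into the pages, hence set_page_dirty_lock() before put_page().
 *
 *	npages = (offset_in_page(buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 *	ret = get_user_pages_fast((unsigned long)buf, npages, 1, pages);
 *	if (ret <= 0)
 *		return ret ? ret : -EFAULT;
 *
 *	do_dma(pages, ret);
 *
 *	for (i = 0; i < ret; i++) {
 *		set_page_dirty_lock(pages[i]);
 *		put_page(pages[i]);
 *	}
 */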
1901 
1902 #endif /* CONFIG_HAVE_GENERIC_GUP */
1903