xref: /openbmc/linux/mm/gup.c (revision 3213486f)
1 #include <linux/kernel.h>
2 #include <linux/errno.h>
3 #include <linux/err.h>
4 #include <linux/spinlock.h>
5 
6 #include <linux/mm.h>
7 #include <linux/memremap.h>
8 #include <linux/pagemap.h>
9 #include <linux/rmap.h>
10 #include <linux/swap.h>
11 #include <linux/swapops.h>
12 
13 #include <linux/sched/signal.h>
14 #include <linux/rwsem.h>
15 #include <linux/hugetlb.h>
16 #include <linux/migrate.h>
17 #include <linux/mm_inline.h>
18 #include <linux/sched/mm.h>
19 
20 #include <asm/mmu_context.h>
21 #include <asm/pgtable.h>
22 #include <asm/tlbflush.h>
23 
24 #include "internal.h"
25 
26 struct follow_page_context {
27 	struct dev_pagemap *pgmap;
28 	unsigned int page_mask;
29 };
30 
31 static struct page *no_page_table(struct vm_area_struct *vma,
32 		unsigned int flags)
33 {
34 	/*
35 	 * When core dumping an enormous anonymous area that nobody
36 	 * has touched so far, we don't want to allocate unnecessary pages or
37 	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
38 	 * then get_dump_page() will return NULL to leave a hole in the dump.
39 	 * But we can only make this optimization where a hole would surely
40 	 * be zero-filled if handle_mm_fault() actually did handle it.
41 	 */
42 	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
43 		return ERR_PTR(-EFAULT);
44 	return NULL;
45 }
46 
47 static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
48 		pte_t *pte, unsigned int flags)
49 {
50 	/* No page to get reference */
51 	if (flags & FOLL_GET)
52 		return -EFAULT;
53 
54 	if (flags & FOLL_TOUCH) {
55 		pte_t entry = *pte;
56 
57 		if (flags & FOLL_WRITE)
58 			entry = pte_mkdirty(entry);
59 		entry = pte_mkyoung(entry);
60 
61 		if (!pte_same(*pte, entry)) {
62 			set_pte_at(vma->vm_mm, address, pte, entry);
63 			update_mmu_cache(vma, address, pte);
64 		}
65 	}
66 
67 	/* Proper page table entry exists, but no corresponding struct page */
68 	return -EEXIST;
69 }
70 
71 /*
72  * FOLL_FORCE can write to even unwritable PTEs, but only
73  * after we've gone through a COW cycle and they are dirty.
74  */
75 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
76 {
77 	return pte_write(pte) ||
78 		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
79 }
80 
81 static struct page *follow_page_pte(struct vm_area_struct *vma,
82 		unsigned long address, pmd_t *pmd, unsigned int flags,
83 		struct dev_pagemap **pgmap)
84 {
85 	struct mm_struct *mm = vma->vm_mm;
86 	struct page *page;
87 	spinlock_t *ptl;
88 	pte_t *ptep, pte;
89 
90 retry:
91 	if (unlikely(pmd_bad(*pmd)))
92 		return no_page_table(vma, flags);
93 
94 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
95 	pte = *ptep;
96 	if (!pte_present(pte)) {
97 		swp_entry_t entry;
98 		/*
99 		 * KSM's break_ksm() relies upon recognizing a ksm page
100 		 * even while it is being migrated, so for that case we
101 		 * need migration_entry_wait().
102 		 */
103 		if (likely(!(flags & FOLL_MIGRATION)))
104 			goto no_page;
105 		if (pte_none(pte))
106 			goto no_page;
107 		entry = pte_to_swp_entry(pte);
108 		if (!is_migration_entry(entry))
109 			goto no_page;
110 		pte_unmap_unlock(ptep, ptl);
111 		migration_entry_wait(mm, pmd, address);
112 		goto retry;
113 	}
114 	if ((flags & FOLL_NUMA) && pte_protnone(pte))
115 		goto no_page;
116 	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
117 		pte_unmap_unlock(ptep, ptl);
118 		return NULL;
119 	}
120 
121 	page = vm_normal_page(vma, address, pte);
122 	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
123 		/*
124 		 * Only return device mapping pages in the FOLL_GET case since
125 		 * they are only valid while holding the pgmap reference.
126 		 */
127 		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
128 		if (*pgmap)
129 			page = pte_page(pte);
130 		else
131 			goto no_page;
132 	} else if (unlikely(!page)) {
133 		if (flags & FOLL_DUMP) {
134 			/* Avoid special (like zero) pages in core dumps */
135 			page = ERR_PTR(-EFAULT);
136 			goto out;
137 		}
138 
139 		if (is_zero_pfn(pte_pfn(pte))) {
140 			page = pte_page(pte);
141 		} else {
142 			int ret;
143 
144 			ret = follow_pfn_pte(vma, address, ptep, flags);
145 			page = ERR_PTR(ret);
146 			goto out;
147 		}
148 	}
149 
150 	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
151 		int ret;
152 		get_page(page);
153 		pte_unmap_unlock(ptep, ptl);
154 		lock_page(page);
155 		ret = split_huge_page(page);
156 		unlock_page(page);
157 		put_page(page);
158 		if (ret)
159 			return ERR_PTR(ret);
160 		goto retry;
161 	}
162 
163 	if (flags & FOLL_GET) {
164 		if (unlikely(!try_get_page(page))) {
165 			page = ERR_PTR(-ENOMEM);
166 			goto out;
167 		}
168 	}
169 	if (flags & FOLL_TOUCH) {
170 		if ((flags & FOLL_WRITE) &&
171 		    !pte_dirty(pte) && !PageDirty(page))
172 			set_page_dirty(page);
173 		/*
174 		 * pte_mkyoung() would be more correct here, but atomic care
175 		 * is needed to avoid losing the dirty bit: it is easier to use
176 		 * mark_page_accessed().
177 		 */
178 		mark_page_accessed(page);
179 	}
180 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
181 		/* Do not mlock pte-mapped THP */
182 		if (PageTransCompound(page))
183 			goto out;
184 
185 		/*
186 		 * The preliminary mapping check is mainly to avoid the
187 		 * pointless overhead of lock_page on the ZERO_PAGE
188 		 * which might bounce very badly if there is contention.
189 		 *
190 		 * If the page is already locked, we don't need to
191 		 * handle it now - vmscan will handle it later if and
192 		 * when it attempts to reclaim the page.
193 		 */
194 		if (page->mapping && trylock_page(page)) {
195 			lru_add_drain();  /* push cached pages to LRU */
196 			/*
197 			 * Because we lock page here, and migration is
198 			 * blocked by the pte's page reference, and we
199 			 * know the page is still mapped, we don't even
200 			 * need to check for file-cache page truncation.
201 			 */
202 			mlock_vma_page(page);
203 			unlock_page(page);
204 		}
205 	}
206 out:
207 	pte_unmap_unlock(ptep, ptl);
208 	return page;
209 no_page:
210 	pte_unmap_unlock(ptep, ptl);
211 	if (!pte_none(pte))
212 		return NULL;
213 	return no_page_table(vma, flags);
214 }
215 
216 static struct page *follow_pmd_mask(struct vm_area_struct *vma,
217 				    unsigned long address, pud_t *pudp,
218 				    unsigned int flags,
219 				    struct follow_page_context *ctx)
220 {
221 	pmd_t *pmd, pmdval;
222 	spinlock_t *ptl;
223 	struct page *page;
224 	struct mm_struct *mm = vma->vm_mm;
225 
226 	pmd = pmd_offset(pudp, address);
227 	/*
228 	 * The READ_ONCE() will stabilize the pmdval in a register or
229 	 * on the stack so that it will stop changing under the code.
230 	 */
231 	pmdval = READ_ONCE(*pmd);
232 	if (pmd_none(pmdval))
233 		return no_page_table(vma, flags);
234 	if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
235 		page = follow_huge_pmd(mm, address, pmd, flags);
236 		if (page)
237 			return page;
238 		return no_page_table(vma, flags);
239 	}
240 	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
241 		page = follow_huge_pd(vma, address,
242 				      __hugepd(pmd_val(pmdval)), flags,
243 				      PMD_SHIFT);
244 		if (page)
245 			return page;
246 		return no_page_table(vma, flags);
247 	}
248 retry:
249 	if (!pmd_present(pmdval)) {
250 		if (likely(!(flags & FOLL_MIGRATION)))
251 			return no_page_table(vma, flags);
252 		VM_BUG_ON(thp_migration_supported() &&
253 				  !is_pmd_migration_entry(pmdval));
254 		if (is_pmd_migration_entry(pmdval))
255 			pmd_migration_entry_wait(mm, pmd);
256 		pmdval = READ_ONCE(*pmd);
257 		/*
258 		 * MADV_DONTNEED may convert the pmd to null because
259 		 * mmap_sem is held in read mode
260 		 */
261 		if (pmd_none(pmdval))
262 			return no_page_table(vma, flags);
263 		goto retry;
264 	}
265 	if (pmd_devmap(pmdval)) {
266 		ptl = pmd_lock(mm, pmd);
267 		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
268 		spin_unlock(ptl);
269 		if (page)
270 			return page;
271 	}
272 	if (likely(!pmd_trans_huge(pmdval)))
273 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
274 
275 	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
276 		return no_page_table(vma, flags);
277 
278 retry_locked:
279 	ptl = pmd_lock(mm, pmd);
280 	if (unlikely(pmd_none(*pmd))) {
281 		spin_unlock(ptl);
282 		return no_page_table(vma, flags);
283 	}
284 	if (unlikely(!pmd_present(*pmd))) {
285 		spin_unlock(ptl);
286 		if (likely(!(flags & FOLL_MIGRATION)))
287 			return no_page_table(vma, flags);
288 		pmd_migration_entry_wait(mm, pmd);
289 		goto retry_locked;
290 	}
291 	if (unlikely(!pmd_trans_huge(*pmd))) {
292 		spin_unlock(ptl);
293 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
294 	}
295 	if (flags & FOLL_SPLIT) {
296 		int ret;
297 		page = pmd_page(*pmd);
298 		if (is_huge_zero_page(page)) {
299 			spin_unlock(ptl);
300 			ret = 0;
301 			split_huge_pmd(vma, pmd, address);
302 			if (pmd_trans_unstable(pmd))
303 				ret = -EBUSY;
304 		} else {
305 			if (unlikely(!try_get_page(page))) {
306 				spin_unlock(ptl);
307 				return ERR_PTR(-ENOMEM);
308 			}
309 			spin_unlock(ptl);
310 			lock_page(page);
311 			ret = split_huge_page(page);
312 			unlock_page(page);
313 			put_page(page);
314 			if (pmd_none(*pmd))
315 				return no_page_table(vma, flags);
316 		}
317 
318 		return ret ? ERR_PTR(ret) :
319 			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
320 	}
321 	page = follow_trans_huge_pmd(vma, address, pmd, flags);
322 	spin_unlock(ptl);
323 	ctx->page_mask = HPAGE_PMD_NR - 1;
324 	return page;
325 }
326 
327 static struct page *follow_pud_mask(struct vm_area_struct *vma,
328 				    unsigned long address, p4d_t *p4dp,
329 				    unsigned int flags,
330 				    struct follow_page_context *ctx)
331 {
332 	pud_t *pud;
333 	spinlock_t *ptl;
334 	struct page *page;
335 	struct mm_struct *mm = vma->vm_mm;
336 
337 	pud = pud_offset(p4dp, address);
338 	if (pud_none(*pud))
339 		return no_page_table(vma, flags);
340 	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
341 		page = follow_huge_pud(mm, address, pud, flags);
342 		if (page)
343 			return page;
344 		return no_page_table(vma, flags);
345 	}
346 	if (is_hugepd(__hugepd(pud_val(*pud)))) {
347 		page = follow_huge_pd(vma, address,
348 				      __hugepd(pud_val(*pud)), flags,
349 				      PUD_SHIFT);
350 		if (page)
351 			return page;
352 		return no_page_table(vma, flags);
353 	}
354 	if (pud_devmap(*pud)) {
355 		ptl = pud_lock(mm, pud);
356 		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
357 		spin_unlock(ptl);
358 		if (page)
359 			return page;
360 	}
361 	if (unlikely(pud_bad(*pud)))
362 		return no_page_table(vma, flags);
363 
364 	return follow_pmd_mask(vma, address, pud, flags, ctx);
365 }
366 
367 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
368 				    unsigned long address, pgd_t *pgdp,
369 				    unsigned int flags,
370 				    struct follow_page_context *ctx)
371 {
372 	p4d_t *p4d;
373 	struct page *page;
374 
375 	p4d = p4d_offset(pgdp, address);
376 	if (p4d_none(*p4d))
377 		return no_page_table(vma, flags);
378 	BUILD_BUG_ON(p4d_huge(*p4d));
379 	if (unlikely(p4d_bad(*p4d)))
380 		return no_page_table(vma, flags);
381 
382 	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
383 		page = follow_huge_pd(vma, address,
384 				      __hugepd(p4d_val(*p4d)), flags,
385 				      P4D_SHIFT);
386 		if (page)
387 			return page;
388 		return no_page_table(vma, flags);
389 	}
390 	return follow_pud_mask(vma, address, p4d, flags, ctx);
391 }
392 
393 /**
394  * follow_page_mask - look up a page descriptor from a user-virtual address
395  * @vma: vm_area_struct mapping @address
396  * @address: virtual address to look up
397  * @flags: flags modifying lookup behaviour
398  * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and
399  *       the output page_mask
400  *
401  * @flags can have FOLL_ flags set, defined in <linux/mm.h>
402  *
403  * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
404  * the device's dev_pagemap metadata to avoid repeating expensive lookups.
405  *
406  * On output, the @ctx->page_mask is set according to the size of the page.
407  *
408  * Return: the mapped (struct page *), %NULL if no mapping exists, or
409  * an error pointer if there is a mapping to something not represented
410  * by a page descriptor (see also vm_normal_page()).
411  */
412 struct page *follow_page_mask(struct vm_area_struct *vma,
413 			      unsigned long address, unsigned int flags,
414 			      struct follow_page_context *ctx)
415 {
416 	pgd_t *pgd;
417 	struct page *page;
418 	struct mm_struct *mm = vma->vm_mm;
419 
420 	ctx->page_mask = 0;
421 
422 	/* make this handle hugepd */
423 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
424 	if (!IS_ERR(page)) {
425 		BUG_ON(flags & FOLL_GET);
426 		return page;
427 	}
428 
429 	pgd = pgd_offset(mm, address);
430 
431 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
432 		return no_page_table(vma, flags);
433 
434 	if (pgd_huge(*pgd)) {
435 		page = follow_huge_pgd(mm, address, pgd, flags);
436 		if (page)
437 			return page;
438 		return no_page_table(vma, flags);
439 	}
440 	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
441 		page = follow_huge_pd(vma, address,
442 				      __hugepd(pgd_val(*pgd)), flags,
443 				      PGDIR_SHIFT);
444 		if (page)
445 			return page;
446 		return no_page_table(vma, flags);
447 	}
448 
449 	return follow_p4d_mask(vma, address, pgd, flags, ctx);
450 }
451 
452 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
453 			 unsigned int foll_flags)
454 {
455 	struct follow_page_context ctx = { NULL };
456 	struct page *page;
457 
458 	page = follow_page_mask(vma, address, foll_flags, &ctx);
459 	if (ctx.pgmap)
460 		put_dev_pagemap(ctx.pgmap);
461 	return page;
462 }
463 
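/*
 * Illustrative sketch only, not part of the original file: a minimal
 * caller of follow_page().  The helper name, the find_vma() lookup and
 * the FOLL_GET usage are assumptions made for the example; mmap_sem must
 * be held across the lookup, and a page obtained with FOLL_GET must be
 * released with put_page() when the caller is done with it.
 */
static struct page *example_lookup_user_page(struct mm_struct *mm,
					     unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page = NULL;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);
	if (vma && addr >= vma->vm_start) {
		page = follow_page(vma, addr, FOLL_GET);
		if (IS_ERR_OR_NULL(page))
			page = NULL;
	}
	up_read(&mm->mmap_sem);

	return page;
}
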
464 static int get_gate_page(struct mm_struct *mm, unsigned long address,
465 		unsigned int gup_flags, struct vm_area_struct **vma,
466 		struct page **page)
467 {
468 	pgd_t *pgd;
469 	p4d_t *p4d;
470 	pud_t *pud;
471 	pmd_t *pmd;
472 	pte_t *pte;
473 	int ret = -EFAULT;
474 
475 	/* user gate pages are read-only */
476 	if (gup_flags & FOLL_WRITE)
477 		return -EFAULT;
478 	if (address > TASK_SIZE)
479 		pgd = pgd_offset_k(address);
480 	else
481 		pgd = pgd_offset_gate(mm, address);
482 	BUG_ON(pgd_none(*pgd));
483 	p4d = p4d_offset(pgd, address);
484 	BUG_ON(p4d_none(*p4d));
485 	pud = pud_offset(p4d, address);
486 	BUG_ON(pud_none(*pud));
487 	pmd = pmd_offset(pud, address);
488 	if (!pmd_present(*pmd))
489 		return -EFAULT;
490 	VM_BUG_ON(pmd_trans_huge(*pmd));
491 	pte = pte_offset_map(pmd, address);
492 	if (pte_none(*pte))
493 		goto unmap;
494 	*vma = get_gate_vma(mm);
495 	if (!page)
496 		goto out;
497 	*page = vm_normal_page(*vma, address, *pte);
498 	if (!*page) {
499 		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
500 			goto unmap;
501 		*page = pte_page(*pte);
502 
503 		/*
504 		 * This should never happen (a device public page in the gate
505 		 * area).
506 		 */
507 		if (is_device_public_page(*page))
508 			goto unmap;
509 	}
510 	if (unlikely(!try_get_page(*page))) {
511 		ret = -ENOMEM;
512 		goto unmap;
513 	}
514 out:
515 	ret = 0;
516 unmap:
517 	pte_unmap(pte);
518 	return ret;
519 }
520 
521 /*
522  * mmap_sem must be held on entry.  If @nonblocking != NULL and
523  * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
524  * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
525  */
526 static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
527 		unsigned long address, unsigned int *flags, int *nonblocking)
528 {
529 	unsigned int fault_flags = 0;
530 	vm_fault_t ret;
531 
532 	/* mlock all present pages, but do not fault in new pages */
533 	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
534 		return -ENOENT;
535 	if (*flags & FOLL_WRITE)
536 		fault_flags |= FAULT_FLAG_WRITE;
537 	if (*flags & FOLL_REMOTE)
538 		fault_flags |= FAULT_FLAG_REMOTE;
539 	if (nonblocking)
540 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
541 	if (*flags & FOLL_NOWAIT)
542 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
543 	if (*flags & FOLL_TRIED) {
544 		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
545 		fault_flags |= FAULT_FLAG_TRIED;
546 	}
547 
548 	ret = handle_mm_fault(vma, address, fault_flags);
549 	if (ret & VM_FAULT_ERROR) {
550 		int err = vm_fault_to_errno(ret, *flags);
551 
552 		if (err)
553 			return err;
554 		BUG();
555 	}
556 
557 	if (tsk) {
558 		if (ret & VM_FAULT_MAJOR)
559 			tsk->maj_flt++;
560 		else
561 			tsk->min_flt++;
562 	}
563 
564 	if (ret & VM_FAULT_RETRY) {
565 		if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
566 			*nonblocking = 0;
567 		return -EBUSY;
568 	}
569 
570 	/*
571 	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
572 	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
573 	 * can thus safely do subsequent page lookups as if they were reads.
574 	 * But only do so when looping for pte_write is futile: in some cases
575 	 * userspace may also be wanting to write to the gotten user page,
576 	 * which a read fault here might prevent (a readonly page might get
577 	 * reCOWed by userspace write).
578 	 */
579 	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
580 		*flags |= FOLL_COW;
581 	return 0;
582 }
583 
584 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
585 {
586 	vm_flags_t vm_flags = vma->vm_flags;
587 	int write = (gup_flags & FOLL_WRITE);
588 	int foreign = (gup_flags & FOLL_REMOTE);
589 
590 	if (vm_flags & (VM_IO | VM_PFNMAP))
591 		return -EFAULT;
592 
593 	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
594 		return -EFAULT;
595 
596 	if (write) {
597 		if (!(vm_flags & VM_WRITE)) {
598 			if (!(gup_flags & FOLL_FORCE))
599 				return -EFAULT;
600 			/*
601 			 * We used to let the write,force case do COW in a
602 			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
603 			 * set a breakpoint in a read-only mapping of an
604 			 * executable, without corrupting the file (yet only
605 			 * when that file had been opened for writing!).
606 			 * Anon pages in shared mappings are surprising: now
607 			 * just reject it.
608 			 */
609 			if (!is_cow_mapping(vm_flags))
610 				return -EFAULT;
611 		}
612 	} else if (!(vm_flags & VM_READ)) {
613 		if (!(gup_flags & FOLL_FORCE))
614 			return -EFAULT;
615 		/*
616 		 * Is there actually any vma we can reach here which does not
617 		 * have VM_MAYREAD set?
618 		 */
619 		if (!(vm_flags & VM_MAYREAD))
620 			return -EFAULT;
621 	}
622 	/*
623 	 * gups are always data accesses, not instruction
624 	 * fetches, so execute=false here
625 	 */
626 	if (!arch_vma_access_permitted(vma, write, false, foreign))
627 		return -EFAULT;
628 	return 0;
629 }
630 
631 /**
632  * __get_user_pages() - pin user pages in memory
633  * @tsk:	task_struct of target task
634  * @mm:		mm_struct of target mm
635  * @start:	starting user address
636  * @nr_pages:	number of pages from start to pin
637  * @gup_flags:	flags modifying pin behaviour
638  * @pages:	array that receives pointers to the pages pinned.
639  *		Should be at least nr_pages long. Or NULL, if caller
640  *		only intends to ensure the pages are faulted in.
641  * @vmas:	array of pointers to vmas corresponding to each page.
642  *		Or NULL if the caller does not require them.
643  * @nonblocking: cleared to 0 if waiting (for disk IO or mmap_sem contention) was needed; may be NULL
644  *
645  * Returns number of pages pinned. This may be fewer than the number
646  * requested. If nr_pages is 0 or negative, returns 0. If no pages
647  * were pinned, returns -errno. Each page returned must be released
648  * with a put_page() call when it is finished with. vmas will only
649  * remain valid while mmap_sem is held.
650  *
651  * Must be called with mmap_sem held.  It may be released.  See below.
652  *
653  * __get_user_pages walks a process's page tables and takes a reference to
654  * each struct page that each user address corresponds to at a given
655  * instant. That is, it takes the page that would be accessed if a user
656  * thread accesses the given user virtual address at that instant.
657  *
658  * This does not guarantee that the page exists in the user mappings when
659  * __get_user_pages returns, and there may even be a completely different
660  * page there in some cases (e.g. if mmapped pagecache has been invalidated
661  * and subsequently refaulted). However it does guarantee that the page
662  * won't be freed completely. And mostly callers simply care that the page
663  * contains data that was valid *at some point in time*. Typically, an IO
664  * or similar operation cannot guarantee anything stronger anyway because
665  * locks can't be held over the syscall boundary.
666  *
667  * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
668  * the page is written to, set_page_dirty (or set_page_dirty_lock, as
669  * appropriate) must be called after the page is finished with, and
670  * before put_page is called.
671  *
672  * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
673  * or mmap_sem contention, and if waiting is needed to pin all pages,
674  * *@nonblocking will be set to 0.  Further, if @gup_flags does not
675  * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
676  * this case.
677  *
678  * A caller using such a combination of @nonblocking and @gup_flags
679  * must therefore hold the mmap_sem for reading only, and recognize
680  * when it's been released.  Otherwise, it must be held for either
681  * reading or writing and will not be released.
682  *
683  * In most cases, get_user_pages or get_user_pages_fast should be used
684  * instead of __get_user_pages. __get_user_pages should be used only if
685  * you need some special @gup_flags.
686  */
687 static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
688 		unsigned long start, unsigned long nr_pages,
689 		unsigned int gup_flags, struct page **pages,
690 		struct vm_area_struct **vmas, int *nonblocking)
691 {
692 	long ret = 0, i = 0;
693 	struct vm_area_struct *vma = NULL;
694 	struct follow_page_context ctx = { NULL };
695 
696 	if (!nr_pages)
697 		return 0;
698 
699 	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
700 
701 	/*
702 	 * If FOLL_FORCE is set then do not force a full fault as the hinting
703 	 * fault information is unrelated to the reference behaviour of a task
704 	 * using the address space
705 	 */
706 	if (!(gup_flags & FOLL_FORCE))
707 		gup_flags |= FOLL_NUMA;
708 
709 	do {
710 		struct page *page;
711 		unsigned int foll_flags = gup_flags;
712 		unsigned int page_increm;
713 
714 		/* first iteration or cross vma bound */
715 		if (!vma || start >= vma->vm_end) {
716 			vma = find_extend_vma(mm, start);
717 			if (!vma && in_gate_area(mm, start)) {
718 				ret = get_gate_page(mm, start & PAGE_MASK,
719 						gup_flags, &vma,
720 						pages ? &pages[i] : NULL);
721 				if (ret)
722 					goto out;
723 				ctx.page_mask = 0;
724 				goto next_page;
725 			}
726 
727 			if (!vma || check_vma_flags(vma, gup_flags)) {
728 				ret = -EFAULT;
729 				goto out;
730 			}
731 			if (is_vm_hugetlb_page(vma)) {
732 				i = follow_hugetlb_page(mm, vma, pages, vmas,
733 						&start, &nr_pages, i,
734 						gup_flags, nonblocking);
735 				continue;
736 			}
737 		}
738 retry:
739 		/*
740 		 * If we have a pending SIGKILL, don't keep faulting pages and
741 		 * potentially allocating memory.
742 		 */
743 		if (fatal_signal_pending(current)) {
744 			ret = -ERESTARTSYS;
745 			goto out;
746 		}
747 		cond_resched();
748 
749 		page = follow_page_mask(vma, start, foll_flags, &ctx);
750 		if (!page) {
751 			ret = faultin_page(tsk, vma, start, &foll_flags,
752 					nonblocking);
753 			switch (ret) {
754 			case 0:
755 				goto retry;
756 			case -EBUSY:
757 				ret = 0;
758 				/* FALLTHRU */
759 			case -EFAULT:
760 			case -ENOMEM:
761 			case -EHWPOISON:
762 				goto out;
763 			case -ENOENT:
764 				goto next_page;
765 			}
766 			BUG();
767 		} else if (PTR_ERR(page) == -EEXIST) {
768 			/*
769 			 * Proper page table entry exists, but no corresponding
770 			 * struct page.
771 			 */
772 			goto next_page;
773 		} else if (IS_ERR(page)) {
774 			ret = PTR_ERR(page);
775 			goto out;
776 		}
777 		if (pages) {
778 			pages[i] = page;
779 			flush_anon_page(vma, page, start);
780 			flush_dcache_page(page);
781 			ctx.page_mask = 0;
782 		}
783 next_page:
784 		if (vmas) {
785 			vmas[i] = vma;
786 			ctx.page_mask = 0;
787 		}
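		/*
		 * ctx.page_mask is (pages in the mapping - 1) for a huge
		 * mapping and 0 otherwise (it is also cleared above whenever
		 * individual pages/vmas are recorded), so this advances past
		 * the remaining subpages of a huge mapping in one step
		 * instead of re-walking each of them.
		 */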
788 		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
789 		if (page_increm > nr_pages)
790 			page_increm = nr_pages;
791 		i += page_increm;
792 		start += page_increm * PAGE_SIZE;
793 		nr_pages -= page_increm;
794 	} while (nr_pages);
795 out:
796 	if (ctx.pgmap)
797 		put_dev_pagemap(ctx.pgmap);
798 	return i ? i : ret;
799 }
800 
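/*
 * Illustrative sketch only, not part of the original file: the release
 * pattern required by the __get_user_pages() documentation above when
 * pages were pinned with FOLL_WRITE and then written to -- mark each page
 * dirty before dropping the reference.  The helper name is an assumption
 * made for the example.
 */
static void example_release_written_pages(struct page **pages, long nr)
{
	long i;

	for (i = 0; i < nr; i++) {
		set_page_dirty_lock(pages[i]);
		put_page(pages[i]);
	}
}
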
801 static bool vma_permits_fault(struct vm_area_struct *vma,
802 			      unsigned int fault_flags)
803 {
804 	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
805 	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
806 	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
807 
808 	if (!(vm_flags & vma->vm_flags))
809 		return false;
810 
811 	/*
812 	 * The architecture might have a hardware protection
813 	 * mechanism other than read/write that can deny access.
814 	 *
815 	 * gup always represents data access, not instruction
816 	 * fetches, so execute=false here:
817 	 */
818 	if (!arch_vma_access_permitted(vma, write, false, foreign))
819 		return false;
820 
821 	return true;
822 }
823 
824 /*
825  * fixup_user_fault() - manually resolve a user page fault
826  * @tsk:	the task_struct to use for page fault accounting, or
827  *		NULL if faults are not to be recorded.
828  * @mm:		mm_struct of target mm
829  * @address:	user address
830  * @fault_flags:flags to pass down to handle_mm_fault()
831  * @unlocked:	did we unlock the mmap_sem while retrying, maybe NULL if caller
832  *		does not allow retry
833  *
834  * This is meant to be called in the specific scenario where for locking reasons
835  * we try to access user memory in atomic context (within a pagefault_disable()
836  * section), this returns -EFAULT, and we want to resolve the user fault before
837  * trying again.
838  *
839  * Typically this is meant to be used by the futex code.
840  *
841  * The main difference with get_user_pages() is that this function will
842  * unconditionally call handle_mm_fault() which will in turn perform all the
843  * necessary SW fixup of the dirty and young bits in the PTE, while
844  * get_user_pages() only guarantees to update these in the struct page.
845  *
846  * This is important for some architectures where those bits also gate the
847  * access permission to the page because they are maintained in software.  On
848  * such architectures, gup() will not be enough to make a subsequent access
849  * succeed.
850  *
851  * This function will not return with an unlocked mmap_sem, so it does not have
852  * the same semantics with respect to @mm->mmap_sem as filemap_fault() does.
853  */
854 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
855 		     unsigned long address, unsigned int fault_flags,
856 		     bool *unlocked)
857 {
858 	struct vm_area_struct *vma;
859 	vm_fault_t ret, major = 0;
860 
861 	if (unlocked)
862 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
863 
864 retry:
865 	vma = find_extend_vma(mm, address);
866 	if (!vma || address < vma->vm_start)
867 		return -EFAULT;
868 
869 	if (!vma_permits_fault(vma, fault_flags))
870 		return -EFAULT;
871 
872 	ret = handle_mm_fault(vma, address, fault_flags);
873 	major |= ret & VM_FAULT_MAJOR;
874 	if (ret & VM_FAULT_ERROR) {
875 		int err = vm_fault_to_errno(ret, 0);
876 
877 		if (err)
878 			return err;
879 		BUG();
880 	}
881 
882 	if (ret & VM_FAULT_RETRY) {
883 		down_read(&mm->mmap_sem);
884 		if (!(fault_flags & FAULT_FLAG_TRIED)) {
885 			*unlocked = true;
886 			fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
887 			fault_flags |= FAULT_FLAG_TRIED;
888 			goto retry;
889 		}
890 	}
891 
892 	if (tsk) {
893 		if (major)
894 			tsk->maj_flt++;
895 		else
896 			tsk->min_flt++;
897 	}
898 	return 0;
899 }
900 EXPORT_SYMBOL_GPL(fixup_user_fault);
901 
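/*
 * Illustrative sketch only, not part of the original file: the futex-style
 * pattern described in the comment above.  The helper name is an assumption
 * made for the example, and <linux/uaccess.h> is assumed for __get_user()
 * and pagefault_disable()/pagefault_enable().  The read is attempted
 * atomically first and, on -EFAULT, the fault is resolved with
 * fixup_user_fault() before retrying.
 */
static int example_read_user_u32(u32 __user *uaddr, u32 *val)
{
	struct mm_struct *mm = current->mm;
	u32 tmp;
	int ret;

	for (;;) {
		pagefault_disable();
		ret = __get_user(tmp, uaddr);
		pagefault_enable();
		if (!ret) {
			*val = tmp;
			return 0;
		}

		down_read(&mm->mmap_sem);
		ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 0, NULL);
		up_read(&mm->mmap_sem);
		if (ret)
			return ret;
	}
}
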
902 static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
903 						struct mm_struct *mm,
904 						unsigned long start,
905 						unsigned long nr_pages,
906 						struct page **pages,
907 						struct vm_area_struct **vmas,
908 						int *locked,
909 						unsigned int flags)
910 {
911 	long ret, pages_done;
912 	bool lock_dropped;
913 
914 	if (locked) {
915 		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
916 		BUG_ON(vmas);
917 		/* check caller initialized locked */
918 		BUG_ON(*locked != 1);
919 	}
920 
921 	if (pages)
922 		flags |= FOLL_GET;
923 
924 	pages_done = 0;
925 	lock_dropped = false;
926 	for (;;) {
927 		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
928 				       vmas, locked);
929 		if (!locked)
930 			/* VM_FAULT_RETRY couldn't trigger, bypass */
931 			return ret;
932 
933 		/* VM_FAULT_RETRY cannot return errors */
934 		if (!*locked) {
935 			BUG_ON(ret < 0);
936 			BUG_ON(ret >= nr_pages);
937 		}
938 
939 		if (!pages)
940 			/* If it's a prefault don't insist harder */
941 			return ret;
942 
943 		if (ret > 0) {
944 			nr_pages -= ret;
945 			pages_done += ret;
946 			if (!nr_pages)
947 				break;
948 		}
949 		if (*locked) {
950 			/*
951 			 * VM_FAULT_RETRY didn't trigger or it was a
952 			 * FOLL_NOWAIT.
953 			 */
954 			if (!pages_done)
955 				pages_done = ret;
956 			break;
957 		}
958 		/* VM_FAULT_RETRY triggered, so seek to the faulting offset */
959 		pages += ret;
960 		start += ret << PAGE_SHIFT;
961 
962 		/*
963 		 * Repeat on the address that fired VM_FAULT_RETRY
964 		 * without FAULT_FLAG_ALLOW_RETRY but with
965 		 * FAULT_FLAG_TRIED.
966 		 */
967 		*locked = 1;
968 		lock_dropped = true;
969 		down_read(&mm->mmap_sem);
970 		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
971 				       pages, NULL, NULL);
972 		if (ret != 1) {
973 			BUG_ON(ret > 1);
974 			if (!pages_done)
975 				pages_done = ret;
976 			break;
977 		}
978 		nr_pages--;
979 		pages_done++;
980 		if (!nr_pages)
981 			break;
982 		pages++;
983 		start += PAGE_SIZE;
984 	}
985 	if (lock_dropped && *locked) {
986 		/*
987 		 * We must let the caller know we temporarily dropped the lock
988 		 * and so the critical section protected by it was lost.
989 		 */
990 		up_read(&mm->mmap_sem);
991 		*locked = 0;
992 	}
993 	return pages_done;
994 }
995 
996 /*
997  * We can leverage the VM_FAULT_RETRY functionality in the page fault
998  * paths better by using either get_user_pages_locked() or
999  * get_user_pages_unlocked().
1000  *
1001  * get_user_pages_locked() is suitable to replace the form:
1002  *
1003  *      down_read(&mm->mmap_sem);
1004  *      do_something()
1005  *      get_user_pages(tsk, mm, ..., pages, NULL);
1006  *      up_read(&mm->mmap_sem);
1007  *
1008  *  to:
1009  *
1010  *      int locked = 1;
1011  *      down_read(&mm->mmap_sem);
1012  *      do_something()
1013  *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
1014  *      if (locked)
1015  *          up_read(&mm->mmap_sem);
1016  */
1017 long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1018 			   unsigned int gup_flags, struct page **pages,
1019 			   int *locked)
1020 {
1021 	return __get_user_pages_locked(current, current->mm, start, nr_pages,
1022 				       pages, NULL, locked,
1023 				       gup_flags | FOLL_TOUCH);
1024 }
1025 EXPORT_SYMBOL(get_user_pages_locked);
1026 
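/*
 * Illustrative sketch only, not part of the original file: the pattern from
 * the comment above as a complete helper.  The helper name and the
 * FOLL_WRITE flag are assumptions made for the example; the caller must
 * check @locked to know whether mmap_sem is still held on return.
 */
static long example_pin_buffer_locked(unsigned long start, unsigned long nr_pages,
				      struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long pinned;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages_locked(start, nr_pages, FOLL_WRITE, pages, &locked);
	if (locked)
		up_read(&mm->mmap_sem);

	return pinned;	/* each pinned page must later be put_page()d */
}
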
1027 /*
1028  * get_user_pages_unlocked() is suitable to replace the form:
1029  *
1030  *      down_read(&mm->mmap_sem);
1031  *      get_user_pages(tsk, mm, ..., pages, NULL);
1032  *      up_read(&mm->mmap_sem);
1033  *
1034  *  with:
1035  *
1036  *      get_user_pages_unlocked(tsk, mm, ..., pages);
1037  *
1038  * It is functionally equivalent to get_user_pages_fast so
1039  * get_user_pages_fast should be used instead if specific gup_flags
1040  * (e.g. FOLL_FORCE) are not required.
1041  */
1042 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1043 			     struct page **pages, unsigned int gup_flags)
1044 {
1045 	struct mm_struct *mm = current->mm;
1046 	int locked = 1;
1047 	long ret;
1048 
1049 	down_read(&mm->mmap_sem);
1050 	ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
1051 				      &locked, gup_flags | FOLL_TOUCH);
1052 	if (locked)
1053 		up_read(&mm->mmap_sem);
1054 	return ret;
1055 }
1056 EXPORT_SYMBOL(get_user_pages_unlocked);
1057 
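/*
 * Illustrative sketch only, not part of the original file: pinning a single
 * page for read with get_user_pages_unlocked(), which takes and drops
 * mmap_sem internally.  The helper name and error handling are assumptions
 * made for the example.
 */
static int example_pin_one_page(unsigned long uaddr, struct page **pagep)
{
	long got;

	got = get_user_pages_unlocked(uaddr & PAGE_MASK, 1, pagep, 0);
	if (got < 0)
		return got;
	if (got != 1)
		return -EFAULT;

	/* ... access the page, then release it with put_page(*pagep) ... */
	return 0;
}
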
1058 /*
1059  * get_user_pages_remote() - pin user pages in memory
1060  * @tsk:	the task_struct to use for page fault accounting, or
1061  *		NULL if faults are not to be recorded.
1062  * @mm:		mm_struct of target mm
1063  * @start:	starting user address
1064  * @nr_pages:	number of pages from start to pin
1065  * @gup_flags:	flags modifying lookup behaviour
1066  * @pages:	array that receives pointers to the pages pinned.
1067  *		Should be at least nr_pages long. Or NULL, if caller
1068  *		only intends to ensure the pages are faulted in.
1069  * @vmas:	array of pointers to vmas corresponding to each page.
1070  *		Or NULL if the caller does not require them.
1071  * @locked:	pointer to lock flag indicating whether lock is held and
1072  *		subsequently whether VM_FAULT_RETRY functionality can be
1073  *		utilised. Lock must initially be held.
1074  *
1075  * Returns number of pages pinned. This may be fewer than the number
1076  * requested. If nr_pages is 0 or negative, returns 0. If no pages
1077  * were pinned, returns -errno. Each page returned must be released
1078  * with a put_page() call when it is finished with. vmas will only
1079  * remain valid while mmap_sem is held.
1080  *
1081  * Must be called with mmap_sem held for read or write.
1082  *
1083  * get_user_pages walks a process's page tables and takes a reference to
1084  * each struct page that each user address corresponds to at a given
1085  * instant. That is, it takes the page that would be accessed if a user
1086  * thread accesses the given user virtual address at that instant.
1087  *
1088  * This does not guarantee that the page exists in the user mappings when
1089  * get_user_pages returns, and there may even be a completely different
1090  * page there in some cases (eg. if mmapped pagecache has been invalidated
1091  * and subsequently re faulted). However it does guarantee that the page
1092  * won't be freed completely. And mostly callers simply care that the page
1093  * contains data that was valid *at some point in time*. Typically, an IO
1094  * or similar operation cannot guarantee anything stronger anyway because
1095  * locks can't be held over the syscall boundary.
1096  *
1097  * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1098  * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1099  * be called after the page is finished with, and before put_page is called.
1100  *
1101  * get_user_pages is typically used for fewer-copy IO operations, to get a
1102  * handle on the memory by some means other than accesses via the user virtual
1103  * addresses. The pages may be submitted for DMA to devices or accessed via
1104  * their kernel linear mapping (via the kmap APIs). Care should be taken to
1105  * use the correct cache flushing APIs.
1106  *
1107  * See also get_user_pages_fast, for performance critical applications.
1108  *
1109  * get_user_pages should be phased out in favor of
1110  * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1111  * should use get_user_pages because it cannot pass
1112  * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1113  */
1114 long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1115 		unsigned long start, unsigned long nr_pages,
1116 		unsigned int gup_flags, struct page **pages,
1117 		struct vm_area_struct **vmas, int *locked)
1118 {
1119 	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1120 				       locked,
1121 				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1122 }
1123 EXPORT_SYMBOL(get_user_pages_remote);
1124 
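/*
 * Illustrative sketch only, not part of the original file: pinning one page
 * of a foreign mm, as a ptrace-style caller might.  The helper name, the
 * get_task_mm() reference and the read-only (gup_flags == 0) access are
 * assumptions made for the example.
 */
static struct page *example_pin_remote_page(struct task_struct *tsk,
					    unsigned long addr)
{
	struct mm_struct *mm = get_task_mm(tsk);
	struct page *page = NULL;
	long got;

	if (!mm)
		return NULL;

	down_read(&mm->mmap_sem);
	got = get_user_pages_remote(tsk, mm, addr & PAGE_MASK, 1, 0,
				    &page, NULL, NULL);
	up_read(&mm->mmap_sem);
	mmput(mm);

	return got == 1 ? page : NULL;	/* put_page() when done */
}
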
1125 /*
1126  * This is the same as get_user_pages_remote(), just with a
1127  * less-flexible calling convention where we assume that the task
1128  * and mm being operated on are the current task's and don't allow
1129  * passing of a locked parameter.  We also obviously don't pass
1130  * FOLL_REMOTE in here.
1131  */
1132 long get_user_pages(unsigned long start, unsigned long nr_pages,
1133 		unsigned int gup_flags, struct page **pages,
1134 		struct vm_area_struct **vmas)
1135 {
1136 	return __get_user_pages_locked(current, current->mm, start, nr_pages,
1137 				       pages, vmas, NULL,
1138 				       gup_flags | FOLL_TOUCH);
1139 }
1140 EXPORT_SYMBOL(get_user_pages);
1141 
1142 #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
1143 
1144 #ifdef CONFIG_FS_DAX
1145 static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
1146 {
1147 	long i;
1148 	struct vm_area_struct *vma_prev = NULL;
1149 
1150 	for (i = 0; i < nr_pages; i++) {
1151 		struct vm_area_struct *vma = vmas[i];
1152 
1153 		if (vma == vma_prev)
1154 			continue;
1155 
1156 		vma_prev = vma;
1157 
1158 		if (vma_is_fsdax(vma))
1159 			return true;
1160 	}
1161 	return false;
1162 }
1163 #else
1164 static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
1165 {
1166 	return false;
1167 }
1168 #endif
1169 
1170 #ifdef CONFIG_CMA
1171 static struct page *new_non_cma_page(struct page *page, unsigned long private)
1172 {
1173 	/*
1174 	 * We want to make sure we allocate the new page from the same node
1175 	 * as the source page.
1176 	 */
1177 	int nid = page_to_nid(page);
1178 	/*
1179 	 * Trying to allocate a page for migration. Ignore allocation
1180 	 * failure warnings. We don't force __GFP_THISNODE here because
1181 	 * this node is the node where we have the CMA reservation, and
1182 	 * in some cases such nodes will have very little non-movable
1183 	 * memory available for allocation.
1184 	 */
1185 	gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
1186 
1187 	if (PageHighMem(page))
1188 		gfp_mask |= __GFP_HIGHMEM;
1189 
1190 #ifdef CONFIG_HUGETLB_PAGE
1191 	if (PageHuge(page)) {
1192 		struct hstate *h = page_hstate(page);
1193 		/*
1194 		 * We don't want to dequeue from the pool because pool pages will
1195 		 * mostly be from the CMA region.
1196 		 */
1197 		return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1198 	}
1199 #endif
1200 	if (PageTransHuge(page)) {
1201 		struct page *thp;
1202 		/*
1203 		 * ignore allocation failure warnings
1204 		 */
1205 		gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
1206 
1207 		/*
1208 		 * Remove the movable mask so that we don't allocate from
1209 		 * CMA area again.
1210 		 */
1211 		thp_gfpmask &= ~__GFP_MOVABLE;
1212 		thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
1213 		if (!thp)
1214 			return NULL;
1215 		prep_transhuge_page(thp);
1216 		return thp;
1217 	}
1218 
1219 	return __alloc_pages_node(nid, gfp_mask, 0);
1220 }
1221 
1222 static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
1223 					unsigned int gup_flags,
1224 					struct page **pages,
1225 					struct vm_area_struct **vmas)
1226 {
1227 	long i;
1228 	bool drain_allow = true;
1229 	bool migrate_allow = true;
1230 	LIST_HEAD(cma_page_list);
1231 
1232 check_again:
1233 	for (i = 0; i < nr_pages; i++) {
1234 		/*
1235 		 * If we get a page from the CMA zone, since we are going to
1236 		 * be pinning these entries, we might as well move them out
1237 		 * of the CMA zone if possible.
1238 		 */
1239 		if (is_migrate_cma_page(pages[i])) {
1240 
1241 			struct page *head = compound_head(pages[i]);
1242 
1243 			if (PageHuge(head)) {
1244 				isolate_huge_page(head, &cma_page_list);
1245 			} else {
1246 				if (!PageLRU(head) && drain_allow) {
1247 					lru_add_drain_all();
1248 					drain_allow = false;
1249 				}
1250 
1251 				if (!isolate_lru_page(head)) {
1252 					list_add_tail(&head->lru, &cma_page_list);
1253 					mod_node_page_state(page_pgdat(head),
1254 							    NR_ISOLATED_ANON +
1255 							    page_is_file_cache(head),
1256 							    hpage_nr_pages(head));
1257 				}
1258 			}
1259 		}
1260 	}
1261 
1262 	if (!list_empty(&cma_page_list)) {
1263 		/*
1264 		 * drop the above get_user_pages reference.
1265 		 */
1266 		for (i = 0; i < nr_pages; i++)
1267 			put_page(pages[i]);
1268 
1269 		if (migrate_pages(&cma_page_list, new_non_cma_page,
1270 				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
1271 			/*
1272 			 * some of the pages failed migration. Do get_user_pages
1273 			 * without migration.
1274 			 */
1275 			migrate_allow = false;
1276 
1277 			if (!list_empty(&cma_page_list))
1278 				putback_movable_pages(&cma_page_list);
1279 		}
1280 		/*
1281 		 * Retake the page references for the whole range and, if migration
1282 		 * is still allowed, retry for any CMA pages we failed to isolate earlier.
1283 		 */
1284 		nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1285 		if ((nr_pages > 0) && migrate_allow) {
1286 			drain_allow = true;
1287 			goto check_again;
1288 		}
1289 	}
1290 
1291 	return nr_pages;
1292 }
1293 #else
1294 static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
1295 					       unsigned int gup_flags,
1296 					       struct page **pages,
1297 					       struct vm_area_struct **vmas)
1298 {
1299 	return nr_pages;
1300 }
1301 #endif
1302 
1303 /*
1304  * This is the same as get_user_pages() in that it assumes we are
1305  * operating on the current task's mm, but it goes further to validate
1306  * that the vmas associated with the address range are suitable for
1307  * longterm elevated page reference counts. For example, filesystem-dax
1308  * mappings are subject to the lifetime enforced by the filesystem and
1309  * we need guarantees that longterm users like RDMA and V4L2 only
1310  * establish mappings that have a kernel enforced revocation mechanism.
1311  *
1312  * "longterm" == userspace controlled elevated page count lifetime.
1313  * Contrast this to iov_iter_get_pages() usages which are transient.
1314  */
1315 long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
1316 			     unsigned int gup_flags, struct page **pages,
1317 			     struct vm_area_struct **vmas_arg)
1318 {
1319 	struct vm_area_struct **vmas = vmas_arg;
1320 	unsigned long flags;
1321 	long rc, i;
1322 
1323 	if (!pages)
1324 		return -EINVAL;
1325 
1326 	if (!vmas) {
1327 		vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
1328 			       GFP_KERNEL);
1329 		if (!vmas)
1330 			return -ENOMEM;
1331 	}
1332 
1333 	flags = memalloc_nocma_save();
1334 	rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1335 	memalloc_nocma_restore(flags);
1336 	if (rc < 0)
1337 		goto out;
1338 
1339 	if (check_dax_vmas(vmas, rc)) {
1340 		for (i = 0; i < rc; i++)
1341 			put_page(pages[i]);
1342 		rc = -EOPNOTSUPP;
1343 		goto out;
1344 	}
1345 
1346 	rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
1347 out:
1348 	if (vmas != vmas_arg)
1349 		kfree(vmas);
1350 	return rc;
1351 }
1352 EXPORT_SYMBOL(get_user_pages_longterm);
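
/*
 * Illustrative sketch only, not part of the original file: a longterm pin
 * as an RDMA-style user might take it.  The helper name and the FOLL_WRITE
 * flag are assumptions made for the example; -EOPNOTSUPP indicates a
 * filesystem-dax vma that must not be pinned indefinitely.
 */
static long example_longterm_pin(unsigned long start, unsigned long nr_pages,
				 struct page **pages)
{
	long pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages_longterm(start, nr_pages, FOLL_WRITE,
					 pages, NULL);
	up_read(&current->mm->mmap_sem);

	return pinned;	/* references are held until the user tears down the I/O */
}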
1353 #endif /* CONFIG_FS_DAX || CONFIG_CMA */
1354 
1355 /**
1356  * populate_vma_page_range() -  populate a range of pages in the vma.
1357  * @vma:   target vma
1358  * @start: start address
1359  * @end:   end address
1360  * @nonblocking: cleared to 0 if mmap_sem is released while faulting (may be NULL)
1361  *
1362  * This takes care of mlocking the pages too if VM_LOCKED is set.
1363  *
1364  * return 0 on success, negative error code on error.
1365  *
1366  * vma->vm_mm->mmap_sem must be held.
1367  *
1368  * If @nonblocking is NULL, mmap_sem may be held for read or write and will
1369  * be unperturbed.
1370  *
1371  * If @nonblocking is non-NULL, mmap_sem must be held for read only and may
1372  * be released.  If it is released, *@nonblocking will be set to 0.
1373  */
1374 long populate_vma_page_range(struct vm_area_struct *vma,
1375 		unsigned long start, unsigned long end, int *nonblocking)
1376 {
1377 	struct mm_struct *mm = vma->vm_mm;
1378 	unsigned long nr_pages = (end - start) / PAGE_SIZE;
1379 	int gup_flags;
1380 
1381 	VM_BUG_ON(start & ~PAGE_MASK);
1382 	VM_BUG_ON(end   & ~PAGE_MASK);
1383 	VM_BUG_ON_VMA(start < vma->vm_start, vma);
1384 	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
1385 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1386 
1387 	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
1388 	if (vma->vm_flags & VM_LOCKONFAULT)
1389 		gup_flags &= ~FOLL_POPULATE;
1390 	/*
1391 	 * We want to touch writable mappings with a write fault in order
1392 	 * to break COW, except for shared mappings because these don't COW
1393 	 * and we would not want to dirty them for nothing.
1394 	 */
1395 	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1396 		gup_flags |= FOLL_WRITE;
1397 
1398 	/*
1399 	 * We want mlock to succeed for regions that have any permissions
1400 	 * other than PROT_NONE.
1401 	 */
1402 	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
1403 		gup_flags |= FOLL_FORCE;
1404 
1405 	/*
1406 	 * We made sure addr is within a VMA, so the following will
1407 	 * not result in a stack expansion that recurses back here.
1408 	 */
1409 	return __get_user_pages(current, mm, start, nr_pages, gup_flags,
1410 				NULL, NULL, nonblocking);
1411 }
1412 
1413 /*
1414  * __mm_populate - populate and/or mlock pages within a range of address space.
1415  *
1416  * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1417  * flags. VMAs must be already marked with the desired vm_flags, and
1418  * mmap_sem must not be held.
1419  */
1420 int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1421 {
1422 	struct mm_struct *mm = current->mm;
1423 	unsigned long end, nstart, nend;
1424 	struct vm_area_struct *vma = NULL;
1425 	int locked = 0;
1426 	long ret = 0;
1427 
1428 	end = start + len;
1429 
1430 	for (nstart = start; nstart < end; nstart = nend) {
1431 		/*
1432 		 * We want to fault in pages for [nstart; end) address range.
1433 		 * Find first corresponding VMA.
1434 		 */
1435 		if (!locked) {
1436 			locked = 1;
1437 			down_read(&mm->mmap_sem);
1438 			vma = find_vma(mm, nstart);
1439 		} else if (nstart >= vma->vm_end)
1440 			vma = vma->vm_next;
1441 		if (!vma || vma->vm_start >= end)
1442 			break;
1443 		/*
1444 		 * Set [nstart; nend) to intersection of desired address
1445 		 * range with the first VMA. Also, skip undesirable VMA types.
1446 		 */
1447 		nend = min(end, vma->vm_end);
1448 		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1449 			continue;
1450 		if (nstart < vma->vm_start)
1451 			nstart = vma->vm_start;
1452 		/*
1453 		 * Now fault in a range of pages. populate_vma_page_range()
1454 		 * double checks the vma flags, so that it won't mlock pages
1455 		 * if the vma was already munlocked.
1456 		 */
1457 		ret = populate_vma_page_range(vma, nstart, nend, &locked);
1458 		if (ret < 0) {
1459 			if (ignore_errors) {
1460 				ret = 0;
1461 				continue;	/* continue at next VMA */
1462 			}
1463 			break;
1464 		}
1465 		nend = nstart + ret * PAGE_SIZE;
1466 		ret = 0;
1467 	}
1468 	if (locked)
1469 		up_read(&mm->mmap_sem);
1470 	return ret;	/* 0 or negative error code */
1471 }
1472 
1473 /**
1474  * get_dump_page() - pin user page in memory while writing it to core dump
1475  * @addr: user address
1476  *
1477  * Returns struct page pointer of user page pinned for dump,
1478  * to be freed afterwards by put_page().
1479  *
1480  * Returns NULL on any kind of failure - a hole must then be inserted into
1481  * the corefile, to preserve alignment with its headers; and also returns
1482  * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1483  * allowing a hole to be left in the corefile to save diskspace.
1484  *
1485  * Called without mmap_sem, but after all other threads have been killed.
1486  */
1487 #ifdef CONFIG_ELF_CORE
1488 struct page *get_dump_page(unsigned long addr)
1489 {
1490 	struct vm_area_struct *vma;
1491 	struct page *page;
1492 
1493 	if (__get_user_pages(current, current->mm, addr, 1,
1494 			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1495 			     NULL) < 1)
1496 		return NULL;
1497 	flush_cache_page(vma, addr, page_to_pfn(page));
1498 	return page;
1499 }
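
/*
 * Illustrative sketch only, not part of the original file: how a core dump
 * writer consumes get_dump_page(), modelled on the ELF dumper.  The helper
 * name is an assumption made for the example, and <linux/coredump.h> plus
 * <linux/highmem.h> are assumed for dump_emit()/dump_skip()/kmap().
 */
static int example_dump_one_page(struct coredump_params *cprm, unsigned long addr)
{
	struct page *page = get_dump_page(addr);
	int ok;

	if (page) {
		void *kaddr = kmap(page);

		ok = dump_emit(cprm, kaddr, PAGE_SIZE);
		kunmap(page);
		put_page(page);
	} else {
		/* leave a hole in the corefile for an absent or zero page */
		ok = dump_skip(cprm, PAGE_SIZE);
	}
	return ok;
}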
1500 #endif /* CONFIG_ELF_CORE */
1501 
1502 /*
1503  * Generic Fast GUP
1504  *
1505  * get_user_pages_fast attempts to pin user pages by walking the page
1506  * tables directly and avoids taking locks. Thus the walker needs to be
1507  * protected from page table pages being freed from under it, and should
1508  * block any THP splits.
1509  *
1510  * One way to achieve this is to have the walker disable interrupts, and
1511  * rely on IPIs from the TLB flushing code blocking before the page table
1512  * pages are freed. This is unsuitable for architectures that do not need
1513  * to broadcast an IPI when invalidating TLBs.
1514  *
1515  * Another way to achieve this is to batch up the page-table pages belonging
1516  * to more than one mm_user, then schedule an rcu_sched callback to free those
1517  * pages. Disabling interrupts will allow the fast_gup walker to both block
1518  * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
1519  * (which is a relatively rare event). The code below adopts this strategy.
1520  *
1521  * Before activating this code, please be aware that the following assumptions
1522  * are currently made:
1523  *
1524  *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
1525  *  free pages containing page tables or TLB flushing requires IPI broadcast.
1526  *
1527  *  *) ptes can be read atomically by the architecture.
1528  *
1529  *  *) access_ok is sufficient to validate userspace address ranges.
1530  *
1531  * The last two assumptions can be relaxed by the addition of helper functions.
1532  *
1533  * This code is based heavily on the PowerPC implementation by Nick Piggin.
1534  */
1535 #ifdef CONFIG_HAVE_GENERIC_GUP
1536 
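/*
 * Illustrative sketch only, not part of the original file and not compiled
 * (hence the #if 0): the "disable interrupts around the lockless walk"
 * pattern described above.  example_walk_range() is a hypothetical stand-in
 * for the pgd/p4d/pud/pmd/pte walkers below.
 */
#if 0
static int example_fast_pin(unsigned long start, unsigned long end,
			    int write, struct page **pages)
{
	unsigned long flags;
	int nr = 0;

	/* IRQs off: blocks the RCU free of page tables and THP-split IPIs */
	local_irq_save(flags);
	example_walk_range(start, end, write, pages, &nr);
	local_irq_restore(flags);

	return nr;
}
#endif
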
1537 #ifndef gup_get_pte
1538 /*
1539  * We assume that the PTE can be read atomically. If this is not the case for
1540  * your architecture, please provide the helper.
1541  */
1542 static inline pte_t gup_get_pte(pte_t *ptep)
1543 {
1544 	return READ_ONCE(*ptep);
1545 }
1546 #endif
1547 
1548 static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
1549 {
1550 	while ((*nr) - nr_start) {
1551 		struct page *page = pages[--(*nr)];
1552 
1553 		ClearPageReferenced(page);
1554 		put_page(page);
1555 	}
1556 }
1557 
1558 /*
1559  * Return the compound head page with its refcount appropriately incremented,
1560  * or NULL if that failed.
1561  */
1562 static inline struct page *try_get_compound_head(struct page *page, int refs)
1563 {
1564 	struct page *head = compound_head(page);
1565 	if (WARN_ON_ONCE(page_ref_count(head) < 0))
1566 		return NULL;
1567 	if (unlikely(!page_cache_add_speculative(head, refs)))
1568 		return NULL;
1569 	return head;
1570 }
1571 
1572 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
1573 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1574 			 int write, struct page **pages, int *nr)
1575 {
1576 	struct dev_pagemap *pgmap = NULL;
1577 	int nr_start = *nr, ret = 0;
1578 	pte_t *ptep, *ptem;
1579 
1580 	ptem = ptep = pte_offset_map(&pmd, addr);
1581 	do {
1582 		pte_t pte = gup_get_pte(ptep);
1583 		struct page *head, *page;
1584 
1585 		/*
1586 		 * Similar to the PMD case below, NUMA hinting must take slow
1587 		 * path using the pte_protnone check.
1588 		 */
1589 		if (pte_protnone(pte))
1590 			goto pte_unmap;
1591 
1592 		if (!pte_access_permitted(pte, write))
1593 			goto pte_unmap;
1594 
1595 		if (pte_devmap(pte)) {
1596 			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
1597 			if (unlikely(!pgmap)) {
1598 				undo_dev_pagemap(nr, nr_start, pages);
1599 				goto pte_unmap;
1600 			}
1601 		} else if (pte_special(pte))
1602 			goto pte_unmap;
1603 
1604 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1605 		page = pte_page(pte);
1606 
1607 		head = try_get_compound_head(page, 1);
1608 		if (!head)
1609 			goto pte_unmap;
1610 
1611 		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1612 			put_page(head);
1613 			goto pte_unmap;
1614 		}
1615 
1616 		VM_BUG_ON_PAGE(compound_head(page) != head, page);
1617 
1618 		SetPageReferenced(page);
1619 		pages[*nr] = page;
1620 		(*nr)++;
1621 
1622 	} while (ptep++, addr += PAGE_SIZE, addr != end);
1623 
1624 	ret = 1;
1625 
1626 pte_unmap:
1627 	if (pgmap)
1628 		put_dev_pagemap(pgmap);
1629 	pte_unmap(ptem);
1630 	return ret;
1631 }
1632 #else
1633 
1634 /*
1635  * If we can't determine whether or not a pte is special, then fail immediately
1636  * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
1637  * to be special.
1638  *
1639  * For a futex to be placed on a THP tail page, get_futex_key requires a
1640  * __get_user_pages_fast implementation that can pin pages. Thus it's still
1641  * useful to have gup_huge_pmd even if we can't operate on ptes.
1642  */
1643 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1644 			 int write, struct page **pages, int *nr)
1645 {
1646 	return 0;
1647 }
1648 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
1649 
1650 #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1651 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1652 		unsigned long end, struct page **pages, int *nr)
1653 {
1654 	int nr_start = *nr;
1655 	struct dev_pagemap *pgmap = NULL;
1656 
1657 	do {
1658 		struct page *page = pfn_to_page(pfn);
1659 
1660 		pgmap = get_dev_pagemap(pfn, pgmap);
1661 		if (unlikely(!pgmap)) {
1662 			undo_dev_pagemap(nr, nr_start, pages);
1663 			return 0;
1664 		}
1665 		SetPageReferenced(page);
1666 		pages[*nr] = page;
1667 		get_page(page);
1668 		(*nr)++;
1669 		pfn++;
1670 	} while (addr += PAGE_SIZE, addr != end);
1671 
1672 	if (pgmap)
1673 		put_dev_pagemap(pgmap);
1674 	return 1;
1675 }
1676 
1677 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1678 		unsigned long end, struct page **pages, int *nr)
1679 {
1680 	unsigned long fault_pfn;
1681 	int nr_start = *nr;
1682 
1683 	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1684 	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1685 		return 0;
1686 
1687 	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1688 		undo_dev_pagemap(nr, nr_start, pages);
1689 		return 0;
1690 	}
1691 	return 1;
1692 }
1693 
1694 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1695 		unsigned long end, struct page **pages, int *nr)
1696 {
1697 	unsigned long fault_pfn;
1698 	int nr_start = *nr;
1699 
1700 	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1701 	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1702 		return 0;
1703 
1704 	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1705 		undo_dev_pagemap(nr, nr_start, pages);
1706 		return 0;
1707 	}
1708 	return 1;
1709 }
1710 #else
1711 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1712 		unsigned long end, struct page **pages, int *nr)
1713 {
1714 	BUILD_BUG();
1715 	return 0;
1716 }
1717 
1718 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
1719 		unsigned long end, struct page **pages, int *nr)
1720 {
1721 	BUILD_BUG();
1722 	return 0;
1723 }
1724 #endif
1725 
1726 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1727 		unsigned long end, int write, struct page **pages, int *nr)
1728 {
1729 	struct page *head, *page;
1730 	int refs;
1731 
1732 	if (!pmd_access_permitted(orig, write))
1733 		return 0;
1734 
1735 	if (pmd_devmap(orig))
1736 		return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
1737 
1738 	refs = 0;
1739 	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1740 	do {
1741 		pages[*nr] = page;
1742 		(*nr)++;
1743 		page++;
1744 		refs++;
1745 	} while (addr += PAGE_SIZE, addr != end);
1746 
1747 	head = try_get_compound_head(pmd_page(orig), refs);
1748 	if (!head) {
1749 		*nr -= refs;
1750 		return 0;
1751 	}
1752 
1753 	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1754 		*nr -= refs;
1755 		while (refs--)
1756 			put_page(head);
1757 		return 0;
1758 	}
1759 
1760 	SetPageReferenced(head);
1761 	return 1;
1762 }
1763 
1764 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1765 		unsigned long end, int write, struct page **pages, int *nr)
1766 {
1767 	struct page *head, *page;
1768 	int refs;
1769 
1770 	if (!pud_access_permitted(orig, write))
1771 		return 0;
1772 
1773 	if (pud_devmap(orig))
1774 		return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
1775 
1776 	refs = 0;
1777 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1778 	do {
1779 		pages[*nr] = page;
1780 		(*nr)++;
1781 		page++;
1782 		refs++;
1783 	} while (addr += PAGE_SIZE, addr != end);
1784 
1785 	head = try_get_compound_head(pud_page(orig), refs);
1786 	if (!head) {
1787 		*nr -= refs;
1788 		return 0;
1789 	}
1790 
1791 	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1792 		*nr -= refs;
1793 		while (refs--)
1794 			put_page(head);
1795 		return 0;
1796 	}
1797 
1798 	SetPageReferenced(head);
1799 	return 1;
1800 }
1801 
1802 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1803 			unsigned long end, int write,
1804 			struct page **pages, int *nr)
1805 {
1806 	int refs;
1807 	struct page *head, *page;
1808 
1809 	if (!pgd_access_permitted(orig, write))
1810 		return 0;
1811 
1812 	BUILD_BUG_ON(pgd_devmap(orig));
1813 	refs = 0;
1814 	page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
1815 	do {
1816 		pages[*nr] = page;
1817 		(*nr)++;
1818 		page++;
1819 		refs++;
1820 	} while (addr += PAGE_SIZE, addr != end);
1821 
1822 	head = try_get_compound_head(pgd_page(orig), refs);
1823 	if (!head) {
1824 		*nr -= refs;
1825 		return 0;
1826 	}
1827 
1828 	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
1829 		*nr -= refs;
1830 		while (refs--)
1831 			put_page(head);
1832 		return 0;
1833 	}
1834 
1835 	SetPageReferenced(head);
1836 	return 1;
1837 }
1838 
1839 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1840 		int write, struct page **pages, int *nr)
1841 {
1842 	unsigned long next;
1843 	pmd_t *pmdp;
1844 
1845 	pmdp = pmd_offset(&pud, addr);
1846 	do {
1847 		pmd_t pmd = READ_ONCE(*pmdp);
1848 
1849 		next = pmd_addr_end(addr, end);
1850 		if (!pmd_present(pmd))
1851 			return 0;
1852 
1853 		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
1854 			     pmd_devmap(pmd))) {
1855 			/*
1856 			 * NUMA hinting faults need to be handled in the GUP
1857 			 * slowpath for accounting purposes and so that they
1858 			 * can be serialised against THP migration.
1859 			 */
1860 			if (pmd_protnone(pmd))
1861 				return 0;
1862 
1863 			if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
1864 				pages, nr))
1865 				return 0;
1866 
1867 		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
1868 			/*
1869 			 * Architectures may use a different pmd format for
1870 			 * hugetlbfs than for THP; defer to the hugepd walker.
1871 			 */
1872 			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
1873 					 PMD_SHIFT, next, write, pages, nr))
1874 				return 0;
1875 		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
1876 			return 0;
1877 	} while (pmdp++, addr = next, addr != end);
1878 
1879 	return 1;
1880 }
1881 
1882 static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
1883 			 int write, struct page **pages, int *nr)
1884 {
1885 	unsigned long next;
1886 	pud_t *pudp;
1887 
1888 	pudp = pud_offset(&p4d, addr);
1889 	do {
1890 		pud_t pud = READ_ONCE(*pudp);
1891 
1892 		next = pud_addr_end(addr, end);
1893 		if (pud_none(pud))
1894 			return 0;
1895 		if (unlikely(pud_huge(pud))) {
1896 			if (!gup_huge_pud(pud, pudp, addr, next, write,
1897 					  pages, nr))
1898 				return 0;
1899 		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
1900 			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
1901 					 PUD_SHIFT, next, write, pages, nr))
1902 				return 0;
1903 		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
1904 			return 0;
1905 	} while (pudp++, addr = next, addr != end);
1906 
1907 	return 1;
1908 }
1909 
1910 static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
1911 			 int write, struct page **pages, int *nr)
1912 {
1913 	unsigned long next;
1914 	p4d_t *p4dp;
1915 
1916 	p4dp = p4d_offset(&pgd, addr);
1917 	do {
1918 		p4d_t p4d = READ_ONCE(*p4dp);
1919 
1920 		next = p4d_addr_end(addr, end);
1921 		if (p4d_none(p4d))
1922 			return 0;
1923 		BUILD_BUG_ON(p4d_huge(p4d));
1924 		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
1925 			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
1926 					 P4D_SHIFT, next, write, pages, nr))
1927 				return 0;
1928 		} else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
1929 			return 0;
1930 	} while (p4dp++, addr = next, addr != end);
1931 
1932 	return 1;
1933 }
1934 
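/*
 * Top of the lockless page-table walk: each table entry is read once
 * with READ_ONCE() and the walk simply stops at the first hole or entry
 * it cannot handle, leaving *nr at the number of pages pinned so far.
 */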
1935 static void gup_pgd_range(unsigned long addr, unsigned long end,
1936 		int write, struct page **pages, int *nr)
1937 {
1938 	unsigned long next;
1939 	pgd_t *pgdp;
1940 
1941 	pgdp = pgd_offset(current->mm, addr);
1942 	do {
1943 		pgd_t pgd = READ_ONCE(*pgdp);
1944 
1945 		next = pgd_addr_end(addr, end);
1946 		if (pgd_none(pgd))
1947 			return;
1948 		if (unlikely(pgd_huge(pgd))) {
1949 			if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
1950 					  pages, nr))
1951 				return;
1952 		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
1953 			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
1954 					 PGDIR_SHIFT, next, write, pages, nr))
1955 				return;
1956 		} else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
1957 			return;
1958 	} while (pgdp++, addr = next, addr != end);
1959 }
1960 
1961 #ifndef gup_fast_permitted
1962 /*
1963  * Check whether it's allowed to use __get_user_pages_fast() for the range;
1964  * otherwise we need to fall back to the slow version.
1965  */
1966 bool gup_fast_permitted(unsigned long start, int nr_pages)
1967 {
1968 	unsigned long len, end;
1969 
1970 	len = (unsigned long) nr_pages << PAGE_SHIFT;
1971 	end = start + len;
1972 	return end >= start;
1973 }
1974 #endif
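
/*
 * An architecture can provide a stricter check in its asm headers and
 * then "#define gup_fast_permitted gup_fast_permitted" so the generic
 * fallback above is not compiled in.  Illustrative sketch only (not
 * taken from any particular architecture): an override might also
 * refuse ranges that reach beyond the user address space limit:
 *
 *	static inline bool gup_fast_permitted(unsigned long start, int nr_pages)
 *	{
 *		unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
 *		unsigned long end = start + len;
 *
 *		return end >= start && end <= TASK_SIZE_MAX;
 *	}
 *	#define gup_fast_permitted gup_fast_permitted
 */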
1975 
1976 /*
1977  * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
1978  * the regular GUP.
1979  * Note a difference from get_user_pages_fast(): this always returns the
1980  * number of pages pinned (0 if no pages were pinned), never an error code.
1981  */
1982 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1983 			  struct page **pages)
1984 {
1985 	unsigned long len, end;
1986 	unsigned long flags;
1987 	int nr = 0;
1988 
1989 	start &= PAGE_MASK;
1990 	len = (unsigned long) nr_pages << PAGE_SHIFT;
1991 	end = start + len;
1992 
1993 	if (unlikely(!access_ok((void __user *)start, len)))
1994 		return 0;
1995 
1996 	/*
1997 	 * Disable interrupts.  We use the nested form as we can already have
1998 	 * interrupts disabled by get_futex_key.
1999 	 *
2000 	 * With interrupts disabled, we block page table pages from being
2001 	 * freed from under us. See struct mmu_table_batch comments in
2002 	 * include/asm-generic/tlb.h for more details.
2003 	 *
2004 	 * We do not use rcu_read_lock() here because we also want to
2005 	 * block the IPIs that can be sent when a THP is split.
2006 	 */
2007 
2008 	if (gup_fast_permitted(start, nr_pages)) {
2009 		local_irq_save(flags);
2010 		gup_pgd_range(start, end, write, pages, &nr);
2011 		local_irq_restore(flags);
2012 	}
2013 
2014 	return nr;
2015 }
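
/*
 * Illustrative sketch only (not part of the original file): a
 * get_futex_key()-style caller, possibly running with interrupts
 * already disabled, can use the IRQ-safe variant above to pin a
 * single page.  The helper name is made up for this example.
 */
static int __maybe_unused example_pin_one_page(unsigned long uaddr,
					       struct page **pagep)
{
	/* Returns the number of pages pinned: here either 0 or 1. */
	if (__get_user_pages_fast(uaddr, 1, 0, pagep) != 1)
		return -EFAULT;

	/* The caller must put_page(*pagep) once it is done with the page. */
	return 0;
}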
2016 
2017 /**
2018  * get_user_pages_fast() - pin user pages in memory
2019  * @start:	starting user address
2020  * @nr_pages:	number of pages from start to pin
2021  * @write:	whether pages will be written to
2022  * @pages:	array that receives pointers to the pages pinned.
2023  *		Should be at least nr_pages long.
2024  *
2025  * Attempt to pin user pages in memory without taking mm->mmap_sem.
2026  * If not successful, it will fall back to taking the lock and
2027  * calling get_user_pages().
2028  *
2029  * Returns number of pages pinned. This may be fewer than the number
2030  * requested. If nr_pages is 0 or negative, returns 0. If no pages
2031  * were pinned, returns -errno.
2032  */
2033 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
2034 			struct page **pages)
2035 {
2036 	unsigned long addr, len, end;
2037 	int nr = 0, ret = 0;
2038 
2039 	start &= PAGE_MASK;
2040 	addr = start;
2041 	len = (unsigned long) nr_pages << PAGE_SHIFT;
2042 	end = start + len;
2043 
2044 	if (nr_pages <= 0)
2045 		return 0;
2046 
2047 	if (unlikely(!access_ok((void __user *)start, len)))
2048 		return -EFAULT;
2049 
2050 	if (gup_fast_permitted(start, nr_pages)) {
2051 		local_irq_disable();
2052 		gup_pgd_range(addr, end, write, pages, &nr);
2053 		local_irq_enable();
2054 		ret = nr;
2055 	}
2056 
2057 	if (nr < nr_pages) {
2058 		/* Try to get the remaining pages with get_user_pages */
2059 		start += nr << PAGE_SHIFT;
2060 		pages += nr;
2061 
2062 		ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
2063 				write ? FOLL_WRITE : 0);
2064 
2065 		/* Have to be a bit careful with return values */
2066 		if (nr > 0) {
2067 			if (ret < 0)
2068 				ret = nr;
2069 			else
2070 				ret += nr;
2071 		}
2072 	}
2073 
2074 	return ret;
2075 }
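
/*
 * Illustrative sketch only (not part of the original file): the usual
 * pin -> do work -> release pattern for a caller of the API above.
 * The helper name is made up for this example.
 */
static int __maybe_unused example_pin_user_buffer(unsigned long start,
						  int nr_pages,
						  struct page **pages)
{
	int pinned, i;

	/* May sleep and may fall back to the slow GUP path internally. */
	pinned = get_user_pages_fast(start, nr_pages, 1, pages);
	if (pinned < 0)
		return pinned;	/* -errno, nothing was pinned */

	/*
	 * pinned may be smaller than nr_pages; only pages[0..pinned-1]
	 * are valid.  ... do the actual work on those pages here ...
	 */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);	/* drop the references taken above */
	return 0;
}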
2076 
2077 #endif /* CONFIG_HAVE_GENERIC_GUP */
2078