xref: /openbmc/linux/mm/gup.c (revision 4161b450)
1 #include <linux/kernel.h>
2 #include <linux/errno.h>
3 #include <linux/err.h>
4 #include <linux/spinlock.h>
5 
6 #include <linux/mm.h>
7 #include <linux/pagemap.h>
8 #include <linux/rmap.h>
9 #include <linux/swap.h>
10 #include <linux/swapops.h>
11 
12 #include <linux/sched.h>
13 #include <linux/rwsem.h>
14 #include <linux/hugetlb.h>
15 #include <asm/pgtable.h>
16 
17 #include "internal.h"
18 
19 static struct page *no_page_table(struct vm_area_struct *vma,
20 		unsigned int flags)
21 {
22 	/*
23 	 * When core dumping an enormous anonymous area that nobody
24 	 * has touched so far, we don't want to allocate unnecessary pages or
25 	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
26 	 * then get_dump_page() will return NULL to leave a hole in the dump.
27 	 * But we can only make this optimization where a hole would surely
28 	 * be zero-filled if handle_mm_fault() actually did handle it.
29 	 */
30 	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
31 		return ERR_PTR(-EFAULT);
32 	return NULL;
33 }
34 
35 static struct page *follow_page_pte(struct vm_area_struct *vma,
36 		unsigned long address, pmd_t *pmd, unsigned int flags)
37 {
38 	struct mm_struct *mm = vma->vm_mm;
39 	struct page *page;
40 	spinlock_t *ptl;
41 	pte_t *ptep, pte;
42 
43 retry:
44 	if (unlikely(pmd_bad(*pmd)))
45 		return no_page_table(vma, flags);
46 
47 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
48 	pte = *ptep;
49 	if (!pte_present(pte)) {
50 		swp_entry_t entry;
51 		/*
52 		 * KSM's break_ksm() relies upon recognizing a ksm page
53 		 * even while it is being migrated, so for that case we
54 		 * need migration_entry_wait().
55 		 */
56 		if (likely(!(flags & FOLL_MIGRATION)))
57 			goto no_page;
58 		if (pte_none(pte) || pte_file(pte))
59 			goto no_page;
60 		entry = pte_to_swp_entry(pte);
61 		if (!is_migration_entry(entry))
62 			goto no_page;
63 		pte_unmap_unlock(ptep, ptl);
64 		migration_entry_wait(mm, pmd, address);
65 		goto retry;
66 	}
67 	if ((flags & FOLL_NUMA) && pte_numa(pte))
68 		goto no_page;
69 	if ((flags & FOLL_WRITE) && !pte_write(pte)) {
70 		pte_unmap_unlock(ptep, ptl);
71 		return NULL;
72 	}
73 
74 	page = vm_normal_page(vma, address, pte);
75 	if (unlikely(!page)) {
76 		if ((flags & FOLL_DUMP) ||
77 		    !is_zero_pfn(pte_pfn(pte)))
78 			goto bad_page;
79 		page = pte_page(pte);
80 	}
81 
82 	if (flags & FOLL_GET)
83 		get_page_foll(page);
84 	if (flags & FOLL_TOUCH) {
85 		if ((flags & FOLL_WRITE) &&
86 		    !pte_dirty(pte) && !PageDirty(page))
87 			set_page_dirty(page);
88 		/*
89 		 * pte_mkyoung() would be more correct here, but atomic care
90 		 * is needed to avoid losing the dirty bit: it is easier to use
91 		 * mark_page_accessed().
92 		 */
93 		mark_page_accessed(page);
94 	}
95 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
96 		/*
97 		 * The preliminary mapping check is mainly to avoid the
98 		 * pointless overhead of lock_page on the ZERO_PAGE
99 		 * which might bounce very badly if there is contention.
100 		 *
101 		 * If the page is already locked, we don't need to
102 		 * handle it now - vmscan will handle it later if and
103 		 * when it attempts to reclaim the page.
104 		 */
105 		if (page->mapping && trylock_page(page)) {
106 			lru_add_drain();  /* push cached pages to LRU */
107 			/*
108 			 * Because we lock page here, and migration is
109 			 * blocked by the pte's page reference, and we
110 			 * know the page is still mapped, we don't even
111 			 * need to check for file-cache page truncation.
112 			 */
113 			mlock_vma_page(page);
114 			unlock_page(page);
115 		}
116 	}
117 	pte_unmap_unlock(ptep, ptl);
118 	return page;
119 bad_page:
120 	pte_unmap_unlock(ptep, ptl);
121 	return ERR_PTR(-EFAULT);
122 
123 no_page:
124 	pte_unmap_unlock(ptep, ptl);
125 	if (!pte_none(pte))
126 		return NULL;
127 	return no_page_table(vma, flags);
128 }
129 
130 /**
131  * follow_page_mask - look up a page descriptor from a user-virtual address
132  * @vma: vm_area_struct mapping @address
133  * @address: virtual address to look up
134  * @flags: flags modifying lookup behaviour
135  * @page_mask: on output, *page_mask is set according to the size of the page
136  *
137  * @flags can have FOLL_ flags set, defined in <linux/mm.h>
138  *
139  * Returns the mapped (struct page *), %NULL if no mapping exists, or
140  * an error pointer if there is a mapping to something not represented
141  * by a page descriptor (see also vm_normal_page()).
142  */
143 struct page *follow_page_mask(struct vm_area_struct *vma,
144 			      unsigned long address, unsigned int flags,
145 			      unsigned int *page_mask)
146 {
147 	pgd_t *pgd;
148 	pud_t *pud;
149 	pmd_t *pmd;
150 	spinlock_t *ptl;
151 	struct page *page;
152 	struct mm_struct *mm = vma->vm_mm;
153 
154 	*page_mask = 0;
155 
156 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
157 	if (!IS_ERR(page)) {
158 		BUG_ON(flags & FOLL_GET);
159 		return page;
160 	}
161 
162 	pgd = pgd_offset(mm, address);
163 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
164 		return no_page_table(vma, flags);
165 
166 	pud = pud_offset(pgd, address);
167 	if (pud_none(*pud))
168 		return no_page_table(vma, flags);
169 	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
170 		if (flags & FOLL_GET)
171 			return NULL;
172 		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
173 		return page;
174 	}
175 	if (unlikely(pud_bad(*pud)))
176 		return no_page_table(vma, flags);
177 
178 	pmd = pmd_offset(pud, address);
179 	if (pmd_none(*pmd))
180 		return no_page_table(vma, flags);
181 	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
182 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
183 		if (flags & FOLL_GET) {
184 			/*
185 			 * Refcount on tail pages are not well-defined and
186 			 * shouldn't be taken. The caller should handle a NULL
187 			 * return when trying to follow tail pages.
188 			 */
189 			if (PageHead(page))
190 				get_page(page);
191 			else
192 				page = NULL;
193 		}
194 		return page;
195 	}
196 	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
197 		return no_page_table(vma, flags);
198 	if (pmd_trans_huge(*pmd)) {
199 		if (flags & FOLL_SPLIT) {
200 			split_huge_page_pmd(vma, address, pmd);
201 			return follow_page_pte(vma, address, pmd, flags);
202 		}
203 		ptl = pmd_lock(mm, pmd);
204 		if (likely(pmd_trans_huge(*pmd))) {
205 			if (unlikely(pmd_trans_splitting(*pmd))) {
206 				spin_unlock(ptl);
207 				wait_split_huge_page(vma->anon_vma, pmd);
208 			} else {
209 				page = follow_trans_huge_pmd(vma, address,
210 							     pmd, flags);
211 				spin_unlock(ptl);
212 				*page_mask = HPAGE_PMD_NR - 1;
213 				return page;
214 			}
215 		} else
216 			spin_unlock(ptl);
217 	}
218 	return follow_page_pte(vma, address, pmd, flags);
219 }
220 
221 static int get_gate_page(struct mm_struct *mm, unsigned long address,
222 		unsigned int gup_flags, struct vm_area_struct **vma,
223 		struct page **page)
224 {
225 	pgd_t *pgd;
226 	pud_t *pud;
227 	pmd_t *pmd;
228 	pte_t *pte;
229 	int ret = -EFAULT;
230 
231 	/* user gate pages are read-only */
232 	if (gup_flags & FOLL_WRITE)
233 		return -EFAULT;
234 	if (address > TASK_SIZE)
235 		pgd = pgd_offset_k(address);
236 	else
237 		pgd = pgd_offset_gate(mm, address);
238 	BUG_ON(pgd_none(*pgd));
239 	pud = pud_offset(pgd, address);
240 	BUG_ON(pud_none(*pud));
241 	pmd = pmd_offset(pud, address);
242 	if (pmd_none(*pmd))
243 		return -EFAULT;
244 	VM_BUG_ON(pmd_trans_huge(*pmd));
245 	pte = pte_offset_map(pmd, address);
246 	if (pte_none(*pte))
247 		goto unmap;
248 	*vma = get_gate_vma(mm);
249 	if (!page)
250 		goto out;
251 	*page = vm_normal_page(*vma, address, *pte);
252 	if (!*page) {
253 		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
254 			goto unmap;
255 		*page = pte_page(*pte);
256 	}
257 	get_page(*page);
258 out:
259 	ret = 0;
260 unmap:
261 	pte_unmap(pte);
262 	return ret;
263 }
264 
265 /*
266  * mmap_sem must be held on entry.  If @nonblocking != NULL and
267  * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
268  * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
269  */
270 static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
271 		unsigned long address, unsigned int *flags, int *nonblocking)
272 {
273 	struct mm_struct *mm = vma->vm_mm;
274 	unsigned int fault_flags = 0;
275 	int ret;
276 
277 	/* For mlock, just skip the stack guard page. */
278 	if ((*flags & FOLL_MLOCK) &&
279 			(stack_guard_page_start(vma, address) ||
280 			 stack_guard_page_end(vma, address + PAGE_SIZE)))
281 		return -ENOENT;
282 	if (*flags & FOLL_WRITE)
283 		fault_flags |= FAULT_FLAG_WRITE;
284 	if (nonblocking)
285 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
286 	if (*flags & FOLL_NOWAIT)
287 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
288 	if (*flags & FOLL_TRIED) {
289 		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
290 		fault_flags |= FAULT_FLAG_TRIED;
291 	}
292 
293 	ret = handle_mm_fault(mm, vma, address, fault_flags);
294 	if (ret & VM_FAULT_ERROR) {
295 		if (ret & VM_FAULT_OOM)
296 			return -ENOMEM;
297 		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
298 			return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
299 		if (ret & VM_FAULT_SIGBUS)
300 			return -EFAULT;
301 		BUG();
302 	}
303 
304 	if (tsk) {
305 		if (ret & VM_FAULT_MAJOR)
306 			tsk->maj_flt++;
307 		else
308 			tsk->min_flt++;
309 	}
310 
311 	if (ret & VM_FAULT_RETRY) {
312 		if (nonblocking)
313 			*nonblocking = 0;
314 		return -EBUSY;
315 	}
316 
317 	/*
318 	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
319 	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
320 	 * can thus safely do subsequent page lookups as if they were reads.
321 	 * But only do so when looping for pte_write is futile: in some cases
322 	 * userspace may also be wanting to write to the gotten user page,
323 	 * which a read fault here might prevent (a readonly page might get
324 	 * reCOWed by userspace write).
325 	 */
326 	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
327 		*flags &= ~FOLL_WRITE;
328 	return 0;
329 }
330 
331 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
332 {
333 	vm_flags_t vm_flags = vma->vm_flags;
334 
335 	if (vm_flags & (VM_IO | VM_PFNMAP))
336 		return -EFAULT;
337 
338 	if (gup_flags & FOLL_WRITE) {
339 		if (!(vm_flags & VM_WRITE)) {
340 			if (!(gup_flags & FOLL_FORCE))
341 				return -EFAULT;
342 			/*
343 			 * We used to let the write,force case do COW in a
344 			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
345 			 * set a breakpoint in a read-only mapping of an
346 			 * executable, without corrupting the file (yet only
347 			 * when that file had been opened for writing!).
348 			 * Anon pages in shared mappings are surprising: now
349 			 * just reject it.
350 			 */
351 			if (!is_cow_mapping(vm_flags)) {
352 				WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
353 				return -EFAULT;
354 			}
355 		}
356 	} else if (!(vm_flags & VM_READ)) {
357 		if (!(gup_flags & FOLL_FORCE))
358 			return -EFAULT;
359 		/*
360 		 * Is there actually any vma we can reach here which does not
361 		 * have VM_MAYREAD set?
362 		 */
363 		if (!(vm_flags & VM_MAYREAD))
364 			return -EFAULT;
365 	}
366 	return 0;
367 }
368 
369 /**
370  * __get_user_pages() - pin user pages in memory
371  * @tsk:	task_struct of target task
372  * @mm:		mm_struct of target mm
373  * @start:	starting user address
374  * @nr_pages:	number of pages from start to pin
375  * @gup_flags:	flags modifying pin behaviour
376  * @pages:	array that receives pointers to the pages pinned.
377  *		Should be at least nr_pages long. Or NULL, if caller
378  *		only intends to ensure the pages are faulted in.
379  * @vmas:	array of pointers to vmas corresponding to each page.
380  *		Or NULL if the caller does not require them.
381  * @nonblocking: whether waiting for disk IO or mmap_sem contention
382  *
383  * Returns number of pages pinned. This may be fewer than the number
384  * requested. If nr_pages is 0 or negative, returns 0. If no pages
385  * were pinned, returns -errno. Each page returned must be released
386  * with a put_page() call when it is finished with. vmas will only
387  * remain valid while mmap_sem is held.
388  *
389  * Must be called with mmap_sem held.  It may be released.  See below.
390  *
391  * __get_user_pages walks a process's page tables and takes a reference to
392  * each struct page that each user address corresponds to at a given
393  * instant. That is, it takes the page that would be accessed if a user
394  * thread accesses the given user virtual address at that instant.
395  *
396  * This does not guarantee that the page exists in the user mappings when
397  * __get_user_pages returns, and there may even be a completely different
398  * page there in some cases (eg. if mmapped pagecache has been invalidated
399  * and subsequently re faulted). However it does guarantee that the page
400  * won't be freed completely. And mostly callers simply care that the page
401  * contains data that was valid *at some point in time*. Typically, an IO
402  * or similar operation cannot guarantee anything stronger anyway because
403  * locks can't be held over the syscall boundary.
404  *
405  * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
406  * the page is written to, set_page_dirty (or set_page_dirty_lock, as
407  * appropriate) must be called after the page is finished with, and
408  * before put_page is called.
409  *
410  * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
411  * or mmap_sem contention, and if waiting is needed to pin all pages,
412  * *@nonblocking will be set to 0.  Further, if @gup_flags does not
413  * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
414  * this case.
415  *
416  * A caller using such a combination of @nonblocking and @gup_flags
417  * must therefore hold the mmap_sem for reading only, and recognize
418  * when it's been released.  Otherwise, it must be held for either
419  * reading or writing and will not be released.
420  *
421  * In most cases, get_user_pages or get_user_pages_fast should be used
422  * instead of __get_user_pages. __get_user_pages should be used only if
423  * you need some special @gup_flags.
424  */
425 long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
426 		unsigned long start, unsigned long nr_pages,
427 		unsigned int gup_flags, struct page **pages,
428 		struct vm_area_struct **vmas, int *nonblocking)
429 {
430 	long i = 0;
431 	unsigned int page_mask;
432 	struct vm_area_struct *vma = NULL;
433 
434 	if (!nr_pages)
435 		return 0;
436 
437 	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
438 
439 	/*
440 	 * If FOLL_FORCE is set then do not force a full fault as the hinting
441 	 * fault information is unrelated to the reference behaviour of a task
442 	 * using the address space
443 	 */
444 	if (!(gup_flags & FOLL_FORCE))
445 		gup_flags |= FOLL_NUMA;
446 
447 	do {
448 		struct page *page;
449 		unsigned int foll_flags = gup_flags;
450 		unsigned int page_increm;
451 
452 		/* first iteration or cross vma bound */
453 		if (!vma || start >= vma->vm_end) {
454 			vma = find_extend_vma(mm, start);
455 			if (!vma && in_gate_area(mm, start)) {
456 				int ret;
457 				ret = get_gate_page(mm, start & PAGE_MASK,
458 						gup_flags, &vma,
459 						pages ? &pages[i] : NULL);
460 				if (ret)
461 					return i ? : ret;
462 				page_mask = 0;
463 				goto next_page;
464 			}
465 
466 			if (!vma || check_vma_flags(vma, gup_flags))
467 				return i ? : -EFAULT;
468 			if (is_vm_hugetlb_page(vma)) {
469 				i = follow_hugetlb_page(mm, vma, pages, vmas,
470 						&start, &nr_pages, i,
471 						gup_flags);
472 				continue;
473 			}
474 		}
475 retry:
476 		/*
477 		 * If we have a pending SIGKILL, don't keep faulting pages and
478 		 * potentially allocating memory.
479 		 */
480 		if (unlikely(fatal_signal_pending(current)))
481 			return i ? i : -ERESTARTSYS;
482 		cond_resched();
483 		page = follow_page_mask(vma, start, foll_flags, &page_mask);
484 		if (!page) {
485 			int ret;
486 			ret = faultin_page(tsk, vma, start, &foll_flags,
487 					nonblocking);
488 			switch (ret) {
489 			case 0:
490 				goto retry;
491 			case -EFAULT:
492 			case -ENOMEM:
493 			case -EHWPOISON:
494 				return i ? i : ret;
495 			case -EBUSY:
496 				return i;
497 			case -ENOENT:
498 				goto next_page;
499 			}
500 			BUG();
501 		}
502 		if (IS_ERR(page))
503 			return i ? i : PTR_ERR(page);
504 		if (pages) {
505 			pages[i] = page;
506 			flush_anon_page(vma, page, start);
507 			flush_dcache_page(page);
508 			page_mask = 0;
509 		}
510 next_page:
511 		if (vmas) {
512 			vmas[i] = vma;
513 			page_mask = 0;
514 		}
515 		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
516 		if (page_increm > nr_pages)
517 			page_increm = nr_pages;
518 		i += page_increm;
519 		start += page_increm * PAGE_SIZE;
520 		nr_pages -= page_increm;
521 	} while (nr_pages);
522 	return i;
523 }
524 EXPORT_SYMBOL(__get_user_pages);
525 
526 /*
527  * fixup_user_fault() - manually resolve a user page fault
528  * @tsk:	the task_struct to use for page fault accounting, or
529  *		NULL if faults are not to be recorded.
530  * @mm:		mm_struct of target mm
531  * @address:	user address
532  * @fault_flags:flags to pass down to handle_mm_fault()
533  *
534  * This is meant to be called in the specific scenario where for locking reasons
535  * we try to access user memory in atomic context (within a pagefault_disable()
536  * section), this returns -EFAULT, and we want to resolve the user fault before
537  * trying again.
538  *
539  * Typically this is meant to be used by the futex code.
540  *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
545  *
546  * This is important for some architectures where those bits also gate the
547  * access permission to the page because they are maintained in software.  On
548  * such architectures, gup() will not be enough to make a subsequent access
549  * succeed.
550  *
551  * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
552  */
553 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
554 		     unsigned long address, unsigned int fault_flags)
555 {
556 	struct vm_area_struct *vma;
557 	vm_flags_t vm_flags;
558 	int ret;
559 
560 	vma = find_extend_vma(mm, address);
561 	if (!vma || address < vma->vm_start)
562 		return -EFAULT;
563 
564 	vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
565 	if (!(vm_flags & vma->vm_flags))
566 		return -EFAULT;
567 
568 	ret = handle_mm_fault(mm, vma, address, fault_flags);
569 	if (ret & VM_FAULT_ERROR) {
570 		if (ret & VM_FAULT_OOM)
571 			return -ENOMEM;
572 		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
573 			return -EHWPOISON;
574 		if (ret & VM_FAULT_SIGBUS)
575 			return -EFAULT;
576 		BUG();
577 	}
578 	if (tsk) {
579 		if (ret & VM_FAULT_MAJOR)
580 			tsk->maj_flt++;
581 		else
582 			tsk->min_flt++;
583 	}
584 	return 0;
585 }
586 
587 /*
588  * get_user_pages() - pin user pages in memory
589  * @tsk:	the task_struct to use for page fault accounting, or
590  *		NULL if faults are not to be recorded.
591  * @mm:		mm_struct of target mm
592  * @start:	starting user address
593  * @nr_pages:	number of pages from start to pin
594  * @write:	whether pages will be written to by the caller
595  * @force:	whether to force access even when user mapping is currently
596  *		protected (but never forces write access to shared mapping).
597  * @pages:	array that receives pointers to the pages pinned.
598  *		Should be at least nr_pages long. Or NULL, if caller
599  *		only intends to ensure the pages are faulted in.
600  * @vmas:	array of pointers to vmas corresponding to each page.
601  *		Or NULL if the caller does not require them.
602  *
603  * Returns number of pages pinned. This may be fewer than the number
604  * requested. If nr_pages is 0 or negative, returns 0. If no pages
605  * were pinned, returns -errno. Each page returned must be released
606  * with a put_page() call when it is finished with. vmas will only
607  * remain valid while mmap_sem is held.
608  *
609  * Must be called with mmap_sem held for read or write.
610  *
611  * get_user_pages walks a process's page tables and takes a reference to
612  * each struct page that each user address corresponds to at a given
613  * instant. That is, it takes the page that would be accessed if a user
614  * thread accesses the given user virtual address at that instant.
615  *
616  * This does not guarantee that the page exists in the user mappings when
617  * get_user_pages returns, and there may even be a completely different
618  * page there in some cases (eg. if mmapped pagecache has been invalidated
619  * and subsequently re faulted). However it does guarantee that the page
620  * won't be freed completely. And mostly callers simply care that the page
621  * contains data that was valid *at some point in time*. Typically, an IO
622  * or similar operation cannot guarantee anything stronger anyway because
623  * locks can't be held over the syscall boundary.
624  *
625  * If write=0, the page must not be written to. If the page is written to,
626  * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
627  * after the page is finished with, and before put_page is called.
628  *
629  * get_user_pages is typically used for fewer-copy IO operations, to get a
630  * handle on the memory by some means other than accesses via the user virtual
631  * addresses. The pages may be submitted for DMA to devices or accessed via
632  * their kernel linear mapping (via the kmap APIs). Care should be taken to
633  * use the correct cache flushing APIs.
634  *
635  * See also get_user_pages_fast, for performance critical applications.
636  */
637 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
638 		unsigned long start, unsigned long nr_pages, int write,
639 		int force, struct page **pages, struct vm_area_struct **vmas)
640 {
641 	int flags = FOLL_TOUCH;
642 
643 	if (pages)
644 		flags |= FOLL_GET;
645 	if (write)
646 		flags |= FOLL_WRITE;
647 	if (force)
648 		flags |= FOLL_FORCE;
649 
650 	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
651 				NULL);
652 }
653 EXPORT_SYMBOL(get_user_pages);
654 
655 /**
656  * get_dump_page() - pin user page in memory while writing it to core dump
657  * @addr: user address
658  *
659  * Returns struct page pointer of user page pinned for dump,
660  * to be freed afterwards by page_cache_release() or put_page().
661  *
662  * Returns NULL on any kind of failure - a hole must then be inserted into
663  * the corefile, to preserve alignment with its headers; and also returns
664  * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
665  * allowing a hole to be left in the corefile to save diskspace.
666  *
667  * Called without mmap_sem, but after all other threads have been killed.
668  */
669 #ifdef CONFIG_ELF_CORE
670 struct page *get_dump_page(unsigned long addr)
671 {
672 	struct vm_area_struct *vma;
673 	struct page *page;
674 
675 	if (__get_user_pages(current, current->mm, addr, 1,
676 			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
677 			     NULL) < 1)
678 		return NULL;
679 	flush_cache_page(vma, addr, page_to_pfn(page));
680 	return page;
681 }
682 #endif /* CONFIG_ELF_CORE */
683 
684 /*
685  * Generic RCU Fast GUP
686  *
687  * get_user_pages_fast attempts to pin user pages by walking the page
688  * tables directly and avoids taking locks. Thus the walker needs to be
689  * protected from page table pages being freed from under it, and should
690  * block any THP splits.
691  *
692  * One way to achieve this is to have the walker disable interrupts, and
693  * rely on IPIs from the TLB flushing code blocking before the page table
694  * pages are freed. This is unsuitable for architectures that do not need
695  * to broadcast an IPI when invalidating TLBs.
696  *
697  * Another way to achieve this is to batch up page table containing pages
698  * belonging to more than one mm_user, then rcu_sched a callback to free those
699  * pages. Disabling interrupts will allow the fast_gup walker to both block
700  * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
701  * (which is a relatively rare event). The code below adopts this strategy.
702  *
703  * Before activating this code, please be aware that the following assumptions
704  * are currently made:
705  *
706  *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
707  *      pages containing page tables.
708  *
709  *  *) THP splits will broadcast an IPI, this can be achieved by overriding
710  *      pmdp_splitting_flush.
711  *
712  *  *) ptes can be read atomically by the architecture.
713  *
714  *  *) access_ok is sufficient to validate userspace address ranges.
715  *
716  * The last two assumptions can be relaxed by the addition of helper functions.
717  *
718  * This code is based heavily on the PowerPC implementation by Nick Piggin.
719  */
720 #ifdef CONFIG_HAVE_GENERIC_RCU_GUP
721 
722 #ifdef __HAVE_ARCH_PTE_SPECIAL
723 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
724 			 int write, struct page **pages, int *nr)
725 {
726 	pte_t *ptep, *ptem;
727 	int ret = 0;
728 
729 	ptem = ptep = pte_offset_map(&pmd, addr);
730 	do {
731 		/*
732 		 * In the line below we are assuming that the pte can be read
733 		 * atomically. If this is not the case for your architecture,
734 		 * please wrap this in a helper function!
735 		 *
736 		 * for an example see gup_get_pte in arch/x86/mm/gup.c
737 		 */
738 		pte_t pte = ACCESS_ONCE(*ptep);
739 		struct page *page;
740 
741 		/*
742 		 * Similar to the PMD case below, NUMA hinting must take slow
743 		 * path
744 		 */
745 		if (!pte_present(pte) || pte_special(pte) ||
746 			pte_numa(pte) || (write && !pte_write(pte)))
747 			goto pte_unmap;
748 
749 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
750 		page = pte_page(pte);
751 
752 		if (!page_cache_get_speculative(page))
753 			goto pte_unmap;
754 
755 		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
756 			put_page(page);
757 			goto pte_unmap;
758 		}
759 
760 		pages[*nr] = page;
761 		(*nr)++;
762 
763 	} while (ptep++, addr += PAGE_SIZE, addr != end);
764 
765 	ret = 1;
766 
767 pte_unmap:
768 	pte_unmap(ptem);
769 	return ret;
770 }
771 #else
772 
773 /*
774  * If we can't determine whether or not a pte is special, then fail immediately
775  * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
776  * to be special.
777  *
778  * For a futex to be placed on a THP tail page, get_futex_key requires a
779  * __get_user_pages_fast implementation that can pin pages. Thus it's still
780  * useful to have gup_huge_pmd even if we can't operate on ptes.
781  */
782 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
783 			 int write, struct page **pages, int *nr)
784 {
785 	return 0;
786 }
787 #endif /* __HAVE_ARCH_PTE_SPECIAL */
788 
789 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
790 		unsigned long end, int write, struct page **pages, int *nr)
791 {
792 	struct page *head, *page, *tail;
793 	int refs;
794 
795 	if (write && !pmd_write(orig))
796 		return 0;
797 
798 	refs = 0;
799 	head = pmd_page(orig);
800 	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
801 	tail = page;
802 	do {
803 		VM_BUG_ON_PAGE(compound_head(page) != head, page);
804 		pages[*nr] = page;
805 		(*nr)++;
806 		page++;
807 		refs++;
808 	} while (addr += PAGE_SIZE, addr != end);
809 
810 	if (!page_cache_add_speculative(head, refs)) {
811 		*nr -= refs;
812 		return 0;
813 	}
814 
815 	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
816 		*nr -= refs;
817 		while (refs--)
818 			put_page(head);
819 		return 0;
820 	}
821 
822 	/*
823 	 * Any tail pages need their mapcount reference taken before we
824 	 * return. (This allows the THP code to bump their ref count when
825 	 * they are split into base pages).
826 	 */
827 	while (refs--) {
828 		if (PageTail(tail))
829 			get_huge_page_tail(tail);
830 		tail++;
831 	}
832 
833 	return 1;
834 }
835 
836 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
837 		unsigned long end, int write, struct page **pages, int *nr)
838 {
839 	struct page *head, *page, *tail;
840 	int refs;
841 
842 	if (write && !pud_write(orig))
843 		return 0;
844 
845 	refs = 0;
846 	head = pud_page(orig);
847 	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
848 	tail = page;
849 	do {
850 		VM_BUG_ON_PAGE(compound_head(page) != head, page);
851 		pages[*nr] = page;
852 		(*nr)++;
853 		page++;
854 		refs++;
855 	} while (addr += PAGE_SIZE, addr != end);
856 
857 	if (!page_cache_add_speculative(head, refs)) {
858 		*nr -= refs;
859 		return 0;
860 	}
861 
862 	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
863 		*nr -= refs;
864 		while (refs--)
865 			put_page(head);
866 		return 0;
867 	}
868 
869 	while (refs--) {
870 		if (PageTail(tail))
871 			get_huge_page_tail(tail);
872 		tail++;
873 	}
874 
875 	return 1;
876 }
877 
878 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
879 			unsigned long end, int write,
880 			struct page **pages, int *nr)
881 {
882 	int refs;
883 	struct page *head, *page, *tail;
884 
885 	if (write && !pgd_write(orig))
886 		return 0;
887 
888 	refs = 0;
889 	head = pgd_page(orig);
890 	page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
891 	tail = page;
892 	do {
893 		VM_BUG_ON_PAGE(compound_head(page) != head, page);
894 		pages[*nr] = page;
895 		(*nr)++;
896 		page++;
897 		refs++;
898 	} while (addr += PAGE_SIZE, addr != end);
899 
900 	if (!page_cache_add_speculative(head, refs)) {
901 		*nr -= refs;
902 		return 0;
903 	}
904 
905 	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
906 		*nr -= refs;
907 		while (refs--)
908 			put_page(head);
909 		return 0;
910 	}
911 
912 	while (refs--) {
913 		if (PageTail(tail))
914 			get_huge_page_tail(tail);
915 		tail++;
916 	}
917 
918 	return 1;
919 }
920 
921 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
922 		int write, struct page **pages, int *nr)
923 {
924 	unsigned long next;
925 	pmd_t *pmdp;
926 
927 	pmdp = pmd_offset(&pud, addr);
928 	do {
929 		pmd_t pmd = ACCESS_ONCE(*pmdp);
930 
931 		next = pmd_addr_end(addr, end);
932 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
933 			return 0;
934 
935 		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
936 			/*
937 			 * NUMA hinting faults need to be handled in the GUP
938 			 * slowpath for accounting purposes and so that they
939 			 * can be serialised against THP migration.
940 			 */
941 			if (pmd_numa(pmd))
942 				return 0;
943 
944 			if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
945 				pages, nr))
946 				return 0;
947 
948 		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
949 			/*
950 			 * architecture have different format for hugetlbfs
951 			 * pmd format and THP pmd format
952 			 */
953 			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
954 					 PMD_SHIFT, next, write, pages, nr))
955 				return 0;
956 		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
957 				return 0;
958 	} while (pmdp++, addr = next, addr != end);
959 
960 	return 1;
961 }
962 
963 static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
964 			 int write, struct page **pages, int *nr)
965 {
966 	unsigned long next;
967 	pud_t *pudp;
968 
969 	pudp = pud_offset(&pgd, addr);
970 	do {
971 		pud_t pud = READ_ONCE(*pudp);
972 
973 		next = pud_addr_end(addr, end);
974 		if (pud_none(pud))
975 			return 0;
976 		if (unlikely(pud_huge(pud))) {
977 			if (!gup_huge_pud(pud, pudp, addr, next, write,
978 					  pages, nr))
979 				return 0;
980 		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
981 			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
982 					 PUD_SHIFT, next, write, pages, nr))
983 				return 0;
984 		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
985 			return 0;
986 	} while (pudp++, addr = next, addr != end);
987 
988 	return 1;
989 }
990 
991 /*
992  * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
993  * the regular GUP. It will only return non-negative values.
994  */
995 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
996 			  struct page **pages)
997 {
998 	struct mm_struct *mm = current->mm;
999 	unsigned long addr, len, end;
1000 	unsigned long next, flags;
1001 	pgd_t *pgdp;
1002 	int nr = 0;
1003 
1004 	start &= PAGE_MASK;
1005 	addr = start;
1006 	len = (unsigned long) nr_pages << PAGE_SHIFT;
1007 	end = start + len;
1008 
1009 	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1010 					start, len)))
1011 		return 0;
1012 
1013 	/*
1014 	 * Disable interrupts.  We use the nested form as we can already have
1015 	 * interrupts disabled by get_futex_key.
1016 	 *
1017 	 * With interrupts disabled, we block page table pages from being
1018 	 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
1019 	 * for more details.
1020 	 *
1021 	 * We do not adopt an rcu_read_lock(.) here as we also want to
1022 	 * block IPIs that come from THPs splitting.
1023 	 */
1024 
1025 	local_irq_save(flags);
1026 	pgdp = pgd_offset(mm, addr);
1027 	do {
1028 		pgd_t pgd = ACCESS_ONCE(*pgdp);
1029 
1030 		next = pgd_addr_end(addr, end);
1031 		if (pgd_none(pgd))
1032 			break;
1033 		if (unlikely(pgd_huge(pgd))) {
1034 			if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
1035 					  pages, &nr))
1036 				break;
1037 		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
1038 			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
1039 					 PGDIR_SHIFT, next, write, pages, &nr))
1040 				break;
1041 		} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
1042 			break;
1043 	} while (pgdp++, addr = next, addr != end);
1044 	local_irq_restore(flags);
1045 
1046 	return nr;
1047 }
1048 
1049 /**
1050  * get_user_pages_fast() - pin user pages in memory
1051  * @start:	starting user address
1052  * @nr_pages:	number of pages from start to pin
1053  * @write:	whether pages will be written to
1054  * @pages:	array that receives pointers to the pages pinned.
1055  *		Should be at least nr_pages long.
1056  *
1057  * Attempt to pin user pages in memory without taking mm->mmap_sem.
1058  * If not successful, it will fall back to taking the lock and
1059  * calling get_user_pages().
1060  *
1061  * Returns number of pages pinned. This may be fewer than the number
1062  * requested. If nr_pages is 0 or negative, returns 0. If no pages
1063  * were pinned, returns -errno.
1064  */
1065 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1066 			struct page **pages)
1067 {
1068 	struct mm_struct *mm = current->mm;
1069 	int nr, ret;
1070 
1071 	start &= PAGE_MASK;
1072 	nr = __get_user_pages_fast(start, nr_pages, write, pages);
1073 	ret = nr;
1074 
1075 	if (nr < nr_pages) {
1076 		/* Try to get the remaining pages with get_user_pages */
1077 		start += nr << PAGE_SHIFT;
1078 		pages += nr;
1079 
1080 		down_read(&mm->mmap_sem);
1081 		ret = get_user_pages(current, mm, start,
1082 				     nr_pages - nr, write, 0, pages, NULL);
1083 		up_read(&mm->mmap_sem);
1084 
1085 		/* Have to be a bit careful with return values */
1086 		if (nr > 0) {
1087 			if (ret < 0)
1088 				ret = nr;
1089 			else
1090 				ret += nr;
1091 		}
1092 	}
1093 
1094 	return ret;
1095 }
1096 
1097 #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
1098