xref: /openbmc/linux/arch/powerpc/mm/hugetlbpage.c (revision 151f4e2b)
1 /*
2  * PPC Huge TLB Page Support for Kernel.
3  *
4  * Copyright (C) 2003 David Gibson, IBM Corporation.
5  * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
6  *
7  * Based on the IA-32 version:
8  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/io.h>
13 #include <linux/slab.h>
14 #include <linux/hugetlb.h>
15 #include <linux/export.h>
16 #include <linux/of_fdt.h>
17 #include <linux/memblock.h>
18 #include <linux/moduleparam.h>
19 #include <linux/swap.h>
20 #include <linux/swapops.h>
21 #include <linux/kmemleak.h>
22 #include <asm/pgtable.h>
23 #include <asm/pgalloc.h>
24 #include <asm/tlb.h>
25 #include <asm/setup.h>
26 #include <asm/hugetlb.h>
27 #include <asm/pte-walk.h>
28 
29 bool hugetlb_disabled = false;
30 
31 #define hugepd_none(hpd)	(hpd_val(hpd) == 0)
32 
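/*
 * Order of pte_t relative to a pointer, used as the PGT_CACHE() index for
 * hugepte tables.  For example, with 8-byte PTEs and 4-byte pointers this
 * is __builtin_ffs(8) - __builtin_ffs(4) = 4 - 3 = 1; when pte_t and
 * void * have the same size it is 0.
 */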
33 #define PTE_T_ORDER	(__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *)))
34 
35 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
36 {
37 	/*
38 	 * Only called for hugetlbfs pages, hence can ignore THP and the
39 	 * irq disabled walk.
40 	 */
41 	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
42 }
43 
44 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
45 			   unsigned long address, unsigned int pdshift,
46 			   unsigned int pshift, spinlock_t *ptl)
47 {
48 	struct kmem_cache *cachep;
49 	pte_t *new;
50 	int i;
51 	int num_hugepd;
52 
53 	if (pshift >= pdshift) {
54 		cachep = PGT_CACHE(PTE_T_ORDER);
55 		num_hugepd = 1 << (pshift - pdshift);
56 	} else if (IS_ENABLED(CONFIG_PPC_8xx)) {
57 		cachep = PGT_CACHE(PTE_INDEX_SIZE);
58 		num_hugepd = 1;
59 	} else {
60 		cachep = PGT_CACHE(pdshift - pshift);
61 		num_hugepd = 1;
62 	}
63 
64 	new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
65 
66 	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
67 	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
68 
69 	if (!new)
70 		return -ENOMEM;
71 
72 	/*
73 	 * Make sure other cpus find the hugepd set only after a
74 	 * properly initialized page table is visible to them.
75 	 * For more details look for comment in __pte_alloc().
76 	 */
77 	smp_wmb();
78 
79 	spin_lock(ptl);
80 	/*
81 	 * We have multiple higher-level entries that point to the same
82 	 * actual pte location.  Fill in each as we go and backtrack on error.
83 	 * We need all of these so the DTLB pgtable walk code can find the
84 	 * right higher-level entry without knowing if it's a hugepage or not.
85 	 */
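	/*
	 * For example, if pshift == pdshift + 2 then num_hugepd is 4 and all
	 * four consecutive entries starting at hpdp end up pointing at the
	 * single hugepte table allocated above.
	 */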
86 	for (i = 0; i < num_hugepd; i++, hpdp++) {
87 		if (unlikely(!hugepd_none(*hpdp)))
88 			break;
89 		hugepd_populate(hpdp, new, pshift);
90 	}
91 	/* If we bailed out of the loop early, somebody else already populated an entry; clean up */
92 	if (i < num_hugepd) {
93 		for (i = i - 1; i >= 0; i--, hpdp--)
94 			*hpdp = __hugepd(0);
95 		kmem_cache_free(cachep, new);
96 	} else {
97 		kmemleak_ignore(new);
98 	}
99 	spin_unlock(ptl);
100 	return 0;
101 }
102 
103 /*
104  * At this point we do the placement change only for BOOK3S 64. This would
105  * possibly work on other subarchs.
106  */
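/*
 * When the huge page size matches a page table level exactly (e.g. the
 * 16GB and 16MB cases below on Book3S 64), the huge PTE is placed directly
 * in that level's entry; other sizes go through a hugepd table hanging off
 * the entry instead.
 */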
107 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
108 {
109 	pgd_t *pg;
110 	pud_t *pu;
111 	pmd_t *pm;
112 	hugepd_t *hpdp = NULL;
113 	unsigned pshift = __ffs(sz);
114 	unsigned pdshift = PGDIR_SHIFT;
115 	spinlock_t *ptl;
116 
117 	addr &= ~(sz-1);
118 	pg = pgd_offset(mm, addr);
119 
120 #ifdef CONFIG_PPC_BOOK3S_64
121 	if (pshift == PGDIR_SHIFT)
122 		/* 16GB huge page */
123 		return (pte_t *)pg;
124 	else if (pshift > PUD_SHIFT) {
125 		/*
126 		 * We need to use a hugepd table
127 		 */
128 		ptl = &mm->page_table_lock;
129 		hpdp = (hugepd_t *)pg;
130 	} else {
131 		pdshift = PUD_SHIFT;
132 		pu = pud_alloc(mm, pg, addr);
133 		if (pshift == PUD_SHIFT)
134 			return (pte_t *)pu;
135 		else if (pshift > PMD_SHIFT) {
136 			ptl = pud_lockptr(mm, pu);
137 			hpdp = (hugepd_t *)pu;
138 		} else {
139 			pdshift = PMD_SHIFT;
140 			pm = pmd_alloc(mm, pu, addr);
141 			if (pshift == PMD_SHIFT)
142 				/* 16MB hugepage */
143 				return (pte_t *)pm;
144 			else {
145 				ptl = pmd_lockptr(mm, pm);
146 				hpdp = (hugepd_t *)pm;
147 			}
148 		}
149 	}
150 #else
151 	if (pshift >= PGDIR_SHIFT) {
152 		ptl = &mm->page_table_lock;
153 		hpdp = (hugepd_t *)pg;
154 	} else {
155 		pdshift = PUD_SHIFT;
156 		pu = pud_alloc(mm, pg, addr);
157 		if (pshift >= PUD_SHIFT) {
158 			ptl = pud_lockptr(mm, pu);
159 			hpdp = (hugepd_t *)pu;
160 		} else {
161 			pdshift = PMD_SHIFT;
162 			pm = pmd_alloc(mm, pu, addr);
163 			ptl = pmd_lockptr(mm, pm);
164 			hpdp = (hugepd_t *)pm;
165 		}
166 	}
167 #endif
168 	if (!hpdp)
169 		return NULL;
170 
171 	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
172 
173 	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
174 						  pdshift, pshift, ptl))
175 		return NULL;
176 
177 	return hugepte_offset(*hpdp, addr, pdshift);
178 }
179 
180 #ifdef CONFIG_PPC_BOOK3S_64
181 /*
182  * Tracks gpages after the device tree is scanned and before the
183  * huge_boot_pages list is ready on pseries.
184  */
185 #define MAX_NUMBER_GPAGES	1024
186 __initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
187 __initdata static unsigned nr_gpages;
188 
189 /*
190  * Build the list of addresses of gigantic pages.  This function is used in
191  * early boot, before the buddy allocator is set up.
192  */
193 void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
194 {
195 	if (!addr)
196 		return;
197 	while (number_of_pages > 0) {
198 		gpage_freearray[nr_gpages] = addr;
199 		nr_gpages++;
200 		number_of_pages--;
201 		addr += page_size;
202 	}
203 }
204 
205 int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
206 {
207 	struct huge_bootmem_page *m;
208 	if (nr_gpages == 0)
209 		return 0;
210 	m = phys_to_virt(gpage_freearray[--nr_gpages]);
211 	gpage_freearray[nr_gpages] = 0;
212 	list_add(&m->list, &huge_boot_pages);
213 	m->hstate = hstate;
214 	return 1;
215 }
216 #endif
217 
218 
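/*
 * Boot-time allocation of one gigantic page for hstate @h.  On pseries
 * LPARs running the hash MMU the page is taken from the device-tree
 * provided gpage list built by pseries_add_gpage() above; otherwise the
 * generic __alloc_bootmem_huge_page() path is used.
 */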
219 int __init alloc_bootmem_huge_page(struct hstate *h)
220 {
221 
222 #ifdef CONFIG_PPC_BOOK3S_64
223 	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
224 		return pseries_alloc_bootmem_huge_page(h);
225 #endif
226 	return __alloc_bootmem_huge_page(h);
227 }
228 
229 #ifndef CONFIG_PPC_BOOK3S_64
230 #define HUGEPD_FREELIST_SIZE \
231 	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
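/*
 * Each free batch occupies a single page: roughly (PAGE_SIZE minus the
 * small struct header) / sizeof(pte_t) entries, e.g. around 500 with
 * 4K pages and 8-byte PTEs.
 */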
232 
233 struct hugepd_freelist {
234 	struct rcu_head	rcu;
235 	unsigned int index;
236 	void *ptes[0];
237 };
238 
239 static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
240 
241 static void hugepd_free_rcu_callback(struct rcu_head *head)
242 {
243 	struct hugepd_freelist *batch =
244 		container_of(head, struct hugepd_freelist, rcu);
245 	unsigned int i;
246 
247 	for (i = 0; i < batch->index; i++)
248 		kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);
249 
250 	free_page((unsigned long)batch);
251 }
252 
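/*
 * Free one hugepte table.  If the mm has a single user or is only active
 * locally it goes straight back to the kmem cache; otherwise it is queued
 * on a per-CPU batch page and released via call_rcu() once the batch is
 * full, so lockless page table walkers never see it freed from under them.
 */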
253 static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
254 {
255 	struct hugepd_freelist **batchp;
256 
257 	batchp = &get_cpu_var(hugepd_freelist_cur);
258 
259 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
260 	    mm_is_thread_local(tlb->mm)) {
261 		kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
262 		put_cpu_var(hugepd_freelist_cur);
263 		return;
264 	}
265 
266 	if (*batchp == NULL) {
267 		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
268 		(*batchp)->index = 0;
269 	}
270 
271 	(*batchp)->ptes[(*batchp)->index++] = hugepte;
272 	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
273 		call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
274 		*batchp = NULL;
275 	}
276 	put_cpu_var(hugepd_freelist_cur);
277 }
278 #else
279 static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
280 #endif
281 
282 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
283 			      unsigned long start, unsigned long end,
284 			      unsigned long floor, unsigned long ceiling)
285 {
286 	pte_t *hugepte = hugepd_page(*hpdp);
287 	int i;
288 
289 	unsigned long pdmask = ~((1UL << pdshift) - 1);
290 	unsigned int num_hugepd = 1;
291 	unsigned int shift = hugepd_shift(*hpdp);
292 
293 	/* Note: On fsl the hpdp may be the first of several */
294 	if (shift > pdshift)
295 		num_hugepd = 1 << (shift - pdshift);
296 
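	/*
	 * Like the generic free_pmd_range()/free_pud_range(), bail out unless
	 * the whole pdshift-sized region lies within [floor, ceiling), so we
	 * never free entries that a neighbouring mapping may still need.
	 */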
297 	start &= pdmask;
298 	if (start < floor)
299 		return;
300 	if (ceiling) {
301 		ceiling &= pdmask;
302 		if (!ceiling)
303 			return;
304 	}
305 	if (end - 1 > ceiling - 1)
306 		return;
307 
308 	for (i = 0; i < num_hugepd; i++, hpdp++)
309 		*hpdp = __hugepd(0);
310 
311 	if (shift >= pdshift)
312 		hugepd_free(tlb, hugepte);
313 	else if (IS_ENABLED(CONFIG_PPC_8xx))
314 		pgtable_free_tlb(tlb, hugepte,
315 				 get_hugepd_cache_index(PTE_INDEX_SIZE));
316 	else
317 		pgtable_free_tlb(tlb, hugepte,
318 				 get_hugepd_cache_index(pdshift - shift));
319 }
320 
321 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
322 				   unsigned long addr, unsigned long end,
323 				   unsigned long floor, unsigned long ceiling)
324 {
325 	pmd_t *pmd;
326 	unsigned long next;
327 	unsigned long start;
328 
329 	start = addr;
330 	do {
331 		unsigned long more;
332 
333 		pmd = pmd_offset(pud, addr);
334 		next = pmd_addr_end(addr, end);
335 		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
336 			/*
337 			 * If it is not a hugepd pointer, we should already find
338 			 * it cleared.
339 			 */
340 			WARN_ON(!pmd_none_or_clear_bad(pmd));
341 			continue;
342 		}
343 		/*
344 		 * Increment next by the size of the huge mapping since
345 		 * there may be more than one entry at this level for a
346 		 * single hugepage, but all of them point to
347 		 * the same underlying hugepte table.
348 		 */
349 		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
350 		if (more > next)
351 			next = more;
352 
353 		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
354 				  addr, next, floor, ceiling);
355 	} while (addr = next, addr != end);
356 
357 	start &= PUD_MASK;
358 	if (start < floor)
359 		return;
360 	if (ceiling) {
361 		ceiling &= PUD_MASK;
362 		if (!ceiling)
363 			return;
364 	}
365 	if (end - 1 > ceiling - 1)
366 		return;
367 
368 	pmd = pmd_offset(pud, start);
369 	pud_clear(pud);
370 	pmd_free_tlb(tlb, pmd, start);
371 	mm_dec_nr_pmds(tlb->mm);
372 }
373 
374 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
375 				   unsigned long addr, unsigned long end,
376 				   unsigned long floor, unsigned long ceiling)
377 {
378 	pud_t *pud;
379 	unsigned long next;
380 	unsigned long start;
381 
382 	start = addr;
383 	do {
384 		pud = pud_offset(pgd, addr);
385 		next = pud_addr_end(addr, end);
386 		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
387 			if (pud_none_or_clear_bad(pud))
388 				continue;
389 			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
390 					       ceiling);
391 		} else {
392 			unsigned long more;
393 			/*
394 			 * Increment next by the size of the huge mapping since
395 			 * there may be more than one entry at this level for a
396 			 * single hugepage, but all of them point to
397 			 * the same underlying hugepte table.
398 			 */
399 			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
400 			if (more > next)
401 				next = more;
402 
403 			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
404 					  addr, next, floor, ceiling);
405 		}
406 	} while (addr = next, addr != end);
407 
408 	start &= PGDIR_MASK;
409 	if (start < floor)
410 		return;
411 	if (ceiling) {
412 		ceiling &= PGDIR_MASK;
413 		if (!ceiling)
414 			return;
415 	}
416 	if (end - 1 > ceiling - 1)
417 		return;
418 
419 	pud = pud_offset(pgd, start);
420 	pgd_clear(pgd);
421 	pud_free_tlb(tlb, pud, start);
422 	mm_dec_nr_puds(tlb->mm);
423 }
424 
425 /*
426  * This function frees user-level page tables of a process.
427  */
428 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
429 			    unsigned long addr, unsigned long end,
430 			    unsigned long floor, unsigned long ceiling)
431 {
432 	pgd_t *pgd;
433 	unsigned long next;
434 
435 	/*
436 	 * Because there are a number of different possible pagetable
437 	 * layouts for hugepage ranges, we limit knowledge of how
438 	 * things should be laid out to the allocation path
439 	 * (huge_pte_alloc(), above).  Everything else works out the
440 	 * structure as it goes from information in the hugepd
441  * pointers.  That means that here we can't use the
442 	 * optimization used in the normal page free_pgd_range(), of
443 	 * checking whether we're actually covering a large enough
444 	 * range to have to do anything at the top level of the walk
445 	 * instead of at the bottom.
446 	 *
447 	 * To make sense of this, you should probably go read the big
448 	 * block comment at the top of the normal free_pgd_range(),
449 	 * too.
450 	 */
451 
452 	do {
453 		next = pgd_addr_end(addr, end);
454 		pgd = pgd_offset(tlb->mm, addr);
455 		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
456 			if (pgd_none_or_clear_bad(pgd))
457 				continue;
458 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
459 		} else {
460 			unsigned long more;
461 			/*
462 			 * Increment next by the size of the huge mapping since
463 			 * there may be more than one entry at the pgd level
464 			 * for a single hugepage, but all of them point to the
465 			 * same underlying hugepte table.
466 			 */
467 			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
468 			if (more > next)
469 				next = more;
470 
471 			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
472 					  addr, next, floor, ceiling);
473 		}
474 	} while (addr = next, addr != end);
475 }
476 
477 struct page *follow_huge_pd(struct vm_area_struct *vma,
478 			    unsigned long address, hugepd_t hpd,
479 			    int flags, int pdshift)
480 {
481 	pte_t *ptep;
482 	spinlock_t *ptl;
483 	struct page *page = NULL;
484 	unsigned long mask;
485 	int shift = hugepd_shift(hpd);
486 	struct mm_struct *mm = vma->vm_mm;
487 
488 retry:
489 	/*
490 	 * Hugepage directory entries are protected by mm->page_table_lock.
491 	 * Use this instead of huge_pte_lockptr.
492 	 */
493 	ptl = &mm->page_table_lock;
494 	spin_lock(ptl);
495 
496 	ptep = hugepte_offset(hpd, address, pdshift);
497 	if (pte_present(*ptep)) {
498 		mask = (1UL << shift) - 1;
499 		page = pte_page(*ptep);
500 		page += ((address & mask) >> PAGE_SHIFT);
501 		if (flags & FOLL_GET)
502 			get_page(page);
503 	} else {
504 		if (is_hugetlb_entry_migration(*ptep)) {
505 			spin_unlock(ptl);
506 			__migration_entry_wait(mm, ptep, ptl);
507 			goto retry;
508 		}
509 	}
510 	spin_unlock(ptl);
511 	return page;
512 }
513 
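/*
 * Return whichever comes first: the next sz-aligned boundary above addr,
 * or end.  For example, with sz = 16MB and addr = 0x01234000 the next
 * boundary is 0x02000000.
 */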
514 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
515 				      unsigned long sz)
516 {
517 	unsigned long __boundary = (addr + sz) & ~(sz-1);
518 	return (__boundary - 1 < end - 1) ? __boundary : end;
519 }
520 
521 #ifdef CONFIG_PPC_MM_SLICES
522 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
523 					unsigned long len, unsigned long pgoff,
524 					unsigned long flags)
525 {
526 	struct hstate *hstate = hstate_file(file);
527 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
528 
529 #ifdef CONFIG_PPC_RADIX_MMU
530 	if (radix_enabled())
531 		return radix__hugetlb_get_unmapped_area(file, addr, len,
532 						       pgoff, flags);
533 #endif
534 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
535 }
536 #endif
537 
538 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
539 {
540 	/* With radix we don't use slices, so derive it from the vma */
541 	if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) {
542 		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
543 
544 		return 1UL << mmu_psize_to_shift(psize);
545 	}
546 	return vma_kernel_pagesize(vma);
547 }
548 
549 static int __init add_huge_page_size(unsigned long long size)
550 {
551 	int shift = __ffs(size);
552 	int mmu_psize;
553 
554 	/* Check that it is a page size supported by the hardware and
555 	 * that it fits within pagetable and slice limits. */
556 	if (size <= PAGE_SIZE || !is_power_of_2(size))
557 		return -EINVAL;
558 
559 	mmu_psize = check_and_get_huge_psize(shift);
560 	if (mmu_psize < 0)
561 		return -EINVAL;
562 
563 	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
564 
565 	/* Return if huge page size has already been setup */
566 	if (size_to_hstate(size))
567 		return 0;
568 
569 	hugetlb_add_hstate(shift - PAGE_SHIFT);
570 
571 	return 0;
572 }
573 
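/*
 * Handle the hugepagesz= boot option, e.g. "hugepagesz=16M"; memparse()
 * accepts the usual K/M/G size suffixes.
 */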
574 static int __init hugepage_setup_sz(char *str)
575 {
576 	unsigned long long size;
577 
578 	size = memparse(str, &str);
579 
580 	if (add_huge_page_size(size) != 0) {
581 		hugetlb_bad_size();
582 		pr_err("Invalid huge page size specified (%llu)\n", size);
583 	}
584 
585 	return 1;
586 }
587 __setup("hugepagesz=", hugepage_setup_sz);
588 
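/*
 * Register an hstate for every huge page size the MMU supports and set up
 * the kmem caches used to back their hugepd/hugepte tables.
 */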
589 static int __init hugetlbpage_init(void)
590 {
591 	int psize;
592 
593 	if (hugetlb_disabled) {
594 		pr_info("HugeTLB support is disabled!\n");
595 		return 0;
596 	}
597 
598 	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
599 	    !mmu_has_feature(MMU_FTR_16M_PAGE))
600 		return -ENODEV;
601 
602 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
603 		unsigned shift;
604 		unsigned pdshift;
605 
606 		if (!mmu_psize_defs[psize].shift)
607 			continue;
608 
609 		shift = mmu_psize_to_shift(psize);
610 
611 #ifdef CONFIG_PPC_BOOK3S_64
612 		if (shift > PGDIR_SHIFT)
613 			continue;
614 		else if (shift > PUD_SHIFT)
615 			pdshift = PGDIR_SHIFT;
616 		else if (shift > PMD_SHIFT)
617 			pdshift = PUD_SHIFT;
618 		else
619 			pdshift = PMD_SHIFT;
620 #else
621 		if (shift < PUD_SHIFT)
622 			pdshift = PMD_SHIFT;
623 		else if (shift < PGDIR_SHIFT)
624 			pdshift = PUD_SHIFT;
625 		else
626 			pdshift = PGDIR_SHIFT;
627 #endif
628 
629 		if (add_huge_page_size(1ULL << shift) < 0)
630 			continue;
631 		/*
632 		 * If pdshift and shift are equal, we don't use the
633 		 * pgt cache for the hugepd.
634 		 */
635 		if (pdshift > shift && IS_ENABLED(CONFIG_PPC_8xx))
636 			pgtable_cache_add(PTE_INDEX_SIZE);
637 		else if (pdshift > shift)
638 			pgtable_cache_add(pdshift - shift);
639 		else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx))
640 			pgtable_cache_add(PTE_T_ORDER);
641 	}
642 
643 	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
644 		hugetlbpage_init_default();
645 
646 	return 0;
647 }
648 
649 arch_initcall(hugetlbpage_init);
650 
651 void flush_dcache_icache_hugepage(struct page *page)
652 {
653 	int i;
654 	void *start;
655 
656 	BUG_ON(!PageCompound(page));
657 
658 	for (i = 0; i < (1UL << compound_order(page)); i++) {
659 		if (!PageHighMem(page)) {
660 			__flush_dcache_icache(page_address(page+i));
661 		} else {
662 			start = kmap_atomic(page+i);
663 			__flush_dcache_icache(start);
664 			kunmap_atomic(start);
665 		}
666 	}
667 }
668 
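/*
 * Lockless fast-GUP helper for one hugepte: record the tail pages, take a
 * speculative reference on the compound head, then re-read the PTE.  If it
 * changed under us (e.g. a concurrent unmap), drop the references and
 * return 0 so the caller falls back to the slow path.
 */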
669 static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
670 		       unsigned long end, int write, struct page **pages, int *nr)
671 {
672 	unsigned long pte_end;
673 	struct page *head, *page;
674 	pte_t pte;
675 	int refs;
676 
677 	pte_end = (addr + sz) & ~(sz-1);
678 	if (pte_end < end)
679 		end = pte_end;
680 
681 	pte = READ_ONCE(*ptep);
682 
683 	if (!pte_access_permitted(pte, write))
684 		return 0;
685 
686 	/* hugepages are never "special" */
687 	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
688 
689 	refs = 0;
690 	head = pte_page(pte);
691 
692 	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
693 	do {
694 		VM_BUG_ON(compound_head(page) != head);
695 		pages[*nr] = page;
696 		(*nr)++;
697 		page++;
698 		refs++;
699 	} while (addr += PAGE_SIZE, addr != end);
700 
701 	if (!page_cache_add_speculative(head, refs)) {
702 		*nr -= refs;
703 		return 0;
704 	}
705 
706 	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
707 		/* Could be optimized better */
708 		*nr -= refs;
709 		while (refs--)
710 			put_page(head);
711 		return 0;
712 	}
713 
714 	return 1;
715 }
716 
717 int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift,
718 		unsigned long end, int write, struct page **pages, int *nr)
719 {
720 	pte_t *ptep;
721 	unsigned long sz = 1UL << hugepd_shift(hugepd);
722 	unsigned long next;
723 
724 	ptep = hugepte_offset(hugepd, addr, pdshift);
725 	do {
726 		next = hugepte_addr_end(addr, end, sz);
727 		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
728 			return 0;
729 	} while (ptep++, addr = next, addr != end);
730 
731 	return 1;
732 }
733