xref: /openbmc/linux/arch/powerpc/mm/hugetlbpage.c (revision a977d045)
1 /*
2  * PPC Huge TLB Page Support for Kernel.
3  *
4  * Copyright (C) 2003 David Gibson, IBM Corporation.
5  * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
6  *
7  * Based on the IA-32 version:
8  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/io.h>
13 #include <linux/slab.h>
14 #include <linux/hugetlb.h>
15 #include <linux/export.h>
16 #include <linux/of_fdt.h>
17 #include <linux/memblock.h>
18 #include <linux/bootmem.h>
19 #include <linux/moduleparam.h>
20 #include <linux/swap.h>
21 #include <linux/swapops.h>
22 #include <asm/pgtable.h>
23 #include <asm/pgalloc.h>
24 #include <asm/tlb.h>
25 #include <asm/setup.h>
26 #include <asm/hugetlb.h>
27 
28 #ifdef CONFIG_HUGETLB_PAGE
29 
30 #define PAGE_SHIFT_64K	16
31 #define PAGE_SHIFT_512K	19
32 #define PAGE_SHIFT_8M	23
33 #define PAGE_SHIFT_16M	24
34 #define PAGE_SHIFT_16G	34
35 
36 unsigned int HPAGE_SHIFT;
37 EXPORT_SYMBOL(HPAGE_SHIFT);
38 
39 /*
40  * Tracks gpages after the device tree is scanned and before the
41  * huge_boot_pages list is ready.  On non-Freescale implementations, this is
42  * just used to track 16G pages and so is a single array.  FSL-based
43  * implementations may have more than one gpage size, so we need multiple
44  * arrays.
45  */
46 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
47 #define MAX_NUMBER_GPAGES	128
48 struct psize_gpages {
49 	u64 gpage_list[MAX_NUMBER_GPAGES];
50 	unsigned int nr_gpages;
51 };
52 static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
53 #else
54 #define MAX_NUMBER_GPAGES	1024
55 static u64 gpage_freearray[MAX_NUMBER_GPAGES];
56 static unsigned nr_gpages;
57 #endif
58 
59 #define hugepd_none(hpd)	(hpd_val(hpd) == 0)
60 
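/*
 * Look up the pte backing a hugetlb mapping at @addr, or return NULL
 * if nothing is mapped there.  @sz is not consulted; the size is
 * implied by the entry that is found.
 */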
61 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
62 {
63 	/* Only called for hugetlbfs pages, hence can ignore THP */
64 	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
65 }
66 
67 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
68 			   unsigned long address, unsigned pdshift, unsigned pshift)
69 {
70 	struct kmem_cache *cachep;
71 	pte_t *new;
72 	int i;
73 	int num_hugepd;
74 
75 	if (pshift >= pdshift) {
76 		cachep = hugepte_cache;
77 		num_hugepd = 1 << (pshift - pdshift);
78 	} else {
79 		cachep = PGT_CACHE(pdshift - pshift);
80 		num_hugepd = 1;
81 	}
82 
83 	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
84 
85 	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
86 	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
87 
88 	if (!new)
89 		return -ENOMEM;
90 
91 	/*
92 	 * Make sure other cpus find the hugepd set only after a
93 	 * properly initialized page table is visible to them.
94 	 * For more details look for comment in __pte_alloc().
95 	 */
96 	smp_wmb();
97 
98 	spin_lock(&mm->page_table_lock);
99 
100 	/*
101 	 * We have multiple higher-level entries that point to the same
102 	 * actual pte location.  Fill in each as we go and backtrack on error.
103 	 * We need all of these so the DTLB pgtable walk code can find the
104 	 * right higher-level entry without knowing if it's a hugepage or not.
105 	 */
106 	for (i = 0; i < num_hugepd; i++, hpdp++) {
107 		if (unlikely(!hugepd_none(*hpdp)))
108 			break;
109 		else {
110 #ifdef CONFIG_PPC_BOOK3S_64
111 			*hpdp = __hugepd(__pa(new) |
112 					 (shift_to_mmu_psize(pshift) << 2));
113 #elif defined(CONFIG_PPC_8xx)
114 			*hpdp = __hugepd(__pa(new) |
115 					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
116 					  _PMD_PAGE_512K) | _PMD_PRESENT);
117 #else
118 			/* We use the old format for PPC_FSL_BOOK3E */
119 			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
120 #endif
121 		}
122 	}
123 	/* If we bailed out of the loop early, an entry was already set; clean up */
124 	if (i < num_hugepd) {
125 		for (i = i - 1 ; i >= 0; i--, hpdp--)
126 			*hpdp = __hugepd(0);
127 		kmem_cache_free(cachep, new);
128 	}
129 	spin_unlock(&mm->page_table_lock);
130 	return 0;
131 }
132 
133 /*
134  * These macros define how to determine which level of the page table holds
135  * the hpdp.
136  */
137 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
138 #define HUGEPD_PGD_SHIFT PGDIR_SHIFT
139 #define HUGEPD_PUD_SHIFT PUD_SHIFT
140 #else
141 #define HUGEPD_PGD_SHIFT PUD_SHIFT
142 #define HUGEPD_PUD_SHIFT PMD_SHIFT
143 #endif
144 
145 /*
146  * For now we do this placement change only for BOOK3S 64. It could
147  * possibly work on other subarchs as well.
148  */
149 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
150 {
151 	pgd_t *pg;
152 	pud_t *pu;
153 	pmd_t *pm;
154 	hugepd_t *hpdp = NULL;
155 	unsigned pshift = __ffs(sz);
156 	unsigned pdshift = PGDIR_SHIFT;
157 
158 	addr &= ~(sz-1);
159 	pg = pgd_offset(mm, addr);
160 
161 #ifdef CONFIG_PPC_BOOK3S_64
162 	if (pshift == PGDIR_SHIFT)
163 		/* 16GB huge page */
164 		return (pte_t *) pg;
165 	else if (pshift > PUD_SHIFT)
166 		/*
167 		 * We need to use hugepd table
168 		 */
169 		hpdp = (hugepd_t *)pg;
170 	else {
171 		pdshift = PUD_SHIFT;
172 		pu = pud_alloc(mm, pg, addr);
173 		if (pshift == PUD_SHIFT)
174 			return (pte_t *)pu;
175 		else if (pshift > PMD_SHIFT)
176 			hpdp = (hugepd_t *)pu;
177 		else {
178 			pdshift = PMD_SHIFT;
179 			pm = pmd_alloc(mm, pu, addr);
180 			if (pshift == PMD_SHIFT)
181 				/* 16MB hugepage */
182 				return (pte_t *)pm;
183 			else
184 				hpdp = (hugepd_t *)pm;
185 		}
186 	}
187 #else
188 	if (pshift >= HUGEPD_PGD_SHIFT) {
189 		hpdp = (hugepd_t *)pg;
190 	} else {
191 		pdshift = PUD_SHIFT;
192 		pu = pud_alloc(mm, pg, addr);
193 		if (pshift >= HUGEPD_PUD_SHIFT) {
194 			hpdp = (hugepd_t *)pu;
195 		} else {
196 			pdshift = PMD_SHIFT;
197 			pm = pmd_alloc(mm, pu, addr);
198 			hpdp = (hugepd_t *)pm;
199 		}
200 	}
201 #endif
202 	if (!hpdp)
203 		return NULL;
204 
205 	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
206 
207 	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
208 		return NULL;
209 
210 	return hugepte_offset(*hpdp, addr, pdshift);
211 }
212 
213 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
214 /* Build list of addresses of gigantic pages.  This function is used in early
215  * boot before the buddy allocator is set up.
216  */
217 void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
218 {
219 	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
220 	int i;
221 
222 	if (addr == 0)
223 		return;
224 
225 	gpage_freearray[idx].nr_gpages = number_of_pages;
226 
227 	for (i = 0; i < number_of_pages; i++) {
228 		gpage_freearray[idx].gpage_list[i] = addr;
229 		addr += page_size;
230 	}
231 }
232 
233 /*
234  * Moves the gigantic page addresses from the temporary list to the
235  * huge_boot_pages list.
236  */
237 int alloc_bootmem_huge_page(struct hstate *hstate)
238 {
239 	struct huge_bootmem_page *m;
240 	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
241 	int nr_gpages = gpage_freearray[idx].nr_gpages;
242 
243 	if (nr_gpages == 0)
244 		return 0;
245 
246 #ifdef CONFIG_HIGHMEM
247 	/*
248 	 * If gpages can be in highmem we can't use the trick of storing the
249 	 * data structure in the page; allocate space for it instead.
250 	 */
251 	m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
252 	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
253 #else
254 	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
255 #endif
256 
257 	list_add(&m->list, &huge_boot_pages);
258 	gpage_freearray[idx].nr_gpages = nr_gpages;
259 	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
260 	m->hstate = hstate;
261 
262 	return 1;
263 }
264 /*
265  * Scan the command line hugepagesz= options for gigantic pages; store those in
266  * a list that we use to allocate the memory once all options are parsed.
267  */
268 
269 unsigned long gpage_npages[MMU_PAGE_COUNT];
270 
271 static int __init do_gpage_early_setup(char *param, char *val,
272 				       const char *unused, void *arg)
273 {
274 	static phys_addr_t size;
275 	unsigned long npages;
276 
277 	/*
278 	 * The hugepagesz and hugepages cmdline options are interleaved.  We
279 	 * use the size variable to keep track of whether or not this was done
280 	 * properly and skip over instances where it is incorrect.  Other
281 	 * command-line parsing code will issue warnings, so we don't need to.
282 	 *
283 	 */
284 	if ((strcmp(param, "default_hugepagesz") == 0) ||
285 	    (strcmp(param, "hugepagesz") == 0)) {
286 		size = memparse(val, NULL);
287 	} else if (strcmp(param, "hugepages") == 0) {
288 		if (size != 0) {
289 			if (sscanf(val, "%lu", &npages) <= 0)
290 				npages = 0;
291 			if (npages > MAX_NUMBER_GPAGES) {
292 				pr_warn("MMU: %lu pages requested for page "
293 #ifdef CONFIG_PHYS_ADDR_T_64BIT
294 					"size %llu KB, limiting to "
295 #else
296 					"size %u KB, limiting to "
297 #endif
298 					__stringify(MAX_NUMBER_GPAGES) "\n",
299 					npages, size / 1024);
300 				npages = MAX_NUMBER_GPAGES;
301 			}
302 			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
303 			size = 0;
304 		}
305 	}
306 	return 0;
307 }
308 
309 
310 /*
311  * This function allocates physical space for pages that are larger than the
312  * buddy allocator can handle.  We want to allocate these in highmem because
313  * the amount of lowmem is limited.  This means that this function MUST be
314  * called before lowmem_end_addr is set up in MMU_init() in order for the
315  * memblock allocator to grab highmem.
316  */
317 void __init reserve_hugetlb_gpages(void)
318 {
319 	static __initdata char cmdline[COMMAND_LINE_SIZE];
320 	phys_addr_t size, base;
321 	int i;
322 
323 	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
324 	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
325 			NULL, &do_gpage_early_setup);
326 
327 	/*
328 	 * Walk gpage list in reverse, allocating larger page sizes first.
329 	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
330 	 * When we reach the point in the list where pages are no longer
331 	 * considered gpages, we're done.
332 	 */
333 	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
334 		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
335 			continue;
336 		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
337 			break;
338 
339 		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
340 		base = memblock_alloc_base(size * gpage_npages[i], size,
341 					   MEMBLOCK_ALLOC_ANYWHERE);
342 		add_gpage(base, size, gpage_npages[i]);
343 	}
344 }
345 
346 #else /* !PPC_FSL_BOOK3E */
347 
348 /* Build list of addresses of gigantic pages.  This function is used in early
349  * boot before the buddy allocator is set up.
350  */
351 void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
352 {
353 	if (!addr)
354 		return;
355 	while (number_of_pages > 0) {
356 		gpage_freearray[nr_gpages] = addr;
357 		nr_gpages++;
358 		number_of_pages--;
359 		addr += page_size;
360 	}
361 }
362 
363 /* Moves the gigantic page addresses from the temporary list to the
364  * huge_boot_pages list.
365  */
366 int alloc_bootmem_huge_page(struct hstate *hstate)
367 {
368 	struct huge_bootmem_page *m;
369 	if (nr_gpages == 0)
370 		return 0;
371 	m = phys_to_virt(gpage_freearray[--nr_gpages]);
372 	gpage_freearray[nr_gpages] = 0;
373 	list_add(&m->list, &huge_boot_pages);
374 	m->hstate = hstate;
375 	return 1;
376 }
377 #endif
378 
379 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
380 #define HUGEPD_FREELIST_SIZE \
381 	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
382 
383 struct hugepd_freelist {
384 	struct rcu_head	rcu;
385 	unsigned int index;
386 	void *ptes[0];
387 };
388 
389 static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
390 
391 static void hugepd_free_rcu_callback(struct rcu_head *head)
392 {
393 	struct hugepd_freelist *batch =
394 		container_of(head, struct hugepd_freelist, rcu);
395 	unsigned int i;
396 
397 	for (i = 0; i < batch->index; i++)
398 		kmem_cache_free(hugepte_cache, batch->ptes[i]);
399 
400 	free_page((unsigned long)batch);
401 }
402 
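/*
 * Free a hugepte table during page table teardown.  If the mm is only
 * in use on the local CPU the table is freed immediately; otherwise it
 * is queued on a per-cpu batch that is handed to call_rcu_sched() once
 * full, so that lockless walkers running with interrupts disabled do
 * not see it freed under them.
 */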
403 static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
404 {
405 	struct hugepd_freelist **batchp;
406 
407 	batchp = &get_cpu_var(hugepd_freelist_cur);
408 
409 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
410 	    cpumask_equal(mm_cpumask(tlb->mm),
411 			  cpumask_of(smp_processor_id()))) {
412 		kmem_cache_free(hugepte_cache, hugepte);
413 		put_cpu_var(hugepd_freelist_cur);
414 		return;
415 	}
416 
417 	if (*batchp == NULL) {
418 		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
419 		(*batchp)->index = 0;
420 	}
421 
422 	(*batchp)->ptes[(*batchp)->index++] = hugepte;
423 	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
424 		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
425 		*batchp = NULL;
426 	}
427 	put_cpu_var(hugepd_freelist_cur);
428 }
429 #else
430 static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
431 #endif
432 
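/*
 * Clear the hugepd entry (or entries, where several point at the same
 * hugepte table) and free the table itself, but only if the span
 * covered by the entry lies entirely within the floor/ceiling limits
 * of the region being torn down.
 */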
433 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
434 			      unsigned long start, unsigned long end,
435 			      unsigned long floor, unsigned long ceiling)
436 {
437 	pte_t *hugepte = hugepd_page(*hpdp);
438 	int i;
439 
440 	unsigned long pdmask = ~((1UL << pdshift) - 1);
441 	unsigned int num_hugepd = 1;
442 	unsigned int shift = hugepd_shift(*hpdp);
443 
444 	/* Note: On fsl the hpdp may be the first of several */
445 	if (shift > pdshift)
446 		num_hugepd = 1 << (shift - pdshift);
447 
448 	start &= pdmask;
449 	if (start < floor)
450 		return;
451 	if (ceiling) {
452 		ceiling &= pdmask;
453 		if (!ceiling)
454 			return;
455 	}
456 	if (end - 1 > ceiling - 1)
457 		return;
458 
459 	for (i = 0; i < num_hugepd; i++, hpdp++)
460 		*hpdp = __hugepd(0);
461 
462 	if (shift >= pdshift)
463 		hugepd_free(tlb, hugepte);
464 	else
465 		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
466 }
467 
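/*
 * Walk the pmd level of a region being freed, passing hugepd entries
 * to free_hugepd_range() and finally freeing the pmd table itself if
 * the whole pud span falls within the floor/ceiling limits.
 */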
468 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
469 				   unsigned long addr, unsigned long end,
470 				   unsigned long floor, unsigned long ceiling)
471 {
472 	pmd_t *pmd;
473 	unsigned long next;
474 	unsigned long start;
475 
476 	start = addr;
477 	do {
478 		unsigned long more;
479 
480 		pmd = pmd_offset(pud, addr);
481 		next = pmd_addr_end(addr, end);
482 		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
483 			/*
484 			 * If it is not a hugepd pointer, we should already
485 			 * find it cleared.
486 			 */
487 			WARN_ON(!pmd_none_or_clear_bad(pmd));
488 			continue;
489 		}
490 		/*
491 		 * Increment next by the size of the huge mapping since
492 		 * there may be more than one entry at this level for a
493 		 * single hugepage, but all of them point to
494 		 * the same kmem cache that holds the hugepte.
495 		 */
496 		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
497 		if (more > next)
498 			next = more;
499 
500 		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
501 				  addr, next, floor, ceiling);
502 	} while (addr = next, addr != end);
503 
504 	start &= PUD_MASK;
505 	if (start < floor)
506 		return;
507 	if (ceiling) {
508 		ceiling &= PUD_MASK;
509 		if (!ceiling)
510 			return;
511 	}
512 	if (end - 1 > ceiling - 1)
513 		return;
514 
515 	pmd = pmd_offset(pud, start);
516 	pud_clear(pud);
517 	pmd_free_tlb(tlb, pmd, start);
518 	mm_dec_nr_pmds(tlb->mm);
519 }
520 
521 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
522 				   unsigned long addr, unsigned long end,
523 				   unsigned long floor, unsigned long ceiling)
524 {
525 	pud_t *pud;
526 	unsigned long next;
527 	unsigned long start;
528 
529 	start = addr;
530 	do {
531 		pud = pud_offset(pgd, addr);
532 		next = pud_addr_end(addr, end);
533 		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
534 			if (pud_none_or_clear_bad(pud))
535 				continue;
536 			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
537 					       ceiling);
538 		} else {
539 			unsigned long more;
540 			/*
541 			 * Increment next by the size of the huge mapping since
542 			 * there may be more than one entry at this level for a
543 			 * single hugepage, but all of them point to
544 			 * the same kmem cache that holds the hugepte.
545 			 */
546 			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
547 			if (more > next)
548 				next = more;
549 
550 			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
551 					  addr, next, floor, ceiling);
552 		}
553 	} while (addr = next, addr != end);
554 
555 	start &= PGDIR_MASK;
556 	if (start < floor)
557 		return;
558 	if (ceiling) {
559 		ceiling &= PGDIR_MASK;
560 		if (!ceiling)
561 			return;
562 	}
563 	if (end - 1 > ceiling - 1)
564 		return;
565 
566 	pud = pud_offset(pgd, start);
567 	pgd_clear(pgd);
568 	pud_free_tlb(tlb, pud, start);
569 }
570 
571 /*
572  * This function frees user-level page tables of a process.
573  */
574 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
575 			    unsigned long addr, unsigned long end,
576 			    unsigned long floor, unsigned long ceiling)
577 {
578 	pgd_t *pgd;
579 	unsigned long next;
580 
581 	/*
582 	 * Because there are a number of different possible pagetable
583 	 * layouts for hugepage ranges, we limit knowledge of how
584 	 * things should be laid out to the allocation path
585 	 * (huge_pte_alloc(), above).  Everything else works out the
586 	 * structure as it goes from information in the hugepd
587 	 * pointers.  That means that we can't here use the
588 	 * optimization used in the normal page free_pgd_range(), of
589 	 * checking whether we're actually covering a large enough
590 	 * range to have to do anything at the top level of the walk
591 	 * instead of at the bottom.
592 	 *
593 	 * To make sense of this, you should probably go read the big
594 	 * block comment at the top of the normal free_pgd_range(),
595 	 * too.
596 	 */
597 
598 	do {
599 		next = pgd_addr_end(addr, end);
600 		pgd = pgd_offset(tlb->mm, addr);
601 		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
602 			if (pgd_none_or_clear_bad(pgd))
603 				continue;
604 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
605 		} else {
606 			unsigned long more;
607 			/*
608 			 * Increment next by the size of the huge mapping since
609 			 * there may be more than one entry at the pgd level
610 			 * for a single hugepage, but all of them point to the
611 			 * same kmem cache that holds the hugepte.
612 			 */
613 			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
614 			if (more > next)
615 				next = more;
616 
617 			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
618 					  addr, next, floor, ceiling);
619 		}
620 	} while (addr = next, addr != end);
621 }
622 
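/*
 * Return the page backing @address within the hugepd @hpd, taking a
 * reference on it when FOLL_GET is set.  If the entry is a hugetlb
 * migration entry, wait for the migration to finish and retry.
 */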
623 struct page *follow_huge_pd(struct vm_area_struct *vma,
624 			    unsigned long address, hugepd_t hpd,
625 			    int flags, int pdshift)
626 {
627 	pte_t *ptep;
628 	spinlock_t *ptl;
629 	struct page *page = NULL;
630 	unsigned long mask;
631 	int shift = hugepd_shift(hpd);
632 	struct mm_struct *mm = vma->vm_mm;
633 
634 retry:
635 	ptl = &mm->page_table_lock;
636 	spin_lock(ptl);
637 
638 	ptep = hugepte_offset(hpd, address, pdshift);
639 	if (pte_present(*ptep)) {
640 		mask = (1UL << shift) - 1;
641 		page = pte_page(*ptep);
642 		page += ((address & mask) >> PAGE_SHIFT);
643 		if (flags & FOLL_GET)
644 			get_page(page);
645 	} else {
646 		if (is_hugetlb_entry_migration(*ptep)) {
647 			spin_unlock(ptl);
648 			__migration_entry_wait(mm, ptep, ptl);
649 			goto retry;
650 		}
651 	}
652 	spin_unlock(ptl);
653 	return page;
654 }
655 
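/* Clamp the next sz-aligned boundary after @addr to @end. */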
656 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
657 				      unsigned long sz)
658 {
659 	unsigned long __boundary = (addr + sz) & ~(sz-1);
660 	return (__boundary - 1 < end - 1) ? __boundary : end;
661 }
662 
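/*
 * Walk every hugepte under a hugepd entry for [@addr, @end) and take
 * page references via gup_hugepte().  Returns 0 as soon as one of them
 * fails, 1 if all succeed.
 */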
663 int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
664 		unsigned long end, int write, struct page **pages, int *nr)
665 {
666 	pte_t *ptep;
667 	unsigned long sz = 1UL << hugepd_shift(hugepd);
668 	unsigned long next;
669 
670 	ptep = hugepte_offset(hugepd, addr, pdshift);
671 	do {
672 		next = hugepte_addr_end(addr, end, sz);
673 		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
674 			return 0;
675 	} while (ptep++, addr = next, addr != end);
676 
677 	return 1;
678 }
679 
680 #ifdef CONFIG_PPC_MM_SLICES
681 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
682 					unsigned long len, unsigned long pgoff,
683 					unsigned long flags)
684 {
685 	struct hstate *hstate = hstate_file(file);
686 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
687 
688 	if (radix_enabled())
689 		return radix__hugetlb_get_unmapped_area(file, addr, len,
690 						       pgoff, flags);
691 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
692 }
693 #endif
694 
695 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
696 {
697 #ifdef CONFIG_PPC_MM_SLICES
698 	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
699 	/* With radix we don't use slices, so derive it from the vma */
700 	if (!radix_enabled())
701 		return 1UL << mmu_psize_to_shift(psize);
702 #endif
703 	if (!is_vm_hugetlb_page(vma))
704 		return PAGE_SIZE;
705 
706 	return huge_page_size(hstate_vma(vma));
707 }
708 
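/* True for powers of four (4, 16, 64, ...): a power of two whose log2 is even. */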
709 static inline bool is_power_of_4(unsigned long x)
710 {
711 	if (is_power_of_2(x))
712 		return (__ilog2(x) % 2) ? false : true;
713 	return false;
714 }
715 
716 static int __init add_huge_page_size(unsigned long long size)
717 {
718 	int shift = __ffs(size);
719 	int mmu_psize;
720 
721 	/* Check that it is a page size supported by the hardware and
722 	 * that it fits within pagetable and slice limits. */
723 	if (size <= PAGE_SIZE)
724 		return -EINVAL;
725 #if defined(CONFIG_PPC_FSL_BOOK3E)
726 	if (!is_power_of_4(size))
727 		return -EINVAL;
728 #elif !defined(CONFIG_PPC_8xx)
729 	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
730 		return -EINVAL;
731 #endif
732 
733 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
734 		return -EINVAL;
735 
736 #ifdef CONFIG_PPC_BOOK3S_64
737 	/*
738 	 * We need to make sure that for different page sizes reported by
739 	 * firmware we only add hugetlb support for page sizes that can be
740 	 * supported by linux page table layout.
741 	 * For now we have
742 	 * Radix: 2M
743 	 * Hash: 16M and 16G
744 	 */
745 	if (radix_enabled()) {
746 		if (mmu_psize != MMU_PAGE_2M) {
747 			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
748 			    (mmu_psize != MMU_PAGE_1G))
749 				return -EINVAL;
750 		}
751 	} else {
752 		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
753 			return -EINVAL;
754 	}
755 #endif
756 
757 	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
758 
759 	/* Return if huge page size has already been setup */
760 	if (size_to_hstate(size))
761 		return 0;
762 
763 	hugetlb_add_hstate(shift - PAGE_SHIFT);
764 
765 	return 0;
766 }
767 
768 static int __init hugepage_setup_sz(char *str)
769 {
770 	unsigned long long size;
771 
772 	size = memparse(str, &str);
773 
774 	if (add_huge_page_size(size) != 0) {
775 		hugetlb_bad_size();
776 		pr_err("Invalid huge page size specified(%llu)\n", size);
777 	}
778 
779 	return 1;
780 }
781 __setup("hugepagesz=", hugepage_setup_sz);
782 
783 struct kmem_cache *hugepte_cache;
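/*
 * Register an hstate (and the page table caches needed to back it) for
 * every huge page size the MMU supports, then choose a default
 * HPAGE_SHIFT.
 */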
784 static int __init hugetlbpage_init(void)
785 {
786 	int psize;
787 
788 #if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
789 	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
790 		return -ENODEV;
791 #endif
792 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
793 		unsigned shift;
794 		unsigned pdshift;
795 
796 		if (!mmu_psize_defs[psize].shift)
797 			continue;
798 
799 		shift = mmu_psize_to_shift(psize);
800 
801 		if (add_huge_page_size(1ULL << shift) < 0)
802 			continue;
803 
804 		if (shift < HUGEPD_PUD_SHIFT)
805 			pdshift = PMD_SHIFT;
806 		else if (shift < HUGEPD_PGD_SHIFT)
807 			pdshift = PUD_SHIFT;
808 		else
809 			pdshift = PGDIR_SHIFT;
810 		/*
811 		 * If pdshift and shift are the same, we don't use the
812 		 * pgt cache for the hugepd.
813 		 */
814 		if (pdshift > shift)
815 			pgtable_cache_add(pdshift - shift, NULL);
816 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
817 		else if (!hugepte_cache) {
818 			/*
819 			 * Create a kmem cache for hugeptes.  The bottom bits of
820 			 * the hugepd pointer encode the page size, so align the
821 			 * hugepte tables to keep those bits clear.
822 			 */
823 			hugepte_cache = kmem_cache_create("hugepte-cache",
824 							  sizeof(pte_t),
825 							  HUGEPD_SHIFT_MASK + 1,
826 							  0, NULL);
827 			if (hugepte_cache == NULL)
828 				panic("%s: Unable to create kmem cache "
829 				      "for hugeptes\n", __func__);
830 
831 		}
832 #endif
833 	}
834 
835 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
836 	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
837 	if (mmu_psize_defs[MMU_PAGE_4M].shift)
838 		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
839 	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
840 		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
841 #else
842 	/* Set default large page size. Currently, we pick 16M, 1M or 2M,
843 	 * depending on what is available
844 	 */
845 	if (mmu_psize_defs[MMU_PAGE_16M].shift)
846 		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
847 	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
848 		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
849 	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
850 		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
851 #endif
852 	return 0;
853 }
854 
855 arch_initcall(hugetlbpage_init);
856 
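/*
 * Flush the data and instruction caches for every sub-page of a
 * compound huge page, mapping highmem sub-pages with kmap_atomic() as
 * needed.
 */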
857 void flush_dcache_icache_hugepage(struct page *page)
858 {
859 	int i;
860 	void *start;
861 
862 	BUG_ON(!PageCompound(page));
863 
864 	for (i = 0; i < (1UL << compound_order(page)); i++) {
865 		if (!PageHighMem(page)) {
866 			__flush_dcache_icache(page_address(page+i));
867 		} else {
868 			start = kmap_atomic(page+i);
869 			__flush_dcache_icache(start);
870 			kunmap_atomic(start);
871 		}
872 	}
873 }
874 
875 #endif /* CONFIG_HUGETLB_PAGE */
876 
877 /*
878  * We have 4 cases for pgds and pmds:
879  * (1) invalid (all zeroes)
880  * (2) pointer to next table, as normal; bottom 6 bits == 0
881  * (3) leaf pte for huge page, _PAGE_PTE set
882  * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
883  *
884  * So long as we atomically load page table pointers we are safe against teardown,
885  * and we can follow the address down to the page and take a ref on it.
886  * This function needs to be called with interrupts disabled. We use this variant
887  * when we have MSR[EE] = 0 but paca->soft_enabled = 1
888  */
889 
890 pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
891 				   bool *is_thp, unsigned *shift)
892 {
893 	pgd_t pgd, *pgdp;
894 	pud_t pud, *pudp;
895 	pmd_t pmd, *pmdp;
896 	pte_t *ret_pte;
897 	hugepd_t *hpdp = NULL;
898 	unsigned pdshift = PGDIR_SHIFT;
899 
900 	if (shift)
901 		*shift = 0;
902 
903 	if (is_thp)
904 		*is_thp = false;
905 
906 	pgdp = pgdir + pgd_index(ea);
907 	pgd  = READ_ONCE(*pgdp);
908 	/*
909 	 * Always operate on the local stack value. This makes sure the
910 	 * value doesn't get updated by a parallel THP split/collapse,
911 	 * page fault or page unmap. The returned pte_t * is still not
912 	 * stable, so the caller must re-check it for the above conditions.
913 	 */
914 	if (pgd_none(pgd))
915 		return NULL;
916 	else if (pgd_huge(pgd)) {
917 		ret_pte = (pte_t *) pgdp;
918 		goto out;
919 	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
920 		hpdp = (hugepd_t *)&pgd;
921 	else {
922 		/*
923 		 * Even if we end up with an unmap, the pgtable will not
924 		 * be freed, because we do an RCU free and here we have
925 		 * interrupts disabled
926 		 */
927 		pdshift = PUD_SHIFT;
928 		pudp = pud_offset(&pgd, ea);
929 		pud  = READ_ONCE(*pudp);
930 
931 		if (pud_none(pud))
932 			return NULL;
933 		else if (pud_huge(pud)) {
934 			ret_pte = (pte_t *) pudp;
935 			goto out;
936 		} else if (is_hugepd(__hugepd(pud_val(pud))))
937 			hpdp = (hugepd_t *)&pud;
938 		else {
939 			pdshift = PMD_SHIFT;
940 			pmdp = pmd_offset(&pud, ea);
941 			pmd  = READ_ONCE(*pmdp);
942 			/*
943 			 * A hugepage collapse is captured by pmd_none, because
944 			 * it marks the pmd none and does a hpte invalidate.
945 			 */
946 			if (pmd_none(pmd))
947 				return NULL;
948 
949 			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
950 				if (is_thp)
951 					*is_thp = true;
952 				ret_pte = (pte_t *) pmdp;
953 				goto out;
954 			}
955 
956 			if (pmd_huge(pmd)) {
957 				ret_pte = (pte_t *) pmdp;
958 				goto out;
959 			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
960 				hpdp = (hugepd_t *)&pmd;
961 			else
962 				return pte_offset_kernel(&pmd, ea);
963 		}
964 	}
965 	if (!hpdp)
966 		return NULL;
967 
968 	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
969 	pdshift = hugepd_shift(*hpdp);
970 out:
971 	if (shift)
972 		*shift = pdshift;
973 	return ret_pte;
974 }
975 EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
976 
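/*
 * Lockless reference grab for one huge pte, used on the fast GUP path:
 * check access permissions, speculatively take references on every
 * sub-page in [@addr, @end), then re-read the pte and back the
 * references out if it changed under us.
 */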
977 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
978 		unsigned long end, int write, struct page **pages, int *nr)
979 {
980 	unsigned long mask;
981 	unsigned long pte_end;
982 	struct page *head, *page;
983 	pte_t pte;
984 	int refs;
985 
986 	pte_end = (addr + sz) & ~(sz-1);
987 	if (pte_end < end)
988 		end = pte_end;
989 
990 	pte = READ_ONCE(*ptep);
991 	mask = _PAGE_PRESENT | _PAGE_READ;
992 
993 	/*
994 	 * On some CPUs like the 8xx, _PAGE_RW (and hence _PAGE_WRITE) is defined
995 	 * as 0, and _PAGE_RO has to be set when a page is not writable.
996 	 */
997 	if (write)
998 		mask |= _PAGE_WRITE;
999 	else
1000 		mask |= _PAGE_RO;
1001 
1002 	if ((pte_val(pte) & mask) != mask)
1003 		return 0;
1004 
1005 	/* hugepages are never "special" */
1006 	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1007 
1008 	refs = 0;
1009 	head = pte_page(pte);
1010 
1011 	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1012 	do {
1013 		VM_BUG_ON(compound_head(page) != head);
1014 		pages[*nr] = page;
1015 		(*nr)++;
1016 		page++;
1017 		refs++;
1018 	} while (addr += PAGE_SIZE, addr != end);
1019 
1020 	if (!page_cache_add_speculative(head, refs)) {
1021 		*nr -= refs;
1022 		return 0;
1023 	}
1024 
1025 	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1026 		/* Could be optimized better */
1027 		*nr -= refs;
1028 		while (refs--)
1029 			put_page(head);
1030 		return 0;
1031 	}
1032 
1033 	return 1;
1034 }
1035