xref: /openbmc/linux/arch/powerpc/mm/hugetlbpage.c (revision 545e4006)
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
#define MAX_NUMBER_GPAGES	1024

/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;

/* Array of valid huge page sizes - a non-zero value (hugepte_shift) is
 * stored for each huge page size that is valid.
 */
unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */

#define hugepte_shift			mmu_huge_psizes
#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])

#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
						+ hugepte_shift[psize])
#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
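
/* A hugepd points to a table of PTRS_PER_HUGEPTE(psize) huge PTEs, one
 * per huge page, so a single hugepte table covers HUGEPD_SIZE(psize)
 * bytes of address space (the huge page size shifted left by
 * hugepte_shift[psize]), and HUGEPD_MASK(psize) rounds an address down
 * to the start of the range covered by one such table. */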

/* Subtract one from array size because we don't need a cache for 4K since
 * it is not a huge page size */
#define huge_pgtable_cache(psize)	(pgtable_cache[HUGEPTE_CACHE_NUM \
							+ psize-1])
#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])

static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
	"unused_4K", "hugepte_cache_64K", "unused_64K_AP",
	"hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
};

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK	0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)

static inline int shift_to_mmu_psize(unsigned int shift)
{
	switch (shift) {
#ifndef CONFIG_PPC_64K_PAGES
	case PAGE_SHIFT_64K:
	    return MMU_PAGE_64K;
#endif
	case PAGE_SHIFT_16M:
	    return MMU_PAGE_16M;
	case PAGE_SHIFT_16G:
	    return MMU_PAGE_16G;
	}
	return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	if (mmu_psize_defs[mmu_psize].shift)
		return mmu_psize_defs[mmu_psize].shift;
	BUG();
}

static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!(hpd.pd & HUGEPD_OK));
	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

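/* Locate the huge PTE for @addr within the hugepte table a hugepd
 * points to: the index is the huge-page number of @addr modulo the
 * number of entries in one hugepte table. */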
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
				    struct hstate *hstate)
{
	unsigned int shift = huge_page_shift(hstate);
	int psize = shift_to_mmu_psize(shift);
	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}

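/* Allocate a hugepte table and link it into the hugepd slot.  The
 * allocation happens outside the page table lock; if another thread
 * populated the slot in the meantime we free our table and use theirs,
 * much like the generic pte_alloc() path. */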
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int psize)
{
	pte_t *new = kmem_cache_alloc(huge_pgtable_cache(psize),
				      GFP_KERNEL|__GFP_REPEAT);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		kmem_cache_free(huge_pgtable_cache(psize), new);
	else
		hpdp->pd = (unsigned long)new | HUGEPD_OK;
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/* Base page size affects how we walk hugetlb page tables */
#ifdef CONFIG_PPC_64K_PAGES
#define hpmd_offset(pud, addr, h)	pmd_offset(pud, addr)
#define hpmd_alloc(mm, pud, addr, h)	pmd_alloc(mm, pud, addr)
#else
static inline
pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
{
	if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
		return pmd_offset(pud, addr);
	else
		return (pmd_t *) pud;
}
static inline
pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
		  struct hstate *hstate)
{
	if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
		return pmd_alloc(mm, pud, addr);
	else
		return (pmd_t *) pud;
}
#endif
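
/* How far the walkers below descend before hitting a hugepd depends on
 * the base page size.  With a 64K base page the walk always goes
 * through a real PMD; with a 4K base page only 64K huge pages have a
 * PMD level, and for the larger sizes the PUD entry itself is treated
 * as the hugepd, so these helpers just hand back the PUD pointer. */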

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
	unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}


/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;

	unsigned int psize;
	unsigned int shift;
	unsigned long sz;
	struct hstate *hstate;
	psize = get_slice_psize(mm, addr);
	shift = mmu_psize_to_shift(psize);
	sz = ((1UL) << shift);
	hstate = size_to_hstate(sz);

	addr &= hstate->mask;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
			pm = hpmd_offset(pu, addr, hstate);
			if (!pmd_none(*pm))
				return hugepte_offset((hugepd_t *)pm, addr,
						      hstate);
		}
	}

	return NULL;
}

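/* Allocate (if necessary) and return the huge PTE for @addr.  The
 * normal pud/pmd levels above the hugepd are allocated with the regular
 * helpers; the hugepte table itself comes from __hugepte_alloc(). */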
pte_t *huge_pte_alloc(struct mm_struct *mm,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	struct hstate *hstate;
	unsigned int psize;
	hstate = size_to_hstate(sz);

	psize = get_slice_psize(mm, addr);
	BUG_ON(!mmu_huge_psizes[psize]);

	addr &= hstate->mask;

	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
		pm = hpmd_alloc(mm, pu, addr, hstate);
		if (pm)
			hpdp = (hugepd_t *)pm;
	}

	if (!hpdp)
		return NULL;

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
		return NULL;

	return hugepte_offset(hpdp, addr, hstate);
}

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}

static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
			       unsigned int psize)
{
	pte_t *hugepte = hugepd_page(*hpdp);

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
						 HUGEPTE_CACHE_NUM+psize-1,
						 PGF_CACHENUM_MASK));
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling,
				   unsigned int psize)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;
	unsigned int shift;
	unsigned int psize = get_slice_psize(tlb->mm, addr);
	shift = mmu_psize_to_shift(psize);

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
		if (pud_none_or_clear_bad(pud))
			continue;
		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling,
				       psize);
#else
		if (shift == PAGE_SHIFT_64K) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling, psize);
		} else {
			if (pud_none(*pud))
				continue;
			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
		}
#endif
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * Comments below taken from the normal free_pgd_range().  They
	 * apply here too.  The tests against HUGEPD_MASK below are
	 * essential, because we *don't* test for this at the bottom
	 * level.  Without them we'll attempt to free a hugepte table
	 * when we unmap just part of it, even if there are other
	 * active mappings using it.
	 *
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing HUGEPD* at this top level?  Because
	 * often there will be no work to do at all, and we'd prefer
	 * not to go all the way down to the bottom just to discover
	 * that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we
	 * must be careful to reject "the opposite 0" before it
	 * confuses the subsequent tests.  But what about where end is
	 * brought down by HUGEPD_SIZE below?  No, end can't go down to
	 * 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */
	unsigned int psize = get_slice_psize(tlb->mm, addr);

	addr &= HUGEPD_MASK(psize);
	if (addr < floor) {
		addr += HUGEPD_SIZE(psize);
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= HUGEPD_MASK(psize);
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= HUGEPD_SIZE(psize);
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset(tlb->mm, addr);
	do {
		psize = get_slice_psize(tlb->mm, addr);
		BUG_ON(!mmu_huge_psizes[psize]);
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

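/* Install a huge PTE.  Any existing translation is cleared first, with
 * the "huge" argument set so hpte_need_flush() sees the right page
 * size, and the new value is stripped of _PAGE_HPTEFLAGS so no stale
 * hash slot information is carried into the new mapping. */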
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_need_flush (huge / !huge). Might not be
		 * necessary anymore if we make hpte_need_flush() get the
		 * page size from the slices
		 */
		unsigned int psize = get_slice_psize(mm, addr);
		unsigned int shift = mmu_psize_to_shift(psize);
		unsigned long sz = ((1UL) << shift);
		struct hstate *hstate = size_to_hstate(sz);
		pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
	return __pte(old);
}

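/* Look up the page backing a huge page mapping.  The slice psize check
 * guarantees the address lies in a huge page slice, so a huge PTE is
 * assumed to exist; the returned pointer is advanced to the base page
 * within the huge page that contains @address. */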
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned int mmu_psize = get_slice_psize(mm, address);

	/* Verify it is a huge page else bail. */
	if (!mmu_huge_psizes[mmu_psize])
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page) {
		unsigned int shift = mmu_psize_to_shift(mmu_psize);
		unsigned long sz = ((1UL) << shift);
		page += (address % sz) / PAGE_SIZE;
	}

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}


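/* Address space placement for huge page mappings is delegated to the
 * slice code: a slice only ever holds one page size, so the search is
 * done with the MMU page size matching this file's hstate. */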
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
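/* If the page may hold stale instructions (PG_arch_1 clear) and the
 * fault was an instruction access (trap 0x400), flush the dcache/icache
 * for every base page making up the huge page; otherwise map the page
 * no-execute by setting HPTE_R_N. */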
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
					pte_t pte, int trap, unsigned long sz)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			for (i = 0; i < (sz / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}

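/* Software fault handler for huge pages: turn a Linux huge PTE into a
 * hardware HPTE, updating an existing hash slot when one is already
 * attached and otherwise inserting a new one (primary group first, then
 * secondary).  Returns 0 on success and 1 to send the fault up to
 * do_page_fault(). */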
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa, sz;
	long slot;
	int err = 1;
	int ssize = user_segment_size(ea);
	unsigned int mmu_psize;
	int shift;
	mmu_psize = get_slice_psize(mm, ea);

	if (!mmu_huge_psizes[mmu_psize])
		goto out;
	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = hpt_va(ea, vsid, ssize);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */


	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
					  old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	shift = mmu_psize_to_shift(mmu_psize);
	sz = ((1UL) << shift);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU that supports hugepages lacks no-execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap, sz);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, shift, ssize);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
					 ssize, local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, shift, ssize);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot information in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
		/* Add in WIMG bits */
		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
				      _PAGE_COHERENT | _PAGE_GUARDED));

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_psize, ssize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_psize, ssize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP)&~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

 out:
	return err;
}

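/* Record a huge page size as usable.  hugepte_shift[] ends up holding
 * the number of index bits in one hugepte table, which also encodes the
 * level the hugepd lives at: PMD for 64K pages (and for 16M pages on a
 * 64K base page), PUD for 16M pages on a 4K base page, and PGD for 16G
 * pages. */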
void set_huge_psize(int psize)
{
	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable limits. */
	if (mmu_psize_defs[psize].shift &&
		mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
		 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
		 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
		/* Return if the huge page size has already been set up or is
		 * the same as the base page size. */
		if (mmu_huge_psizes[psize] ||
		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
			return;
		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);

		switch (mmu_psize_defs[psize].shift) {
		case PAGE_SHIFT_64K:
		    /* We only allow 64k hpages with a 4k base page,
		     * which was checked above, and always put them
		     * at the PMD level */
		    hugepte_shift[psize] = PMD_SHIFT;
		    break;
		case PAGE_SHIFT_16M:
		    /* 16M pages can be at two different levels of the
		     * page tables depending on the base page size */
		    if (PAGE_SHIFT == PAGE_SHIFT_64K)
			    hugepte_shift[psize] = PMD_SHIFT;
		    else /* 4k base page */
			    hugepte_shift[psize] = PUD_SHIFT;
		    break;
		case PAGE_SHIFT_16G:
		    /* 16G pages are always at PGD level */
		    hugepte_shift[psize] = PGDIR_SHIFT;
		    break;
		}
		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
	} else
		hugepte_shift[psize] = 0;
}

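/* Parse the "hugepagesz=" boot parameter, e.g. "hugepagesz=16M" or
 * "hugepagesz=16G", and enable the matching MMU page size if the
 * hardware supports it. */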
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;
	int mmu_psize;
	int shift;

	size = memparse(str, &str);

	shift = __ffs(size);
	mmu_psize = shift_to_mmu_psize(shift);
	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
		set_huge_psize(mmu_psize);
	else
		printk(KERN_WARNING "Invalid huge page size specified (%llu)\n", size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

static void zero_ctor(struct kmem_cache *cache, void *addr)
{
	memset(addr, 0, kmem_cache_size(cache));
}

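/* Boot-time setup: register each huge page size the hardware supports
 * (16M support is required at a minimum) and create one kmem cache per
 * size to back the hugepte tables, since each size may need a
 * differently sized table. */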
static int __init hugetlbpage_init(void)
{
	unsigned int psize;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -ENODEV;
	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
	 * sizes changes.
	 */
	set_huge_psize(MMU_PAGE_16M);
	set_huge_psize(MMU_PAGE_64K);
	set_huge_psize(MMU_PAGE_16G);

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		if (mmu_huge_psizes[psize]) {
			huge_pgtable_cache(psize) = kmem_cache_create(
						HUGEPTE_CACHE_NAME(psize),
						HUGEPTE_TABLE_SIZE(psize),
						HUGEPTE_TABLE_SIZE(psize),
						0,
						zero_ctor);
			if (!huge_pgtable_cache(psize))
				panic("hugetlbpage_init(): could not create %s"\
				      "\n", HUGEPTE_CACHE_NAME(psize));
		}
	}

	return 0;
}

module_init(hugetlbpage_init);