xref: /openbmc/linux/arch/powerpc/mm/hugetlbpage.c (revision 87c2ce3b)
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
			pm = pmd_offset(pu, addr);
#ifdef CONFIG_PPC_64K_PAGES
			/* Currently, we use the normal PTE offset within full
			 * size PTE pages, thus our huge PTEs are scattered in
			 * the PTE page and we do waste some. We may change
			 * that in the future, but the current mechanism keeps
			 * things much simpler
			 */
			if (!pmd_none(*pm)) {
				/* Note: pte_offset_* are all equivalent on
				 * ppc64 as we don't have HIGHMEM
				 */
				pt = pte_offset_kernel(pm, addr);
				return pt;
			}
#else /* CONFIG_PPC_64K_PAGES */
			/* On 4k pages, we put huge PTEs in the PMD page */
			pt = (pte_t *)pm;
			return pt;
#endif /* CONFIG_PPC_64K_PAGES */
		}
	}

	return NULL;
}

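/*
 * Allocate any missing page table levels for a huge page mapping at
 * @addr and return a pointer to the (huge) PTE, or NULL if an
 * intermediate level could not be allocated.
 */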
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
		pm = pmd_alloc(mm, pu, addr);
		if (pm) {
#ifdef CONFIG_PPC_64K_PAGES
			/* See comment in huge_pte_offset. Note that if we ever
			 * want to put the page size in the PMD, we would have
			 * to open code our own pte_alloc* function in order
			 * to populate and set the size atomically
			 */
			pt = pte_alloc_map(mm, pm, addr);
#else /* CONFIG_PPC_64K_PAGES */
			pt = (pte_t *)pm;
#endif /* CONFIG_PPC_64K_PAGES */
			return pt;
		}
	}

	return NULL;
}

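/*
 * Install a huge page PTE.  Any existing translation is cleared first
 * (including its hash table entry, via hpte_update()) so that no stale
 * HPTE is left behind before the new PTE value is written.
 */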
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_update (huge / !huge)
		 */
		unsigned long old = pte_update(ptep, ~0UL);
		if (old & _PAGE_HASHPTE)
			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
		flush_tlb_pending();
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

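/*
 * Clear the huge PTE at @addr, invalidating any associated hash table
 * entry, and return the previous PTE value.
 */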
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(ptep, ~0UL);

	if (old & _PAGE_HASHPTE)
		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
	*ptep = __pte(0);

	return __pte(old);
}

/*
 * This function checks for proper alignment of input addr and len parameters.
 */
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
{
	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (addr & ~HPAGE_MASK)
		return -EINVAL;
	if (! (within_hugepage_low_range(addr, len)
	       || within_hugepage_high_range(addr, len)) )
		return -EINVAL;
	return 0;
}

struct slb_flush_info {
	struct mm_struct *mm;
	u16 newareas;
};

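/*
 * IPI handler (run via on_each_cpu()): if this CPU is currently running
 * @fi->mm, flush the SLB entries for the low (256M) segments that have
 * just been converted to hugepage areas.
 */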
static void flush_low_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_LOW_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		asm volatile("slbie %0"
			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

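/*
 * As flush_low_segments(), but for the high hugepage areas: each area
 * spans multiple 256M segments, all of which are invalidated.
 */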
static void flush_high_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i, j;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_HIGH_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
			asm volatile("slbie %0"
				     :: "r" (((i << HTLB_AREA_SHIFT)
					      + (j << SID_SHIFT)) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

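/*
 * Check that the given low (256M) area contains no existing mappings
 * and so may be converted to a hugepage area.  Returns 0 if it is
 * empty, -EBUSY otherwise.
 */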
static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << SID_SHIFT;
	unsigned long end = (area+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_LOW_AREAS);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << HTLB_AREA_SHIFT;
	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_HIGH_AREAS);

	/* Hack, so that each address is controlled by exactly one
	 * of the high or low area bitmaps, the first high area starts
	 * at 4GB, not 0 */
	if (start == 0)
		start = 0x100000000UL;

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

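/*
 * Mark the requested low areas as hugepage areas in the mm context.
 * Each area must currently be empty of mappings.  Once the context has
 * been updated, the stale SLB entries are flushed on every CPU that is
 * running this mm.
 */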
static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	unsigned long i;
	struct slb_flush_info fi;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);

	newareas &= ~(mm->context.low_htlb_areas);
	if (! newareas)
		return 0; /* The segments we want are already open */

	for (i = 0; i < NUM_LOW_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_low_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.low_htlb_areas |= newareas;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_low_segments, &fi, 0, 1);

	return 0;
}

static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	struct slb_flush_info fi;
	unsigned long i;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
		     != NUM_HIGH_AREAS);

	newareas &= ~(mm->context.high_htlb_areas);
	if (! newareas)
		return 0; /* The areas we want are already open */

	for (i = 0; i < NUM_HIGH_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_high_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.high_htlb_areas |= newareas;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_high_segments, &fi, 0, 1);

	return 0;
}

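/*
 * Open the low and/or high hugepage areas needed so that the range
 * [addr, addr+len) can be mapped with huge pages.
 */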
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	int err = 0;

	if ( (addr+len) < addr )
		return -EINVAL;

	if (addr < 0x100000000UL)
		err = open_low_hpage_areas(current->mm,
					  LOW_ESID_MASK(addr, len));
	if ((addr + len) > 0x100000000UL)
		err = open_high_hpage_areas(current->mm,
					    HTLB_AREA_MASK(addr, len));
	if (err) {
		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
		       addr, len,
		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
		return err;
	}

	return 0;
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
				     unsigned long len, unsigned long pgoff,
				     unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (((TASK_SIZE - len) >= addr)
		    && (!vma || (addr+len) <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}
	if (len > mm->cached_hole_size) {
		start_addr = addr = mm->free_area_cache;
	} else {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	vma = find_vma(mm, addr);
	while (TASK_SIZE - len >= addr) {
		BUG_ON(vma && (addr >= vma->vm_end));

		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (touches_hugepage_high_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = vma->vm_end;
		vma = vma->vm_next;
	}

	/* Make sure we didn't miss any holes */
	if (start_addr != TASK_UNMAPPED_BASE) {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
		goto full_search;
	}
	return -ENOMEM;
}

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			  const unsigned long len, const unsigned long pgoff,
			  const unsigned long flags)
{
	struct vm_area_struct *vma, *prev_vma;
	struct mm_struct *mm = current->mm;
	unsigned long base = mm->mmap_base, addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	int first_time = 1;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
				(!vma || addr + len <= vma->vm_start)
				&& !is_hugepage_only_range(mm, addr, len))
			return addr;
	}

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & PAGE_MASK;
	do {
hugepage_recheck:
		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = (addr & ((~0) << SID_SHIFT)) - len;
			goto hugepage_recheck;
		} else if (touches_hugepage_high_range(mm, addr, len)) {
			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
			goto hugepage_recheck;
		}

		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr+len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end))) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else {
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end) {
				mm->free_area_cache = vma->vm_start;
				mm->cached_hole_size = largest_hole;
			}
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		largest_hole = 0;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}

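/*
 * Check whether the hinted range [addr, addr+len) is free of existing
 * mappings.  Returns 0 if it is, -ENOMEM otherwise.
 */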
static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
{
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	if (!vma || ((addr + len) <= vma->vm_start))
		return 0;

	return -ENOMEM;
}

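/*
 * Search the low (below 4GB) segments selected by @segmask for a free
 * range of @len bytes.  Returns the start address on success, -ENOMEM
 * if no suitable gap exists.
 */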
static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
	unsigned long addr = 0;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= 0x100000000UL) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_low_range(addr, len, segmask)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on segmask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
{
	unsigned long addr = 0x100000000UL;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= TASK_SIZE_USER64) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_high_range(addr, len, areamask)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on areamask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

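/*
 * Find (and, if necessary, open) a hugepage area with room for a
 * mapping of @len bytes.  32-bit tasks use the low, segment-sized
 * areas below 4GB; 64-bit tasks use the high areas above 4GB.  The
 * hint address is tried first, then the already-open areas, and
 * finally new areas are opened.
 */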
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	int lastshift;
	u16 areamask, curareas;

	if (HPAGE_SHIFT == 0)
		return -EINVAL;
	if (len & ~HPAGE_MASK)
		return -EINVAL;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -EINVAL;

	/* Paranoia, caller should have dealt with this */
	BUG_ON((addr + len) < addr);

	if (test_thread_flag(TIF_32BIT)) {
		/* Paranoia, caller should have dealt with this */
		BUG_ON((addr + len) > 0x100000000UL);

		curareas = current->mm->context.low_htlb_areas;

		/* First see if we can use the hint address */
		if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = LOW_ESID_MASK(addr, len);
			if (open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing low areas */
		addr = htlb_get_low_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
		     ! lastshift; areamask >>=1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_low_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	} else {
		curareas = current->mm->context.high_htlb_areas;

		/* First see if we can use the hint address */
		/* We discourage 64-bit processes from doing hugepage
		 * mappings below 4GB (must use MAP_FIXED) */
		if ((addr >= 0x100000000UL)
		    && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = HTLB_AREA_MASK(addr, len);
			if (open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing high areas */
		addr = htlb_get_high_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
		     ! lastshift; areamask >>=1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_high_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	}
	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
	       " enough areas\n");
	return -ENOMEM;
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}

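/*
 * Handle a hash fault on a hugepage address: build or update the hash
 * table entry for the huge PTE mapping @ea.  Returns 0 on success, or
 * 1 if the fault must be handed up to do_page_fault() (no PTE, or
 * access not permitted).
 */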
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa;
	long slot;
	int err = 1;

	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = (vsid << 28) | (ea & 0x0fffffff);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY |
			_PAGE_ACCESSED | _PAGE_HASHPTE;
	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
					 old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has hugepages but lacks no-execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, HPAGE_SHIFT);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
					 local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot information in new PTE */
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;

		/* Add in WIMG bits */
		/* XXX We should store these in the pte */
		/* --BenH: I think they are ... */
		rflags |= _PAGE_COHERENT;

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_huge_psize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			new_pte |= _PAGE_F_SECOND;
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_huge_psize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP)&~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & _PAGE_F_GIX;
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

 out:
	return err;
}