xref: /openbmc/linux/arch/sparc/mm/init_64.c (revision 275876e2)
1 /*
2  *  arch/sparc64/mm/init.c
3  *
4  *  Copyright (C) 1996-1999 David S. Miller (davem@caip.rutgers.edu)
5  *  Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
6  */
7 
8 #include <linux/module.h>
9 #include <linux/kernel.h>
10 #include <linux/sched.h>
11 #include <linux/string.h>
12 #include <linux/init.h>
13 #include <linux/bootmem.h>
14 #include <linux/mm.h>
15 #include <linux/hugetlb.h>
16 #include <linux/initrd.h>
17 #include <linux/swap.h>
18 #include <linux/pagemap.h>
19 #include <linux/poison.h>
20 #include <linux/fs.h>
21 #include <linux/seq_file.h>
22 #include <linux/kprobes.h>
23 #include <linux/cache.h>
24 #include <linux/sort.h>
25 #include <linux/ioport.h>
26 #include <linux/percpu.h>
27 #include <linux/memblock.h>
28 #include <linux/mmzone.h>
29 #include <linux/gfp.h>
30 
31 #include <asm/head.h>
32 #include <asm/page.h>
33 #include <asm/pgalloc.h>
34 #include <asm/pgtable.h>
35 #include <asm/oplib.h>
36 #include <asm/iommu.h>
37 #include <asm/io.h>
38 #include <asm/uaccess.h>
39 #include <asm/mmu_context.h>
40 #include <asm/tlbflush.h>
41 #include <asm/dma.h>
42 #include <asm/starfire.h>
43 #include <asm/tlb.h>
44 #include <asm/spitfire.h>
45 #include <asm/sections.h>
46 #include <asm/tsb.h>
47 #include <asm/hypervisor.h>
48 #include <asm/prom.h>
49 #include <asm/mdesc.h>
50 #include <asm/cpudata.h>
51 #include <asm/setup.h>
52 #include <asm/irq.h>
53 
54 #include "init_64.h"
55 
56 unsigned long kern_linear_pte_xor[4] __read_mostly;
57 
58 /* A bitmap, two bits for every 256MB of physical memory.  These two
59  * bits determine what page size we use for kernel linear
60  * translations.  They form an index into kern_linear_pte_xor[].  The
61  * value in the indexed slot is XOR'd with the TLB miss virtual
62  * address to form the resulting TTE.  The mapping is:
63  *
64  *	0	==>	4MB
65  *	1	==>	256MB
66  *	2	==>	2GB
67  *	3	==>	16GB
68  *
69  * All sun4v chips support 256MB pages.  Only SPARC-T4 and later
70  * support 2GB pages, and hopefully future cpus will support the 16GB
71  * pages as well.  For slots 2 and 3, we encode a 256MB TTE xor there
72  * if these larger page sizes are not supported by the cpu.
73  *
74  * It would be nice to determine this from the machine description
75  * 'cpu' properties, but we need to have this table set up before the
76  * MDESC is initialized.
77  */
78 unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
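
/* Minimal sketch (not part of the original file, kept out of the build
 * with #if 0) of how the two-bit slot for a physical address would be
 * read back and combined with kern_linear_pte_xor[].  example_kpte_tte()
 * is a hypothetical helper; the >> 28 shift is the 256MB granularity
 * described above, and 'vaddr' is assumed to be a linear-mapping virtual
 * address.
 */
#if 0
static unsigned long example_kpte_tte(unsigned long vaddr, unsigned long paddr)
{
	unsigned long slot = paddr >> 28;	/* one 2-bit entry per 256MB */
	unsigned long word = kpte_linear_bitmap[slot / (BITS_PER_LONG / 2)];
	unsigned long idx  = (word >> ((slot % (BITS_PER_LONG / 2)) * 2)) & 0x3;

	return vaddr ^ kern_linear_pte_xor[idx];	/* resulting TTE data */
}
#endif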
79 
80 #ifndef CONFIG_DEBUG_PAGEALLOC
81 /* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings.
82  * Space is allocated for this right after the trap table in
83  * arch/sparc64/kernel/head.S
84  */
85 extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
86 #endif
87 
88 static unsigned long cpu_pgsz_mask;
89 
90 #define MAX_BANKS	32
91 
92 static struct linux_prom64_registers pavail[MAX_BANKS];
93 static int pavail_ents;
94 
95 static int cmp_p64(const void *a, const void *b)
96 {
97 	const struct linux_prom64_registers *x = a, *y = b;
98 
99 	if (x->phys_addr > y->phys_addr)
100 		return 1;
101 	if (x->phys_addr < y->phys_addr)
102 		return -1;
103 	return 0;
104 }
105 
106 static void __init read_obp_memory(const char *property,
107 				   struct linux_prom64_registers *regs,
108 				   int *num_ents)
109 {
110 	phandle node = prom_finddevice("/memory");
111 	int prop_size = prom_getproplen(node, property);
112 	int ents, ret, i;
113 
114 	ents = prop_size / sizeof(struct linux_prom64_registers);
115 	if (ents > MAX_BANKS) {
116 		prom_printf("The machine has more %s property entries than "
117 			    "this kernel can support (%d).\n",
118 			    property, MAX_BANKS);
119 		prom_halt();
120 	}
121 
122 	ret = prom_getproperty(node, property, (char *) regs, prop_size);
123 	if (ret == -1) {
124 		prom_printf("Couldn't get %s property from /memory.\n",
125 				property);
126 		prom_halt();
127 	}
128 
129 	/* Sanitize what we got from the firmware by page-aligning
130 	 * everything.
131 	 */
132 	for (i = 0; i < ents; i++) {
133 		unsigned long base, size;
134 
135 		base = regs[i].phys_addr;
136 		size = regs[i].reg_size;
137 
138 		size &= PAGE_MASK;
139 		if (base & ~PAGE_MASK) {
140 			unsigned long new_base = PAGE_ALIGN(base);
141 
142 			size -= new_base - base;
143 			if ((long) size < 0L)
144 				size = 0UL;
145 			base = new_base;
146 		}
147 		if (size == 0UL) {
148 			/* If it is empty, simply get rid of it.
149 			 * This simplifies the logic of the other
150 			 * functions that process these arrays.
151 			 */
152 			memmove(&regs[i], &regs[i + 1],
153 				(ents - i - 1) * sizeof(regs[0]));
154 			i--;
155 			ents--;
156 			continue;
157 		}
158 		regs[i].phys_addr = base;
159 		regs[i].reg_size = size;
160 	}
161 
162 	*num_ents = ents;
163 
164 	sort(regs, ents, sizeof(struct linux_prom64_registers),
165 	     cmp_p64, NULL);
166 }
167 
168 unsigned long sparc64_valid_addr_bitmap[VALID_ADDR_BITMAP_BYTES /
169 					sizeof(unsigned long)];
170 EXPORT_SYMBOL(sparc64_valid_addr_bitmap);
171 
172 /* Kernel physical address base and size in bytes.  */
173 unsigned long kern_base __read_mostly;
174 unsigned long kern_size __read_mostly;
175 
176 /* Initial ramdisk setup */
177 extern unsigned long sparc_ramdisk_image64;
178 extern unsigned int sparc_ramdisk_image;
179 extern unsigned int sparc_ramdisk_size;
180 
181 struct page *mem_map_zero __read_mostly;
182 EXPORT_SYMBOL(mem_map_zero);
183 
184 unsigned int sparc64_highest_unlocked_tlb_ent __read_mostly;
185 
186 unsigned long sparc64_kern_pri_context __read_mostly;
187 unsigned long sparc64_kern_pri_nuc_bits __read_mostly;
188 unsigned long sparc64_kern_sec_context __read_mostly;
189 
190 int num_kernel_image_mappings;
191 
192 #ifdef CONFIG_DEBUG_DCFLUSH
193 atomic_t dcpage_flushes = ATOMIC_INIT(0);
194 #ifdef CONFIG_SMP
195 atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0);
196 #endif
197 #endif
198 
199 inline void flush_dcache_page_impl(struct page *page)
200 {
201 	BUG_ON(tlb_type == hypervisor);
202 #ifdef CONFIG_DEBUG_DCFLUSH
203 	atomic_inc(&dcpage_flushes);
204 #endif
205 
206 #ifdef DCACHE_ALIASING_POSSIBLE
207 	__flush_dcache_page(page_address(page),
208 			    ((tlb_type == spitfire) &&
209 			     page_mapping(page) != NULL));
210 #else
211 	if (page_mapping(page) != NULL &&
212 	    tlb_type == spitfire)
213 		__flush_icache_page(__pa(page_address(page)));
214 #endif
215 }
216 
217 #define PG_dcache_dirty		PG_arch_1
218 #define PG_dcache_cpu_shift	32UL
219 #define PG_dcache_cpu_mask	\
220 	((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL)
221 
222 #define dcache_dirty_cpu(page) \
223 	(((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
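
/* Packing sketch (illustrative only, excluded from the build with #if 0):
 * the owning cpu number lives at bit 32 of page->flags and PG_dcache_dirty
 * marks the page as needing a D-cache flush.  example_pack_dirty() is a
 * hypothetical helper mirroring set_dcache_dirty() below, minus the atomic
 * cas loop.
 */
#if 0
static unsigned long example_pack_dirty(unsigned long flags, int this_cpu)
{
	flags &= ~(PG_dcache_cpu_mask << PG_dcache_cpu_shift);
	flags |= ((unsigned long) this_cpu << PG_dcache_cpu_shift) |
		 (1UL << PG_dcache_dirty);
	return flags;
}
#endif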
224 
225 static inline void set_dcache_dirty(struct page *page, int this_cpu)
226 {
227 	unsigned long mask = this_cpu;
228 	unsigned long non_cpu_bits;
229 
230 	non_cpu_bits = ~(PG_dcache_cpu_mask << PG_dcache_cpu_shift);
231 	mask = (mask << PG_dcache_cpu_shift) | (1UL << PG_dcache_dirty);
232 
233 	__asm__ __volatile__("1:\n\t"
234 			     "ldx	[%2], %%g7\n\t"
235 			     "and	%%g7, %1, %%g1\n\t"
236 			     "or	%%g1, %0, %%g1\n\t"
237 			     "casx	[%2], %%g7, %%g1\n\t"
238 			     "cmp	%%g7, %%g1\n\t"
239 			     "bne,pn	%%xcc, 1b\n\t"
240 			     " nop"
241 			     : /* no outputs */
242 			     : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags)
243 			     : "g1", "g7");
244 }
245 
246 static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu)
247 {
248 	unsigned long mask = (1UL << PG_dcache_dirty);
249 
250 	__asm__ __volatile__("! test_and_clear_dcache_dirty\n"
251 			     "1:\n\t"
252 			     "ldx	[%2], %%g7\n\t"
253 			     "srlx	%%g7, %4, %%g1\n\t"
254 			     "and	%%g1, %3, %%g1\n\t"
255 			     "cmp	%%g1, %0\n\t"
256 			     "bne,pn	%%icc, 2f\n\t"
257 			     " andn	%%g7, %1, %%g1\n\t"
258 			     "casx	[%2], %%g7, %%g1\n\t"
259 			     "cmp	%%g7, %%g1\n\t"
260 			     "bne,pn	%%xcc, 1b\n\t"
261 			     " nop\n"
262 			     "2:"
263 			     : /* no outputs */
264 			     : "r" (cpu), "r" (mask), "r" (&page->flags),
265 			       "i" (PG_dcache_cpu_mask),
266 			       "i" (PG_dcache_cpu_shift)
267 			     : "g1", "g7");
268 }
269 
270 static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long pte)
271 {
272 	unsigned long tsb_addr = (unsigned long) ent;
273 
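	/* cheetah_plus and sun4v TSBs are accessed by physical address
	 * (see tsb_phys_patch() later in this file), so hand
	 * __tsb_insert() a PA in those cases.
	 */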
274 	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
275 		tsb_addr = __pa(tsb_addr);
276 
277 	__tsb_insert(tsb_addr, tag, pte);
278 }
279 
280 unsigned long _PAGE_ALL_SZ_BITS __read_mostly;
281 
282 static void flush_dcache(unsigned long pfn)
283 {
284 	struct page *page;
285 
286 	page = pfn_to_page(pfn);
287 	if (page) {
288 		unsigned long pg_flags;
289 
290 		pg_flags = page->flags;
291 		if (pg_flags & (1UL << PG_dcache_dirty)) {
292 			int cpu = ((pg_flags >> PG_dcache_cpu_shift) &
293 				   PG_dcache_cpu_mask);
294 			int this_cpu = get_cpu();
295 
296 			/* This is just to optimize away some function calls
297 			 * in the SMP case.
298 			 */
299 			if (cpu == this_cpu)
300 				flush_dcache_page_impl(page);
301 			else
302 				smp_flush_dcache_page_impl(page, cpu);
303 
304 			clear_dcache_dirty_cpu(page, cpu);
305 
306 			put_cpu();
307 		}
308 	}
309 }
310 
311 /* mm->context.lock must be held */
312 static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index,
313 				    unsigned long tsb_hash_shift, unsigned long address,
314 				    unsigned long tte)
315 {
316 	struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb;
317 	unsigned long tag;
318 
319 	if (unlikely(!tsb))
320 		return;
321 
322 	tsb += ((address >> tsb_hash_shift) &
323 		(mm->context.tsb_block[tsb_index].tsb_nentries - 1UL));
324 	tag = (address >> 22UL);
325 	tsb_insert(tsb, tag, tte);
326 }
327 
328 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
329 static inline bool is_hugetlb_pte(pte_t pte)
330 {
331 	if ((tlb_type == hypervisor &&
332 	     (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) ||
333 	    (tlb_type != hypervisor &&
334 	     (pte_val(pte) & _PAGE_SZALL_4U) == _PAGE_SZHUGE_4U))
335 		return true;
336 	return false;
337 }
338 #endif
339 
340 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
341 {
342 	struct mm_struct *mm;
343 	unsigned long flags;
344 	pte_t pte = *ptep;
345 
346 	if (tlb_type != hypervisor) {
347 		unsigned long pfn = pte_pfn(pte);
348 
349 		if (pfn_valid(pfn))
350 			flush_dcache(pfn);
351 	}
352 
353 	mm = vma->vm_mm;
354 
355 	/* Don't insert a non-valid PTE into the TSB, we'll deadlock.  */
356 	if (!pte_accessible(mm, pte))
357 		return;
358 
359 	spin_lock_irqsave(&mm->context.lock, flags);
360 
361 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
362 	if (mm->context.huge_pte_count && is_hugetlb_pte(pte))
363 		__update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
364 					address, pte_val(pte));
365 	else
366 #endif
367 		__update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
368 					address, pte_val(pte));
369 
370 	spin_unlock_irqrestore(&mm->context.lock, flags);
371 }
372 
373 void flush_dcache_page(struct page *page)
374 {
375 	struct address_space *mapping;
376 	int this_cpu;
377 
378 	if (tlb_type == hypervisor)
379 		return;
380 
381 	/* Do not bother with the expensive D-cache flush if it
382 	 * is merely the zero page.  The 'bigcore' testcase in GDB
383 	 * causes this case to run millions of times.
384 	 */
385 	if (page == ZERO_PAGE(0))
386 		return;
387 
388 	this_cpu = get_cpu();
389 
390 	mapping = page_mapping(page);
391 	if (mapping && !mapping_mapped(mapping)) {
392 		int dirty = test_bit(PG_dcache_dirty, &page->flags);
393 		if (dirty) {
394 			int dirty_cpu = dcache_dirty_cpu(page);
395 
396 			if (dirty_cpu == this_cpu)
397 				goto out;
398 			smp_flush_dcache_page_impl(page, dirty_cpu);
399 		}
400 		set_dcache_dirty(page, this_cpu);
401 	} else {
402 		/* We could delay the flush for the !page_mapping
403 		 * case too.  But that case is for exec env/arg
404 		 * pages and those are 99% certain to get
405 		 * faulted into the tlb (and thus flushed) anyway.
406 		 */
407 		flush_dcache_page_impl(page);
408 	}
409 
410 out:
411 	put_cpu();
412 }
413 EXPORT_SYMBOL(flush_dcache_page);
414 
415 void __kprobes flush_icache_range(unsigned long start, unsigned long end)
416 {
417 	/* Cheetah and Hypervisor platform cpus have coherent I-cache. */
418 	if (tlb_type == spitfire) {
419 		unsigned long kaddr;
420 
421 		/* This code only runs on Spitfire cpus, which is
422 		 * why we can assume _PAGE_PADDR_4U.
423 		 */
424 		for (kaddr = start; kaddr < end; kaddr += PAGE_SIZE) {
425 			unsigned long paddr, mask = _PAGE_PADDR_4U;
426 
427 			if (kaddr >= PAGE_OFFSET)
428 				paddr = kaddr & mask;
429 			else {
430 				pgd_t *pgdp = pgd_offset_k(kaddr);
431 				pud_t *pudp = pud_offset(pgdp, kaddr);
432 				pmd_t *pmdp = pmd_offset(pudp, kaddr);
433 				pte_t *ptep = pte_offset_kernel(pmdp, kaddr);
434 
435 				paddr = pte_val(*ptep) & mask;
436 			}
437 			__flush_icache_page(paddr);
438 		}
439 	}
440 }
441 EXPORT_SYMBOL(flush_icache_range);
442 
443 void mmu_info(struct seq_file *m)
444 {
445 	static const char *pgsz_strings[] = {
446 		"8K", "64K", "512K", "4MB", "32MB",
447 		"256MB", "2GB", "16GB",
448 	};
449 	int i, printed;
450 
451 	if (tlb_type == cheetah)
452 		seq_printf(m, "MMU Type\t: Cheetah\n");
453 	else if (tlb_type == cheetah_plus)
454 		seq_printf(m, "MMU Type\t: Cheetah+\n");
455 	else if (tlb_type == spitfire)
456 		seq_printf(m, "MMU Type\t: Spitfire\n");
457 	else if (tlb_type == hypervisor)
458 		seq_printf(m, "MMU Type\t: Hypervisor (sun4v)\n");
459 	else
460 		seq_printf(m, "MMU Type\t: ???\n");
461 
462 	seq_printf(m, "MMU PGSZs\t: ");
463 	printed = 0;
464 	for (i = 0; i < ARRAY_SIZE(pgsz_strings); i++) {
465 		if (cpu_pgsz_mask & (1UL << i)) {
466 			seq_printf(m, "%s%s",
467 				   printed ? "," : "", pgsz_strings[i]);
468 			printed++;
469 		}
470 	}
471 	seq_putc(m, '\n');
472 
473 #ifdef CONFIG_DEBUG_DCFLUSH
474 	seq_printf(m, "DCPageFlushes\t: %d\n",
475 		   atomic_read(&dcpage_flushes));
476 #ifdef CONFIG_SMP
477 	seq_printf(m, "DCPageFlushesXC\t: %d\n",
478 		   atomic_read(&dcpage_flushes_xcall));
479 #endif /* CONFIG_SMP */
480 #endif /* CONFIG_DEBUG_DCFLUSH */
481 }
482 
483 struct linux_prom_translation prom_trans[512] __read_mostly;
484 unsigned int prom_trans_ents __read_mostly;
485 
486 unsigned long kern_locked_tte_data;
487 
488 /* The OBP translations are saved in terms of the 8K page size, since OBP
489  * can use a mixture of page sizes.  Misses to the LOW_OBP_ADDRESS ->
490  * HI_OBP_ADDRESS range are handled in ktlb.S.
491  */
492 static inline int in_obp_range(unsigned long vaddr)
493 {
494 	return (vaddr >= LOW_OBP_ADDRESS &&
495 		vaddr < HI_OBP_ADDRESS);
496 }
497 
498 static int cmp_ptrans(const void *a, const void *b)
499 {
500 	const struct linux_prom_translation *x = a, *y = b;
501 
502 	if (x->virt > y->virt)
503 		return 1;
504 	if (x->virt < y->virt)
505 		return -1;
506 	return 0;
507 }
508 
509 /* Read OBP translations property into 'prom_trans[]'.  */
510 static void __init read_obp_translations(void)
511 {
512 	int n, node, ents, first, last, i;
513 
514 	node = prom_finddevice("/virtual-memory");
515 	n = prom_getproplen(node, "translations");
516 	if (unlikely(n == 0 || n == -1)) {
517 		prom_printf("prom_mappings: Couldn't get size.\n");
518 		prom_halt();
519 	}
520 	if (unlikely(n > sizeof(prom_trans))) {
521 		prom_printf("prom_mappings: Size %d is too big.\n", n);
522 		prom_halt();
523 	}
524 
525 	if ((n = prom_getproperty(node, "translations",
526 				  (char *)&prom_trans[0],
527 				  sizeof(prom_trans))) == -1) {
528 		prom_printf("prom_mappings: Couldn't get property.\n");
529 		prom_halt();
530 	}
531 
532 	n = n / sizeof(struct linux_prom_translation);
533 
534 	ents = n;
535 
536 	sort(prom_trans, ents, sizeof(struct linux_prom_translation),
537 	     cmp_ptrans, NULL);
538 
539 	/* Now kick out all the non-OBP entries.  */
540 	for (i = 0; i < ents; i++) {
541 		if (in_obp_range(prom_trans[i].virt))
542 			break;
543 	}
544 	first = i;
545 	for (; i < ents; i++) {
546 		if (!in_obp_range(prom_trans[i].virt))
547 			break;
548 	}
549 	last = i;
550 
551 	for (i = 0; i < (last - first); i++) {
552 		struct linux_prom_translation *src = &prom_trans[i + first];
553 		struct linux_prom_translation *dest = &prom_trans[i];
554 
555 		*dest = *src;
556 	}
557 	for (; i < ents; i++) {
558 		struct linux_prom_translation *dest = &prom_trans[i];
559 		dest->virt = dest->size = dest->data = 0x0UL;
560 	}
561 
562 	prom_trans_ents = last - first;
563 
564 	if (tlb_type == spitfire) {
565 		/* Clear diag TTE bits. */
566 		for (i = 0; i < prom_trans_ents; i++)
567 			prom_trans[i].data &= ~0x0003fe0000000000UL;
568 	}
569 
570 	/* Force execute bit on.  */
571 	for (i = 0; i < prom_trans_ents; i++)
572 		prom_trans[i].data |= (tlb_type == hypervisor ?
573 				       _PAGE_EXEC_4V : _PAGE_EXEC_4U);
574 }
575 
576 static void __init hypervisor_tlb_lock(unsigned long vaddr,
577 				       unsigned long pte,
578 				       unsigned long mmu)
579 {
580 	unsigned long ret = sun4v_mmu_map_perm_addr(vaddr, 0, pte, mmu);
581 
582 	if (ret != 0) {
583 		prom_printf("hypervisor_tlb_lock[%lx:%x:%lx:%lx]: "
584 			    "errors with %lx\n", vaddr, 0, pte, mmu, ret);
585 		prom_halt();
586 	}
587 }
588 
589 static unsigned long kern_large_tte(unsigned long paddr);
590 
591 static void __init remap_kernel(void)
592 {
593 	unsigned long phys_page, tte_vaddr, tte_data;
594 	int i, tlb_ent = sparc64_highest_locked_tlbent();
595 
596 	tte_vaddr = (unsigned long) KERNBASE;
597 	phys_page = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
598 	tte_data = kern_large_tte(phys_page);
599 
600 	kern_locked_tte_data = tte_data;
601 
602 	/* Now lock us into the TLBs via Hypervisor or OBP. */
603 	if (tlb_type == hypervisor) {
604 		for (i = 0; i < num_kernel_image_mappings; i++) {
605 			hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU);
606 			hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU);
607 			tte_vaddr += 0x400000;
608 			tte_data += 0x400000;
609 		}
610 	} else {
611 		for (i = 0; i < num_kernel_image_mappings; i++) {
612 			prom_dtlb_load(tlb_ent - i, tte_data, tte_vaddr);
613 			prom_itlb_load(tlb_ent - i, tte_data, tte_vaddr);
614 			tte_vaddr += 0x400000;
615 			tte_data += 0x400000;
616 		}
617 		sparc64_highest_unlocked_tlb_ent = tlb_ent - i;
618 	}
619 	if (tlb_type == cheetah_plus) {
620 		sparc64_kern_pri_context = (CTX_CHEETAH_PLUS_CTX0 |
621 					    CTX_CHEETAH_PLUS_NUC);
622 		sparc64_kern_pri_nuc_bits = CTX_CHEETAH_PLUS_NUC;
623 		sparc64_kern_sec_context = CTX_CHEETAH_PLUS_CTX0;
624 	}
625 }
626 
627 
628 static void __init inherit_prom_mappings(void)
629 {
630 	/* Now fixup OBP's idea about where we really are mapped. */
631 	printk("Remapping the kernel... ");
632 	remap_kernel();
633 	printk("done.\n");
634 }
635 
636 void prom_world(int enter)
637 {
638 	if (!enter)
639 		set_fs(get_fs());
640 
641 	__asm__ __volatile__("flushw");
642 }
643 
644 void __flush_dcache_range(unsigned long start, unsigned long end)
645 {
646 	unsigned long va;
647 
648 	if (tlb_type == spitfire) {
649 		int n = 0;
650 
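		/* Spitfire's D-cache is 16K, direct mapped, with 32-byte
		 * lines, so (va & 0x3fe0) selects one of 512 line tags;
		 * writing a zero tag invalidates that line.
		 */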
651 		for (va = start; va < end; va += 32) {
652 			spitfire_put_dcache_tag(va & 0x3fe0, 0x0);
653 			if (++n >= 512)
654 				break;
655 		}
656 	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
657 		start = __pa(start);
658 		end = __pa(end);
659 		for (va = start; va < end; va += 32)
660 			__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
661 					     "membar #Sync"
662 					     : /* no outputs */
663 					     : "r" (va),
664 					       "i" (ASI_DCACHE_INVALIDATE));
665 	}
666 }
667 EXPORT_SYMBOL(__flush_dcache_range);
668 
669 /* get_new_mmu_context() uses "cache + 1".  */
670 DEFINE_SPINLOCK(ctx_alloc_lock);
671 unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1;
672 #define MAX_CTX_NR	(1UL << CTX_NR_BITS)
673 #define CTX_BMAP_SLOTS	BITS_TO_LONGS(MAX_CTX_NR)
674 DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR);
675 
676 /* Caller does TLB context flushing on local CPU if necessary.
677  * The caller also ensures that CTX_VALID(mm->context) is false.
678  *
679  * We must be careful about boundary cases so that we never
680  * let the user have CTX 0 (nucleus), nor ever use a CTX
681  * version of zero (and thus NO_CONTEXT would not be caught
682  * by version mis-match tests in mmu_context.h).
683  *
684  * Always invoked with interrupts disabled.
685  */
686 void get_new_mmu_context(struct mm_struct *mm)
687 {
688 	unsigned long ctx, new_ctx;
689 	unsigned long orig_pgsz_bits;
690 	int new_version;
691 
692 	spin_lock(&ctx_alloc_lock);
693 	orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK);
694 	ctx = (tlb_context_cache + 1) & CTX_NR_MASK;
695 	new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx);
696 	new_version = 0;
697 	if (new_ctx >= (1 << CTX_NR_BITS)) {
698 		new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1);
699 		if (new_ctx >= ctx) {
700 			int i;
701 			new_ctx = (tlb_context_cache & CTX_VERSION_MASK) +
702 				CTX_FIRST_VERSION;
703 			if (new_ctx == 1)
704 				new_ctx = CTX_FIRST_VERSION;
705 
706 			/* Don't call memset, for 16 entries that's just
707 			 * plain silly...
708 			 */
709 			mmu_context_bmap[0] = 3;
710 			mmu_context_bmap[1] = 0;
711 			mmu_context_bmap[2] = 0;
712 			mmu_context_bmap[3] = 0;
713 			for (i = 4; i < CTX_BMAP_SLOTS; i += 4) {
714 				mmu_context_bmap[i + 0] = 0;
715 				mmu_context_bmap[i + 1] = 0;
716 				mmu_context_bmap[i + 2] = 0;
717 				mmu_context_bmap[i + 3] = 0;
718 			}
719 			new_version = 1;
720 			goto out;
721 		}
722 	}
723 	mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63));
724 	new_ctx |= (tlb_context_cache & CTX_VERSION_MASK);
725 out:
726 	tlb_context_cache = new_ctx;
727 	mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits;
728 	spin_unlock(&ctx_alloc_lock);
729 
730 	if (unlikely(new_version))
731 		smp_new_mmu_context_version();
732 }
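
/* Decomposition sketch (illustrative only, excluded from the build with
 * #if 0): a context value handed out above is the current version in the
 * high bits plus a context number in the low CTX_NR_BITS.
 * example_ctx_decompose() is a hypothetical helper, not kernel API.
 */
#if 0
static void example_ctx_decompose(unsigned long ctx_val)
{
	unsigned long version = ctx_val & CTX_VERSION_MASK;
	unsigned long nr      = ctx_val & CTX_NR_MASK;

	pr_debug("mmu context: version=%lx nr=%lx\n", version, nr);
}
#endif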
733 
734 static int numa_enabled = 1;
735 static int numa_debug;
736 
737 static int __init early_numa(char *p)
738 {
739 	if (!p)
740 		return 0;
741 
742 	if (strstr(p, "off"))
743 		numa_enabled = 0;
744 
745 	if (strstr(p, "debug"))
746 		numa_debug = 1;
747 
748 	return 0;
749 }
750 early_param("numa", early_numa);
751 
752 #define numadbg(f, a...) \
753 do {	if (numa_debug) \
754 		printk(KERN_INFO f, ## a); \
755 } while (0)
756 
757 static void __init find_ramdisk(unsigned long phys_base)
758 {
759 #ifdef CONFIG_BLK_DEV_INITRD
760 	if (sparc_ramdisk_image || sparc_ramdisk_image64) {
761 		unsigned long ramdisk_image;
762 
763 		/* Older versions of the bootloader only supported a
764 		 * 32-bit physical address for the ramdisk image
765 		 * location, stored at sparc_ramdisk_image.  Newer
766 		 * SILO versions set sparc_ramdisk_image to zero and
767 		 * provide a full 64-bit physical address at
768 		 * sparc_ramdisk_image64.
769 		 */
770 		ramdisk_image = sparc_ramdisk_image;
771 		if (!ramdisk_image)
772 			ramdisk_image = sparc_ramdisk_image64;
773 
774 		/* Another bootloader quirk.  The bootloader normalizes
775 		 * the physical address to KERNBASE, so we have to
776 		 * factor that back out and add in the lowest valid
777 		 * physical page address to get the true physical address.
778 		 */
779 		ramdisk_image -= KERNBASE;
780 		ramdisk_image += phys_base;
781 
782 		numadbg("Found ramdisk at physical address 0x%lx, size %u\n",
783 			ramdisk_image, sparc_ramdisk_size);
784 
785 		initrd_start = ramdisk_image;
786 		initrd_end = ramdisk_image + sparc_ramdisk_size;
787 
788 		memblock_reserve(initrd_start, sparc_ramdisk_size);
789 
790 		initrd_start += PAGE_OFFSET;
791 		initrd_end += PAGE_OFFSET;
792 	}
793 #endif
794 }
795 
796 struct node_mem_mask {
797 	unsigned long mask;
798 	unsigned long val;
799 };
800 static struct node_mem_mask node_masks[MAX_NUMNODES];
801 static int num_node_masks;
802 
803 #ifdef CONFIG_NEED_MULTIPLE_NODES
804 
805 int numa_cpu_lookup_table[NR_CPUS];
806 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
807 
808 struct mdesc_mblock {
809 	u64	base;
810 	u64	size;
811 	u64	offset; /* RA-to-PA */
812 };
813 static struct mdesc_mblock *mblocks;
814 static int num_mblocks;
815 
816 static unsigned long ra_to_pa(unsigned long addr)
817 {
818 	int i;
819 
820 	for (i = 0; i < num_mblocks; i++) {
821 		struct mdesc_mblock *m = &mblocks[i];
822 
823 		if (addr >= m->base &&
824 		    addr < (m->base + m->size)) {
825 			addr += m->offset;
826 			break;
827 		}
828 	}
829 	return addr;
830 }
831 
832 static int find_node(unsigned long addr)
833 {
834 	int i;
835 
836 	addr = ra_to_pa(addr);
837 	for (i = 0; i < num_node_masks; i++) {
838 		struct node_mem_mask *p = &node_masks[i];
839 
840 		if ((addr & p->mask) == p->val)
841 			return i;
842 	}
843 	return -1;
844 }
845 
846 static u64 memblock_nid_range(u64 start, u64 end, int *nid)
847 {
848 	*nid = find_node(start);
849 	start += PAGE_SIZE;
850 	while (start < end) {
851 		int n = find_node(start);
852 
853 		if (n != *nid)
854 			break;
855 		start += PAGE_SIZE;
856 	}
857 
858 	if (start > end)
859 		start = end;
860 
861 	return start;
862 }
863 #endif
864 
865 /* This must be invoked after performing all of the necessary
866  * memblock_set_node() calls for 'nid'.  We need to be able to get
867  * correct data from get_pfn_range_for_nid().
868  */
869 static void __init allocate_node_data(int nid)
870 {
871 	struct pglist_data *p;
872 	unsigned long start_pfn, end_pfn;
873 #ifdef CONFIG_NEED_MULTIPLE_NODES
874 	unsigned long paddr;
875 
876 	paddr = memblock_alloc_try_nid(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid);
877 	if (!paddr) {
878 		prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
879 		prom_halt();
880 	}
881 	NODE_DATA(nid) = __va(paddr);
882 	memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
883 
884 	NODE_DATA(nid)->node_id = nid;
885 #endif
886 
887 	p = NODE_DATA(nid);
888 
889 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
890 	p->node_start_pfn = start_pfn;
891 	p->node_spanned_pages = end_pfn - start_pfn;
892 }
893 
894 static void init_node_masks_nonnuma(void)
895 {
896 #ifdef CONFIG_NEED_MULTIPLE_NODES
897 	int i;
898 #endif
899 
900 	numadbg("Initializing tables for non-numa.\n");
901 
902 	node_masks[0].mask = node_masks[0].val = 0;
903 	num_node_masks = 1;
904 
905 #ifdef CONFIG_NEED_MULTIPLE_NODES
906 	for (i = 0; i < NR_CPUS; i++)
907 		numa_cpu_lookup_table[i] = 0;
908 
909 	cpumask_setall(&numa_cpumask_lookup_table[0]);
910 #endif
911 }
912 
913 #ifdef CONFIG_NEED_MULTIPLE_NODES
914 struct pglist_data *node_data[MAX_NUMNODES];
915 
916 EXPORT_SYMBOL(numa_cpu_lookup_table);
917 EXPORT_SYMBOL(numa_cpumask_lookup_table);
918 EXPORT_SYMBOL(node_data);
919 
920 struct mdesc_mlgroup {
921 	u64	node;
922 	u64	latency;
923 	u64	match;
924 	u64	mask;
925 };
926 static struct mdesc_mlgroup *mlgroups;
927 static int num_mlgroups;
928 
929 static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
930 				   u32 cfg_handle)
931 {
932 	u64 arc;
933 
934 	mdesc_for_each_arc(arc, md, pio, MDESC_ARC_TYPE_FWD) {
935 		u64 target = mdesc_arc_target(md, arc);
936 		const u64 *val;
937 
938 		val = mdesc_get_property(md, target,
939 					 "cfg-handle", NULL);
940 		if (val && *val == cfg_handle)
941 			return 0;
942 	}
943 	return -ENODEV;
944 }
945 
946 static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp,
947 				    u32 cfg_handle)
948 {
949 	u64 arc, candidate, best_latency = ~(u64)0;
950 
951 	candidate = MDESC_NODE_NULL;
952 	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
953 		u64 target = mdesc_arc_target(md, arc);
954 		const char *name = mdesc_node_name(md, target);
955 		const u64 *val;
956 
957 		if (strcmp(name, "pio-latency-group"))
958 			continue;
959 
960 		val = mdesc_get_property(md, target, "latency", NULL);
961 		if (!val)
962 			continue;
963 
964 		if (*val < best_latency) {
965 			candidate = target;
966 			best_latency = *val;
967 		}
968 	}
969 
970 	if (candidate == MDESC_NODE_NULL)
971 		return -ENODEV;
972 
973 	return scan_pio_for_cfg_handle(md, candidate, cfg_handle);
974 }
975 
976 int of_node_to_nid(struct device_node *dp)
977 {
978 	const struct linux_prom64_registers *regs;
979 	struct mdesc_handle *md;
980 	u32 cfg_handle;
981 	int count, nid;
982 	u64 grp;
983 
984 	/* This is the right thing to do on currently supported
985 	 * SUN4U NUMA platforms as well, as the PCI controller does
986 	 * not sit behind any particular memory controller.
987 	 */
988 	if (!mlgroups)
989 		return -1;
990 
991 	regs = of_get_property(dp, "reg", NULL);
992 	if (!regs)
993 		return -1;
994 
995 	cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff;
996 
997 	md = mdesc_grab();
998 
999 	count = 0;
1000 	nid = -1;
1001 	mdesc_for_each_node_by_name(md, grp, "group") {
1002 		if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
1003 			nid = count;
1004 			break;
1005 		}
1006 		count++;
1007 	}
1008 
1009 	mdesc_release(md);
1010 
1011 	return nid;
1012 }
1013 
1014 static void __init add_node_ranges(void)
1015 {
1016 	struct memblock_region *reg;
1017 
1018 	for_each_memblock(memory, reg) {
1019 		unsigned long size = reg->size;
1020 		unsigned long start, end;
1021 
1022 		start = reg->base;
1023 		end = start + size;
1024 		while (start < end) {
1025 			unsigned long this_end;
1026 			int nid;
1027 
1028 			this_end = memblock_nid_range(start, end, &nid);
1029 
1030 			numadbg("Setting memblock NUMA node nid[%d] "
1031 				"start[%lx] end[%lx]\n",
1032 				nid, start, this_end);
1033 
1034 			memblock_set_node(start, this_end - start,
1035 					  &memblock.memory, nid);
1036 			start = this_end;
1037 		}
1038 	}
1039 }
1040 
1041 static int __init grab_mlgroups(struct mdesc_handle *md)
1042 {
1043 	unsigned long paddr;
1044 	int count = 0;
1045 	u64 node;
1046 
1047 	mdesc_for_each_node_by_name(md, node, "memory-latency-group")
1048 		count++;
1049 	if (!count)
1050 		return -ENOENT;
1051 
1052 	paddr = memblock_alloc(count * sizeof(struct mdesc_mlgroup),
1053 			  SMP_CACHE_BYTES);
1054 	if (!paddr)
1055 		return -ENOMEM;
1056 
1057 	mlgroups = __va(paddr);
1058 	num_mlgroups = count;
1059 
1060 	count = 0;
1061 	mdesc_for_each_node_by_name(md, node, "memory-latency-group") {
1062 		struct mdesc_mlgroup *m = &mlgroups[count++];
1063 		const u64 *val;
1064 
1065 		m->node = node;
1066 
1067 		val = mdesc_get_property(md, node, "latency", NULL);
1068 		m->latency = *val;
1069 		val = mdesc_get_property(md, node, "address-match", NULL);
1070 		m->match = *val;
1071 		val = mdesc_get_property(md, node, "address-mask", NULL);
1072 		m->mask = *val;
1073 
1074 		numadbg("MLGROUP[%d]: node[%llx] latency[%llx] "
1075 			"match[%llx] mask[%llx]\n",
1076 			count - 1, m->node, m->latency, m->match, m->mask);
1077 	}
1078 
1079 	return 0;
1080 }
1081 
1082 static int __init grab_mblocks(struct mdesc_handle *md)
1083 {
1084 	unsigned long paddr;
1085 	int count = 0;
1086 	u64 node;
1087 
1088 	mdesc_for_each_node_by_name(md, node, "mblock")
1089 		count++;
1090 	if (!count)
1091 		return -ENOENT;
1092 
1093 	paddr = memblock_alloc(count * sizeof(struct mdesc_mblock),
1094 			  SMP_CACHE_BYTES);
1095 	if (!paddr)
1096 		return -ENOMEM;
1097 
1098 	mblocks = __va(paddr);
1099 	num_mblocks = count;
1100 
1101 	count = 0;
1102 	mdesc_for_each_node_by_name(md, node, "mblock") {
1103 		struct mdesc_mblock *m = &mblocks[count++];
1104 		const u64 *val;
1105 
1106 		val = mdesc_get_property(md, node, "base", NULL);
1107 		m->base = *val;
1108 		val = mdesc_get_property(md, node, "size", NULL);
1109 		m->size = *val;
1110 		val = mdesc_get_property(md, node,
1111 					 "address-congruence-offset", NULL);
1112 
1113 		/* The address-congruence-offset property is optional.
1114 		 * Explicitly zero it to identify this case.
1115 		 */
1116 		if (val)
1117 			m->offset = *val;
1118 		else
1119 			m->offset = 0UL;
1120 
1121 		numadbg("MBLOCK[%d]: base[%llx] size[%llx] offset[%llx]\n",
1122 			count - 1, m->base, m->size, m->offset);
1123 	}
1124 
1125 	return 0;
1126 }
1127 
1128 static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md,
1129 					       u64 grp, cpumask_t *mask)
1130 {
1131 	u64 arc;
1132 
1133 	cpumask_clear(mask);
1134 
1135 	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) {
1136 		u64 target = mdesc_arc_target(md, arc);
1137 		const char *name = mdesc_node_name(md, target);
1138 		const u64 *id;
1139 
1140 		if (strcmp(name, "cpu"))
1141 			continue;
1142 		id = mdesc_get_property(md, target, "id", NULL);
1143 		if (*id < nr_cpu_ids)
1144 			cpumask_set_cpu(*id, mask);
1145 	}
1146 }
1147 
1148 static struct mdesc_mlgroup * __init find_mlgroup(u64 node)
1149 {
1150 	int i;
1151 
1152 	for (i = 0; i < num_mlgroups; i++) {
1153 		struct mdesc_mlgroup *m = &mlgroups[i];
1154 		if (m->node == node)
1155 			return m;
1156 	}
1157 	return NULL;
1158 }
1159 
1160 static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
1161 				      int index)
1162 {
1163 	struct mdesc_mlgroup *candidate = NULL;
1164 	u64 arc, best_latency = ~(u64)0;
1165 	struct node_mem_mask *n;
1166 
1167 	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
1168 		u64 target = mdesc_arc_target(md, arc);
1169 		struct mdesc_mlgroup *m = find_mlgroup(target);
1170 		if (!m)
1171 			continue;
1172 		if (m->latency < best_latency) {
1173 			candidate = m;
1174 			best_latency = m->latency;
1175 		}
1176 	}
1177 	if (!candidate)
1178 		return -ENOENT;
1179 
1180 	if (num_node_masks != index) {
1181 		printk(KERN_ERR "Inconsistent NUMA state, "
1182 		       "index[%d] != num_node_masks[%d]\n",
1183 		       index, num_node_masks);
1184 		return -EINVAL;
1185 	}
1186 
1187 	n = &node_masks[num_node_masks++];
1188 
1189 	n->mask = candidate->mask;
1190 	n->val = candidate->match;
1191 
1192 	numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%llx])\n",
1193 		index, n->mask, n->val, candidate->latency);
1194 
1195 	return 0;
1196 }
1197 
1198 static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp,
1199 					 int index)
1200 {
1201 	cpumask_t mask;
1202 	int cpu;
1203 
1204 	numa_parse_mdesc_group_cpus(md, grp, &mask);
1205 
1206 	for_each_cpu(cpu, &mask)
1207 		numa_cpu_lookup_table[cpu] = index;
1208 	cpumask_copy(&numa_cpumask_lookup_table[index], &mask);
1209 
1210 	if (numa_debug) {
1211 		printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index);
1212 		for_each_cpu(cpu, &mask)
1213 			printk("%d ", cpu);
1214 		printk("]\n");
1215 	}
1216 
1217 	return numa_attach_mlgroup(md, grp, index);
1218 }
1219 
1220 static int __init numa_parse_mdesc(void)
1221 {
1222 	struct mdesc_handle *md = mdesc_grab();
1223 	int i, err, count;
1224 	u64 node;
1225 
1226 	node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
1227 	if (node == MDESC_NODE_NULL) {
1228 		mdesc_release(md);
1229 		return -ENOENT;
1230 	}
1231 
1232 	err = grab_mblocks(md);
1233 	if (err < 0)
1234 		goto out;
1235 
1236 	err = grab_mlgroups(md);
1237 	if (err < 0)
1238 		goto out;
1239 
1240 	count = 0;
1241 	mdesc_for_each_node_by_name(md, node, "group") {
1242 		err = numa_parse_mdesc_group(md, node, count);
1243 		if (err < 0)
1244 			break;
1245 		count++;
1246 	}
1247 
1248 	add_node_ranges();
1249 
1250 	for (i = 0; i < num_node_masks; i++) {
1251 		allocate_node_data(i);
1252 		node_set_online(i);
1253 	}
1254 
1255 	err = 0;
1256 out:
1257 	mdesc_release(md);
1258 	return err;
1259 }
1260 
1261 static int __init numa_parse_jbus(void)
1262 {
1263 	unsigned long cpu, index;
1264 
1265 	/* NUMA node id is encoded in bits 36 and higher, and there is
1266 	 * a 1-to-1 mapping from CPU ID to NUMA node ID.
1267 	 */
1268 	index = 0;
1269 	for_each_present_cpu(cpu) {
1270 		numa_cpu_lookup_table[cpu] = index;
1271 		cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu));
1272 		node_masks[index].mask = ~((1UL << 36UL) - 1UL);
1273 		node_masks[index].val = cpu << 36UL;
1274 
1275 		index++;
1276 	}
1277 	num_node_masks = index;
1278 
1279 	add_node_ranges();
1280 
1281 	for (index = 0; index < num_node_masks; index++) {
1282 		allocate_node_data(index);
1283 		node_set_online(index);
1284 	}
1285 
1286 	return 0;
1287 }
1288 
1289 static int __init numa_parse_sun4u(void)
1290 {
1291 	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
1292 		unsigned long ver;
1293 
1294 		__asm__ ("rdpr %%ver, %0" : "=r" (ver));
1295 		if ((ver >> 32UL) == __JALAPENO_ID ||
1296 		    (ver >> 32UL) == __SERRANO_ID)
1297 			return numa_parse_jbus();
1298 	}
1299 	return -1;
1300 }
1301 
1302 static int __init bootmem_init_numa(void)
1303 {
1304 	int err = -1;
1305 
1306 	numadbg("bootmem_init_numa()\n");
1307 
1308 	if (numa_enabled) {
1309 		if (tlb_type == hypervisor)
1310 			err = numa_parse_mdesc();
1311 		else
1312 			err = numa_parse_sun4u();
1313 	}
1314 	return err;
1315 }
1316 
1317 #else
1318 
1319 static int bootmem_init_numa(void)
1320 {
1321 	return -1;
1322 }
1323 
1324 #endif
1325 
1326 static void __init bootmem_init_nonnuma(void)
1327 {
1328 	unsigned long top_of_ram = memblock_end_of_DRAM();
1329 	unsigned long total_ram = memblock_phys_mem_size();
1330 
1331 	numadbg("bootmem_init_nonnuma()\n");
1332 
1333 	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
1334 	       top_of_ram, total_ram);
1335 	printk(KERN_INFO "Memory hole size: %ldMB\n",
1336 	       (top_of_ram - total_ram) >> 20);
1337 
1338 	init_node_masks_nonnuma();
1339 	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
1340 	allocate_node_data(0);
1341 	node_set_online(0);
1342 }
1343 
1344 static unsigned long __init bootmem_init(unsigned long phys_base)
1345 {
1346 	unsigned long end_pfn;
1347 
1348 	end_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
1349 	max_pfn = max_low_pfn = end_pfn;
1350 	min_low_pfn = (phys_base >> PAGE_SHIFT);
1351 
1352 	if (bootmem_init_numa() < 0)
1353 		bootmem_init_nonnuma();
1354 
1355 	/* Dump memblock with node info. */
1356 	memblock_dump_all();
1357 
1358 	/* XXX cpu notifier XXX */
1359 
1360 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
1361 	sparse_init();
1362 
1363 	return end_pfn;
1364 }
1365 
1366 static struct linux_prom64_registers pall[MAX_BANKS] __initdata;
1367 static int pall_ents __initdata;
1368 
1369 #ifdef CONFIG_DEBUG_PAGEALLOC
1370 static unsigned long __ref kernel_map_range(unsigned long pstart,
1371 					    unsigned long pend, pgprot_t prot)
1372 {
1373 	unsigned long vstart = PAGE_OFFSET + pstart;
1374 	unsigned long vend = PAGE_OFFSET + pend;
1375 	unsigned long alloc_bytes = 0UL;
1376 
1377 	if ((vstart & ~PAGE_MASK) || (vend & ~PAGE_MASK)) {
1378 		prom_printf("kernel_map: Unaligned physmem[%lx:%lx]\n",
1379 			    vstart, vend);
1380 		prom_halt();
1381 	}
1382 
1383 	while (vstart < vend) {
1384 		unsigned long this_end, paddr = __pa(vstart);
1385 		pgd_t *pgd = pgd_offset_k(vstart);
1386 		pud_t *pud;
1387 		pmd_t *pmd;
1388 		pte_t *pte;
1389 
1390 		pud = pud_offset(pgd, vstart);
1391 		if (pud_none(*pud)) {
1392 			pmd_t *new;
1393 
1394 			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
1395 			alloc_bytes += PAGE_SIZE;
1396 			pud_populate(&init_mm, pud, new);
1397 		}
1398 
1399 		pmd = pmd_offset(pud, vstart);
1400 		if (!pmd_present(*pmd)) {
1401 			pte_t *new;
1402 
1403 			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
1404 			alloc_bytes += PAGE_SIZE;
1405 			pmd_populate_kernel(&init_mm, pmd, new);
1406 		}
1407 
1408 		pte = pte_offset_kernel(pmd, vstart);
1409 		this_end = (vstart + PMD_SIZE) & PMD_MASK;
1410 		if (this_end > vend)
1411 			this_end = vend;
1412 
1413 		while (vstart < this_end) {
1414 			pte_val(*pte) = (paddr | pgprot_val(prot));
1415 
1416 			vstart += PAGE_SIZE;
1417 			paddr += PAGE_SIZE;
1418 			pte++;
1419 		}
1420 	}
1421 
1422 	return alloc_bytes;
1423 }
1424 
1425 extern unsigned int kvmap_linear_patch[1];
1426 #endif /* CONFIG_DEBUG_PAGEALLOC */
1427 
1428 static void __init kpte_set_val(unsigned long index, unsigned long val)
1429 {
1430 	unsigned long *ptr = kpte_linear_bitmap;
1431 
1432 	val <<= ((index % (BITS_PER_LONG / 2)) * 2);
1433 	ptr += (index / (BITS_PER_LONG / 2));
1434 
1435 	*ptr |= val;
1436 }
1437 
1438 static const unsigned long kpte_shift_min = 28; /* 256MB */
1439 static const unsigned long kpte_shift_max = 34; /* 16GB */
1440 static const unsigned long kpte_shift_incr = 3;
1441 
1442 static unsigned long kpte_mark_using_shift(unsigned long start, unsigned long end,
1443 					   unsigned long shift)
1444 {
1445 	unsigned long size = (1UL << shift);
1446 	unsigned long mask = (size - 1UL);
1447 	unsigned long remains = end - start;
1448 	unsigned long val;
1449 
1450 	if (remains < size || (start & mask))
1451 		return start;
1452 
1453 	/* VAL maps:
1454 	 *
1455 	 *	shift 28 --> kern_linear_pte_xor index 1
1456 	 *	shift 31 --> kern_linear_pte_xor index 2
1457 	 *	shift 34 --> kern_linear_pte_xor index 3
1458 	 */
1459 	val = ((shift - kpte_shift_min) / kpte_shift_incr) + 1;
1460 
1461 	remains &= ~mask;
1462 	if (shift != kpte_shift_max)
1463 		remains = size;
1464 
1465 	while (remains) {
1466 		unsigned long index = start >> kpte_shift_min;
1467 
1468 		kpte_set_val(index, val);
1469 
1470 		start += 1UL << kpte_shift_min;
1471 		remains -= 1UL << kpte_shift_min;
1472 	}
1473 
1474 	return start;
1475 }
1476 
1477 static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
1478 {
1479 	unsigned long smallest_size, smallest_mask;
1480 	unsigned long s;
1481 
1482 	smallest_size = (1UL << kpte_shift_min);
1483 	smallest_mask = (smallest_size - 1UL);
1484 
1485 	while (start < end) {
1486 		unsigned long orig_start = start;
1487 
1488 		for (s = kpte_shift_max; s >= kpte_shift_min; s -= kpte_shift_incr) {
1489 			start = kpte_mark_using_shift(start, end, s);
1490 
1491 			if (start != orig_start)
1492 				break;
1493 		}
1494 
1495 		if (start == orig_start)
1496 			start = (start + smallest_size) & ~smallest_mask;
1497 	}
1498 }
1499 
1500 static void __init init_kpte_bitmap(void)
1501 {
1502 	unsigned long i;
1503 
1504 	for (i = 0; i < pall_ents; i++) {
1505 		unsigned long phys_start, phys_end;
1506 
1507 		phys_start = pall[i].phys_addr;
1508 		phys_end = phys_start + pall[i].reg_size;
1509 
1510 		mark_kpte_bitmap(phys_start, phys_end);
1511 	}
1512 }
1513 
1514 static void __init kernel_physical_mapping_init(void)
1515 {
1516 #ifdef CONFIG_DEBUG_PAGEALLOC
1517 	unsigned long i, mem_alloced = 0UL;
1518 
1519 	for (i = 0; i < pall_ents; i++) {
1520 		unsigned long phys_start, phys_end;
1521 
1522 		phys_start = pall[i].phys_addr;
1523 		phys_end = phys_start + pall[i].reg_size;
1524 
1525 		mem_alloced += kernel_map_range(phys_start, phys_end,
1526 						PAGE_KERNEL);
1527 	}
1528 
1529 	printk("Allocated %ld bytes for kernel page tables.\n",
1530 	       mem_alloced);
1531 
1532 	kvmap_linear_patch[0] = 0x01000000; /* nop */
1533 	flushi(&kvmap_linear_patch[0]);
1534 
1535 	__flush_tlb_all();
1536 #endif
1537 }
1538 
1539 #ifdef CONFIG_DEBUG_PAGEALLOC
1540 void kernel_map_pages(struct page *page, int numpages, int enable)
1541 {
1542 	unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT;
1543 	unsigned long phys_end = phys_start + (numpages * PAGE_SIZE);
1544 
1545 	kernel_map_range(phys_start, phys_end,
1546 			 (enable ? PAGE_KERNEL : __pgprot(0)));
1547 
1548 	flush_tsb_kernel_range(PAGE_OFFSET + phys_start,
1549 			       PAGE_OFFSET + phys_end);
1550 
1551 	/* We should perform an IPI and flush all TLBs,
1552 	 * but that can deadlock, so we only flush the current cpu's TLB.
1553 	 */
1554 	__flush_tlb_kernel_range(PAGE_OFFSET + phys_start,
1555 				 PAGE_OFFSET + phys_end);
1556 }
1557 #endif
1558 
1559 unsigned long __init find_ecache_flush_span(unsigned long size)
1560 {
1561 	int i;
1562 
1563 	for (i = 0; i < pavail_ents; i++) {
1564 		if (pavail[i].reg_size >= size)
1565 			return pavail[i].phys_addr;
1566 	}
1567 
1568 	return ~0UL;
1569 }
1570 
1571 unsigned long PAGE_OFFSET;
1572 EXPORT_SYMBOL(PAGE_OFFSET);
1573 
1574 static void __init page_offset_shift_patch_one(unsigned int *insn, unsigned long phys_bits)
1575 {
1576 	unsigned long final_shift;
1577 	unsigned int val = *insn;
1578 	unsigned int cnt;
1579 
1580 	/* We are patching in ilog2(max_supported_phys_address), and
1581 	 * we are doing so in a manner similar to a relocation addend.
1582 	 * That is, we are adding the shift value to whatever value
1583 	 * is in the shift instruction count field already.
1584 	 */
1585 	cnt = (val & 0x3f);
1586 	val &= ~0x3f;
1587 
1588 	/* If we are trying to shift >= 64 bits, clear the destination
1589 	 * register.  This can happen when phys_bits ends up being equal
1590 	 * to MAX_PHYS_ADDRESS_BITS.
1591 	 */
1592 	final_shift = (cnt + (64 - phys_bits));
1593 	if (final_shift >= 64) {
1594 		unsigned int rd = (val >> 25) & 0x1f;
1595 
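		/* 0x80100000 is "or %g0, %g0, %rd", i.e. it clears rd. */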
1596 		val = 0x80100000 | (rd << 25);
1597 	} else {
1598 		val |= final_shift;
1599 	}
1600 	*insn = val;
1601 
1602 	__asm__ __volatile__("flush	%0"
1603 			     : /* no outputs */
1604 			     : "r" (insn));
1605 }
1606 
1607 static void __init page_offset_shift_patch(unsigned long phys_bits)
1608 {
1609 	extern unsigned int __page_offset_shift_patch;
1610 	extern unsigned int __page_offset_shift_patch_end;
1611 	unsigned int *p;
1612 
1613 	p = &__page_offset_shift_patch;
1614 	while (p < &__page_offset_shift_patch_end) {
1615 		unsigned int *insn = (unsigned int *)(unsigned long)*p;
1616 
1617 		page_offset_shift_patch_one(insn, phys_bits);
1618 
1619 		p++;
1620 	}
1621 }
1622 
1623 static void __init setup_page_offset(void)
1624 {
1625 	unsigned long max_phys_bits = 40;
1626 
1627 	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
1628 		max_phys_bits = 42;
1629 	} else if (tlb_type == hypervisor) {
1630 		switch (sun4v_chip_type) {
1631 		case SUN4V_CHIP_NIAGARA1:
1632 		case SUN4V_CHIP_NIAGARA2:
1633 			max_phys_bits = 39;
1634 			break;
1635 		case SUN4V_CHIP_NIAGARA3:
1636 			max_phys_bits = 43;
1637 			break;
1638 		case SUN4V_CHIP_NIAGARA4:
1639 		case SUN4V_CHIP_NIAGARA5:
1640 		case SUN4V_CHIP_SPARC64X:
1641 		default:
1642 			max_phys_bits = 47;
1643 			break;
1644 		}
1645 	}
1646 
1647 	if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) {
1648 		prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n",
1649 			    max_phys_bits);
1650 		prom_halt();
1651 	}
1652 
1653 	PAGE_OFFSET = PAGE_OFFSET_BY_BITS(max_phys_bits);
1654 
1655 	pr_info("PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n",
1656 		PAGE_OFFSET, max_phys_bits);
1657 
1658 	page_offset_shift_patch(max_phys_bits);
1659 }
1660 
1661 static void __init tsb_phys_patch(void)
1662 {
1663 	struct tsb_ldquad_phys_patch_entry *pquad;
1664 	struct tsb_phys_patch_entry *p;
1665 
1666 	pquad = &__tsb_ldquad_phys_patch;
1667 	while (pquad < &__tsb_ldquad_phys_patch_end) {
1668 		unsigned long addr = pquad->addr;
1669 
1670 		if (tlb_type == hypervisor)
1671 			*(unsigned int *) addr = pquad->sun4v_insn;
1672 		else
1673 			*(unsigned int *) addr = pquad->sun4u_insn;
1674 		wmb();
1675 		__asm__ __volatile__("flush	%0"
1676 				     : /* no outputs */
1677 				     : "r" (addr));
1678 
1679 		pquad++;
1680 	}
1681 
1682 	p = &__tsb_phys_patch;
1683 	while (p < &__tsb_phys_patch_end) {
1684 		unsigned long addr = p->addr;
1685 
1686 		*(unsigned int *) addr = p->insn;
1687 		wmb();
1688 		__asm__ __volatile__("flush	%0"
1689 				     : /* no outputs */
1690 				     : "r" (addr));
1691 
1692 		p++;
1693 	}
1694 }
1695 
1696 /* Don't mark as init, we give this to the Hypervisor.  */
1697 #ifndef CONFIG_DEBUG_PAGEALLOC
1698 #define NUM_KTSB_DESCR	2
1699 #else
1700 #define NUM_KTSB_DESCR	1
1701 #endif
1702 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR];
1703 extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
1704 
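/* Each patch site is a sethi/or pair that builds the kernel TSB's
 * physical address (pre-shifted by KTSB_PHYS_SHIFT): the sethi gets
 * the upper 22 bits, the or immediate the low 10 bits.
 */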
1705 static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa)
1706 {
1707 	pa >>= KTSB_PHYS_SHIFT;
1708 
1709 	while (start < end) {
1710 		unsigned int *ia = (unsigned int *)(unsigned long)*start;
1711 
1712 		ia[0] = (ia[0] & ~0x3fffff) | (pa >> 10);
1713 		__asm__ __volatile__("flush	%0" : : "r" (ia));
1714 
1715 		ia[1] = (ia[1] & ~0x3ff) | (pa & 0x3ff);
1716 		__asm__ __volatile__("flush	%0" : : "r" (ia + 1));
1717 
1718 		start++;
1719 	}
1720 }
1721 
1722 static void ktsb_phys_patch(void)
1723 {
1724 	extern unsigned int __swapper_tsb_phys_patch;
1725 	extern unsigned int __swapper_tsb_phys_patch_end;
1726 	unsigned long ktsb_pa;
1727 
1728 	ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE);
1729 	patch_one_ktsb_phys(&__swapper_tsb_phys_patch,
1730 			    &__swapper_tsb_phys_patch_end, ktsb_pa);
1731 #ifndef CONFIG_DEBUG_PAGEALLOC
1732 	{
1733 	extern unsigned int __swapper_4m_tsb_phys_patch;
1734 	extern unsigned int __swapper_4m_tsb_phys_patch_end;
1735 	ktsb_pa = (kern_base +
1736 		   ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));
1737 	patch_one_ktsb_phys(&__swapper_4m_tsb_phys_patch,
1738 			    &__swapper_4m_tsb_phys_patch_end, ktsb_pa);
1739 	}
1740 #endif
1741 }
1742 
1743 static void __init sun4v_ktsb_init(void)
1744 {
1745 	unsigned long ktsb_pa;
1746 
1747 	/* First KTSB for PAGE_SIZE mappings.  */
1748 	ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE);
1749 
1750 	switch (PAGE_SIZE) {
1751 	case 8 * 1024:
1752 	default:
1753 		ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_8K;
1754 		ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_8K;
1755 		break;
1756 
1757 	case 64 * 1024:
1758 		ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_64K;
1759 		ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_64K;
1760 		break;
1761 
1762 	case 512 * 1024:
1763 		ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_512K;
1764 		ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_512K;
1765 		break;
1766 
1767 	case 4 * 1024 * 1024:
1768 		ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_4MB;
1769 		ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_4MB;
1770 		break;
1771 	}
1772 
1773 	ktsb_descr[0].assoc = 1;
1774 	ktsb_descr[0].num_ttes = KERNEL_TSB_NENTRIES;
1775 	ktsb_descr[0].ctx_idx = 0;
1776 	ktsb_descr[0].tsb_base = ktsb_pa;
1777 	ktsb_descr[0].resv = 0;
1778 
1779 #ifndef CONFIG_DEBUG_PAGEALLOC
1780 	/* Second KTSB for 4MB/256MB/2GB/16GB mappings.  */
1781 	ktsb_pa = (kern_base +
1782 		   ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));
1783 
1784 	ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB;
1785 	ktsb_descr[1].pgsz_mask = ((HV_PGSZ_MASK_4MB |
1786 				    HV_PGSZ_MASK_256MB |
1787 				    HV_PGSZ_MASK_2GB |
1788 				    HV_PGSZ_MASK_16GB) &
1789 				   cpu_pgsz_mask);
1790 	ktsb_descr[1].assoc = 1;
1791 	ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES;
1792 	ktsb_descr[1].ctx_idx = 0;
1793 	ktsb_descr[1].tsb_base = ktsb_pa;
1794 	ktsb_descr[1].resv = 0;
1795 #endif
1796 }
1797 
1798 void sun4v_ktsb_register(void)
1799 {
1800 	unsigned long pa, ret;
1801 
1802 	pa = kern_base + ((unsigned long)&ktsb_descr[0] - KERNBASE);
1803 
1804 	ret = sun4v_mmu_tsb_ctx0(NUM_KTSB_DESCR, pa);
1805 	if (ret != 0) {
1806 		prom_printf("hypervisor_mmu_tsb_ctx0[%lx]: "
1807 			    "errors with %lx\n", pa, ret);
1808 		prom_halt();
1809 	}
1810 }
1811 
1812 static void __init sun4u_linear_pte_xor_finalize(void)
1813 {
1814 #ifndef CONFIG_DEBUG_PAGEALLOC
1815 	/* This is where we would add Panther support for
1816 	 * 32MB and 256MB pages.
1817 	 */
1818 #endif
1819 }
1820 
1821 static void __init sun4v_linear_pte_xor_finalize(void)
1822 {
1823 #ifndef CONFIG_DEBUG_PAGEALLOC
1824 	if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
1825 		kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
1826 			PAGE_OFFSET;
1827 		kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
1828 					   _PAGE_P_4V | _PAGE_W_4V);
1829 	} else {
1830 		kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
1831 	}
1832 
1833 	if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
1834 		kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
1835 			PAGE_OFFSET;
1836 		kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V |
1837 					   _PAGE_P_4V | _PAGE_W_4V);
1838 	} else {
1839 		kern_linear_pte_xor[2] = kern_linear_pte_xor[1];
1840 	}
1841 
1842 	if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
1843 		kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
1844 			PAGE_OFFSET;
1845 		kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V |
1846 					   _PAGE_P_4V | _PAGE_W_4V);
1847 	} else {
1848 		kern_linear_pte_xor[3] = kern_linear_pte_xor[2];
1849 	}
1850 #endif
1851 }
1852 
1853 /* paging_init() sets up the page tables */
1854 
1855 static unsigned long last_valid_pfn;
1856 pgd_t swapper_pg_dir[PTRS_PER_PGD];
1857 
1858 static void sun4u_pgprot_init(void);
1859 static void sun4v_pgprot_init(void);
1860 
1861 void __init paging_init(void)
1862 {
1863 	unsigned long end_pfn, shift, phys_base;
1864 	unsigned long real_end, i;
1865 	int node;
1866 
1867 	setup_page_offset();
1868 
1869 	/* These build time checks make sure that the dcache_dirty_cpu()
1870 	 * page->flags usage will work.
1871 	 *
1872 	 * When a page gets marked as dcache-dirty, we store the
1873 	 * cpu number starting at bit 32 in the page->flags.  Also,
1874 	 * functions like clear_dcache_dirty_cpu use the cpu mask
1875 	 * in 13-bit signed-immediate instruction fields.
1876 	 */
1877 
1878 	/*
1879 	 * Page flags must not reach into upper 32 bits that are used
1880 	 * Page flags must not reach into the upper 32 bits that are used
1881 	 * for the cpu number.
1882 	BUILD_BUG_ON(NR_PAGEFLAGS > 32);
1883 
1884 	/*
1885 	 * The bit fields placed in the high range must not reach below
1886 	 * the 32 bit boundary. Otherwise we cannot place the cpu field
1887 	 * at the 32 bit boundary.
1888 	 */
1889 	BUILD_BUG_ON(SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH +
1890 		ilog2(roundup_pow_of_two(NR_CPUS)) > 32);
1891 
1892 	BUILD_BUG_ON(NR_CPUS > 4096);
1893 
1894 	kern_base = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
1895 	kern_size = (unsigned long)&_end - (unsigned long)KERNBASE;
1896 
1897 	/* Invalidate both kernel TSBs.  */
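	/* (A 0x40 fill sets bit 6 of every byte, which includes the
	 *  TSB_TAG_INVALID_BIT in each tag word.)
	 */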
1898 	memset(swapper_tsb, 0x40, sizeof(swapper_tsb));
1899 #ifndef CONFIG_DEBUG_PAGEALLOC
1900 	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
1901 #endif
1902 
1903 	if (tlb_type == hypervisor)
1904 		sun4v_pgprot_init();
1905 	else
1906 		sun4u_pgprot_init();
1907 
1908 	if (tlb_type == cheetah_plus ||
1909 	    tlb_type == hypervisor) {
1910 		tsb_phys_patch();
1911 		ktsb_phys_patch();
1912 	}
1913 
1914 	if (tlb_type == hypervisor)
1915 		sun4v_patch_tlb_handlers();
1916 
1917 	/* Find available physical memory...
1918 	 *
1919 	 * Read it twice in order to work around a bug in openfirmware.
1920 	 * The call to grab this table itself can cause openfirmware to
1921 	 * allocate memory, which in turn can take away some space from
1922 	 * the list of available memory.  Reading it twice makes sure
1923 	 * we really do get the final value.
1924 	 */
1925 	read_obp_translations();
1926 	read_obp_memory("reg", &pall[0], &pall_ents);
1927 	read_obp_memory("available", &pavail[0], &pavail_ents);
1928 	read_obp_memory("available", &pavail[0], &pavail_ents);
1929 
1930 	phys_base = 0xffffffffffffffffUL;
1931 	for (i = 0; i < pavail_ents; i++) {
1932 		phys_base = min(phys_base, pavail[i].phys_addr);
1933 		memblock_add(pavail[i].phys_addr, pavail[i].reg_size);
1934 	}
1935 
1936 	memblock_reserve(kern_base, kern_size);
1937 
1938 	find_ramdisk(phys_base);
1939 
1940 	memblock_enforce_memory_limit(cmdline_memory_size);
1941 
1942 	memblock_allow_resize();
1943 	memblock_dump_all();
1944 
1945 	set_bit(0, mmu_context_bmap);
1946 
1947 	shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE);
1948 
1949 	real_end = (unsigned long)_end;
1950 	num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << ILOG2_4MB);
1951 	printk("Kernel: Using %d locked TLB entries for main kernel image.\n",
1952 	       num_kernel_image_mappings);
1953 
1954 	/* Set kernel pgd to upper alias so physical page computations
1955 	 * work.
1956 	 */
1957 	init_mm.pgd += ((shift) / (sizeof(pgd_t)));
1958 
1959 	memset(swapper_low_pmd_dir, 0, sizeof(swapper_low_pmd_dir));
1960 
1961 	/* Now can init the kernel/bad page tables. */
1962 	pud_set(pud_offset(&swapper_pg_dir[0], 0),
1963 		swapper_low_pmd_dir + (shift / sizeof(pgd_t)));
1964 
1965 	inherit_prom_mappings();
1966 
1967 	init_kpte_bitmap();
1968 
1969 	/* Ok, we can use our TLB miss and window trap handlers safely.  */
1970 	setup_tba();
1971 
1972 	__flush_tlb_all();
1973 
1974 	prom_build_devicetree();
1975 	of_populate_present_mask();
1976 #ifndef CONFIG_SMP
1977 	of_fill_in_cpu_data();
1978 #endif
1979 
1980 	if (tlb_type == hypervisor) {
1981 		sun4v_mdesc_init();
1982 		mdesc_populate_present_mask(cpu_all_mask);
1983 #ifndef CONFIG_SMP
1984 		mdesc_fill_in_cpu_data(cpu_all_mask);
1985 #endif
1986 		mdesc_get_page_sizes(cpu_all_mask, &cpu_pgsz_mask);
1987 
1988 		sun4v_linear_pte_xor_finalize();
1989 
1990 		sun4v_ktsb_init();
1991 		sun4v_ktsb_register();
1992 	} else {
1993 		unsigned long impl, ver;
1994 
1995 		cpu_pgsz_mask = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K |
1996 				 HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB);
1997 
1998 		__asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver));
1999 		impl = ((ver >> 32) & 0xffff);
2000 		if (impl == PANTHER_IMPL)
2001 			cpu_pgsz_mask |= (HV_PGSZ_MASK_32MB |
2002 					  HV_PGSZ_MASK_256MB);
2003 
2004 		sun4u_linear_pte_xor_finalize();
2005 	}
2006 
2007 	/* Flush the TLBs and the 4M TSB so that the updated linear
2008 	 * pte XOR settings are realized for all mappings.
2009 	 */
2010 	__flush_tlb_all();
2011 #ifndef CONFIG_DEBUG_PAGEALLOC
2012 	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
2013 #endif
2014 	__flush_tlb_all();
2015 
2016 	/* Setup bootmem... */
2017 	last_valid_pfn = end_pfn = bootmem_init(phys_base);
2018 
2019 	/* Once the OF device tree and MDESC have been set up, we know
2020 	 * the list of possible cpus.  Therefore we can allocate the
2021 	 * IRQ stacks.
2022 	 */
2023 	for_each_possible_cpu(i) {
2024 		node = cpu_to_node(i);
2025 
2026 		softirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node),
2027 							THREAD_SIZE,
2028 							THREAD_SIZE, 0);
2029 		hardirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node),
2030 							THREAD_SIZE,
2031 							THREAD_SIZE, 0);
2032 	}
2033 
2034 	kernel_physical_mapping_init();
2035 
2036 	{
2037 		unsigned long max_zone_pfns[MAX_NR_ZONES];
2038 
2039 		memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
2040 
2041 		max_zone_pfns[ZONE_NORMAL] = end_pfn;
2042 
2043 		free_area_init_nodes(max_zone_pfns);
2044 	}
2045 
2046 	printk("Booting Linux...\n");
2047 }
2048 
2049 int page_in_phys_avail(unsigned long paddr)
2050 {
2051 	int i;
2052 
2053 	paddr &= PAGE_MASK;
2054 
2055 	for (i = 0; i < pavail_ents; i++) {
2056 		unsigned long start, end;
2057 
2058 		start = pavail[i].phys_addr;
2059 		end = start + pavail[i].reg_size;
2060 
2061 		if (paddr >= start && paddr < end)
2062 			return 1;
2063 	}
2064 	if (paddr >= kern_base && paddr < (kern_base + kern_size))
2065 		return 1;
2066 #ifdef CONFIG_BLK_DEV_INITRD
2067 	if (paddr >= __pa(initrd_start) &&
2068 	    paddr < __pa(PAGE_ALIGN(initrd_end)))
2069 		return 1;
2070 #endif
2071 
2072 	return 0;
2073 }
2074 
2075 static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
2076 static int pavail_rescan_ents __initdata;
2077 
2078 /* Certain OBP calls, such as fetching "available" properties, can
2079  * claim physical memory.  So, along with initializing the valid
2080  * address bitmap, we refetch the physical available memory list
2081  * here and make sure it provides at least as much memory as
2082  * 'pavail' does.
2083  */
2084 static void __init setup_valid_addr_bitmap_from_pavail(unsigned long *bitmap)
2085 {
2086 	int i;
2087 
2088 	read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents);
2089 
2090 	for (i = 0; i < pavail_ents; i++) {
2091 		unsigned long old_start, old_end;
2092 
2093 		old_start = pavail[i].phys_addr;
2094 		old_end = old_start + pavail[i].reg_size;
2095 		while (old_start < old_end) {
2096 			int n;
2097 
2098 			for (n = 0; n < pavail_rescan_ents; n++) {
2099 				unsigned long new_start, new_end;
2100 
2101 				new_start = pavail_rescan[n].phys_addr;
2102 				new_end = new_start +
2103 					pavail_rescan[n].reg_size;
2104 
2105 				if (new_start <= old_start &&
2106 				    new_end >= (old_start + PAGE_SIZE)) {
2107 					set_bit(old_start >> ILOG2_4MB, bitmap);
2108 					goto do_next_page;
2109 				}
2110 			}
2111 
2112 			prom_printf("mem_init: Lost memory in pavail\n");
2113 			prom_printf("mem_init: OLD start[%lx] size[%lx]\n",
2114 				    pavail[i].phys_addr,
2115 				    pavail[i].reg_size);
2116 			prom_printf("mem_init: NEW start[%lx] size[%lx]\n",
2117 				    pavail_rescan[i].phys_addr,
2118 				    pavail_rescan[i].reg_size);
2119 			prom_printf("mem_init: Cannot continue, aborting.\n");
2120 			prom_halt();
2121 
2122 		do_next_page:
2123 			old_start += PAGE_SIZE;
2124 		}
2125 	}
2126 }
2127 
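/* Switch the kernel TLB miss handler over to the now-populated valid
 * address bitmap by copying the patch instructions over the
 * placeholders.  The second word is written before the first, with a
 * barrier in between, so the new first instruction never becomes
 * visible without its partner already in place.
 */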
2128 static void __init patch_tlb_miss_handler_bitmap(void)
2129 {
2130 	extern unsigned int valid_addr_bitmap_insn[];
2131 	extern unsigned int valid_addr_bitmap_patch[];
2132 
2133 	valid_addr_bitmap_insn[1] = valid_addr_bitmap_patch[1];
2134 	mb();
2135 	valid_addr_bitmap_insn[0] = valid_addr_bitmap_patch[0];
2136 	flushi(&valid_addr_bitmap_insn[0]);
2137 }
2138 
2139 static void __init register_page_bootmem_info(void)
2140 {
2141 #ifdef CONFIG_NEED_MULTIPLE_NODES
2142 	int i;
2143 
2144 	for_each_online_node(i)
2145 		if (NODE_DATA(i)->node_spanned_pages)
2146 			register_page_bootmem_info_node(NODE_DATA(i));
2147 #endif
2148 }
2149 void __init mem_init(void)
2150 {
2151 	unsigned long addr, last;
2152 
2153 	addr = PAGE_OFFSET + kern_base;
2154 	last = PAGE_ALIGN(kern_size) + addr;
2155 	while (addr < last) {
2156 		set_bit(__pa(addr) >> ILOG2_4MB, sparc64_valid_addr_bitmap);
2157 		addr += PAGE_SIZE;
2158 	}
2159 
2160 	setup_valid_addr_bitmap_from_pavail(sparc64_valid_addr_bitmap);
2161 	patch_tlb_miss_handler_bitmap();
2162 
2163 	high_memory = __va(last_valid_pfn << PAGE_SHIFT);
2164 
2165 	register_page_bootmem_info();
2166 	free_all_bootmem();
2167 
2168 	/*
2169 	 * Set up the zero page and mark it reserved so that its page
2170 	 * count is not manipulated when it is unmapped from user ptes.
2171 	 */
2172 	mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0);
2173 	if (mem_map_zero == NULL) {
2174 		prom_printf("paging_init: Cannot alloc zero page.\n");
2175 		prom_halt();
2176 	}
2177 	mark_page_reserved(mem_map_zero);
2178 
2179 	mem_init_print_info(NULL);
2180 
2181 	if (tlb_type == cheetah || tlb_type == cheetah_plus)
2182 		cheetah_ecache_flush_init();
2183 }
2184 
2185 void free_initmem(void)
2186 {
2187 	unsigned long addr, initend;
2188 	int do_free = 1;
2189 
2190 	/* If the physical memory maps were trimmed by kernel command
2191 	 * line options, do not attempt to free the initmem pages.
2192 	 * The kernel image could have been in the trimmed-out region,
2193 	 * and if so the freeing below would free invalid page structs.
2194 	 */
2195 	if (cmdline_memory_size)
2196 		do_free = 0;
2197 
2198 	/*
2199 	 * The init section is aligned to 8k in vmlinux.lds.  Page-align it for page sizes larger than 8k.
2200 	 */
2201 	addr = PAGE_ALIGN((unsigned long)(__init_begin));
2202 	initend = (unsigned long)(__init_end) & PAGE_MASK;
2203 	for (; addr < initend; addr += PAGE_SIZE) {
2204 		unsigned long page;
2205 
2206 		page = (addr +
2207 			((unsigned long) __va(kern_base)) -
2208 			((unsigned long) KERNBASE));
2209 		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
2210 
2211 		if (do_free)
2212 			free_reserved_page(virt_to_page(page));
2213 	}
2214 }
2215 
2216 #ifdef CONFIG_BLK_DEV_INITRD
2217 void free_initrd_mem(unsigned long start, unsigned long end)
2218 {
2219 	free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
2220 			   "initrd");
2221 }
2222 #endif
2223 
2224 #define _PAGE_CACHE_4U	(_PAGE_CP_4U | _PAGE_CV_4U)
2225 #define _PAGE_CACHE_4V	(_PAGE_CP_4V | _PAGE_CV_4V)
2226 #define __DIRTY_BITS_4U	 (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
2227 #define __DIRTY_BITS_4V	 (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
2228 #define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
2229 #define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
2230 
2231 pgprot_t PAGE_KERNEL __read_mostly;
2232 EXPORT_SYMBOL(PAGE_KERNEL);
2233 
2234 pgprot_t PAGE_KERNEL_LOCKED __read_mostly;
2235 pgprot_t PAGE_COPY __read_mostly;
2236 
2237 pgprot_t PAGE_SHARED __read_mostly;
2238 EXPORT_SYMBOL(PAGE_SHARED);
2239 
2240 unsigned long pg_iobits __read_mostly;
2241 
2242 unsigned long _PAGE_IE __read_mostly;
2243 EXPORT_SYMBOL(_PAGE_IE);
2244 
2245 unsigned long _PAGE_E __read_mostly;
2246 EXPORT_SYMBOL(_PAGE_E);
2247 
2248 unsigned long _PAGE_CACHE __read_mostly;
2249 EXPORT_SYMBOL(_PAGE_CACHE);
2250 
2251 #ifdef CONFIG_SPARSEMEM_VMEMMAP
2252 unsigned long vmemmap_table[VMEMMAP_SIZE];
2253 
2254 static long __meminitdata addr_start, addr_end;
2255 static int __meminitdata node_start;
2256 
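/* Populate the virtual memmap in 4MB chunks.  Each chunk is backed by
 * a 4MB block from vmemmap_alloc_block() and its TTE is recorded in
 * the corresponding vmemmap_table[] slot.
 */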
2257 int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
2258 			       int node)
2259 {
2260 	unsigned long phys_start = (vstart - VMEMMAP_BASE);
2261 	unsigned long phys_end = (vend - VMEMMAP_BASE);
2262 	unsigned long addr = phys_start & VMEMMAP_CHUNK_MASK;
2263 	unsigned long end = VMEMMAP_ALIGN(phys_end);
2264 	unsigned long pte_base;
2265 
2266 	pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
2267 		    _PAGE_CP_4U | _PAGE_CV_4U |
2268 		    _PAGE_P_4U | _PAGE_W_4U);
2269 	if (tlb_type == hypervisor)
2270 		pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
2271 			    _PAGE_CP_4V | _PAGE_CV_4V |
2272 			    _PAGE_P_4V | _PAGE_W_4V);
2273 
2274 	for (; addr < end; addr += VMEMMAP_CHUNK) {
2275 		unsigned long *vmem_pp =
2276 			vmemmap_table + (addr >> VMEMMAP_CHUNK_SHIFT);
2277 		void *block;
2278 
2279 		if (!(*vmem_pp & _PAGE_VALID)) {
2280 			block = vmemmap_alloc_block(1UL << ILOG2_4MB, node);
2281 			if (!block)
2282 				return -ENOMEM;
2283 
2284 			*vmem_pp = pte_base | __pa(block);
2285 
2286 			/* Check whether this block continues the previous contiguous run on the same node. */
2287 			if (addr_end != addr || node_start != node) {
2288 				if (addr_start)
2289 					printk(KERN_DEBUG " [%lx-%lx] on node %d\n",
2290 					       addr_start, addr_end-1, node_start);
2291 				addr_start = addr;
2292 				node_start = node;
2293 			}
2294 			addr_end = addr + VMEMMAP_CHUNK;
2295 		}
2296 	}
2297 	return 0;
2298 }
2299 
2300 void __meminit vmemmap_populate_print_last(void)
2301 {
2302 	if (addr_start) {
2303 		printk(KERN_DEBUG " [%lx-%lx] on node %d\n",
2304 		       addr_start, addr_end-1, node_start);
2305 		addr_start = 0;
2306 		addr_end = 0;
2307 		node_start = 0;
2308 	}
2309 }
2310 
2311 void vmemmap_free(unsigned long start, unsigned long end)
2312 {
2313 }
2314 
2315 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
2316 
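/* Fill in protection_map[] from the cpu-specific protection bits.  The
 * index is the VM_READ/VM_WRITE/VM_EXEC/VM_SHARED combination: entries
 * 0x0-0x7 are the private (copy-on-write) variants, 0x8-0xf the shared
 * ones, and the exec bit is cleared wherever VM_EXEC is not set.
 */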
2317 static void prot_init_common(unsigned long page_none,
2318 			     unsigned long page_shared,
2319 			     unsigned long page_copy,
2320 			     unsigned long page_readonly,
2321 			     unsigned long page_exec_bit)
2322 {
2323 	PAGE_COPY = __pgprot(page_copy);
2324 	PAGE_SHARED = __pgprot(page_shared);
2325 
2326 	protection_map[0x0] = __pgprot(page_none);
2327 	protection_map[0x1] = __pgprot(page_readonly & ~page_exec_bit);
2328 	protection_map[0x2] = __pgprot(page_copy & ~page_exec_bit);
2329 	protection_map[0x3] = __pgprot(page_copy & ~page_exec_bit);
2330 	protection_map[0x4] = __pgprot(page_readonly);
2331 	protection_map[0x5] = __pgprot(page_readonly);
2332 	protection_map[0x6] = __pgprot(page_copy);
2333 	protection_map[0x7] = __pgprot(page_copy);
2334 	protection_map[0x8] = __pgprot(page_none);
2335 	protection_map[0x9] = __pgprot(page_readonly & ~page_exec_bit);
2336 	protection_map[0xa] = __pgprot(page_shared & ~page_exec_bit);
2337 	protection_map[0xb] = __pgprot(page_shared & ~page_exec_bit);
2338 	protection_map[0xc] = __pgprot(page_readonly);
2339 	protection_map[0xd] = __pgprot(page_readonly);
2340 	protection_map[0xe] = __pgprot(page_shared);
2341 	protection_map[0xf] = __pgprot(page_shared);
2342 }
2343 
2344 static void __init sun4u_pgprot_init(void)
2345 {
2346 	unsigned long page_none, page_shared, page_copy, page_readonly;
2347 	unsigned long page_exec_bit;
2348 	int i;
2349 
2350 	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID |
2351 				_PAGE_CACHE_4U | _PAGE_P_4U |
2352 				__ACCESS_BITS_4U | __DIRTY_BITS_4U |
2353 				_PAGE_EXEC_4U);
2354 	PAGE_KERNEL_LOCKED = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID |
2355 				       _PAGE_CACHE_4U | _PAGE_P_4U |
2356 				       __ACCESS_BITS_4U | __DIRTY_BITS_4U |
2357 				       _PAGE_EXEC_4U | _PAGE_L_4U);
2358 
2359 	_PAGE_IE = _PAGE_IE_4U;
2360 	_PAGE_E = _PAGE_E_4U;
2361 	_PAGE_CACHE = _PAGE_CACHE_4U;
2362 
2363 	pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4U | __DIRTY_BITS_4U |
2364 		     __ACCESS_BITS_4U | _PAGE_E_4U);
2365 
2366 #ifdef CONFIG_DEBUG_PAGEALLOC
2367 	kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
2368 #else
2369 	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^
2370 		PAGE_OFFSET;
2371 #endif
2372 	kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
2373 				   _PAGE_P_4U | _PAGE_W_4U);
2374 
2375 	for (i = 1; i < 4; i++)
2376 		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
2377 
2378 	_PAGE_ALL_SZ_BITS =  (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U |
2379 			      _PAGE_SZ64K_4U | _PAGE_SZ8K_4U |
2380 			      _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U);
2381 
2382 
2383 	page_none = _PAGE_PRESENT_4U | _PAGE_ACCESSED_4U | _PAGE_CACHE_4U;
2384 	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
2385 		       __ACCESS_BITS_4U | _PAGE_WRITE_4U | _PAGE_EXEC_4U);
2386 	page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
2387 		       __ACCESS_BITS_4U | _PAGE_EXEC_4U);
2388 	page_readonly   = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
2389 			   __ACCESS_BITS_4U | _PAGE_EXEC_4U);
2390 
2391 	page_exec_bit = _PAGE_EXEC_4U;
2392 
2393 	prot_init_common(page_none, page_shared, page_copy, page_readonly,
2394 			 page_exec_bit);
2395 }
2396 
2397 static void __init sun4v_pgprot_init(void)
2398 {
2399 	unsigned long page_none, page_shared, page_copy, page_readonly;
2400 	unsigned long page_exec_bit;
2401 	int i;
2402 
2403 	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID |
2404 				_PAGE_CACHE_4V | _PAGE_P_4V |
2405 				__ACCESS_BITS_4V | __DIRTY_BITS_4V |
2406 				_PAGE_EXEC_4V);
2407 	PAGE_KERNEL_LOCKED = PAGE_KERNEL;
2408 
2409 	_PAGE_IE = _PAGE_IE_4V;
2410 	_PAGE_E = _PAGE_E_4V;
2411 	_PAGE_CACHE = _PAGE_CACHE_4V;
2412 
2413 #ifdef CONFIG_DEBUG_PAGEALLOC
2414 	kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
2415 #else
2416 	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
2417 		PAGE_OFFSET;
2418 #endif
2419 	kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
2420 				   _PAGE_P_4V | _PAGE_W_4V);
2421 
2422 	for (i = 1; i < 4; i++)
2423 		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
2424 
2425 	pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V |
2426 		     __ACCESS_BITS_4V | _PAGE_E_4V);
2427 
2428 	_PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V |
2429 			     _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V |
2430 			     _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V |
2431 			     _PAGE_SZ64K_4V | _PAGE_SZ8K_4V);
2432 
2433 	page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | _PAGE_CACHE_4V;
2434 	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
2435 		       __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V);
2436 	page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
2437 		       __ACCESS_BITS_4V | _PAGE_EXEC_4V);
2438 	page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
2439 			 __ACCESS_BITS_4V | _PAGE_EXEC_4V);
2440 
2441 	page_exec_bit = _PAGE_EXEC_4V;
2442 
2443 	prot_init_common(page_none, page_shared, page_copy, page_readonly,
2444 			 page_exec_bit);
2445 }
2446 
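/* Return the TTE page-size field for the given page size, using the
 * sun4v encodings under the hypervisor and the sun4u encodings
 * otherwise.  Unrecognized sizes fall back to 8K.
 */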
2447 unsigned long pte_sz_bits(unsigned long sz)
2448 {
2449 	if (tlb_type == hypervisor) {
2450 		switch (sz) {
2451 		case 8 * 1024:
2452 		default:
2453 			return _PAGE_SZ8K_4V;
2454 		case 64 * 1024:
2455 			return _PAGE_SZ64K_4V;
2456 		case 512 * 1024:
2457 			return _PAGE_SZ512K_4V;
2458 		case 4 * 1024 * 1024:
2459 			return _PAGE_SZ4MB_4V;
2460 		}
2461 	} else {
2462 		switch (sz) {
2463 		case 8 * 1024:
2464 		default:
2465 			return _PAGE_SZ8K_4U;
2466 		case 64 * 1024:
2467 			return _PAGE_SZ64K_4U;
2468 		case 512 * 1024:
2469 			return _PAGE_SZ512K_4U;
2470 		case 4 * 1024 * 1024:
2471 			return _PAGE_SZ4MB_4U;
2472 		}
2473 	}
2474 }
2475 
2476 pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space, unsigned long page_size)
2477 {
2478 	pte_t pte;
2479 
2480 	pte_val(pte)  = page | pgprot_val(pgprot_noncached(prot));
2481 	pte_val(pte) |= (((unsigned long)space) << 32);
2482 	pte_val(pte) |= pte_sz_bits(page_size);
2483 
2484 	return pte;
2485 }
2486 
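/* Build the TTE used for the kernel's 4MB linear mappings of physical
 * address 'paddr': valid, 4MB, cacheable, privileged, executable and
 * writable (and locked on sun4u).
 */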
2487 static unsigned long kern_large_tte(unsigned long paddr)
2488 {
2489 	unsigned long val;
2490 
2491 	val = (_PAGE_VALID | _PAGE_SZ4MB_4U |
2492 	       _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U |
2493 	       _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U);
2494 	if (tlb_type == hypervisor)
2495 		val = (_PAGE_VALID | _PAGE_SZ4MB_4V |
2496 		       _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V |
2497 		       _PAGE_EXEC_4V | _PAGE_W_4V);
2498 
2499 	return val | paddr;
2500 }
2501 
2502 /* Zap every TLB entry that is not locked; locked entries are left in place. */
2503 void __flush_tlb_all(void)
2504 {
2505 	unsigned long pstate;
2506 	int i;
2507 
2508 	__asm__ __volatile__("flushw\n\t"
2509 			     "rdpr	%%pstate, %0\n\t"
2510 			     "wrpr	%0, %1, %%pstate"
2511 			     : "=r" (pstate)
2512 			     : "i" (PSTATE_IE));
2513 	if (tlb_type == hypervisor) {
2514 		sun4v_mmu_demap_all();
2515 	} else if (tlb_type == spitfire) {
2516 		for (i = 0; i < 64; i++) {
2517 			/* Spitfire Errata #32 workaround */
2518 			/* NOTE: Always runs on spitfire, so no
2519 			 *       cheetah+ page size encodings.
2520 			 */
2521 			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
2522 					     "flush	%%g6"
2523 					     : /* No outputs */
2524 					     : "r" (0),
2525 					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
2526 
2527 			if (!(spitfire_get_dtlb_data(i) & _PAGE_L_4U)) {
2528 				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
2529 						     "membar #Sync"
2530 						     : /* no outputs */
2531 						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
2532 				spitfire_put_dtlb_data(i, 0x0UL);
2533 			}
2534 
2535 			/* Spitfire Errata #32 workaround */
2536 			/* NOTE: Always runs on spitfire, so no
2537 			 *       cheetah+ page size encodings.
2538 			 */
2539 			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
2540 					     "flush	%%g6"
2541 					     : /* No outputs */
2542 					     : "r" (0),
2543 					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
2544 
2545 			if (!(spitfire_get_itlb_data(i) & _PAGE_L_4U)) {
2546 				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
2547 						     "membar #Sync"
2548 						     : /* no outputs */
2549 						     : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU));
2550 				spitfire_put_itlb_data(i, 0x0UL);
2551 			}
2552 		}
2553 	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
2554 		cheetah_flush_dtlb_all();
2555 		cheetah_flush_itlb_all();
2556 	}
2557 	__asm__ __volatile__("wrpr	%0, 0, %%pstate"
2558 			     : : "r" (pstate));
2559 }
2560 
2561 pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
2562 			    unsigned long address)
2563 {
2564 	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
2565 				       __GFP_REPEAT | __GFP_ZERO);
2566 	pte_t *pte = NULL;
2567 
2568 	if (page)
2569 		pte = (pte_t *) page_address(page);
2570 
2571 	return pte;
2572 }
2573 
2574 pgtable_t pte_alloc_one(struct mm_struct *mm,
2575 			unsigned long address)
2576 {
2577 	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
2578 				       __GFP_REPEAT | __GFP_ZERO);
2579 	if (!page)
2580 		return NULL;
2581 	if (!pgtable_page_ctor(page)) {
2582 		free_hot_cold_page(page, 0);
2583 		return NULL;
2584 	}
2585 	return (pte_t *) page_address(page);
2586 }
2587 
2588 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
2589 {
2590 	free_page((unsigned long)pte);
2591 }
2592 
2593 static void __pte_free(pgtable_t pte)
2594 {
2595 	struct page *page = virt_to_page(pte);
2596 
2597 	pgtable_page_dtor(page);
2598 	__free_page(page);
2599 }
2600 
2601 void pte_free(struct mm_struct *mm, pgtable_t pte)
2602 {
2603 	__pte_free(pte);
2604 }
2605 
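/* Free a page table page.  PTE tables are whole pages and go through
 * __pte_free(); the other levels are returned to pgtable_cache.
 */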
2606 void pgtable_free(void *table, bool is_page)
2607 {
2608 	if (is_page)
2609 		__pte_free(table);
2610 	else
2611 		kmem_cache_free(pgtable_cache, table);
2612 }
2613 
2614 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2615 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
2616 			  pmd_t *pmd)
2617 {
2618 	unsigned long pte, flags;
2619 	struct mm_struct *mm;
2620 	pmd_t entry = *pmd;
2621 
2622 	if (!pmd_large(entry) || !pmd_young(entry))
2623 		return;
2624 
2625 	pte = pmd_val(entry);
2626 
2627 	/* Don't insert a non-valid PMD into the TSB, we'll deadlock.  */
2628 	if (!(pte & _PAGE_VALID))
2629 		return;
2630 
2631 	/* We are fabricating 8MB pages using 4MB real hw pages.  */
2632 	pte |= (addr & (1UL << REAL_HPAGE_SHIFT));
2633 
2634 	mm = vma->vm_mm;
2635 
2636 	spin_lock_irqsave(&mm->context.lock, flags);
2637 
2638 	if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL)
2639 		__update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
2640 					addr, pte);
2641 
2642 	spin_unlock_irqrestore(&mm->context.lock, flags);
2643 }
2644 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2645 
2646 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
2647 static void context_reload(void *__data)
2648 {
2649 	struct mm_struct *mm = __data;
2650 
2651 	if (mm == current->mm)
2652 		load_secondary_context(mm);
2653 }
2654 
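/* Set up huge page handling for the current mm: grow the MM_TSB_HUGE
 * TSB if it does not exist yet, make it live, and on cheetah+ also
 * reprogram the context register page-size fields so the second half
 * of the D-TLB is used for huge pages.
 */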
2655 void hugetlb_setup(struct pt_regs *regs)
2656 {
2657 	struct mm_struct *mm = current->mm;
2658 	struct tsb_config *tp;
2659 
2660 	if (in_atomic() || !mm) {
2661 		const struct exception_table_entry *entry;
2662 
2663 		entry = search_exception_tables(regs->tpc);
2664 		if (entry) {
2665 			regs->tpc = entry->fixup;
2666 			regs->tnpc = regs->tpc + 4;
2667 			return;
2668 		}
2669 		pr_alert("Unexpected HugeTLB setup in atomic context.\n");
2670 		die_if_kernel("HugeTSB in atomic", regs);
2671 	}
2672 
2673 	tp = &mm->context.tsb_block[MM_TSB_HUGE];
2674 	if (likely(tp->tsb == NULL))
2675 		tsb_grow(mm, MM_TSB_HUGE, 0);
2676 
2677 	tsb_context_switch(mm);
2678 	smp_tsb_sync(mm);
2679 
2680 	/* On UltraSPARC-III+ and later, configure the second half of
2681 	 * the Data-TLB for huge pages.
2682 	 */
2683 	if (tlb_type == cheetah_plus) {
2684 		unsigned long ctx;
2685 
2686 		spin_lock(&ctx_alloc_lock);
2687 		ctx = mm->context.sparc64_ctx_val;
2688 		ctx &= ~CTX_PGSZ_MASK;
2689 		ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
2690 		ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT;
2691 
2692 		if (ctx != mm->context.sparc64_ctx_val) {
2693 			/* When changing the page size fields, we
2694 			 * must perform a context flush so that no
2695 			 * stale entries match.  This flush must
2696 			 * occur with the original context register
2697 			 * settings.
2698 			 */
2699 			do_flush_tlb_mm(mm);
2700 
2701 			/* Reload the context register of all processors
2702 			 * also executing in this address space.
2703 			 */
2704 			mm->context.sparc64_ctx_val = ctx;
2705 			on_each_cpu(context_reload, mm, 0);
2706 		}
2707 		spin_unlock(&ctx_alloc_lock);
2708 	}
2709 }
2710 #endif
2711 
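/* Resources describing the kernel image, shown under /proc/iomem
 * nested inside the "System RAM" ranges registered by report_memory().
 * Their physical extents are filled in by kernel_lds_init().
 */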
2712 static struct resource code_resource = {
2713 	.name	= "Kernel code",
2714 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
2715 };
2716 
2717 static struct resource data_resource = {
2718 	.name	= "Kernel data",
2719 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
2720 };
2721 
2722 static struct resource bss_resource = {
2723 	.name	= "Kernel bss",
2724 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
2725 };
2726 
2727 static inline resource_size_t compute_kern_paddr(void *addr)
2728 {
2729 	return (resource_size_t) (addr - KERNBASE + kern_base);
2730 }
2731 
2732 static void __init kernel_lds_init(void)
2733 {
2734 	code_resource.start = compute_kern_paddr(_text);
2735 	code_resource.end   = compute_kern_paddr(_etext - 1);
2736 	data_resource.start = compute_kern_paddr(_etext);
2737 	data_resource.end   = compute_kern_paddr(_edata - 1);
2738 	bss_resource.start  = compute_kern_paddr(__bss_start);
2739 	bss_resource.end    = compute_kern_paddr(_end - 1);
2740 }
2741 
2742 static int __init report_memory(void)
2743 {
2744 	int i;
2745 	struct resource *res;
2746 
2747 	kernel_lds_init();
2748 
2749 	for (i = 0; i < pavail_ents; i++) {
2750 		res = kzalloc(sizeof(struct resource), GFP_KERNEL);
2751 
2752 		if (!res) {
2753 			pr_warn("Failed to allocate resource.\n");
2754 			break;
2755 		}
2756 
2757 		res->name = "System RAM";
2758 		res->start = pavail[i].phys_addr;
2759 		res->end = pavail[i].phys_addr + pavail[i].reg_size - 1;
2760 		res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
2761 
2762 		if (insert_resource(&iomem_resource, res) < 0) {
2763 			pr_warn("Resource insertion failed.\n");
2764 			break;
2765 		}
2766 
2767 		insert_resource(res, &code_resource);
2768 		insert_resource(res, &data_resource);
2769 		insert_resource(res, &bss_resource);
2770 	}
2771 
2772 	return 0;
2773 }
2774 device_initcall(report_memory);
2775 
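/* Kernel TLB range flushes must be broadcast to all cpus on SMP; on UP
 * the local flush is sufficient.
 */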
2776 #ifdef CONFIG_SMP
2777 #define do_flush_tlb_kernel_range	smp_flush_tlb_kernel_range
2778 #else
2779 #define do_flush_tlb_kernel_range	__flush_tlb_kernel_range
2780 #endif
2781 
2782 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
2783 {
2784 	if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) {
2785 		if (start < LOW_OBP_ADDRESS) {
2786 			flush_tsb_kernel_range(start, LOW_OBP_ADDRESS);
2787 			do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS);
2788 		}
2789 		if (end > HI_OBP_ADDRESS) {
2790 			flush_tsb_kernel_range(HI_OBP_ADDRESS, end);
2791 			do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end);
2792 		}
2793 	} else {
2794 		flush_tsb_kernel_range(start, end);
2795 		do_flush_tlb_kernel_range(start, end);
2796 	}
2797 }
2798