1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Page table handling routines for radix page table.
4  *
5  * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
6  */
7 
8 #define pr_fmt(fmt) "radix-mmu: " fmt
9 
10 #include <linux/kernel.h>
11 #include <linux/sched/mm.h>
12 #include <linux/memblock.h>
13 #include <linux/of_fdt.h>
14 #include <linux/mm.h>
15 #include <linux/string_helpers.h>
16 #include <linux/stop_machine.h>
17 
18 #include <asm/pgtable.h>
19 #include <asm/pgalloc.h>
20 #include <asm/mmu_context.h>
21 #include <asm/dma.h>
22 #include <asm/machdep.h>
23 #include <asm/mmu.h>
24 #include <asm/firmware.h>
25 #include <asm/powernv.h>
26 #include <asm/sections.h>
27 #include <asm/trace.h>
28 #include <asm/uaccess.h>
29 
30 #include <trace/events/thp.h>
31 
32 unsigned int mmu_pid_bits;
33 unsigned int mmu_base_pid;
34 
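/*
 * Native (bare-metal) hook for register_process_table(): install the host
 * process table in the second doubleword of partition table entry 0,
 * preserving the existing first doubleword. PATB_GR marks the process
 * table as using radix translation. The pg_sz argument is unused here.
 */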
35 static int native_register_process_table(unsigned long base, unsigned long pg_sz,
36 					 unsigned long table_size)
37 {
38 	unsigned long patb0, patb1;
39 
40 	patb0 = be64_to_cpu(partition_tb[0].patb0);
41 	patb1 = base | table_size | PATB_GR;
42 
43 	mmu_partition_table_set_entry(0, patb0, patb1);
44 
45 	return 0;
46 }
47 
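/*
 * Boot-time page table allocator: returns a size-aligned block from
 * memblock, before the slab allocator is available. nid, region_start
 * and region_end are placement hints; failure to allocate is fatal.
 */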
48 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
49 			unsigned long region_start, unsigned long region_end)
50 {
51 	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
52 	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
53 	void *ptr;
54 
55 	if (region_start)
56 		min_addr = region_start;
57 	if (region_end)
58 		max_addr = region_end;
59 
60 	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
61 
62 	if (!ptr)
63 		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
64 		      __func__, size, size, nid, &min_addr, &max_addr);
65 
66 	return ptr;
67 }
68 
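/*
 * Establish a kernel mapping for ea -> pa at PUD, PMD or base page size,
 * allocating any missing intermediate page tables from memblock. Used
 * while slab is not yet available; see __map_kernel_page() for the
 * post-boot path.
 */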
69 static int early_map_kernel_page(unsigned long ea, unsigned long pa,
70 			  pgprot_t flags,
71 			  unsigned int map_page_size,
72 			  int nid,
73 			  unsigned long region_start, unsigned long region_end)
74 {
75 	unsigned long pfn = pa >> PAGE_SHIFT;
76 	pgd_t *pgdp;
77 	pud_t *pudp;
78 	pmd_t *pmdp;
79 	pte_t *ptep;
80 
81 	pgdp = pgd_offset_k(ea);
82 	if (pgd_none(*pgdp)) {
83 		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
84 						region_start, region_end);
85 		pgd_populate(&init_mm, pgdp, pudp);
86 	}
87 	pudp = pud_offset(pgdp, ea);
88 	if (map_page_size == PUD_SIZE) {
89 		ptep = (pte_t *)pudp;
90 		goto set_the_pte;
91 	}
92 	if (pud_none(*pudp)) {
93 		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
94 						region_start, region_end);
95 		pud_populate(&init_mm, pudp, pmdp);
96 	}
97 	pmdp = pmd_offset(pudp, ea);
98 	if (map_page_size == PMD_SIZE) {
99 		ptep = pmdp_ptep(pmdp);
100 		goto set_the_pte;
101 	}
102 	if (!pmd_present(*pmdp)) {
103 		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
104 						region_start, region_end);
105 		pmd_populate_kernel(&init_mm, pmdp, ptep);
106 	}
107 	ptep = pte_offset_kernel(pmdp, ea);
108 
109 set_the_pte:
110 	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
111 	smp_wmb();
112 	return 0;
113 }
114 
115 /*
116  * nid, region_start, and region_end are hints to try to place the page
117  * table memory in the same node or region.
118  */
119 static int __map_kernel_page(unsigned long ea, unsigned long pa,
120 			  pgprot_t flags,
121 			  unsigned int map_page_size,
122 			  int nid,
123 			  unsigned long region_start, unsigned long region_end)
124 {
125 	unsigned long pfn = pa >> PAGE_SHIFT;
126 	pgd_t *pgdp;
127 	pud_t *pudp;
128 	pmd_t *pmdp;
129 	pte_t *ptep;
130 	/*
131 	 * Make sure the task size is within the maximum radix address range
132 	 */
133 	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
134 
135 #ifdef CONFIG_PPC_64K_PAGES
136 	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
137 #endif
138 
139 	if (unlikely(!slab_is_available()))
140 		return early_map_kernel_page(ea, pa, flags, map_page_size,
141 						nid, region_start, region_end);
142 
143 	/*
144 	 * TODO: the page table allocation functions should be able to take a
145 	 * node, so we can place kernel page tables on the right node after
146 	 * boot.
147 	 */
148 	pgdp = pgd_offset_k(ea);
149 	pudp = pud_alloc(&init_mm, pgdp, ea);
150 	if (!pudp)
151 		return -ENOMEM;
152 	if (map_page_size == PUD_SIZE) {
153 		ptep = (pte_t *)pudp;
154 		goto set_the_pte;
155 	}
156 	pmdp = pmd_alloc(&init_mm, pudp, ea);
157 	if (!pmdp)
158 		return -ENOMEM;
159 	if (map_page_size == PMD_SIZE) {
160 		ptep = pmdp_ptep(pmdp);
161 		goto set_the_pte;
162 	}
163 	ptep = pte_alloc_kernel(pmdp, ea);
164 	if (!ptep)
165 		return -ENOMEM;
166 
167 set_the_pte:
168 	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
169 	smp_wmb();
170 	return 0;
171 }
172 
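/*
 * Map a single kernel page (or huge page) with no NUMA placement hint.
 * map_page_size must be PAGE_SIZE, PMD_SIZE or PUD_SIZE.
 *
 * Illustrative use only (not taken from a real caller): mapping one base
 * page of physical memory at its linear-map address might look like
 *
 *	radix__map_kernel_page((unsigned long)__va(pa), pa,
 *			       PAGE_KERNEL, PAGE_SIZE);
 */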
173 int radix__map_kernel_page(unsigned long ea, unsigned long pa,
174 			  pgprot_t flags,
175 			  unsigned int map_page_size)
176 {
177 	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
178 }
179 
180 #ifdef CONFIG_STRICT_KERNEL_RWX
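/*
 * Clear the given PTE bits (e.g. _PAGE_WRITE or _PAGE_EXEC) on every
 * kernel mapping in [start, end), handling huge PUD and PMD leaves as
 * well as base pages, then flush the kernel TLB for the range. The
 * mappings are expected to exist already; entries that cannot be
 * reached are simply skipped.
 */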
181 void radix__change_memory_range(unsigned long start, unsigned long end,
182 				unsigned long clear)
183 {
184 	unsigned long idx;
185 	pgd_t *pgdp;
186 	pud_t *pudp;
187 	pmd_t *pmdp;
188 	pte_t *ptep;
189 
190 	start = ALIGN_DOWN(start, PAGE_SIZE);
191 	end = PAGE_ALIGN(end); // aligns up
192 
193 	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
194 		 start, end, clear);
195 
196 	for (idx = start; idx < end; idx += PAGE_SIZE) {
197 		pgdp = pgd_offset_k(idx);
198 		pudp = pud_alloc(&init_mm, pgdp, idx);
199 		if (!pudp)
200 			continue;
201 		if (pud_huge(*pudp)) {
202 			ptep = (pte_t *)pudp;
203 			goto update_the_pte;
204 		}
205 		pmdp = pmd_alloc(&init_mm, pudp, idx);
206 		if (!pmdp)
207 			continue;
208 		if (pmd_huge(*pmdp)) {
209 			ptep = pmdp_ptep(pmdp);
210 			goto update_the_pte;
211 		}
212 		ptep = pte_alloc_kernel(pmdp, idx);
213 		if (!ptep)
214 			continue;
215 update_the_pte:
216 		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
217 	}
218 
219 	radix__flush_tlb_kernel_range(start, end);
220 }
221 
222 void radix__mark_rodata_ro(void)
223 {
224 	unsigned long start, end;
225 
226 	start = (unsigned long)_stext;
227 	end = (unsigned long)__init_begin;
228 
229 	radix__change_memory_range(start, end, _PAGE_WRITE);
230 }
231 
232 void radix__mark_initmem_nx(void)
233 {
234 	unsigned long start = (unsigned long)__init_begin;
235 	unsigned long end = (unsigned long)__init_end;
236 
237 	radix__change_memory_range(start, end, _PAGE_EXEC);
238 }
239 #endif /* CONFIG_STRICT_KERNEL_RWX */
240 
241 static inline void __meminit
242 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
243 {
244 	char buf[10];
245 
246 	if (end <= start)
247 		return;
248 
249 	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
250 
251 	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
252 		exec ? " (exec)" : "");
253 }
254 
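/*
 * Return the physical address at which the current mapping run must stop:
 * with STRICT_KERNEL_RWX this is __init_begin, so that the text/rodata
 * region can later be marked read-only on an exact mapping boundary;
 * otherwise it is simply the end of the range.
 */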
255 static unsigned long next_boundary(unsigned long addr, unsigned long end)
256 {
257 #ifdef CONFIG_STRICT_KERNEL_RWX
258 	if (addr < __pa_symbol(__init_begin))
259 		return __pa_symbol(__init_begin);
260 #endif
261 	return end;
262 }
263 
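/*
 * Populate the linear mapping for the physical range [start, end).
 * 1G or 2M pages are used where the address is suitably aligned, the
 * remaining gap is large enough and the page size is supported,
 * otherwise the base page size is used. Ranges that overlap kernel or
 * interrupt-vector text are mapped executable. Each contiguous run is
 * logged via print_mapping() and the per-size page counters updated.
 */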
264 static int __meminit create_physical_mapping(unsigned long start,
265 					     unsigned long end,
266 					     int nid)
267 {
268 	unsigned long vaddr, addr, mapping_size = 0;
269 	bool prev_exec, exec = false;
270 	pgprot_t prot;
271 	int psize;
272 
273 	start = _ALIGN_UP(start, PAGE_SIZE);
274 	for (addr = start; addr < end; addr += mapping_size) {
275 		unsigned long gap, previous_size;
276 		int rc;
277 
278 		gap = next_boundary(addr, end) - addr;
279 		previous_size = mapping_size;
280 		prev_exec = exec;
281 
282 		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
283 		    mmu_psize_defs[MMU_PAGE_1G].shift) {
284 			mapping_size = PUD_SIZE;
285 			psize = MMU_PAGE_1G;
286 		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
287 			   mmu_psize_defs[MMU_PAGE_2M].shift) {
288 			mapping_size = PMD_SIZE;
289 			psize = MMU_PAGE_2M;
290 		} else {
291 			mapping_size = PAGE_SIZE;
292 			psize = mmu_virtual_psize;
293 		}
294 
295 		vaddr = (unsigned long)__va(addr);
296 
297 		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
298 		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
299 			prot = PAGE_KERNEL_X;
300 			exec = true;
301 		} else {
302 			prot = PAGE_KERNEL;
303 			exec = false;
304 		}
305 
306 		if (mapping_size != previous_size || exec != prev_exec) {
307 			print_mapping(start, addr, previous_size, prev_exec);
308 			start = addr;
309 		}
310 
311 		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
312 		if (rc)
313 			return rc;
314 
315 		update_page_count(psize, 1);
316 	}
317 
318 	print_mapping(start, addr, mapping_size, exec);
319 	return 0;
320 }
321 
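/*
 * Boot-time setup of the host radix page tables: create the linear
 * mapping for every memblock region, work out the PID space, allocate
 * and register the process table, invalidate any stale TLB entries for
 * the host, and reserve a guard PID for init_mm.
 */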
322 void __init radix_init_pgtable(void)
323 {
324 	unsigned long rts_field;
325 	struct memblock_region *reg;
326 
327 	/* The SLB is not used with radix translation */
328 	mmu_slb_size = 0;
329 	/*
330 	 * Create the linear mapping, using the largest suitable page sizes
331 	 */
332 	for_each_memblock(memory, reg) {
333 		/*
334 		 * The memblock allocator is up at this point, so the
335 		 * page tables will be allocated within the range. No
336 		 * need for a node (which we don't have yet).
337 		 */
338 
339 		if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
340 			pr_warn("Outside the supported range\n");
341 			continue;
342 		}
343 
344 		WARN_ON(create_physical_mapping(reg->base,
345 						reg->base + reg->size,
346 						-1));
347 	}
348 
349 	/* Find out how many PID bits are supported */
350 	if (cpu_has_feature(CPU_FTR_HVMODE)) {
351 		if (!mmu_pid_bits)
352 			mmu_pid_bits = 20;
353 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
354 		/*
355 		 * When KVM is possible, we only use the top half of the
356 		 * PID space to avoid collisions between host and guest PIDs,
357 		 * which can cause problems due to prefetch when exiting the
358 		 * guest with AIL=3.
359 		 */
360 		mmu_base_pid = 1 << (mmu_pid_bits - 1);
361 #else
362 		mmu_base_pid = 1;
363 #endif
364 	} else {
365 		/* The guest uses the bottom half of the PID space */
366 		if (!mmu_pid_bits)
367 			mmu_pid_bits = 19;
368 		mmu_base_pid = 1;
369 	}
370 
371 	/*
372 	 * Allocate the process table for the host (the partition
373 	 * table is set up separately).
374 	 */
375 	BUG_ON(PRTB_SIZE_SHIFT > 36);
376 	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
377 	/*
378 	 * Fill in the process table.
379 	 */
380 	rts_field = radix__get_tree_size();
381 	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
382 	/*
383 	 * Fill in the partition table. We are supposed to use the effective
384 	 * address of the process table here, but our linear mapping also
385 	 * allows us to use the physical address.
386 	 */
387 	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
388 	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
389 	asm volatile("ptesync" : : : "memory");
390 	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
391 		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
392 	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
393 	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
394 
395 	/*
396 	 * The init_mm context is given the first available (non-zero) PID,
397 	 * which is the "guard PID" and contains no page table. PIDR should
398 	 * never be set to zero because that duplicates the kernel address
399 	 * space at the 0x0... offset (quadrant 0)!
400 	 *
401 	 * An arbitrary PID that may later be allocated by the PID allocator
402 	 * for userspace processes must not be used either, because that
403 	 * would cause stale user mappings for that PID on CPUs outside of
404 	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
405 	 *
406 	 * So permanently carve out one PID for the purpose of a guard PID.
407 	 */
408 	init_mm.context.id = mmu_base_pid;
409 	mmu_base_pid++;
410 }
411 
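/*
 * Bare-metal only: allocate the partition table and point entry 0 at the
 * kernel radix tree. PATB_HR selects host radix translation.
 */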
412 static void __init radix_init_partition_table(void)
413 {
414 	unsigned long rts_field, dw0;
415 
416 	mmu_partition_table_init();
417 	rts_field = radix__get_tree_size();
418 	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
419 	mmu_partition_table_set_entry(0, dw0, 0);
420 
421 	pr_info("Initializing Radix MMU\n");
422 	pr_info("Partition table %p\n", partition_tb);
423 }
424 
425 void __init radix_init_native(void)
426 {
427 	register_process_table = native_register_process_table;
428 }
429 
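/*
 * Translate a radix page size shift from the device tree (0xc, 0x10,
 * 0x15, 0x1e, i.e. 4K, 64K, 2M, 1G) into the matching MMU_PAGE_* index;
 * returns -1 for any other size.
 */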
430 static int __init get_idx_from_shift(unsigned int shift)
431 {
432 	int idx = -1;
433 
434 	switch (shift) {
435 	case 0xc:
436 		idx = MMU_PAGE_4K;
437 		break;
438 	case 0x10:
439 		idx = MMU_PAGE_64K;
440 		break;
441 	case 0x15:
442 		idx = MMU_PAGE_2M;
443 		break;
444 	case 0x1e:
445 		idx = MMU_PAGE_1G;
446 		break;
447 	}
448 	return idx;
449 }
450 
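/*
 * of_scan_flat_dt() callback: for a "cpu" node, pick up the optional
 * "ibm,mmu-pid-bits" property and decode each cell of
 * "ibm,processor-radix-AP-encodings" (page size shift in the low bits,
 * AP encoding in the top three bits) into mmu_psize_defs[]. Returns 1
 * once a cpu node has been handled so the scan stops.
 */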
451 static int __init radix_dt_scan_page_sizes(unsigned long node,
452 					   const char *uname, int depth,
453 					   void *data)
454 {
455 	int size = 0;
456 	int shift, idx;
457 	unsigned int ap;
458 	const __be32 *prop;
459 	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
460 
461 	/* We are scanning "cpu" nodes only */
462 	if (type == NULL || strcmp(type, "cpu") != 0)
463 		return 0;
464 
465 	/* Find MMU PID size */
466 	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
467 	if (prop && size == 4)
468 		mmu_pid_bits = be32_to_cpup(prop);
469 
470 	/* Grab page size encodings */
471 	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
472 	if (!prop)
473 		return 0;
474 
475 	pr_info("Page sizes from device-tree:\n");
476 	for (; size >= 4; size -= 4, ++prop) {
477 
478 		struct mmu_psize_def *def;
479 
480 		/* the top 3 bits are the AP encoding */
481 		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
482 		ap = be32_to_cpu(prop[0]) >> 29;
483 		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
484 
485 		idx = get_idx_from_shift(shift);
486 		if (idx < 0)
487 			continue;
488 
489 		def = &mmu_psize_defs[idx];
490 		def->shift = shift;
491 		def->ap  = ap;
492 	}
493 
494 	/* XXX: is clearing MMU_FTR_NO_SLBIE_B still needed? */
495 	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
496 	return 1;
497 }
498 
499 void __init radix__early_init_devtree(void)
500 {
501 	int rc;
502 
503 	/*
504 	 * Try to find the available page sizes in the device-tree
505 	 */
506 	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
507 	if (rc != 0)  /* Found */
508 		goto found;
509 	/*
510 	 * No page size encodings in the device tree; assume 4K and 64K support
511 	 */
512 	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
513 	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
514 
515 	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
516 	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
517 found:
518 #ifdef CONFIG_SPARSEMEM_VMEMMAP
519 	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
520 		/*
521 		 * map vmemmap using 2M if available
522 		 */
523 		mmu_vmemmap_psize = MMU_PAGE_2M;
524 	}
525 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
526 	return;
527 }
528 
529 static void radix_init_amor(void)
530 {
531 	/*
532 	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
533 	 * the hypervisor and guest can set up IAMR (Instruction Authority Mask
534 	 * Register), enable key 0 and set it to 1.
535 	 *
536 	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
537 	 */
538 	mtspr(SPRN_AMOR, (3ul << 62));
539 }
540 
541 #ifdef CONFIG_PPC_KUEP
542 void setup_kuep(bool disabled)
543 {
544 	if (disabled || !early_radix_enabled())
545 		return;
546 
547 	if (smp_processor_id() == boot_cpuid)
548 		pr_info("Activating Kernel Userspace Execution Prevention\n");
549 
550 	/*
551 	 * Radix always uses key0 of the IAMR to determine if an access is
552 	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
553 	 * fetch.
554 	 */
555 	mtspr(SPRN_IAMR, (1ul << 62));
556 }
557 #endif
558 
559 #ifdef CONFIG_PPC_KUAP
560 void setup_kuap(bool disabled)
561 {
562 	if (disabled || !early_radix_enabled())
563 		return;
564 
565 	if (smp_processor_id() == boot_cpuid) {
566 		pr_info("Activating Kernel Userspace Access Prevention\n");
567 		cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
568 	}
569 
570 	/* Make sure userspace can't change the AMR */
571 	mtspr(SPRN_UAMOR, 0);
572 	mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
573 	isync();
574 }
575 #endif
576 
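/*
 * MMU setup on the boot CPU: choose the base and vmemmap page sizes,
 * plug the radix geometry and layout constants into the generic 64-bit
 * page table code, set up the partition table on bare metal or take the
 * pseries path under an LPAR, build the kernel page tables and switch
 * to the guard PID.
 */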
577 void __init radix__early_init_mmu(void)
578 {
579 	unsigned long lpcr;
580 
581 #ifdef CONFIG_PPC_64K_PAGES
582 	/* PAGE_SIZE mappings */
583 	mmu_virtual_psize = MMU_PAGE_64K;
584 #else
585 	mmu_virtual_psize = MMU_PAGE_4K;
586 #endif
587 
588 #ifdef CONFIG_SPARSEMEM_VMEMMAP
589 	/* vmemmap mapping */
590 	mmu_vmemmap_psize = mmu_virtual_psize;
591 #endif
592 	/*
593 	 * initialize the page table geometry (index and table sizes)
594 	 */
595 	__pte_index_size = RADIX_PTE_INDEX_SIZE;
596 	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
597 	__pud_index_size = RADIX_PUD_INDEX_SIZE;
598 	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
599 	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
600 	__pte_table_size = RADIX_PTE_TABLE_SIZE;
601 	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
602 	__pud_table_size = RADIX_PUD_TABLE_SIZE;
603 	__pgd_table_size = RADIX_PGD_TABLE_SIZE;
604 
605 	__pmd_val_bits = RADIX_PMD_VAL_BITS;
606 	__pud_val_bits = RADIX_PUD_VAL_BITS;
607 	__pgd_val_bits = RADIX_PGD_VAL_BITS;
608 
609 	__kernel_virt_start = RADIX_KERN_VIRT_START;
610 	__vmalloc_start = RADIX_VMALLOC_START;
611 	__vmalloc_end = RADIX_VMALLOC_END;
612 	__kernel_io_start = RADIX_KERN_IO_START;
613 	__kernel_io_end = RADIX_KERN_IO_END;
614 	vmemmap = (struct page *)RADIX_VMEMMAP_START;
615 	ioremap_bot = IOREMAP_BASE;
616 
617 #ifdef CONFIG_PCI
618 	pci_io_base = ISA_IO_BASE;
619 #endif
620 	__pte_frag_nr = RADIX_PTE_FRAG_NR;
621 	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
622 	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
623 	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
624 
625 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
626 		radix_init_native();
627 		lpcr = mfspr(SPRN_LPCR);
628 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
629 		radix_init_partition_table();
630 		radix_init_amor();
631 	} else {
632 		radix_init_pseries();
633 	}
634 
635 	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
636 
637 	radix_init_pgtable();
638 	/* Switch to the guard PID before turning on MMU */
639 	radix__switch_mmu_context(NULL, &init_mm);
640 	if (cpu_has_feature(CPU_FTR_HVMODE))
641 		tlbiel_all();
642 }
643 
644 void radix__early_init_mmu_secondary(void)
645 {
646 	unsigned long lpcr;
647 	/*
648 	 * update the partition table control register (PTCR) and LPCR[UPRT/HR]
649 	 */
650 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
651 		lpcr = mfspr(SPRN_LPCR);
652 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
653 
654 		mtspr(SPRN_PTCR,
655 		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
656 		radix_init_amor();
657 	}
658 
659 	radix__switch_mmu_context(NULL, &init_mm);
660 	if (cpu_has_feature(CPU_FTR_HVMODE))
661 		tlbiel_all();
662 }
663 
664 void radix__mmu_cleanup_all(void)
665 {
666 	unsigned long lpcr;
667 
668 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
669 		lpcr = mfspr(SPRN_LPCR);
670 		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
671 		mtspr(SPRN_PTCR, 0);
672 		powernv_set_nmmu_ptcr(0);
673 		radix__flush_tlb_all();
674 	}
675 }
676 
677 void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
678 				phys_addr_t first_memblock_size)
679 {
680 	/*
681 	 * We don't currently support the first memblock not mapping
682 	 * physical address 0.
683 	 */
684 	BUG_ON(first_memblock_base != 0);
685 
686 	/*
687 	 * Radix mode is not limited by RMA / VRMA addressing.
688 	 */
689 	ppc64_rma_size = ULONG_MAX;
690 }
691 
692 #ifdef CONFIG_MEMORY_HOTPLUG
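/*
 * free_pte_table()/free_pmd_table(): used on memory hot-remove to free a
 * page table page once every entry in it is none, clearing the
 * higher-level entry that pointed to it.
 */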
693 static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
694 {
695 	pte_t *pte;
696 	int i;
697 
698 	for (i = 0; i < PTRS_PER_PTE; i++) {
699 		pte = pte_start + i;
700 		if (!pte_none(*pte))
701 			return;
702 	}
703 
704 	pte_free_kernel(&init_mm, pte_start);
705 	pmd_clear(pmd);
706 }
707 
708 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
709 {
710 	pmd_t *pmd;
711 	int i;
712 
713 	for (i = 0; i < PTRS_PER_PMD; i++) {
714 		pmd = pmd_start + i;
715 		if (!pmd_none(*pmd))
716 			return;
717 	}
718 
719 	pmd_free(&init_mm, pmd_start);
720 	pud_clear(pud);
721 }
722 
723 struct change_mapping_params {
724 	pte_t *pte;
725 	unsigned long start;
726 	unsigned long end;
727 	unsigned long aligned_start;
728 	unsigned long aligned_end;
729 };
730 
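/*
 * Runs under stop_machine(): clear the huge mapping that covered
 * [aligned_start, aligned_end) and recreate mappings for the parts that
 * must survive, [aligned_start, start) and [end, aligned_end).
 * init_mm.page_table_lock, taken by remove_pagetable(), is dropped
 * around the work, since create_physical_mapping() may allocate page
 * tables.
 */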
731 static int __meminit stop_machine_change_mapping(void *data)
732 {
733 	struct change_mapping_params *params =
734 			(struct change_mapping_params *)data;
735 
736 	if (!data)
737 		return -1;
738 
739 	spin_unlock(&init_mm.page_table_lock);
740 	pte_clear(&init_mm, params->aligned_start, params->pte);
741 	create_physical_mapping(params->aligned_start, params->start, -1);
742 	create_physical_mapping(params->end, params->aligned_end, -1);
743 	spin_lock(&init_mm.page_table_lock);
744 	return 0;
745 }
746 
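/*
 * Clear every present base page PTE in [addr, end). Callers only pass
 * page-aligned ranges, so an unaligned sub-range is warned about and
 * skipped rather than partially cleared.
 */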
747 static void remove_pte_table(pte_t *pte_start, unsigned long addr,
748 			     unsigned long end)
749 {
750 	unsigned long next;
751 	pte_t *pte;
752 
753 	pte = pte_start + pte_index(addr);
754 	for (; addr < end; addr = next, pte++) {
755 		next = (addr + PAGE_SIZE) & PAGE_MASK;
756 		if (next > end)
757 			next = end;
758 
759 		if (!pte_present(*pte))
760 			continue;
761 
762 		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
763 			/*
764 			 * The vmemmap_free() and remove_section_mapping()
765 			 * codepaths call us with aligned addresses.
766 			 */
767 			WARN_ONCE(1, "%s: unaligned range\n", __func__);
768 			continue;
769 		}
770 
771 		pte_clear(&init_mm, addr, pte);
772 	}
773 }
774 
775 /*
776  * Helper to clear the pte and potentially split the mapping
777  */
778 static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
779 				unsigned long size, pte_t *pte)
780 {
781 	unsigned long mask = ~(size - 1);
782 	unsigned long aligned_start = addr & mask;
783 	unsigned long aligned_end = addr + size;
784 	struct change_mapping_params params;
785 	bool split_region = false;
786 
787 	if ((end - addr) < size) {
788 		/*
789 		 * We're going to clear the PTE but have not flushed
790 		 * the mapping yet; we need to remap and flush. If
791 		 * the effect is visible outside this processor, or
792 		 * if we are running in code close to the mapping we
793 		 * cleared, we are in trouble.
794 		 */
795 		if (overlaps_kernel_text(aligned_start, addr) ||
796 			overlaps_kernel_text(end, aligned_end)) {
797 			/*
798 			 * Hack, just return, don't pte_clear
799 			 */
800 			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
801 				  "text, not splitting\n", addr, end);
802 			return;
803 		}
804 		split_region = true;
805 	}
806 
807 	if (split_region) {
808 		params.pte = pte;
809 		params.start = addr;
810 		params.end = end;
811 		params.aligned_start = addr & ~(size - 1);
812 		params.aligned_end = min_t(unsigned long, aligned_end,
813 				(unsigned long)__va(memblock_end_of_DRAM()));
814 		stop_machine(stop_machine_change_mapping, &params, NULL);
815 		return;
816 	}
817 
818 	pte_clear(&init_mm, addr, pte);
819 }
820 
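/*
 * remove_pmd_table()/remove_pud_table(): walk one level of the kernel
 * page table over [addr, end). Huge leaf entries are cleared (split via
 * stop_machine() when the range ends inside them); otherwise recurse
 * into the next level and free tables that become empty.
 */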
821 static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
822 			     unsigned long end)
823 {
824 	unsigned long next;
825 	pte_t *pte_base;
826 	pmd_t *pmd;
827 
828 	pmd = pmd_start + pmd_index(addr);
829 	for (; addr < end; addr = next, pmd++) {
830 		next = pmd_addr_end(addr, end);
831 
832 		if (!pmd_present(*pmd))
833 			continue;
834 
835 		if (pmd_huge(*pmd)) {
836 			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
837 			continue;
838 		}
839 
840 		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
841 		remove_pte_table(pte_base, addr, next);
842 		free_pte_table(pte_base, pmd);
843 	}
844 }
845 
846 static void remove_pud_table(pud_t *pud_start, unsigned long addr,
847 			     unsigned long end)
848 {
849 	unsigned long next;
850 	pmd_t *pmd_base;
851 	pud_t *pud;
852 
853 	pud = pud_start + pud_index(addr);
854 	for (; addr < end; addr = next, pud++) {
855 		next = pud_addr_end(addr, end);
856 
857 		if (!pud_present(*pud))
858 			continue;
859 
860 		if (pud_huge(*pud)) {
861 			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
862 			continue;
863 		}
864 
865 		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
866 		remove_pmd_table(pmd_base, addr, next);
867 		free_pmd_table(pmd_base, pud);
868 	}
869 }
870 
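/*
 * Tear down the kernel mapping for [start, end): walk from the PGD down,
 * splitting huge mappings that extend past the range, freeing page table
 * pages that become empty, and flush the kernel TLB for the range at the
 * end. Called on memory hot-remove and vmemmap teardown.
 */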
871 static void __meminit remove_pagetable(unsigned long start, unsigned long end)
872 {
873 	unsigned long addr, next;
874 	pud_t *pud_base;
875 	pgd_t *pgd;
876 
877 	spin_lock(&init_mm.page_table_lock);
878 
879 	for (addr = start; addr < end; addr = next) {
880 		next = pgd_addr_end(addr, end);
881 
882 		pgd = pgd_offset_k(addr);
883 		if (!pgd_present(*pgd))
884 			continue;
885 
886 		if (pgd_huge(*pgd)) {
887 			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
888 			continue;
889 		}
890 
891 		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
892 		remove_pud_table(pud_base, addr, next);
893 	}
894 
895 	spin_unlock(&init_mm.page_table_lock);
896 	radix__flush_tlb_kernel_range(start, end);
897 }
898 
899 int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
900 {
901 	if (end >= RADIX_VMALLOC_START) {
902 		pr_warn("Outside the supported range\n");
903 		return -1;
904 	}
905 
906 	return create_physical_mapping(start, end, nid);
907 }
908 
909 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
910 {
911 	remove_pagetable(start, end);
912 	return 0;
913 }
914 #endif /* CONFIG_MEMORY_HOTPLUG */
915 
916 #ifdef CONFIG_SPARSEMEM_VMEMMAP
917 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
918 				 pgprot_t flags, unsigned int map_page_size,
919 				 int nid)
920 {
921 	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
922 }
923 
924 int __meminit radix__vmemmap_create_mapping(unsigned long start,
925 				      unsigned long page_size,
926 				      unsigned long phys)
927 {
928 	/* Create a PTE encoding */
929 	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
930 	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
931 	int ret;
932 
933 	if ((start + page_size) >= RADIX_VMEMMAP_END) {
934 		pr_warn("Outside the supported range\n");
935 		return -1;
936 	}
937 
938 	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
939 	BUG_ON(ret);
940 
941 	return 0;
942 }
943 
944 #ifdef CONFIG_MEMORY_HOTPLUG
945 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
946 {
947 	remove_pagetable(start, start + page_size);
948 }
949 #endif
950 #endif
951 
952 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
953 
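/*
 * Atomically clear and set bits in a huge page PMD, which shares the PTE
 * format on radix, and emit the hugepage_update trace event. The caller
 * must hold the PMD lock and the PMD must map a THP or devmap page.
 */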
954 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
955 				  pmd_t *pmdp, unsigned long clr,
956 				  unsigned long set)
957 {
958 	unsigned long old;
959 
960 #ifdef CONFIG_DEBUG_VM
961 	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
962 	assert_spin_locked(pmd_lockptr(mm, pmdp));
963 #endif
964 
965 	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
966 	trace_hugepage_update(addr, old, clr, set);
967 
968 	return old;
969 }
970 
971 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
972 			pmd_t *pmdp)
973 
974 {
975 	pmd_t pmd;
976 
977 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
978 	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
979 	VM_BUG_ON(pmd_devmap(*pmdp));
980 	/*
981 	 * khugepaged calls this for normal pmd
982 	 */
983 	pmd = *pmdp;
984 	pmd_clear(pmdp);
985 
986 	/* FIXME!! Verify whether we need this kick below */
987 	serialize_against_pte_lookup(vma->vm_mm);
988 
989 	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
990 
991 	return pmd;
992 }
993 
994 /*
995  * For us pgtable_t is pte_t *. In order to save the deposited
996  * page table, we consider the allocated page table as a list
997  * head. On withdraw we need to make sure we zero out the used
998  * list_head memory area.
999  */
1000 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1001 				 pgtable_t pgtable)
1002 {
1003 	struct list_head *lh = (struct list_head *) pgtable;
1004 
1005 	assert_spin_locked(pmd_lockptr(mm, pmdp));
1006 
1007 	/* FIFO */
1008 	if (!pmd_huge_pte(mm, pmdp))
1009 		INIT_LIST_HEAD(lh);
1010 	else
1011 		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1012 	pmd_huge_pte(mm, pmdp) = pgtable;
1013 }
1014 
1015 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1016 {
1017 	pte_t *ptep;
1018 	pgtable_t pgtable;
1019 	struct list_head *lh;
1020 
1021 	assert_spin_locked(pmd_lockptr(mm, pmdp));
1022 
1023 	/* FIFO */
1024 	pgtable = pmd_huge_pte(mm, pmdp);
1025 	lh = (struct list_head *) pgtable;
1026 	if (list_empty(lh))
1027 		pmd_huge_pte(mm, pmdp) = NULL;
1028 	else {
1029 		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1030 		list_del(lh);
1031 	}
1032 	ptep = (pte_t *) pgtable;
1033 	*ptep = __pte(0);
1034 	ptep++;
1035 	*ptep = __pte(0);
1036 	return pgtable;
1037 }
1038 
1039 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
1040 				     unsigned long addr, pmd_t *pmdp)
1041 {
1042 	pmd_t old_pmd;
1043 	unsigned long old;
1044 
1045 	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1046 	old_pmd = __pmd(old);
1047 	/*
1048 	 * Serialize against find_current_mm_pte which does lock-less
1049 	 * lookup in page tables with local interrupts disabled. For huge pages
1050 	 * it casts pmd_t to pte_t. Since format of pte_t is different from
1051 	 * pmd_t we want to prevent transit from pmd pointing to page table
1052 	 * to pmd pointing to huge page (and back) while interrupts are disabled.
1053 	 * We clear pmd to possibly replace it with page table pointer in
1054 	 * different code paths. So make sure we wait for the parallel
1055 	 * find_current_mm_pte to finish.
1056 	 */
1057 	serialize_against_pte_lookup(mm);
1058 	return old_pmd;
1059 }
1060 
1061 int radix__has_transparent_hugepage(void)
1062 {
1063 	/* For radix, 2M at the PMD level means THP is supported */
1064 	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
1065 		return 1;
1066 	return 0;
1067 }
1068 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1069 
1070 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1071 				  pte_t entry, unsigned long address, int psize)
1072 {
1073 	struct mm_struct *mm = vma->vm_mm;
1074 	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
1075 					      _PAGE_RW | _PAGE_EXEC);
1076 
1077 	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1078 	/*
1079 	 * To avoid an NMMU hang while relaxing access, we need to mark
1080 	 * the pte invalid in between.
1081 	 */
1082 	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
1083 		unsigned long old_pte, new_pte;
1084 
1085 		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1086 		/*
1087 		 * new value of pte
1088 		 */
1089 		new_pte = old_pte | set;
1090 		radix__flush_tlb_page_psize(mm, address, psize);
1091 		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1092 	} else {
1093 		__radix_pte_update(ptep, 0, set);
1094 		/*
1095 		 * Book3S does not require a TLB flush when relaxing access
1096 		 * restrictions when the address space is not attached to an
1097 		 * NMMU, because the core MMU will reload the pte after taking
1098 		 * an access fault, which is defined by the architecture.
1099 		 */
1100 	}
1101 	/* See ptesync comment in radix__set_pte_at */
1102 }
1103 
1104 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1105 				    unsigned long addr, pte_t *ptep,
1106 				    pte_t old_pte, pte_t pte)
1107 {
1108 	struct mm_struct *mm = vma->vm_mm;
1109 
1110 	/*
1111 	 * To avoid an NMMU hang while relaxing access we need to flush the TLB
1112 	 * before we set the new value. We need to do this only for radix, because
1113 	 * hash translation does the flush when updating the Linux pte.
1114 	 */
1115 	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1116 	    (atomic_read(&mm->context.copros) > 0))
1117 		radix__flush_tlb_page(vma, addr);
1118 
1119 	set_pte_at(mm, addr, ptep, pte);
1120 }
1121