// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/plpar_wrappers.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k respectively.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

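/*
 * Copy data to or from a (nested) guest's address space while running in
 * hypervisor mode.  On POWER9 radix this is done by temporarily switching
 * SPRN_LPID (and, for quadrant 1, SPRN_PID) and accessing the data through
 * quadrant 1 (EA bits 0:1 = 0b01, translated with LPIDR and PIDR) or
 * quadrant 2 (0b10, translated with LPIDR and pid 0) of the hypervisor's
 * own address space.  Exactly one of @to and @from points into the guest.
 * Returns 0 if everything was copied, otherwise the number of bytes not
 * copied (or, when running as a nested hypervisor, the return code of the
 * H_COPY_TOFROM_GUEST hcall used instead).
 */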
unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int old_pid, old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  (to != NULL) ? __pa(to) : 0,
					  (from != NULL) ? __pa(from) : 0, n);

	if (eaddr & (0xFFFUL << 52))
		return ret;

	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
	isync();

	pagefault_disable();
	if (is_load)
		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
	else
		ret = __copy_to_user_inatomic((void __user *)to, from, n);
	pagefault_enable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}

static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid = vcpu->arch.pid;

	/* This would cause a data segment interrupt so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested LPID? */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;

	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

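/*
 * Read up to @n bytes from guest effective address @eaddr into @to.
 * On a partial copy the uncopied tail of @to is zeroed.  Returns 0 on
 * success, a positive count of bytes not copied, or a negative error code.
 */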
long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
				 unsigned long n)
{
	long ret;

	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
	if (ret > 0)
		memset(to + (n - ret), 0, ret);

	return ret;
}

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
			       unsigned long n)
{
	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}

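/*
 * Walk the radix page table rooted at @root (a guest real address taken
 * from a partition- or process-table entry) to translate @eaddr, filling
 * in @gpte with the real address, page size and permission bits of the
 * leaf PTE.  If reading a page-table entry from guest memory fails,
 * *@pte_ret_p (if non-NULL) is set to the guest real address of that
 * entry; on success it is set to the leaf PTE value.
 */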
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

	offset = rts + 31;

	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check for a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));
		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = vcpu->arch.pid;
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (vcpu->arch.amr & (1ul << 62))
				gpte->may_read = 0;
			if (vcpu->arch.amr & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

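/*
 * Invalidate the partition-scoped TLB entries covering one page of the
 * mapping for @lpid.  When running bare-metal this is a direct tlbie;
 * when running as a nested hypervisor on pseries it is done through the
 * H_TLB_INVALIDATE hcall, or H_RPT_INVALIDATE where the firmware
 * supports it.
 */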
void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			     unsigned int pshift, unsigned int lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

	psi = shift_to_mmu_psize(pshift);

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
					lpid, rb);
	} else {
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_TLB,
					    psize_to_rpti_pgsize(psi),
					    addr, addr + psize);
	}

	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
					lpid, TLBIEL_INVAL_SET_LPID);
	else
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
					    0, -1UL);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
				      unsigned long clr, unsigned long set,
				      unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
			     pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	pte_t *pte;

	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
	/* pmd_populate() will only reference _pa(pte). */
	kmemleak_ignore(pte);

	return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	pmd_t *pmd;

	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
	/* pud_populate() will only reference _pa(pmd). */
	kmemleak_ignore(pmd);

	return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift,
		      const struct kvm_memory_slot *memslot,
		      unsigned int lpid)
{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift) { /* 1GB or 2MB page */
		page_size = 1ul << shift;
		if (shift == PMD_SHIFT)
			kvm->stat.num_2M_pages--;
		else if (shift == PUD_SHIFT)
			kvm->stat.num_1G_pages--;
	}

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * The kvmppc_free_p?d functions free existing page tables by recursively
 * descending, clearing and freeing the child tables.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of the page fault path
 * (full == false), valid ptes are generally not expected; however, there
 * is one situation where they arise, which is when dirty page logging is
 * turned off for a memslot while the VM is running.  The new memslot
 * becomes visible to page faults before the memslot commit function
 * gets to flush the memslot, which can lead to a 2MB page mapping being
 * installed for a guest physical address where there are already 64kB
 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  unsigned int lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  unsigned int lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_is_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
					 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_map(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  unsigned int lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_is_leaf(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		p4d_t *p4d = p4d_offset(pgd, 0);
		pud_t *pud;

		if (!p4d_present(*p4d))
			continue;
		pud = pud_offset(p4d, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		p4d_clear(p4d);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					unsigned long gpa, unsigned int lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page will no longer be cached by the MMU, so it can be freed
	 * without flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					unsigned long gpa, unsigned int lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any children pte pages will no longer be cached by the
	 * MMU, so they can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * A number of bits may differ between different faults to the same
 * partition-scoped entry: the RC bits, in the course of cleaning and
 * aging, and the write bit, which can change either because the access
 * was upgraded or because a read fault happened concurrently with a
 * write fault that set those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

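/*
 * Insert @pte into the partition-scoped page table @pgtable for @lpid at
 * guest real address @gpa and @level (0 = small page, 1 = 2MB, 2 = 1GB),
 * allocating intermediate page-table levels as needed.  Returns -EAGAIN
 * if an MMU notifier invalidation has happened since @mmu_seq was read,
 * or if another CPU raced in with a larger mapping, so that the caller
 * can let the guest retry.  If @rmapp and @n_rmap are supplied, the
 * nested rmap entry is inserted under the same lock as the PTE.
 */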
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, unsigned int lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	p4d = p4d_offset(pgd, gpa);

	pud = NULL;
	if (p4d_present(*p4d))
		pud = pud_offset(p4d, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (p4d_none(*p4d)) {
		if (!new_pud)
			goto out_unlock;
		p4d_populate(kvm->mm, p4d, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(p4d, gpa);
	if (pud_is_leaf(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					      0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_is_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}

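/*
 * Set the accessed bit (and the dirty bit, for a write) in the
 * partition-scoped PTE covering @gpa, either in this guest's table or,
 * if @nested, in the shadow table of the nested guest @lpid.  Returns
 * true if the bits were set, false if no suitable PTE was found and the
 * fault needs to be handled the slow way.  Called with kvm->mmu_lock
 * held.
 */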
bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
			     unsigned long gpa, unsigned int lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;

	if (nested)
		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
	else
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);

	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

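/*
 * Fault in the host page backing @gpa in @memslot and insert a
 * partition-scoped PTE for it, using the largest page size that the
 * backing host PTE and the gpa/hva alignment allow (dirty-page logging
 * forces small pages).  If @inserted_pte and @levelp are non-NULL they
 * return the PTE value and level that were installed.  Returns 0 on
 * success, RESUME_GUEST if the guest should simply retry the access,
 * or a negative error code.
 */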
int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	bool *upgrade_p = &upgrade_write;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/*
	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
	 * do it with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
	hva = gfn_to_hva_memslot(memslot, gfn);
	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
		upgrade_write = true;
	} else {
		unsigned long pfn;

		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, upgrade_p, NULL);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	pte = __pte(0);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!pte_present(pte)) {
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

	/* Get pte level from shift/size */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}

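/*
 * Handle a hypervisor data storage interrupt taken while running a radix
 * guest: reflect bad accesses back to the guest as a DSI, hand emulated
 * MMIO accesses to the MMIO emulation code, set the R/C bits if that is
 * all that is needed, and otherwise fault in and map the page.
 */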
int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	bool kvm_ro = false;

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return kvmppc_send_page_to_uv(kvm, gfn);

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
						       DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
		kvm_ro = true;
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
					    gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     kvm_ro, NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}

/* Called with kvm->mmu_lock held */
void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		     unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
		return;
	}

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
}

/* Called with kvm->mmu_lock held */
bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		   unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	bool ref = false;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					      gpa, shift);
		/* XXX need to flush tlb here? */
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		ref = true;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	bool ref = false;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = true;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep, pte;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ret;

	/*
	 * For performance reasons we don't hold kvm->mmu_lock while walking the
	 * partition scoped table.
	 */
	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
	if (!ptep)
		return 0;

	pte = READ_ONCE(*ptep);
	if (pte_present(pte) && pte_dirty(pte)) {
		spin_lock(&kvm->mmu_lock);
		/*
		 * Recheck the pte now that we hold the mmu_lock
		 */
		if (pte_val(pte) != pte_val(*ptep)) {
			/*
			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
			 * only find PAGE_SIZE pte entries here. We can continue
			 * to use the pte addr returned by above page table
			 * walk.
			 */
			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
				spin_unlock(&kvm->mmu_lock);
				return 0;
			}
		}

		ret = 1;
		VM_BUG_ON(shift);
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					      gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		spin_unlock(&kvm->mmu_lock);
	}
	return ret;
}

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i, j;
	int npages;

	for (i = 0; i < memslot->npages; i = j) {
		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since huge pages are only used to back the guest at guest
		 * real addresses that are a multiple of their size.
		 * Since we have at most one PTE covering any given guest
		 * real address, if npages > 1 we can skip to i + npages.
		 */
		j = i + 1;
		if (npages) {
			set_dirty_bits(map, i, npages);
			j = i + npages;
		}
	}
	return 0;
}

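/*
 * Unmap every page of @memslot from the partition-scoped page table and
 * bump the MMU notifier sequence number so that a concurrent page fault
 * which looked up the old memslot cannot install a stale PTE afterwards.
 */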
void kvmppc_radix_flush_memslot(struct kvm *kvm,
				const struct kvm_memory_slot *memslot)
{
	unsigned long n;
	pte_t *ptep;
	unsigned long gpa;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
		kvmppc_uvmem_drop_pages(memslot, kvm, true);

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return;

	gpa = memslot->base_gfn << PAGE_SHIFT;
	spin_lock(&kvm->mmu_lock);
	for (n = memslot->npages; n; --n) {
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
		if (ptep && pte_present(*ptep))
			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
					 kvm->arch.lpid);
		gpa += PAGE_SIZE;
	}
	/*
	 * Increase the mmu notifier sequence number to prevent any page
	 * fault that read the memslot earlier from writing a PTE.
	 */
	kvm->mmu_notifier_seq++;
	spin_unlock(&kvm->mmu_lock);
}

static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
				 int psize, int *indexp)
{
	if (!mmu_psize_defs[psize].shift)
		return;
	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
		(mmu_psize_defs[psize].ap << 29);
	++(*indexp);
}

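/*
 * Fill in the radix page-table geometries and AP encodings that are
 * reported to userspace via the KVM_PPC_GET_RMMU_INFO ioctl.
 */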
int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
	int i;

	if (!radix_enabled())
		return -EINVAL;
	memset(info, 0, sizeof(*info));

	/* 4k page size */
	info->geometries[0].page_shift = 12;
	info->geometries[0].level_bits[0] = 9;
	for (i = 1; i < 4; ++i)
		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
	/* 64k page size */
	info->geometries[1].page_shift = 16;
	for (i = 0; i < 4; ++i)
		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

	i = 0;
	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

	return 0;
}

int kvmppc_init_vm_radix(struct kvm *kvm)
{
	kvm->arch.pgtable = pgd_alloc(kvm->mm);
	if (!kvm->arch.pgtable)
		return -ENOMEM;
	return 0;
}

static void pte_ctor(void *addr)
{
	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

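/*
 * The debugfs "radix" file dumps the partition-scoped page tables of this
 * guest and of any nested guests, one leaf PTE per line in the form
 * " <gpa>: <pte> <shift>", preceded by a "pgdir:" header for each table.
 */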
struct debugfs_radix_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	gpa;
	int		lpid;
	int		chars_left;
	int		buf_index;
	char		buf[128];
	u8		hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_radix_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
	struct debugfs_radix_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct debugfs_radix_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long n;
	struct kvm *kvm;
	unsigned long gpa;
	pgd_t *pgt;
	struct kvm_nested_guest *nested;
	pgd_t *pgdp;
	p4d_t p4d, *p4dp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ptep;
	int shift;
	unsigned long pte;

	kvm = p->kvm;
	if (!kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	gpa = p->gpa;
	nested = NULL;
	pgt = NULL;
	while (len != 0 && p->lpid >= 0) {
		if (gpa >= RADIX_PGTABLE_RANGE) {
			gpa = 0;
			pgt = NULL;
			if (nested) {
				kvmhv_put_nested(nested);
				nested = NULL;
			}
			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			p->hdr = 0;
			if (p->lpid < 0)
				break;
		}
		if (!pgt) {
			if (p->lpid == 0) {
				pgt = kvm->arch.pgtable;
			} else {
				nested = kvmhv_get_nested(kvm, p->lpid, false);
				if (!nested) {
					gpa = RADIX_PGTABLE_RANGE;
					continue;
				}
				pgt = nested->shadow_pgtable;
			}
		}
		n = 0;
		if (!p->hdr) {
			if (p->lpid > 0)
				n = scnprintf(p->buf, sizeof(p->buf),
					      "\nNested LPID %d: ", p->lpid);
			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
				      "pgdir: %lx\n", (unsigned long)pgt);
			p->hdr = 1;
			goto copy;
		}

		pgdp = pgt + pgd_index(gpa);
		p4dp = p4d_offset(pgdp, gpa);
		p4d = READ_ONCE(*p4dp);
		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
			gpa = (gpa & P4D_MASK) + P4D_SIZE;
			continue;
		}

		pudp = pud_offset(&p4d, gpa);
		pud = READ_ONCE(*pudp);
		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			continue;
		}
		if (pud_val(pud) & _PAGE_PTE) {
			pte = pud_val(pud);
			shift = PUD_SHIFT;
			goto leaf;
		}

		pmdp = pmd_offset(&pud, gpa);
		pmd = READ_ONCE(*pmdp);
		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			continue;
		}
		if (pmd_val(pmd) & _PAGE_PTE) {
			pte = pmd_val(pmd);
			shift = PMD_SHIFT;
			goto leaf;
		}

		ptep = pte_offset_kernel(&pmd, gpa);
		pte = pte_val(READ_ONCE(*ptep));
		if (!(pte & _PAGE_PRESENT)) {
			gpa += PAGE_SIZE;
			continue;
		}
		shift = PAGE_SHIFT;
	leaf:
		n = scnprintf(p->buf, sizeof(p->buf),
			      " %lx: %lx %d\n", gpa, pte, shift);
		gpa += 1ul << shift;
	copy:
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			break;
		}
	}
	p->gpa = gpa;
	if (nested)
		kvmhv_put_nested(nested);

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
			   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_radix_open,
	.release = debugfs_radix_release,
	.read	 = debugfs_radix_read,
	.write	 = debugfs_radix_write,
	.llseek	 = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
	debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
			    &debugfs_radix_fops);
}

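/*
 * Create the kmem caches used for the PTE- and PMD-level pages of the
 * partition-scoped radix trees.  The constructors zero the pages so that
 * freshly allocated tables can be linked into a tree immediately.
 */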
int kvmppc_radix_init(void)
{
	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
	if (!kvm_pte_cache)
		return -ENOMEM;

	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
	if (!kvm_pmd_cache) {
		kmem_cache_destroy(kvm_pte_cache);
		return -ENOMEM;
	}

	return 0;
}

void kvmppc_radix_exit(void)
{
	kmem_cache_destroy(kvm_pte_cache);
	kmem_cache_destroy(kvm_pmd_cache);
}