xref: /openbmc/linux/arch/powerpc/kvm/book3s_64_mmu_radix.c (revision 05cf4fe738242183f1237f1b3a28b4479348c0a1)
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License, version 2, as
4  * published by the Free Software Foundation.
5  *
6  * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
7  */
8 
9 #include <linux/types.h>
10 #include <linux/string.h>
11 #include <linux/kvm.h>
12 #include <linux/kvm_host.h>
13 #include <linux/anon_inodes.h>
14 #include <linux/file.h>
15 #include <linux/debugfs.h>
16 
17 #include <asm/kvm_ppc.h>
18 #include <asm/kvm_book3s.h>
19 #include <asm/page.h>
20 #include <asm/mmu.h>
21 #include <asm/pgtable.h>
22 #include <asm/pgalloc.h>
23 #include <asm/pte-walk.h>
24 
/*
 * Radix tree geometry accepted when walking guest page tables.
 * The table is indexed by tree level, level 0 being the lowest one.
 * Like p9, the lowest level may use either 5 or 9 index bits, for a
 * page size of 64k or 4k; the table itself records 5 for level 0 and
 * users that also accept 9 handle that case explicitly.
 */
static int p9_supported_radix_bits[4] = {
	[0] = 5,
	[1] = 9,
	[2] = 9,
	[3] = 13,
};
31 
32 int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
33 			       struct kvmppc_pte *gpte, u64 root,
34 			       u64 *pte_ret_p)
35 {
36 	struct kvm *kvm = vcpu->kvm;
37 	int ret, level, ps;
38 	unsigned long rts, bits, offset, index;
39 	u64 pte, base, gpa;
40 	__be64 rpte;
41 
42 	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
43 		((root & RTS2_MASK) >> RTS2_SHIFT);
44 	bits = root & RPDS_MASK;
45 	base = root & RPDB_MASK;
46 
47 	offset = rts + 31;
48 
49 	/* Current implementations only support 52-bit space */
50 	if (offset != 52)
51 		return -EINVAL;
52 
53 	/* Walk each level of the radix tree */
54 	for (level = 3; level >= 0; --level) {
55 		u64 addr;
56 		/* Check a valid size */
57 		if (level && bits != p9_supported_radix_bits[level])
58 			return -EINVAL;
59 		if (level == 0 && !(bits == 5 || bits == 9))
60 			return -EINVAL;
61 		offset -= bits;
62 		index = (eaddr >> offset) & ((1UL << bits) - 1);
63 		/* Check that low bits of page table base are zero */
64 		if (base & ((1UL << (bits + 3)) - 1))
65 			return -EINVAL;
66 		/* Read the entry from guest memory */
67 		addr = base + (index * sizeof(rpte));
68 		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
69 		if (ret) {
70 			if (pte_ret_p)
71 				*pte_ret_p = addr;
72 			return ret;
73 		}
74 		pte = __be64_to_cpu(rpte);
75 		if (!(pte & _PAGE_PRESENT))
76 			return -ENOENT;
77 		/* Check if a leaf entry */
78 		if (pte & _PAGE_PTE)
79 			break;
80 		/* Get ready to walk the next level */
81 		base = pte & RPDB_MASK;
82 		bits = pte & RPDS_MASK;
83 	}
84 
85 	/* Need a leaf at lowest level; 512GB pages not supported */
86 	if (level < 0 || level == 3)
87 		return -EINVAL;
88 
89 	/* We found a valid leaf PTE */
90 	/* Offset is now log base 2 of the page size */
91 	gpa = pte & 0x01fffffffffff000ul;
92 	if (gpa & ((1ul << offset) - 1))
93 		return -EINVAL;
94 	gpa |= eaddr & ((1ul << offset) - 1);
95 	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
96 		if (offset == mmu_psize_defs[ps].shift)
97 			break;
98 	gpte->page_size = ps;
99 	gpte->page_shift = offset;
100 
101 	gpte->eaddr = eaddr;
102 	gpte->raddr = gpa;
103 
104 	/* Work out permissions */
105 	gpte->may_read = !!(pte & _PAGE_READ);
106 	gpte->may_write = !!(pte & _PAGE_WRITE);
107 	gpte->may_execute = !!(pte & _PAGE_EXEC);
108 
109 	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
110 
111 	if (pte_ret_p)
112 		*pte_ret_p = pte;
113 
114 	return 0;
115 }
116 
117 /*
118  * Used to walk a partition or process table radix tree in guest memory
119  * Note: We exploit the fact that a partition table and a process
120  * table have the same layout, a partition-scoped page table and a
121  * process-scoped page table have the same layout, and the 2nd
122  * doubleword of a partition table entry has the same layout as
123  * the PTCR register.
124  */
125 int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
126 				     struct kvmppc_pte *gpte, u64 table,
127 				     int table_index, u64 *pte_ret_p)
128 {
129 	struct kvm *kvm = vcpu->kvm;
130 	int ret;
131 	unsigned long size, ptbl, root;
132 	struct prtb_entry entry;
133 
134 	if ((table & PRTS_MASK) > 24)
135 		return -EINVAL;
136 	size = 1ul << ((table & PRTS_MASK) + 12);
137 
138 	/* Is the table big enough to contain this entry? */
139 	if ((table_index * sizeof(entry)) >= size)
140 		return -EINVAL;
141 
142 	/* Read the table to find the root of the radix tree */
143 	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
144 	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
145 	if (ret)
146 		return ret;
147 
148 	/* Root is stored in the first double word */
149 	root = be64_to_cpu(entry.prtb0);
150 
151 	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
152 }
153 
154 int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
155 			   struct kvmppc_pte *gpte, bool data, bool iswrite)
156 {
157 	u32 pid;
158 	u64 pte;
159 	int ret;
160 
161 	/* Work out effective PID */
162 	switch (eaddr >> 62) {
163 	case 0:
164 		pid = vcpu->arch.pid;
165 		break;
166 	case 3:
167 		pid = 0;
168 		break;
169 	default:
170 		return -EINVAL;
171 	}
172 
173 	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
174 				vcpu->kvm->arch.process_table, pid, &pte);
175 	if (ret)
176 		return ret;
177 
178 	/* Check privilege (applies only to process scoped translations) */
179 	if (kvmppc_get_msr(vcpu) & MSR_PR) {
180 		if (pte & _PAGE_PRIVILEGED) {
181 			gpte->may_read = 0;
182 			gpte->may_write = 0;
183 			gpte->may_execute = 0;
184 		}
185 	} else {
186 		if (!(pte & _PAGE_PRIVILEGED)) {
187 			/* Check AMR/IAMR to see if strict mode is in force */
188 			if (vcpu->arch.amr & (1ul << 62))
189 				gpte->may_read = 0;
190 			if (vcpu->arch.amr & (1ul << 63))
191 				gpte->may_write = 0;
192 			if (vcpu->arch.iamr & (1ul << 62))
193 				gpte->may_execute = 0;
194 		}
195 	}
196 
197 	return 0;
198 }
199 
200 static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
201 				    unsigned int pshift, unsigned int lpid)
202 {
203 	unsigned long psize = PAGE_SIZE;
204 	int psi;
205 	long rc;
206 	unsigned long rb;
207 
208 	if (pshift)
209 		psize = 1UL << pshift;
210 	else
211 		pshift = PAGE_SHIFT;
212 
213 	addr &= ~(psize - 1);
214 
215 	if (!kvmhv_on_pseries()) {
216 		radix__flush_tlb_lpid_page(lpid, addr, psize);
217 		return;
218 	}
219 
220 	psi = shift_to_mmu_psize(pshift);
221 	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
222 	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
223 				lpid, rb);
224 	if (rc)
225 		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
226 }
227 
228 static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
229 {
230 	long rc;
231 
232 	if (!kvmhv_on_pseries()) {
233 		radix__flush_pwc_lpid(lpid);
234 		return;
235 	}
236 
237 	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
238 				lpid, TLBIEL_INVAL_SET_LPID);
239 	if (rc)
240 		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
241 }
242 
243 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
244 				      unsigned long clr, unsigned long set,
245 				      unsigned long addr, unsigned int shift)
246 {
247 	return __radix_pte_update(ptep, clr, set);
248 }
249 
250 void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
251 			     pte_t *ptep, pte_t pte)
252 {
253 	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
254 }
255 
/*
 * Slab caches backing the PTE-level and PMD-level tables of the trees
 * built in this file; created in kvmppc_radix_init() and destroyed in
 * kvmppc_radix_exit().
 */
static struct kmem_cache *kvm_pte_cache, *kvm_pmd_cache;
258 
259 static pte_t *kvmppc_pte_alloc(void)
260 {
261 	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
262 }
263 
264 static void kvmppc_pte_free(pte_t *ptep)
265 {
266 	kmem_cache_free(kvm_pte_cache, ptep);
267 }
268 
269 /* Like pmd_huge() and pmd_large(), but works regardless of config options */
270 static inline int pmd_is_leaf(pmd_t pmd)
271 {
272 	return !!(pmd_val(pmd) & _PAGE_PTE);
273 }
274 
275 static pmd_t *kvmppc_pmd_alloc(void)
276 {
277 	return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
278 }
279 
280 static void kvmppc_pmd_free(pmd_t *pmdp)
281 {
282 	kmem_cache_free(kvm_pmd_cache, pmdp);
283 }
284 
285 /* Called with kvm->mmu_lock held */
286 void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
287 		      unsigned int shift, struct kvm_memory_slot *memslot,
288 		      unsigned int lpid)
289 
290 {
291 	unsigned long old;
292 	unsigned long gfn = gpa >> PAGE_SHIFT;
293 	unsigned long page_size = PAGE_SIZE;
294 	unsigned long hpa;
295 
296 	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
297 	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
298 
299 	/* The following only applies to L1 entries */
300 	if (lpid != kvm->arch.lpid)
301 		return;
302 
303 	if (!memslot) {
304 		memslot = gfn_to_memslot(kvm, gfn);
305 		if (!memslot)
306 			return;
307 	}
308 	if (shift)
309 		page_size = 1ul << shift;
310 
311 	gpa &= ~(page_size - 1);
312 	hpa = old & PTE_RPN_MASK;
313 	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
314 
315 	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
316 		kvmppc_update_dirty_map(memslot, gfn, page_size);
317 }
318 
319 /*
320  * kvmppc_free_p?d are used to free existing page tables, and recursively
321  * descend and clear and free children.
322  * Callers are responsible for flushing the PWC.
323  *
324  * When page tables are being unmapped/freed as part of page fault path
325  * (full == false), ptes are not expected. There is code to unmap them
326  * and emit a warning if encountered, but there may already be data
327  * corruption due to the unexpected mappings.
328  */
329 static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
330 				  unsigned int lpid)
331 {
332 	if (full) {
333 		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
334 	} else {
335 		pte_t *p = pte;
336 		unsigned long it;
337 
338 		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
339 			if (pte_val(*p) == 0)
340 				continue;
341 			WARN_ON_ONCE(1);
342 			kvmppc_unmap_pte(kvm, p,
343 					 pte_pfn(*p) << PAGE_SHIFT,
344 					 PAGE_SHIFT, NULL, lpid);
345 		}
346 	}
347 
348 	kvmppc_pte_free(pte);
349 }
350 
351 static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
352 				  unsigned int lpid)
353 {
354 	unsigned long im;
355 	pmd_t *p = pmd;
356 
357 	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
358 		if (!pmd_present(*p))
359 			continue;
360 		if (pmd_is_leaf(*p)) {
361 			if (full) {
362 				pmd_clear(p);
363 			} else {
364 				WARN_ON_ONCE(1);
365 				kvmppc_unmap_pte(kvm, (pte_t *)p,
366 					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
367 					 PMD_SHIFT, NULL, lpid);
368 			}
369 		} else {
370 			pte_t *pte;
371 
372 			pte = pte_offset_map(p, 0);
373 			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
374 			pmd_clear(p);
375 		}
376 	}
377 	kvmppc_pmd_free(pmd);
378 }
379 
380 static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
381 				  unsigned int lpid)
382 {
383 	unsigned long iu;
384 	pud_t *p = pud;
385 
386 	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
387 		if (!pud_present(*p))
388 			continue;
389 		if (pud_huge(*p)) {
390 			pud_clear(p);
391 		} else {
392 			pmd_t *pmd;
393 
394 			pmd = pmd_offset(p, 0);
395 			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
396 			pud_clear(p);
397 		}
398 	}
399 	pud_free(kvm->mm, pud);
400 }
401 
402 void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
403 {
404 	unsigned long ig;
405 
406 	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
407 		pud_t *pud;
408 
409 		if (!pgd_present(*pgd))
410 			continue;
411 		pud = pud_offset(pgd, 0);
412 		kvmppc_unmap_free_pud(kvm, pud, lpid);
413 		pgd_clear(pgd);
414 	}
415 }
416 
417 void kvmppc_free_radix(struct kvm *kvm)
418 {
419 	if (kvm->arch.pgtable) {
420 		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
421 					  kvm->arch.lpid);
422 		pgd_free(kvm->mm, kvm->arch.pgtable);
423 		kvm->arch.pgtable = NULL;
424 	}
425 }
426 
427 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
428 					unsigned long gpa, unsigned int lpid)
429 {
430 	pte_t *pte = pte_offset_kernel(pmd, 0);
431 
432 	/*
433 	 * Clearing the pmd entry then flushing the PWC ensures that the pte
434 	 * page no longer be cached by the MMU, so can be freed without
435 	 * flushing the PWC again.
436 	 */
437 	pmd_clear(pmd);
438 	kvmppc_radix_flush_pwc(kvm, lpid);
439 
440 	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
441 }
442 
443 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
444 					unsigned long gpa, unsigned int lpid)
445 {
446 	pmd_t *pmd = pmd_offset(pud, 0);
447 
448 	/*
449 	 * Clearing the pud entry then flushing the PWC ensures that the pmd
450 	 * page and any children pte pages will no longer be cached by the MMU,
451 	 * so can be freed without flushing the PWC again.
452 	 */
453 	pud_clear(pud);
454 	kvmppc_radix_flush_pwc(kvm, lpid);
455 
456 	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
457 }
458 
/*
 * Two faults on the same partition-scoped entry may legitimately
 * produce PTEs that differ in a few bits: the R and C bits change in
 * the course of cleaning and aging, and the write bit changes when an
 * access is upgraded or when a read fault runs concurrently with a
 * write fault that sets those bits first.  Every other bit has to
 * agree.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_WRITE))
467 
468 int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
469 		      unsigned long gpa, unsigned int level,
470 		      unsigned long mmu_seq, unsigned int lpid,
471 		      unsigned long *rmapp, struct rmap_nested **n_rmap)
472 {
473 	pgd_t *pgd;
474 	pud_t *pud, *new_pud = NULL;
475 	pmd_t *pmd, *new_pmd = NULL;
476 	pte_t *ptep, *new_ptep = NULL;
477 	int ret;
478 
479 	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
480 	pgd = pgtable + pgd_index(gpa);
481 	pud = NULL;
482 	if (pgd_present(*pgd))
483 		pud = pud_offset(pgd, gpa);
484 	else
485 		new_pud = pud_alloc_one(kvm->mm, gpa);
486 
487 	pmd = NULL;
488 	if (pud && pud_present(*pud) && !pud_huge(*pud))
489 		pmd = pmd_offset(pud, gpa);
490 	else if (level <= 1)
491 		new_pmd = kvmppc_pmd_alloc();
492 
493 	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
494 		new_ptep = kvmppc_pte_alloc();
495 
496 	/* Check if we might have been invalidated; let the guest retry if so */
497 	spin_lock(&kvm->mmu_lock);
498 	ret = -EAGAIN;
499 	if (mmu_notifier_retry(kvm, mmu_seq))
500 		goto out_unlock;
501 
502 	/* Now traverse again under the lock and change the tree */
503 	ret = -ENOMEM;
504 	if (pgd_none(*pgd)) {
505 		if (!new_pud)
506 			goto out_unlock;
507 		pgd_populate(kvm->mm, pgd, new_pud);
508 		new_pud = NULL;
509 	}
510 	pud = pud_offset(pgd, gpa);
511 	if (pud_huge(*pud)) {
512 		unsigned long hgpa = gpa & PUD_MASK;
513 
514 		/* Check if we raced and someone else has set the same thing */
515 		if (level == 2) {
516 			if (pud_raw(*pud) == pte_raw(pte)) {
517 				ret = 0;
518 				goto out_unlock;
519 			}
520 			/* Valid 1GB page here already, add our extra bits */
521 			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
522 							PTE_BITS_MUST_MATCH);
523 			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
524 					      0, pte_val(pte), hgpa, PUD_SHIFT);
525 			ret = 0;
526 			goto out_unlock;
527 		}
528 		/*
529 		 * If we raced with another CPU which has just put
530 		 * a 1GB pte in after we saw a pmd page, try again.
531 		 */
532 		if (!new_pmd) {
533 			ret = -EAGAIN;
534 			goto out_unlock;
535 		}
536 		/* Valid 1GB page here already, remove it */
537 		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
538 				 lpid);
539 	}
540 	if (level == 2) {
541 		if (!pud_none(*pud)) {
542 			/*
543 			 * There's a page table page here, but we wanted to
544 			 * install a large page, so remove and free the page
545 			 * table page.
546 			 */
547 			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
548 		}
549 		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
550 		if (rmapp && n_rmap)
551 			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
552 		ret = 0;
553 		goto out_unlock;
554 	}
555 	if (pud_none(*pud)) {
556 		if (!new_pmd)
557 			goto out_unlock;
558 		pud_populate(kvm->mm, pud, new_pmd);
559 		new_pmd = NULL;
560 	}
561 	pmd = pmd_offset(pud, gpa);
562 	if (pmd_is_leaf(*pmd)) {
563 		unsigned long lgpa = gpa & PMD_MASK;
564 
565 		/* Check if we raced and someone else has set the same thing */
566 		if (level == 1) {
567 			if (pmd_raw(*pmd) == pte_raw(pte)) {
568 				ret = 0;
569 				goto out_unlock;
570 			}
571 			/* Valid 2MB page here already, add our extra bits */
572 			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
573 							PTE_BITS_MUST_MATCH);
574 			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
575 					0, pte_val(pte), lgpa, PMD_SHIFT);
576 			ret = 0;
577 			goto out_unlock;
578 		}
579 
580 		/*
581 		 * If we raced with another CPU which has just put
582 		 * a 2MB pte in after we saw a pte page, try again.
583 		 */
584 		if (!new_ptep) {
585 			ret = -EAGAIN;
586 			goto out_unlock;
587 		}
588 		/* Valid 2MB page here already, remove it */
589 		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
590 				 lpid);
591 	}
592 	if (level == 1) {
593 		if (!pmd_none(*pmd)) {
594 			/*
595 			 * There's a page table page here, but we wanted to
596 			 * install a large page, so remove and free the page
597 			 * table page.
598 			 */
599 			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
600 		}
601 		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
602 		if (rmapp && n_rmap)
603 			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
604 		ret = 0;
605 		goto out_unlock;
606 	}
607 	if (pmd_none(*pmd)) {
608 		if (!new_ptep)
609 			goto out_unlock;
610 		pmd_populate(kvm->mm, pmd, new_ptep);
611 		new_ptep = NULL;
612 	}
613 	ptep = pte_offset_kernel(pmd, gpa);
614 	if (pte_present(*ptep)) {
615 		/* Check if someone else set the same thing */
616 		if (pte_raw(*ptep) == pte_raw(pte)) {
617 			ret = 0;
618 			goto out_unlock;
619 		}
620 		/* Valid page here already, add our extra bits */
621 		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
622 							PTE_BITS_MUST_MATCH);
623 		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
624 		ret = 0;
625 		goto out_unlock;
626 	}
627 	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
628 	if (rmapp && n_rmap)
629 		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
630 	ret = 0;
631 
632  out_unlock:
633 	spin_unlock(&kvm->mmu_lock);
634 	if (new_pud)
635 		pud_free(kvm->mm, new_pud);
636 	if (new_pmd)
637 		kvmppc_pmd_free(new_pmd);
638 	if (new_ptep)
639 		kvmppc_pte_free(new_ptep);
640 	return ret;
641 }
642 
643 bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
644 			     unsigned long gpa, unsigned int lpid)
645 {
646 	unsigned long pgflags;
647 	unsigned int shift;
648 	pte_t *ptep;
649 
650 	/*
651 	 * Need to set an R or C bit in the 2nd-level tables;
652 	 * since we are just helping out the hardware here,
653 	 * it is sufficient to do what the hardware does.
654 	 */
655 	pgflags = _PAGE_ACCESSED;
656 	if (writing)
657 		pgflags |= _PAGE_DIRTY;
658 	/*
659 	 * We are walking the secondary (partition-scoped) page table here.
660 	 * We can do this without disabling irq because the Linux MM
661 	 * subsystem doesn't do THP splits and collapses on this tree.
662 	 */
663 	ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
664 	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
665 		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
666 		return true;
667 	}
668 	return false;
669 }
670 
671 int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
672 				   unsigned long gpa,
673 				   struct kvm_memory_slot *memslot,
674 				   bool writing, bool kvm_ro,
675 				   pte_t *inserted_pte, unsigned int *levelp)
676 {
677 	struct kvm *kvm = vcpu->kvm;
678 	struct page *page = NULL;
679 	unsigned long mmu_seq;
680 	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
681 	bool upgrade_write = false;
682 	bool *upgrade_p = &upgrade_write;
683 	pte_t pte, *ptep;
684 	unsigned int shift, level;
685 	int ret;
686 
687 	/* used to check for invalidations in progress */
688 	mmu_seq = kvm->mmu_notifier_seq;
689 	smp_rmb();
690 
691 	/*
692 	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
693 	 * do it with !atomic && !async, which is how we call it.
694 	 * We always ask for write permission since the common case
695 	 * is that the page is writable.
696 	 */
697 	hva = gfn_to_hva_memslot(memslot, gfn);
698 	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
699 		upgrade_write = true;
700 	} else {
701 		unsigned long pfn;
702 
703 		/* Call KVM generic code to do the slow-path check */
704 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
705 					   writing, upgrade_p);
706 		if (is_error_noslot_pfn(pfn))
707 			return -EFAULT;
708 		page = NULL;
709 		if (pfn_valid(pfn)) {
710 			page = pfn_to_page(pfn);
711 			if (PageReserved(page))
712 				page = NULL;
713 		}
714 	}
715 
716 	/*
717 	 * Read the PTE from the process' radix tree and use that
718 	 * so we get the shift and attribute bits.
719 	 */
720 	local_irq_disable();
721 	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
722 	/*
723 	 * If the PTE disappeared temporarily due to a THP
724 	 * collapse, just return and let the guest try again.
725 	 */
726 	if (!ptep) {
727 		local_irq_enable();
728 		if (page)
729 			put_page(page);
730 		return RESUME_GUEST;
731 	}
732 	pte = *ptep;
733 	local_irq_enable();
734 
735 	/* Get pte level from shift/size */
736 	if (shift == PUD_SHIFT &&
737 	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
738 	    (hva & (PUD_SIZE - PAGE_SIZE))) {
739 		level = 2;
740 	} else if (shift == PMD_SHIFT &&
741 		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
742 		   (hva & (PMD_SIZE - PAGE_SIZE))) {
743 		level = 1;
744 	} else {
745 		level = 0;
746 		if (shift > PAGE_SHIFT) {
747 			/*
748 			 * If the pte maps more than one page, bring over
749 			 * bits from the virtual address to get the real
750 			 * address of the specific single page we want.
751 			 */
752 			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
753 			pte = __pte(pte_val(pte) | (hva & rpnmask));
754 		}
755 	}
756 
757 	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
758 	if (writing || upgrade_write) {
759 		if (pte_val(pte) & _PAGE_WRITE)
760 			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
761 	} else {
762 		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
763 	}
764 
765 	/* Allocate space in the tree and write the PTE */
766 	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
767 				mmu_seq, kvm->arch.lpid, NULL, NULL);
768 	if (inserted_pte)
769 		*inserted_pte = pte;
770 	if (levelp)
771 		*levelp = level;
772 
773 	if (page) {
774 		if (!ret && (pte_val(pte) & _PAGE_WRITE))
775 			set_page_dirty_lock(page);
776 		put_page(page);
777 	}
778 
779 	return ret;
780 }
781 
782 int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
783 				   unsigned long ea, unsigned long dsisr)
784 {
785 	struct kvm *kvm = vcpu->kvm;
786 	unsigned long gpa, gfn;
787 	struct kvm_memory_slot *memslot;
788 	long ret;
789 	bool writing = !!(dsisr & DSISR_ISSTORE);
790 	bool kvm_ro = false;
791 
792 	/* Check for unusual errors */
793 	if (dsisr & DSISR_UNSUPP_MMU) {
794 		pr_err("KVM: Got unsupported MMU fault\n");
795 		return -EFAULT;
796 	}
797 	if (dsisr & DSISR_BADACCESS) {
798 		/* Reflect to the guest as DSI */
799 		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
800 		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
801 		return RESUME_GUEST;
802 	}
803 
804 	/* Translate the logical address */
805 	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
806 	gpa &= ~0xF000000000000000ul;
807 	gfn = gpa >> PAGE_SHIFT;
808 	if (!(dsisr & DSISR_PRTABLE_FAULT))
809 		gpa |= ea & 0xfff;
810 
811 	/* Get the corresponding memslot */
812 	memslot = gfn_to_memslot(kvm, gfn);
813 
814 	/* No memslot means it's an emulated MMIO region */
815 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
816 		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
817 			     DSISR_SET_RC)) {
818 			/*
819 			 * Bad address in guest page table tree, or other
820 			 * unusual error - reflect it to the guest as DSI.
821 			 */
822 			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
823 			return RESUME_GUEST;
824 		}
825 		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
826 	}
827 
828 	if (memslot->flags & KVM_MEM_READONLY) {
829 		if (writing) {
830 			/* give the guest a DSI */
831 			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
832 						       DSISR_PROTFAULT);
833 			return RESUME_GUEST;
834 		}
835 		kvm_ro = true;
836 	}
837 
838 	/* Failed to set the reference/change bits */
839 	if (dsisr & DSISR_SET_RC) {
840 		spin_lock(&kvm->mmu_lock);
841 		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
842 					    writing, gpa, kvm->arch.lpid))
843 			dsisr &= ~DSISR_SET_RC;
844 		spin_unlock(&kvm->mmu_lock);
845 
846 		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
847 			       DSISR_PROTFAULT | DSISR_SET_RC)))
848 			return RESUME_GUEST;
849 	}
850 
851 	/* Try to insert a pte */
852 	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
853 					     kvm_ro, NULL, NULL);
854 
855 	if (ret == 0 || ret == -EAGAIN)
856 		ret = RESUME_GUEST;
857 	return ret;
858 }
859 
860 /* Called with kvm->lock held */
861 int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
862 		    unsigned long gfn)
863 {
864 	pte_t *ptep;
865 	unsigned long gpa = gfn << PAGE_SHIFT;
866 	unsigned int shift;
867 
868 	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
869 	if (ptep && pte_present(*ptep))
870 		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
871 				 kvm->arch.lpid);
872 	return 0;
873 }
874 
875 /* Called with kvm->lock held */
876 int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
877 		  unsigned long gfn)
878 {
879 	pte_t *ptep;
880 	unsigned long gpa = gfn << PAGE_SHIFT;
881 	unsigned int shift;
882 	int ref = 0;
883 
884 	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
885 	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
886 		kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
887 					gpa, shift);
888 		/* XXX need to flush tlb here? */
889 		ref = 1;
890 	}
891 	return ref;
892 }
893 
894 /* Called with kvm->lock held */
895 int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
896 		       unsigned long gfn)
897 {
898 	pte_t *ptep;
899 	unsigned long gpa = gfn << PAGE_SHIFT;
900 	unsigned int shift;
901 	int ref = 0;
902 
903 	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
904 	if (ptep && pte_present(*ptep) && pte_young(*ptep))
905 		ref = 1;
906 	return ref;
907 }
908 
909 /* Returns the number of PAGE_SIZE pages that are dirty */
910 static int kvm_radix_test_clear_dirty(struct kvm *kvm,
911 				struct kvm_memory_slot *memslot, int pagenum)
912 {
913 	unsigned long gfn = memslot->base_gfn + pagenum;
914 	unsigned long gpa = gfn << PAGE_SHIFT;
915 	pte_t *ptep;
916 	unsigned int shift;
917 	int ret = 0;
918 
919 	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
920 	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
921 		ret = 1;
922 		if (shift)
923 			ret = 1 << (shift - PAGE_SHIFT);
924 		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
925 					gpa, shift);
926 		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
927 	}
928 	return ret;
929 }
930 
931 long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
932 			struct kvm_memory_slot *memslot, unsigned long *map)
933 {
934 	unsigned long i, j;
935 	int npages;
936 
937 	for (i = 0; i < memslot->npages; i = j) {
938 		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
939 
940 		/*
941 		 * Note that if npages > 0 then i must be a multiple of npages,
942 		 * since huge pages are only used to back the guest at guest
943 		 * real addresses that are a multiple of their size.
944 		 * Since we have at most one PTE covering any given guest
945 		 * real address, if npages > 1 we can skip to i + npages.
946 		 */
947 		j = i + 1;
948 		if (npages) {
949 			set_dirty_bits(map, i, npages);
950 			j = i + npages;
951 		}
952 	}
953 	return 0;
954 }
955 
956 static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
957 				 int psize, int *indexp)
958 {
959 	if (!mmu_psize_defs[psize].shift)
960 		return;
961 	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
962 		(mmu_psize_defs[psize].ap << 29);
963 	++(*indexp);
964 }
965 
966 int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
967 {
968 	int i;
969 
970 	if (!radix_enabled())
971 		return -EINVAL;
972 	memset(info, 0, sizeof(*info));
973 
974 	/* 4k page size */
975 	info->geometries[0].page_shift = 12;
976 	info->geometries[0].level_bits[0] = 9;
977 	for (i = 1; i < 4; ++i)
978 		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
979 	/* 64k page size */
980 	info->geometries[1].page_shift = 16;
981 	for (i = 0; i < 4; ++i)
982 		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
983 
984 	i = 0;
985 	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
986 	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
987 	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
988 	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
989 
990 	return 0;
991 }
992 
993 int kvmppc_init_vm_radix(struct kvm *kvm)
994 {
995 	kvm->arch.pgtable = pgd_alloc(kvm->mm);
996 	if (!kvm->arch.pgtable)
997 		return -ENOMEM;
998 	return 0;
999 }
1000 
1001 static void pte_ctor(void *addr)
1002 {
1003 	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1004 }
1005 
1006 static void pmd_ctor(void *addr)
1007 {
1008 	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1009 }
1010 
1011 struct debugfs_radix_state {
1012 	struct kvm	*kvm;
1013 	struct mutex	mutex;
1014 	unsigned long	gpa;
1015 	int		lpid;
1016 	int		chars_left;
1017 	int		buf_index;
1018 	char		buf[128];
1019 	u8		hdr;
1020 };
1021 
1022 static int debugfs_radix_open(struct inode *inode, struct file *file)
1023 {
1024 	struct kvm *kvm = inode->i_private;
1025 	struct debugfs_radix_state *p;
1026 
1027 	p = kzalloc(sizeof(*p), GFP_KERNEL);
1028 	if (!p)
1029 		return -ENOMEM;
1030 
1031 	kvm_get_kvm(kvm);
1032 	p->kvm = kvm;
1033 	mutex_init(&p->mutex);
1034 	file->private_data = p;
1035 
1036 	return nonseekable_open(inode, file);
1037 }
1038 
1039 static int debugfs_radix_release(struct inode *inode, struct file *file)
1040 {
1041 	struct debugfs_radix_state *p = file->private_data;
1042 
1043 	kvm_put_kvm(p->kvm);
1044 	kfree(p);
1045 	return 0;
1046 }
1047 
1048 static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1049 				 size_t len, loff_t *ppos)
1050 {
1051 	struct debugfs_radix_state *p = file->private_data;
1052 	ssize_t ret, r;
1053 	unsigned long n;
1054 	struct kvm *kvm;
1055 	unsigned long gpa;
1056 	pgd_t *pgt;
1057 	struct kvm_nested_guest *nested;
1058 	pgd_t pgd, *pgdp;
1059 	pud_t pud, *pudp;
1060 	pmd_t pmd, *pmdp;
1061 	pte_t *ptep;
1062 	int shift;
1063 	unsigned long pte;
1064 
1065 	kvm = p->kvm;
1066 	if (!kvm_is_radix(kvm))
1067 		return 0;
1068 
1069 	ret = mutex_lock_interruptible(&p->mutex);
1070 	if (ret)
1071 		return ret;
1072 
1073 	if (p->chars_left) {
1074 		n = p->chars_left;
1075 		if (n > len)
1076 			n = len;
1077 		r = copy_to_user(buf, p->buf + p->buf_index, n);
1078 		n -= r;
1079 		p->chars_left -= n;
1080 		p->buf_index += n;
1081 		buf += n;
1082 		len -= n;
1083 		ret = n;
1084 		if (r) {
1085 			if (!n)
1086 				ret = -EFAULT;
1087 			goto out;
1088 		}
1089 	}
1090 
1091 	gpa = p->gpa;
1092 	nested = NULL;
1093 	pgt = NULL;
1094 	while (len != 0 && p->lpid >= 0) {
1095 		if (gpa >= RADIX_PGTABLE_RANGE) {
1096 			gpa = 0;
1097 			pgt = NULL;
1098 			if (nested) {
1099 				kvmhv_put_nested(nested);
1100 				nested = NULL;
1101 			}
1102 			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1103 			p->hdr = 0;
1104 			if (p->lpid < 0)
1105 				break;
1106 		}
1107 		if (!pgt) {
1108 			if (p->lpid == 0) {
1109 				pgt = kvm->arch.pgtable;
1110 			} else {
1111 				nested = kvmhv_get_nested(kvm, p->lpid, false);
1112 				if (!nested) {
1113 					gpa = RADIX_PGTABLE_RANGE;
1114 					continue;
1115 				}
1116 				pgt = nested->shadow_pgtable;
1117 			}
1118 		}
1119 		n = 0;
1120 		if (!p->hdr) {
1121 			if (p->lpid > 0)
1122 				n = scnprintf(p->buf, sizeof(p->buf),
1123 					      "\nNested LPID %d: ", p->lpid);
1124 			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1125 				      "pgdir: %lx\n", (unsigned long)pgt);
1126 			p->hdr = 1;
1127 			goto copy;
1128 		}
1129 
1130 		pgdp = pgt + pgd_index(gpa);
1131 		pgd = READ_ONCE(*pgdp);
1132 		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
1133 			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
1134 			continue;
1135 		}
1136 
1137 		pudp = pud_offset(&pgd, gpa);
1138 		pud = READ_ONCE(*pudp);
1139 		if (!(pud_val(pud) & _PAGE_PRESENT)) {
1140 			gpa = (gpa & PUD_MASK) + PUD_SIZE;
1141 			continue;
1142 		}
1143 		if (pud_val(pud) & _PAGE_PTE) {
1144 			pte = pud_val(pud);
1145 			shift = PUD_SHIFT;
1146 			goto leaf;
1147 		}
1148 
1149 		pmdp = pmd_offset(&pud, gpa);
1150 		pmd = READ_ONCE(*pmdp);
1151 		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1152 			gpa = (gpa & PMD_MASK) + PMD_SIZE;
1153 			continue;
1154 		}
1155 		if (pmd_val(pmd) & _PAGE_PTE) {
1156 			pte = pmd_val(pmd);
1157 			shift = PMD_SHIFT;
1158 			goto leaf;
1159 		}
1160 
1161 		ptep = pte_offset_kernel(&pmd, gpa);
1162 		pte = pte_val(READ_ONCE(*ptep));
1163 		if (!(pte & _PAGE_PRESENT)) {
1164 			gpa += PAGE_SIZE;
1165 			continue;
1166 		}
1167 		shift = PAGE_SHIFT;
1168 	leaf:
1169 		n = scnprintf(p->buf, sizeof(p->buf),
1170 			      " %lx: %lx %d\n", gpa, pte, shift);
1171 		gpa += 1ul << shift;
1172 	copy:
1173 		p->chars_left = n;
1174 		if (n > len)
1175 			n = len;
1176 		r = copy_to_user(buf, p->buf, n);
1177 		n -= r;
1178 		p->chars_left -= n;
1179 		p->buf_index = n;
1180 		buf += n;
1181 		len -= n;
1182 		ret += n;
1183 		if (r) {
1184 			if (!ret)
1185 				ret = -EFAULT;
1186 			break;
1187 		}
1188 	}
1189 	p->gpa = gpa;
1190 	if (nested)
1191 		kvmhv_put_nested(nested);
1192 
1193  out:
1194 	mutex_unlock(&p->mutex);
1195 	return ret;
1196 }
1197 
1198 static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1199 			   size_t len, loff_t *ppos)
1200 {
1201 	return -EACCES;
1202 }
1203 
1204 static const struct file_operations debugfs_radix_fops = {
1205 	.owner	 = THIS_MODULE,
1206 	.open	 = debugfs_radix_open,
1207 	.release = debugfs_radix_release,
1208 	.read	 = debugfs_radix_read,
1209 	.write	 = debugfs_radix_write,
1210 	.llseek	 = generic_file_llseek,
1211 };
1212 
1213 void kvmhv_radix_debugfs_init(struct kvm *kvm)
1214 {
1215 	kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
1216 						     kvm->arch.debugfs_dir, kvm,
1217 						     &debugfs_radix_fops);
1218 }
1219 
1220 int kvmppc_radix_init(void)
1221 {
1222 	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1223 
1224 	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1225 	if (!kvm_pte_cache)
1226 		return -ENOMEM;
1227 
1228 	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1229 
1230 	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1231 	if (!kvm_pmd_cache) {
1232 		kmem_cache_destroy(kvm_pte_cache);
1233 		return -ENOMEM;
1234 	}
1235 
1236 	return 0;
1237 }
1238 
1239 void kvmppc_radix_exit(void)
1240 {
1241 	kmem_cache_destroy(kvm_pte_cache);
1242 	kmem_cache_destroy(kvm_pmd_cache);
1243 }
1244