/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */
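/*
 * Real-mode handling of the guest hashed page table (HPT): the H_ENTER,
 * H_REMOVE, H_BULK_REMOVE, H_PROTECT and H_READ hypercalls, plus the
 * real-mode HPTE fault path.  Much of this code can run with the MMU
 * off, which is why vmalloc'd structures are accessed through
 * real_vmalloc_addr() below.
 */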

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/hugetlb.h>
#include <linux/module.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>

/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
{
	unsigned long addr = (unsigned long) x;
	pte_t *p;

	p = find_linux_pte(swapper_pg_dir, addr);
	if (!p || !pte_present(*p))
		return NULL;
	/* assume we don't have huge pages in vmalloc space... */
	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
	return __va(addr);
}

/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
static int global_invalidates(struct kvm *kvm, unsigned long flags)
{
	int global;

	/*
	 * If there is only one vcore, and it's currently running,
	 * we can use tlbiel as long as we mark all other physical
	 * cores as potentially having stale TLB entries for this lpid.
	 * If we're not using MMU notifiers, we never take pages away
	 * from the guest, so we can use tlbiel if requested.
	 * Otherwise, don't use tlbiel.
	 */
	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
		global = 0;
	else if (kvm->arch.using_mmu_notifiers)
		global = 1;
	else
		global = !(flags & H_LOCAL);

	if (!global) {
		/* any other core might now have stale TLB entries... */
		smp_wmb();
		cpumask_setall(&kvm->arch.need_tlb_flush);
		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
				  &kvm->arch.need_tlb_flush);
	}

	return global;
}

/*
 * Add this HPTE into the chain for the real page.
 * Must be called with the chain locked; it unlocks the chain.
 */
void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
			     unsigned long *rmap, long pte_index, int realmode)
{
	struct revmap_entry *head, *tail;
	unsigned long i;

	if (*rmap & KVMPPC_RMAP_PRESENT) {
		i = *rmap & KVMPPC_RMAP_INDEX;
		head = &kvm->arch.revmap[i];
		if (realmode)
			head = real_vmalloc_addr(head);
		tail = &kvm->arch.revmap[head->back];
		if (realmode)
			tail = real_vmalloc_addr(tail);
		rev->forw = i;
		rev->back = head->back;
		tail->forw = pte_index;
		head->back = pte_index;
	} else {
		rev->forw = rev->back = pte_index;
		*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
			pte_index | KVMPPC_RMAP_PRESENT;
	}
	unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);

/*
 * Note modification of an HPTE; set the HPTE modified bit
 * if anyone is interested.
 */
static inline void note_hpte_modification(struct kvm *kvm,
					  struct revmap_entry *rev)
{
	if (atomic_read(&kvm->arch.hpte_mod_interest))
		rev->guest_rpte |= HPTE_GR_MODIFIED;
}

/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
				struct revmap_entry *rev,
				unsigned long hpte_v, unsigned long hpte_r)
{
	struct revmap_entry *next, *prev;
	unsigned long gfn, ptel, head;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	unsigned long rcbits;

	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
	ptel = rev->guest_rpte |= rcbits;
	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
	if (!memslot)
		return;

	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
	lock_rmap(rmap);

	head = *rmap & KVMPPC_RMAP_INDEX;
	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
	next->back = rev->back;
	prev->forw = rev->forw;
	if (head == pte_index) {
		head = rev->forw;
		if (head == pte_index)
			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		else
			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
	}
	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
	unlock_rmap(rmap);
}

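/*
 * Look up the Linux PTE for a host virtual address and return it,
 * setting *pte_sizep to the host page size backing hva.  Returns a
 * zero PTE if no PTE is present or if the backing page is smaller
 * than the size requested in *pte_sizep on entry.
 */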
static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
			      int writing, unsigned long *pte_sizep)
{
	pte_t *ptep;
	unsigned long ps = *pte_sizep;
	unsigned int shift;

	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
	if (!ptep)
		return __pte(0);
	if (shift)
		*pte_sizep = 1ul << shift;
	else
		*pte_sizep = PAGE_SIZE;
	if (ps > *pte_sizep)
		return __pte(0);
	if (!pte_present(*ptep))
		return __pte(0);
	return kvmppc_read_update_linux_pte(ptep, writing);
}

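/*
 * Store a new first doubleword for an HPTE, with a release barrier so
 * that prior updates to the HPTE are visible first.  The value stored
 * is expected not to have HPTE_V_HVLOCK set, which unlocks the HPTE.
 */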
static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
{
	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
	hpte[0] = hpte_v;
}

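/*
 * Core of the H_ENTER hypercall: validate the proposed HPTE, translate
 * the guest real address to a host real address (or mark the entry as
 * emulated MMIO), find and lock a free slot in the HPT group, link the
 * new entry into the reverse-map chain, and finally write the HPTE.
 * Called from both real mode and virtual mode (realmode flag).
 * On success the chosen PTE index is returned via *pte_idx_ret.
 */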
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
		       long pte_index, unsigned long pteh, unsigned long ptel,
		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
	unsigned long i, pa, gpa, gfn, psize;
	unsigned long slot_fn, hva;
	unsigned long *hpte;
	struct revmap_entry *rev;
	unsigned long g_ptel;
	struct kvm_memory_slot *memslot;
	unsigned long *physp, pte_size;
	unsigned long is_io;
	unsigned long *rmap;
	pte_t pte;
	unsigned int writing;
	unsigned long mmu_seq;
	unsigned long rcbits;

	psize = hpte_page_size(pteh, ptel);
	if (!psize)
		return H_PARAMETER;
	writing = hpte_is_writable(ptel);
	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
	ptel &= ~HPTE_GR_RESERVED;
	g_ptel = ptel;

	/* used later to detect if we might have been invalidated */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/* Find the memslot (if any) for this address */
	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
	gfn = gpa >> PAGE_SHIFT;
	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
	pa = 0;
	is_io = ~0ul;
	rmap = NULL;
	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
		/* PPC970 can't do emulated MMIO */
		if (!cpu_has_feature(CPU_FTR_ARCH_206))
			return H_PARAMETER;
		/* Emulated MMIO - mark this with key=31 */
		pteh |= HPTE_V_ABSENT;
		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
		goto do_insert;
	}

	/* Check if the requested page fits entirely in the memslot. */
	if (!slot_is_aligned(memslot, psize))
		return H_PARAMETER;
	slot_fn = gfn - memslot->base_gfn;
	rmap = &memslot->arch.rmap[slot_fn];

	if (!kvm->arch.using_mmu_notifiers) {
		physp = memslot->arch.slot_phys;
		if (!physp)
			return H_PARAMETER;
		physp += slot_fn;
		if (realmode)
			physp = real_vmalloc_addr(physp);
		pa = *physp;
		if (!pa)
			return H_TOO_HARD;
		is_io = pa & (HPTE_R_I | HPTE_R_W);
		pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK);
		pa &= PAGE_MASK;
	} else {
		/* Translate to host virtual address */
		hva = __gfn_to_hva_memslot(memslot, gfn);

		/* Look up the Linux PTE for the backing page */
		pte_size = psize;
		pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
		if (pte_present(pte)) {
			if (writing && !pte_write(pte))
				/* make the actual HPTE be read-only */
				ptel = hpte_make_readonly(ptel);
			is_io = hpte_cache_bits(pte_val(pte));
			pa = pte_pfn(pte) << PAGE_SHIFT;
		}
	}

	if (pte_size < psize)
		return H_PARAMETER;
	if (pa && pte_size > psize)
		pa |= gpa & (pte_size - 1);

	ptel &= ~(HPTE_R_PP0 - psize);
	ptel |= pa;

	if (pa)
		pteh |= HPTE_V_VALID;
	else
		pteh |= HPTE_V_ABSENT;

	/* Check WIMG */
	if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
		if (is_io)
			return H_PARAMETER;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
		ptel |= HPTE_R_M;
	}

	/* Find and lock the HPTEG slot to use */
 do_insert:
	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	if (likely((flags & H_EXACT) == 0)) {
		pte_index &= ~7UL;
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		for (i = 0; i < 8; ++i) {
			if ((*hpte & HPTE_V_VALID) == 0 &&
			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
					  HPTE_V_ABSENT))
				break;
			hpte += 2;
		}
		if (i == 8) {
			/*
			 * Since try_lock_hpte doesn't retry (not even stdcx.
			 * failures), it could be that there is a free slot
			 * but we transiently failed to lock it.  Try again,
			 * actually locking each slot and checking it.
			 */
			hpte -= 16;
			for (i = 0; i < 8; ++i) {
				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
					cpu_relax();
				if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)))
					break;
				*hpte &= ~HPTE_V_HVLOCK;
				hpte += 2;
			}
			if (i == 8)
				return H_PTEG_FULL;
		}
		pte_index += i;
	} else {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
				   HPTE_V_ABSENT)) {
			/* Lock the slot and check again */
			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
				cpu_relax();
			if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
				*hpte &= ~HPTE_V_HVLOCK;
				return H_PTEG_FULL;
			}
		}
	}

	/* Save away the guest's idea of the second HPTE dword */
	rev = &kvm->arch.revmap[pte_index];
	if (realmode)
		rev = real_vmalloc_addr(rev);
	if (rev) {
		rev->guest_rpte = g_ptel;
		note_hpte_modification(kvm, rev);
	}

	/* Link HPTE into reverse-map chain */
	if (pteh & HPTE_V_VALID) {
		if (realmode)
			rmap = real_vmalloc_addr(rmap);
		lock_rmap(rmap);
		/* Check for pending invalidations under the rmap chain lock */
		if (kvm->arch.using_mmu_notifiers &&
		    mmu_notifier_retry(kvm, mmu_seq)) {
			/* inval in progress, write a non-present HPTE */
			pteh |= HPTE_V_ABSENT;
			pteh &= ~HPTE_V_VALID;
			unlock_rmap(rmap);
		} else {
			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
						realmode);
			/* Only set R/C in real HPTE if already set in *rmap */
			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
		}
	}

	hpte[1] = ptel;

	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
	eieio();
	hpte[0] = pteh;
	asm volatile("ptesync" : : : "memory");

	*pte_idx_ret = pte_index;
	return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);

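/* H_ENTER hypercall entry point; returns the chosen PTE index in guest GPR4 */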
long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
		    long pte_index, unsigned long pteh, unsigned long ptel)
{
	return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
				 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
}

#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))

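/*
 * Try to acquire the global TLB-invalidate lock: atomically store this
 * CPU's lock token if the lock word is zero, retrying if the stwcx.
 * loses the reservation.  Returns 1 if the lock was taken, 0 if it was
 * already held.
 */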
static inline int try_lock_tlbie(unsigned int *lock)
{
	unsigned int tmp, old;
	unsigned int token = LOCK_TOKEN;

	asm volatile("1:lwarx	%1,0,%2\n"
		     "	cmpwi	cr0,%1,0\n"
		     "	bne	2f\n"
		     "	stwcx.	%3,0,%2\n"
		     "	bne-	1b\n"
		     "	isync\n"
		     "2:"
		     : "=&r" (tmp), "=&r" (old)
		     : "r" (lock), "r" (token)
		     : "cc", "memory");
	return old == 0;
}

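/*
 * Core of the H_REMOVE hypercall: lock the HPTE, check that it matches
 * the AVPN/ANDCOND conditions, invalidate it in the HPT and the TLB,
 * and unlink it from the reverse-map chain.  The previous contents of
 * the HPTE (V and guest-view R doublewords) are returned via hpret[].
 */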
long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
			unsigned long pte_index, unsigned long avpn,
			unsigned long *hpret)
{
	unsigned long *hpte;
	unsigned long v, r, rb;
	struct revmap_entry *rev;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
		hpte[0] &= ~HPTE_V_HVLOCK;
		return H_NOT_FOUND;
	}

	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	v = hpte[0] & ~HPTE_V_HVLOCK;
	if (v & HPTE_V_VALID) {
		hpte[0] &= ~HPTE_V_VALID;
		rb = compute_tlbie_rb(v, hpte[1], pte_index);
		if (global_invalidates(kvm, flags)) {
			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
				     : : "r" (rb), "r" (kvm->arch.lpid));
			asm volatile("ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			asm volatile("tlbiel %0" : : "r" (rb));
			asm volatile("ptesync" : : : "memory");
		}
		/* Read PTE low word after tlbie to get final R/C values */
		remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
	}
	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
	note_hpte_modification(kvm, rev);
	unlock_hpte(hpte, 0);

	hpret[0] = v;
	hpret[1] = r;
	return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);

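/* H_REMOVE hypercall entry point; returns the old HPTE in guest GPR4/GPR5 */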
long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
		     unsigned long pte_index, unsigned long avpn)
{
	return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
				  &vcpu->arch.gpr[4]);
}

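/*
 * H_BULK_REMOVE hypercall: process up to four remove requests passed in
 * guest GPRs 4-11 (each request is a flags/index word plus an AVPN word).
 * Matching HPTEs are invalidated and their TLB entries flushed in batches
 * to amortize the cost of the tlbie/tlbsync sequence; the return code for
 * each request is written back into its request word.
 */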
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *args = &vcpu->arch.gpr[4];
	unsigned long *hp, *hptes[4], tlbrb[4];
	long int i, j, k, n, found, indexes[4];
	unsigned long flags, req, pte_index, rcbits;
	long int local = 0;
	long int ret = H_SUCCESS;
	struct revmap_entry *rev, *revs[4];

	if (atomic_read(&kvm->online_vcpus) == 1)
		local = 1;
	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
		n = 0;
		for (; i < 4; ++i) {
			j = i * 2;
			pte_index = args[j];
			flags = pte_index >> 56;
			pte_index &= ((1ul << 56) - 1);
			req = flags >> 6;
			flags &= 3;
			if (req == 3) {		/* no more requests */
				i = 4;
				break;
			}
			if (req != 1 || flags == 3 ||
			    pte_index >= kvm->arch.hpt_npte) {
				/* parameter error */
				args[j] = ((0xa0 | flags) << 56) + pte_index;
				ret = H_PARAMETER;
				break;
			}
			hp = (unsigned long *)
				(kvm->arch.hpt_virt + (pte_index << 4));
			/* to avoid deadlock, don't spin except for first */
			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
				if (n)
					break;
				while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
					cpu_relax();
			}
			found = 0;
			if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) {
				switch (flags & 3) {
				case 0:		/* absolute */
					found = 1;
					break;
				case 1:		/* andcond */
					if (!(hp[0] & args[j + 1]))
						found = 1;
					break;
				case 2:		/* AVPN */
					if ((hp[0] & ~0x7fUL) == args[j + 1])
						found = 1;
					break;
				}
			}
			if (!found) {
				hp[0] &= ~HPTE_V_HVLOCK;
				args[j] = ((0x90 | flags) << 56) + pte_index;
				continue;
			}

			args[j] = ((0x80 | flags) << 56) + pte_index;
			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
			note_hpte_modification(kvm, rev);

			if (!(hp[0] & HPTE_V_VALID)) {
				/* insert R and C bits from PTE */
				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
				args[j] |= rcbits << (56 - 5);
				hp[0] = 0;
				continue;
			}

			hp[0] &= ~HPTE_V_VALID;		/* leave it locked */
			tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index);
			indexes[n] = j;
			hptes[n] = hp;
			revs[n] = rev;
			++n;
		}

		if (!n)
			break;

		/* Now that we've collected a batch, do the tlbies */
		if (!local) {
			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			for (k = 0; k < n; ++k)
				asm volatile(PPC_TLBIE(%1,%0) : :
					     "r" (tlbrb[k]),
					     "r" (kvm->arch.lpid));
			asm volatile("eieio; tlbsync; ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			for (k = 0; k < n; ++k)
				asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
			asm volatile("ptesync" : : : "memory");
		}

		/* Read PTE low words after tlbie to get final R/C values */
		for (k = 0; k < n; ++k) {
			j = indexes[k];
			pte_index = args[j] & ((1ul << 56) - 1);
			hp = hptes[k];
			rev = revs[k];
			remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]);
			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
			args[j] |= rcbits << (56 - 5);
			hp[0] = 0;
		}
	}

	return ret;
}

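/*
 * H_PROTECT hypercall: update the protection (pp/N/key) bits of an
 * existing HPTE.  The guest's view in the revmap entry is updated,
 * the TLB entry is invalidated if the HPTE was valid, and the HPTE
 * may be downgraded to read-only if the host has the backing page
 * mapped read-only.
 */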
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
		      unsigned long pte_index, unsigned long avpn,
		      unsigned long va)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hpte;
	struct revmap_entry *rev;
	unsigned long v, r, rb, mask, bits;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;

	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
		hpte[0] &= ~HPTE_V_HVLOCK;
		return H_NOT_FOUND;
	}

	v = hpte[0];
	bits = (flags << 55) & HPTE_R_PP0;
	bits |= (flags << 48) & HPTE_R_KEY_HI;
	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);

	/* Update guest view of 2nd HPTE dword */
	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
		HPTE_R_KEY_HI | HPTE_R_KEY_LO;
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	if (rev) {
		r = (rev->guest_rpte & ~mask) | bits;
		rev->guest_rpte = r;
		note_hpte_modification(kvm, rev);
	}
	r = (hpte[1] & ~mask) | bits;

	/* Update HPTE */
	if (v & HPTE_V_VALID) {
		rb = compute_tlbie_rb(v, r, pte_index);
		hpte[0] = v & ~HPTE_V_VALID;
		if (global_invalidates(kvm, flags)) {
			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
				     : : "r" (rb), "r" (kvm->arch.lpid));
			asm volatile("ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			asm volatile("tlbiel %0" : : "r" (rb));
			asm volatile("ptesync" : : : "memory");
		}
		/*
		 * If the host has this page as readonly but the guest
		 * wants to make it read/write, reduce the permissions.
		 * Checking the host permissions involves finding the
		 * memslot and then the Linux PTE for the page.
		 */
		if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
			unsigned long psize, gfn, hva;
			struct kvm_memory_slot *memslot;
			pgd_t *pgdir = vcpu->arch.pgdir;
			pte_t pte;

			psize = hpte_page_size(v, r);
			gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
			memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
			if (memslot) {
				hva = __gfn_to_hva_memslot(memslot, gfn);
				pte = lookup_linux_pte(pgdir, hva, 1, &psize);
				if (pte_present(pte) && !pte_write(pte))
					r = hpte_make_readonly(r);
			}
		}
	}
	hpte[1] = r;
	eieio();
	hpte[0] = v & ~HPTE_V_HVLOCK;
	asm volatile("ptesync" : : : "memory");
	return H_SUCCESS;
}

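/*
 * H_READ hypercall: return the contents of one HPTE (or four, with
 * H_READ_4) in guest GPRs 4 and up.  Absent (paged-out) entries are
 * reported as valid, and the second doubleword is taken from the
 * guest's view in the revmap entry with the current R/C bits merged in.
 */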
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
		   unsigned long pte_index)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hpte, v, r;
	int i, n = 1;
	struct revmap_entry *rev = NULL;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	if (flags & H_READ_4) {
		pte_index &= ~3;
		n = 4;
	}
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	for (i = 0; i < n; ++i, ++pte_index) {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		v = hpte[0] & ~HPTE_V_HVLOCK;
		r = hpte[1];
		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
		}
		if (v & HPTE_V_VALID) {
			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
			r &= ~HPTE_GR_RESERVED;
		}
		vcpu->arch.gpr[4 + i * 2] = v;
		vcpu->arch.gpr[5 + i * 2] = r;
	}
	return H_SUCCESS;
}

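/*
 * Invalidate an HPTE in place: clear HPTE_V_VALID and do a global tlbie
 * for the corresponding virtual address under the tlbie lock.
 */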
void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
			unsigned long pte_index)
{
	unsigned long rb;

	hptep[0] &= ~HPTE_V_VALID;
	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
		cpu_relax();
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
		     : : "r" (rb), "r" (kvm->arch.lpid));
	asm volatile("ptesync" : : : "memory");
	kvm->arch.tlbie_lock = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);

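/*
 * Clear the reference (R) bit of an HPTE and flush the corresponding
 * TLB entry with a global tlbie, without disturbing the other bits.
 */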
void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
			   unsigned long pte_index)
{
	unsigned long rb;
	unsigned char rbyte;

	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
	rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
	/* modify only the second-last byte, which contains the ref bit */
	*((char *)hptep + 14) = rbyte;
	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
		cpu_relax();
	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
		     : : "r" (rb), "r" (kvm->arch.lpid));
	asm volatile("ptesync" : : : "memory");
	kvm->arch.tlbie_lock = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);

static int slb_base_page_shift[4] = {
	24,	/* 16M */
	16,	/* 64k */
	34,	/* 16G */
	20,	/* 1M, unsupported */
};

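/*
 * Search the HPT for an HPTE matching the given effective address and
 * SLB entry contents, trying the primary hash group and then the
 * secondary.  On a match the HPTE is left locked (HPTE_V_HVLOCK set)
 * and its index is returned; -1 is returned if no match is found.
 */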
long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
			      unsigned long valid)
{
	unsigned int i;
	unsigned int pshift;
	unsigned long somask;
	unsigned long vsid, hash;
	unsigned long avpn;
	unsigned long *hpte;
	unsigned long mask, val;
	unsigned long v, r;

	/* Get page shift, work out hash and AVPN etc. */
	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
	val = 0;
	pshift = 12;
	if (slb_v & SLB_VSID_L) {
		mask |= HPTE_V_LARGE;
		val |= HPTE_V_LARGE;
		pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
	}
	if (slb_v & SLB_VSID_B_1T) {
		somask = (1UL << 40) - 1;
		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
		vsid ^= vsid << 25;
	} else {
		somask = (1UL << 28) - 1;
		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
	}
	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
	avpn = slb_v & ~(somask >> 16);	/* also includes B */
	avpn |= (eaddr & somask) >> 16;

	if (pshift >= 24)
		avpn &= ~((1UL << (pshift - 16)) - 1);
	else
		avpn &= ~0x7fUL;
	val |= avpn;

	for (;;) {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));

		for (i = 0; i < 16; i += 2) {
			/* Read the PTE racily */
			v = hpte[i] & ~HPTE_V_HVLOCK;

			/* Check valid/absent, hash, segment size and AVPN */
			if (!(v & valid) || (v & mask) != val)
				continue;

			/* Lock the PTE and read it under the lock */
			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
				cpu_relax();
			v = hpte[i] & ~HPTE_V_HVLOCK;
			r = hpte[i+1];

			/*
			 * Check the HPTE again, including the large page size.
			 * Since we don't currently allow any MPSS (mixed
			 * page-size segment) page sizes, it is sufficient
			 * to check against the actual page size.
			 */
			if ((v & valid) && (v & mask) == val &&
			    hpte_page_size(v, r) == (1ul << pshift))
				/* Return with the HPTE still locked */
				return (hash << 3) + (i >> 1);

			/* Unlock and move on */
			hpte[i] = v;
		}

		if (val & HPTE_V_SECONDARY)
			break;
		val |= HPTE_V_SECONDARY;
		hash = hash ^ kvm->arch.hpt_mask;
	}
	return -1;
}
EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);

/*
 * Called in real mode to check whether an HPTE not found fault
 * is due to accessing a paged-out page or an emulated MMIO page,
 * or if a protection fault is due to accessing a page that the
 * guest wanted read/write access to but which we made read-only.
 * Returns a possibly modified status (DSISR) value if none of these
 * apply (i.e. pass the interrupt to the guest),
 * -1 to pass the fault up to host kernel mode code, -2 to do that
 * and also load the instruction word (for MMIO emulation),
 * or 0 if we should make the guest retry the access.
 */
long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			  unsigned long slb_v, unsigned int status, bool data)
{
	struct kvm *kvm = vcpu->kvm;
	long int index;
	unsigned long v, r, gr;
	unsigned long *hpte;
	unsigned long valid;
	struct revmap_entry *rev;
	unsigned long pp, key;

	/* For protection fault, expect to find a valid HPTE */
	valid = HPTE_V_VALID;
	if (status & DSISR_NOHPTE)
		valid |= HPTE_V_ABSENT;

	index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
	if (index < 0) {
		if (status & DSISR_NOHPTE)
			return status;	/* there really was no HPTE */
		return 0;		/* for prot fault, HPTE disappeared */
	}
	hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	v = hpte[0] & ~HPTE_V_HVLOCK;
	r = hpte[1];
	rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
	gr = rev->guest_rpte;

	unlock_hpte(hpte, v);

	/* For not found, if the HPTE is valid by now, retry the instruction */
	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
		return 0;

	/* Check access permissions to the page */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	status &= ~DSISR_NOHPTE;	/* DSISR_NOHPTE == SRR1_ISI_NOPT */
	if (!data) {
		if (gr & (HPTE_R_N | HPTE_R_G))
			return status | SRR1_ISI_N_OR_G;
		if (!hpte_read_permission(pp, slb_v & key))
			return status | SRR1_ISI_PROT;
	} else if (status & DSISR_ISSTORE) {
		/* check write permission */
		if (!hpte_write_permission(pp, slb_v & key))
			return status | DSISR_PROTFAULT;
	} else {
		if (!hpte_read_permission(pp, slb_v & key))
			return status | DSISR_PROTFAULT;
	}

	/* Check storage key, if applicable */
	if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
		unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (status & DSISR_ISSTORE)
			perm >>= 1;
		if (perm & 1)
			return status | DSISR_KEYFAULT;
	}

	/* Save HPTE info for virtual-mode handler */
	vcpu->arch.pgfault_addr = addr;
	vcpu->arch.pgfault_index = index;
	vcpu->arch.pgfault_hpte[0] = v;
	vcpu->arch.pgfault_hpte[1] = r;

	/* Check the storage key to see if it is possibly emulated MMIO */
	if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
	    (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
		return -2;	/* MMIO emulation - load instr word */

	return -1;		/* send fault up to host kernel mode */
}