xref: /openbmc/linux/arch/powerpc/kvm/book3s_hv.c (revision 232b0b08)
1 /*
2  * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
3  * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
4  *
5  * Authors:
6  *    Paul Mackerras <paulus@au1.ibm.com>
7  *    Alexander Graf <agraf@suse.de>
8  *    Kevin Wolf <mail@kevin-wolf.de>
9  *
10  * Description: KVM functions specific to running on Book 3S
11  * processors in hypervisor mode (specifically POWER7 and later).
12  *
13  * This file is derived from arch/powerpc/kvm/book3s.c,
14  * by Alexander Graf <agraf@suse.de>.
15  *
16  * This program is free software; you can redistribute it and/or modify
17  * it under the terms of the GNU General Public License, version 2, as
18  * published by the Free Software Foundation.
19  */
20 
21 #include <linux/kvm_host.h>
22 #include <linux/err.h>
23 #include <linux/slab.h>
24 #include <linux/preempt.h>
25 #include <linux/sched/signal.h>
26 #include <linux/sched/stat.h>
27 #include <linux/delay.h>
28 #include <linux/export.h>
29 #include <linux/fs.h>
30 #include <linux/anon_inodes.h>
31 #include <linux/cpu.h>
32 #include <linux/cpumask.h>
33 #include <linux/spinlock.h>
34 #include <linux/page-flags.h>
35 #include <linux/srcu.h>
36 #include <linux/miscdevice.h>
37 #include <linux/debugfs.h>
38 
39 #include <asm/reg.h>
40 #include <asm/cputable.h>
41 #include <asm/cacheflush.h>
42 #include <asm/tlbflush.h>
43 #include <linux/uaccess.h>
44 #include <asm/io.h>
45 #include <asm/kvm_ppc.h>
46 #include <asm/kvm_book3s.h>
47 #include <asm/mmu_context.h>
48 #include <asm/lppaca.h>
49 #include <asm/processor.h>
50 #include <asm/cputhreads.h>
51 #include <asm/page.h>
52 #include <asm/hvcall.h>
53 #include <asm/switch_to.h>
54 #include <asm/smp.h>
55 #include <asm/dbell.h>
56 #include <asm/hmi.h>
57 #include <asm/pnv-pci.h>
58 #include <asm/mmu.h>
59 #include <asm/opal.h>
60 #include <asm/xics.h>
61 #include <linux/gfp.h>
62 #include <linux/vmalloc.h>
63 #include <linux/highmem.h>
64 #include <linux/hugetlb.h>
65 #include <linux/kvm_irqfd.h>
66 #include <linux/irqbypass.h>
67 #include <linux/module.h>
68 #include <linux/compiler.h>
69 #include <linux/of.h>
70 
71 #include "book3s.h"
72 
73 #define CREATE_TRACE_POINTS
74 #include "trace_hv.h"
75 
76 /* #define EXIT_DEBUG */
77 /* #define EXIT_DEBUG_SIMPLE */
78 /* #define EXIT_DEBUG_INT */
79 
80 /* Used to indicate that a guest page fault needs to be handled */
81 #define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
82 /* Used to indicate that a guest passthrough interrupt needs to be handled */
83 #define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)
84 
85 /* Used as a "null" value for timebase values */
86 #define TB_NIL	(~(u64)0)
87 
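/*
 * Bitmap of hcalls enabled by default for new VMs.  Hcall numbers are
 * multiples of 4, hence the division by 4 to get one bit per hcall.
 */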
88 static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
89 
90 static int dynamic_mt_modes = 6;
91 module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR);
92 MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
93 static int target_smt_mode;
94 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
95 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
96 
97 #ifdef CONFIG_KVM_XICS
98 static struct kernel_param_ops module_param_ops = {
99 	.set = param_set_int,
100 	.get = param_get_int,
101 };
102 
103 module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
104 							S_IRUGO | S_IWUSR);
105 MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
106 
107 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
108 							S_IRUGO | S_IWUSR);
109 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
110 #endif
111 
112 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
113 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
114 
115 static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
116 		int *ip)
117 {
118 	int i = *ip;
119 	struct kvm_vcpu *vcpu;
120 
121 	while (++i < MAX_SMT_THREADS) {
122 		vcpu = READ_ONCE(vc->runnable_threads[i]);
123 		if (vcpu) {
124 			*ip = i;
125 			return vcpu;
126 		}
127 	}
128 	return NULL;
129 }
130 
131 /* Used to traverse the list of runnable threads for a given vcore */
132 #define for_each_runnable_thread(i, vcpu, vc) \
133 	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
134 
135 static bool kvmppc_ipi_thread(int cpu)
136 {
137 	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
138 
139 	/* On POWER9 we can use msgsnd to IPI any cpu */
140 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
141 		msg |= get_hard_smp_processor_id(cpu);
142 		smp_mb();
143 		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
144 		return true;
145 	}
146 
147 	/* On POWER8, use msgsnd for IPIs to threads in the same core */
148 	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
149 		preempt_disable();
150 		if (cpu_first_thread_sibling(cpu) ==
151 		    cpu_first_thread_sibling(smp_processor_id())) {
152 			msg |= cpu_thread_in_core(cpu);
153 			smp_mb();
154 			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
155 			preempt_enable();
156 			return true;
157 		}
158 		preempt_enable();
159 	}
160 
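	/*
	 * Otherwise fall back to the interrupt controller: poke the target
	 * thread's ICP directly if we know its address, else ask OPAL to
	 * set its MFRR.
	 */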
161 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
162 	if (cpu >= 0 && cpu < nr_cpu_ids) {
163 		if (paca[cpu].kvm_hstate.xics_phys) {
164 			xics_wake_cpu(cpu);
165 			return true;
166 		}
167 		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
168 		return true;
169 	}
170 #endif
171 
172 	return false;
173 }
174 
175 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
176 {
177 	int cpu;
178 	struct swait_queue_head *wqp;
179 
180 	wqp = kvm_arch_vcpu_wq(vcpu);
181 	if (swait_active(wqp)) {
182 		swake_up(wqp);
183 		++vcpu->stat.halt_wakeup;
184 	}
185 
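	/*
	 * If the vcpu is running in the guest, thread_cpu is the physical
	 * thread it is on; IPI that thread directly to get its attention.
	 */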
186 	cpu = READ_ONCE(vcpu->arch.thread_cpu);
187 	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
188 		return;
189 
190 	/* CPU points to the first thread of the core */
191 	cpu = vcpu->cpu;
192 	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
193 		smp_send_reschedule(cpu);
194 }
195 
196 /*
197  * We use the vcpu_load/put functions to measure stolen time.
198  * Stolen time is counted as time when either the vcpu is able to
199  * run as part of a virtual core, but the task running the vcore
200  * is preempted or sleeping, or when the vcpu needs something done
201  * in the kernel by the task running the vcpu, but that task is
202  * preempted or sleeping.  Those two things have to be counted
203  * separately, since one of the vcpu tasks will take on the job
204  * of running the core, and the other vcpu tasks in the vcore will
205  * sleep waiting for it to do that, but that sleep shouldn't count
206  * as stolen time.
207  *
208  * Hence we accumulate stolen time when the vcpu can run as part of
209  * a vcore using vc->stolen_tb, and the stolen time when the vcpu
210  * needs its task to do other things in the kernel (for example,
211  * service a page fault) in busy_stolen.  We don't accumulate
212  * stolen time for a vcore when it is inactive, or for a vcpu
213  * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
214  * a misnomer; it means that the vcpu task is not executing in
215  * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
216  * the kernel.  We don't have any way of dividing up that time
217  * between time that the vcpu is genuinely stopped, time that
218  * the task is actively working on behalf of the vcpu, and time
219  * that the task is preempted, so we don't count any of it as
220  * stolen.
221  *
222  * Updates to busy_stolen are protected by arch.tbacct_lock;
223  * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
224  * lock.  The stolen times are measured in units of timebase ticks.
225  * (Note that the != TB_NIL checks below are purely defensive;
226  * they should never fail.)
227  */
228 
229 static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
230 {
231 	unsigned long flags;
232 
233 	spin_lock_irqsave(&vc->stoltb_lock, flags);
234 	vc->preempt_tb = mftb();
235 	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
236 }
237 
238 static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
239 {
240 	unsigned long flags;
241 
242 	spin_lock_irqsave(&vc->stoltb_lock, flags);
243 	if (vc->preempt_tb != TB_NIL) {
244 		vc->stolen_tb += mftb() - vc->preempt_tb;
245 		vc->preempt_tb = TB_NIL;
246 	}
247 	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
248 }
249 
250 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
251 {
252 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
253 	unsigned long flags;
254 
255 	/*
256 	 * We can test vc->runner without taking the vcore lock,
257 	 * because only this task ever sets vc->runner to this
258 	 * vcpu, and once it is set to this vcpu, only this task
259 	 * ever sets it to NULL.
260 	 */
261 	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
262 		kvmppc_core_end_stolen(vc);
263 
264 	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
265 	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
266 	    vcpu->arch.busy_preempt != TB_NIL) {
267 		vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
268 		vcpu->arch.busy_preempt = TB_NIL;
269 	}
270 	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
271 }
272 
273 static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
274 {
275 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
276 	unsigned long flags;
277 
278 	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
279 		kvmppc_core_start_stolen(vc);
280 
281 	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
282 	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
283 		vcpu->arch.busy_preempt = mftb();
284 	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
285 }
286 
287 static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
288 {
289 	/*
290 	 * Check for illegal transactional state bit combination
291 	 * and if we find it, force the TS field to a safe state.
292 	 */
293 	if ((msr & MSR_TS_MASK) == MSR_TS_MASK)
294 		msr &= ~MSR_TS_MASK;
295 	vcpu->arch.shregs.msr = msr;
296 	kvmppc_end_cede(vcpu);
297 }
298 
299 static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
300 {
301 	vcpu->arch.pvr = pvr;
302 }
303 
304 /* Dummy value used in computing PCR value below */
305 #define PCR_ARCH_300	(PCR_ARCH_207 << 1)
306 
307 static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
308 {
309 	unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
310 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
311 
312 	/* We can (emulate) our own architecture version and anything older */
313 	if (cpu_has_feature(CPU_FTR_ARCH_300))
314 		host_pcr_bit = PCR_ARCH_300;
315 	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
316 		host_pcr_bit = PCR_ARCH_207;
317 	else if (cpu_has_feature(CPU_FTR_ARCH_206))
318 		host_pcr_bit = PCR_ARCH_206;
319 	else
320 		host_pcr_bit = PCR_ARCH_205;
321 
322 	/* Determine lowest PCR bit needed to run guest in given PVR level */
323 	guest_pcr_bit = host_pcr_bit;
324 	if (arch_compat) {
325 		switch (arch_compat) {
326 		case PVR_ARCH_205:
327 			guest_pcr_bit = PCR_ARCH_205;
328 			break;
329 		case PVR_ARCH_206:
330 		case PVR_ARCH_206p:
331 			guest_pcr_bit = PCR_ARCH_206;
332 			break;
333 		case PVR_ARCH_207:
334 			guest_pcr_bit = PCR_ARCH_207;
335 			break;
336 		case PVR_ARCH_300:
337 			guest_pcr_bit = PCR_ARCH_300;
338 			break;
339 		default:
340 			return -EINVAL;
341 		}
342 	}
343 
344 	/* Check requested PCR bits don't exceed our capabilities */
345 	if (guest_pcr_bit > host_pcr_bit)
346 		return -EINVAL;
347 
348 	spin_lock(&vc->lock);
349 	vc->arch_compat = arch_compat;
350 	/* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
351 	vc->pcr = host_pcr_bit - guest_pcr_bit;
352 	spin_unlock(&vc->lock);
353 
354 	return 0;
355 }
356 
357 static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
358 {
359 	int r;
360 
361 	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
362 	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
363 	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
364 	for (r = 0; r < 16; ++r)
365 		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
366 		       r, kvmppc_get_gpr(vcpu, r),
367 		       r+16, kvmppc_get_gpr(vcpu, r+16));
368 	pr_err("ctr = %.16lx  lr  = %.16lx\n",
369 	       vcpu->arch.ctr, vcpu->arch.lr);
370 	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
371 	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
372 	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
373 	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
374 	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
375 	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
376 	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
377 	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
378 	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
379 	pr_err("fault dar = %.16lx dsisr = %.8x\n",
380 	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
381 	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
382 	for (r = 0; r < vcpu->arch.slb_max; ++r)
383 		pr_err("  ESID = %.16llx VSID = %.16llx\n",
384 		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
385 	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
386 	       vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
387 	       vcpu->arch.last_inst);
388 }
389 
390 static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
391 {
392 	struct kvm_vcpu *ret;
393 
394 	mutex_lock(&kvm->lock);
395 	ret = kvm_get_vcpu_by_id(kvm, id);
396 	mutex_unlock(&kvm->lock);
397 	return ret;
398 }
399 
400 static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
401 {
402 	vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
403 	vpa->yield_count = cpu_to_be32(1);
404 }
405 
406 static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
407 		   unsigned long addr, unsigned long len)
408 {
409 	/* check address is cacheline aligned */
410 	if (addr & (L1_CACHE_BYTES - 1))
411 		return -EINVAL;
412 	spin_lock(&vcpu->arch.vpa_update_lock);
413 	if (v->next_gpa != addr || v->len != len) {
414 		v->next_gpa = addr;
415 		v->len = addr ? len : 0;
416 		v->update_pending = 1;
417 	}
418 	spin_unlock(&vcpu->arch.vpa_update_lock);
419 	return 0;
420 }
421 
422 /* Length for a per-processor buffer is passed in at offset 4 in the buffer */
423 struct reg_vpa {
424 	u32 dummy;
425 	union {
426 		__be16 hword;
427 		__be32 word;
428 	} length;
429 };
430 
431 static int vpa_is_registered(struct kvmppc_vpa *vpap)
432 {
433 	if (vpap->update_pending)
434 		return vpap->next_gpa != 0;
435 	return vpap->pinned_addr != NULL;
436 }
437 
438 static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
439 				       unsigned long flags,
440 				       unsigned long vcpuid, unsigned long vpa)
441 {
442 	struct kvm *kvm = vcpu->kvm;
443 	unsigned long len, nb;
444 	void *va;
445 	struct kvm_vcpu *tvcpu;
446 	int err;
447 	int subfunc;
448 	struct kvmppc_vpa *vpap;
449 
450 	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
451 	if (!tvcpu)
452 		return H_PARAMETER;
453 
454 	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
455 	if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
456 	    subfunc == H_VPA_REG_SLB) {
457 		/* Registering new area - address must be cache-line aligned */
458 		if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
459 			return H_PARAMETER;
460 
461 		/* convert logical addr to kernel addr and read length */
462 		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
463 		if (va == NULL)
464 			return H_PARAMETER;
465 		if (subfunc == H_VPA_REG_VPA)
466 			len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
467 		else
468 			len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
469 		kvmppc_unpin_guest_page(kvm, va, vpa, false);
470 
471 		/* Check length */
472 		if (len > nb || len < sizeof(struct reg_vpa))
473 			return H_PARAMETER;
474 	} else {
475 		vpa = 0;
476 		len = 0;
477 	}
478 
479 	err = H_PARAMETER;
480 	vpap = NULL;
481 	spin_lock(&tvcpu->arch.vpa_update_lock);
482 
483 	switch (subfunc) {
484 	case H_VPA_REG_VPA:		/* register VPA */
485 		if (len < sizeof(struct lppaca))
486 			break;
487 		vpap = &tvcpu->arch.vpa;
488 		err = 0;
489 		break;
490 
491 	case H_VPA_REG_DTL:		/* register DTL */
492 		if (len < sizeof(struct dtl_entry))
493 			break;
494 		len -= len % sizeof(struct dtl_entry);
495 
496 		/* Check that they have previously registered a VPA */
497 		err = H_RESOURCE;
498 		if (!vpa_is_registered(&tvcpu->arch.vpa))
499 			break;
500 
501 		vpap = &tvcpu->arch.dtl;
502 		err = 0;
503 		break;
504 
505 	case H_VPA_REG_SLB:		/* register SLB shadow buffer */
506 		/* Check that they have previously registered a VPA */
507 		err = H_RESOURCE;
508 		if (!vpa_is_registered(&tvcpu->arch.vpa))
509 			break;
510 
511 		vpap = &tvcpu->arch.slb_shadow;
512 		err = 0;
513 		break;
514 
515 	case H_VPA_DEREG_VPA:		/* deregister VPA */
516 		/* Check they don't still have a DTL or SLB buf registered */
517 		err = H_RESOURCE;
518 		if (vpa_is_registered(&tvcpu->arch.dtl) ||
519 		    vpa_is_registered(&tvcpu->arch.slb_shadow))
520 			break;
521 
522 		vpap = &tvcpu->arch.vpa;
523 		err = 0;
524 		break;
525 
526 	case H_VPA_DEREG_DTL:		/* deregister DTL */
527 		vpap = &tvcpu->arch.dtl;
528 		err = 0;
529 		break;
530 
531 	case H_VPA_DEREG_SLB:		/* deregister SLB shadow buffer */
532 		vpap = &tvcpu->arch.slb_shadow;
533 		err = 0;
534 		break;
535 	}
536 
537 	if (vpap) {
538 		vpap->next_gpa = vpa;
539 		vpap->len = len;
540 		vpap->update_pending = 1;
541 	}
542 
543 	spin_unlock(&tvcpu->arch.vpa_update_lock);
544 
545 	return err;
546 }
547 
548 static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
549 {
550 	struct kvm *kvm = vcpu->kvm;
551 	void *va;
552 	unsigned long nb;
553 	unsigned long gpa;
554 
555 	/*
556 	 * We need to pin the page pointed to by vpap->next_gpa,
557 	 * but we can't call kvmppc_pin_guest_page under the lock
558 	 * as it does get_user_pages() and down_read().  So we
559 	 * have to drop the lock, pin the page, then get the lock
560 	 * again and check that a new area didn't get registered
561 	 * in the meantime.
562 	 */
563 	for (;;) {
564 		gpa = vpap->next_gpa;
565 		spin_unlock(&vcpu->arch.vpa_update_lock);
566 		va = NULL;
567 		nb = 0;
568 		if (gpa)
569 			va = kvmppc_pin_guest_page(kvm, gpa, &nb);
570 		spin_lock(&vcpu->arch.vpa_update_lock);
571 		if (gpa == vpap->next_gpa)
572 			break;
573 		/* sigh... unpin that one and try again */
574 		if (va)
575 			kvmppc_unpin_guest_page(kvm, va, gpa, false);
576 	}
577 
578 	vpap->update_pending = 0;
579 	if (va && nb < vpap->len) {
580 		/*
581 		 * If it's now too short, it must be that userspace
582 		 * has changed the mappings underlying guest memory,
583 		 * so unregister the region.
584 		 */
585 		kvmppc_unpin_guest_page(kvm, va, gpa, false);
586 		va = NULL;
587 	}
588 	if (vpap->pinned_addr)
589 		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
590 					vpap->dirty);
591 	vpap->gpa = gpa;
592 	vpap->pinned_addr = va;
593 	vpap->dirty = false;
594 	if (va)
595 		vpap->pinned_end = va + vpap->len;
596 }
597 
598 static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
599 {
600 	if (!(vcpu->arch.vpa.update_pending ||
601 	      vcpu->arch.slb_shadow.update_pending ||
602 	      vcpu->arch.dtl.update_pending))
603 		return;
604 
605 	spin_lock(&vcpu->arch.vpa_update_lock);
606 	if (vcpu->arch.vpa.update_pending) {
607 		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
608 		if (vcpu->arch.vpa.pinned_addr)
609 			init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
610 	}
611 	if (vcpu->arch.dtl.update_pending) {
612 		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
613 		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
614 		vcpu->arch.dtl_index = 0;
615 	}
616 	if (vcpu->arch.slb_shadow.update_pending)
617 		kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
618 	spin_unlock(&vcpu->arch.vpa_update_lock);
619 }
620 
621 /*
622  * Return the accumulated stolen time for the vcore up until `now'.
623  * The caller should hold the vcore lock.
624  */
625 static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
626 {
627 	u64 p;
628 	unsigned long flags;
629 
630 	spin_lock_irqsave(&vc->stoltb_lock, flags);
631 	p = vc->stolen_tb;
632 	if (vc->vcore_state != VCORE_INACTIVE &&
633 	    vc->preempt_tb != TB_NIL)
634 		p += now - vc->preempt_tb;
635 	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
636 	return p;
637 }
638 
639 static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
640 				    struct kvmppc_vcore *vc)
641 {
642 	struct dtl_entry *dt;
643 	struct lppaca *vpa;
644 	unsigned long stolen;
645 	unsigned long core_stolen;
646 	u64 now;
647 
648 	dt = vcpu->arch.dtl_ptr;
649 	vpa = vcpu->arch.vpa.pinned_addr;
650 	now = mftb();
651 	core_stolen = vcore_stolen_time(vc, now);
652 	stolen = core_stolen - vcpu->arch.stolen_logged;
653 	vcpu->arch.stolen_logged = core_stolen;
654 	spin_lock_irq(&vcpu->arch.tbacct_lock);
655 	stolen += vcpu->arch.busy_stolen;
656 	vcpu->arch.busy_stolen = 0;
657 	spin_unlock_irq(&vcpu->arch.tbacct_lock);
658 	if (!dt || !vpa)
659 		return;
660 	memset(dt, 0, sizeof(struct dtl_entry));
661 	dt->dispatch_reason = 7;
662 	dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid);
663 	dt->timebase = cpu_to_be64(now + vc->tb_offset);
664 	dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
665 	dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
666 	dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
667 	++dt;
668 	if (dt == vcpu->arch.dtl.pinned_end)
669 		dt = vcpu->arch.dtl.pinned_addr;
670 	vcpu->arch.dtl_ptr = dt;
671 	/* order writing *dt vs. writing vpa->dtl_idx */
672 	smp_wmb();
673 	vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
674 	vcpu->arch.dtl.dirty = true;
675 }
676 
677 static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
678 {
679 	if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
680 		return true;
681 	if ((!vcpu->arch.vcore->arch_compat) &&
682 	    cpu_has_feature(CPU_FTR_ARCH_207S))
683 		return true;
684 	return false;
685 }
686 
687 static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
688 			     unsigned long resource, unsigned long value1,
689 			     unsigned long value2)
690 {
691 	switch (resource) {
692 	case H_SET_MODE_RESOURCE_SET_CIABR:
693 		if (!kvmppc_power8_compatible(vcpu))
694 			return H_P2;
695 		if (value2)
696 			return H_P4;
697 		if (mflags)
698 			return H_UNSUPPORTED_FLAG_START;
699 		/* Guests can't breakpoint the hypervisor */
700 		if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
701 			return H_P3;
702 		vcpu->arch.ciabr  = value1;
703 		return H_SUCCESS;
704 	case H_SET_MODE_RESOURCE_SET_DAWR:
705 		if (!kvmppc_power8_compatible(vcpu))
706 			return H_P2;
707 		if (mflags)
708 			return H_UNSUPPORTED_FLAG_START;
709 		if (value2 & DABRX_HYP)
710 			return H_P4;
711 		vcpu->arch.dawr  = value1;
712 		vcpu->arch.dawrx = value2;
713 		return H_SUCCESS;
714 	default:
715 		return H_TOO_HARD;
716 	}
717 }
718 
719 static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
720 {
721 	struct kvmppc_vcore *vcore = target->arch.vcore;
722 
723 	/*
724 	 * We expect to have been called by the real mode handler
725 	 * (kvmppc_rm_h_confer()) which would have directly returned
726 	 * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
727 	 * have useful work to do and should not confer) so we don't
728 	 * recheck that here.
729 	 */
730 
731 	spin_lock(&vcore->lock);
732 	if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
733 	    vcore->vcore_state != VCORE_INACTIVE &&
734 	    vcore->runner)
735 		target = vcore->runner;
736 	spin_unlock(&vcore->lock);
737 
738 	return kvm_vcpu_yield_to(target);
739 }
740 
741 static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
742 {
743 	int yield_count = 0;
744 	struct lppaca *lppaca;
745 
746 	spin_lock(&vcpu->arch.vpa_update_lock);
747 	lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
748 	if (lppaca)
749 		yield_count = be32_to_cpu(lppaca->yield_count);
750 	spin_unlock(&vcpu->arch.vpa_update_lock);
751 	return yield_count;
752 }
753 
754 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
755 {
756 	unsigned long req = kvmppc_get_gpr(vcpu, 3);
757 	unsigned long target, ret = H_SUCCESS;
758 	int yield_count;
759 	struct kvm_vcpu *tvcpu;
760 	int idx, rc;
761 
762 	if (req <= MAX_HCALL_OPCODE &&
763 	    !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
764 		return RESUME_HOST;
765 
766 	switch (req) {
767 	case H_CEDE:
768 		break;
769 	case H_PROD:
770 		target = kvmppc_get_gpr(vcpu, 4);
771 		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
772 		if (!tvcpu) {
773 			ret = H_PARAMETER;
774 			break;
775 		}
776 		tvcpu->arch.prodded = 1;
777 		smp_mb();
778 		if (tvcpu->arch.ceded)
779 			kvmppc_fast_vcpu_kick_hv(tvcpu);
780 		break;
781 	case H_CONFER:
782 		target = kvmppc_get_gpr(vcpu, 4);
783 		if (target == -1)
784 			break;
785 		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
786 		if (!tvcpu) {
787 			ret = H_PARAMETER;
788 			break;
789 		}
790 		yield_count = kvmppc_get_gpr(vcpu, 5);
791 		if (kvmppc_get_yield_count(tvcpu) != yield_count)
792 			break;
793 		kvm_arch_vcpu_yield_to(tvcpu);
794 		break;
795 	case H_REGISTER_VPA:
796 		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
797 					kvmppc_get_gpr(vcpu, 5),
798 					kvmppc_get_gpr(vcpu, 6));
799 		break;
800 	case H_RTAS:
801 		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
802 			return RESUME_HOST;
803 
804 		idx = srcu_read_lock(&vcpu->kvm->srcu);
805 		rc = kvmppc_rtas_hcall(vcpu);
806 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
807 
808 		if (rc == -ENOENT)
809 			return RESUME_HOST;
810 		else if (rc == 0)
811 			break;
812 
813 		/* Send the error out to userspace via KVM_RUN */
814 		return rc;
815 	case H_LOGICAL_CI_LOAD:
816 		ret = kvmppc_h_logical_ci_load(vcpu);
817 		if (ret == H_TOO_HARD)
818 			return RESUME_HOST;
819 		break;
820 	case H_LOGICAL_CI_STORE:
821 		ret = kvmppc_h_logical_ci_store(vcpu);
822 		if (ret == H_TOO_HARD)
823 			return RESUME_HOST;
824 		break;
825 	case H_SET_MODE:
826 		ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
827 					kvmppc_get_gpr(vcpu, 5),
828 					kvmppc_get_gpr(vcpu, 6),
829 					kvmppc_get_gpr(vcpu, 7));
830 		if (ret == H_TOO_HARD)
831 			return RESUME_HOST;
832 		break;
833 	case H_XIRR:
834 	case H_CPPR:
835 	case H_EOI:
836 	case H_IPI:
837 	case H_IPOLL:
838 	case H_XIRR_X:
839 		if (kvmppc_xics_enabled(vcpu)) {
840 			ret = kvmppc_xics_hcall(vcpu, req);
841 			break;
842 		}
843 		return RESUME_HOST;
844 	case H_PUT_TCE:
845 		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
846 						kvmppc_get_gpr(vcpu, 5),
847 						kvmppc_get_gpr(vcpu, 6));
848 		if (ret == H_TOO_HARD)
849 			return RESUME_HOST;
850 		break;
851 	case H_PUT_TCE_INDIRECT:
852 		ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
853 						kvmppc_get_gpr(vcpu, 5),
854 						kvmppc_get_gpr(vcpu, 6),
855 						kvmppc_get_gpr(vcpu, 7));
856 		if (ret == H_TOO_HARD)
857 			return RESUME_HOST;
858 		break;
859 	case H_STUFF_TCE:
860 		ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
861 						kvmppc_get_gpr(vcpu, 5),
862 						kvmppc_get_gpr(vcpu, 6),
863 						kvmppc_get_gpr(vcpu, 7));
864 		if (ret == H_TOO_HARD)
865 			return RESUME_HOST;
866 		break;
867 	default:
868 		return RESUME_HOST;
869 	}
870 	kvmppc_set_gpr(vcpu, 3, ret);
871 	vcpu->arch.hcall_needed = 0;
872 	return RESUME_GUEST;
873 }
874 
875 static int kvmppc_hcall_impl_hv(unsigned long cmd)
876 {
877 	switch (cmd) {
878 	case H_CEDE:
879 	case H_PROD:
880 	case H_CONFER:
881 	case H_REGISTER_VPA:
882 	case H_SET_MODE:
883 	case H_LOGICAL_CI_LOAD:
884 	case H_LOGICAL_CI_STORE:
885 #ifdef CONFIG_KVM_XICS
886 	case H_XIRR:
887 	case H_CPPR:
888 	case H_EOI:
889 	case H_IPI:
890 	case H_IPOLL:
891 	case H_XIRR_X:
892 #endif
893 		return 1;
894 	}
895 
896 	/* See if it's in the real-mode table */
897 	return kvmppc_hcall_impl_hv_realmode(cmd);
898 }
899 
900 static int kvmppc_emulate_debug_inst(struct kvm_run *run,
901 					struct kvm_vcpu *vcpu)
902 {
903 	u32 last_inst;
904 
905 	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
906 					EMULATE_DONE) {
907 		/*
908 		 * Fetch failed, so return to guest and
909 		 * try executing it again.
910 		 */
911 		return RESUME_GUEST;
912 	}
913 
914 	if (last_inst == KVMPPC_INST_SW_BREAKPOINT) {
915 		run->exit_reason = KVM_EXIT_DEBUG;
916 		run->debug.arch.address = kvmppc_get_pc(vcpu);
917 		return RESUME_HOST;
918 	} else {
919 		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
920 		return RESUME_GUEST;
921 	}
922 }
923 
924 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
925 				 struct task_struct *tsk)
926 {
927 	int r = RESUME_HOST;
928 
929 	vcpu->stat.sum_exits++;
930 
931 	/*
932 	 * This can happen if an interrupt occurs in the last stages
933 	 * of guest entry or the first stages of guest exit (i.e. after
934 	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
935 	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
936 	 * That can happen due to a bug, or due to a machine check
937 	 * occurring at just the wrong time.
938 	 */
939 	if (vcpu->arch.shregs.msr & MSR_HV) {
940 		printk(KERN_EMERG "KVM trap in HV mode!\n");
941 		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
942 			vcpu->arch.trap, kvmppc_get_pc(vcpu),
943 			vcpu->arch.shregs.msr);
944 		kvmppc_dump_regs(vcpu);
945 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
946 		run->hw.hardware_exit_reason = vcpu->arch.trap;
947 		return RESUME_HOST;
948 	}
949 	run->exit_reason = KVM_EXIT_UNKNOWN;
950 	run->ready_for_interrupt_injection = 1;
951 	switch (vcpu->arch.trap) {
952 	/* We're good on these - the host merely wanted to get our attention */
953 	case BOOK3S_INTERRUPT_HV_DECREMENTER:
954 		vcpu->stat.dec_exits++;
955 		r = RESUME_GUEST;
956 		break;
957 	case BOOK3S_INTERRUPT_EXTERNAL:
958 	case BOOK3S_INTERRUPT_H_DOORBELL:
959 	case BOOK3S_INTERRUPT_H_VIRT:
960 		vcpu->stat.ext_intr_exits++;
961 		r = RESUME_GUEST;
962 		break;
963 	/* HMI is a hypervisor interrupt and the host has handled it. Resume guest. */
964 	case BOOK3S_INTERRUPT_HMI:
965 	case BOOK3S_INTERRUPT_PERFMON:
966 		r = RESUME_GUEST;
967 		break;
968 	case BOOK3S_INTERRUPT_MACHINE_CHECK:
969 		/*
970 		 * Deliver a machine check interrupt to the guest.
971 		 * We have to do this, even if the host has handled the
972 		 * machine check, because machine checks use SRR0/1 and
973 		 * the interrupt might have trashed guest state in them.
974 		 */
975 		kvmppc_book3s_queue_irqprio(vcpu,
976 					    BOOK3S_INTERRUPT_MACHINE_CHECK);
977 		r = RESUME_GUEST;
978 		break;
979 	case BOOK3S_INTERRUPT_PROGRAM:
980 	{
981 		ulong flags;
982 		/*
983 		 * Normally program interrupts are delivered directly
984 		 * to the guest by the hardware, but we can get here
985 		 * as a result of a hypervisor emulation interrupt
986 		 * (e40) getting turned into a 700 by BML RTAS.
987 		 */
988 		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
989 		kvmppc_core_queue_program(vcpu, flags);
990 		r = RESUME_GUEST;
991 		break;
992 	}
993 	case BOOK3S_INTERRUPT_SYSCALL:
994 	{
995 		/* hcall - punt to userspace */
996 		int i;
997 
998 		/* hypercall with MSR_PR has already been handled in rmode,
999 		 * and never reaches here.
1000 		 */
1001 
1002 		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
1003 		for (i = 0; i < 9; ++i)
1004 			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
1005 		run->exit_reason = KVM_EXIT_PAPR_HCALL;
1006 		vcpu->arch.hcall_needed = 1;
1007 		r = RESUME_HOST;
1008 		break;
1009 	}
1010 	/*
1011 	 * We get these next two if the guest accesses a page which it thinks
1012 	 * it has mapped but which is not actually present, either because
1013  * it is for an emulated I/O device or because the corresponding
1014 	 * host page has been paged out.  Any other HDSI/HISI interrupts
1015 	 * have been handled already.
1016 	 */
1017 	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
1018 		r = RESUME_PAGE_FAULT;
1019 		break;
1020 	case BOOK3S_INTERRUPT_H_INST_STORAGE:
1021 		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1022 		vcpu->arch.fault_dsisr = 0;
1023 		r = RESUME_PAGE_FAULT;
1024 		break;
1025 	/*
1026 	 * This occurs if the guest executes an illegal instruction.
1027 	 * If the guest debug is disabled, generate a program interrupt
1028 	 * to the guest. If guest debug is enabled, we need to check
1029 	 * whether the instruction is a software breakpoint instruction.
1030 	 * Accordingly return to Guest or Host.
1031 	 */
1032 	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
1033 		if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
1034 			vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
1035 				swab32(vcpu->arch.emul_inst) :
1036 				vcpu->arch.emul_inst;
1037 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
1038 			r = kvmppc_emulate_debug_inst(run, vcpu);
1039 		} else {
1040 			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
1041 			r = RESUME_GUEST;
1042 		}
1043 		break;
1044 	/*
1045  * This occurs if the guest (kernel or userspace) does something that
1046 	 * is prohibited by HFSCR.  We just generate a program interrupt to
1047 	 * the guest.
1048 	 */
1049 	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
1050 		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
1051 		r = RESUME_GUEST;
1052 		break;
1053 	case BOOK3S_INTERRUPT_HV_RM_HARD:
1054 		r = RESUME_PASSTHROUGH;
1055 		break;
1056 	default:
1057 		kvmppc_dump_regs(vcpu);
1058 		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
1059 			vcpu->arch.trap, kvmppc_get_pc(vcpu),
1060 			vcpu->arch.shregs.msr);
1061 		run->hw.hardware_exit_reason = vcpu->arch.trap;
1062 		r = RESUME_HOST;
1063 		break;
1064 	}
1065 
1066 	return r;
1067 }
1068 
1069 static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
1070 					    struct kvm_sregs *sregs)
1071 {
1072 	int i;
1073 
1074 	memset(sregs, 0, sizeof(struct kvm_sregs));
1075 	sregs->pvr = vcpu->arch.pvr;
1076 	for (i = 0; i < vcpu->arch.slb_max; i++) {
1077 		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
1078 		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
1079 	}
1080 
1081 	return 0;
1082 }
1083 
1084 static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
1085 					    struct kvm_sregs *sregs)
1086 {
1087 	int i, j;
1088 
1089 	/* Only accept the same PVR as the host's, since we can't spoof it */
1090 	if (sregs->pvr != vcpu->arch.pvr)
1091 		return -EINVAL;
1092 
1093 	j = 0;
1094 	for (i = 0; i < vcpu->arch.slb_nr; i++) {
1095 		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
1096 			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
1097 			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
1098 			++j;
1099 		}
1100 	}
1101 	vcpu->arch.slb_max = j;
1102 
1103 	return 0;
1104 }
1105 
1106 static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
1107 		bool preserve_top32)
1108 {
1109 	struct kvm *kvm = vcpu->kvm;
1110 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
1111 	u64 mask;
1112 
1113 	mutex_lock(&kvm->lock);
1114 	spin_lock(&vc->lock);
1115 	/*
1116 	 * If ILE (interrupt little-endian) has changed, update the
1117 	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
1118 	 */
1119 	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
1120 		struct kvm_vcpu *vcpu;
1121 		int i;
1122 
1123 		kvm_for_each_vcpu(i, vcpu, kvm) {
1124 			if (vcpu->arch.vcore != vc)
1125 				continue;
1126 			if (new_lpcr & LPCR_ILE)
1127 				vcpu->arch.intr_msr |= MSR_LE;
1128 			else
1129 				vcpu->arch.intr_msr &= ~MSR_LE;
1130 		}
1131 	}
1132 
1133 	/*
1134 	 * Userspace can only modify DPFD (default prefetch depth),
1135 	 * ILE (interrupt little-endian) and TC (translation control).
1136 	 * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.).
1137 	 */
1138 	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
1139 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
1140 		mask |= LPCR_AIL;
1141 
1142 	/* Broken 32-bit version of LPCR must not clear top bits */
1143 	if (preserve_top32)
1144 		mask &= 0xFFFFFFFF;
1145 	vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
1146 	spin_unlock(&vc->lock);
1147 	mutex_unlock(&kvm->lock);
1148 }
1149 
1150 static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
1151 				 union kvmppc_one_reg *val)
1152 {
1153 	int r = 0;
1154 	long int i;
1155 
1156 	switch (id) {
1157 	case KVM_REG_PPC_DEBUG_INST:
1158 		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
1159 		break;
1160 	case KVM_REG_PPC_HIOR:
1161 		*val = get_reg_val(id, 0);
1162 		break;
1163 	case KVM_REG_PPC_DABR:
1164 		*val = get_reg_val(id, vcpu->arch.dabr);
1165 		break;
1166 	case KVM_REG_PPC_DABRX:
1167 		*val = get_reg_val(id, vcpu->arch.dabrx);
1168 		break;
1169 	case KVM_REG_PPC_DSCR:
1170 		*val = get_reg_val(id, vcpu->arch.dscr);
1171 		break;
1172 	case KVM_REG_PPC_PURR:
1173 		*val = get_reg_val(id, vcpu->arch.purr);
1174 		break;
1175 	case KVM_REG_PPC_SPURR:
1176 		*val = get_reg_val(id, vcpu->arch.spurr);
1177 		break;
1178 	case KVM_REG_PPC_AMR:
1179 		*val = get_reg_val(id, vcpu->arch.amr);
1180 		break;
1181 	case KVM_REG_PPC_UAMOR:
1182 		*val = get_reg_val(id, vcpu->arch.uamor);
1183 		break;
1184 	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
1185 		i = id - KVM_REG_PPC_MMCR0;
1186 		*val = get_reg_val(id, vcpu->arch.mmcr[i]);
1187 		break;
1188 	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
1189 		i = id - KVM_REG_PPC_PMC1;
1190 		*val = get_reg_val(id, vcpu->arch.pmc[i]);
1191 		break;
1192 	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
1193 		i = id - KVM_REG_PPC_SPMC1;
1194 		*val = get_reg_val(id, vcpu->arch.spmc[i]);
1195 		break;
1196 	case KVM_REG_PPC_SIAR:
1197 		*val = get_reg_val(id, vcpu->arch.siar);
1198 		break;
1199 	case KVM_REG_PPC_SDAR:
1200 		*val = get_reg_val(id, vcpu->arch.sdar);
1201 		break;
1202 	case KVM_REG_PPC_SIER:
1203 		*val = get_reg_val(id, vcpu->arch.sier);
1204 		break;
1205 	case KVM_REG_PPC_IAMR:
1206 		*val = get_reg_val(id, vcpu->arch.iamr);
1207 		break;
1208 	case KVM_REG_PPC_PSPB:
1209 		*val = get_reg_val(id, vcpu->arch.pspb);
1210 		break;
1211 	case KVM_REG_PPC_DPDES:
1212 		*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
1213 		break;
1214 	case KVM_REG_PPC_VTB:
1215 		*val = get_reg_val(id, vcpu->arch.vcore->vtb);
1216 		break;
1217 	case KVM_REG_PPC_DAWR:
1218 		*val = get_reg_val(id, vcpu->arch.dawr);
1219 		break;
1220 	case KVM_REG_PPC_DAWRX:
1221 		*val = get_reg_val(id, vcpu->arch.dawrx);
1222 		break;
1223 	case KVM_REG_PPC_CIABR:
1224 		*val = get_reg_val(id, vcpu->arch.ciabr);
1225 		break;
1226 	case KVM_REG_PPC_CSIGR:
1227 		*val = get_reg_val(id, vcpu->arch.csigr);
1228 		break;
1229 	case KVM_REG_PPC_TACR:
1230 		*val = get_reg_val(id, vcpu->arch.tacr);
1231 		break;
1232 	case KVM_REG_PPC_TCSCR:
1233 		*val = get_reg_val(id, vcpu->arch.tcscr);
1234 		break;
1235 	case KVM_REG_PPC_PID:
1236 		*val = get_reg_val(id, vcpu->arch.pid);
1237 		break;
1238 	case KVM_REG_PPC_ACOP:
1239 		*val = get_reg_val(id, vcpu->arch.acop);
1240 		break;
1241 	case KVM_REG_PPC_WORT:
1242 		*val = get_reg_val(id, vcpu->arch.wort);
1243 		break;
1244 	case KVM_REG_PPC_TIDR:
1245 		*val = get_reg_val(id, vcpu->arch.tid);
1246 		break;
1247 	case KVM_REG_PPC_PSSCR:
1248 		*val = get_reg_val(id, vcpu->arch.psscr);
1249 		break;
1250 	case KVM_REG_PPC_VPA_ADDR:
1251 		spin_lock(&vcpu->arch.vpa_update_lock);
1252 		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
1253 		spin_unlock(&vcpu->arch.vpa_update_lock);
1254 		break;
1255 	case KVM_REG_PPC_VPA_SLB:
1256 		spin_lock(&vcpu->arch.vpa_update_lock);
1257 		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
1258 		val->vpaval.length = vcpu->arch.slb_shadow.len;
1259 		spin_unlock(&vcpu->arch.vpa_update_lock);
1260 		break;
1261 	case KVM_REG_PPC_VPA_DTL:
1262 		spin_lock(&vcpu->arch.vpa_update_lock);
1263 		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
1264 		val->vpaval.length = vcpu->arch.dtl.len;
1265 		spin_unlock(&vcpu->arch.vpa_update_lock);
1266 		break;
1267 	case KVM_REG_PPC_TB_OFFSET:
1268 		*val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
1269 		break;
1270 	case KVM_REG_PPC_LPCR:
1271 	case KVM_REG_PPC_LPCR_64:
1272 		*val = get_reg_val(id, vcpu->arch.vcore->lpcr);
1273 		break;
1274 	case KVM_REG_PPC_PPR:
1275 		*val = get_reg_val(id, vcpu->arch.ppr);
1276 		break;
1277 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1278 	case KVM_REG_PPC_TFHAR:
1279 		*val = get_reg_val(id, vcpu->arch.tfhar);
1280 		break;
1281 	case KVM_REG_PPC_TFIAR:
1282 		*val = get_reg_val(id, vcpu->arch.tfiar);
1283 		break;
1284 	case KVM_REG_PPC_TEXASR:
1285 		*val = get_reg_val(id, vcpu->arch.texasr);
1286 		break;
1287 	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
1288 		i = id - KVM_REG_PPC_TM_GPR0;
1289 		*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
1290 		break;
1291 	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
1292 	{
1293 		int j;
1294 		i = id - KVM_REG_PPC_TM_VSR0;
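		/* VSRs 0-31 alias the FP registers, 32-63 the Altivec VRs */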
1295 		if (i < 32)
1296 			for (j = 0; j < TS_FPRWIDTH; j++)
1297 				val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
1298 		else {
1299 			if (cpu_has_feature(CPU_FTR_ALTIVEC))
1300 				val->vval = vcpu->arch.vr_tm.vr[i-32];
1301 			else
1302 				r = -ENXIO;
1303 		}
1304 		break;
1305 	}
1306 	case KVM_REG_PPC_TM_CR:
1307 		*val = get_reg_val(id, vcpu->arch.cr_tm);
1308 		break;
1309 	case KVM_REG_PPC_TM_XER:
1310 		*val = get_reg_val(id, vcpu->arch.xer_tm);
1311 		break;
1312 	case KVM_REG_PPC_TM_LR:
1313 		*val = get_reg_val(id, vcpu->arch.lr_tm);
1314 		break;
1315 	case KVM_REG_PPC_TM_CTR:
1316 		*val = get_reg_val(id, vcpu->arch.ctr_tm);
1317 		break;
1318 	case KVM_REG_PPC_TM_FPSCR:
1319 		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
1320 		break;
1321 	case KVM_REG_PPC_TM_AMR:
1322 		*val = get_reg_val(id, vcpu->arch.amr_tm);
1323 		break;
1324 	case KVM_REG_PPC_TM_PPR:
1325 		*val = get_reg_val(id, vcpu->arch.ppr_tm);
1326 		break;
1327 	case KVM_REG_PPC_TM_VRSAVE:
1328 		*val = get_reg_val(id, vcpu->arch.vrsave_tm);
1329 		break;
1330 	case KVM_REG_PPC_TM_VSCR:
1331 		if (cpu_has_feature(CPU_FTR_ALTIVEC))
1332 			*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
1333 		else
1334 			r = -ENXIO;
1335 		break;
1336 	case KVM_REG_PPC_TM_DSCR:
1337 		*val = get_reg_val(id, vcpu->arch.dscr_tm);
1338 		break;
1339 	case KVM_REG_PPC_TM_TAR:
1340 		*val = get_reg_val(id, vcpu->arch.tar_tm);
1341 		break;
1342 #endif
1343 	case KVM_REG_PPC_ARCH_COMPAT:
1344 		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
1345 		break;
1346 	default:
1347 		r = -EINVAL;
1348 		break;
1349 	}
1350 
1351 	return r;
1352 }
1353 
1354 static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
1355 				 union kvmppc_one_reg *val)
1356 {
1357 	int r = 0;
1358 	long int i;
1359 	unsigned long addr, len;
1360 
1361 	switch (id) {
1362 	case KVM_REG_PPC_HIOR:
1363 		/* Only allow this to be set to zero */
1364 		if (set_reg_val(id, *val))
1365 			r = -EINVAL;
1366 		break;
1367 	case KVM_REG_PPC_DABR:
1368 		vcpu->arch.dabr = set_reg_val(id, *val);
1369 		break;
1370 	case KVM_REG_PPC_DABRX:
1371 		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
1372 		break;
1373 	case KVM_REG_PPC_DSCR:
1374 		vcpu->arch.dscr = set_reg_val(id, *val);
1375 		break;
1376 	case KVM_REG_PPC_PURR:
1377 		vcpu->arch.purr = set_reg_val(id, *val);
1378 		break;
1379 	case KVM_REG_PPC_SPURR:
1380 		vcpu->arch.spurr = set_reg_val(id, *val);
1381 		break;
1382 	case KVM_REG_PPC_AMR:
1383 		vcpu->arch.amr = set_reg_val(id, *val);
1384 		break;
1385 	case KVM_REG_PPC_UAMOR:
1386 		vcpu->arch.uamor = set_reg_val(id, *val);
1387 		break;
1388 	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
1389 		i = id - KVM_REG_PPC_MMCR0;
1390 		vcpu->arch.mmcr[i] = set_reg_val(id, *val);
1391 		break;
1392 	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
1393 		i = id - KVM_REG_PPC_PMC1;
1394 		vcpu->arch.pmc[i] = set_reg_val(id, *val);
1395 		break;
1396 	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
1397 		i = id - KVM_REG_PPC_SPMC1;
1398 		vcpu->arch.spmc[i] = set_reg_val(id, *val);
1399 		break;
1400 	case KVM_REG_PPC_SIAR:
1401 		vcpu->arch.siar = set_reg_val(id, *val);
1402 		break;
1403 	case KVM_REG_PPC_SDAR:
1404 		vcpu->arch.sdar = set_reg_val(id, *val);
1405 		break;
1406 	case KVM_REG_PPC_SIER:
1407 		vcpu->arch.sier = set_reg_val(id, *val);
1408 		break;
1409 	case KVM_REG_PPC_IAMR:
1410 		vcpu->arch.iamr = set_reg_val(id, *val);
1411 		break;
1412 	case KVM_REG_PPC_PSPB:
1413 		vcpu->arch.pspb = set_reg_val(id, *val);
1414 		break;
1415 	case KVM_REG_PPC_DPDES:
1416 		vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
1417 		break;
1418 	case KVM_REG_PPC_VTB:
1419 		vcpu->arch.vcore->vtb = set_reg_val(id, *val);
1420 		break;
1421 	case KVM_REG_PPC_DAWR:
1422 		vcpu->arch.dawr = set_reg_val(id, *val);
1423 		break;
1424 	case KVM_REG_PPC_DAWRX:
1425 		vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
1426 		break;
1427 	case KVM_REG_PPC_CIABR:
1428 		vcpu->arch.ciabr = set_reg_val(id, *val);
1429 		/* Don't allow setting breakpoints in hypervisor code */
1430 		if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
1431 			vcpu->arch.ciabr &= ~CIABR_PRIV;	/* disable */
1432 		break;
1433 	case KVM_REG_PPC_CSIGR:
1434 		vcpu->arch.csigr = set_reg_val(id, *val);
1435 		break;
1436 	case KVM_REG_PPC_TACR:
1437 		vcpu->arch.tacr = set_reg_val(id, *val);
1438 		break;
1439 	case KVM_REG_PPC_TCSCR:
1440 		vcpu->arch.tcscr = set_reg_val(id, *val);
1441 		break;
1442 	case KVM_REG_PPC_PID:
1443 		vcpu->arch.pid = set_reg_val(id, *val);
1444 		break;
1445 	case KVM_REG_PPC_ACOP:
1446 		vcpu->arch.acop = set_reg_val(id, *val);
1447 		break;
1448 	case KVM_REG_PPC_WORT:
1449 		vcpu->arch.wort = set_reg_val(id, *val);
1450 		break;
1451 	case KVM_REG_PPC_TIDR:
1452 		vcpu->arch.tid = set_reg_val(id, *val);
1453 		break;
1454 	case KVM_REG_PPC_PSSCR:
1455 		vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
1456 		break;
1457 	case KVM_REG_PPC_VPA_ADDR:
1458 		addr = set_reg_val(id, *val);
1459 		r = -EINVAL;
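		/* Can't unregister the VPA while a DTL or SLB shadow buffer is registered */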
1460 		if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
1461 			      vcpu->arch.dtl.next_gpa))
1462 			break;
1463 		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
1464 		break;
1465 	case KVM_REG_PPC_VPA_SLB:
1466 		addr = val->vpaval.addr;
1467 		len = val->vpaval.length;
1468 		r = -EINVAL;
1469 		if (addr && !vcpu->arch.vpa.next_gpa)
1470 			break;
1471 		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
1472 		break;
1473 	case KVM_REG_PPC_VPA_DTL:
1474 		addr = val->vpaval.addr;
1475 		len = val->vpaval.length;
1476 		r = -EINVAL;
1477 		if (addr && (len < sizeof(struct dtl_entry) ||
1478 			     !vcpu->arch.vpa.next_gpa))
1479 			break;
1480 		len -= len % sizeof(struct dtl_entry);
1481 		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
1482 		break;
1483 	case KVM_REG_PPC_TB_OFFSET:
1484 		/* round up to multiple of 2^24 */
1485 		vcpu->arch.vcore->tb_offset =
1486 			ALIGN(set_reg_val(id, *val), 1UL << 24);
1487 		break;
1488 	case KVM_REG_PPC_LPCR:
1489 		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
1490 		break;
1491 	case KVM_REG_PPC_LPCR_64:
1492 		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
1493 		break;
1494 	case KVM_REG_PPC_PPR:
1495 		vcpu->arch.ppr = set_reg_val(id, *val);
1496 		break;
1497 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1498 	case KVM_REG_PPC_TFHAR:
1499 		vcpu->arch.tfhar = set_reg_val(id, *val);
1500 		break;
1501 	case KVM_REG_PPC_TFIAR:
1502 		vcpu->arch.tfiar = set_reg_val(id, *val);
1503 		break;
1504 	case KVM_REG_PPC_TEXASR:
1505 		vcpu->arch.texasr = set_reg_val(id, *val);
1506 		break;
1507 	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
1508 		i = id - KVM_REG_PPC_TM_GPR0;
1509 		vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
1510 		break;
1511 	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
1512 	{
1513 		int j;
1514 		i = id - KVM_REG_PPC_TM_VSR0;
1515 		if (i < 32)
1516 			for (j = 0; j < TS_FPRWIDTH; j++)
1517 				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
1518 		else
1519 			if (cpu_has_feature(CPU_FTR_ALTIVEC))
1520 				vcpu->arch.vr_tm.vr[i-32] = val->vval;
1521 			else
1522 				r = -ENXIO;
1523 		break;
1524 	}
1525 	case KVM_REG_PPC_TM_CR:
1526 		vcpu->arch.cr_tm = set_reg_val(id, *val);
1527 		break;
1528 	case KVM_REG_PPC_TM_XER:
1529 		vcpu->arch.xer_tm = set_reg_val(id, *val);
1530 		break;
1531 	case KVM_REG_PPC_TM_LR:
1532 		vcpu->arch.lr_tm = set_reg_val(id, *val);
1533 		break;
1534 	case KVM_REG_PPC_TM_CTR:
1535 		vcpu->arch.ctr_tm = set_reg_val(id, *val);
1536 		break;
1537 	case KVM_REG_PPC_TM_FPSCR:
1538 		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
1539 		break;
1540 	case KVM_REG_PPC_TM_AMR:
1541 		vcpu->arch.amr_tm = set_reg_val(id, *val);
1542 		break;
1543 	case KVM_REG_PPC_TM_PPR:
1544 		vcpu->arch.ppr_tm = set_reg_val(id, *val);
1545 		break;
1546 	case KVM_REG_PPC_TM_VRSAVE:
1547 		vcpu->arch.vrsave_tm = set_reg_val(id, *val);
1548 		break;
1549 	case KVM_REG_PPC_TM_VSCR:
1550 		if (cpu_has_feature(CPU_FTR_ALTIVEC))
1551 			vcpu->arch.vr_tm.vscr.u[3] = set_reg_val(id, *val);
1552 		else
1553 			r = -ENXIO;
1554 		break;
1555 	case KVM_REG_PPC_TM_DSCR:
1556 		vcpu->arch.dscr_tm = set_reg_val(id, *val);
1557 		break;
1558 	case KVM_REG_PPC_TM_TAR:
1559 		vcpu->arch.tar_tm = set_reg_val(id, *val);
1560 		break;
1561 #endif
1562 	case KVM_REG_PPC_ARCH_COMPAT:
1563 		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
1564 		break;
1565 	default:
1566 		r = -EINVAL;
1567 		break;
1568 	}
1569 
1570 	return r;
1571 }
1572 
1573 /*
1574  * On POWER9, threads are independent and can be in different partitions.
1575  * Therefore we consider each thread to be a subcore.
1576  * There is a restriction that all threads have to be in the same
1577  * MMU mode (radix or HPT), unfortunately, but since we only support
1578  * HPT guests on an HPT host so far, that isn't an impediment yet.
1579  */
1580 static int threads_per_vcore(void)
1581 {
1582 	if (cpu_has_feature(CPU_FTR_ARCH_300))
1583 		return 1;
1584 	return threads_per_subcore;
1585 }
1586 
1587 static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1588 {
1589 	struct kvmppc_vcore *vcore;
1590 
1591 	vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
1592 
1593 	if (vcore == NULL)
1594 		return NULL;
1595 
1596 	spin_lock_init(&vcore->lock);
1597 	spin_lock_init(&vcore->stoltb_lock);
1598 	init_swait_queue_head(&vcore->wq);
1599 	vcore->preempt_tb = TB_NIL;
1600 	vcore->lpcr = kvm->arch.lpcr;
1601 	vcore->first_vcpuid = core * threads_per_vcore();
1602 	vcore->kvm = kvm;
1603 	INIT_LIST_HEAD(&vcore->preempt_list);
1604 
1605 	return vcore;
1606 }
1607 
1608 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
1609 static struct debugfs_timings_element {
1610 	const char *name;
1611 	size_t offset;
1612 } timings[] = {
1613 	{"rm_entry",	offsetof(struct kvm_vcpu, arch.rm_entry)},
1614 	{"rm_intr",	offsetof(struct kvm_vcpu, arch.rm_intr)},
1615 	{"rm_exit",	offsetof(struct kvm_vcpu, arch.rm_exit)},
1616 	{"guest",	offsetof(struct kvm_vcpu, arch.guest_time)},
1617 	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
1618 };
1619 
1620 #define N_TIMINGS	(sizeof(timings) / sizeof(timings[0]))
1621 
1622 struct debugfs_timings_state {
1623 	struct kvm_vcpu	*vcpu;
1624 	unsigned int	buflen;
1625 	char		buf[N_TIMINGS * 100];
1626 };
1627 
1628 static int debugfs_timings_open(struct inode *inode, struct file *file)
1629 {
1630 	struct kvm_vcpu *vcpu = inode->i_private;
1631 	struct debugfs_timings_state *p;
1632 
1633 	p = kzalloc(sizeof(*p), GFP_KERNEL);
1634 	if (!p)
1635 		return -ENOMEM;
1636 
1637 	kvm_get_kvm(vcpu->kvm);
1638 	p->vcpu = vcpu;
1639 	file->private_data = p;
1640 
1641 	return nonseekable_open(inode, file);
1642 }
1643 
1644 static int debugfs_timings_release(struct inode *inode, struct file *file)
1645 {
1646 	struct debugfs_timings_state *p = file->private_data;
1647 
1648 	kvm_put_kvm(p->vcpu->kvm);
1649 	kfree(p);
1650 	return 0;
1651 }
1652 
1653 static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
1654 				    size_t len, loff_t *ppos)
1655 {
1656 	struct debugfs_timings_state *p = file->private_data;
1657 	struct kvm_vcpu *vcpu = p->vcpu;
1658 	char *s, *buf_end;
1659 	struct kvmhv_tb_accumulator tb;
1660 	u64 count;
1661 	loff_t pos;
1662 	ssize_t n;
1663 	int i, loops;
1664 	bool ok;
1665 
1666 	if (!p->buflen) {
1667 		s = p->buf;
1668 		buf_end = s + sizeof(p->buf);
1669 		for (i = 0; i < N_TIMINGS; ++i) {
1670 			struct kvmhv_tb_accumulator *acc;
1671 
1672 			acc = (struct kvmhv_tb_accumulator *)
1673 				((unsigned long)vcpu + timings[i].offset);
1674 			ok = false;
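			/*
			 * seqcount is odd while the accumulator is being
			 * updated; retry until we read a stable snapshot.
			 */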
1675 			for (loops = 0; loops < 1000; ++loops) {
1676 				count = acc->seqcount;
1677 				if (!(count & 1)) {
1678 					smp_rmb();
1679 					tb = *acc;
1680 					smp_rmb();
1681 					if (count == acc->seqcount) {
1682 						ok = true;
1683 						break;
1684 					}
1685 				}
1686 				udelay(1);
1687 			}
1688 			if (!ok)
1689 				snprintf(s, buf_end - s, "%s: stuck\n",
1690 					timings[i].name);
1691 			else
1692 				snprintf(s, buf_end - s,
1693 					"%s: %llu %llu %llu %llu\n",
1694 					timings[i].name, count / 2,
1695 					tb_to_ns(tb.tb_total),
1696 					tb_to_ns(tb.tb_min),
1697 					tb_to_ns(tb.tb_max));
1698 			s += strlen(s);
1699 		}
1700 		p->buflen = s - p->buf;
1701 	}
1702 
1703 	pos = *ppos;
1704 	if (pos >= p->buflen)
1705 		return 0;
1706 	if (len > p->buflen - pos)
1707 		len = p->buflen - pos;
1708 	n = copy_to_user(buf, p->buf + pos, len);
1709 	if (n) {
1710 		if (n == len)
1711 			return -EFAULT;
1712 		len -= n;
1713 	}
1714 	*ppos = pos + len;
1715 	return len;
1716 }
1717 
1718 static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
1719 				     size_t len, loff_t *ppos)
1720 {
1721 	return -EACCES;
1722 }
1723 
1724 static const struct file_operations debugfs_timings_ops = {
1725 	.owner	 = THIS_MODULE,
1726 	.open	 = debugfs_timings_open,
1727 	.release = debugfs_timings_release,
1728 	.read	 = debugfs_timings_read,
1729 	.write	 = debugfs_timings_write,
1730 	.llseek	 = generic_file_llseek,
1731 };
1732 
1733 /* Create a debugfs directory for the vcpu */
1734 static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
1735 {
1736 	char buf[16];
1737 	struct kvm *kvm = vcpu->kvm;
1738 
1739 	snprintf(buf, sizeof(buf), "vcpu%u", id);
1740 	if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
1741 		return;
1742 	vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
1743 	if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
1744 		return;
1745 	vcpu->arch.debugfs_timings =
1746 		debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
1747 				    vcpu, &debugfs_timings_ops);
1748 }
1749 
1750 #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
1751 static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
1752 {
1753 }
1754 #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
1755 
1756 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
1757 						   unsigned int id)
1758 {
1759 	struct kvm_vcpu *vcpu;
1760 	int err = -EINVAL;
1761 	int core;
1762 	struct kvmppc_vcore *vcore;
1763 
1764 	core = id / threads_per_vcore();
1765 	if (core >= KVM_MAX_VCORES)
1766 		goto out;
1767 
1768 	err = -ENOMEM;
1769 	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1770 	if (!vcpu)
1771 		goto out;
1772 
1773 	err = kvm_vcpu_init(vcpu, kvm, id);
1774 	if (err)
1775 		goto free_vcpu;
1776 
1777 	vcpu->arch.shared = &vcpu->arch.shregs;
1778 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
1779 	/*
1780 	 * The shared struct is never shared on HV,
1781 	 * so we can always use host endianness
1782 	 */
1783 #ifdef __BIG_ENDIAN__
1784 	vcpu->arch.shared_big_endian = true;
1785 #else
1786 	vcpu->arch.shared_big_endian = false;
1787 #endif
1788 #endif
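	/* Start the guest with the performance counters frozen */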
1789 	vcpu->arch.mmcr[0] = MMCR0_FC;
1790 	vcpu->arch.ctrl = CTRL_RUNLATCH;
1791 	/* default to host PVR, since we can't spoof it */
1792 	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
1793 	spin_lock_init(&vcpu->arch.vpa_update_lock);
1794 	spin_lock_init(&vcpu->arch.tbacct_lock);
1795 	vcpu->arch.busy_preempt = TB_NIL;
1796 	vcpu->arch.intr_msr = MSR_SF | MSR_ME;
1797 
1798 	kvmppc_mmu_book3s_hv_init(vcpu);
1799 
1800 	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
1801 
1802 	init_waitqueue_head(&vcpu->arch.cpu_run);
1803 
1804 	mutex_lock(&kvm->lock);
1805 	vcore = kvm->arch.vcores[core];
1806 	if (!vcore) {
1807 		vcore = kvmppc_vcore_create(kvm, core);
1808 		kvm->arch.vcores[core] = vcore;
1809 		kvm->arch.online_vcores++;
1810 	}
1811 	mutex_unlock(&kvm->lock);
1812 
1813 	if (!vcore)
1814 		goto free_vcpu;
1815 
1816 	spin_lock(&vcore->lock);
1817 	++vcore->num_threads;
1818 	spin_unlock(&vcore->lock);
1819 	vcpu->arch.vcore = vcore;
1820 	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
1821 	vcpu->arch.thread_cpu = -1;
1822 	vcpu->arch.prev_cpu = -1;
1823 
1824 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
1825 	kvmppc_sanity_check(vcpu);
1826 
1827 	debugfs_vcpu_init(vcpu, id);
1828 
1829 	return vcpu;
1830 
1831 free_vcpu:
1832 	kmem_cache_free(kvm_vcpu_cache, vcpu);
1833 out:
1834 	return ERR_PTR(err);
1835 }
1836 
1837 static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
1838 {
1839 	if (vpa->pinned_addr)
1840 		kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
1841 					vpa->dirty);
1842 }
1843 
1844 static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
1845 {
1846 	spin_lock(&vcpu->arch.vpa_update_lock);
1847 	unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
1848 	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
1849 	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
1850 	spin_unlock(&vcpu->arch.vpa_update_lock);
1851 	kvm_vcpu_uninit(vcpu);
1852 	kmem_cache_free(kvm_vcpu_cache, vcpu);
1853 }
1854 
1855 static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
1856 {
1857 	/* Indicate we want to get back into the guest */
1858 	return 1;
1859 }
1860 
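/*
 * Arm the vcpu's decrementer hrtimer to fire when the guest decrementer
 * is due to expire; if it has already expired, queue the decrementer
 * exception immediately instead.
 */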
1861 static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
1862 {
1863 	unsigned long dec_nsec, now;
1864 
1865 	now = get_tb();
1866 	if (now > vcpu->arch.dec_expires) {
1867 		/* decrementer has already gone negative */
1868 		kvmppc_core_queue_dec(vcpu);
1869 		kvmppc_core_prepare_to_enter(vcpu);
1870 		return;
1871 	}
1872 	dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
1873 		   / tb_ticks_per_sec;
1874 	hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
1875 	vcpu->arch.timer_running = 1;
1876 }
1877 
1878 static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
1879 {
1880 	vcpu->arch.ceded = 0;
1881 	if (vcpu->arch.timer_running) {
1882 		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
1883 		vcpu->arch.timer_running = 0;
1884 	}
1885 }
1886 
1887 extern void __kvmppc_vcore_entry(void);
1888 
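/*
 * Take a vcpu off the runnable list of its virtual core, accumulating
 * the stolen time logged while it was runnable and marking it as busy
 * in the host.
 */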
1889 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
1890 				   struct kvm_vcpu *vcpu)
1891 {
1892 	u64 now;
1893 
1894 	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
1895 		return;
1896 	spin_lock_irq(&vcpu->arch.tbacct_lock);
1897 	now = mftb();
1898 	vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
1899 		vcpu->arch.stolen_logged;
1900 	vcpu->arch.busy_preempt = now;
1901 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
1902 	spin_unlock_irq(&vcpu->arch.tbacct_lock);
1903 	--vc->n_runnable;
1904 	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
1905 }
1906 
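/*
 * Claim a secondary hardware thread for KVM's use: clear its KVM state
 * in the paca, set hwthread_req, and wait (briefly) for the thread to
 * get out of the kernel and back into nap mode if necessary.
 */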
1907 static int kvmppc_grab_hwthread(int cpu)
1908 {
1909 	struct paca_struct *tpaca;
1910 	long timeout = 10000;
1911 
1912 	tpaca = &paca[cpu];
1913 
1914 	/* Ensure the thread won't go into the kernel if it wakes */
1915 	tpaca->kvm_hstate.kvm_vcpu = NULL;
1916 	tpaca->kvm_hstate.kvm_vcore = NULL;
1917 	tpaca->kvm_hstate.napping = 0;
1918 	smp_wmb();
1919 	tpaca->kvm_hstate.hwthread_req = 1;
1920 
1921 	/*
1922 	 * If the thread is already executing in the kernel (e.g. handling
1923 	 * a stray interrupt), wait for it to get back to nap mode.
1924 	 * The smp_mb() is to ensure that our setting of hwthread_req
1925 	 * is visible before we look at hwthread_state, so if this
1926 	 * races with the code at system_reset_pSeries and the thread
1927 	 * misses our setting of hwthread_req, we are sure to see its
1928 	 * setting of hwthread_state, and vice versa.
1929 	 */
1930 	smp_mb();
1931 	while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
1932 		if (--timeout <= 0) {
1933 			pr_err("KVM: couldn't grab cpu %d\n", cpu);
1934 			return -EBUSY;
1935 		}
1936 		udelay(1);
1937 	}
1938 	return 0;
1939 }
1940 
1941 static void kvmppc_release_hwthread(int cpu)
1942 {
1943 	struct paca_struct *tpaca;
1944 
1945 	tpaca = &paca[cpu];
1946 	tpaca->kvm_hstate.hwthread_req = 0;
1947 	tpaca->kvm_hstate.kvm_vcpu = NULL;
1948 	tpaca->kvm_hstate.kvm_vcore = NULL;
1949 	tpaca->kvm_hstate.kvm_split_mode = NULL;
1950 }
1951 
1952 static void do_nothing(void *x)
1953 {
1954 }
1955 
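/*
 * Arrange for the guest's TLB entries to be flushed on the core that
 * 'cpu' belongs to: set that core's bit in need_tlb_flush, then
 * interrupt any of its threads that are currently in the guest so the
 * flush happens before they next enter the guest.
 */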
1956 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
1957 {
1958 	int i;
1959 
1960 	cpu = cpu_first_thread_sibling(cpu);
1961 	cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
1962 	/*
1963 	 * Make sure setting of bit in need_tlb_flush precedes
1964 	 * testing of cpu_in_guest bits.  The matching barrier on
1965 	 * the other side is the first smp_mb() in kvmppc_run_core().
1966 	 */
1967 	smp_mb();
1968 	for (i = 0; i < threads_per_core; ++i)
1969 		if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
1970 			smp_call_function_single(cpu + i, do_nothing, NULL, 1);
1971 }
1972 
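/*
 * Set up a hardware thread to run the given vcpu (or no vcpu, for an
 * idle primary thread of a subcore): record the vcpu and vcore in the
 * target thread's paca and send it an IPI unless it is the current cpu.
 * For radix guests, also request a TLB flush when the vcpu has moved to
 * a different core since it last ran.
 */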
1973 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
1974 {
1975 	int cpu;
1976 	struct paca_struct *tpaca;
1977 	struct kvmppc_vcore *mvc = vc->master_vcore;
1978 	struct kvm *kvm = vc->kvm;
1979 
1980 	cpu = vc->pcpu;
1981 	if (vcpu) {
1982 		if (vcpu->arch.timer_running) {
1983 			hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
1984 			vcpu->arch.timer_running = 0;
1985 		}
1986 		cpu += vcpu->arch.ptid;
1987 		vcpu->cpu = mvc->pcpu;
1988 		vcpu->arch.thread_cpu = cpu;
1989 
1990 		/*
1991 		 * With radix, the guest can do TLB invalidations itself,
1992 		 * and it could choose to use the local form (tlbiel) if
1993 		 * it is invalidating a translation that has only ever been
1994 		 * used on one vcpu.  However, that doesn't mean it has
1995 		 * only ever been used on one physical cpu, since vcpus
1996 		 * can move around between pcpus.  To cope with this, when
1997 		 * a vcpu moves from one pcpu to another, we need to tell
1998 		 * any vcpus running on the same core as this vcpu previously
1999 		 * ran to flush the TLB.  The TLB is shared between threads,
2000 		 * so we use a single bit in .need_tlb_flush for all 4 threads.
2001 		 */
2002 		if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
2003 			if (vcpu->arch.prev_cpu >= 0 &&
2004 			    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
2005 			    cpu_first_thread_sibling(cpu))
2006 				radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
2007 			vcpu->arch.prev_cpu = cpu;
2008 		}
2009 		cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
2010 	}
2011 	tpaca = &paca[cpu];
2012 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
2013 	tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
2014 	/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
2015 	smp_wmb();
2016 	tpaca->kvm_hstate.kvm_vcore = mvc;
2017 	if (cpu != smp_processor_id())
2018 		kvmppc_ipi_thread(cpu);
2019 }
2020 
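/*
 * Wait for the secondary hardware threads of this core to exit the
 * guest and clear their vcore pointers, warning about any that appear
 * to be stuck.
 */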
2021 static void kvmppc_wait_for_nap(void)
2022 {
2023 	int cpu = smp_processor_id();
2024 	int i, loops;
2025 	int n_threads = threads_per_vcore();
2026 
2027 	if (n_threads <= 1)
2028 		return;
2029 	for (loops = 0; loops < 1000000; ++loops) {
2030 		/*
2031 		 * Check if all threads are finished.
2032 		 * We set the vcore pointer when starting a thread
2033 		 * and the thread clears it when finished, so we look
2034 		 * for any threads that still have a non-NULL vcore ptr.
2035 		 */
2036 		for (i = 1; i < n_threads; ++i)
2037 			if (paca[cpu + i].kvm_hstate.kvm_vcore)
2038 				break;
2039 		if (i == n_threads) {
2040 			HMT_medium();
2041 			return;
2042 		}
2043 		HMT_low();
2044 	}
2045 	HMT_medium();
2046 	for (i = 1; i < n_threads; ++i)
2047 		if (paca[cpu + i].kvm_hstate.kvm_vcore)
2048 			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
2049 }
2050 
2051 /*
2052  * Check that we are on thread 0 and that any other threads in
2053  * this core are off-line.  Then grab the threads so they can't
2054  * enter the kernel.
2055  */
2056 static int on_primary_thread(void)
2057 {
2058 	int cpu = smp_processor_id();
2059 	int thr;
2060 
2061 	/* Are we on a primary subcore? */
2062 	if (cpu_thread_in_subcore(cpu))
2063 		return 0;
2064 
2065 	thr = 0;
2066 	while (++thr < threads_per_subcore)
2067 		if (cpu_online(cpu + thr))
2068 			return 0;
2069 
2070 	/* Grab all hw threads so they can't go into the kernel */
2071 	for (thr = 1; thr < threads_per_subcore; ++thr) {
2072 		if (kvmppc_grab_hwthread(cpu + thr)) {
2073 			/* Couldn't grab one; let the others go */
2074 			do {
2075 				kvmppc_release_hwthread(cpu + thr);
2076 			} while (--thr > 0);
2077 			return 0;
2078 		}
2079 	}
2080 	return 1;
2081 }
2082 
2083 /*
2084  * A list of virtual cores for each physical CPU.
2085  * These are vcores that could run but their runner VCPU tasks are
2086  * (or may be) preempted.
2087  */
2088 struct preempted_vcore_list {
2089 	struct list_head	list;
2090 	spinlock_t		lock;
2091 };
2092 
2093 static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
2094 
2095 static void init_vcore_lists(void)
2096 {
2097 	int cpu;
2098 
2099 	for_each_possible_cpu(cpu) {
2100 		struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
2101 		spin_lock_init(&lp->lock);
2102 		INIT_LIST_HEAD(&lp->list);
2103 	}
2104 }
2105 
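/*
 * Mark a vcore as preempted and, if it is not using all the threads of
 * the core, add it to this cpu's list of preempted vcores so that it
 * can be piggybacked onto another vcore's run.  Also start accounting
 * stolen time for it.
 */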
2106 static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
2107 {
2108 	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
2109 
2110 	vc->vcore_state = VCORE_PREEMPT;
2111 	vc->pcpu = smp_processor_id();
2112 	if (vc->num_threads < threads_per_vcore()) {
2113 		spin_lock(&lp->lock);
2114 		list_add_tail(&vc->preempt_list, &lp->list);
2115 		spin_unlock(&lp->lock);
2116 	}
2117 
2118 	/* Start accumulating stolen time */
2119 	kvmppc_core_start_stolen(vc);
2120 }
2121 
2122 static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
2123 {
2124 	struct preempted_vcore_list *lp;
2125 
2126 	kvmppc_core_end_stolen(vc);
2127 	if (!list_empty(&vc->preempt_list)) {
2128 		lp = &per_cpu(preempted_vcores, vc->pcpu);
2129 		spin_lock(&lp->lock);
2130 		list_del_init(&vc->preempt_list);
2131 		spin_unlock(&lp->lock);
2132 	}
2133 	vc->vcore_state = VCORE_INACTIVE;
2134 }
2135 
2136 /*
2137  * This stores information about the virtual cores currently
2138  * assigned to a physical core.
2139  */
2140 struct core_info {
2141 	int		n_subcores;
2142 	int		max_subcore_threads;
2143 	int		total_threads;
2144 	int		subcore_threads[MAX_SUBCORES];
2145 	struct kvm	*subcore_vm[MAX_SUBCORES];
2146 	struct list_head vcs[MAX_SUBCORES];
2147 };
2148 
2149 /*
2150  * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
2151  * respectively in 2-way micro-threading (split-core) mode.
2152  */
2153 static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
2154 
2155 static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
2156 {
2157 	int sub;
2158 
2159 	memset(cip, 0, sizeof(*cip));
2160 	cip->n_subcores = 1;
2161 	cip->max_subcore_threads = vc->num_threads;
2162 	cip->total_threads = vc->num_threads;
2163 	cip->subcore_threads[0] = vc->num_threads;
2164 	cip->subcore_vm[0] = vc->kvm;
2165 	for (sub = 0; sub < MAX_SUBCORES; ++sub)
2166 		INIT_LIST_HEAD(&cip->vcs[sub]);
2167 	list_add_tail(&vc->preempt_list, &cip->vcs[0]);
2168 }
2169 
2170 static bool subcore_config_ok(int n_subcores, int n_threads)
2171 {
2172 	/* Can only dynamically split if unsplit to begin with */
2173 	if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
2174 		return false;
2175 	if (n_subcores > MAX_SUBCORES)
2176 		return false;
2177 	if (n_subcores > 1) {
2178 		if (!(dynamic_mt_modes & 2))
2179 			n_subcores = 4;
2180 		if (n_subcores > 2 && !(dynamic_mt_modes & 4))
2181 			return false;
2182 	}
2183 
2184 	return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
2185 }
2186 
2187 static void init_master_vcore(struct kvmppc_vcore *vc)
2188 {
2189 	vc->master_vcore = vc;
2190 	vc->entry_exit_map = 0;
2191 	vc->in_guest = 0;
2192 	vc->napping_threads = 0;
2193 	vc->conferring_threads = 0;
2194 }
2195 
2196 static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
2197 {
2198 	int n_threads = vc->num_threads;
2199 	int sub;
2200 
2201 	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
2202 		return false;
2203 
2204 	if (n_threads < cip->max_subcore_threads)
2205 		n_threads = cip->max_subcore_threads;
2206 	if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
2207 		return false;
2208 	cip->max_subcore_threads = n_threads;
2209 
2210 	sub = cip->n_subcores;
2211 	++cip->n_subcores;
2212 	cip->total_threads += vc->num_threads;
2213 	cip->subcore_threads[sub] = vc->num_threads;
2214 	cip->subcore_vm[sub] = vc->kvm;
2215 	init_master_vcore(vc);
2216 	list_move_tail(&vc->preempt_list, &cip->vcs[sub]);
2217 
2218 	return true;
2219 }
2220 
2221 /*
2222  * Work out whether it is possible to piggyback the execution of
2223  * vcore *pvc onto the execution of the other vcores described in *cip.
2224  */
2225 static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
2226 			  int target_threads)
2227 {
2228 	if (cip->total_threads + pvc->num_threads > target_threads)
2229 		return false;
2230 
2231 	return can_dynamic_split(pvc, cip);
2232 }
2233 
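/*
 * Drop from the runnable list any vcpus that have a signal pending or
 * a VPA update to do, and wake up their run tasks so they can handle
 * it in the host.
 */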
2234 static void prepare_threads(struct kvmppc_vcore *vc)
2235 {
2236 	int i;
2237 	struct kvm_vcpu *vcpu;
2238 
2239 	for_each_runnable_thread(i, vcpu, vc) {
2240 		if (signal_pending(vcpu->arch.run_task))
2241 			vcpu->arch.ret = -EINTR;
2242 		else if (vcpu->arch.vpa.update_pending ||
2243 			 vcpu->arch.slb_shadow.update_pending ||
2244 			 vcpu->arch.dtl.update_pending)
2245 			vcpu->arch.ret = RESUME_GUEST;
2246 		else
2247 			continue;
2248 		kvmppc_remove_runnable(vc, vcpu);
2249 		wake_up(&vcpu->arch.cpu_run);
2250 	}
2251 }
2252 
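/*
 * Scan this cpu's list of preempted vcores and add to *cip any that can
 * be run alongside the current vcore, stopping once target_threads
 * threads have been collected.
 */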
2253 static void collect_piggybacks(struct core_info *cip, int target_threads)
2254 {
2255 	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
2256 	struct kvmppc_vcore *pvc, *vcnext;
2257 
2258 	spin_lock(&lp->lock);
2259 	list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
2260 		if (!spin_trylock(&pvc->lock))
2261 			continue;
2262 		prepare_threads(pvc);
2263 		if (!pvc->n_runnable) {
2264 			list_del_init(&pvc->preempt_list);
2265 			if (pvc->runner == NULL) {
2266 				pvc->vcore_state = VCORE_INACTIVE;
2267 				kvmppc_core_end_stolen(pvc);
2268 			}
2269 			spin_unlock(&pvc->lock);
2270 			continue;
2271 		}
2272 		if (!can_piggyback(pvc, cip, target_threads)) {
2273 			spin_unlock(&pvc->lock);
2274 			continue;
2275 		}
2276 		kvmppc_core_end_stolen(pvc);
2277 		pvc->vcore_state = VCORE_PIGGYBACK;
2278 		if (cip->total_threads >= target_threads)
2279 			break;
2280 	}
2281 	spin_unlock(&lp->lock);
2282 }
2283 
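/*
 * Called for each vcore after returning from the guest: handle each
 * runnable vcpu's exit, decide whether it should go back into the
 * guest, and wake up any vcpu task that is finished running.
 */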
2284 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
2285 {
2286 	int still_running = 0, i;
2287 	u64 now;
2288 	long ret;
2289 	struct kvm_vcpu *vcpu;
2290 
2291 	spin_lock(&vc->lock);
2292 	now = get_tb();
2293 	for_each_runnable_thread(i, vcpu, vc) {
2294 		/* cancel pending dec exception if dec is positive */
2295 		if (now < vcpu->arch.dec_expires &&
2296 		    kvmppc_core_pending_dec(vcpu))
2297 			kvmppc_core_dequeue_dec(vcpu);
2298 
2299 		trace_kvm_guest_exit(vcpu);
2300 
2301 		ret = RESUME_GUEST;
2302 		if (vcpu->arch.trap)
2303 			ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
2304 						    vcpu->arch.run_task);
2305 
2306 		vcpu->arch.ret = ret;
2307 		vcpu->arch.trap = 0;
2308 
2309 		if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
2310 			if (vcpu->arch.pending_exceptions)
2311 				kvmppc_core_prepare_to_enter(vcpu);
2312 			if (vcpu->arch.ceded)
2313 				kvmppc_set_timer(vcpu);
2314 			else
2315 				++still_running;
2316 		} else {
2317 			kvmppc_remove_runnable(vc, vcpu);
2318 			wake_up(&vcpu->arch.cpu_run);
2319 		}
2320 	}
2321 	list_del_init(&vc->preempt_list);
2322 	if (!is_master) {
2323 		if (still_running > 0) {
2324 			kvmppc_vcore_preempt(vc);
2325 		} else if (vc->runner) {
2326 			vc->vcore_state = VCORE_PREEMPT;
2327 			kvmppc_core_start_stolen(vc);
2328 		} else {
2329 			vc->vcore_state = VCORE_INACTIVE;
2330 		}
2331 		if (vc->n_runnable > 0 && vc->runner == NULL) {
2332 			/* make sure there's a candidate runner awake */
2333 			i = -1;
2334 			vcpu = next_runnable_thread(vc, &i);
2335 			wake_up(&vcpu->arch.cpu_run);
2336 		}
2337 	}
2338 	spin_unlock(&vc->lock);
2339 }
2340 
2341 /*
2342  * Clear core from the list of active host cores as we are about to
2343  * enter the guest. Only do this if it is the primary thread of the
2344  * core (not merely of a subcore) that is entering the guest.
2345  */
2346 static inline int kvmppc_clear_host_core(unsigned int cpu)
2347 {
2348 	int core;
2349 
2350 	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
2351 		return 0;
2352 	/*
2353 	 * Memory barrier can be omitted here as we will do a smp_wmb()
2354 	 * later in kvmppc_start_thread and we need to ensure that state is
2355 	 * visible to other CPUs only after we enter the guest.
2356 	 */
2357 	core = cpu >> threads_shift;
2358 	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
2359 	return 0;
2360 }
2361 
2362 /*
2363  * Advertise this core as an active host core since we exited the guest.
2364  * Only need to do this if it is the primary thread of the core that is
2365  * exiting.
2366  */
2367 static inline int kvmppc_set_host_core(unsigned int cpu)
2368 {
2369 	int core;
2370 
2371 	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
2372 		return 0;
2373 
2374 	/*
2375 	 * Memory barrier can be omitted here because we do a spin_unlock
2376 	 * immediately after this which provides the memory barrier.
2377 	 */
2378 	core = cpu >> threads_shift;
2379 	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
2380 	return 0;
2381 }
2382 
2383 /*
2384  * Run a set of guest threads on a physical core.
2385  * Called with vc->lock held.
2386  */
2387 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2388 {
2389 	struct kvm_vcpu *vcpu;
2390 	int i;
2391 	int srcu_idx;
2392 	struct core_info core_info;
2393 	struct kvmppc_vcore *pvc, *vcnext;
2394 	struct kvm_split_mode split_info, *sip;
2395 	int split, subcore_size, active;
2396 	int sub;
2397 	bool thr0_done;
2398 	unsigned long cmd_bit, stat_bit;
2399 	int pcpu, thr;
2400 	int target_threads;
2401 	int controlled_threads;
2402 
2403 	/*
2404 	 * Remove from the list any threads that have a signal pending
2405 	 * or need a VPA update done
2406 	 */
2407 	prepare_threads(vc);
2408 
2409 	/* if the runner is no longer runnable, let the caller pick a new one */
2410 	if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
2411 		return;
2412 
2413 	/*
2414 	 * Initialize *vc.
2415 	 */
2416 	init_master_vcore(vc);
2417 	vc->preempt_tb = TB_NIL;
2418 
2419 	/*
2420 	 * Number of threads that we will be controlling: the same as
2421 	 * the number of threads per subcore, except on POWER9,
2422 	 * where it's 1 because the threads are (mostly) independent.
2423 	 */
2424 	controlled_threads = threads_per_vcore();
2425 
2426 	/*
2427 	 * Make sure we are running on primary threads, and that secondary
2428 	 * threads are offline.  Also check if the number of threads in this
2429 	 * guest is greater than the number of threads per subcore on this system.
2430 	 */
2431 	if ((controlled_threads > 1) &&
2432 	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
2433 		for_each_runnable_thread(i, vcpu, vc) {
2434 			vcpu->arch.ret = -EBUSY;
2435 			kvmppc_remove_runnable(vc, vcpu);
2436 			wake_up(&vcpu->arch.cpu_run);
2437 		}
2438 		goto out;
2439 	}
2440 
2441 	/*
2442 	 * See if we could run any other vcores on the physical core
2443 	 * along with this one.
2444 	 */
2445 	init_core_info(&core_info, vc);
2446 	pcpu = smp_processor_id();
2447 	target_threads = controlled_threads;
2448 	if (target_smt_mode && target_smt_mode < target_threads)
2449 		target_threads = target_smt_mode;
2450 	if (vc->num_threads < target_threads)
2451 		collect_piggybacks(&core_info, target_threads);
2452 
2453 	/* Decide on micro-threading (split-core) mode */
2454 	subcore_size = threads_per_subcore;
2455 	cmd_bit = stat_bit = 0;
2456 	split = core_info.n_subcores;
2457 	sip = NULL;
2458 	if (split > 1) {
2459 		/* threads_per_subcore must be MAX_SMT_THREADS (8) here */
2460 		if (split == 2 && (dynamic_mt_modes & 2)) {
2461 			cmd_bit = HID0_POWER8_1TO2LPAR;
2462 			stat_bit = HID0_POWER8_2LPARMODE;
2463 		} else {
2464 			split = 4;
2465 			cmd_bit = HID0_POWER8_1TO4LPAR;
2466 			stat_bit = HID0_POWER8_4LPARMODE;
2467 		}
2468 		subcore_size = MAX_SMT_THREADS / split;
2469 		sip = &split_info;
2470 		memset(&split_info, 0, sizeof(split_info));
2471 		split_info.rpr = mfspr(SPRN_RPR);
2472 		split_info.pmmar = mfspr(SPRN_PMMAR);
2473 		split_info.ldbar = mfspr(SPRN_LDBAR);
2474 		split_info.subcore_size = subcore_size;
2475 		for (sub = 0; sub < core_info.n_subcores; ++sub)
2476 			split_info.master_vcs[sub] =
2477 				list_first_entry(&core_info.vcs[sub],
2478 					struct kvmppc_vcore, preempt_list);
2479 		/* order writes to split_info before kvm_split_mode pointer */
2480 		smp_wmb();
2481 	}
2482 	pcpu = smp_processor_id();
2483 	for (thr = 0; thr < controlled_threads; ++thr)
2484 		paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
2485 
2486 	/* Initiate micro-threading (split-core) if required */
2487 	if (cmd_bit) {
2488 		unsigned long hid0 = mfspr(SPRN_HID0);
2489 
2490 		hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
2491 		mb();
2492 		mtspr(SPRN_HID0, hid0);
2493 		isync();
2494 		for (;;) {
2495 			hid0 = mfspr(SPRN_HID0);
2496 			if (hid0 & stat_bit)
2497 				break;
2498 			cpu_relax();
2499 		}
2500 	}
2501 
2502 	kvmppc_clear_host_core(pcpu);
2503 
2504 	/* Start all the threads */
2505 	active = 0;
2506 	for (sub = 0; sub < core_info.n_subcores; ++sub) {
2507 		thr = subcore_thread_map[sub];
2508 		thr0_done = false;
2509 		active |= 1 << thr;
2510 		list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
2511 			pvc->pcpu = pcpu + thr;
2512 			for_each_runnable_thread(i, vcpu, pvc) {
2513 				kvmppc_start_thread(vcpu, pvc);
2514 				kvmppc_create_dtl_entry(vcpu, pvc);
2515 				trace_kvm_guest_enter(vcpu);
2516 				if (!vcpu->arch.ptid)
2517 					thr0_done = true;
2518 				active |= 1 << (thr + vcpu->arch.ptid);
2519 			}
2520 			/*
2521 			 * We need to start the first thread of each subcore
2522 			 * even if it doesn't have a vcpu.
2523 			 */
2524 			if (pvc->master_vcore == pvc && !thr0_done)
2525 				kvmppc_start_thread(NULL, pvc);
2526 			thr += pvc->num_threads;
2527 		}
2528 	}
2529 
2530 	/*
2531 	 * Ensure that split_info.do_nap is set after setting
2532 	 * the vcore pointer in the PACA of the secondaries.
2533 	 */
2534 	smp_mb();
2535 	if (cmd_bit)
2536 		split_info.do_nap = 1;	/* ask secondaries to nap when done */
2537 
2538 	/*
2539 	 * When doing micro-threading, poke the inactive threads as well.
2540 	 * This gets them to the nap instruction after kvm_do_nap,
2541 	 * which reduces the time taken to unsplit later.
2542 	 */
2543 	if (split > 1)
2544 		for (thr = 1; thr < threads_per_subcore; ++thr)
2545 			if (!(active & (1 << thr)))
2546 				kvmppc_ipi_thread(pcpu + thr);
2547 
2548 	vc->vcore_state = VCORE_RUNNING;
2549 	preempt_disable();
2550 
2551 	trace_kvmppc_run_core(vc, 0);
2552 
2553 	for (sub = 0; sub < core_info.n_subcores; ++sub)
2554 		list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
2555 			spin_unlock(&pvc->lock);
2556 
2557 	guest_enter();
2558 
2559 	srcu_idx = srcu_read_lock(&vc->kvm->srcu);
2560 
2561 	__kvmppc_vcore_entry();
2562 
2563 	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
2564 
2565 	spin_lock(&vc->lock);
2566 	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
2567 	vc->vcore_state = VCORE_EXITING;
2568 
2569 	/* wait for secondary threads to finish writing their state to memory */
2570 	kvmppc_wait_for_nap();
2571 
2572 	/* Return to whole-core mode if we split the core earlier */
2573 	if (split > 1) {
2574 		unsigned long hid0 = mfspr(SPRN_HID0);
2575 		unsigned long loops = 0;
2576 
2577 		hid0 &= ~HID0_POWER8_DYNLPARDIS;
2578 		stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
2579 		mb();
2580 		mtspr(SPRN_HID0, hid0);
2581 		isync();
2582 		for (;;) {
2583 			hid0 = mfspr(SPRN_HID0);
2584 			if (!(hid0 & stat_bit))
2585 				break;
2586 			cpu_relax();
2587 			++loops;
2588 		}
2589 		split_info.do_nap = 0;
2590 	}
2591 
2592 	/* Let secondaries go back to the offline loop */
2593 	for (i = 0; i < controlled_threads; ++i) {
2594 		kvmppc_release_hwthread(pcpu + i);
2595 		if (sip && sip->napped[i])
2596 			kvmppc_ipi_thread(pcpu + i);
2597 		cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
2598 	}
2599 
2600 	kvmppc_set_host_core(pcpu);
2601 
2602 	spin_unlock(&vc->lock);
2603 
2604 	/* make sure updates to secondary vcpu structs are visible now */
2605 	smp_mb();
2606 	guest_exit();
2607 
2608 	for (sub = 0; sub < core_info.n_subcores; ++sub)
2609 		list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
2610 					 preempt_list)
2611 			post_guest_process(pvc, pvc == vc);
2612 
2613 	spin_lock(&vc->lock);
2614 	preempt_enable();
2615 
2616  out:
2617 	vc->vcore_state = VCORE_INACTIVE;
2618 	trace_kvmppc_run_core(vc, 1);
2619 }
2620 
2621 /*
2622  * Wait for some other vcpu thread to execute us, and
2623  * wake us up when we need to handle something in the host.
2624  */
2625 static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
2626 				 struct kvm_vcpu *vcpu, int wait_state)
2627 {
2628 	DEFINE_WAIT(wait);
2629 
2630 	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
2631 	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
2632 		spin_unlock(&vc->lock);
2633 		schedule();
2634 		spin_lock(&vc->lock);
2635 	}
2636 	finish_wait(&vcpu->arch.cpu_run, &wait);
2637 }
2638 
2639 static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
2640 {
2641 	/* 10us base */
2642 	if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
2643 		vc->halt_poll_ns = 10000;
2644 	else
2645 		vc->halt_poll_ns *= halt_poll_ns_grow;
2646 }
2647 
2648 static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
2649 {
2650 	if (halt_poll_ns_shrink == 0)
2651 		vc->halt_poll_ns = 0;
2652 	else
2653 		vc->halt_poll_ns /= halt_poll_ns_shrink;
2654 }
2655 
2656 /*
2657  * Check to see if any of the runnable vcpus on the vcore have pending
2658  * exceptions or are no longer ceded
2659  */
2660 static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
2661 {
2662 	struct kvm_vcpu *vcpu;
2663 	int i;
2664 
2665 	for_each_runnable_thread(i, vcpu, vc) {
2666 		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
2667 		    vcpu->arch.prodded)
2668 			return 1;
2669 	}
2670 
2671 	return 0;
2672 }
2673 
2674 /*
2675  * All the vcpus in this vcore are idle, so wait for a decrementer
2676  * or external interrupt to one of the vcpus.  vc->lock is held.
2677  */
2678 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2679 {
2680 	ktime_t cur, start_poll, start_wait;
2681 	int do_sleep = 1;
2682 	u64 block_ns;
2683 	DECLARE_SWAITQUEUE(wait);
2684 
2685 	/* Poll for pending exceptions and ceded state */
2686 	cur = start_poll = ktime_get();
2687 	if (vc->halt_poll_ns) {
2688 		ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
2689 		++vc->runner->stat.halt_attempted_poll;
2690 
2691 		vc->vcore_state = VCORE_POLLING;
2692 		spin_unlock(&vc->lock);
2693 
2694 		do {
2695 			if (kvmppc_vcore_check_block(vc)) {
2696 				do_sleep = 0;
2697 				break;
2698 			}
2699 			cur = ktime_get();
2700 		} while (single_task_running() && ktime_before(cur, stop));
2701 
2702 		spin_lock(&vc->lock);
2703 		vc->vcore_state = VCORE_INACTIVE;
2704 
2705 		if (!do_sleep) {
2706 			++vc->runner->stat.halt_successful_poll;
2707 			goto out;
2708 		}
2709 	}
2710 
2711 	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2712 
2713 	if (kvmppc_vcore_check_block(vc)) {
2714 		finish_swait(&vc->wq, &wait);
2715 		do_sleep = 0;
2716 		/* If we polled, count this as a successful poll */
2717 		if (vc->halt_poll_ns)
2718 			++vc->runner->stat.halt_successful_poll;
2719 		goto out;
2720 	}
2721 
2722 	start_wait = ktime_get();
2723 
2724 	vc->vcore_state = VCORE_SLEEPING;
2725 	trace_kvmppc_vcore_blocked(vc, 0);
2726 	spin_unlock(&vc->lock);
2727 	schedule();
2728 	finish_swait(&vc->wq, &wait);
2729 	spin_lock(&vc->lock);
2730 	vc->vcore_state = VCORE_INACTIVE;
2731 	trace_kvmppc_vcore_blocked(vc, 1);
2732 	++vc->runner->stat.halt_successful_wait;
2733 
2734 	cur = ktime_get();
2735 
2736 out:
2737 	block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
2738 
2739 	/* Attribute wait time */
2740 	if (do_sleep) {
2741 		vc->runner->stat.halt_wait_ns +=
2742 			ktime_to_ns(cur) - ktime_to_ns(start_wait);
2743 		/* Attribute failed poll time */
2744 		if (vc->halt_poll_ns)
2745 			vc->runner->stat.halt_poll_fail_ns +=
2746 				ktime_to_ns(start_wait) -
2747 				ktime_to_ns(start_poll);
2748 	} else {
2749 		/* Attribute successful poll time */
2750 		if (vc->halt_poll_ns)
2751 			vc->runner->stat.halt_poll_success_ns +=
2752 				ktime_to_ns(cur) -
2753 				ktime_to_ns(start_poll);
2754 	}
2755 
2756 	/* Adjust poll time */
2757 	if (halt_poll_ns) {
2758 		if (block_ns <= vc->halt_poll_ns)
2759 			;
2760 		/* We slept and blocked for longer than the max halt time */
2761 		else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
2762 			shrink_halt_poll_ns(vc);
2763 		/* We slept and our poll time is too small */
2764 		else if (vc->halt_poll_ns < halt_poll_ns &&
2765 				block_ns < halt_poll_ns)
2766 			grow_halt_poll_ns(vc);
2767 		if (vc->halt_poll_ns > halt_poll_ns)
2768 			vc->halt_poll_ns = halt_poll_ns;
2769 	} else
2770 		vc->halt_poll_ns = 0;
2771 
2772 	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
2773 }
2774 
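/*
 * Run a vcpu as part of its virtual core.  The calling task either acts
 * as the "runner" that executes the whole vcore on this cpu, or sleeps
 * until another vcpu task runs the core on its behalf.
 */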
2775 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2776 {
2777 	int n_ceded, i;
2778 	struct kvmppc_vcore *vc;
2779 	struct kvm_vcpu *v;
2780 
2781 	trace_kvmppc_run_vcpu_enter(vcpu);
2782 
2783 	kvm_run->exit_reason = 0;
2784 	vcpu->arch.ret = RESUME_GUEST;
2785 	vcpu->arch.trap = 0;
2786 	kvmppc_update_vpas(vcpu);
2787 
2788 	/*
2789 	 * Synchronize with other threads in this virtual core
2790 	 */
2791 	vc = vcpu->arch.vcore;
2792 	spin_lock(&vc->lock);
2793 	vcpu->arch.ceded = 0;
2794 	vcpu->arch.run_task = current;
2795 	vcpu->arch.kvm_run = kvm_run;
2796 	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
2797 	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
2798 	vcpu->arch.busy_preempt = TB_NIL;
2799 	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
2800 	++vc->n_runnable;
2801 
2802 	/*
2803 	 * This happens the first time this is called for a vcpu.
2804 	 * If the vcore is already running, we may be able to start
2805 	 * this thread straight away and have it join in.
2806 	 */
2807 	if (!signal_pending(current)) {
2808 		if (vc->vcore_state == VCORE_PIGGYBACK) {
2809 			struct kvmppc_vcore *mvc = vc->master_vcore;
2810 			if (spin_trylock(&mvc->lock)) {
2811 				if (mvc->vcore_state == VCORE_RUNNING &&
2812 				    !VCORE_IS_EXITING(mvc)) {
2813 					kvmppc_create_dtl_entry(vcpu, vc);
2814 					kvmppc_start_thread(vcpu, vc);
2815 					trace_kvm_guest_enter(vcpu);
2816 				}
2817 				spin_unlock(&mvc->lock);
2818 			}
2819 		} else if (vc->vcore_state == VCORE_RUNNING &&
2820 			   !VCORE_IS_EXITING(vc)) {
2821 			kvmppc_create_dtl_entry(vcpu, vc);
2822 			kvmppc_start_thread(vcpu, vc);
2823 			trace_kvm_guest_enter(vcpu);
2824 		} else if (vc->vcore_state == VCORE_SLEEPING) {
2825 			swake_up(&vc->wq);
2826 		}
2827 
2828 	}
2829 
2830 	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
2831 	       !signal_pending(current)) {
2832 		if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
2833 			kvmppc_vcore_end_preempt(vc);
2834 
2835 		if (vc->vcore_state != VCORE_INACTIVE) {
2836 			kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
2837 			continue;
2838 		}
2839 		for_each_runnable_thread(i, v, vc) {
2840 			kvmppc_core_prepare_to_enter(v);
2841 			if (signal_pending(v->arch.run_task)) {
2842 				kvmppc_remove_runnable(vc, v);
2843 				v->stat.signal_exits++;
2844 				v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
2845 				v->arch.ret = -EINTR;
2846 				wake_up(&v->arch.cpu_run);
2847 			}
2848 		}
2849 		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
2850 			break;
2851 		n_ceded = 0;
2852 		for_each_runnable_thread(i, v, vc) {
2853 			if (!v->arch.pending_exceptions && !v->arch.prodded)
2854 				n_ceded += v->arch.ceded;
2855 			else
2856 				v->arch.ceded = 0;
2857 		}
2858 		vc->runner = vcpu;
2859 		if (n_ceded == vc->n_runnable) {
2860 			kvmppc_vcore_blocked(vc);
2861 		} else if (need_resched()) {
2862 			kvmppc_vcore_preempt(vc);
2863 			/* Let something else run */
2864 			cond_resched_lock(&vc->lock);
2865 			if (vc->vcore_state == VCORE_PREEMPT)
2866 				kvmppc_vcore_end_preempt(vc);
2867 		} else {
2868 			kvmppc_run_core(vc);
2869 		}
2870 		vc->runner = NULL;
2871 	}
2872 
2873 	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
2874 	       (vc->vcore_state == VCORE_RUNNING ||
2875 		vc->vcore_state == VCORE_EXITING ||
2876 		vc->vcore_state == VCORE_PIGGYBACK))
2877 		kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
2878 
2879 	if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
2880 		kvmppc_vcore_end_preempt(vc);
2881 
2882 	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
2883 		kvmppc_remove_runnable(vc, vcpu);
2884 		vcpu->stat.signal_exits++;
2885 		kvm_run->exit_reason = KVM_EXIT_INTR;
2886 		vcpu->arch.ret = -EINTR;
2887 	}
2888 
2889 	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
2890 		/* Wake up some vcpu to run the core */
2891 		i = -1;
2892 		v = next_runnable_thread(vc, &i);
2893 		wake_up(&v->arch.cpu_run);
2894 	}
2895 
2896 	trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
2897 	spin_unlock(&vc->lock);
2898 	return vcpu->arch.ret;
2899 }
2900 
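/*
 * Top-level vcpu run loop: set up the HPT and VRMA on first use (for
 * HPT guests), run the vcpu, and service hypercalls, page faults and
 * passthrough interrupts in the host until the guest no longer wants
 * to be resumed.
 */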
2901 static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
2902 {
2903 	int r;
2904 	int srcu_idx;
2905 
2906 	if (!vcpu->arch.sane) {
2907 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2908 		return -EINVAL;
2909 	}
2910 
2911 	kvmppc_core_prepare_to_enter(vcpu);
2912 
2913 	/* No need to go into the guest when all we'll do is come back out */
2914 	if (signal_pending(current)) {
2915 		run->exit_reason = KVM_EXIT_INTR;
2916 		return -EINTR;
2917 	}
2918 
2919 	atomic_inc(&vcpu->kvm->arch.vcpus_running);
2920 	/* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
2921 	smp_mb();
2922 
2923 	/* On the first time here, set up HTAB and VRMA */
2924 	if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
2925 		r = kvmppc_hv_setup_htab_rma(vcpu);
2926 		if (r)
2927 			goto out;
2928 	}
2929 
2930 	flush_all_to_thread(current);
2931 
2932 	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
2933 	vcpu->arch.pgdir = current->mm->pgd;
2934 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
2935 
2936 	do {
2937 		r = kvmppc_run_vcpu(run, vcpu);
2938 
2939 		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
2940 		    !(vcpu->arch.shregs.msr & MSR_PR)) {
2941 			trace_kvm_hcall_enter(vcpu);
2942 			r = kvmppc_pseries_do_hcall(vcpu);
2943 			trace_kvm_hcall_exit(vcpu, r);
2944 			kvmppc_core_prepare_to_enter(vcpu);
2945 		} else if (r == RESUME_PAGE_FAULT) {
2946 			srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
2947 			r = kvmppc_book3s_hv_page_fault(run, vcpu,
2948 				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
2949 			srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
2950 		} else if (r == RESUME_PASSTHROUGH)
2951 			r = kvmppc_xics_rm_complete(vcpu, 0);
2952 	} while (is_kvmppc_resume_guest(r));
2953 
2954  out:
2955 	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
2956 	atomic_dec(&vcpu->kvm->arch.vcpus_running);
2957 	return r;
2958 }
2959 
2960 static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
2961 				     int linux_psize)
2962 {
2963 	struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
2964 
2965 	if (!def->shift)
2966 		return;
2967 	(*sps)->page_shift = def->shift;
2968 	(*sps)->slb_enc = def->sllp;
2969 	(*sps)->enc[0].page_shift = def->shift;
2970 	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
2971 	/*
2972 	 * Add 16MB MPSS support if host supports it
2973 	 */
2974 	if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
2975 		(*sps)->enc[1].page_shift = 24;
2976 		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
2977 	}
2978 	(*sps)++;
2979 }
2980 
2981 static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
2982 					 struct kvm_ppc_smmu_info *info)
2983 {
2984 	struct kvm_ppc_one_seg_page_size *sps;
2985 
2986 	/*
2987 	 * Since we don't yet support HPT guests on a radix host,
2988 	 * return an error if the host uses radix.
2989 	 */
2990 	if (radix_enabled())
2991 		return -EINVAL;
2992 
2993 	info->flags = KVM_PPC_PAGE_SIZES_REAL;
2994 	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
2995 		info->flags |= KVM_PPC_1T_SEGMENTS;
2996 	info->slb_size = mmu_slb_size;
2997 
2998 	/* We only support these sizes for now, and no multi-size segments */
2999 	sps = &info->sps[0];
3000 	kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
3001 	kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
3002 	kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
3003 
3004 	return 0;
3005 }
3006 
3007 /*
3008  * Get (and clear) the dirty memory log for a memory slot.
3009  */
3010 static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
3011 					 struct kvm_dirty_log *log)
3012 {
3013 	struct kvm_memslots *slots;
3014 	struct kvm_memory_slot *memslot;
3015 	int i, r;
3016 	unsigned long n;
3017 	unsigned long *buf;
3018 	struct kvm_vcpu *vcpu;
3019 
3020 	mutex_lock(&kvm->slots_lock);
3021 
3022 	r = -EINVAL;
3023 	if (log->slot >= KVM_USER_MEM_SLOTS)
3024 		goto out;
3025 
3026 	slots = kvm_memslots(kvm);
3027 	memslot = id_to_memslot(slots, log->slot);
3028 	r = -ENOENT;
3029 	if (!memslot->dirty_bitmap)
3030 		goto out;
3031 
3032 	/*
3033 	 * Use second half of bitmap area because radix accumulates
3034 	 * bits in the first half.
3035 	 */
3036 	n = kvm_dirty_bitmap_bytes(memslot);
3037 	buf = memslot->dirty_bitmap + n / sizeof(long);
3038 	memset(buf, 0, n);
3039 
3040 	if (kvm_is_radix(kvm))
3041 		r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
3042 	else
3043 		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
3044 	if (r)
3045 		goto out;
3046 
3047 	/* Harvest dirty bits from VPA and DTL updates */
3048 	/* Note: we never modify the SLB shadow buffer areas */
3049 	kvm_for_each_vcpu(i, vcpu, kvm) {
3050 		spin_lock(&vcpu->arch.vpa_update_lock);
3051 		kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
3052 		kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
3053 		spin_unlock(&vcpu->arch.vpa_update_lock);
3054 	}
3055 
3056 	r = -EFAULT;
3057 	if (copy_to_user(log->dirty_bitmap, buf, n))
3058 		goto out;
3059 
3060 	r = 0;
3061 out:
3062 	mutex_unlock(&kvm->slots_lock);
3063 	return r;
3064 }
3065 
3066 static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
3067 					struct kvm_memory_slot *dont)
3068 {
3069 	if (!dont || free->arch.rmap != dont->arch.rmap) {
3070 		vfree(free->arch.rmap);
3071 		free->arch.rmap = NULL;
3072 	}
3073 }
3074 
3075 static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
3076 					 unsigned long npages)
3077 {
3078 	/*
3079 	 * For now, if radix_enabled() then we only support radix guests,
3080 	 * and in that case we don't need the rmap array.
3081 	 */
3082 	if (radix_enabled()) {
3083 		slot->arch.rmap = NULL;
3084 		return 0;
3085 	}
3086 
3087 	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
3088 	if (!slot->arch.rmap)
3089 		return -ENOMEM;
3090 
3091 	return 0;
3092 }
3093 
3094 static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
3095 					struct kvm_memory_slot *memslot,
3096 					const struct kvm_userspace_memory_region *mem)
3097 {
3098 	return 0;
3099 }
3100 
3101 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
3102 				const struct kvm_userspace_memory_region *mem,
3103 				const struct kvm_memory_slot *old,
3104 				const struct kvm_memory_slot *new)
3105 {
3106 	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
3107 	struct kvm_memslots *slots;
3108 	struct kvm_memory_slot *memslot;
3109 
3110 	/*
3111 	 * If we are making a new memslot, an address that was
3112 	 * previously cached as emulated MMIO might no longer be
3113 	 * emulated MMIO, so invalidate all the cached emulated
3114 	 * MMIO translations.
3115 	 */
3116 	if (npages)
3117 		atomic64_inc(&kvm->arch.mmio_update);
3118 
3119 	if (npages && old->npages && !kvm_is_radix(kvm)) {
3120 		/*
3121 		 * If modifying a memslot, reset all the rmap dirty bits.
3122 		 * If this is a new memslot, we don't need to do anything
3123 		 * since the rmap array starts out as all zeroes,
3124 		 * i.e. no pages are dirty.
3125 		 */
3126 		slots = kvm_memslots(kvm);
3127 		memslot = id_to_memslot(slots, mem->slot);
3128 		kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
3129 	}
3130 }
3131 
3132 /*
3133  * Update LPCR values in kvm->arch and in vcores.
3134  * Caller must hold kvm->lock.
3135  */
3136 void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
3137 {
3138 	long int i;
3139 	u32 cores_done = 0;
3140 
3141 	if ((kvm->arch.lpcr & mask) == lpcr)
3142 		return;
3143 
3144 	kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
3145 
3146 	for (i = 0; i < KVM_MAX_VCORES; ++i) {
3147 		struct kvmppc_vcore *vc = kvm->arch.vcores[i];
3148 		if (!vc)
3149 			continue;
3150 		spin_lock(&vc->lock);
3151 		vc->lpcr = (vc->lpcr & ~mask) | lpcr;
3152 		spin_unlock(&vc->lock);
3153 		if (++cores_done >= kvm->arch.online_vcores)
3154 			break;
3155 	}
3156 }
3157 
3158 static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
3159 {
3160 	return;
3161 }
3162 
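/*
 * Fill in this guest's entry in the partition table: the VRMA/HPT
 * details (hash) or radix tree root (radix) in the first doubleword,
 * and the process table in the second.
 */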
3163 static void kvmppc_setup_partition_table(struct kvm *kvm)
3164 {
3165 	unsigned long dw0, dw1;
3166 
3167 	if (!kvm_is_radix(kvm)) {
3168 		/* PS field - page size for VRMA */
3169 		dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
3170 			((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
3171 		/* HTABSIZE and HTABORG fields */
3172 		dw0 |= kvm->arch.sdr1;
3173 
3174 		/* Second dword as set by userspace */
3175 		dw1 = kvm->arch.process_table;
3176 	} else {
3177 		dw0 = PATB_HR | radix__get_tree_size() |
3178 			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
3179 		dw1 = PATB_GR | kvm->arch.process_table;
3180 	}
3181 
3182 	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
3183 }
3184 
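/*
 * First-time setup for an HPT guest: allocate the hashed page table if
 * userspace hasn't already done so, map the VRMA using the page size of
 * the memslot at guest physical address 0, and update the LPCR VRMASD
 * field (or the partition table on POWER9).
 */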
3185 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
3186 {
3187 	int err = 0;
3188 	struct kvm *kvm = vcpu->kvm;
3189 	unsigned long hva;
3190 	struct kvm_memory_slot *memslot;
3191 	struct vm_area_struct *vma;
3192 	unsigned long lpcr = 0, senc;
3193 	unsigned long psize, porder;
3194 	int srcu_idx;
3195 
3196 	mutex_lock(&kvm->lock);
3197 	if (kvm->arch.hpte_setup_done)
3198 		goto out;	/* another vcpu beat us to it */
3199 
3200 	/* Allocate hashed page table (if not done already) and reset it */
3201 	if (!kvm->arch.hpt.virt) {
3202 		int order = KVM_DEFAULT_HPT_ORDER;
3203 		struct kvm_hpt_info info;
3204 
3205 		err = kvmppc_allocate_hpt(&info, order);
3206 		/* If we get here, it means userspace didn't specify a
3207 		 * size explicitly.  So, try successively smaller
3208 		 * sizes if the default failed. */
3209 		while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
3210 			err  = kvmppc_allocate_hpt(&info, order);
3211 
3212 		if (err < 0) {
3213 			pr_err("KVM: Couldn't alloc HPT\n");
3214 			goto out;
3215 		}
3216 
3217 		kvmppc_set_hpt(kvm, &info);
3218 	}
3219 
3220 	/* Look up the memslot for guest physical address 0 */
3221 	srcu_idx = srcu_read_lock(&kvm->srcu);
3222 	memslot = gfn_to_memslot(kvm, 0);
3223 
3224 	/* We must have some memory at 0 by now */
3225 	err = -EINVAL;
3226 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
3227 		goto out_srcu;
3228 
3229 	/* Look up the VMA for the start of this memory slot */
3230 	hva = memslot->userspace_addr;
3231 	down_read(&current->mm->mmap_sem);
3232 	vma = find_vma(current->mm, hva);
3233 	if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
3234 		goto up_out;
3235 
3236 	psize = vma_kernel_pagesize(vma);
3237 	porder = __ilog2(psize);
3238 
3239 	up_read(&current->mm->mmap_sem);
3240 
3241 	/* We can handle 4k, 64k or 16M pages in the VRMA */
3242 	err = -EINVAL;
3243 	if (!(psize == 0x1000 || psize == 0x10000 ||
3244 	      psize == 0x1000000))
3245 		goto out_srcu;
3246 
3247 	senc = slb_pgsize_encoding(psize);
3248 	kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
3249 		(VRMA_VSID << SLB_VSID_SHIFT_1T);
3250 	/* Create HPTEs in the hash page table for the VRMA */
3251 	kvmppc_map_vrma(vcpu, memslot, porder);
3252 
3253 	/* Update VRMASD field in the LPCR */
3254 	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
3255 		/* the -4 is to account for senc values starting at 0x10 */
3256 		lpcr = senc << (LPCR_VRMASD_SH - 4);
3257 		kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
3258 	} else {
3259 		kvmppc_setup_partition_table(kvm);
3260 	}
3261 
3262 	/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
3263 	smp_wmb();
3264 	kvm->arch.hpte_setup_done = 1;
3265 	err = 0;
3266  out_srcu:
3267 	srcu_read_unlock(&kvm->srcu, srcu_idx);
3268  out:
3269 	mutex_unlock(&kvm->lock);
3270 	return err;
3271 
3272  up_out:
3273 	up_read(&current->mm->mmap_sem);
3274 	goto out_srcu;
3275 }
3276 
3277 #ifdef CONFIG_KVM_XICS
3278 /*
3279  * Allocate a per-core structure for managing state about which cores are
3280  * running in the host versus the guest and for exchanging data between
3281  * real mode KVM and CPU running in the host.
3282  * This is only done for the first VM.
3283  * The allocated structure stays even if all VMs have stopped.
3284  * It is only freed when the kvm-hv module is unloaded.
3285  * It's OK for this routine to fail, we just don't support host
3286  * core operations like redirecting H_IPI wakeups.
3287  */
3288 void kvmppc_alloc_host_rm_ops(void)
3289 {
3290 	struct kvmppc_host_rm_ops *ops;
3291 	unsigned long l_ops;
3292 	int cpu, core;
3293 	int size;
3294 
3295 	/* Not the first time here? */
3296 	if (kvmppc_host_rm_ops_hv != NULL)
3297 		return;
3298 
3299 	ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
3300 	if (!ops)
3301 		return;
3302 
3303 	size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
3304 	ops->rm_core = kzalloc(size, GFP_KERNEL);
3305 
3306 	if (!ops->rm_core) {
3307 		kfree(ops);
3308 		return;
3309 	}
3310 
3311 	get_online_cpus();
3312 
3313 	for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
3314 		if (!cpu_online(cpu))
3315 			continue;
3316 
3317 		core = cpu >> threads_shift;
3318 		ops->rm_core[core].rm_state.in_host = 1;
3319 	}
3320 
3321 	ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
3322 
3323 	/*
3324 	 * Make the contents of the kvmppc_host_rm_ops structure visible
3325 	 * to other CPUs before we assign it to the global variable.
3326 	 * Do an atomic assignment (no locks used here), but if someone
3327 	 * beats us to it, just free our copy and return.
3328 	 */
3329 	smp_wmb();
3330 	l_ops = (unsigned long) ops;
3331 
3332 	if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
3333 		put_online_cpus();
3334 		kfree(ops->rm_core);
3335 		kfree(ops);
3336 		return;
3337 	}
3338 
3339 	cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE,
3340 				  "ppc/kvm_book3s:prepare",
3341 				  kvmppc_set_host_core,
3342 				  kvmppc_clear_host_core);
3343 	put_online_cpus();
3344 }
3345 
3346 void kvmppc_free_host_rm_ops(void)
3347 {
3348 	if (kvmppc_host_rm_ops_hv) {
3349 		cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
3350 		kfree(kvmppc_host_rm_ops_hv->rm_core);
3351 		kfree(kvmppc_host_rm_ops_hv);
3352 		kvmppc_host_rm_ops_hv = NULL;
3353 	}
3354 }
3355 #endif
3356 
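/*
 * Per-VM initialization: allocate an LPID, compute the LPCR value and
 * TLB geometry for this VM, set up radix translation when the host is
 * radix, and create the VM's debugfs directory.
 */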
3357 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3358 {
3359 	unsigned long lpcr, lpid;
3360 	char buf[32];
3361 	int ret;
3362 
3363 	/* Allocate the guest's logical partition ID */
3364 
3365 	lpid = kvmppc_alloc_lpid();
3366 	if ((long)lpid < 0)
3367 		return -ENOMEM;
3368 	kvm->arch.lpid = lpid;
3369 
3370 	kvmppc_alloc_host_rm_ops();
3371 
3372 	/*
3373 	 * Since we don't flush the TLB when tearing down a VM,
3374 	 * and this lpid might have previously been used,
3375 	 * make sure we flush on each core before running the new VM.
3376 	 * On POWER9, the tlbie in mmu_partition_table_set_entry()
3377 	 * does this flush for us.
3378 	 */
3379 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
3380 		cpumask_setall(&kvm->arch.need_tlb_flush);
3381 
3382 	/* Start out with the default set of hcalls enabled */
3383 	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
3384 	       sizeof(kvm->arch.enabled_hcalls));
3385 
3386 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
3387 		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
3388 
3389 	/* Init LPCR for virtual RMA mode */
3390 	kvm->arch.host_lpid = mfspr(SPRN_LPID);
3391 	kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
3392 	lpcr &= LPCR_PECE | LPCR_LPES;
3393 	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
3394 		LPCR_VPM0 | LPCR_VPM1;
3395 	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
3396 		(VRMA_VSID << SLB_VSID_SHIFT_1T);
3397 	/* On POWER8 turn on online bit to enable PURR/SPURR */
3398 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
3399 		lpcr |= LPCR_ONL;
3400 	/*
3401 	 * On POWER9, the VPM0 bit is reserved (VPM0=1 behaviour is assumed).
3402 	 * Set HVICE bit to enable hypervisor virtualization interrupts.
3403 	 */
3404 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
3405 		lpcr &= ~LPCR_VPM0;
3406 		lpcr |= LPCR_HVICE;
3407 	}
3408 
3409 	/*
3410 	 * For now, if the host uses radix, the guest must be radix.
3411 	 */
3412 	if (radix_enabled()) {
3413 		kvm->arch.radix = 1;
3414 		lpcr &= ~LPCR_VPM1;
3415 		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
3416 		ret = kvmppc_init_vm_radix(kvm);
3417 		if (ret) {
3418 			kvmppc_free_lpid(kvm->arch.lpid);
3419 			return ret;
3420 		}
3421 		kvmppc_setup_partition_table(kvm);
3422 	}
3423 
3424 	kvm->arch.lpcr = lpcr;
3425 
3426 	/* Initialization for future HPT resizes */
3427 	kvm->arch.resize_hpt = NULL;
3428 
3429 	/*
3430 	 * Work out how many sets the TLB has, for the use of
3431 	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
3432 	 */
3433 	if (kvm_is_radix(kvm))
3434 		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
3435 	else if (cpu_has_feature(CPU_FTR_ARCH_300))
3436 		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
3437 	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
3438 		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
3439 	else
3440 		kvm->arch.tlb_sets = POWER7_TLB_SETS;		/* 128 */
3441 
3442 	/*
3443 	 * Track that we now have an HV mode VM active. This blocks secondary
3444 	 * CPU threads from coming online.
3445 	 * On POWER9, we only need to do this for HPT guests on a radix
3446 	 * host, which is not yet supported.
3447 	 */
3448 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
3449 		kvm_hv_vm_activated();
3450 
3451 	/*
3452 	 * Create a debugfs directory for the VM
3453 	 */
3454 	snprintf(buf, sizeof(buf), "vm%d", current->pid);
3455 	kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
3456 	if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
3457 		kvmppc_mmu_debugfs_init(kvm);
3458 
3459 	return 0;
3460 }
3461 
3462 static void kvmppc_free_vcores(struct kvm *kvm)
3463 {
3464 	long int i;
3465 
3466 	for (i = 0; i < KVM_MAX_VCORES; ++i)
3467 		kfree(kvm->arch.vcores[i]);
3468 	kvm->arch.online_vcores = 0;
3469 }
3470 
3471 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
3472 {
3473 	debugfs_remove_recursive(kvm->arch.debugfs_dir);
3474 
3475 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
3476 		kvm_hv_vm_deactivated();
3477 
3478 	kvmppc_free_vcores(kvm);
3479 
3480 	kvmppc_free_lpid(kvm->arch.lpid);
3481 
3482 	if (kvm_is_radix(kvm))
3483 		kvmppc_free_radix(kvm);
3484 	else
3485 		kvmppc_free_hpt(&kvm->arch.hpt);
3486 
3487 	kvmppc_free_pimap(kvm);
3488 }
3489 
3490 /* We don't need to emulate any privileged instructions or dcbz */
3491 static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
3492 				     unsigned int inst, int *advance)
3493 {
3494 	return EMULATE_FAIL;
3495 }
3496 
3497 static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
3498 					ulong spr_val)
3499 {
3500 	return EMULATE_FAIL;
3501 }
3502 
3503 static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
3504 					ulong *spr_val)
3505 {
3506 	return EMULATE_FAIL;
3507 }
3508 
3509 static int kvmppc_core_check_processor_compat_hv(void)
3510 {
3511 	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
3512 	    !cpu_has_feature(CPU_FTR_ARCH_206))
3513 		return -EIO;
3514 
3515 	return 0;
3516 }
3517 
3518 #ifdef CONFIG_KVM_XICS
3519 
3520 void kvmppc_free_pimap(struct kvm *kvm)
3521 {
3522 	kfree(kvm->arch.pimap);
3523 }
3524 
3525 static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
3526 {
3527 	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
3528 }
3529 
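/*
 * Record a mapping from a host interrupt to a guest interrupt number so
 * that the real-mode XICS code can handle its EOI directly.  Only OPAL
 * MSIs are accepted, matching what the real-mode EOI path implements.
 */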
3530 static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
3531 {
3532 	struct irq_desc *desc;
3533 	struct kvmppc_irq_map *irq_map;
3534 	struct kvmppc_passthru_irqmap *pimap;
3535 	struct irq_chip *chip;
3536 	int i;
3537 
3538 	if (!kvm_irq_bypass)
3539 		return 1;
3540 
3541 	desc = irq_to_desc(host_irq);
3542 	if (!desc)
3543 		return -EIO;
3544 
3545 	mutex_lock(&kvm->lock);
3546 
3547 	pimap = kvm->arch.pimap;
3548 	if (pimap == NULL) {
3549 		/* First call, allocate structure to hold IRQ map */
3550 		pimap = kvmppc_alloc_pimap();
3551 		if (pimap == NULL) {
3552 			mutex_unlock(&kvm->lock);
3553 			return -ENOMEM;
3554 		}
3555 		kvm->arch.pimap = pimap;
3556 	}
3557 
3558 	/*
3559 	 * For now, we only support interrupts for which the EOI operation
3560 	 * is an OPAL call followed by a write to XIRR, since that's
3561 	 * what our real-mode EOI code does.
3562 	 */
3563 	chip = irq_data_get_irq_chip(&desc->irq_data);
3564 	if (!chip || !is_pnv_opal_msi(chip)) {
3565 		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
3566 			host_irq, guest_gsi);
3567 		mutex_unlock(&kvm->lock);
3568 		return -ENOENT;
3569 	}
3570 
3571 	/*
3572 	 * See if we already have an entry for this guest IRQ number.
3573 	 * If it's mapped to a hardware IRQ number, that's an error,
3574 	 * otherwise re-use this entry.
3575 	 */
3576 	for (i = 0; i < pimap->n_mapped; i++) {
3577 		if (guest_gsi == pimap->mapped[i].v_hwirq) {
3578 			if (pimap->mapped[i].r_hwirq) {
3579 				mutex_unlock(&kvm->lock);
3580 				return -EINVAL;
3581 			}
3582 			break;
3583 		}
3584 	}
3585 
3586 	if (i == KVMPPC_PIRQ_MAPPED) {
3587 		mutex_unlock(&kvm->lock);
3588 		return -EAGAIN;		/* table is full */
3589 	}
3590 
3591 	irq_map = &pimap->mapped[i];
3592 
3593 	irq_map->v_hwirq = guest_gsi;
3594 	irq_map->desc = desc;
3595 
3596 	/*
3597 	 * Order the above two stores before the next to serialize with
3598 	 * the KVM real mode handler.
3599 	 */
3600 	smp_wmb();
3601 	irq_map->r_hwirq = desc->irq_data.hwirq;
3602 
3603 	if (i == pimap->n_mapped)
3604 		pimap->n_mapped++;
3605 
3606 	kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
3607 
3608 	mutex_unlock(&kvm->lock);
3609 
3610 	return 0;
3611 }
3612 
3613 static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
3614 {
3615 	struct irq_desc *desc;
3616 	struct kvmppc_passthru_irqmap *pimap;
3617 	int i;
3618 
3619 	if (!kvm_irq_bypass)
3620 		return 0;
3621 
3622 	desc = irq_to_desc(host_irq);
3623 	if (!desc)
3624 		return -EIO;
3625 
3626 	mutex_lock(&kvm->lock);
3627 
3628 	if (kvm->arch.pimap == NULL) {
3629 		mutex_unlock(&kvm->lock);
3630 		return 0;
3631 	}
3632 	pimap = kvm->arch.pimap;
3633 
3634 	for (i = 0; i < pimap->n_mapped; i++) {
3635 		if (guest_gsi == pimap->mapped[i].v_hwirq)
3636 			break;
3637 	}
3638 
3639 	if (i == pimap->n_mapped) {
3640 		mutex_unlock(&kvm->lock);
3641 		return -ENODEV;
3642 	}
3643 
3644 	kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
3645 
3646 	/* invalidate the entry */
3647 	pimap->mapped[i].r_hwirq = 0;
3648 
3649 	/*
3650 	 * We don't free this structure even when the count goes to
3651 	 * zero. The structure is freed when we destroy the VM.
3652 	 */
3653 
3654 	mutex_unlock(&kvm->lock);
3655 	return 0;
3656 }
3657 
3658 static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
3659 					     struct irq_bypass_producer *prod)
3660 {
3661 	int ret = 0;
3662 	struct kvm_kernel_irqfd *irqfd =
3663 		container_of(cons, struct kvm_kernel_irqfd, consumer);
3664 
3665 	irqfd->producer = prod;
3666 
3667 	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
3668 	if (ret)
3669 		pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
3670 			prod->irq, irqfd->gsi, ret);
3671 
3672 	return ret;
3673 }
3674 
3675 static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
3676 					      struct irq_bypass_producer *prod)
3677 {
3678 	int ret;
3679 	struct kvm_kernel_irqfd *irqfd =
3680 		container_of(cons, struct kvm_kernel_irqfd, consumer);
3681 
3682 	irqfd->producer = NULL;
3683 
3684 	/*
3685 	 * When the producer for this consumer is unregistered, we change
3686 	 * back to the default external interrupt handling mode: the KVM
3687 	 * real mode code hands the interrupt back to the host.
3688 	 */
3689 	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
3690 	if (ret)
3691 		pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
3692 			prod->irq, irqfd->gsi, ret);
3693 }
3694 #endif
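/*
 * The two irq_bypass callbacks above are not called directly from this
 * file; they are reached through the generic irq bypass manager.  When a
 * device driver such as VFIO registers an irq_bypass_producer whose token
 * matches the token of an irqfd's irq_bypass_consumer, the manager
 * connects the pair and the arch glue invokes these hooks.  A rough sketch
 * of the producer side (illustrative only; trigger_ctx and host_irq are
 * placeholders):
 *
 *	struct irq_bypass_producer prod = {
 *		.token = trigger_ctx,	(same eventfd ctx as the irqfd)
 *		.irq   = host_irq,
 *	};
 *	irq_bypass_register_producer(&prod);
 *
 * Matching happens on .token, so the producer must use the same eventfd
 * context that userspace passed to KVM_IRQFD.
 */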
3695 
3696 static long kvm_arch_vm_ioctl_hv(struct file *filp,
3697 				 unsigned int ioctl, unsigned long arg)
3698 {
3699 	struct kvm *kvm __maybe_unused = filp->private_data;
3700 	void __user *argp = (void __user *)arg;
3701 	long r;
3702 
3703 	switch (ioctl) {
3704 
3705 	case KVM_PPC_ALLOCATE_HTAB: {
3706 		u32 htab_order;
3707 
3708 		r = -EFAULT;
3709 		if (get_user(htab_order, (u32 __user *)argp))
3710 			break;
3711 		r = kvmppc_alloc_reset_hpt(kvm, htab_order);
3712 		if (r)
3713 			break;
3714 		r = 0;
3715 		break;
3716 	}
3717 
3718 	case KVM_PPC_GET_HTAB_FD: {
3719 		struct kvm_get_htab_fd ghf;
3720 
3721 		r = -EFAULT;
3722 		if (copy_from_user(&ghf, argp, sizeof(ghf)))
3723 			break;
3724 		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
3725 		break;
3726 	}
3727 
3728 	case KVM_PPC_RESIZE_HPT_PREPARE: {
3729 		struct kvm_ppc_resize_hpt rhpt;
3730 
3731 		r = -EFAULT;
3732 		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
3733 			break;
3734 
3735 		r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
3736 		break;
3737 	}
3738 
3739 	case KVM_PPC_RESIZE_HPT_COMMIT: {
3740 		struct kvm_ppc_resize_hpt rhpt;
3741 
3742 		r = -EFAULT;
3743 		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
3744 			break;
3745 
3746 		r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
3747 		break;
3748 	}
3749 
3750 	default:
3751 		r = -ENOTTY;
3752 	}
3753 
3754 	return r;
3755 }
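/*
 * A rough userspace sketch of the HPT resize flow served by the two
 * KVM_PPC_RESIZE_HPT_* cases above (error handling omitted; vm_fd and the
 * shift value are placeholders):
 *
 *	struct kvm_ppc_resize_hpt rhpt = { .flags = 0, .shift = 27 };
 *	int ret;
 *
 *	(PREPARE returns a positive value, an estimate in milliseconds,
 *	 while the new HPT is still being prepared, and 0 once it is ready.)
 *	do {
 *		ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
 *	} while (ret > 0);
 *	if (ret == 0)
 *		ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
 */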
3756 
3757 /*
3758  * List of hcall numbers to enable by default.
3759  * For compatibility with old userspace, we enable by default
3760  * all hcalls that were implemented before the hcall-enabling
3761  * facility was added.  Note this list should not include H_RTAS.
3762  */
3763 static unsigned int default_hcall_list[] = {
3764 	H_REMOVE,
3765 	H_ENTER,
3766 	H_READ,
3767 	H_PROTECT,
3768 	H_BULK_REMOVE,
3769 	H_GET_TCE,
3770 	H_PUT_TCE,
3771 	H_SET_DABR,
3772 	H_SET_XDABR,
3773 	H_CEDE,
3774 	H_PROD,
3775 	H_CONFER,
3776 	H_REGISTER_VPA,
3777 #ifdef CONFIG_KVM_XICS
3778 	H_EOI,
3779 	H_CPPR,
3780 	H_IPI,
3781 	H_IPOLL,
3782 	H_XIRR,
3783 	H_XIRR_X,
3784 #endif
3785 	0
3786 };
3787 
3788 static void init_default_hcalls(void)
3789 {
3790 	int i;
3791 	unsigned int hcall;
3792 
3793 	for (i = 0; default_hcall_list[i]; ++i) {
3794 		hcall = default_hcall_list[i];
3795 		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
3796 		__set_bit(hcall / 4, default_enabled_hcalls);
3797 	}
3798 }
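/*
 * PAPR hcall numbers are multiples of 4, which is why the bitmap above is
 * indexed by hcall / 4.  Userspace can override these defaults per VM via
 * the KVM_CAP_PPC_ENABLE_HCALL capability; a rough sketch (illustrative
 * only; hcall_nr and vm_fd are placeholders):
 *
 *	struct kvm_enable_cap cap = {
 *		.cap  = KVM_CAP_PPC_ENABLE_HCALL,
 *		.args = { hcall_nr, 1 },	(args[1]: 1 = enable, 0 = disable)
 *	};
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */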
3799 
3800 static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
3801 {
3802 	unsigned long lpcr;
3803 	int radix;
3804 
3805 	/* If not on a POWER9, reject it */
3806 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
3807 		return -ENODEV;
3808 
3809 	/* If any unknown flags are set, reject it */
3810 	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
3811 		return -EINVAL;
3812 
3813 	/* We can't change a guest to/from radix yet */
3814 	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
3815 	if (radix != kvm_is_radix(kvm))
3816 		return -EINVAL;
3817 
3818 	/* GR (guest radix) bit in process_table field must match */
3819 	if (!!(cfg->process_table & PATB_GR) != radix)
3820 		return -EINVAL;
3821 
3822 	/* Process table size field must be reasonable, i.e. <= 24 */
3823 	if ((cfg->process_table & PRTS_MASK) > 24)
3824 		return -EINVAL;
3825 
3826 	kvm->arch.process_table = cfg->process_table;
3827 	kvmppc_setup_partition_table(kvm);
3828 
3829 	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
3830 	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
3831 
3832 	return 0;
3833 }
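/*
 * kvmhv_configure_mmu() backs the KVM_PPC_CONFIGURE_V3_MMU vm ioctl.  A
 * rough userspace sketch for a radix guest (illustrative only; the
 * process-table value is a placeholder that must carry the GR bit and a
 * size field <= 24, matching the checks above):
 *
 *	struct kvm_ppc_mmuv3_cfg cfg = {
 *		.flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
 *		.process_table = proc_tbl_value,
 *	};
 *	ioctl(vm_fd, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
 */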
3834 
3835 static struct kvmppc_ops kvm_ops_hv = {
3836 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
3837 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
3838 	.get_one_reg = kvmppc_get_one_reg_hv,
3839 	.set_one_reg = kvmppc_set_one_reg_hv,
3840 	.vcpu_load   = kvmppc_core_vcpu_load_hv,
3841 	.vcpu_put    = kvmppc_core_vcpu_put_hv,
3842 	.set_msr     = kvmppc_set_msr_hv,
3843 	.vcpu_run    = kvmppc_vcpu_run_hv,
3844 	.vcpu_create = kvmppc_core_vcpu_create_hv,
3845 	.vcpu_free   = kvmppc_core_vcpu_free_hv,
3846 	.check_requests = kvmppc_core_check_requests_hv,
3847 	.get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv,
3848 	.flush_memslot  = kvmppc_core_flush_memslot_hv,
3849 	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
3850 	.commit_memory_region  = kvmppc_core_commit_memory_region_hv,
3851 	.unmap_hva = kvm_unmap_hva_hv,
3852 	.unmap_hva_range = kvm_unmap_hva_range_hv,
3853 	.age_hva  = kvm_age_hva_hv,
3854 	.test_age_hva = kvm_test_age_hva_hv,
3855 	.set_spte_hva = kvm_set_spte_hva_hv,
3856 	.mmu_destroy  = kvmppc_mmu_destroy_hv,
3857 	.free_memslot = kvmppc_core_free_memslot_hv,
3858 	.create_memslot = kvmppc_core_create_memslot_hv,
3859 	.init_vm =  kvmppc_core_init_vm_hv,
3860 	.destroy_vm = kvmppc_core_destroy_vm_hv,
3861 	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
3862 	.emulate_op = kvmppc_core_emulate_op_hv,
3863 	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
3864 	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
3865 	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
3866 	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
3867 	.hcall_implemented = kvmppc_hcall_impl_hv,
3868 #ifdef CONFIG_KVM_XICS
3869 	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
3870 	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
3871 #endif
3872 	.configure_mmu = kvmhv_configure_mmu,
3873 	.get_rmmu_info = kvmhv_get_rmmu_info,
3874 };
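/*
 * kvm_ops_hv is installed as kvmppc_hv_ops in kvmppc_book3s_init_hv()
 * below; the generic PowerPC KVM code then dispatches per VM through
 * kvm->arch.kvm_ops (roughly kvm->arch.kvm_ops->vcpu_run(run, vcpu) and
 * so on), which is how the HV and PR implementations can coexist.
 */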
3875 
3876 static int kvm_init_subcore_bitmap(void)
3877 {
3878 	int i, j;
3879 	int nr_cores = cpu_nr_cores();
3880 	struct sibling_subcore_state *sibling_subcore_state;
3881 
3882 	for (i = 0; i < nr_cores; i++) {
3883 		int first_cpu = i * threads_per_core;
3884 		int node = cpu_to_node(first_cpu);
3885 
3886 		/* Ignore if it is already allocated. */
3887 		if (paca[first_cpu].sibling_subcore_state)
3888 			continue;
3889 
3890 		sibling_subcore_state =
3891 			kzalloc_node(sizeof(struct sibling_subcore_state),
3892 							GFP_KERNEL, node);
3893 		if (!sibling_subcore_state)
3894 			return -ENOMEM;
3898 
3899 		for (j = 0; j < threads_per_core; j++) {
3900 			int cpu = first_cpu + j;
3901 
3902 			paca[cpu].sibling_subcore_state = sibling_subcore_state;
3903 		}
3904 	}
3905 	return 0;
3906 }
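/*
 * Every hardware thread of a core shares the one sibling_subcore_state
 * allocated here on that core's NUMA node; the HMI handling code uses it
 * to coordinate the threads of a core (e.g. for timebase resync) when a
 * Hypervisor Maintenance Interrupt is processed.
 */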
3907 
3908 static int kvmppc_radix_possible(void)
3909 {
3910 	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
3911 }
3912 
3913 static int kvmppc_book3s_init_hv(void)
3914 {
3915 	int r;
3916 	/*
3917 	 * FIXME!! Do we need to check on all CPUs?
3918 	 */
3919 	r = kvmppc_core_check_processor_compat_hv();
3920 	if (r < 0)
3921 		return -ENODEV;
3922 
3923 	r = kvm_init_subcore_bitmap();
3924 	if (r)
3925 		return r;
3926 
3927 	/*
3928 	 * We need a way of accessing the XICS interrupt controller,
3929 	 * either directly, via paca[cpu].kvm_hstate.xics_phys, or
3930 	 * indirectly, via OPAL.
3931 	 */
3932 #ifdef CONFIG_SMP
3933 	if (!get_paca()->kvm_hstate.xics_phys) {
3934 		struct device_node *np;
3935 
3936 		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
3937 		if (!np) {
3938 			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
3939 			return -ENODEV;
3940 		}
3941 	}
3942 #endif
3943 
3944 	kvm_ops_hv.owner = THIS_MODULE;
3945 	kvmppc_hv_ops = &kvm_ops_hv;
3946 
3947 	init_default_hcalls();
3948 
3949 	init_vcore_lists();
3950 
3951 	r = kvmppc_mmu_hv_init();
3952 	if (r)
3953 		return r;
3954 
3955 	if (kvmppc_radix_possible())
3956 		r = kvmppc_radix_init();
3957 	return r;
3958 }
3959 
3960 static void kvmppc_book3s_exit_hv(void)
3961 {
3962 	kvmppc_free_host_rm_ops();
3963 	if (kvmppc_radix_possible())
3964 		kvmppc_radix_exit();
3965 	kvmppc_hv_ops = NULL;
3966 }
3967 
3968 module_init(kvmppc_book3s_init_hv);
3969 module_exit(kvmppc_book3s_exit_hv);
3970 MODULE_LICENSE("GPL");
3971 MODULE_ALIAS_MISCDEV(KVM_MINOR);
3972 MODULE_ALIAS("devname:kvm");
3973 
3974