xref: /openbmc/linux/arch/s390/kvm/kvm-s390.c (revision 93f5715e)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * hosting IBM Z kernel virtual machines (s390x)
4  *
5  * Copyright IBM Corp. 2008, 2018
6  *
7  *    Author(s): Carsten Otte <cotte@de.ibm.com>
8  *               Christian Borntraeger <borntraeger@de.ibm.com>
9  *               Heiko Carstens <heiko.carstens@de.ibm.com>
10  *               Christian Ehrhardt <ehrhardt@de.ibm.com>
11  *               Jason J. Herne <jjherne@us.ibm.com>
12  */
13 
14 #include <linux/compiler.h>
15 #include <linux/err.h>
16 #include <linux/fs.h>
17 #include <linux/hrtimer.h>
18 #include <linux/init.h>
19 #include <linux/kvm.h>
20 #include <linux/kvm_host.h>
21 #include <linux/mman.h>
22 #include <linux/module.h>
23 #include <linux/moduleparam.h>
24 #include <linux/random.h>
25 #include <linux/slab.h>
26 #include <linux/timer.h>
27 #include <linux/vmalloc.h>
28 #include <linux/bitmap.h>
29 #include <linux/sched/signal.h>
30 #include <linux/string.h>
31 
32 #include <asm/asm-offsets.h>
33 #include <asm/lowcore.h>
34 #include <asm/stp.h>
35 #include <asm/pgtable.h>
36 #include <asm/gmap.h>
37 #include <asm/nmi.h>
38 #include <asm/switch_to.h>
39 #include <asm/isc.h>
40 #include <asm/sclp.h>
41 #include <asm/cpacf.h>
42 #include <asm/timex.h>
43 #include "kvm-s390.h"
44 #include "gaccess.h"
45 
46 #define KMSG_COMPONENT "kvm-s390"
47 #undef pr_fmt
48 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
49 
50 #define CREATE_TRACE_POINTS
51 #include "trace.h"
52 #include "trace-s390.h"
53 
54 #define MEM_OP_MAX_SIZE 65536	/* Maximum transfer size for KVM_S390_MEM_OP */
55 #define LOCAL_IRQS 32
56 #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
57 			   (KVM_MAX_VCPUS + LOCAL_IRQS))
58 
59 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
60 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
61 
62 struct kvm_stats_debugfs_item debugfs_entries[] = {
63 	{ "userspace_handled", VCPU_STAT(exit_userspace) },
64 	{ "exit_null", VCPU_STAT(exit_null) },
65 	{ "exit_validity", VCPU_STAT(exit_validity) },
66 	{ "exit_stop_request", VCPU_STAT(exit_stop_request) },
67 	{ "exit_external_request", VCPU_STAT(exit_external_request) },
68 	{ "exit_io_request", VCPU_STAT(exit_io_request) },
69 	{ "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
70 	{ "exit_instruction", VCPU_STAT(exit_instruction) },
71 	{ "exit_pei", VCPU_STAT(exit_pei) },
72 	{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
73 	{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
74 	{ "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
75 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
76 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
77 	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
78 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
79 	{ "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
80 	{ "instruction_lctl", VCPU_STAT(instruction_lctl) },
81 	{ "instruction_stctl", VCPU_STAT(instruction_stctl) },
82 	{ "instruction_stctg", VCPU_STAT(instruction_stctg) },
83 	{ "deliver_ckc", VCPU_STAT(deliver_ckc) },
84 	{ "deliver_cputm", VCPU_STAT(deliver_cputm) },
85 	{ "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
86 	{ "deliver_external_call", VCPU_STAT(deliver_external_call) },
87 	{ "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
88 	{ "deliver_virtio", VCPU_STAT(deliver_virtio) },
89 	{ "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
90 	{ "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
91 	{ "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
92 	{ "deliver_program", VCPU_STAT(deliver_program) },
93 	{ "deliver_io", VCPU_STAT(deliver_io) },
94 	{ "deliver_machine_check", VCPU_STAT(deliver_machine_check) },
95 	{ "exit_wait_state", VCPU_STAT(exit_wait_state) },
96 	{ "inject_ckc", VCPU_STAT(inject_ckc) },
97 	{ "inject_cputm", VCPU_STAT(inject_cputm) },
98 	{ "inject_external_call", VCPU_STAT(inject_external_call) },
99 	{ "inject_float_mchk", VM_STAT(inject_float_mchk) },
100 	{ "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) },
101 	{ "inject_io", VM_STAT(inject_io) },
102 	{ "inject_mchk", VCPU_STAT(inject_mchk) },
103 	{ "inject_pfault_done", VM_STAT(inject_pfault_done) },
104 	{ "inject_program", VCPU_STAT(inject_program) },
105 	{ "inject_restart", VCPU_STAT(inject_restart) },
106 	{ "inject_service_signal", VM_STAT(inject_service_signal) },
107 	{ "inject_set_prefix", VCPU_STAT(inject_set_prefix) },
108 	{ "inject_stop_signal", VCPU_STAT(inject_stop_signal) },
109 	{ "inject_pfault_init", VCPU_STAT(inject_pfault_init) },
110 	{ "inject_virtio", VM_STAT(inject_virtio) },
111 	{ "instruction_epsw", VCPU_STAT(instruction_epsw) },
112 	{ "instruction_gs", VCPU_STAT(instruction_gs) },
113 	{ "instruction_io_other", VCPU_STAT(instruction_io_other) },
114 	{ "instruction_lpsw", VCPU_STAT(instruction_lpsw) },
115 	{ "instruction_lpswe", VCPU_STAT(instruction_lpswe) },
116 	{ "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
117 	{ "instruction_ptff", VCPU_STAT(instruction_ptff) },
118 	{ "instruction_stidp", VCPU_STAT(instruction_stidp) },
119 	{ "instruction_sck", VCPU_STAT(instruction_sck) },
120 	{ "instruction_sckpf", VCPU_STAT(instruction_sckpf) },
121 	{ "instruction_spx", VCPU_STAT(instruction_spx) },
122 	{ "instruction_stpx", VCPU_STAT(instruction_stpx) },
123 	{ "instruction_stap", VCPU_STAT(instruction_stap) },
124 	{ "instruction_iske", VCPU_STAT(instruction_iske) },
125 	{ "instruction_ri", VCPU_STAT(instruction_ri) },
126 	{ "instruction_rrbe", VCPU_STAT(instruction_rrbe) },
127 	{ "instruction_sske", VCPU_STAT(instruction_sske) },
128 	{ "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
129 	{ "instruction_essa", VCPU_STAT(instruction_essa) },
130 	{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
131 	{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
132 	{ "instruction_tb", VCPU_STAT(instruction_tb) },
133 	{ "instruction_tpi", VCPU_STAT(instruction_tpi) },
134 	{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
135 	{ "instruction_tsch", VCPU_STAT(instruction_tsch) },
136 	{ "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
137 	{ "instruction_sie", VCPU_STAT(instruction_sie) },
138 	{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
139 	{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
140 	{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
141 	{ "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
142 	{ "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) },
143 	{ "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) },
144 	{ "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
145 	{ "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) },
146 	{ "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) },
147 	{ "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) },
148 	{ "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
149 	{ "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
150 	{ "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
151 	{ "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) },
152 	{ "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) },
153 	{ "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) },
154 	{ "instruction_diag_10", VCPU_STAT(diagnose_10) },
155 	{ "instruction_diag_44", VCPU_STAT(diagnose_44) },
156 	{ "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
157 	{ "instruction_diag_258", VCPU_STAT(diagnose_258) },
158 	{ "instruction_diag_308", VCPU_STAT(diagnose_308) },
159 	{ "instruction_diag_500", VCPU_STAT(diagnose_500) },
160 	{ "instruction_diag_other", VCPU_STAT(diagnose_other) },
161 	{ NULL }
162 };
163 
164 struct kvm_s390_tod_clock_ext {
165 	__u8 epoch_idx;
166 	__u64 tod;
167 	__u8 reserved[7];
168 } __packed;
169 
170 /* allow nested virtualization in KVM (if enabled by user space) */
171 static int nested;
172 module_param(nested, int, S_IRUGO);
173 MODULE_PARM_DESC(nested, "Nested virtualization support");
174 
175 /* allow 1m huge page guest backing, if !nested */
176 static int hpage;
177 module_param(hpage, int, 0444);
178 MODULE_PARM_DESC(hpage, "1m huge page backing support");
179 
180 /*
181  * For now we handle at most 16 double words, as this is what the s390 base
182  * kernel handles and stores in the prefix page. Going beyond this would
183  * require code changes, but the external uapi can stay.
184  */
185 #define SIZE_INTERNAL 16
186 
187 /*
188  * Base feature mask that defines default mask for facilities. Consists of the
189  * defines in FACILITIES_KVM and the non-hypervisor managed bits.
190  */
191 static unsigned long kvm_s390_fac_base[SIZE_INTERNAL] = { FACILITIES_KVM };
192 /*
193  * Extended feature mask. Consists of the defines in FACILITIES_KVM_CPUMODEL
194  * and defines the facilities that can be enabled via a cpu model.
195  */
196 static unsigned long kvm_s390_fac_ext[SIZE_INTERNAL] = { FACILITIES_KVM_CPUMODEL };
197 
198 static unsigned long kvm_s390_fac_size(void)
199 {
200 	BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64);
201 	BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64);
202 	BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) >
203 		sizeof(S390_lowcore.stfle_fac_list));
204 
205 	return SIZE_INTERNAL;
206 }
207 
208 /* available cpu features supported by kvm */
209 static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
210 /* available subfunctions indicated via query / "test bit" */
211 static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
212 
213 static struct gmap_notifier gmap_notifier;
214 static struct gmap_notifier vsie_gmap_notifier;
215 debug_info_t *kvm_s390_dbf;
216 
217 /* Section: not file related */
218 int kvm_arch_hardware_enable(void)
219 {
220 	/* every s390 is virtualization enabled ;-) */
221 	return 0;
222 }
223 
224 static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
225 			      unsigned long end);
226 
227 static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
228 {
229 	u8 delta_idx = 0;
230 
231 	/*
232 	 * The TOD jumps by delta; we have to compensate for this by adding
233 	 * -delta to the epoch.
234 	 */
235 	delta = -delta;
236 
237 	/* sign-extension - we're adding to signed values below */
238 	if ((s64)delta < 0)
239 		delta_idx = -1;
240 
241 	scb->epoch += delta;
242 	if (scb->ecd & ECD_MEF) {
243 		scb->epdx += delta_idx;
244 		if (scb->epoch < delta)
245 			scb->epdx += 1;
246 	}
247 }
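/*
 * Illustrative note (added): in kvm_clock_sync_scb() above, the check
 * "scb->epoch < delta" after the 64-bit addition is true exactly when the
 * addition wrapped around, so the carry is propagated into the epoch
 * extension (epdx), while delta_idx supplies the sign extension of delta
 * for that high part.  With the multiple-epoch facility (ECD_MEF) this
 * amounts to a 128-bit adjustment of the guest epoch.
 */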
248 
249 /*
250  * This callback is executed during stop_machine(). All CPUs are therefore
251  * temporarily stopped. In order not to change guest behavior, we have to
252  * disable preemption whenever we touch the epoch of kvm and the VCPUs,
253  * so a CPU won't be stopped while calculating with the epoch.
254  */
255 static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
256 			  void *v)
257 {
258 	struct kvm *kvm;
259 	struct kvm_vcpu *vcpu;
260 	int i;
261 	unsigned long long *delta = v;
262 
263 	list_for_each_entry(kvm, &vm_list, vm_list) {
264 		kvm_for_each_vcpu(i, vcpu, kvm) {
265 			kvm_clock_sync_scb(vcpu->arch.sie_block, *delta);
266 			if (i == 0) {
267 				kvm->arch.epoch = vcpu->arch.sie_block->epoch;
268 				kvm->arch.epdx = vcpu->arch.sie_block->epdx;
269 			}
270 			if (vcpu->arch.cputm_enabled)
271 				vcpu->arch.cputm_start += *delta;
272 			if (vcpu->arch.vsie_block)
273 				kvm_clock_sync_scb(vcpu->arch.vsie_block,
274 						   *delta);
275 		}
276 	}
277 	return NOTIFY_OK;
278 }
279 
280 static struct notifier_block kvm_clock_notifier = {
281 	.notifier_call = kvm_clock_sync,
282 };
283 
284 int kvm_arch_hardware_setup(void)
285 {
286 	gmap_notifier.notifier_call = kvm_gmap_notifier;
287 	gmap_register_pte_notifier(&gmap_notifier);
288 	vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
289 	gmap_register_pte_notifier(&vsie_gmap_notifier);
290 	atomic_notifier_chain_register(&s390_epoch_delta_notifier,
291 				       &kvm_clock_notifier);
292 	return 0;
293 }
294 
295 void kvm_arch_hardware_unsetup(void)
296 {
297 	gmap_unregister_pte_notifier(&gmap_notifier);
298 	gmap_unregister_pte_notifier(&vsie_gmap_notifier);
299 	atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
300 					 &kvm_clock_notifier);
301 }
302 
303 static void allow_cpu_feat(unsigned long nr)
304 {
305 	set_bit_inv(nr, kvm_s390_available_cpu_feat);
306 }
307 
308 static inline int plo_test_bit(unsigned char nr)
309 {
310 	register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
311 	int cc;
312 
313 	asm volatile(
314 		/* Parameter registers are ignored for "test bit" */
315 		"	plo	0,0,0,0(0)\n"
316 		"	ipm	%0\n"
317 		"	srl	%0,28\n"
318 		: "=d" (cc)
319 		: "d" (r0)
320 		: "cc");
321 	return cc == 0;
322 }
323 
324 static void kvm_s390_cpu_feat_init(void)
325 {
326 	int i;
327 
328 	for (i = 0; i < 256; ++i) {
329 		if (plo_test_bit(i))
330 			kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
331 	}
332 
333 	if (test_facility(28)) /* TOD-clock steering */
334 		ptff(kvm_s390_available_subfunc.ptff,
335 		     sizeof(kvm_s390_available_subfunc.ptff),
336 		     PTFF_QAF);
337 
338 	if (test_facility(17)) { /* MSA */
339 		__cpacf_query(CPACF_KMAC, (cpacf_mask_t *)
340 			      kvm_s390_available_subfunc.kmac);
341 		__cpacf_query(CPACF_KMC, (cpacf_mask_t *)
342 			      kvm_s390_available_subfunc.kmc);
343 		__cpacf_query(CPACF_KM, (cpacf_mask_t *)
344 			      kvm_s390_available_subfunc.km);
345 		__cpacf_query(CPACF_KIMD, (cpacf_mask_t *)
346 			      kvm_s390_available_subfunc.kimd);
347 		__cpacf_query(CPACF_KLMD, (cpacf_mask_t *)
348 			      kvm_s390_available_subfunc.klmd);
349 	}
350 	if (test_facility(76)) /* MSA3 */
351 		__cpacf_query(CPACF_PCKMO, (cpacf_mask_t *)
352 			      kvm_s390_available_subfunc.pckmo);
353 	if (test_facility(77)) { /* MSA4 */
354 		__cpacf_query(CPACF_KMCTR, (cpacf_mask_t *)
355 			      kvm_s390_available_subfunc.kmctr);
356 		__cpacf_query(CPACF_KMF, (cpacf_mask_t *)
357 			      kvm_s390_available_subfunc.kmf);
358 		__cpacf_query(CPACF_KMO, (cpacf_mask_t *)
359 			      kvm_s390_available_subfunc.kmo);
360 		__cpacf_query(CPACF_PCC, (cpacf_mask_t *)
361 			      kvm_s390_available_subfunc.pcc);
362 	}
363 	if (test_facility(57)) /* MSA5 */
364 		__cpacf_query(CPACF_PRNO, (cpacf_mask_t *)
365 			      kvm_s390_available_subfunc.ppno);
366 
367 	if (test_facility(146)) /* MSA8 */
368 		__cpacf_query(CPACF_KMA, (cpacf_mask_t *)
369 			      kvm_s390_available_subfunc.kma);
370 
371 	if (MACHINE_HAS_ESOP)
372 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
373 	/*
374 	 * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
375 	 * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
376 	 */
377 	if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
378 	    !test_facility(3) || !nested)
379 		return;
380 	allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
381 	if (sclp.has_64bscao)
382 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
383 	if (sclp.has_siif)
384 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
385 	if (sclp.has_gpere)
386 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
387 	if (sclp.has_gsls)
388 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
389 	if (sclp.has_ib)
390 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
391 	if (sclp.has_cei)
392 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
393 	if (sclp.has_ibs)
394 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
395 	if (sclp.has_kss)
396 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_KSS);
397 	/*
398 	 * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
399 	 * all skey handling functions read/set the skey from the PGSTE
400 	 * instead of the real storage key.
401 	 *
402 	 * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will cause
403 	 * resident pages to be wrongly detected as preserved.
404 	 *
405 	 * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
406 	 * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
407 	 *
408 	 * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
409 	 * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
410 	 * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
411 	 *
412 	 * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
413 	 * cannot easily shadow the SCA because of the ipte lock.
414 	 */
415 }
416 
417 int kvm_arch_init(void *opaque)
418 {
419 	kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
420 	if (!kvm_s390_dbf)
421 		return -ENOMEM;
422 
423 	if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) {
424 		debug_unregister(kvm_s390_dbf);
425 		return -ENOMEM;
426 	}
427 
428 	kvm_s390_cpu_feat_init();
429 
430 	/* Register floating interrupt controller interface. */
431 	return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
432 }
433 
434 void kvm_arch_exit(void)
435 {
436 	debug_unregister(kvm_s390_dbf);
437 }
438 
439 /* Section: device related */
440 long kvm_arch_dev_ioctl(struct file *filp,
441 			unsigned int ioctl, unsigned long arg)
442 {
443 	if (ioctl == KVM_S390_ENABLE_SIE)
444 		return s390_enable_sie();
445 	return -EINVAL;
446 }
447 
448 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
449 {
450 	int r;
451 
452 	switch (ext) {
453 	case KVM_CAP_S390_PSW:
454 	case KVM_CAP_S390_GMAP:
455 	case KVM_CAP_SYNC_MMU:
456 #ifdef CONFIG_KVM_S390_UCONTROL
457 	case KVM_CAP_S390_UCONTROL:
458 #endif
459 	case KVM_CAP_ASYNC_PF:
460 	case KVM_CAP_SYNC_REGS:
461 	case KVM_CAP_ONE_REG:
462 	case KVM_CAP_ENABLE_CAP:
463 	case KVM_CAP_S390_CSS_SUPPORT:
464 	case KVM_CAP_IOEVENTFD:
465 	case KVM_CAP_DEVICE_CTRL:
466 	case KVM_CAP_ENABLE_CAP_VM:
467 	case KVM_CAP_S390_IRQCHIP:
468 	case KVM_CAP_VM_ATTRIBUTES:
469 	case KVM_CAP_MP_STATE:
470 	case KVM_CAP_IMMEDIATE_EXIT:
471 	case KVM_CAP_S390_INJECT_IRQ:
472 	case KVM_CAP_S390_USER_SIGP:
473 	case KVM_CAP_S390_USER_STSI:
474 	case KVM_CAP_S390_SKEYS:
475 	case KVM_CAP_S390_IRQ_STATE:
476 	case KVM_CAP_S390_USER_INSTR0:
477 	case KVM_CAP_S390_CMMA_MIGRATION:
478 	case KVM_CAP_S390_AIS:
479 	case KVM_CAP_S390_AIS_MIGRATION:
480 		r = 1;
481 		break;
482 	case KVM_CAP_S390_HPAGE_1M:
483 		r = 0;
484 		if (hpage)
485 			r = 1;
486 		break;
487 	case KVM_CAP_S390_MEM_OP:
488 		r = MEM_OP_MAX_SIZE;
489 		break;
490 	case KVM_CAP_NR_VCPUS:
491 	case KVM_CAP_MAX_VCPUS:
492 		r = KVM_S390_BSCA_CPU_SLOTS;
493 		if (!kvm_s390_use_sca_entries())
494 			r = KVM_MAX_VCPUS;
495 		else if (sclp.has_esca && sclp.has_64bscao)
496 			r = KVM_S390_ESCA_CPU_SLOTS;
497 		break;
498 	case KVM_CAP_NR_MEMSLOTS:
499 		r = KVM_USER_MEM_SLOTS;
500 		break;
501 	case KVM_CAP_S390_COW:
502 		r = MACHINE_HAS_ESOP;
503 		break;
504 	case KVM_CAP_S390_VECTOR_REGISTERS:
505 		r = MACHINE_HAS_VX;
506 		break;
507 	case KVM_CAP_S390_RI:
508 		r = test_facility(64);
509 		break;
510 	case KVM_CAP_S390_GS:
511 		r = test_facility(133);
512 		break;
513 	case KVM_CAP_S390_BPB:
514 		r = test_facility(82);
515 		break;
516 	default:
517 		r = 0;
518 	}
519 	return r;
520 }
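/*
 * Illustrative only (not part of the original file): a minimal userspace
 * sketch of querying two of the extensions handled above.  "vm_fd" is an
 * assumed, already created VM file descriptor; error handling is omitted.
 *
 *	int max_mem_op = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_MEM_OP);
 *	int has_hpage  = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_HPAGE_1M);
 *
 * A positive max_mem_op is the maximum transfer size for KVM_S390_MEM_OP,
 * and has_hpage is 1 when 1m huge page backing can be enabled.
 */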
521 
522 static void kvm_s390_sync_dirty_log(struct kvm *kvm,
523 				    struct kvm_memory_slot *memslot)
524 {
525 	int i;
526 	gfn_t cur_gfn, last_gfn;
527 	unsigned long gaddr, vmaddr;
528 	struct gmap *gmap = kvm->arch.gmap;
529 	DECLARE_BITMAP(bitmap, _PAGE_ENTRIES);
530 
531 	/* Loop over all guest segments */
532 	cur_gfn = memslot->base_gfn;
533 	last_gfn = memslot->base_gfn + memslot->npages;
534 	for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) {
535 		gaddr = gfn_to_gpa(cur_gfn);
536 		vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
537 		if (kvm_is_error_hva(vmaddr))
538 			continue;
539 
540 		bitmap_zero(bitmap, _PAGE_ENTRIES);
541 		gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
542 		for (i = 0; i < _PAGE_ENTRIES; i++) {
543 			if (test_bit(i, bitmap))
544 				mark_page_dirty(kvm, cur_gfn + i);
545 		}
546 
547 		if (fatal_signal_pending(current))
548 			return;
549 		cond_resched();
550 	}
551 }
552 
553 /* Section: vm related */
554 static void sca_del_vcpu(struct kvm_vcpu *vcpu);
555 
556 /*
557  * Get (and clear) the dirty memory log for a memory slot.
558  */
559 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
560 			       struct kvm_dirty_log *log)
561 {
562 	int r;
563 	unsigned long n;
564 	struct kvm_memslots *slots;
565 	struct kvm_memory_slot *memslot;
566 	int is_dirty = 0;
567 
568 	if (kvm_is_ucontrol(kvm))
569 		return -EINVAL;
570 
571 	mutex_lock(&kvm->slots_lock);
572 
573 	r = -EINVAL;
574 	if (log->slot >= KVM_USER_MEM_SLOTS)
575 		goto out;
576 
577 	slots = kvm_memslots(kvm);
578 	memslot = id_to_memslot(slots, log->slot);
579 	r = -ENOENT;
580 	if (!memslot->dirty_bitmap)
581 		goto out;
582 
583 	kvm_s390_sync_dirty_log(kvm, memslot);
584 	r = kvm_get_dirty_log(kvm, log, &is_dirty);
585 	if (r)
586 		goto out;
587 
588 	/* Clear the dirty log */
589 	if (is_dirty) {
590 		n = kvm_dirty_bitmap_bytes(memslot);
591 		memset(memslot->dirty_bitmap, 0, n);
592 	}
593 	r = 0;
594 out:
595 	mutex_unlock(&kvm->slots_lock);
596 	return r;
597 }
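/*
 * Illustrative only: a hypothetical userspace sketch of retrieving the
 * dirty log for memory slot 0.  "vm_fd" and "slot_pages" are assumed to
 * exist, the bitmap sizing is a rough over-allocation, and error handling
 * is omitted:
 *
 *	struct kvm_dirty_log log = { .slot = 0 };
 *
 *	log.dirty_bitmap = calloc(slot_pages / 8 + sizeof(long), 1);
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 */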
598 
599 static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
600 {
601 	unsigned int i;
602 	struct kvm_vcpu *vcpu;
603 
604 	kvm_for_each_vcpu(i, vcpu, kvm) {
605 		kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
606 	}
607 }
608 
609 static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
610 {
611 	int r;
612 
613 	if (cap->flags)
614 		return -EINVAL;
615 
616 	switch (cap->cap) {
617 	case KVM_CAP_S390_IRQCHIP:
618 		VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_IRQCHIP");
619 		kvm->arch.use_irqchip = 1;
620 		r = 0;
621 		break;
622 	case KVM_CAP_S390_USER_SIGP:
623 		VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_SIGP");
624 		kvm->arch.user_sigp = 1;
625 		r = 0;
626 		break;
627 	case KVM_CAP_S390_VECTOR_REGISTERS:
628 		mutex_lock(&kvm->lock);
629 		if (kvm->created_vcpus) {
630 			r = -EBUSY;
631 		} else if (MACHINE_HAS_VX) {
632 			set_kvm_facility(kvm->arch.model.fac_mask, 129);
633 			set_kvm_facility(kvm->arch.model.fac_list, 129);
634 			if (test_facility(134)) {
635 				set_kvm_facility(kvm->arch.model.fac_mask, 134);
636 				set_kvm_facility(kvm->arch.model.fac_list, 134);
637 			}
638 			if (test_facility(135)) {
639 				set_kvm_facility(kvm->arch.model.fac_mask, 135);
640 				set_kvm_facility(kvm->arch.model.fac_list, 135);
641 			}
642 			r = 0;
643 		} else
644 			r = -EINVAL;
645 		mutex_unlock(&kvm->lock);
646 		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_VECTOR_REGISTERS %s",
647 			 r ? "(not available)" : "(success)");
648 		break;
649 	case KVM_CAP_S390_RI:
650 		r = -EINVAL;
651 		mutex_lock(&kvm->lock);
652 		if (kvm->created_vcpus) {
653 			r = -EBUSY;
654 		} else if (test_facility(64)) {
655 			set_kvm_facility(kvm->arch.model.fac_mask, 64);
656 			set_kvm_facility(kvm->arch.model.fac_list, 64);
657 			r = 0;
658 		}
659 		mutex_unlock(&kvm->lock);
660 		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_RI %s",
661 			 r ? "(not available)" : "(success)");
662 		break;
663 	case KVM_CAP_S390_AIS:
664 		mutex_lock(&kvm->lock);
665 		if (kvm->created_vcpus) {
666 			r = -EBUSY;
667 		} else {
668 			set_kvm_facility(kvm->arch.model.fac_mask, 72);
669 			set_kvm_facility(kvm->arch.model.fac_list, 72);
670 			r = 0;
671 		}
672 		mutex_unlock(&kvm->lock);
673 		VM_EVENT(kvm, 3, "ENABLE: AIS %s",
674 			 r ? "(not available)" : "(success)");
675 		break;
676 	case KVM_CAP_S390_GS:
677 		r = -EINVAL;
678 		mutex_lock(&kvm->lock);
679 		if (kvm->created_vcpus) {
680 			r = -EBUSY;
681 		} else if (test_facility(133)) {
682 			set_kvm_facility(kvm->arch.model.fac_mask, 133);
683 			set_kvm_facility(kvm->arch.model.fac_list, 133);
684 			r = 0;
685 		}
686 		mutex_unlock(&kvm->lock);
687 		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s",
688 			 r ? "(not available)" : "(success)");
689 		break;
690 	case KVM_CAP_S390_HPAGE_1M:
691 		mutex_lock(&kvm->lock);
692 		if (kvm->created_vcpus)
693 			r = -EBUSY;
694 		else if (!hpage || kvm->arch.use_cmma)
695 			r = -EINVAL;
696 		else {
697 			r = 0;
698 			kvm->mm->context.allow_gmap_hpage_1m = 1;
699 			/*
700 			 * We might have to create fake 4k page
701 			 * tables. To keep the hardware from working on
702 			 * stale PGSTEs, we emulate these instructions.
703 			 */
704 			kvm->arch.use_skf = 0;
705 			kvm->arch.use_pfmfi = 0;
706 		}
707 		mutex_unlock(&kvm->lock);
708 		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_HPAGE %s",
709 			 r ? "(not available)" : "(success)");
710 		break;
711 	case KVM_CAP_S390_USER_STSI:
712 		VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI");
713 		kvm->arch.user_stsi = 1;
714 		r = 0;
715 		break;
716 	case KVM_CAP_S390_USER_INSTR0:
717 		VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
718 		kvm->arch.user_instr0 = 1;
719 		icpt_operexc_on_all_vcpus(kvm);
720 		r = 0;
721 		break;
722 	default:
723 		r = -EINVAL;
724 		break;
725 	}
726 	return r;
727 }
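/*
 * Illustrative only: a minimal userspace sketch (assumed "vm_fd", no error
 * handling) of enabling one of the VM capabilities handled above.  Several
 * of them return -EBUSY once VCPUs exist, so this is typically done right
 * after VM creation:
 *
 *	struct kvm_enable_cap cap = { .cap = KVM_CAP_S390_USER_SIGP };
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */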
728 
729 static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
730 {
731 	int ret;
732 
733 	switch (attr->attr) {
734 	case KVM_S390_VM_MEM_LIMIT_SIZE:
735 		ret = 0;
736 		VM_EVENT(kvm, 3, "QUERY: max guest memory: %lu bytes",
737 			 kvm->arch.mem_limit);
738 		if (put_user(kvm->arch.mem_limit, (u64 __user *)attr->addr))
739 			ret = -EFAULT;
740 		break;
741 	default:
742 		ret = -ENXIO;
743 		break;
744 	}
745 	return ret;
746 }
747 
748 static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
749 {
750 	int ret;
751 	unsigned int idx;
752 	switch (attr->attr) {
753 	case KVM_S390_VM_MEM_ENABLE_CMMA:
754 		ret = -ENXIO;
755 		if (!sclp.has_cmma)
756 			break;
757 
758 		VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
759 		mutex_lock(&kvm->lock);
760 		if (kvm->created_vcpus)
761 			ret = -EBUSY;
762 		else if (kvm->mm->context.allow_gmap_hpage_1m)
763 			ret = -EINVAL;
764 		else {
765 			kvm->arch.use_cmma = 1;
766 			/* Not compatible with cmma. */
767 			kvm->arch.use_pfmfi = 0;
768 			ret = 0;
769 		}
770 		mutex_unlock(&kvm->lock);
771 		break;
772 	case KVM_S390_VM_MEM_CLR_CMMA:
773 		ret = -ENXIO;
774 		if (!sclp.has_cmma)
775 			break;
776 		ret = -EINVAL;
777 		if (!kvm->arch.use_cmma)
778 			break;
779 
780 		VM_EVENT(kvm, 3, "%s", "RESET: CMMA states");
781 		mutex_lock(&kvm->lock);
782 		idx = srcu_read_lock(&kvm->srcu);
783 		s390_reset_cmma(kvm->arch.gmap->mm);
784 		srcu_read_unlock(&kvm->srcu, idx);
785 		mutex_unlock(&kvm->lock);
786 		ret = 0;
787 		break;
788 	case KVM_S390_VM_MEM_LIMIT_SIZE: {
789 		unsigned long new_limit;
790 
791 		if (kvm_is_ucontrol(kvm))
792 			return -EINVAL;
793 
794 		if (get_user(new_limit, (u64 __user *)attr->addr))
795 			return -EFAULT;
796 
797 		if (kvm->arch.mem_limit != KVM_S390_NO_MEM_LIMIT &&
798 		    new_limit > kvm->arch.mem_limit)
799 			return -E2BIG;
800 
801 		if (!new_limit)
802 			return -EINVAL;
803 
804 		/* gmap_create takes last usable address */
805 		if (new_limit != KVM_S390_NO_MEM_LIMIT)
806 			new_limit -= 1;
807 
808 		ret = -EBUSY;
809 		mutex_lock(&kvm->lock);
810 		if (!kvm->created_vcpus) {
811 			/* gmap_create will round the limit up */
812 			struct gmap *new = gmap_create(current->mm, new_limit);
813 
814 			if (!new) {
815 				ret = -ENOMEM;
816 			} else {
817 				gmap_remove(kvm->arch.gmap);
818 				new->private = kvm;
819 				kvm->arch.gmap = new;
820 				ret = 0;
821 			}
822 		}
823 		mutex_unlock(&kvm->lock);
824 		VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit);
825 		VM_EVENT(kvm, 3, "New guest asce: 0x%pK",
826 			 (void *) kvm->arch.gmap->asce);
827 		break;
828 	}
829 	default:
830 		ret = -ENXIO;
831 		break;
832 	}
833 	return ret;
834 }
835 
836 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu);
837 
838 void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm)
839 {
840 	struct kvm_vcpu *vcpu;
841 	int i;
842 
843 	kvm_s390_vcpu_block_all(kvm);
844 
845 	kvm_for_each_vcpu(i, vcpu, kvm)
846 		kvm_s390_vcpu_crypto_setup(vcpu);
847 
848 	kvm_s390_vcpu_unblock_all(kvm);
849 }
850 
851 static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
852 {
853 	if (!test_kvm_facility(kvm, 76))
854 		return -EINVAL;
855 
856 	mutex_lock(&kvm->lock);
857 	switch (attr->attr) {
858 	case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
859 		get_random_bytes(
860 			kvm->arch.crypto.crycb->aes_wrapping_key_mask,
861 			sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
862 		kvm->arch.crypto.aes_kw = 1;
863 		VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support");
864 		break;
865 	case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
866 		get_random_bytes(
867 			kvm->arch.crypto.crycb->dea_wrapping_key_mask,
868 			sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
869 		kvm->arch.crypto.dea_kw = 1;
870 		VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support");
871 		break;
872 	case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
873 		kvm->arch.crypto.aes_kw = 0;
874 		memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0,
875 			sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
876 		VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support");
877 		break;
878 	case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
879 		kvm->arch.crypto.dea_kw = 0;
880 		memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0,
881 			sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
882 		VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support");
883 		break;
884 	default:
885 		mutex_unlock(&kvm->lock);
886 		return -ENXIO;
887 	}
888 
889 	kvm_s390_vcpu_crypto_reset_all(kvm);
890 	mutex_unlock(&kvm->lock);
891 	return 0;
892 }
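/*
 * Illustrative only: a hypothetical userspace sketch of enabling AES key
 * wrapping through the attribute interface above (assumed "vm_fd", no
 * error handling; the guest model must include facility 76 / MSA3):
 *
 *	struct kvm_device_attr attr = {
 *		.group = KVM_S390_VM_CRYPTO,
 *		.attr  = KVM_S390_VM_CRYPTO_ENABLE_AES_KW,
 *	};
 *
 *	ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
 */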
893 
894 static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
895 {
896 	int cx;
897 	struct kvm_vcpu *vcpu;
898 
899 	kvm_for_each_vcpu(cx, vcpu, kvm)
900 		kvm_s390_sync_request(req, vcpu);
901 }
902 
903 /*
904  * Must be called with kvm->srcu held to avoid races on memslots, and with
905  * kvm->slots_lock to avoid races with ourselves and kvm_s390_vm_stop_migration.
906  */
907 static int kvm_s390_vm_start_migration(struct kvm *kvm)
908 {
909 	struct kvm_memory_slot *ms;
910 	struct kvm_memslots *slots;
911 	unsigned long ram_pages = 0;
912 	int slotnr;
913 
914 	/* migration mode already enabled */
915 	if (kvm->arch.migration_mode)
916 		return 0;
917 	slots = kvm_memslots(kvm);
918 	if (!slots || !slots->used_slots)
919 		return -EINVAL;
920 
921 	if (!kvm->arch.use_cmma) {
922 		kvm->arch.migration_mode = 1;
923 		return 0;
924 	}
925 	/* mark all the pages in active slots as dirty */
926 	for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
927 		ms = slots->memslots + slotnr;
928 		/*
929 		 * The second half of the bitmap is only used on x86,
930 		 * and would be wasted otherwise, so we put it to good
931 		 * use here to keep track of the state of the storage
932 		 * attributes.
933 		 */
934 		memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms));
935 		ram_pages += ms->npages;
936 	}
937 	atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages);
938 	kvm->arch.migration_mode = 1;
939 	kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
940 	return 0;
941 }
942 
943 /*
944  * Must be called with kvm->slots_lock to avoid races with ourselves and
945  * kvm_s390_vm_start_migration.
946  */
947 static int kvm_s390_vm_stop_migration(struct kvm *kvm)
948 {
949 	/* migration mode already disabled */
950 	if (!kvm->arch.migration_mode)
951 		return 0;
952 	kvm->arch.migration_mode = 0;
953 	if (kvm->arch.use_cmma)
954 		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
955 	return 0;
956 }
957 
958 static int kvm_s390_vm_set_migration(struct kvm *kvm,
959 				     struct kvm_device_attr *attr)
960 {
961 	int res = -ENXIO;
962 
963 	mutex_lock(&kvm->slots_lock);
964 	switch (attr->attr) {
965 	case KVM_S390_VM_MIGRATION_START:
966 		res = kvm_s390_vm_start_migration(kvm);
967 		break;
968 	case KVM_S390_VM_MIGRATION_STOP:
969 		res = kvm_s390_vm_stop_migration(kvm);
970 		break;
971 	default:
972 		break;
973 	}
974 	mutex_unlock(&kvm->slots_lock);
975 
976 	return res;
977 }
978 
979 static int kvm_s390_vm_get_migration(struct kvm *kvm,
980 				     struct kvm_device_attr *attr)
981 {
982 	u64 mig = kvm->arch.migration_mode;
983 
984 	if (attr->attr != KVM_S390_VM_MIGRATION_STATUS)
985 		return -ENXIO;
986 
987 	if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig)))
988 		return -EFAULT;
989 	return 0;
990 }
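/*
 * Illustrative only: starting migration mode and reading it back from
 * userspace via the attribute interface above (assumed "vm_fd", no error
 * handling):
 *
 *	__u64 mig;
 *	struct kvm_device_attr attr = {
 *		.group = KVM_S390_VM_MIGRATION,
 *		.attr  = KVM_S390_VM_MIGRATION_START,
 *	};
 *
 *	ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
 *	attr.attr = KVM_S390_VM_MIGRATION_STATUS;
 *	attr.addr = (unsigned long)&mig;
 *	ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
 */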
991 
992 static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
993 {
994 	struct kvm_s390_vm_tod_clock gtod;
995 
996 	if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
997 		return -EFAULT;
998 
999 	if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx)
1000 		return -EINVAL;
1001 	kvm_s390_set_tod_clock(kvm, &gtod);
1002 
1003 	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx",
1004 		gtod.epoch_idx, gtod.tod);
1005 
1006 	return 0;
1007 }
1008 
1009 static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
1010 {
1011 	u8 gtod_high;
1012 
1013 	if (copy_from_user(&gtod_high, (void __user *)attr->addr,
1014 					   sizeof(gtod_high)))
1015 		return -EFAULT;
1016 
1017 	if (gtod_high != 0)
1018 		return -EINVAL;
1019 	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high);
1020 
1021 	return 0;
1022 }
1023 
1024 static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
1025 {
1026 	struct kvm_s390_vm_tod_clock gtod = { 0 };
1027 
1028 	if (copy_from_user(&gtod.tod, (void __user *)attr->addr,
1029 			   sizeof(gtod.tod)))
1030 		return -EFAULT;
1031 
1032 	kvm_s390_set_tod_clock(kvm, &gtod);
1033 	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod);
1034 	return 0;
1035 }
1036 
1037 static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
1038 {
1039 	int ret;
1040 
1041 	if (attr->flags)
1042 		return -EINVAL;
1043 
1044 	switch (attr->attr) {
1045 	case KVM_S390_VM_TOD_EXT:
1046 		ret = kvm_s390_set_tod_ext(kvm, attr);
1047 		break;
1048 	case KVM_S390_VM_TOD_HIGH:
1049 		ret = kvm_s390_set_tod_high(kvm, attr);
1050 		break;
1051 	case KVM_S390_VM_TOD_LOW:
1052 		ret = kvm_s390_set_tod_low(kvm, attr);
1053 		break;
1054 	default:
1055 		ret = -ENXIO;
1056 		break;
1057 	}
1058 	return ret;
1059 }
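/*
 * Illustrative only: setting the guest TOD clock from userspace with the
 * extended format (assumed "vm_fd" and "tod_value", no error handling; a
 * non-zero epoch_idx additionally requires facility 139):
 *
 *	struct kvm_s390_vm_tod_clock gtod = { .epoch_idx = 0, .tod = tod_value };
 *	struct kvm_device_attr attr = {
 *		.group = KVM_S390_VM_TOD,
 *		.attr  = KVM_S390_VM_TOD_EXT,
 *		.addr  = (unsigned long)&gtod,
 *	};
 *
 *	ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
 */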
1060 
1061 static void kvm_s390_get_tod_clock(struct kvm *kvm,
1062 				   struct kvm_s390_vm_tod_clock *gtod)
1063 {
1064 	struct kvm_s390_tod_clock_ext htod;
1065 
1066 	preempt_disable();
1067 
1068 	get_tod_clock_ext((char *)&htod);
1069 
1070 	gtod->tod = htod.tod + kvm->arch.epoch;
1071 	gtod->epoch_idx = 0;
1072 	if (test_kvm_facility(kvm, 139)) {
1073 		gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx;
1074 		if (gtod->tod < htod.tod)
1075 			gtod->epoch_idx += 1;
1076 	}
1077 
1078 	preempt_enable();
1079 }
1080 
1081 static int kvm_s390_get_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
1082 {
1083 	struct kvm_s390_vm_tod_clock gtod;
1084 
1085 	memset(&gtod, 0, sizeof(gtod));
1086 	kvm_s390_get_tod_clock(kvm, &gtod);
1087 	if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
1088 		return -EFAULT;
1089 
1090 	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x, TOD base: 0x%llx",
1091 		gtod.epoch_idx, gtod.tod);
1092 	return 0;
1093 }
1094 
1095 static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
1096 {
1097 	u8 gtod_high = 0;
1098 
1099 	if (copy_to_user((void __user *)attr->addr, &gtod_high,
1100 					 sizeof(gtod_high)))
1101 		return -EFAULT;
1102 	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high);
1103 
1104 	return 0;
1105 }
1106 
1107 static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
1108 {
1109 	u64 gtod;
1110 
1111 	gtod = kvm_s390_get_tod_clock_fast(kvm);
1112 	if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
1113 		return -EFAULT;
1114 	VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod);
1115 
1116 	return 0;
1117 }
1118 
1119 static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr)
1120 {
1121 	int ret;
1122 
1123 	if (attr->flags)
1124 		return -EINVAL;
1125 
1126 	switch (attr->attr) {
1127 	case KVM_S390_VM_TOD_EXT:
1128 		ret = kvm_s390_get_tod_ext(kvm, attr);
1129 		break;
1130 	case KVM_S390_VM_TOD_HIGH:
1131 		ret = kvm_s390_get_tod_high(kvm, attr);
1132 		break;
1133 	case KVM_S390_VM_TOD_LOW:
1134 		ret = kvm_s390_get_tod_low(kvm, attr);
1135 		break;
1136 	default:
1137 		ret = -ENXIO;
1138 		break;
1139 	}
1140 	return ret;
1141 }
1142 
1143 static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
1144 {
1145 	struct kvm_s390_vm_cpu_processor *proc;
1146 	u16 lowest_ibc, unblocked_ibc;
1147 	int ret = 0;
1148 
1149 	mutex_lock(&kvm->lock);
1150 	if (kvm->created_vcpus) {
1151 		ret = -EBUSY;
1152 		goto out;
1153 	}
1154 	proc = kzalloc(sizeof(*proc), GFP_KERNEL);
1155 	if (!proc) {
1156 		ret = -ENOMEM;
1157 		goto out;
1158 	}
1159 	if (!copy_from_user(proc, (void __user *)attr->addr,
1160 			    sizeof(*proc))) {
1161 		kvm->arch.model.cpuid = proc->cpuid;
1162 		lowest_ibc = sclp.ibc >> 16 & 0xfff;
1163 		unblocked_ibc = sclp.ibc & 0xfff;
1164 		if (lowest_ibc && proc->ibc) {
1165 			if (proc->ibc > unblocked_ibc)
1166 				kvm->arch.model.ibc = unblocked_ibc;
1167 			else if (proc->ibc < lowest_ibc)
1168 				kvm->arch.model.ibc = lowest_ibc;
1169 			else
1170 				kvm->arch.model.ibc = proc->ibc;
1171 		}
1172 		memcpy(kvm->arch.model.fac_list, proc->fac_list,
1173 		       S390_ARCH_FAC_LIST_SIZE_BYTE);
1174 		VM_EVENT(kvm, 3, "SET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx",
1175 			 kvm->arch.model.ibc,
1176 			 kvm->arch.model.cpuid);
1177 		VM_EVENT(kvm, 3, "SET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx",
1178 			 kvm->arch.model.fac_list[0],
1179 			 kvm->arch.model.fac_list[1],
1180 			 kvm->arch.model.fac_list[2]);
1181 	} else
1182 		ret = -EFAULT;
1183 	kfree(proc);
1184 out:
1185 	mutex_unlock(&kvm->lock);
1186 	return ret;
1187 }
1188 
1189 static int kvm_s390_set_processor_feat(struct kvm *kvm,
1190 				       struct kvm_device_attr *attr)
1191 {
1192 	struct kvm_s390_vm_cpu_feat data;
1193 
1194 	if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
1195 		return -EFAULT;
1196 	if (!bitmap_subset((unsigned long *) data.feat,
1197 			   kvm_s390_available_cpu_feat,
1198 			   KVM_S390_VM_CPU_FEAT_NR_BITS))
1199 		return -EINVAL;
1200 
1201 	mutex_lock(&kvm->lock);
1202 	if (kvm->created_vcpus) {
1203 		mutex_unlock(&kvm->lock);
1204 		return -EBUSY;
1205 	}
1206 	bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
1207 		    KVM_S390_VM_CPU_FEAT_NR_BITS);
1208 	mutex_unlock(&kvm->lock);
1209 	VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
1210 			 data.feat[0],
1211 			 data.feat[1],
1212 			 data.feat[2]);
1213 	return 0;
1214 }
1215 
1216 static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
1217 					  struct kvm_device_attr *attr)
1218 {
1219 	/*
1220 	 * Once supported by kernel + hw, we have to store the subfunctions
1221 	 * in kvm->arch and remember that user space configured them.
1222 	 */
1223 	return -ENXIO;
1224 }
1225 
1226 static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
1227 {
1228 	int ret = -ENXIO;
1229 
1230 	switch (attr->attr) {
1231 	case KVM_S390_VM_CPU_PROCESSOR:
1232 		ret = kvm_s390_set_processor(kvm, attr);
1233 		break;
1234 	case KVM_S390_VM_CPU_PROCESSOR_FEAT:
1235 		ret = kvm_s390_set_processor_feat(kvm, attr);
1236 		break;
1237 	case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
1238 		ret = kvm_s390_set_processor_subfunc(kvm, attr);
1239 		break;
1240 	}
1241 	return ret;
1242 }
1243 
1244 static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
1245 {
1246 	struct kvm_s390_vm_cpu_processor *proc;
1247 	int ret = 0;
1248 
1249 	proc = kzalloc(sizeof(*proc), GFP_KERNEL);
1250 	if (!proc) {
1251 		ret = -ENOMEM;
1252 		goto out;
1253 	}
1254 	proc->cpuid = kvm->arch.model.cpuid;
1255 	proc->ibc = kvm->arch.model.ibc;
1256 	memcpy(&proc->fac_list, kvm->arch.model.fac_list,
1257 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
1258 	VM_EVENT(kvm, 3, "GET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx",
1259 		 kvm->arch.model.ibc,
1260 		 kvm->arch.model.cpuid);
1261 	VM_EVENT(kvm, 3, "GET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx",
1262 		 kvm->arch.model.fac_list[0],
1263 		 kvm->arch.model.fac_list[1],
1264 		 kvm->arch.model.fac_list[2]);
1265 	if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
1266 		ret = -EFAULT;
1267 	kfree(proc);
1268 out:
1269 	return ret;
1270 }
1271 
1272 static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
1273 {
1274 	struct kvm_s390_vm_cpu_machine *mach;
1275 	int ret = 0;
1276 
1277 	mach = kzalloc(sizeof(*mach), GFP_KERNEL);
1278 	if (!mach) {
1279 		ret = -ENOMEM;
1280 		goto out;
1281 	}
1282 	get_cpu_id((struct cpuid *) &mach->cpuid);
1283 	mach->ibc = sclp.ibc;
1284 	memcpy(&mach->fac_mask, kvm->arch.model.fac_mask,
1285 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
1286 	memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
1287 	       sizeof(S390_lowcore.stfle_fac_list));
1288 	VM_EVENT(kvm, 3, "GET: host ibc:  0x%4.4x, host cpuid:  0x%16.16llx",
1289 		 kvm->arch.model.ibc,
1290 		 kvm->arch.model.cpuid);
1291 	VM_EVENT(kvm, 3, "GET: host facmask:  0x%16.16llx.%16.16llx.%16.16llx",
1292 		 mach->fac_mask[0],
1293 		 mach->fac_mask[1],
1294 		 mach->fac_mask[2]);
1295 	VM_EVENT(kvm, 3, "GET: host faclist:  0x%16.16llx.%16.16llx.%16.16llx",
1296 		 mach->fac_list[0],
1297 		 mach->fac_list[1],
1298 		 mach->fac_list[2]);
1299 	if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach)))
1300 		ret = -EFAULT;
1301 	kfree(mach);
1302 out:
1303 	return ret;
1304 }
1305 
1306 static int kvm_s390_get_processor_feat(struct kvm *kvm,
1307 				       struct kvm_device_attr *attr)
1308 {
1309 	struct kvm_s390_vm_cpu_feat data;
1310 
1311 	bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
1312 		    KVM_S390_VM_CPU_FEAT_NR_BITS);
1313 	if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
1314 		return -EFAULT;
1315 	VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
1316 			 data.feat[0],
1317 			 data.feat[1],
1318 			 data.feat[2]);
1319 	return 0;
1320 }
1321 
1322 static int kvm_s390_get_machine_feat(struct kvm *kvm,
1323 				     struct kvm_device_attr *attr)
1324 {
1325 	struct kvm_s390_vm_cpu_feat data;
1326 
1327 	bitmap_copy((unsigned long *) data.feat,
1328 		    kvm_s390_available_cpu_feat,
1329 		    KVM_S390_VM_CPU_FEAT_NR_BITS);
1330 	if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
1331 		return -EFAULT;
1332 	VM_EVENT(kvm, 3, "GET: host feat:  0x%16.16llx.0x%16.16llx.0x%16.16llx",
1333 			 data.feat[0],
1334 			 data.feat[1],
1335 			 data.feat[2]);
1336 	return 0;
1337 }
1338 
1339 static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
1340 					  struct kvm_device_attr *attr)
1341 {
1342 	/*
1343 	 * Once we can actually configure subfunctions (kernel + hw support),
1344 	 * we have to check if they were already set by user space and, if so,
1345 	 * copy them from kvm->arch.
1346 	 */
1347 	return -ENXIO;
1348 }
1349 
1350 static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
1351 					struct kvm_device_attr *attr)
1352 {
1353 	if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
1354 	    sizeof(struct kvm_s390_vm_cpu_subfunc)))
1355 		return -EFAULT;
1356 	return 0;
1357 }
1358 static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
1359 {
1360 	int ret = -ENXIO;
1361 
1362 	switch (attr->attr) {
1363 	case KVM_S390_VM_CPU_PROCESSOR:
1364 		ret = kvm_s390_get_processor(kvm, attr);
1365 		break;
1366 	case KVM_S390_VM_CPU_MACHINE:
1367 		ret = kvm_s390_get_machine(kvm, attr);
1368 		break;
1369 	case KVM_S390_VM_CPU_PROCESSOR_FEAT:
1370 		ret = kvm_s390_get_processor_feat(kvm, attr);
1371 		break;
1372 	case KVM_S390_VM_CPU_MACHINE_FEAT:
1373 		ret = kvm_s390_get_machine_feat(kvm, attr);
1374 		break;
1375 	case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
1376 		ret = kvm_s390_get_processor_subfunc(kvm, attr);
1377 		break;
1378 	case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
1379 		ret = kvm_s390_get_machine_subfunc(kvm, attr);
1380 		break;
1381 	}
1382 	return ret;
1383 }
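/*
 * Illustrative only: querying the machine-provided CPU model (cpuid, ibc
 * and facilities) from userspace (assumed "vm_fd", no error handling):
 *
 *	struct kvm_s390_vm_cpu_machine mach;
 *	struct kvm_device_attr attr = {
 *		.group = KVM_S390_VM_CPU_MODEL,
 *		.attr  = KVM_S390_VM_CPU_MACHINE,
 *		.addr  = (unsigned long)&mach,
 *	};
 *
 *	ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
 */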
1384 
1385 static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1386 {
1387 	int ret;
1388 
1389 	switch (attr->group) {
1390 	case KVM_S390_VM_MEM_CTRL:
1391 		ret = kvm_s390_set_mem_control(kvm, attr);
1392 		break;
1393 	case KVM_S390_VM_TOD:
1394 		ret = kvm_s390_set_tod(kvm, attr);
1395 		break;
1396 	case KVM_S390_VM_CPU_MODEL:
1397 		ret = kvm_s390_set_cpu_model(kvm, attr);
1398 		break;
1399 	case KVM_S390_VM_CRYPTO:
1400 		ret = kvm_s390_vm_set_crypto(kvm, attr);
1401 		break;
1402 	case KVM_S390_VM_MIGRATION:
1403 		ret = kvm_s390_vm_set_migration(kvm, attr);
1404 		break;
1405 	default:
1406 		ret = -ENXIO;
1407 		break;
1408 	}
1409 
1410 	return ret;
1411 }
1412 
1413 static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1414 {
1415 	int ret;
1416 
1417 	switch (attr->group) {
1418 	case KVM_S390_VM_MEM_CTRL:
1419 		ret = kvm_s390_get_mem_control(kvm, attr);
1420 		break;
1421 	case KVM_S390_VM_TOD:
1422 		ret = kvm_s390_get_tod(kvm, attr);
1423 		break;
1424 	case KVM_S390_VM_CPU_MODEL:
1425 		ret = kvm_s390_get_cpu_model(kvm, attr);
1426 		break;
1427 	case KVM_S390_VM_MIGRATION:
1428 		ret = kvm_s390_vm_get_migration(kvm, attr);
1429 		break;
1430 	default:
1431 		ret = -ENXIO;
1432 		break;
1433 	}
1434 
1435 	return ret;
1436 }
1437 
1438 static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1439 {
1440 	int ret;
1441 
1442 	switch (attr->group) {
1443 	case KVM_S390_VM_MEM_CTRL:
1444 		switch (attr->attr) {
1445 		case KVM_S390_VM_MEM_ENABLE_CMMA:
1446 		case KVM_S390_VM_MEM_CLR_CMMA:
1447 			ret = sclp.has_cmma ? 0 : -ENXIO;
1448 			break;
1449 		case KVM_S390_VM_MEM_LIMIT_SIZE:
1450 			ret = 0;
1451 			break;
1452 		default:
1453 			ret = -ENXIO;
1454 			break;
1455 		}
1456 		break;
1457 	case KVM_S390_VM_TOD:
1458 		switch (attr->attr) {
1459 		case KVM_S390_VM_TOD_LOW:
1460 		case KVM_S390_VM_TOD_HIGH:
1461 			ret = 0;
1462 			break;
1463 		default:
1464 			ret = -ENXIO;
1465 			break;
1466 		}
1467 		break;
1468 	case KVM_S390_VM_CPU_MODEL:
1469 		switch (attr->attr) {
1470 		case KVM_S390_VM_CPU_PROCESSOR:
1471 		case KVM_S390_VM_CPU_MACHINE:
1472 		case KVM_S390_VM_CPU_PROCESSOR_FEAT:
1473 		case KVM_S390_VM_CPU_MACHINE_FEAT:
1474 		case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
1475 			ret = 0;
1476 			break;
1477 		/* configuring subfunctions is not supported yet */
1478 		case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
1479 		default:
1480 			ret = -ENXIO;
1481 			break;
1482 		}
1483 		break;
1484 	case KVM_S390_VM_CRYPTO:
1485 		switch (attr->attr) {
1486 		case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
1487 		case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
1488 		case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
1489 		case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
1490 			ret = 0;
1491 			break;
1492 		default:
1493 			ret = -ENXIO;
1494 			break;
1495 		}
1496 		break;
1497 	case KVM_S390_VM_MIGRATION:
1498 		ret = 0;
1499 		break;
1500 	default:
1501 		ret = -ENXIO;
1502 		break;
1503 	}
1504 
1505 	return ret;
1506 }
1507 
1508 static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
1509 {
1510 	uint8_t *keys;
1511 	uint64_t hva;
1512 	int srcu_idx, i, r = 0;
1513 
1514 	if (args->flags != 0)
1515 		return -EINVAL;
1516 
1517 	/* Is this guest using storage keys? */
1518 	if (!mm_uses_skeys(current->mm))
1519 		return KVM_S390_GET_SKEYS_NONE;
1520 
1521 	/* Enforce sane limit on memory allocation */
1522 	if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
1523 		return -EINVAL;
1524 
1525 	keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL);
1526 	if (!keys)
1527 		return -ENOMEM;
1528 
1529 	down_read(&current->mm->mmap_sem);
1530 	srcu_idx = srcu_read_lock(&kvm->srcu);
1531 	for (i = 0; i < args->count; i++) {
1532 		hva = gfn_to_hva(kvm, args->start_gfn + i);
1533 		if (kvm_is_error_hva(hva)) {
1534 			r = -EFAULT;
1535 			break;
1536 		}
1537 
1538 		r = get_guest_storage_key(current->mm, hva, &keys[i]);
1539 		if (r)
1540 			break;
1541 	}
1542 	srcu_read_unlock(&kvm->srcu, srcu_idx);
1543 	up_read(&current->mm->mmap_sem);
1544 
1545 	if (!r) {
1546 		r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
1547 				 sizeof(uint8_t) * args->count);
1548 		if (r)
1549 			r = -EFAULT;
1550 	}
1551 
1552 	kvfree(keys);
1553 	return r;
1554 }
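/*
 * Illustrative only: reading the storage keys of the first 128 guest
 * frames (assumed "vm_fd", no error handling).  A return value of
 * KVM_S390_GET_SKEYS_NONE means the guest does not use storage keys:
 *
 *	__u8 keys[128];
 *	struct kvm_s390_skeys skeys = {
 *		.start_gfn = 0,
 *		.count = sizeof(keys),
 *		.skeydata_addr = (unsigned long)keys,
 *	};
 *
 *	ioctl(vm_fd, KVM_S390_GET_SKEYS, &skeys);
 */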
1555 
1556 static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
1557 {
1558 	uint8_t *keys;
1559 	uint64_t hva;
1560 	int srcu_idx, i, r = 0;
1561 	bool unlocked;
1562 
1563 	if (args->flags != 0)
1564 		return -EINVAL;
1565 
1566 	/* Enforce sane limit on memory allocation */
1567 	if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
1568 		return -EINVAL;
1569 
1570 	keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL);
1571 	if (!keys)
1572 		return -ENOMEM;
1573 
1574 	r = copy_from_user(keys, (uint8_t __user *)args->skeydata_addr,
1575 			   sizeof(uint8_t) * args->count);
1576 	if (r) {
1577 		r = -EFAULT;
1578 		goto out;
1579 	}
1580 
1581 	/* Enable storage key handling for the guest */
1582 	r = s390_enable_skey();
1583 	if (r)
1584 		goto out;
1585 
1586 	i = 0;
1587 	down_read(&current->mm->mmap_sem);
1588 	srcu_idx = srcu_read_lock(&kvm->srcu);
1589 	while (i < args->count) {
1590 		unlocked = false;
1591 		hva = gfn_to_hva(kvm, args->start_gfn + i);
1592 		if (kvm_is_error_hva(hva)) {
1593 			r = -EFAULT;
1594 			break;
1595 		}
1596 
1597 		/* Lowest order bit is reserved */
1598 		if (keys[i] & 0x01) {
1599 			r = -EINVAL;
1600 			break;
1601 		}
1602 
1603 		r = set_guest_storage_key(current->mm, hva, keys[i], 0);
1604 		if (r) {
1605 			r = fixup_user_fault(current, current->mm, hva,
1606 					     FAULT_FLAG_WRITE, &unlocked);
1607 			if (r)
1608 				break;
1609 		}
1610 		if (!r)
1611 			i++;
1612 	}
1613 	srcu_read_unlock(&kvm->srcu, srcu_idx);
1614 	up_read(&current->mm->mmap_sem);
1615 out:
1616 	kvfree(keys);
1617 	return r;
1618 }
1619 
1620 /*
1621  * Base address and length must be sent at the start of each block; therefore
1622  * it's cheaper to send some clean data, as long as it's less than the size of
1623  * two longs.
1624  */
1625 #define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
1626 /* for consistency */
1627 #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
1628 
1629 /*
1630  * Similar to gfn_to_memslot, but returns the index of a memslot also when the
1631  * address falls in a hole. In that case the index of one of the memslots
1632  * bordering the hole is returned.
1633  */
1634 static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
1635 {
1636 	int start = 0, end = slots->used_slots;
1637 	int slot = atomic_read(&slots->lru_slot);
1638 	struct kvm_memory_slot *memslots = slots->memslots;
1639 
1640 	if (gfn >= memslots[slot].base_gfn &&
1641 	    gfn < memslots[slot].base_gfn + memslots[slot].npages)
1642 		return slot;
1643 
1644 	while (start < end) {
1645 		slot = start + (end - start) / 2;
1646 
1647 		if (gfn >= memslots[slot].base_gfn)
1648 			end = slot;
1649 		else
1650 			start = slot + 1;
1651 	}
1652 
1653 	if (gfn >= memslots[start].base_gfn &&
1654 	    gfn < memslots[start].base_gfn + memslots[start].npages) {
1655 		atomic_set(&slots->lru_slot, start);
1656 	}
1657 
1658 	return start;
1659 }
1660 
1661 static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
1662 			      u8 *res, unsigned long bufsize)
1663 {
1664 	unsigned long pgstev, hva, cur_gfn = args->start_gfn;
1665 
1666 	args->count = 0;
1667 	while (args->count < bufsize) {
1668 		hva = gfn_to_hva(kvm, cur_gfn);
1669 		/*
1670 		 * We return an error if the first value was invalid, but we
1671 		 * return successfully if at least one value was copied.
1672 		 */
1673 		if (kvm_is_error_hva(hva))
1674 			return args->count ? 0 : -EFAULT;
1675 		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
1676 			pgstev = 0;
1677 		res[args->count++] = (pgstev >> 24) & 0x43;
1678 		cur_gfn++;
1679 	}
1680 
1681 	return 0;
1682 }
1683 
1684 static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
1685 					      unsigned long cur_gfn)
1686 {
1687 	int slotidx = gfn_to_memslot_approx(slots, cur_gfn);
1688 	struct kvm_memory_slot *ms = slots->memslots + slotidx;
1689 	unsigned long ofs = cur_gfn - ms->base_gfn;
1690 
1691 	if (ms->base_gfn + ms->npages <= cur_gfn) {
1692 		slotidx--;
1693 		/* If we are above the highest slot, wrap around */
1694 		if (slotidx < 0)
1695 			slotidx = slots->used_slots - 1;
1696 
1697 		ms = slots->memslots + slotidx;
1698 		ofs = 0;
1699 	}
1700 	ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
1701 	while ((slotidx > 0) && (ofs >= ms->npages)) {
1702 		slotidx--;
1703 		ms = slots->memslots + slotidx;
1704 		ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0);
1705 	}
1706 	return ms->base_gfn + ofs;
1707 }
1708 
1709 static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
1710 			     u8 *res, unsigned long bufsize)
1711 {
1712 	unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
1713 	struct kvm_memslots *slots = kvm_memslots(kvm);
1714 	struct kvm_memory_slot *ms;
1715 
1716 	cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
1717 	ms = gfn_to_memslot(kvm, cur_gfn);
1718 	args->count = 0;
1719 	args->start_gfn = cur_gfn;
1720 	if (!ms)
1721 		return 0;
1722 	next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
1723 	mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages;
1724 
1725 	while (args->count < bufsize) {
1726 		hva = gfn_to_hva(kvm, cur_gfn);
1727 		if (kvm_is_error_hva(hva))
1728 			return 0;
1729 		/* Decrement only if we actually flipped the bit to 0 */
1730 		if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
1731 			atomic64_dec(&kvm->arch.cmma_dirty_pages);
1732 		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
1733 			pgstev = 0;
1734 		/* Save the value */
1735 		res[args->count++] = (pgstev >> 24) & 0x43;
1736 		/* If the next bit is too far away, stop. */
1737 		if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
1738 			return 0;
1739 		/* If we reached the previous "next", find the next one */
1740 		if (cur_gfn == next_gfn)
1741 			next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
1742 		/* Reached the end of memory or of the buffer, stop */
1743 		if ((next_gfn >= mem_end) ||
1744 		    (next_gfn - args->start_gfn >= bufsize))
1745 			return 0;
1746 		cur_gfn++;
1747 		/* Reached the end of the current memslot, take the next one. */
1748 		if (cur_gfn - ms->base_gfn >= ms->npages) {
1749 			ms = gfn_to_memslot(kvm, cur_gfn);
1750 			if (!ms)
1751 				return 0;
1752 		}
1753 	}
1754 	return 0;
1755 }
1756 
1757 /*
1758  * This function searches for the next page with dirty CMMA attributes, and
1759  * saves the attributes in the buffer up to either the end of the buffer or
1760  * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found;
1761  * no trailing clean bytes are saved.
1762  * If no dirty bits were found, or if CMMA was not enabled or used, the
1763  * output buffer length will be 0.
1764  */
1765 static int kvm_s390_get_cmma_bits(struct kvm *kvm,
1766 				  struct kvm_s390_cmma_log *args)
1767 {
1768 	unsigned long bufsize;
1769 	int srcu_idx, peek, ret;
1770 	u8 *values;
1771 
1772 	if (!kvm->arch.use_cmma)
1773 		return -ENXIO;
1774 	/* Invalid/unsupported flags were specified */
1775 	if (args->flags & ~KVM_S390_CMMA_PEEK)
1776 		return -EINVAL;
1777 	/* Migration mode query, and we are not doing a migration */
1778 	peek = !!(args->flags & KVM_S390_CMMA_PEEK);
1779 	if (!peek && !kvm->arch.migration_mode)
1780 		return -EINVAL;
1781 	/* CMMA is disabled or was not used, or the buffer has length zero */
1782 	bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
1783 	if (!bufsize || !kvm->mm->context.uses_cmm) {
1784 		memset(args, 0, sizeof(*args));
1785 		return 0;
1786 	}
1787 	/* We are not peeking, and there are no dirty pages */
1788 	if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) {
1789 		memset(args, 0, sizeof(*args));
1790 		return 0;
1791 	}
1792 
1793 	values = vmalloc(bufsize);
1794 	if (!values)
1795 		return -ENOMEM;
1796 
1797 	down_read(&kvm->mm->mmap_sem);
1798 	srcu_idx = srcu_read_lock(&kvm->srcu);
1799 	if (peek)
1800 		ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
1801 	else
1802 		ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
1803 	srcu_read_unlock(&kvm->srcu, srcu_idx);
1804 	up_read(&kvm->mm->mmap_sem);
1805 
1806 	if (kvm->arch.migration_mode)
1807 		args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages);
1808 	else
1809 		args->remaining = 0;
1810 
1811 	if (copy_to_user((void __user *)args->values, values, args->count))
1812 		ret = -EFAULT;
1813 
1814 	vfree(values);
1815 	return ret;
1816 }
1817 
1818 /*
1819  * This function sets the CMMA attributes for the given pages. If the input
1820  * buffer has zero length, no action is taken, otherwise the attributes are
1821  * set and the mm->context.uses_cmm flag is set.
1822  */
1823 static int kvm_s390_set_cmma_bits(struct kvm *kvm,
1824 				  const struct kvm_s390_cmma_log *args)
1825 {
1826 	unsigned long hva, mask, pgstev, i;
1827 	uint8_t *bits;
1828 	int srcu_idx, r = 0;
1829 
1830 	mask = args->mask;
1831 
1832 	if (!kvm->arch.use_cmma)
1833 		return -ENXIO;
1834 	/* invalid/unsupported flags */
1835 	if (args->flags != 0)
1836 		return -EINVAL;
1837 	/* Enforce sane limit on memory allocation */
1838 	if (args->count > KVM_S390_CMMA_SIZE_MAX)
1839 		return -EINVAL;
1840 	/* Nothing to do */
1841 	if (args->count == 0)
1842 		return 0;
1843 
1844 	bits = vmalloc(array_size(sizeof(*bits), args->count));
1845 	if (!bits)
1846 		return -ENOMEM;
1847 
1848 	r = copy_from_user(bits, (void __user *)args->values, args->count);
1849 	if (r) {
1850 		r = -EFAULT;
1851 		goto out;
1852 	}
1853 
1854 	down_read(&kvm->mm->mmap_sem);
1855 	srcu_idx = srcu_read_lock(&kvm->srcu);
1856 	for (i = 0; i < args->count; i++) {
1857 		hva = gfn_to_hva(kvm, args->start_gfn + i);
1858 		if (kvm_is_error_hva(hva)) {
1859 			r = -EFAULT;
1860 			break;
1861 		}
1862 
1863 		pgstev = bits[i];
1864 		pgstev = pgstev << 24;
1865 		mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT;
1866 		set_pgste_bits(kvm->mm, hva, mask, pgstev);
1867 	}
1868 	srcu_read_unlock(&kvm->srcu, srcu_idx);
1869 	up_read(&kvm->mm->mmap_sem);
1870 
1871 	if (!kvm->mm->context.uses_cmm) {
1872 		down_write(&kvm->mm->mmap_sem);
1873 		kvm->mm->context.uses_cmm = 1;
1874 		up_write(&kvm->mm->mmap_sem);
1875 	}
1876 out:
1877 	vfree(bits);
1878 	return r;
1879 }
1880 
1881 long kvm_arch_vm_ioctl(struct file *filp,
1882 		       unsigned int ioctl, unsigned long arg)
1883 {
1884 	struct kvm *kvm = filp->private_data;
1885 	void __user *argp = (void __user *)arg;
1886 	struct kvm_device_attr attr;
1887 	int r;
1888 
1889 	switch (ioctl) {
1890 	case KVM_S390_INTERRUPT: {
1891 		struct kvm_s390_interrupt s390int;
1892 
1893 		r = -EFAULT;
1894 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
1895 			break;
1896 		r = kvm_s390_inject_vm(kvm, &s390int);
1897 		break;
1898 	}
1899 	case KVM_ENABLE_CAP: {
1900 		struct kvm_enable_cap cap;
1901 		r = -EFAULT;
1902 		if (copy_from_user(&cap, argp, sizeof(cap)))
1903 			break;
1904 		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
1905 		break;
1906 	}
1907 	case KVM_CREATE_IRQCHIP: {
1908 		struct kvm_irq_routing_entry routing;
1909 
1910 		r = -EINVAL;
1911 		if (kvm->arch.use_irqchip) {
1912 			/* Set up dummy routing. */
1913 			memset(&routing, 0, sizeof(routing));
1914 			r = kvm_set_irq_routing(kvm, &routing, 0, 0);
1915 		}
1916 		break;
1917 	}
1918 	case KVM_SET_DEVICE_ATTR: {
1919 		r = -EFAULT;
1920 		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
1921 			break;
1922 		r = kvm_s390_vm_set_attr(kvm, &attr);
1923 		break;
1924 	}
1925 	case KVM_GET_DEVICE_ATTR: {
1926 		r = -EFAULT;
1927 		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
1928 			break;
1929 		r = kvm_s390_vm_get_attr(kvm, &attr);
1930 		break;
1931 	}
1932 	case KVM_HAS_DEVICE_ATTR: {
1933 		r = -EFAULT;
1934 		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
1935 			break;
1936 		r = kvm_s390_vm_has_attr(kvm, &attr);
1937 		break;
1938 	}
1939 	case KVM_S390_GET_SKEYS: {
1940 		struct kvm_s390_skeys args;
1941 
1942 		r = -EFAULT;
1943 		if (copy_from_user(&args, argp,
1944 				   sizeof(struct kvm_s390_skeys)))
1945 			break;
1946 		r = kvm_s390_get_skeys(kvm, &args);
1947 		break;
1948 	}
1949 	case KVM_S390_SET_SKEYS: {
1950 		struct kvm_s390_skeys args;
1951 
1952 		r = -EFAULT;
1953 		if (copy_from_user(&args, argp,
1954 				   sizeof(struct kvm_s390_skeys)))
1955 			break;
1956 		r = kvm_s390_set_skeys(kvm, &args);
1957 		break;
1958 	}
1959 	case KVM_S390_GET_CMMA_BITS: {
1960 		struct kvm_s390_cmma_log args;
1961 
1962 		r = -EFAULT;
1963 		if (copy_from_user(&args, argp, sizeof(args)))
1964 			break;
1965 		mutex_lock(&kvm->slots_lock);
1966 		r = kvm_s390_get_cmma_bits(kvm, &args);
1967 		mutex_unlock(&kvm->slots_lock);
1968 		if (!r) {
1969 			r = copy_to_user(argp, &args, sizeof(args));
1970 			if (r)
1971 				r = -EFAULT;
1972 		}
1973 		break;
1974 	}
1975 	case KVM_S390_SET_CMMA_BITS: {
1976 		struct kvm_s390_cmma_log args;
1977 
1978 		r = -EFAULT;
1979 		if (copy_from_user(&args, argp, sizeof(args)))
1980 			break;
1981 		mutex_lock(&kvm->slots_lock);
1982 		r = kvm_s390_set_cmma_bits(kvm, &args);
1983 		mutex_unlock(&kvm->slots_lock);
1984 		break;
1985 	}
1986 	default:
1987 		r = -ENOTTY;
1988 	}
1989 
1990 	return r;
1991 }
1992 
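/*
 * Query the AP configuration information (PQAP with the QCI function)
 * into the 128-byte buffer at @config. Returns the condition code.
 */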
1993 static int kvm_s390_query_ap_config(u8 *config)
1994 {
1995 	u32 fcn_code = 0x04000000UL;
1996 	u32 cc = 0;
1997 
1998 	memset(config, 0, 128);
1999 	asm volatile(
2000 		"lgr 0,%1\n"
2001 		"lgr 2,%2\n"
2002 		".long 0xb2af0000\n"		/* PQAP(QCI) */
2003 		"0: ipm %0\n"
2004 		"srl %0,28\n"
2005 		"1:\n"
2006 		EX_TABLE(0b, 1b)
2007 		: "+r" (cc)
2008 		: "r" (fcn_code), "r" (config)
2009 		: "cc", "0", "2", "memory"
2010 	);
2011 
2012 	return cc;
2013 }
2014 
2015 static int kvm_s390_apxa_installed(void)
2016 {
2017 	u8 config[128];
2018 	int cc;
2019 
2020 	if (test_facility(12)) {
2021 		cc = kvm_s390_query_ap_config(config);
2022 
2023 		if (cc)
2024 			pr_err("PQAP(QCI) failed with cc=%d", cc);
2025 		else
2026 			return config[0] & 0x40;
2027 	}
2028 
2029 	return 0;
2030 }
2031 
2032 static void kvm_s390_set_crycb_format(struct kvm *kvm)
2033 {
2034 	kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb;
2035 
2036 	if (kvm_s390_apxa_installed())
2037 		kvm->arch.crypto.crycbd |= CRYCB_FORMAT2;
2038 	else
2039 		kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
2040 }
2041 
2042 static u64 kvm_s390_get_initial_cpuid(void)
2043 {
2044 	struct cpuid cpuid;
2045 
2046 	get_cpu_id(&cpuid);
2047 	cpuid.version = 0xff;
2048 	return *((u64 *) &cpuid);
2049 }
2050 
2051 static void kvm_s390_crypto_init(struct kvm *kvm)
2052 {
2053 	if (!test_kvm_facility(kvm, 76))
2054 		return;
2055 
2056 	kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
2057 	kvm_s390_set_crycb_format(kvm);
2058 
2059 	/* Enable AES/DEA protected key functions by default */
2060 	kvm->arch.crypto.aes_kw = 1;
2061 	kvm->arch.crypto.dea_kw = 1;
2062 	get_random_bytes(kvm->arch.crypto.crycb->aes_wrapping_key_mask,
2063 			 sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
2064 	get_random_bytes(kvm->arch.crypto.crycb->dea_wrapping_key_mask,
2065 			 sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
2066 }
2067 
2068 static void sca_dispose(struct kvm *kvm)
2069 {
2070 	if (kvm->arch.use_esca)
2071 		free_pages_exact(kvm->arch.sca, sizeof(struct esca_block));
2072 	else
2073 		free_page((unsigned long)(kvm->arch.sca));
2074 	kvm->arch.sca = NULL;
2075 }
2076 
2077 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
2078 {
2079 	gfp_t alloc_flags = GFP_KERNEL;
2080 	int i, rc;
2081 	char debug_name[16];
2082 	static unsigned long sca_offset;
2083 
2084 	rc = -EINVAL;
2085 #ifdef CONFIG_KVM_S390_UCONTROL
2086 	if (type & ~KVM_VM_S390_UCONTROL)
2087 		goto out_err;
2088 	if ((type & KVM_VM_S390_UCONTROL) && (!capable(CAP_SYS_ADMIN)))
2089 		goto out_err;
2090 #else
2091 	if (type)
2092 		goto out_err;
2093 #endif
2094 
2095 	rc = s390_enable_sie();
2096 	if (rc)
2097 		goto out_err;
2098 
2099 	rc = -ENOMEM;
2100 
2101 	if (!sclp.has_64bscao)
2102 		alloc_flags |= GFP_DMA;
2103 	rwlock_init(&kvm->arch.sca_lock);
2104 	/* start with basic SCA */
2105 	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
2106 	if (!kvm->arch.sca)
2107 		goto out_err;
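	/*
	 * Stagger each new basic SCA within its page so that the SCAs of
	 * different VMs do not all start on the same cache lines.
	 */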
2108 	spin_lock(&kvm_lock);
2109 	sca_offset += 16;
2110 	if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
2111 		sca_offset = 0;
2112 	kvm->arch.sca = (struct bsca_block *)
2113 			((char *) kvm->arch.sca + sca_offset);
2114 	spin_unlock(&kvm_lock);
2115 
2116 	sprintf(debug_name, "kvm-%u", current->pid);
2117 
2118 	kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long));
2119 	if (!kvm->arch.dbf)
2120 		goto out_err;
2121 
2122 	BUILD_BUG_ON(sizeof(struct sie_page2) != 4096);
2123 	kvm->arch.sie_page2 =
2124 	     (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
2125 	if (!kvm->arch.sie_page2)
2126 		goto out_err;
2127 
2128 	kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
2129 
2130 	for (i = 0; i < kvm_s390_fac_size(); i++) {
2131 		kvm->arch.model.fac_mask[i] = S390_lowcore.stfle_fac_list[i] &
2132 					      (kvm_s390_fac_base[i] |
2133 					       kvm_s390_fac_ext[i]);
2134 		kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] &
2135 					      kvm_s390_fac_base[i];
2136 	}
2137 
2138 	/* we are always in czam mode - even on pre z14 machines */
2139 	set_kvm_facility(kvm->arch.model.fac_mask, 138);
2140 	set_kvm_facility(kvm->arch.model.fac_list, 138);
2141 	/* we emulate STHYI in kvm */
2142 	set_kvm_facility(kvm->arch.model.fac_mask, 74);
2143 	set_kvm_facility(kvm->arch.model.fac_list, 74);
2144 	if (MACHINE_HAS_TLB_GUEST) {
2145 		set_kvm_facility(kvm->arch.model.fac_mask, 147);
2146 		set_kvm_facility(kvm->arch.model.fac_list, 147);
2147 	}
2148 
2149 	kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
2150 	kvm->arch.model.ibc = sclp.ibc & 0x0fff;
2151 
2152 	kvm_s390_crypto_init(kvm);
2153 
2154 	mutex_init(&kvm->arch.float_int.ais_lock);
2155 	spin_lock_init(&kvm->arch.float_int.lock);
2156 	for (i = 0; i < FIRQ_LIST_COUNT; i++)
2157 		INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]);
2158 	init_waitqueue_head(&kvm->arch.ipte_wq);
2159 	mutex_init(&kvm->arch.ipte_mutex);
2160 
2161 	debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
2162 	VM_EVENT(kvm, 3, "vm created with type %lu", type);
2163 
2164 	if (type & KVM_VM_S390_UCONTROL) {
2165 		kvm->arch.gmap = NULL;
2166 		kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT;
2167 	} else {
2168 		if (sclp.hamax == U64_MAX)
2169 			kvm->arch.mem_limit = TASK_SIZE_MAX;
2170 		else
2171 			kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX,
2172 						    sclp.hamax + 1);
2173 		kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
2174 		if (!kvm->arch.gmap)
2175 			goto out_err;
2176 		kvm->arch.gmap->private = kvm;
2177 		kvm->arch.gmap->pfault_enabled = 0;
2178 	}
2179 
2180 	kvm->arch.use_pfmfi = sclp.has_pfmfi;
2181 	kvm->arch.use_skf = sclp.has_skey;
2182 	spin_lock_init(&kvm->arch.start_stop_lock);
2183 	kvm_s390_vsie_init(kvm);
2184 	kvm_s390_gisa_init(kvm);
2185 	KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
2186 
2187 	return 0;
2188 out_err:
2189 	free_page((unsigned long)kvm->arch.sie_page2);
2190 	debug_unregister(kvm->arch.dbf);
2191 	sca_dispose(kvm);
2192 	KVM_EVENT(3, "creation of vm failed: %d", rc);
2193 	return rc;
2194 }
2195 
2196 bool kvm_arch_has_vcpu_debugfs(void)
2197 {
2198 	return false;
2199 }
2200 
2201 int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
2202 {
2203 	return 0;
2204 }
2205 
2206 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
2207 {
2208 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
2209 	trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
2210 	kvm_s390_clear_local_irqs(vcpu);
2211 	kvm_clear_async_pf_completion_queue(vcpu);
2212 	if (!kvm_is_ucontrol(vcpu->kvm))
2213 		sca_del_vcpu(vcpu);
2214 
2215 	if (kvm_is_ucontrol(vcpu->kvm))
2216 		gmap_remove(vcpu->arch.gmap);
2217 
2218 	if (vcpu->kvm->arch.use_cmma)
2219 		kvm_s390_vcpu_unsetup_cmma(vcpu);
2220 	free_page((unsigned long)(vcpu->arch.sie_block));
2221 
2222 	kvm_vcpu_uninit(vcpu);
2223 	kmem_cache_free(kvm_vcpu_cache, vcpu);
2224 }
2225 
2226 static void kvm_free_vcpus(struct kvm *kvm)
2227 {
2228 	unsigned int i;
2229 	struct kvm_vcpu *vcpu;
2230 
2231 	kvm_for_each_vcpu(i, vcpu, kvm)
2232 		kvm_arch_vcpu_destroy(vcpu);
2233 
2234 	mutex_lock(&kvm->lock);
2235 	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
2236 		kvm->vcpus[i] = NULL;
2237 
2238 	atomic_set(&kvm->online_vcpus, 0);
2239 	mutex_unlock(&kvm->lock);
2240 }
2241 
2242 void kvm_arch_destroy_vm(struct kvm *kvm)
2243 {
2244 	kvm_free_vcpus(kvm);
2245 	sca_dispose(kvm);
2246 	debug_unregister(kvm->arch.dbf);
2247 	kvm_s390_gisa_destroy(kvm);
2248 	free_page((unsigned long)kvm->arch.sie_page2);
2249 	if (!kvm_is_ucontrol(kvm))
2250 		gmap_remove(kvm->arch.gmap);
2251 	kvm_s390_destroy_adapters(kvm);
2252 	kvm_s390_clear_float_irqs(kvm);
2253 	kvm_s390_vsie_destroy(kvm);
2254 	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
2255 }
2256 
2257 /* Section: vcpu related */
2258 static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
2259 {
2260 	vcpu->arch.gmap = gmap_create(current->mm, -1UL);
2261 	if (!vcpu->arch.gmap)
2262 		return -ENOMEM;
2263 	vcpu->arch.gmap->private = vcpu->kvm;
2264 
2265 	return 0;
2266 }
2267 
2268 static void sca_del_vcpu(struct kvm_vcpu *vcpu)
2269 {
2270 	if (!kvm_s390_use_sca_entries())
2271 		return;
2272 	read_lock(&vcpu->kvm->arch.sca_lock);
2273 	if (vcpu->kvm->arch.use_esca) {
2274 		struct esca_block *sca = vcpu->kvm->arch.sca;
2275 
2276 		clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
2277 		sca->cpu[vcpu->vcpu_id].sda = 0;
2278 	} else {
2279 		struct bsca_block *sca = vcpu->kvm->arch.sca;
2280 
2281 		clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
2282 		sca->cpu[vcpu->vcpu_id].sda = 0;
2283 	}
2284 	read_unlock(&vcpu->kvm->arch.sca_lock);
2285 }
2286 
2287 static void sca_add_vcpu(struct kvm_vcpu *vcpu)
2288 {
2289 	if (!kvm_s390_use_sca_entries()) {
2290 		struct bsca_block *sca = vcpu->kvm->arch.sca;
2291 
2292 		/* we still need the basic sca for the ipte control */
2293 		vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
2294 		vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
2295 		return;
2296 	}
2297 	read_lock(&vcpu->kvm->arch.sca_lock);
2298 	if (vcpu->kvm->arch.use_esca) {
2299 		struct esca_block *sca = vcpu->kvm->arch.sca;
2300 
2301 		sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
2302 		vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
2303 		vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU;
2304 		vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
2305 		set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
2306 	} else {
2307 		struct bsca_block *sca = vcpu->kvm->arch.sca;
2308 
2309 		sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
2310 		vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
2311 		vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
2312 		set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
2313 	}
2314 	read_unlock(&vcpu->kvm->arch.sca_lock);
2315 }
2316 
2317 /* Basic SCA to Extended SCA data copy routines */
2318 static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s)
2319 {
2320 	d->sda = s->sda;
2321 	d->sigp_ctrl.c = s->sigp_ctrl.c;
2322 	d->sigp_ctrl.scn = s->sigp_ctrl.scn;
2323 }
2324 
2325 static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s)
2326 {
2327 	int i;
2328 
2329 	d->ipte_control = s->ipte_control;
2330 	d->mcn[0] = s->mcn;
2331 	for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++)
2332 		sca_copy_entry(&d->cpu[i], &s->cpu[i]);
2333 }
2334 
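/*
 * Replace the basic SCA with an extended SCA: block all VCPUs, copy the
 * entries over and point every SIE block at the new SCA.
 */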
2335 static int sca_switch_to_extended(struct kvm *kvm)
2336 {
2337 	struct bsca_block *old_sca = kvm->arch.sca;
2338 	struct esca_block *new_sca;
2339 	struct kvm_vcpu *vcpu;
2340 	unsigned int vcpu_idx;
2341 	u32 scaol, scaoh;
2342 
2343 	new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO);
2344 	if (!new_sca)
2345 		return -ENOMEM;
2346 
2347 	scaoh = (u32)((u64)(new_sca) >> 32);
2348 	scaol = (u32)(u64)(new_sca) & ~0x3fU;
2349 
2350 	kvm_s390_vcpu_block_all(kvm);
2351 	write_lock(&kvm->arch.sca_lock);
2352 
2353 	sca_copy_b_to_e(new_sca, old_sca);
2354 
2355 	kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) {
2356 		vcpu->arch.sie_block->scaoh = scaoh;
2357 		vcpu->arch.sie_block->scaol = scaol;
2358 		vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
2359 	}
2360 	kvm->arch.sca = new_sca;
2361 	kvm->arch.use_esca = 1;
2362 
2363 	write_unlock(&kvm->arch.sca_lock);
2364 	kvm_s390_vcpu_unblock_all(kvm);
2365 
2366 	free_page((unsigned long)old_sca);
2367 
2368 	VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)",
2369 		 old_sca, kvm->arch.sca);
2370 	return 0;
2371 }
2372 
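/*
 * Check whether a VCPU id fits into the SCA; switch to the extended SCA
 * if needed and possible.
 */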
2373 static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
2374 {
2375 	int rc;
2376 
2377 	if (!kvm_s390_use_sca_entries()) {
2378 		if (id < KVM_MAX_VCPUS)
2379 			return true;
2380 		return false;
2381 	}
2382 	if (id < KVM_S390_BSCA_CPU_SLOTS)
2383 		return true;
2384 	if (!sclp.has_esca || !sclp.has_64bscao)
2385 		return false;
2386 
2387 	mutex_lock(&kvm->lock);
2388 	rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
2389 	mutex_unlock(&kvm->lock);
2390 
2391 	return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
2392 }
2393 
2394 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
2395 {
2396 	vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
2397 	kvm_clear_async_pf_completion_queue(vcpu);
2398 	vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX |
2399 				    KVM_SYNC_GPRS |
2400 				    KVM_SYNC_ACRS |
2401 				    KVM_SYNC_CRS |
2402 				    KVM_SYNC_ARCH0 |
2403 				    KVM_SYNC_PFAULT;
2404 	kvm_s390_set_prefix(vcpu, 0);
2405 	if (test_kvm_facility(vcpu->kvm, 64))
2406 		vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
2407 	if (test_kvm_facility(vcpu->kvm, 82))
2408 		vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC;
2409 	if (test_kvm_facility(vcpu->kvm, 133))
2410 		vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB;
2411 	if (test_kvm_facility(vcpu->kvm, 156))
2412 		vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN;
2413 	/* fprs can be synchronized via vrs, even if the guest has no vx. With
2414 	 * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
2415 	 */
2416 	if (MACHINE_HAS_VX)
2417 		vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
2418 	else
2419 		vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
2420 
2421 	if (kvm_is_ucontrol(vcpu->kvm))
2422 		return __kvm_ucontrol_vcpu_init(vcpu);
2423 
2424 	return 0;
2425 }
2426 
2427 /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
2428 static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu)
2429 {
2430 	WARN_ON_ONCE(vcpu->arch.cputm_start != 0);
2431 	raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
2432 	vcpu->arch.cputm_start = get_tod_clock_fast();
2433 	raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
2434 }
2435 
2436 /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
2437 static void __stop_cpu_timer_accounting(struct kvm_vcpu *vcpu)
2438 {
2439 	WARN_ON_ONCE(vcpu->arch.cputm_start == 0);
2440 	raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
2441 	vcpu->arch.sie_block->cputm -= get_tod_clock_fast() - vcpu->arch.cputm_start;
2442 	vcpu->arch.cputm_start = 0;
2443 	raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
2444 }
2445 
2446 /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
2447 static void __enable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
2448 {
2449 	WARN_ON_ONCE(vcpu->arch.cputm_enabled);
2450 	vcpu->arch.cputm_enabled = true;
2451 	__start_cpu_timer_accounting(vcpu);
2452 }
2453 
2454 /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
2455 static void __disable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
2456 {
2457 	WARN_ON_ONCE(!vcpu->arch.cputm_enabled);
2458 	__stop_cpu_timer_accounting(vcpu);
2459 	vcpu->arch.cputm_enabled = false;
2460 }
2461 
2462 static void enable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
2463 {
2464 	preempt_disable(); /* protect from TOD sync and vcpu_load/put */
2465 	__enable_cpu_timer_accounting(vcpu);
2466 	preempt_enable();
2467 }
2468 
2469 static void disable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
2470 {
2471 	preempt_disable(); /* protect from TOD sync and vcpu_load/put */
2472 	__disable_cpu_timer_accounting(vcpu);
2473 	preempt_enable();
2474 }
2475 
2476 /* set the cpu timer - may only be called from the VCPU thread itself */
2477 void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm)
2478 {
2479 	preempt_disable(); /* protect from TOD sync and vcpu_load/put */
2480 	raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
2481 	if (vcpu->arch.cputm_enabled)
2482 		vcpu->arch.cputm_start = get_tod_clock_fast();
2483 	vcpu->arch.sie_block->cputm = cputm;
2484 	raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
2485 	preempt_enable();
2486 }
2487 
2488 /* update and get the cpu timer - can also be called from other VCPU threads */
2489 __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu)
2490 {
2491 	unsigned int seq;
2492 	__u64 value;
2493 
2494 	if (unlikely(!vcpu->arch.cputm_enabled))
2495 		return vcpu->arch.sie_block->cputm;
2496 
2497 	preempt_disable(); /* protect from TOD sync and vcpu_load/put */
2498 	do {
2499 		seq = raw_read_seqcount(&vcpu->arch.cputm_seqcount);
2500 		/*
2501 		 * If the writer would ever execute a read in the critical
2502 		 * section, e.g. in irq context, we have a deadlock.
2503 		 */
2504 		WARN_ON_ONCE((seq & 1) && smp_processor_id() == vcpu->cpu);
2505 		value = vcpu->arch.sie_block->cputm;
2506 		/* if cputm_start is 0, accounting is being started/stopped */
2507 		if (likely(vcpu->arch.cputm_start))
2508 			value -= get_tod_clock_fast() - vcpu->arch.cputm_start;
2509 	} while (read_seqcount_retry(&vcpu->arch.cputm_seqcount, seq & ~1));
2510 	preempt_enable();
2511 	return value;
2512 }
2513 
2514 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2515 {
2516 
2517 	gmap_enable(vcpu->arch.enabled_gmap);
2518 	kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING);
2519 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
2520 		__start_cpu_timer_accounting(vcpu);
2521 	vcpu->cpu = cpu;
2522 }
2523 
2524 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2525 {
2526 	vcpu->cpu = -1;
2527 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
2528 		__stop_cpu_timer_accounting(vcpu);
2529 	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING);
2530 	vcpu->arch.enabled_gmap = gmap_get_enabled();
2531 	gmap_disable(vcpu->arch.enabled_gmap);
2532 
2533 }
2534 
2535 static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
2536 {
2537 	/* this equals initial cpu reset in pop, but we don't switch to ESA */
2538 	vcpu->arch.sie_block->gpsw.mask = 0UL;
2539 	vcpu->arch.sie_block->gpsw.addr = 0UL;
2540 	kvm_s390_set_prefix(vcpu, 0);
2541 	kvm_s390_set_cpu_timer(vcpu, 0);
2542 	vcpu->arch.sie_block->ckc       = 0UL;
2543 	vcpu->arch.sie_block->todpr     = 0;
2544 	memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64));
2545 	vcpu->arch.sie_block->gcr[0]  = CR0_UNUSED_56 |
2546 					CR0_INTERRUPT_KEY_SUBMASK |
2547 					CR0_MEASUREMENT_ALERT_SUBMASK;
2548 	vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 |
2549 					CR14_UNUSED_33 |
2550 					CR14_EXTERNAL_DAMAGE_SUBMASK;
2551 	/* make sure the new fpc will be lazily loaded */
2552 	save_fpu_regs();
2553 	current->thread.fpu.fpc = 0;
2554 	vcpu->arch.sie_block->gbea = 1;
2555 	vcpu->arch.sie_block->pp = 0;
2556 	vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
2557 	vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
2558 	kvm_clear_async_pf_completion_queue(vcpu);
2559 	if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
2560 		kvm_s390_vcpu_stop(vcpu);
2561 	kvm_s390_clear_local_irqs(vcpu);
2562 }
2563 
2564 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
2565 {
2566 	mutex_lock(&vcpu->kvm->lock);
2567 	preempt_disable();
2568 	vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch;
2569 	vcpu->arch.sie_block->epdx = vcpu->kvm->arch.epdx;
2570 	preempt_enable();
2571 	mutex_unlock(&vcpu->kvm->lock);
2572 	if (!kvm_is_ucontrol(vcpu->kvm)) {
2573 		vcpu->arch.gmap = vcpu->kvm->arch.gmap;
2574 		sca_add_vcpu(vcpu);
2575 	}
2576 	if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
2577 		vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
2578 	/* make vcpu_load load the right gmap on the first trigger */
2579 	vcpu->arch.enabled_gmap = vcpu->arch.gmap;
2580 }
2581 
2582 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
2583 {
2584 	if (!test_kvm_facility(vcpu->kvm, 76))
2585 		return;
2586 
2587 	vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
2588 
2589 	if (vcpu->kvm->arch.crypto.aes_kw)
2590 		vcpu->arch.sie_block->ecb3 |= ECB3_AES;
2591 	if (vcpu->kvm->arch.crypto.dea_kw)
2592 		vcpu->arch.sie_block->ecb3 |= ECB3_DEA;
2593 
2594 	vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
2595 }
2596 
2597 void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
2598 {
2599 	free_page(vcpu->arch.sie_block->cbrlo);
2600 	vcpu->arch.sie_block->cbrlo = 0;
2601 }
2602 
2603 int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
2604 {
2605 	vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL);
2606 	if (!vcpu->arch.sie_block->cbrlo)
2607 		return -ENOMEM;
2608 	return 0;
2609 }
2610 
2611 static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
2612 {
2613 	struct kvm_s390_cpu_model *model = &vcpu->kvm->arch.model;
2614 
2615 	vcpu->arch.sie_block->ibc = model->ibc;
2616 	if (test_kvm_facility(vcpu->kvm, 7))
2617 		vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list;
2618 }
2619 
2620 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
2621 {
2622 	int rc = 0;
2623 
2624 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
2625 						    CPUSTAT_SM |
2626 						    CPUSTAT_STOPPED);
2627 
2628 	if (test_kvm_facility(vcpu->kvm, 78))
2629 		kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED2);
2630 	else if (test_kvm_facility(vcpu->kvm, 8))
2631 		kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED);
2632 
2633 	kvm_s390_vcpu_setup_model(vcpu);
2634 
2635 	/* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
2636 	if (MACHINE_HAS_ESOP)
2637 		vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT;
2638 	if (test_kvm_facility(vcpu->kvm, 9))
2639 		vcpu->arch.sie_block->ecb |= ECB_SRSI;
2640 	if (test_kvm_facility(vcpu->kvm, 73))
2641 		vcpu->arch.sie_block->ecb |= ECB_TE;
2642 
2643 	if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi)
2644 		vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
2645 	if (test_kvm_facility(vcpu->kvm, 130))
2646 		vcpu->arch.sie_block->ecb2 |= ECB2_IEP;
2647 	vcpu->arch.sie_block->eca = ECA_MVPGI | ECA_PROTEXCI;
2648 	if (sclp.has_cei)
2649 		vcpu->arch.sie_block->eca |= ECA_CEI;
2650 	if (sclp.has_ib)
2651 		vcpu->arch.sie_block->eca |= ECA_IB;
2652 	if (sclp.has_siif)
2653 		vcpu->arch.sie_block->eca |= ECA_SII;
2654 	if (sclp.has_sigpif)
2655 		vcpu->arch.sie_block->eca |= ECA_SIGPI;
2656 	if (test_kvm_facility(vcpu->kvm, 129)) {
2657 		vcpu->arch.sie_block->eca |= ECA_VX;
2658 		vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
2659 	}
2660 	if (test_kvm_facility(vcpu->kvm, 139))
2661 		vcpu->arch.sie_block->ecd |= ECD_MEF;
2662 	if (test_kvm_facility(vcpu->kvm, 156))
2663 		vcpu->arch.sie_block->ecd |= ECD_ETOKENF;
2664 	if (vcpu->arch.sie_block->gd) {
2665 		vcpu->arch.sie_block->eca |= ECA_AIV;
2666 		VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
2667 			   vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id);
2668 	}
2669 	vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
2670 					| SDNXC;
2671 	vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
2672 
2673 	if (sclp.has_kss)
2674 		kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS);
2675 	else
2676 		vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
2677 
2678 	if (vcpu->kvm->arch.use_cmma) {
2679 		rc = kvm_s390_vcpu_setup_cmma(vcpu);
2680 		if (rc)
2681 			return rc;
2682 	}
2683 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2684 	vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
2685 
2686 	kvm_s390_vcpu_crypto_setup(vcpu);
2687 
2688 	return rc;
2689 }
2690 
2691 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
2692 				      unsigned int id)
2693 {
2694 	struct kvm_vcpu *vcpu;
2695 	struct sie_page *sie_page;
2696 	int rc = -EINVAL;
2697 
2698 	if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id))
2699 		goto out;
2700 
2701 	rc = -ENOMEM;
2702 
2703 	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2704 	if (!vcpu)
2705 		goto out;
2706 
2707 	BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
2708 	sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL);
2709 	if (!sie_page)
2710 		goto out_free_cpu;
2711 
2712 	vcpu->arch.sie_block = &sie_page->sie_block;
2713 	vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
2714 
2715 	/* the real guest size will always be smaller than msl */
2716 	vcpu->arch.sie_block->mso = 0;
2717 	vcpu->arch.sie_block->msl = sclp.hamax;
2718 
2719 	vcpu->arch.sie_block->icpua = id;
2720 	spin_lock_init(&vcpu->arch.local_int.lock);
2721 	vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa;
2722 	if (vcpu->arch.sie_block->gd && sclp.has_gisaf)
2723 		vcpu->arch.sie_block->gd |= GISA_FORMAT1;
2724 	seqcount_init(&vcpu->arch.cputm_seqcount);
2725 
2726 	rc = kvm_vcpu_init(vcpu, kvm, id);
2727 	if (rc)
2728 		goto out_free_sie_block;
2729 	VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu,
2730 		 vcpu->arch.sie_block);
2731 	trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block);
2732 
2733 	return vcpu;
2734 out_free_sie_block:
2735 	free_page((unsigned long)(vcpu->arch.sie_block));
2736 out_free_cpu:
2737 	kmem_cache_free(kvm_vcpu_cache, vcpu);
2738 out:
2739 	return ERR_PTR(rc);
2740 }
2741 
2742 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
2743 {
2744 	return kvm_s390_vcpu_has_irq(vcpu, 0);
2745 }
2746 
2747 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
2748 {
2749 	return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE);
2750 }
2751 
2752 void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu)
2753 {
2754 	atomic_or(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
2755 	exit_sie(vcpu);
2756 }
2757 
2758 void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu)
2759 {
2760 	atomic_andnot(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
2761 }
2762 
2763 static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu)
2764 {
2765 	atomic_or(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
2766 	exit_sie(vcpu);
2767 }
2768 
2769 static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
2770 {
2771 	atomic_andnot(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
2772 }
2773 
2774 /*
2775  * Kick a guest cpu out of SIE and wait until SIE is not running.
2776  * If the CPU is not running (e.g. waiting as idle) the function will
2777  * return immediately. */
2778 void exit_sie(struct kvm_vcpu *vcpu)
2779 {
2780 	kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
2781 	while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
2782 		cpu_relax();
2783 }
2784 
2785 /* Kick a guest cpu out of SIE to process a request synchronously */
2786 void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
2787 {
2788 	kvm_make_request(req, vcpu);
2789 	kvm_s390_vcpu_request(vcpu);
2790 }
2791 
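/*
 * gmap notifier: request an MMU reload for every VCPU whose prefix pages
 * intersect the invalidated range, so that the ipte notifier is re-armed.
 */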
2792 static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
2793 			      unsigned long end)
2794 {
2795 	struct kvm *kvm = gmap->private;
2796 	struct kvm_vcpu *vcpu;
2797 	unsigned long prefix;
2798 	int i;
2799 
2800 	if (gmap_is_shadow(gmap))
2801 		return;
2802 	if (start >= 1UL << 31)
2803 		/* We are only interested in prefix pages */
2804 		return;
2805 	kvm_for_each_vcpu(i, vcpu, kvm) {
2806 		/* match against both prefix pages */
2807 		prefix = kvm_s390_get_prefix(vcpu);
2808 		if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
2809 			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
2810 				   start, end);
2811 			kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
2812 		}
2813 	}
2814 }
2815 
2816 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
2817 {
2818 	/* kvm common code refers to this, but never calls it */
2819 	BUG();
2820 	return 0;
2821 }
2822 
2823 static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
2824 					   struct kvm_one_reg *reg)
2825 {
2826 	int r = -EINVAL;
2827 
2828 	switch (reg->id) {
2829 	case KVM_REG_S390_TODPR:
2830 		r = put_user(vcpu->arch.sie_block->todpr,
2831 			     (u32 __user *)reg->addr);
2832 		break;
2833 	case KVM_REG_S390_EPOCHDIFF:
2834 		r = put_user(vcpu->arch.sie_block->epoch,
2835 			     (u64 __user *)reg->addr);
2836 		break;
2837 	case KVM_REG_S390_CPU_TIMER:
2838 		r = put_user(kvm_s390_get_cpu_timer(vcpu),
2839 			     (u64 __user *)reg->addr);
2840 		break;
2841 	case KVM_REG_S390_CLOCK_COMP:
2842 		r = put_user(vcpu->arch.sie_block->ckc,
2843 			     (u64 __user *)reg->addr);
2844 		break;
2845 	case KVM_REG_S390_PFTOKEN:
2846 		r = put_user(vcpu->arch.pfault_token,
2847 			     (u64 __user *)reg->addr);
2848 		break;
2849 	case KVM_REG_S390_PFCOMPARE:
2850 		r = put_user(vcpu->arch.pfault_compare,
2851 			     (u64 __user *)reg->addr);
2852 		break;
2853 	case KVM_REG_S390_PFSELECT:
2854 		r = put_user(vcpu->arch.pfault_select,
2855 			     (u64 __user *)reg->addr);
2856 		break;
2857 	case KVM_REG_S390_PP:
2858 		r = put_user(vcpu->arch.sie_block->pp,
2859 			     (u64 __user *)reg->addr);
2860 		break;
2861 	case KVM_REG_S390_GBEA:
2862 		r = put_user(vcpu->arch.sie_block->gbea,
2863 			     (u64 __user *)reg->addr);
2864 		break;
2865 	default:
2866 		break;
2867 	}
2868 
2869 	return r;
2870 }
2871 
2872 static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
2873 					   struct kvm_one_reg *reg)
2874 {
2875 	int r = -EINVAL;
2876 	__u64 val;
2877 
2878 	switch (reg->id) {
2879 	case KVM_REG_S390_TODPR:
2880 		r = get_user(vcpu->arch.sie_block->todpr,
2881 			     (u32 __user *)reg->addr);
2882 		break;
2883 	case KVM_REG_S390_EPOCHDIFF:
2884 		r = get_user(vcpu->arch.sie_block->epoch,
2885 			     (u64 __user *)reg->addr);
2886 		break;
2887 	case KVM_REG_S390_CPU_TIMER:
2888 		r = get_user(val, (u64 __user *)reg->addr);
2889 		if (!r)
2890 			kvm_s390_set_cpu_timer(vcpu, val);
2891 		break;
2892 	case KVM_REG_S390_CLOCK_COMP:
2893 		r = get_user(vcpu->arch.sie_block->ckc,
2894 			     (u64 __user *)reg->addr);
2895 		break;
2896 	case KVM_REG_S390_PFTOKEN:
2897 		r = get_user(vcpu->arch.pfault_token,
2898 			     (u64 __user *)reg->addr);
2899 		if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
2900 			kvm_clear_async_pf_completion_queue(vcpu);
2901 		break;
2902 	case KVM_REG_S390_PFCOMPARE:
2903 		r = get_user(vcpu->arch.pfault_compare,
2904 			     (u64 __user *)reg->addr);
2905 		break;
2906 	case KVM_REG_S390_PFSELECT:
2907 		r = get_user(vcpu->arch.pfault_select,
2908 			     (u64 __user *)reg->addr);
2909 		break;
2910 	case KVM_REG_S390_PP:
2911 		r = get_user(vcpu->arch.sie_block->pp,
2912 			     (u64 __user *)reg->addr);
2913 		break;
2914 	case KVM_REG_S390_GBEA:
2915 		r = get_user(vcpu->arch.sie_block->gbea,
2916 			     (u64 __user *)reg->addr);
2917 		break;
2918 	default:
2919 		break;
2920 	}
2921 
2922 	return r;
2923 }
2924 
2925 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
2926 {
2927 	kvm_s390_vcpu_initial_reset(vcpu);
2928 	return 0;
2929 }
2930 
2931 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2932 {
2933 	vcpu_load(vcpu);
2934 	memcpy(&vcpu->run->s.regs.gprs, &regs->gprs, sizeof(regs->gprs));
2935 	vcpu_put(vcpu);
2936 	return 0;
2937 }
2938 
2939 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2940 {
2941 	vcpu_load(vcpu);
2942 	memcpy(&regs->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs));
2943 	vcpu_put(vcpu);
2944 	return 0;
2945 }
2946 
2947 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2948 				  struct kvm_sregs *sregs)
2949 {
2950 	vcpu_load(vcpu);
2951 
2952 	memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs));
2953 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
2954 
2955 	vcpu_put(vcpu);
2956 	return 0;
2957 }
2958 
2959 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2960 				  struct kvm_sregs *sregs)
2961 {
2962 	vcpu_load(vcpu);
2963 
2964 	memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs));
2965 	memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
2966 
2967 	vcpu_put(vcpu);
2968 	return 0;
2969 }
2970 
2971 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2972 {
2973 	int ret = 0;
2974 
2975 	vcpu_load(vcpu);
2976 
2977 	if (test_fp_ctl(fpu->fpc)) {
2978 		ret = -EINVAL;
2979 		goto out;
2980 	}
2981 	vcpu->run->s.regs.fpc = fpu->fpc;
2982 	if (MACHINE_HAS_VX)
2983 		convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs,
2984 				 (freg_t *) fpu->fprs);
2985 	else
2986 		memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs));
2987 
2988 out:
2989 	vcpu_put(vcpu);
2990 	return ret;
2991 }
2992 
2993 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2994 {
2995 	vcpu_load(vcpu);
2996 
2997 	/* make sure we have the latest values */
2998 	save_fpu_regs();
2999 	if (MACHINE_HAS_VX)
3000 		convert_vx_to_fp((freg_t *) fpu->fprs,
3001 				 (__vector128 *) vcpu->run->s.regs.vrs);
3002 	else
3003 		memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs));
3004 	fpu->fpc = vcpu->run->s.regs.fpc;
3005 
3006 	vcpu_put(vcpu);
3007 	return 0;
3008 }
3009 
3010 static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
3011 {
3012 	int rc = 0;
3013 
3014 	if (!is_vcpu_stopped(vcpu))
3015 		rc = -EBUSY;
3016 	else {
3017 		vcpu->run->psw_mask = psw.mask;
3018 		vcpu->run->psw_addr = psw.addr;
3019 	}
3020 	return rc;
3021 }
3022 
3023 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
3024 				  struct kvm_translation *tr)
3025 {
3026 	return -EINVAL; /* not implemented yet */
3027 }
3028 
3029 #define VALID_GUESTDBG_FLAGS (KVM_GUESTDBG_SINGLESTEP | \
3030 			      KVM_GUESTDBG_USE_HW_BP | \
3031 			      KVM_GUESTDBG_ENABLE)
3032 
3033 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
3034 					struct kvm_guest_debug *dbg)
3035 {
3036 	int rc = 0;
3037 
3038 	vcpu_load(vcpu);
3039 
3040 	vcpu->guest_debug = 0;
3041 	kvm_s390_clear_bp_data(vcpu);
3042 
3043 	if (dbg->control & ~VALID_GUESTDBG_FLAGS) {
3044 		rc = -EINVAL;
3045 		goto out;
3046 	}
3047 	if (!sclp.has_gpere) {
3048 		rc = -EINVAL;
3049 		goto out;
3050 	}
3051 
3052 	if (dbg->control & KVM_GUESTDBG_ENABLE) {
3053 		vcpu->guest_debug = dbg->control;
3054 		/* enforce guest PER */
3055 		kvm_s390_set_cpuflags(vcpu, CPUSTAT_P);
3056 
3057 		if (dbg->control & KVM_GUESTDBG_USE_HW_BP)
3058 			rc = kvm_s390_import_bp_data(vcpu, dbg);
3059 	} else {
3060 		kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P);
3061 		vcpu->arch.guestdbg.last_bp = 0;
3062 	}
3063 
3064 	if (rc) {
3065 		vcpu->guest_debug = 0;
3066 		kvm_s390_clear_bp_data(vcpu);
3067 		kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P);
3068 	}
3069 
3070 out:
3071 	vcpu_put(vcpu);
3072 	return rc;
3073 }
3074 
3075 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3076 				    struct kvm_mp_state *mp_state)
3077 {
3078 	int ret;
3079 
3080 	vcpu_load(vcpu);
3081 
3082 	/* CHECK_STOP and LOAD are not supported yet */
3083 	ret = is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED :
3084 				      KVM_MP_STATE_OPERATING;
3085 
3086 	vcpu_put(vcpu);
3087 	return ret;
3088 }
3089 
3090 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3091 				    struct kvm_mp_state *mp_state)
3092 {
3093 	int rc = 0;
3094 
3095 	vcpu_load(vcpu);
3096 
3097 	/* user space knows about this interface - let it control the state */
3098 	vcpu->kvm->arch.user_cpu_state_ctrl = 1;
3099 
3100 	switch (mp_state->mp_state) {
3101 	case KVM_MP_STATE_STOPPED:
3102 		kvm_s390_vcpu_stop(vcpu);
3103 		break;
3104 	case KVM_MP_STATE_OPERATING:
3105 		kvm_s390_vcpu_start(vcpu);
3106 		break;
3107 	case KVM_MP_STATE_LOAD:
3108 	case KVM_MP_STATE_CHECK_STOP:
3109 		/* fall through - CHECK_STOP and LOAD are not supported yet */
3110 	default:
3111 		rc = -ENXIO;
3112 	}
3113 
3114 	vcpu_put(vcpu);
3115 	return rc;
3116 }
3117 
3118 static bool ibs_enabled(struct kvm_vcpu *vcpu)
3119 {
3120 	return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS);
3121 }
3122 
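/* Process all pending requests for this VCPU before (re-)entering SIE. */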
3123 static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
3124 {
3125 retry:
3126 	kvm_s390_vcpu_request_handled(vcpu);
3127 	if (!kvm_request_pending(vcpu))
3128 		return 0;
3129 	/*
3130 	 * We use MMU_RELOAD just to re-arm the ipte notifier for the
3131 	 * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
3132 	 * This ensures that the ipte instruction for this request has
3133 	 * already finished. We might race against a second unmapper that
3134 	 * wants to set the blocking bit. Let's just retry the request loop.
3135 	 */
3136 	if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
3137 		int rc;
3138 		rc = gmap_mprotect_notify(vcpu->arch.gmap,
3139 					  kvm_s390_get_prefix(vcpu),
3140 					  PAGE_SIZE * 2, PROT_WRITE);
3141 		if (rc) {
3142 			kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
3143 			return rc;
3144 		}
3145 		goto retry;
3146 	}
3147 
3148 	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
3149 		vcpu->arch.sie_block->ihcpu = 0xffff;
3150 		goto retry;
3151 	}
3152 
3153 	if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) {
3154 		if (!ibs_enabled(vcpu)) {
3155 			trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1);
3156 			kvm_s390_set_cpuflags(vcpu, CPUSTAT_IBS);
3157 		}
3158 		goto retry;
3159 	}
3160 
3161 	if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) {
3162 		if (ibs_enabled(vcpu)) {
3163 			trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0);
3164 			kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IBS);
3165 		}
3166 		goto retry;
3167 	}
3168 
3169 	if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
3170 		vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
3171 		goto retry;
3172 	}
3173 
3174 	if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) {
3175 		/*
3176 		 * Disable CMM virtualization; we will emulate the ESSA
3177 		 * instruction manually, in order to provide additional
3178 		 * functionalities needed for live migration.
3179 		 */
3180 		vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA;
3181 		goto retry;
3182 	}
3183 
3184 	if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) {
3185 		/*
3186 		 * Re-enable CMM virtualization if CMMA is available and
3187 		 * CMM has been used.
3188 		 */
3189 		if ((vcpu->kvm->arch.use_cmma) &&
3190 		    (vcpu->kvm->mm->context.uses_cmm))
3191 			vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
3192 		goto retry;
3193 	}
3194 
3195 	/* nothing to do, just clear the request */
3196 	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
3197 
3198 	return 0;
3199 }
3200 
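/*
 * Set the guest TOD epoch (and epoch index) from the requested TOD value
 * and propagate it to all VCPUs.
 */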
3201 void kvm_s390_set_tod_clock(struct kvm *kvm,
3202 			    const struct kvm_s390_vm_tod_clock *gtod)
3203 {
3204 	struct kvm_vcpu *vcpu;
3205 	struct kvm_s390_tod_clock_ext htod;
3206 	int i;
3207 
3208 	mutex_lock(&kvm->lock);
3209 	preempt_disable();
3210 
3211 	get_tod_clock_ext((char *)&htod);
3212 
3213 	kvm->arch.epoch = gtod->tod - htod.tod;
3214 	kvm->arch.epdx = 0;
3215 	if (test_kvm_facility(kvm, 139)) {
3216 		kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx;
3217 		if (kvm->arch.epoch > gtod->tod)
3218 			kvm->arch.epdx -= 1;
3219 	}
3220 
3221 	kvm_s390_vcpu_block_all(kvm);
3222 	kvm_for_each_vcpu(i, vcpu, kvm) {
3223 		vcpu->arch.sie_block->epoch = kvm->arch.epoch;
3224 		vcpu->arch.sie_block->epdx  = kvm->arch.epdx;
3225 	}
3226 
3227 	kvm_s390_vcpu_unblock_all(kvm);
3228 	preempt_enable();
3229 	mutex_unlock(&kvm->lock);
3230 }
3231 
3232 /**
3233  * kvm_arch_fault_in_page - fault-in guest page if necessary
3234  * @vcpu: The corresponding virtual cpu
3235  * @gpa: Guest physical address
3236  * @writable: Whether the page should be writable or not
3237  *
3238  * Make sure that a guest page has been faulted-in on the host.
3239  *
3240  * Return: Zero on success, negative error code otherwise.
3241  */
3242 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable)
3243 {
3244 	return gmap_fault(vcpu->arch.gmap, gpa,
3245 			  writable ? FAULT_FLAG_WRITE : 0);
3246 }
3247 
3248 static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
3249 				      unsigned long token)
3250 {
3251 	struct kvm_s390_interrupt inti;
3252 	struct kvm_s390_irq irq;
3253 
3254 	if (start_token) {
3255 		irq.u.ext.ext_params2 = token;
3256 		irq.type = KVM_S390_INT_PFAULT_INIT;
3257 		WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq));
3258 	} else {
3259 		inti.type = KVM_S390_INT_PFAULT_DONE;
3260 		inti.parm64 = token;
3261 		WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti));
3262 	}
3263 }
3264 
3265 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
3266 				     struct kvm_async_pf *work)
3267 {
3268 	trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token);
3269 	__kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token);
3270 }
3271 
3272 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
3273 				 struct kvm_async_pf *work)
3274 {
3275 	trace_kvm_s390_pfault_done(vcpu, work->arch.pfault_token);
3276 	__kvm_inject_pfault_token(vcpu, false, work->arch.pfault_token);
3277 }
3278 
3279 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
3280 			       struct kvm_async_pf *work)
3281 {
3282 	/* s390 will always inject the page directly */
3283 }
3284 
3285 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
3286 {
3287 	/*
3288 	 * s390 will always inject the page directly,
3289 	 * but we still want check_async_completion to clean up
3290 	 */
3291 	return true;
3292 }
3293 
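/*
 * Set up an async page fault for the current host fault if pfault is
 * enabled and the guest is currently able to take the notification.
 */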
3294 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
3295 {
3296 	hva_t hva;
3297 	struct kvm_arch_async_pf arch;
3298 	int rc;
3299 
3300 	if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
3301 		return 0;
3302 	if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) !=
3303 	    vcpu->arch.pfault_compare)
3304 		return 0;
3305 	if (psw_extint_disabled(vcpu))
3306 		return 0;
3307 	if (kvm_s390_vcpu_has_irq(vcpu, 0))
3308 		return 0;
3309 	if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK))
3310 		return 0;
3311 	if (!vcpu->arch.gmap->pfault_enabled)
3312 		return 0;
3313 
3314 	hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr));
3315 	hva += current->thread.gmap_addr & ~PAGE_MASK;
3316 	if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8))
3317 		return 0;
3318 
3319 	rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
3320 	return rc;
3321 }
3322 
3323 static int vcpu_pre_run(struct kvm_vcpu *vcpu)
3324 {
3325 	int rc, cpuflags;
3326 
3327 	/*
3328 	 * On s390, notifications for arriving pages will be delivered directly
3329 	 * to the guest, but the housekeeping for completed pfaults is
3330 	 * handled outside the worker.
3331 	 */
3332 	kvm_check_async_pf_completion(vcpu);
3333 
3334 	vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14];
3335 	vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15];
3336 
3337 	if (need_resched())
3338 		schedule();
3339 
3340 	if (test_cpu_flag(CIF_MCCK_PENDING))
3341 		s390_handle_mcck();
3342 
3343 	if (!kvm_is_ucontrol(vcpu->kvm)) {
3344 		rc = kvm_s390_deliver_pending_interrupts(vcpu);
3345 		if (rc)
3346 			return rc;
3347 	}
3348 
3349 	rc = kvm_s390_handle_requests(vcpu);
3350 	if (rc)
3351 		return rc;
3352 
3353 	if (guestdbg_enabled(vcpu)) {
3354 		kvm_s390_backup_guest_per_regs(vcpu);
3355 		kvm_s390_patch_guest_per_regs(vcpu);
3356 	}
3357 
3358 	vcpu->arch.sie_block->icptcode = 0;
3359 	cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
3360 	VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
3361 	trace_kvm_s390_sie_enter(vcpu, cpuflags);
3362 
3363 	return 0;
3364 }
3365 
3366 static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
3367 {
3368 	struct kvm_s390_pgm_info pgm_info = {
3369 		.code = PGM_ADDRESSING,
3370 	};
3371 	u8 opcode, ilen;
3372 	int rc;
3373 
3374 	VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
3375 	trace_kvm_s390_sie_fault(vcpu);
3376 
3377 	/*
3378 	 * We want to inject an addressing exception, which is defined as a
3379 	 * suppressing or terminating exception. However, since we came here
3380 	 * by a DAT access exception, the PSW still points to the faulting
3381 	 * instruction since DAT exceptions are nullifying. So we've got
3382 	 * to look up the current opcode to get the length of the instruction
3383 	 * to be able to forward the PSW.
3384 	 */
3385 	rc = read_guest_instr(vcpu, vcpu->arch.sie_block->gpsw.addr, &opcode, 1);
3386 	ilen = insn_length(opcode);
3387 	if (rc < 0) {
3388 		return rc;
3389 	} else if (rc) {
3390 		/* Instruction-Fetching Exceptions - we can't detect the ilen.
3391 		 * Forward by arbitrary ilc, injection will take care of
3392 		 * nullification if necessary.
3393 		 */
3394 		pgm_info = vcpu->arch.pgm;
3395 		ilen = 4;
3396 	}
3397 	pgm_info.flags = ilen | KVM_S390_PGM_FLAGS_ILC_VALID;
3398 	kvm_s390_forward_psw(vcpu, ilen);
3399 	return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
3400 }
3401 
3402 static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
3403 {
3404 	struct mcck_volatile_info *mcck_info;
3405 	struct sie_page *sie_page;
3406 
3407 	VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
3408 		   vcpu->arch.sie_block->icptcode);
3409 	trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
3410 
3411 	if (guestdbg_enabled(vcpu))
3412 		kvm_s390_restore_guest_per_regs(vcpu);
3413 
3414 	vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14;
3415 	vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15;
3416 
3417 	if (exit_reason == -EINTR) {
3418 		VCPU_EVENT(vcpu, 3, "%s", "machine check");
3419 		sie_page = container_of(vcpu->arch.sie_block,
3420 					struct sie_page, sie_block);
3421 		mcck_info = &sie_page->mcck_info;
3422 		kvm_s390_reinject_machine_check(vcpu, mcck_info);
3423 		return 0;
3424 	}
3425 
3426 	if (vcpu->arch.sie_block->icptcode > 0) {
3427 		int rc = kvm_handle_sie_intercept(vcpu);
3428 
3429 		if (rc != -EOPNOTSUPP)
3430 			return rc;
3431 		vcpu->run->exit_reason = KVM_EXIT_S390_SIEIC;
3432 		vcpu->run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode;
3433 		vcpu->run->s390_sieic.ipa = vcpu->arch.sie_block->ipa;
3434 		vcpu->run->s390_sieic.ipb = vcpu->arch.sie_block->ipb;
3435 		return -EREMOTE;
3436 	} else if (exit_reason != -EFAULT) {
3437 		vcpu->stat.exit_null++;
3438 		return 0;
3439 	} else if (kvm_is_ucontrol(vcpu->kvm)) {
3440 		vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
3441 		vcpu->run->s390_ucontrol.trans_exc_code =
3442 						current->thread.gmap_addr;
3443 		vcpu->run->s390_ucontrol.pgm_code = 0x10;
3444 		return -EREMOTE;
3445 	} else if (current->thread.gmap_pfault) {
3446 		trace_kvm_s390_major_guest_pfault(vcpu);
3447 		current->thread.gmap_pfault = 0;
3448 		if (kvm_arch_setup_async_pf(vcpu))
3449 			return 0;
3450 		return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1);
3451 	}
3452 	return vcpu_post_run_fault_in_sie(vcpu);
3453 }
3454 
3455 static int __vcpu_run(struct kvm_vcpu *vcpu)
3456 {
3457 	int rc, exit_reason;
3458 
3459 	/*
3460 	 * We try to hold kvm->srcu during most of vcpu_run (except when
3461 	 * running the guest), so that memslots (and other stuff) are protected
3462 	 */
3463 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3464 
3465 	do {
3466 		rc = vcpu_pre_run(vcpu);
3467 		if (rc)
3468 			break;
3469 
3470 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3471 		/*
3472 		 * As PF_VCPU will be used in fault handler, between
3473 		 * guest_enter and guest_exit should be no uaccess.
3474 		 */
3475 		local_irq_disable();
3476 		guest_enter_irqoff();
3477 		__disable_cpu_timer_accounting(vcpu);
3478 		local_irq_enable();
3479 		exit_reason = sie64a(vcpu->arch.sie_block,
3480 				     vcpu->run->s.regs.gprs);
3481 		local_irq_disable();
3482 		__enable_cpu_timer_accounting(vcpu);
3483 		guest_exit_irqoff();
3484 		local_irq_enable();
3485 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3486 
3487 		rc = vcpu_post_run(vcpu, exit_reason);
3488 	} while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc);
3489 
3490 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3491 	return rc;
3492 }
3493 
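/*
 * Copy the dirty parts of kvm_run into the SIE block and load the guest
 * access, floating point/vector and guarded storage registers.
 */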
3494 static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3495 {
3496 	struct runtime_instr_cb *riccb;
3497 	struct gs_cb *gscb;
3498 
3499 	riccb = (struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
3500 	gscb = (struct gs_cb *) &kvm_run->s.regs.gscb;
3501 	vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
3502 	vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
3503 	if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
3504 		kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
3505 	if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
3506 		memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
3507 		/* some control register changes require a tlb flush */
3508 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3509 	}
3510 	if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
3511 		kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
3512 		vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
3513 		vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr;
3514 		vcpu->arch.sie_block->pp = kvm_run->s.regs.pp;
3515 		vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea;
3516 	}
3517 	if (kvm_run->kvm_dirty_regs & KVM_SYNC_PFAULT) {
3518 		vcpu->arch.pfault_token = kvm_run->s.regs.pft;
3519 		vcpu->arch.pfault_select = kvm_run->s.regs.pfs;
3520 		vcpu->arch.pfault_compare = kvm_run->s.regs.pfc;
3521 		if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
3522 			kvm_clear_async_pf_completion_queue(vcpu);
3523 	}
3524 	/*
3525 	 * If userspace sets the riccb (e.g. after migration) to a valid state,
3526 	 * we should enable RI here instead of doing the lazy enablement.
3527 	 */
3528 	if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
3529 	    test_kvm_facility(vcpu->kvm, 64) &&
3530 	    riccb->v &&
3531 	    !(vcpu->arch.sie_block->ecb3 & ECB3_RI)) {
3532 		VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)");
3533 		vcpu->arch.sie_block->ecb3 |= ECB3_RI;
3534 	}
3535 	/*
3536 	 * If userspace sets the gscb (e.g. after migration) to non-zero,
3537 	 * we should enable GS here instead of doing the lazy enablement.
3538 	 */
3539 	if ((kvm_run->kvm_dirty_regs & KVM_SYNC_GSCB) &&
3540 	    test_kvm_facility(vcpu->kvm, 133) &&
3541 	    gscb->gssm &&
3542 	    !vcpu->arch.gs_enabled) {
3543 		VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (sync_regs)");
3544 		vcpu->arch.sie_block->ecb |= ECB_GS;
3545 		vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
3546 		vcpu->arch.gs_enabled = 1;
3547 	}
3548 	if ((kvm_run->kvm_dirty_regs & KVM_SYNC_BPBC) &&
3549 	    test_kvm_facility(vcpu->kvm, 82)) {
3550 		vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
3551 		vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0;
3552 	}
3553 	save_access_regs(vcpu->arch.host_acrs);
3554 	restore_access_regs(vcpu->run->s.regs.acrs);
3555 	/* save host (userspace) fprs/vrs */
3556 	save_fpu_regs();
3557 	vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
3558 	vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
3559 	if (MACHINE_HAS_VX)
3560 		current->thread.fpu.regs = vcpu->run->s.regs.vrs;
3561 	else
3562 		current->thread.fpu.regs = vcpu->run->s.regs.fprs;
3563 	current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
3564 	if (test_fp_ctl(current->thread.fpu.fpc))
3565 		/* User space provided an invalid FPC, let's clear it */
3566 		current->thread.fpu.fpc = 0;
3567 	if (MACHINE_HAS_GS) {
3568 		preempt_disable();
3569 		__ctl_set_bit(2, 4);
3570 		if (current->thread.gs_cb) {
3571 			vcpu->arch.host_gscb = current->thread.gs_cb;
3572 			save_gs_cb(vcpu->arch.host_gscb);
3573 		}
3574 		if (vcpu->arch.gs_enabled) {
3575 			current->thread.gs_cb = (struct gs_cb *)
3576 						&vcpu->run->s.regs.gscb;
3577 			restore_gs_cb(current->thread.gs_cb);
3578 		}
3579 		preempt_enable();
3580 	}
3581 	/* SIE will load etoken directly from SDNX and therefore kvm_run */
3582 
3583 	kvm_run->kvm_dirty_regs = 0;
3584 }
3585 
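/*
 * Counterpart of sync_regs: save the guest state into kvm_run and restore
 * the host register state.
 */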
3586 static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3587 {
3588 	kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask;
3589 	kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
3590 	kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
3591 	memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
3592 	kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu);
3593 	kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc;
3594 	kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
3595 	kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
3596 	kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
3597 	kvm_run->s.regs.pft = vcpu->arch.pfault_token;
3598 	kvm_run->s.regs.pfs = vcpu->arch.pfault_select;
3599 	kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
3600 	kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
3601 	save_access_regs(vcpu->run->s.regs.acrs);
3602 	restore_access_regs(vcpu->arch.host_acrs);
3603 	/* Save guest register state */
3604 	save_fpu_regs();
3605 	vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
3606 	/* Restore will be done lazily at return */
3607 	current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
3608 	current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
3609 	if (MACHINE_HAS_GS) {
3610 		__ctl_set_bit(2, 4);
3611 		if (vcpu->arch.gs_enabled)
3612 			save_gs_cb(current->thread.gs_cb);
3613 		preempt_disable();
3614 		current->thread.gs_cb = vcpu->arch.host_gscb;
3615 		restore_gs_cb(vcpu->arch.host_gscb);
3616 		preempt_enable();
3617 		if (!vcpu->arch.host_gscb)
3618 			__ctl_clear_bit(2, 4);
3619 		vcpu->arch.host_gscb = NULL;
3620 	}
3621 	/* SIE will save etoken directly into SDNX and therefore kvm_run */
3622 }
3623 
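/*
 * kvm_arch_vcpu_ioctl_run() backs the KVM_RUN ioctl: it syncs register
 * state in from kvm_run, enters the SIE loop via __vcpu_run() and
 * writes the resulting state back before returning to userspace.
 */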
3624 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3625 {
3626 	int rc;
3627 
3628 	if (kvm_run->immediate_exit)
3629 		return -EINTR;
3630 
3631 	vcpu_load(vcpu);
3632 
3633 	if (guestdbg_exit_pending(vcpu)) {
3634 		kvm_s390_prepare_debug_exit(vcpu);
3635 		rc = 0;
3636 		goto out;
3637 	}
3638 
3639 	kvm_sigset_activate(vcpu);
3640 
3641 	if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) {
3642 		kvm_s390_vcpu_start(vcpu);
3643 	} else if (is_vcpu_stopped(vcpu)) {
3644 		pr_err_ratelimited("can't run stopped vcpu %d\n",
3645 				   vcpu->vcpu_id);
3646 		rc = -EINVAL;
3647 		goto out;
3648 	}
3649 
3650 	sync_regs(vcpu, kvm_run);
3651 	enable_cpu_timer_accounting(vcpu);
3652 
3653 	might_fault();
3654 	rc = __vcpu_run(vcpu);
3655 
3656 	if (signal_pending(current) && !rc) {
3657 		kvm_run->exit_reason = KVM_EXIT_INTR;
3658 		rc = -EINTR;
3659 	}
3660 
3661 	if (guestdbg_exit_pending(vcpu) && !rc)  {
3662 		kvm_s390_prepare_debug_exit(vcpu);
3663 		rc = 0;
3664 	}
3665 
3666 	if (rc == -EREMOTE) {
3667 		/* userspace support is needed, kvm_run has been prepared */
3668 		rc = 0;
3669 	}
3670 
3671 	disable_cpu_timer_accounting(vcpu);
3672 	store_regs(vcpu, kvm_run);
3673 
3674 	kvm_sigset_deactivate(vcpu);
3675 
3676 	vcpu->stat.exit_userspace++;
3677 out:
3678 	vcpu_put(vcpu);
3679 	return rc;
3680 }
3681 
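/*
 * Illustrative only: a minimal userspace sketch that ends up in the run
 * ioctl above. The descriptor names are hypothetical; the ioctls are
 * the generic KVM ones from <linux/kvm.h>.
 *
 *	int kvm = open("/dev/kvm", O_RDWR);
 *	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
 *	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
 *	struct kvm_run *run = mmap(NULL, ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0),
 *				   PROT_READ | PROT_WRITE, MAP_SHARED, vcpu, 0);
 *	while (ioctl(vcpu, KVM_RUN, 0) == 0)
 *		;	// inspect run->exit_reason here
 */
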
3682 /*
3683  * store status at address
3684  * we have two special cases:
3685  * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
3686  * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
3687  */
3688 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
3689 {
3690 	unsigned char archmode = 1;
3691 	freg_t fprs[NUM_FPRS];
3692 	unsigned int px;
3693 	u64 clkcomp, cputm;
3694 	int rc;
3695 
3696 	px = kvm_s390_get_prefix(vcpu);
3697 	if (gpa == KVM_S390_STORE_STATUS_NOADDR) {
3698 		if (write_guest_abs(vcpu, 163, &archmode, 1))
3699 			return -EFAULT;
3700 		gpa = 0;
3701 	} else if (gpa == KVM_S390_STORE_STATUS_PREFIXED) {
3702 		if (write_guest_real(vcpu, 163, &archmode, 1))
3703 			return -EFAULT;
3704 		gpa = px;
3705 	} else
3706 		gpa -= __LC_FPREGS_SAVE_AREA;
3707 
3708 	/* manually convert vector registers if necessary */
3709 	if (MACHINE_HAS_VX) {
3710 		convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
3711 		rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
3712 				     fprs, 128);
3713 	} else {
3714 		rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
3715 				     vcpu->run->s.regs.fprs, 128);
3716 	}
3717 	rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA,
3718 			      vcpu->run->s.regs.gprs, 128);
3719 	rc |= write_guest_abs(vcpu, gpa + __LC_PSW_SAVE_AREA,
3720 			      &vcpu->arch.sie_block->gpsw, 16);
3721 	rc |= write_guest_abs(vcpu, gpa + __LC_PREFIX_SAVE_AREA,
3722 			      &px, 4);
3723 	rc |= write_guest_abs(vcpu, gpa + __LC_FP_CREG_SAVE_AREA,
3724 			      &vcpu->run->s.regs.fpc, 4);
3725 	rc |= write_guest_abs(vcpu, gpa + __LC_TOD_PROGREG_SAVE_AREA,
3726 			      &vcpu->arch.sie_block->todpr, 4);
3727 	cputm = kvm_s390_get_cpu_timer(vcpu);
3728 	rc |= write_guest_abs(vcpu, gpa + __LC_CPU_TIMER_SAVE_AREA,
3729 			      &cputm, 8);
3730 	clkcomp = vcpu->arch.sie_block->ckc >> 8;
3731 	rc |= write_guest_abs(vcpu, gpa + __LC_CLOCK_COMP_SAVE_AREA,
3732 			      &clkcomp, 8);
3733 	rc |= write_guest_abs(vcpu, gpa + __LC_AREGS_SAVE_AREA,
3734 			      &vcpu->run->s.regs.acrs, 64);
3735 	rc |= write_guest_abs(vcpu, gpa + __LC_CREGS_SAVE_AREA,
3736 			      &vcpu->arch.sie_block->gcr, 128);
3737 	return rc ? -EFAULT : 0;
3738 }
3739 
3740 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
3741 {
3742 	/*
3743 	 * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy
3744 	 * switch in the run ioctl. Let's update our copies before we save
3745 	 * it into the save area
3746 	 * them into the save area.
3747 	save_fpu_regs();
3748 	vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
3749 	save_access_regs(vcpu->run->s.regs.acrs);
3750 
3751 	return kvm_s390_store_status_unloaded(vcpu, addr);
3752 }
3753 
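/*
 * IBS is only worth having while a single VCPU is running. The helpers
 * below cancel a pending request of the opposite kind and queue the new
 * ENABLE/DISABLE request; enabling is skipped if the machine has no IBS.
 */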
3754 static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
3755 {
3756 	kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
3757 	kvm_s390_sync_request(KVM_REQ_DISABLE_IBS, vcpu);
3758 }
3759 
3760 static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
3761 {
3762 	unsigned int i;
3763 	struct kvm_vcpu *vcpu;
3764 
3765 	kvm_for_each_vcpu(i, vcpu, kvm) {
3766 		__disable_ibs_on_vcpu(vcpu);
3767 	}
3768 }
3769 
3770 static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
3771 {
3772 	if (!sclp.has_ibs)
3773 		return;
3774 	kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
3775 	kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
3776 }
3777 
3778 void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
3779 {
3780 	int i, online_vcpus, started_vcpus = 0;
3781 
3782 	if (!is_vcpu_stopped(vcpu))
3783 		return;
3784 
3785 	trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1);
3786 	/* Only one cpu at a time may enter/leave the STOPPED state. */
3787 	spin_lock(&vcpu->kvm->arch.start_stop_lock);
3788 	online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
3789 
3790 	for (i = 0; i < online_vcpus; i++) {
3791 		if (!is_vcpu_stopped(vcpu->kvm->vcpus[i]))
3792 			started_vcpus++;
3793 	}
3794 
3795 	if (started_vcpus == 0) {
3796 		/* we're the only active VCPU -> speed it up */
3797 		__enable_ibs_on_vcpu(vcpu);
3798 	} else if (started_vcpus == 1) {
3799 		/*
3800 		 * As we are starting a second VCPU, we have to disable
3801 		 * the IBS facility on all VCPUs to remove potentially
3802 		 * oustanding ENABLE requests.
3803 		 * outstanding ENABLE requests.
3804 		__disable_ibs_on_all_vcpus(vcpu->kvm);
3805 	}
3806 
3807 	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED);
3808 	/*
3809 	 * Another VCPU might have used IBS while we were offline.
3810 	 * Let's play safe and flush the VCPU at startup.
3811 	 */
3812 	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3813 	spin_unlock(&vcpu->kvm->arch.start_stop_lock);
3814 	return;
3815 }
3816 
3817 void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
3818 {
3819 	int i, online_vcpus, started_vcpus = 0;
3820 	struct kvm_vcpu *started_vcpu = NULL;
3821 
3822 	if (is_vcpu_stopped(vcpu))
3823 		return;
3824 
3825 	trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0);
3826 	/* Only one cpu at a time may enter/leave the STOPPED state. */
3827 	spin_lock(&vcpu->kvm->arch.start_stop_lock);
3828 	online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
3829 
3830 	/* SIGP STOP and SIGP STOP AND STORE STATUS have been fully processed */
3831 	kvm_s390_clear_stop_irq(vcpu);
3832 
3833 	kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED);
3834 	__disable_ibs_on_vcpu(vcpu);
3835 
3836 	for (i = 0; i < online_vcpus; i++) {
3837 		if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) {
3838 			started_vcpus++;
3839 			started_vcpu = vcpu->kvm->vcpus[i];
3840 		}
3841 	}
3842 
3843 	if (started_vcpus == 1) {
3844 		/*
3845 		 * As we only have one VCPU left, we want to enable the
3846 		 * IBS facility for that VCPU to speed it up.
3847 		 */
3848 		__enable_ibs_on_vcpu(started_vcpu);
3849 	}
3850 
3851 	spin_unlock(&vcpu->kvm->arch.start_stop_lock);
3852 	return;
3853 }
3854 
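/* Per-vcpu KVM_ENABLE_CAP: only KVM_CAP_S390_CSS_SUPPORT is handled here. */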
3855 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
3856 				     struct kvm_enable_cap *cap)
3857 {
3858 	int r;
3859 
3860 	if (cap->flags)
3861 		return -EINVAL;
3862 
3863 	switch (cap->cap) {
3864 	case KVM_CAP_S390_CSS_SUPPORT:
3865 		if (!vcpu->kvm->arch.css_support) {
3866 			vcpu->kvm->arch.css_support = 1;
3867 			VM_EVENT(vcpu->kvm, 3, "%s", "ENABLE: CSS support");
3868 			trace_kvm_s390_enable_css(vcpu->kvm);
3869 		}
3870 		r = 0;
3871 		break;
3872 	default:
3873 		r = -EINVAL;
3874 		break;
3875 	}
3876 	return r;
3877 }
3878 
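/*
 * Illustrative only: how userspace might drive KVM_S390_MEM_OP on a
 * vcpu fd (field layout as in <linux/kvm.h>; vcpu_fd, buffer and the
 * address values are made up):
 *
 *	struct kvm_s390_mem_op op = {
 *		.gaddr = 0x1000,			// guest logical address
 *		.size  = 256,
 *		.op    = KVM_S390_MEMOP_LOGICAL_READ,
 *		.buf   = (__u64)(unsigned long)buffer,	// userspace buffer
 *		.ar    = 0,				// access register number
 *	};
 *	ioctl(vcpu_fd, KVM_S390_MEM_OP, &op);
 */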
3879 static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
3880 				  struct kvm_s390_mem_op *mop)
3881 {
3882 	void __user *uaddr = (void __user *)mop->buf;
3883 	void *tmpbuf = NULL;
3884 	int r, srcu_idx;
3885 	const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION
3886 				    | KVM_S390_MEMOP_F_CHECK_ONLY;
3887 
3888 	if (mop->flags & ~supported_flags)
3889 		return -EINVAL;
3890 
3891 	if (mop->size > MEM_OP_MAX_SIZE)
3892 		return -E2BIG;
3893 
3894 	if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
3895 		tmpbuf = vmalloc(mop->size);
3896 		if (!tmpbuf)
3897 			return -ENOMEM;
3898 	}
3899 
3900 	srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3901 
3902 	switch (mop->op) {
3903 	case KVM_S390_MEMOP_LOGICAL_READ:
3904 		if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
3905 			r = check_gva_range(vcpu, mop->gaddr, mop->ar,
3906 					    mop->size, GACC_FETCH);
3907 			break;
3908 		}
3909 		r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
3910 		if (r == 0) {
3911 			if (copy_to_user(uaddr, tmpbuf, mop->size))
3912 				r = -EFAULT;
3913 		}
3914 		break;
3915 	case KVM_S390_MEMOP_LOGICAL_WRITE:
3916 		if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
3917 			r = check_gva_range(vcpu, mop->gaddr, mop->ar,
3918 					    mop->size, GACC_STORE);
3919 			break;
3920 		}
3921 		if (copy_from_user(tmpbuf, uaddr, mop->size)) {
3922 			r = -EFAULT;
3923 			break;
3924 		}
3925 		r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
3926 		break;
3927 	default:
3928 		r = -EINVAL;
3929 	}
3930 
3931 	srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
3932 
3933 	if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
3934 		kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
3935 
3936 	vfree(tmpbuf);
3937 	return r;
3938 }
3939 
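/*
 * Interrupt injection ioctls are handled on this "async" path so that
 * userspace does not have to wait for the vcpu mutex held by a running
 * KVM_RUN.
 */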
3940 long kvm_arch_vcpu_async_ioctl(struct file *filp,
3941 			       unsigned int ioctl, unsigned long arg)
3942 {
3943 	struct kvm_vcpu *vcpu = filp->private_data;
3944 	void __user *argp = (void __user *)arg;
3945 
3946 	switch (ioctl) {
3947 	case KVM_S390_IRQ: {
3948 		struct kvm_s390_irq s390irq;
3949 
3950 		if (copy_from_user(&s390irq, argp, sizeof(s390irq)))
3951 			return -EFAULT;
3952 		return kvm_s390_inject_vcpu(vcpu, &s390irq);
3953 	}
3954 	case KVM_S390_INTERRUPT: {
3955 		struct kvm_s390_interrupt s390int;
3956 		struct kvm_s390_irq s390irq;
3957 
3958 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
3959 			return -EFAULT;
3960 		if (s390int_to_s390irq(&s390int, &s390irq))
3961 			return -EINVAL;
3962 		return kvm_s390_inject_vcpu(vcpu, &s390irq);
3963 	}
3964 	}
3965 	return -ENOIOCTLCMD;
3966 }
3967 
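/*
 * Remaining vcpu ioctls; unlike the async path above, these run with
 * the vcpu loaded (vcpu_load/vcpu_put).
 */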
3968 long kvm_arch_vcpu_ioctl(struct file *filp,
3969 			 unsigned int ioctl, unsigned long arg)
3970 {
3971 	struct kvm_vcpu *vcpu = filp->private_data;
3972 	void __user *argp = (void __user *)arg;
3973 	int idx;
3974 	long r;
3975 
3976 	vcpu_load(vcpu);
3977 
3978 	switch (ioctl) {
3979 	case KVM_S390_STORE_STATUS:
3980 		idx = srcu_read_lock(&vcpu->kvm->srcu);
3981 		r = kvm_s390_vcpu_store_status(vcpu, arg);
3982 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
3983 		break;
3984 	case KVM_S390_SET_INITIAL_PSW: {
3985 		psw_t psw;
3986 
3987 		r = -EFAULT;
3988 		if (copy_from_user(&psw, argp, sizeof(psw)))
3989 			break;
3990 		r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
3991 		break;
3992 	}
3993 	case KVM_S390_INITIAL_RESET:
3994 		r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
3995 		break;
3996 	case KVM_SET_ONE_REG:
3997 	case KVM_GET_ONE_REG: {
3998 		struct kvm_one_reg reg;
3999 		r = -EFAULT;
4000 		if (copy_from_user(&reg, argp, sizeof(reg)))
4001 			break;
4002 		if (ioctl == KVM_SET_ONE_REG)
4003 			r = kvm_arch_vcpu_ioctl_set_one_reg(vcpu, &reg);
4004 		else
4005 			r = kvm_arch_vcpu_ioctl_get_one_reg(vcpu, &reg);
4006 		break;
4007 	}
4008 #ifdef CONFIG_KVM_S390_UCONTROL
4009 	case KVM_S390_UCAS_MAP: {
4010 		struct kvm_s390_ucas_mapping ucasmap;
4011 
4012 		if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
4013 			r = -EFAULT;
4014 			break;
4015 		}
4016 
4017 		if (!kvm_is_ucontrol(vcpu->kvm)) {
4018 			r = -EINVAL;
4019 			break;
4020 		}
4021 
4022 		r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr,
4023 				     ucasmap.vcpu_addr, ucasmap.length);
4024 		break;
4025 	}
4026 	case KVM_S390_UCAS_UNMAP: {
4027 		struct kvm_s390_ucas_mapping ucasmap;
4028 
4029 		if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
4030 			r = -EFAULT;
4031 			break;
4032 		}
4033 
4034 		if (!kvm_is_ucontrol(vcpu->kvm)) {
4035 			r = -EINVAL;
4036 			break;
4037 		}
4038 
4039 		r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr,
4040 			ucasmap.length);
4041 		break;
4042 	}
4043 #endif
4044 	case KVM_S390_VCPU_FAULT: {
4045 		r = gmap_fault(vcpu->arch.gmap, arg, 0);
4046 		break;
4047 	}
4048 	case KVM_ENABLE_CAP:
4049 	{
4050 		struct kvm_enable_cap cap;
4051 		r = -EFAULT;
4052 		if (copy_from_user(&cap, argp, sizeof(cap)))
4053 			break;
4054 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
4055 		break;
4056 	}
4057 	case KVM_S390_MEM_OP: {
4058 		struct kvm_s390_mem_op mem_op;
4059 
4060 		if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
4061 			r = kvm_s390_guest_mem_op(vcpu, &mem_op);
4062 		else
4063 			r = -EFAULT;
4064 		break;
4065 	}
4066 	case KVM_S390_SET_IRQ_STATE: {
4067 		struct kvm_s390_irq_state irq_state;
4068 
4069 		r = -EFAULT;
4070 		if (copy_from_user(&irq_state, argp, sizeof(irq_state)))
4071 			break;
4072 		if (irq_state.len > VCPU_IRQS_MAX_BUF ||
4073 		    irq_state.len == 0 ||
4074 		    irq_state.len % sizeof(struct kvm_s390_irq) > 0) {
4075 			r = -EINVAL;
4076 			break;
4077 		}
4078 		/* do not use irq_state.flags, it will break old QEMUs */
4079 		r = kvm_s390_set_irq_state(vcpu,
4080 					   (void __user *) irq_state.buf,
4081 					   irq_state.len);
4082 		break;
4083 	}
4084 	case KVM_S390_GET_IRQ_STATE: {
4085 		struct kvm_s390_irq_state irq_state;
4086 
4087 		r = -EFAULT;
4088 		if (copy_from_user(&irq_state, argp, sizeof(irq_state)))
4089 			break;
4090 		if (irq_state.len == 0) {
4091 			r = -EINVAL;
4092 			break;
4093 		}
4094 		/* do not use irq_state.flags, it will break old QEMUs */
4095 		r = kvm_s390_get_irq_state(vcpu,
4096 					   (__u8 __user *)  irq_state.buf,
4097 					   irq_state.len);
4098 		break;
4099 	}
4100 	default:
4101 		r = -ENOTTY;
4102 	}
4103 
4104 	vcpu_put(vcpu);
4105 	return r;
4106 }
4107 
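/*
 * For user-controlled VMs the SIE control block can be mmap()ed from
 * userspace at KVM_S390_SIE_PAGE_OFFSET; everything else gets SIGBUS.
 */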
4108 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
4109 {
4110 #ifdef CONFIG_KVM_S390_UCONTROL
4111 	if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET)
4112 		 && (kvm_is_ucontrol(vcpu->kvm))) {
4113 		vmf->page = virt_to_page(vcpu->arch.sie_block);
4114 		get_page(vmf->page);
4115 		return 0;
4116 	}
4117 #endif
4118 	return VM_FAULT_SIGBUS;
4119 }
4120 
4121 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
4122 			    unsigned long npages)
4123 {
4124 	return 0;
4125 }
4126 
4127 /* Section: memory related */
4128 int kvm_arch_prepare_memory_region(struct kvm *kvm,
4129 				   struct kvm_memory_slot *memslot,
4130 				   const struct kvm_userspace_memory_region *mem,
4131 				   enum kvm_mr_change change)
4132 {
4133 	/* A few sanity checks. Memory slots have to start and end on a
4134 	   segment boundary (1 MB). The memory in userland may be fragmented
4135 	   into various different vmas, and it is fine to mmap() and munmap()
4136 	   parts of this slot at any time after this call. */
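	/*
	 * Illustrative values: guest_phys_addr = 0x10000000 with
	 * memory_size = 0x40000000 passes these checks, while an address
	 * such as 0x10080000 is not 1 MB aligned and is rejected.
	 */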
4137 
4138 	if (mem->userspace_addr & 0xffffful)
4139 		return -EINVAL;
4140 
4141 	if (mem->memory_size & 0xffffful)
4142 		return -EINVAL;
4143 
4144 	if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit)
4145 		return -EINVAL;
4146 
4147 	return 0;
4148 }
4149 
4150 void kvm_arch_commit_memory_region(struct kvm *kvm,
4151 				const struct kvm_userspace_memory_region *mem,
4152 				const struct kvm_memory_slot *old,
4153 				const struct kvm_memory_slot *new,
4154 				enum kvm_mr_change change)
4155 {
4156 	int rc;
4157 
4158 	/* If the basics of the memslot do not change, we do not want
4159 	 * to update the gmap. Every update causes several unnecessary
4160 	 * segment translation exceptions. This is usually handled just
4161 	 * fine by the normal fault handler + gmap, but it will also
4162 	 * cause faults on the prefix page of running guest CPUs.
4163 	 */
4164 	if (old->userspace_addr == mem->userspace_addr &&
4165 	    old->base_gfn * PAGE_SIZE == mem->guest_phys_addr &&
4166 	    old->npages * PAGE_SIZE == mem->memory_size)
4167 		return;
4168 
4169 	rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr,
4170 		mem->guest_phys_addr, mem->memory_size);
4171 	if (rc)
4172 		pr_warn("failed to commit memory region\n");
4173 	return;
4174 }
4175 
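/*
 * Build a mask for the i-th facility doubleword: the i-th two-bit field
 * of sclp.hmfai (counted from the MSB) determines how many additional
 * 16-bit blocks are masked off the host STFLE word before it is merged
 * into kvm_s390_fac_base below.
 */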
4176 static inline unsigned long nonhyp_mask(int i)
4177 {
4178 	unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30;
4179 
4180 	return 0x0000ffffffffffffUL >> (nonhyp_fai << 4);
4181 }
4182 
4183 void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu)
4184 {
4185 	vcpu->valid_wakeup = false;
4186 }
4187 
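/*
 * Module init: refuse to load without SIE support (sclp.has_sief2),
 * reject the unsupported nested + hpage combination and seed the base
 * facility list from the host STFLE data before registering with KVM.
 */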
4188 static int __init kvm_s390_init(void)
4189 {
4190 	int i;
4191 
4192 	if (!sclp.has_sief2) {
4193 		pr_info("SIE not available\n");
4194 		return -ENODEV;
4195 	}
4196 
4197 	if (nested && hpage) {
4198 		pr_info("nested (vSIE) and hpage (huge page backing) cannot currently be activated concurrently\n");
4199 		return -EINVAL;
4200 	}
4201 
4202 	for (i = 0; i < 16; i++)
4203 		kvm_s390_fac_base[i] |=
4204 			S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);
4205 
4206 	return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
4207 }
4208 
4209 static void __exit kvm_s390_exit(void)
4210 {
4211 	kvm_exit();
4212 }
4213 
4214 module_init(kvm_s390_init);
4215 module_exit(kvm_s390_exit);
4216 
4217 /*
4218  * Enable autoloading of the kvm module.
4219  * Note that we add the module alias here instead of virt/kvm/kvm_main.c
4220  * since x86 takes a different approach.
4221  */
4222 #include <linux/miscdevice.h>
4223 MODULE_ALIAS_MISCDEV(KVM_MINOR);
4224 MODULE_ALIAS("devname:kvm");
4225