// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2019, IBM Corporation.
 */

#define pr_fmt(fmt) "xive-kvm: " fmt

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
#include <asm/debugfs.h>
#include <asm/opal.h>

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include "book3s_xive.h"

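/*
 * An ESB MMIO load returns the PQ state of the source and, depending
 * on the offset, may also update it (XIVE_ESB_SET_PQ_01, for
 * instance, masks the source). Only the low byte of the returned
 * doubleword carries the PQ bits, hence the u8 return value below.
 */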
static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
{
	u64 val;

	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
		offset |= offset << 4;

	val = in_be64(xd->eoi_mmio + offset);
	return (u8)val;
}

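/*
 * Tear down one event queue: the EQ is disabled at the OPAL level and
 * the reference taken on the guest queue page (gfn_to_page() in the
 * set_queue_config path) is dropped.
 */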
static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct xive_q *q = &xc->queues[prio];

	xive_native_disable_queue(xc->vp_id, q, prio);
	if (q->qpage) {
		put_page(virt_to_page(q->qpage));
		q->qpage = NULL;
	}
}

void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	int i;

	if (!kvmppc_xive_enabled(vcpu))
		return;

	if (!xc)
		return;

	pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);

	/* Ensure no interrupt is still routed to that VP */
	xc->valid = false;
	kvmppc_xive_disable_vcpu_interrupts(vcpu);

	/* Free escalations */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		/* Free the escalation irq */
		if (xc->esc_virq[i]) {
			if (xc->xive->single_escalation)
				xive_cleanup_single_escalation(vcpu, xc,
							       xc->esc_virq[i]);
			free_irq(xc->esc_virq[i], vcpu);
			irq_dispose_mapping(xc->esc_virq[i]);
			kfree(xc->esc_virq_names[i]);
			xc->esc_virq[i] = 0;
		}
	}

	/* Disable the VP */
	xive_native_disable_vp(xc->vp_id);

	/* Clear the cam word so guest entry won't try to push context */
	vcpu->arch.xive_cam_word = 0;

	/* Free the queues */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++)
		kvmppc_xive_native_cleanup_queue(vcpu, i);

	/* Free the VP */
	kfree(xc);

	/* Cleanup the vcpu */
	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
	vcpu->arch.xive_vcpu = NULL;
}

int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
				    struct kvm_vcpu *vcpu, u32 server_num)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_vcpu *xc = NULL;
	int rc;

	pr_devel("native_connect_vcpu(server=%d)\n", server_num);

	if (dev->ops != &kvm_xive_native_ops) {
		pr_devel("Wrong ops !\n");
		return -EPERM;
	}
	if (xive->kvm != vcpu->kvm)
		return -EPERM;
	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
		return -EBUSY;
	if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
		pr_devel("Out of bounds !\n");
		return -EINVAL;
	}

	mutex_lock(&xive->lock);

	if (kvmppc_xive_find_server(vcpu->kvm, server_num)) {
		pr_devel("Duplicate !\n");
		rc = -EEXIST;
		goto bail;
	}

	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
	if (!xc) {
		rc = -ENOMEM;
		goto bail;
	}

	vcpu->arch.xive_vcpu = xc;
	xc->xive = xive;
	xc->vcpu = vcpu;
	xc->server_num = server_num;

	xc->vp_id = kvmppc_xive_vp(xive, server_num);
	xc->valid = true;
	vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;

	rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
	if (rc) {
		pr_err("Failed to get VP info from OPAL: %d\n", rc);
		goto bail;
	}

	/*
	 * Enable the VP first as the single escalation mode will
	 * affect the escalation interrupt numbering
	 */
	rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
	if (rc) {
		pr_err("Failed to enable VP in OPAL: %d\n", rc);
		goto bail;
	}

	/* Configure VCPU fields for use by assembly push/pull */
	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);

	/* TODO: reset all queues to a clean state ? */
bail:
	mutex_unlock(&xive->lock);
	if (rc)
		kvmppc_xive_native_cleanup_vcpu(vcpu);

	return rc;
}

/*
 * Device passthrough support
 */
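/*
 * Called through the reset_mapped op when a passed-through device
 * interrupt is mapped into or removed from the guest (set_mapped /
 * clr_mapped paths), so that the guest re-faults its ESB mappings and
 * ends up with pages pointing at the right backend (device or IC).
 */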
static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;

	if (irq >= KVMPPC_XIVE_NR_IRQS)
		return -EINVAL;

	/*
	 * Clear the ESB pages of the IRQ number being mapped (or
	 * unmapped) into the guest and let the VM fault handler
	 * repopulate with the appropriate ESB pages (device or IC)
	 */
	pr_debug("clearing esb pages for girq 0x%lx\n", irq);
	mutex_lock(&xive->mapping_lock);
	if (xive->mapping)
		unmap_mapping_range(xive->mapping,
				    esb_pgoff << PAGE_SHIFT,
				    2ull << PAGE_SHIFT, 1);
	mutex_unlock(&xive->mapping_lock);
	return 0;
}

static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
	.reset_mapped = kvmppc_xive_native_reset_mapped,
};

static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct kvm_device *dev = vma->vm_file->private_data;
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct xive_irq_data *xd;
	u32 hw_num;
	u16 src;
	u64 page;
	unsigned long irq;
	u64 page_offset;

	/*
	 * Linux/KVM uses a two-page ESB setting, one for trigger and
	 * one for EOI
	 */
	page_offset = vmf->pgoff - vma->vm_pgoff;
	irq = page_offset / 2;

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb) {
		pr_devel("%s: source %lx not found !\n", __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	state = &sb->irq_state[src];
	kvmppc_xive_select_irq(state, &hw_num, &xd);

	arch_spin_lock(&sb->lock);

	/*
	 * first/even page is for trigger
	 * second/odd page is for EOI and management.
	 */
	page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
	arch_spin_unlock(&sb->lock);

	if (WARN_ON(!page)) {
		pr_err("%s: accessing invalid ESB page for source %lx !\n",
		       __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
	return VM_FAULT_NOPAGE;
}

static const struct vm_operations_struct xive_native_esb_vmops = {
	.fault = xive_native_esb_fault,
};

static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	switch (vmf->pgoff - vma->vm_pgoff) {
	case 0: /* HW - forbid access */
	case 1: /* HV - forbid access */
		return VM_FAULT_SIGBUS;
	case 2: /* OS */
		vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
		return VM_FAULT_NOPAGE;
	case 3: /* USER - TODO */
	default:
		return VM_FAULT_SIGBUS;
	}
}

static const struct vm_operations_struct xive_native_tima_vmops = {
	.fault = xive_native_tima_fault,
};

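/*
 * The two regions below are meant to be mmapped by userspace from the
 * XIVE device fd. A minimal sketch of what a VMM could do, assuming a
 * 64K page size and the KVM_XIVE_*_PAGE_OFFSET values from the uapi
 * header (mmap offsets are expressed in pages, hence the multiply):
 *
 *	tima = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, xive_fd,
 *		    KVM_XIVE_TIMA_PAGE_OFFSET * page_size);
 *	esb  = mmap(NULL, nr_irqs * 2 * page_size, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, xive_fd,
 *		    KVM_XIVE_ESB_PAGE_OFFSET * page_size);
 *
 * The actual sizes are up to the VMM; the checks below only bound the
 * number of pages of each mapping.
 */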
static int kvmppc_xive_native_mmap(struct kvm_device *dev,
				   struct vm_area_struct *vma)
{
	struct kvmppc_xive *xive = dev->private;

	/* We only allow mappings at fixed offset for now */
	if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
		if (vma_pages(vma) > 4)
			return -EINVAL;
		vma->vm_ops = &xive_native_tima_vmops;
	} else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
		if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
			return -EINVAL;
		vma->vm_ops = &xive_native_esb_vmops;
	} else {
		return -EINVAL;
	}

	vma->vm_flags |= VM_IO | VM_PFNMAP;
	vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);

	/*
	 * Grab the KVM device file address_space to be able to clear
	 * the ESB pages mapping when a device is passed-through into
	 * the guest.
	 */
	xive->mapping = vma->vm_file->f_mapping;
	return 0;
}

static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
					 u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u64 val;
	u16 idx;
	int rc;

	pr_devel("%s irq=0x%lx\n", __func__, irq);

	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
		return -E2BIG;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb) {
		pr_debug("No source, creating source block...\n");
		sb = kvmppc_xive_create_src_block(xive, irq);
		if (!sb) {
			pr_err("Failed to create block...\n");
			return -ENOMEM;
		}
	}
	state = &sb->irq_state[idx];

	if (get_user(val, ubufp)) {
		pr_err("fault getting user info !\n");
		return -EFAULT;
	}

	arch_spin_lock(&sb->lock);

	/*
	 * If the source doesn't already have an IPI, allocate
	 * one and get the corresponding data
	 */
	if (!state->ipi_number) {
		state->ipi_number = xive_native_alloc_irq();
		if (state->ipi_number == 0) {
			pr_err("Failed to allocate IRQ !\n");
			rc = -ENXIO;
			goto unlock;
		}
		xive_native_populate_irq_data(state->ipi_number,
					      &state->ipi_data);
		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
			 state->ipi_number, irq);
	}

	/* Restore LSI state */
	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
		state->lsi = true;
		if (val & KVM_XIVE_LEVEL_ASSERTED)
			state->asserted = true;
		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
	}

	/* Mask IRQ to start with */
	state->act_server = 0;
	state->act_priority = MASKED;
	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);

	/* Increment the number of valid sources and mark this one valid */
	if (!state->valid)
		xive->src_count++;
	state->valid = true;

	rc = 0;

unlock:
	arch_spin_unlock(&sb->lock);

	return rc;
}

static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
					struct kvmppc_xive_src_block *sb,
					struct kvmppc_xive_irq_state *state,
					u32 server, u8 priority, bool masked,
					u32 eisn)
{
	struct kvm *kvm = xive->kvm;
	u32 hw_num;
	int rc = 0;

	arch_spin_lock(&sb->lock);

	if (state->act_server == server && state->act_priority == priority &&
	    state->eisn == eisn)
		goto unlock;

	pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
		 priority, server, masked, state->act_server,
		 state->act_priority);

	kvmppc_xive_select_irq(state, &hw_num, NULL);

	if (priority != MASKED && !masked) {
		rc = kvmppc_xive_select_target(kvm, &server, priority);
		if (rc)
			goto unlock;

		state->act_priority = priority;
		state->act_server = server;
		state->eisn = eisn;

		rc = xive_native_configure_irq(hw_num,
					       kvmppc_xive_vp(xive, server),
					       priority, eisn);
	} else {
		state->act_priority = MASKED;
		state->act_server = 0;
		state->eisn = 0;

		rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
	}

unlock:
	arch_spin_unlock(&sb->lock);
	return rc;
}

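/*
 * The 64-bit source configuration value handed over by userspace
 * (H_INT_SET_SOURCE_CONFIG on the guest side) packs the target
 * server, the priority, a masked flag and the Effective IRQ Source
 * Number, which are demangled below using the KVM_XIVE_SOURCE_* masks
 * from the uapi header.
 */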
static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
						long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u16 src;
	u64 kvm_cfg;
	u32 server;
	u8 priority;
	bool masked;
	u32 eisn;

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[src];

	if (!state->valid)
		return -EINVAL;

	if (get_user(kvm_cfg, ubufp))
		return -EFAULT;

	pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);

	priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
		KVM_XIVE_SOURCE_PRIORITY_SHIFT;
	server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
		KVM_XIVE_SOURCE_SERVER_SHIFT;
	masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
		KVM_XIVE_SOURCE_MASKED_SHIFT;
	eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
		KVM_XIVE_SOURCE_EISN_SHIFT;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("invalid priority for queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}

	return kvmppc_xive_native_update_source_config(xive, sb, state, server,
						       priority, masked, eisn);
}

static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
					  long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct xive_irq_data *xd;
	u32 hw_num;
	u16 src;
	int rc = 0;

	pr_devel("%s irq=0x%lx\n", __func__, irq);

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[src];

	rc = -EINVAL;

	arch_spin_lock(&sb->lock);

	if (state->valid) {
		kvmppc_xive_select_irq(state, &hw_num, &xd);
		xive_native_sync_source(hw_num);
		rc = 0;
	}

	arch_spin_unlock(&sb->lock);
	return rc;
}

static int xive_native_validate_queue_size(u32 qshift)
{
	/*
	 * We only support 64K pages for the moment. This is also
	 * advertised in the DT property "ibm,xive-eq-sizes"
	 */
	switch (qshift) {
	case 0: /* EQ reset */
	case 16:
		return 0;
	case 12:
	case 21:
	case 24:
	default:
		return -EINVAL;
	}
}

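/*
 * Restore of a guest EQ, typically on the H_INT_SET_QUEUE_CONFIG path
 * or at migration time. A zero qshift resets the queue; otherwise the
 * guest queue page is pinned with gfn_to_page() and handed to OPAL,
 * and the toggle/index are restored when they differ from their reset
 * values.
 */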
static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
					       long eq_idx, u64 addr)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	void __user *ubufp = (void __user *) addr;
	u32 server;
	u8 priority;
	struct kvm_ppc_xive_eq kvm_eq;
	int rc;
	__be32 *qaddr = NULL;
	struct page *page;
	struct xive_q *q;
	gfn_t gfn;
	unsigned long page_size;
	int srcu_idx;

	/*
	 * Demangle priority/server tuple from the EQ identifier
	 */
	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
		KVM_XIVE_EQ_PRIORITY_SHIFT;
	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
		KVM_XIVE_EQ_SERVER_SHIFT;

	if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
		return -EFAULT;

	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_err("Can't find server %d\n", server);
		return -ENOENT;
	}
	xc = vcpu->arch.xive_vcpu;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("Trying to restore invalid queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}
	q = &xc->queues[priority];

	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
		 __func__, server, priority, kvm_eq.flags,
		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

	/* reset queue and disable queueing */
	if (!kvm_eq.qshift) {
		q->guest_qaddr  = 0;
		q->guest_qshift = 0;

		rc = xive_native_configure_queue(xc->vp_id, q, priority,
						 NULL, 0, true);
		if (rc) {
			pr_err("Failed to reset queue %d for VCPU %d: %d\n",
			       priority, xc->server_num, rc);
			return rc;
		}

		if (q->qpage) {
			put_page(virt_to_page(q->qpage));
			q->qpage = NULL;
		}

		return 0;
	}

	/*
	 * sPAPR specifies an "Unconditional Notify (n) flag" for the
	 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
	 * without using the coalescing mechanisms provided by the
	 * XIVE END ESBs. This is required on KVM as notification
	 * using the END ESBs is not supported.
	 */
	if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
		pr_err("invalid flags %d\n", kvm_eq.flags);
		return -EINVAL;
	}

	rc = xive_native_validate_queue_size(kvm_eq.qshift);
	if (rc) {
		pr_err("invalid queue size %d\n", kvm_eq.qshift);
		return rc;
	}

	if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
		pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
		       1ull << kvm_eq.qshift);
		return -EINVAL;
	}

	srcu_idx = srcu_read_lock(&kvm->srcu);
	gfn = gpa_to_gfn(kvm_eq.qaddr);
	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page)) {
		srcu_read_unlock(&kvm->srcu, srcu_idx);
		pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
		return -EINVAL;
	}

	page_size = kvm_host_page_size(kvm, gfn);
	if (1ull << kvm_eq.qshift > page_size) {
		srcu_read_unlock(&kvm->srcu, srcu_idx);
		pr_warn("Incompatible host page size %lx!\n", page_size);
		return -EINVAL;
	}

	qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	/*
	 * Back up the queue page guest address so that the EQ page
	 * can be marked dirty for migration.
	 */
	q->guest_qaddr  = kvm_eq.qaddr;
	q->guest_qshift = kvm_eq.qshift;

	/*
	 * Unconditional Notification is forced by default at the
	 * OPAL level because the use of END ESBs is not supported by
	 * Linux.
	 */
	rc = xive_native_configure_queue(xc->vp_id, q, priority,
					 (__be32 *) qaddr, kvm_eq.qshift, true);
	if (rc) {
		pr_err("Failed to configure queue %d for VCPU %d: %d\n",
		       priority, xc->server_num, rc);
		put_page(page);
		return rc;
	}

	/*
	 * Only restore the queue state when needed. When doing the
	 * H_INT_SET_QUEUE_CONFIG hcall, it should not.
	 */
	if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
		rc = xive_native_set_queue_state(xc->vp_id, priority,
						 kvm_eq.qtoggle,
						 kvm_eq.qindex);
		if (rc)
			goto error;
	}

	rc = kvmppc_xive_attach_escalation(vcpu, priority,
					   xive->single_escalation);
error:
	if (rc)
		kvmppc_xive_native_cleanup_queue(vcpu, priority);
	return rc;
}

static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
					       long eq_idx, u64 addr)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	struct xive_q *q;
	void __user *ubufp = (void __user *) addr;
	u32 server;
	u8 priority;
	struct kvm_ppc_xive_eq kvm_eq;
	u64 qaddr;
	u64 qshift;
	u64 qeoi_page;
	u32 escalate_irq;
	u64 qflags;
	int rc;

	/*
	 * Demangle priority/server tuple from the EQ identifier
	 */
	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
		KVM_XIVE_EQ_PRIORITY_SHIFT;
	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
		KVM_XIVE_EQ_SERVER_SHIFT;

	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_err("Can't find server %d\n", server);
		return -ENOENT;
	}
	xc = vcpu->arch.xive_vcpu;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("invalid priority for queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}
	q = &xc->queues[priority];

	memset(&kvm_eq, 0, sizeof(kvm_eq));

	if (!q->qpage)
		return 0;

	rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
					&qeoi_page, &escalate_irq, &qflags);
	if (rc)
		return rc;

	kvm_eq.flags = 0;
	if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
		kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;

	kvm_eq.qshift = q->guest_qshift;
	kvm_eq.qaddr  = q->guest_qaddr;

	rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
					 &kvm_eq.qindex);
	if (rc)
		return rc;

	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
		 __func__, server, priority, kvm_eq.flags,
		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

	if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
		return -EFAULT;

	return 0;
}

static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
{
	int i;

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];

		if (!state->valid)
			continue;

		if (state->act_priority == MASKED)
			continue;

		state->eisn = 0;
		state->act_server = 0;
		state->act_priority = MASKED;
		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
		xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
		if (state->pt_number) {
			xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
			xive_native_configure_irq(state->pt_number,
						  0, MASKED, 0);
		}
	}
}

static int kvmppc_xive_reset(struct kvmppc_xive *xive)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	pr_devel("%s\n", __func__);

	mutex_lock(&xive->lock);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
		unsigned int prio;

		if (!xc)
			continue;

		kvmppc_xive_disable_vcpu_interrupts(vcpu);

		for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {

			/* Single escalation, no queue 7 */
			if (prio == 7 && xive->single_escalation)
				break;

			if (xc->esc_virq[prio]) {
				free_irq(xc->esc_virq[prio], vcpu);
				irq_dispose_mapping(xc->esc_virq[prio]);
				kfree(xc->esc_virq_names[prio]);
				xc->esc_virq[prio] = 0;
			}

			kvmppc_xive_native_cleanup_queue(vcpu, prio);
		}
	}

	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_reset_sources(sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	mutex_unlock(&xive->lock);

	return 0;
}

static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
{
	int j;

	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
		struct xive_irq_data *xd;
		u32 hw_num;

		if (!state->valid)
			continue;

		/*
		 * The struct kvmppc_xive_irq_state reflects the state
		 * of the EAS configuration and not the state of the
		 * source. The source is masked by setting the PQ bits
		 * to '-Q', which is what is being done before calling
		 * the KVM_DEV_XIVE_EQ_SYNC control.
		 *
		 * If a source EAS is configured, OPAL syncs the XIVE
		 * IC of the source and the XIVE IC of the previous
		 * target if any.
		 *
		 * So it should be fine ignoring MASKED sources as
		 * they have been synced already.
		 */
		if (state->act_priority == MASKED)
			continue;

		kvmppc_xive_select_irq(state, &hw_num, &xd);
		xive_native_sync_source(hw_num);
		xive_native_sync_queue(hw_num);
	}
}

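/*
 * Mark the EQ pages of a vCPU dirty so that a migration pass started
 * after the KVM_DEV_XIVE_EQ_SYNC control captures the queue contents.
 */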
static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	unsigned int prio;
	int srcu_idx;

	if (!xc)
		return -ENOENT;

	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
		struct xive_q *q = &xc->queues[prio];

		if (!q->qpage)
			continue;

		/* Mark EQ page dirty for migration */
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
	}
	return 0;
}

static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	pr_devel("%s\n", __func__);

	mutex_lock(&xive->lock);
	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_native_sync_sources(sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvmppc_xive_native_vcpu_eq_sync(vcpu);

	mutex_unlock(&xive->lock);

	return 0;
}

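/*
 * Device attribute handlers. As an illustration, and assuming the
 * usual KVM device-attr calling convention, userspace would configure
 * a source roughly as follows (error handling omitted, "girq" being
 * the guest interrupt number):
 *
 *	u64 val = KVM_XIVE_LEVEL_SENSITIVE;
 *	struct kvm_device_attr attr = {
 *		.group = KVM_DEV_XIVE_GRP_SOURCE,
 *		.attr  = girq,
 *		.addr  = (u64)&val,
 *	};
 *	ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
 */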
static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XIVE_RESET:
			return kvmppc_xive_reset(xive);
		case KVM_DEV_XIVE_EQ_SYNC:
			return kvmppc_xive_native_eq_sync(xive);
		}
		break;
	case KVM_DEV_XIVE_GRP_SOURCE:
		return kvmppc_xive_native_set_source(xive, attr->attr,
						     attr->addr);
	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
		return kvmppc_xive_native_set_source_config(xive, attr->attr,
							    attr->addr);
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return kvmppc_xive_native_set_queue_config(xive, attr->attr,
							   attr->addr);
	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
		return kvmppc_xive_native_sync_source(xive, attr->attr,
						      attr->addr);
	}
	return -ENXIO;
}

static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return kvmppc_xive_native_get_queue_config(xive, attr->attr,
							   attr->addr);
	}
	return -ENXIO;
}

static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XIVE_RESET:
		case KVM_DEV_XIVE_EQ_SYNC:
			return 0;
		}
		break;
	case KVM_DEV_XIVE_GRP_SOURCE:
	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
		    attr->attr < KVMPPC_XIVE_NR_IRQS)
			return 0;
		break;
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return 0;
	}
	return -ENXIO;
}

/*
 * Called when device fd is closed.  kvm->lock is held.
 */
static void kvmppc_xive_native_release(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	int i;

	pr_devel("Releasing xive native device\n");

	/*
	 * Clear the KVM device file address_space which is used to
	 * unmap the ESB pages when a device is passed-through.
	 */
	mutex_lock(&xive->mapping_lock);
	xive->mapping = NULL;
	mutex_unlock(&xive->mapping_lock);

	/*
	 * Since this is the device release function, we know that
	 * userspace does not have any open fd or mmap referring to
	 * the device.  Therefore there cannot be any of the
	 * device attribute set/get, mmap, or page fault functions
	 * being executed concurrently, and similarly, the
	 * connect_vcpu and set/clr_mapped functions cannot be
	 * running either.
	 */

	debugfs_remove(xive->dentry);

	/*
	 * We should clean up the vCPU interrupt presenters first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/*
		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
		 * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
		 * Holding the vcpu->mutex also means that the vcpu cannot
		 * be executing the KVM_RUN ioctl, and therefore it cannot
		 * be executing the XIVE push or pull code or accessing
		 * the XIVE MMIO regions.
		 */
		mutex_lock(&vcpu->mutex);
		kvmppc_xive_native_cleanup_vcpu(vcpu);
		mutex_unlock(&vcpu->mutex);
	}

	/*
	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
	 * against xive code getting called during vcpu execution or
	 * set/get one_reg operations.
	 */
	kvm->arch.xive = NULL;

	for (i = 0; i <= xive->max_sbid; i++) {
		if (xive->src_blocks[i])
			kvmppc_xive_free_sources(xive->src_blocks[i]);
		kfree(xive->src_blocks[i]);
		xive->src_blocks[i] = NULL;
	}

	if (xive->vp_base != XIVE_INVALID_VP)
		xive_native_free_vp_block(xive->vp_base);

	/*
	 * A reference to the kvmppc_xive pointer is now kept under
	 * the xive_devices struct of the machine for reuse. For now,
	 * it is only freed when the VM is destroyed, until all the
	 * execution paths are fixed.
	 */

	kfree(dev);
}

/*
 * Create a XIVE device.  kvm->lock is held.
 */
static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
{
	struct kvmppc_xive *xive;
	struct kvm *kvm = dev->kvm;
	int ret = 0;

	pr_devel("Creating xive native device\n");

	if (kvm->arch.xive)
		return -EEXIST;

	xive = kvmppc_xive_get_device(kvm, type);
	if (!xive)
		return -ENOMEM;

	dev->private = xive;
	xive->dev = dev;
	xive->kvm = kvm;
	kvm->arch.xive = xive;
	mutex_init(&xive->mapping_lock);
	mutex_init(&xive->lock);

	/*
	 * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for
	 * a default. Getting the max number of CPUs the VM was
	 * configured with would improve our usage of the XIVE VP space.
	 */
	xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
	pr_devel("VP_Base=%x\n", xive->vp_base);

	if (xive->vp_base == XIVE_INVALID_VP)
		ret = -ENXIO;

	xive->single_escalation = xive_native_has_single_escalation();
	xive->ops = &kvmppc_xive_native_ops;

	if (ret)
		return ret;

	return 0;
}

/*
 * Interrupt Pending Buffer (IPB) offset
 */
#define TM_IPB_SHIFT 40
#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)

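/*
 * The one_reg VP state exchanged with userspace is two 64-bit words:
 * word 0 mirrors the saved OS thread context (NSR, CPPR, IPB, PIPR,
 * ...) and is the only one currently used; the IPB backup fetched
 * from OPAL is folded into it below.
 */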
int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	u64 opal_state;
	int rc;

	if (!kvmppc_xive_enabled(vcpu))
		return -EPERM;

	if (!xc)
		return -ENOENT;

	/* Thread context registers. We only care about IPB and CPPR */
	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;

	/* Get the VP state from OPAL */
	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
	if (rc)
		return rc;

	/*
	 * Capture the backup of the IPB register in the NVT structure
	 * and merge it in our KVM VP state.
	 */
	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);

	pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
		 __func__,
		 vcpu->arch.xive_saved_state.nsr,
		 vcpu->arch.xive_saved_state.cppr,
		 vcpu->arch.xive_saved_state.ipb,
		 vcpu->arch.xive_saved_state.pipr,
		 vcpu->arch.xive_saved_state.w01,
		 (u32) vcpu->arch.xive_cam_word, opal_state);

	return 0;
}

int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;

	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
		 val->xive_timaval[0], val->xive_timaval[1]);

	if (!kvmppc_xive_enabled(vcpu))
		return -EPERM;

	if (!xc || !xive)
		return -ENOENT;

	/* We can't update the state of a "pushed" VCPU */
	if (WARN_ON(vcpu->arch.xive_pushed))
		return -EBUSY;

	/*
	 * Restore the thread context registers. IPB and CPPR should
	 * be the only ones that matter.
	 */
	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];

	/*
	 * There is no need to restore the XIVE internal state (IPB
	 * stored in the NVT) as the IPB register was merged in KVM VP
	 * state when captured.
	 */
	return 0;
}

bool kvmppc_xive_native_supported(void)
{
	return xive_native_has_queue_state_support();
}

static int xive_native_debug_show(struct seq_file *m, void *private)
{
	struct kvmppc_xive *xive = m->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	if (!kvm)
		return 0;

	seq_puts(m, "=========\nVCPU state\n=========\n");

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

		if (!xc)
			continue;

		seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
			   xc->server_num,
			   vcpu->arch.xive_saved_state.nsr,
			   vcpu->arch.xive_saved_state.cppr,
			   vcpu->arch.xive_saved_state.ipb,
			   vcpu->arch.xive_saved_state.pipr,
			   vcpu->arch.xive_saved_state.w01,
			   (u32) vcpu->arch.xive_cam_word);

		kvmppc_xive_debug_show_queues(m, vcpu);
	}

	return 0;
}

static int xive_native_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, xive_native_debug_show, inode->i_private);
}

static const struct file_operations xive_native_debug_fops = {
	.open = xive_native_debug_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
	char *name;

	name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
	if (!name) {
		pr_err("%s: no memory for name\n", __func__);
		return;
	}

	xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
					   xive, &xive_native_debug_fops);

	pr_debug("%s: created %s\n", __func__, name);
	kfree(name);
}

static void kvmppc_xive_native_init(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;

	/* Register some debug interfaces */
	xive_native_debugfs_init(xive);
}

struct kvm_device_ops kvm_xive_native_ops = {
	.name = "kvm-xive-native",
	.create = kvmppc_xive_native_create,
	.init = kvmppc_xive_native_init,
	.release = kvmppc_xive_native_release,
	.set_attr = kvmppc_xive_native_set_attr,
	.get_attr = kvmppc_xive_native_get_attr,
	.has_attr = kvmppc_xive_native_has_attr,
	.mmap = kvmppc_xive_native_mmap,
};

void kvmppc_xive_native_init_module(void)
{
	;
}

void kvmppc_xive_native_exit_module(void)
{
	;
}