// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2019, IBM Corporation.
 */

#define pr_fmt(fmt) "xive-kvm: " fmt

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
#include <asm/debugfs.h>
#include <asm/opal.h>

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include "book3s_xive.h"

static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
{
	u64 val;

	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
		offset |= offset << 4;

	val = in_be64(xd->eoi_mmio + offset);
	return (u8)val;
}
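
/*
 * A short note on ESB loads (a sketch of the hardware interface, not an
 * authoritative description): each source exposes an Event State Buffer
 * page, and a load at a "special" offset performs a management operation
 * on the PQ bits, returning the previous state in the low bits of the
 * value read. This file only uses it to mask a source, e.g. in
 * kvmppc_xive_native_set_source():
 *
 *	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
 */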

static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct xive_q *q = &xc->queues[prio];

	xive_native_disable_queue(xc->vp_id, q, prio);
	if (q->qpage) {
		put_page(virt_to_page(q->qpage));
		q->qpage = NULL;
	}
}

static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
					      u8 prio, __be32 *qpage,
					      u32 order, bool can_escalate)
{
	int rc;
	__be32 *qpage_prev = q->qpage;

	rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
					 can_escalate);
	if (rc)
		return rc;

	if (qpage_prev)
		put_page(virt_to_page(qpage_prev));

	return rc;
}
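
/*
 * Note on the wrapper above: the reference on the previous queue page
 * is dropped only when the (re)configuration succeeded, so that a
 * failed call does not release a page the hardware may still be using.
 * Intended call pattern (illustrative only):
 *
 *	rc = kvmppc_xive_native_configure_queue(vp_id, q, prio,
 *						new_qpage, order, true);
 *	if (rc)
 *		return rc;	// previous page still referenced
 *	// on success, the previous page reference has been dropped
 */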

void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	int i;

	if (!kvmppc_xive_enabled(vcpu))
		return;

	if (!xc)
		return;

	pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);

	/* Ensure no interrupt is still routed to that VP */
	xc->valid = false;
	kvmppc_xive_disable_vcpu_interrupts(vcpu);

	/* Free escalations */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		/* Free the escalation irq */
		if (xc->esc_virq[i]) {
			if (xc->xive->single_escalation)
				xive_cleanup_single_escalation(vcpu, xc,
							xc->esc_virq[i]);
			free_irq(xc->esc_virq[i], vcpu);
			irq_dispose_mapping(xc->esc_virq[i]);
			kfree(xc->esc_virq_names[i]);
			xc->esc_virq[i] = 0;
		}
	}

	/* Disable the VP */
	xive_native_disable_vp(xc->vp_id);

	/* Clear the cam word so guest entry won't try to push context */
	vcpu->arch.xive_cam_word = 0;

	/* Free the queues */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		kvmppc_xive_native_cleanup_queue(vcpu, i);
	}

	/* Free the VP */
	kfree(xc);

	/* Cleanup the vcpu */
	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
	vcpu->arch.xive_vcpu = NULL;
}

int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
				    struct kvm_vcpu *vcpu, u32 server_num)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_vcpu *xc = NULL;
	int rc;
	u32 vp_id;

	pr_devel("native_connect_vcpu(server=%d)\n", server_num);

	if (dev->ops != &kvm_xive_native_ops) {
		pr_devel("Wrong ops !\n");
		return -EPERM;
	}
	if (xive->kvm != vcpu->kvm)
		return -EPERM;
	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
		return -EBUSY;

	mutex_lock(&xive->lock);

	rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
	if (rc)
		goto bail;

	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
	if (!xc) {
		rc = -ENOMEM;
		goto bail;
	}

	vcpu->arch.xive_vcpu = xc;
	xc->xive = xive;
	xc->vcpu = vcpu;
	xc->server_num = server_num;

	xc->vp_id = vp_id;
	xc->valid = true;
	vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;

	rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
	if (rc) {
		pr_err("Failed to get VP info from OPAL: %d\n", rc);
		goto bail;
	}

	/*
	 * Enable the VP first, as single escalation mode affects the
	 * numbering of the escalation interrupts
	 */
	rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
	if (rc) {
		pr_err("Failed to enable VP in OPAL: %d\n", rc);
		goto bail;
	}

	/* Configure VCPU fields for use by assembly push/pull */
	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);

	/* TODO: reset all queues to a clean state ? */
bail:
	mutex_unlock(&xive->lock);
	if (rc)
		kvmppc_xive_native_cleanup_vcpu(vcpu);

	return rc;
}
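
/*
 * For context, the connect path above is driven from userspace. Roughly,
 * the VMM creates the device with KVM_CREATE_DEVICE(KVM_DEV_TYPE_XIVE)
 * and then enables the KVM_CAP_PPC_IRQ_XIVE capability on each vCPU,
 * passing the device fd and the server number. An illustrative sketch of
 * the userspace side (error handling omitted):
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_PPC_IRQ_XIVE,
 *		.args = { xive_dev_fd, server_num },
 *	};
 *	ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
 */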

/*
 * Device passthrough support
 */
static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;

	if (irq >= KVMPPC_XIVE_NR_IRQS)
		return -EINVAL;

	/*
	 * Clear the ESB pages of the IRQ number being mapped (or
	 * unmapped) into the guest and let the VM fault handler
	 * repopulate with the appropriate ESB pages (device or IC)
	 */
	pr_debug("clearing esb pages for girq 0x%lx\n", irq);
	mutex_lock(&xive->mapping_lock);
	if (xive->mapping)
		unmap_mapping_range(xive->mapping,
				    esb_pgoff << PAGE_SHIFT,
				    2ull << PAGE_SHIFT, 1);
	mutex_unlock(&xive->mapping_lock);
	return 0;
}

static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
	.reset_mapped = kvmppc_xive_native_reset_mapped,
};

static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct kvm_device *dev = vma->vm_file->private_data;
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct xive_irq_data *xd;
	u32 hw_num;
	u16 src;
	u64 page;
	unsigned long irq;
	u64 page_offset;

	/*
	 * Linux/KVM uses a two-page ESB setup, one page for trigger
	 * and one for EOI
	 */
	page_offset = vmf->pgoff - vma->vm_pgoff;
	irq = page_offset / 2;

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb) {
		pr_devel("%s: source %lx not found !\n", __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	state = &sb->irq_state[src];
	kvmppc_xive_select_irq(state, &hw_num, &xd);

	arch_spin_lock(&sb->lock);

	/*
	 * first/even page is for trigger
	 * second/odd page is for EOI and management.
	 */
	page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
	arch_spin_unlock(&sb->lock);

	if (WARN_ON(!page)) {
		pr_err("%s: accessing invalid ESB page for source %lx !\n",
		       __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
	return VM_FAULT_NOPAGE;
}
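
/*
 * ESB page layout handled by the fault handler above, for reference:
 * within the ESB mapping, guest interrupt number N owns two pages,
 *
 *	trigger page:	vma page offset 2 * N
 *	EOI/mgmt page:	vma page offset 2 * N + 1
 *
 * which is why kvmppc_xive_native_reset_mapped() computes
 * KVM_XIVE_ESB_PAGE_OFFSET + irq * 2 and unmaps two pages.
 */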

static const struct vm_operations_struct xive_native_esb_vmops = {
	.fault = xive_native_esb_fault,
};

static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	switch (vmf->pgoff - vma->vm_pgoff) {
	case 0: /* HW - forbid access */
	case 1: /* HV - forbid access */
		return VM_FAULT_SIGBUS;
	case 2: /* OS */
		vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
		return VM_FAULT_NOPAGE;
	case 3: /* USER - TODO */
	default:
		return VM_FAULT_SIGBUS;
	}
}

static const struct vm_operations_struct xive_native_tima_vmops = {
	.fault = xive_native_tima_fault,
};

static int kvmppc_xive_native_mmap(struct kvm_device *dev,
				   struct vm_area_struct *vma)
{
	struct kvmppc_xive *xive = dev->private;

	/* We only allow mappings at fixed offsets for now */
	if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
		if (vma_pages(vma) > 4)
			return -EINVAL;
		vma->vm_ops = &xive_native_tima_vmops;
	} else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
		if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
			return -EINVAL;
		vma->vm_ops = &xive_native_esb_vmops;
	} else {
		return -EINVAL;
	}

	vma->vm_flags |= VM_IO | VM_PFNMAP;
	vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);

	/*
	 * Grab the KVM device file address_space to be able to clear
	 * the ESB page mappings when a device is passed through to
	 * the guest.
	 */
	xive->mapping = vma->vm_file->f_mapping;
	return 0;
}
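
/*
 * The mappings above are established by the VMM on the device fd. A
 * rough userspace sketch (illustrative only, error handling omitted),
 * with mmap offsets expressed in bytes:
 *
 *	// TIMA: 4 pages, only the OS ring (page 2) is actually mappable
 *	tima = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, xive_dev_fd,
 *		    KVM_XIVE_TIMA_PAGE_OFFSET * page_size);
 *
 *	// ESBs: two pages per guest interrupt number
 *	esb = mmap(NULL, nr_irqs * 2 * page_size, PROT_READ | PROT_WRITE,
 *		   MAP_SHARED, xive_dev_fd,
 *		   KVM_XIVE_ESB_PAGE_OFFSET * page_size);
 */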

static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
					 u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u64 val;
	u16 idx;
	int rc;

	pr_devel("%s irq=0x%lx\n", __func__, irq);

	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
		return -E2BIG;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb) {
		pr_debug("No source, creating source block...\n");
		sb = kvmppc_xive_create_src_block(xive, irq);
		if (!sb) {
			pr_err("Failed to create block...\n");
			return -ENOMEM;
		}
	}
	state = &sb->irq_state[idx];

	if (get_user(val, ubufp)) {
		pr_err("fault getting user info !\n");
		return -EFAULT;
	}

	arch_spin_lock(&sb->lock);

	/*
	 * If the source doesn't already have an IPI, allocate
	 * one and get the corresponding data
	 */
	if (!state->ipi_number) {
		state->ipi_number = xive_native_alloc_irq();
		if (state->ipi_number == 0) {
			pr_err("Failed to allocate IRQ !\n");
			rc = -ENXIO;
			goto unlock;
		}
		xive_native_populate_irq_data(state->ipi_number,
					      &state->ipi_data);
		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
			 state->ipi_number, irq);
	}

	/* Restore LSI state */
	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
		state->lsi = true;
		if (val & KVM_XIVE_LEVEL_ASSERTED)
			state->asserted = true;
		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
	}

	/* Mask IRQ to start with */
	state->act_server = 0;
	state->act_priority = MASKED;
	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);

	/* Increment the number of valid sources and mark this one valid */
	if (!state->valid)
		xive->src_count++;
	state->valid = true;

	rc = 0;

unlock:
	arch_spin_unlock(&sb->lock);

	return rc;
}
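
/*
 * Sources are created by the VMM with the KVM_DEV_XIVE_GRP_SOURCE
 * attribute group: attr->attr carries the guest interrupt number and
 * attr->addr points to a 64-bit word holding the KVM_XIVE_LEVEL_* bits.
 * A minimal userspace sketch (illustrative only):
 *
 *	__u64 val = is_lsi ? KVM_XIVE_LEVEL_SENSITIVE : 0;
 *	struct kvm_device_attr attr = {
 *		.group = KVM_DEV_XIVE_GRP_SOURCE,
 *		.attr  = girq,
 *		.addr  = (__u64)(uintptr_t)&val,
 *	};
 *	ioctl(xive_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
 */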

static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
					struct kvmppc_xive_src_block *sb,
					struct kvmppc_xive_irq_state *state,
					u32 server, u8 priority, bool masked,
					u32 eisn)
{
	struct kvm *kvm = xive->kvm;
	u32 hw_num;
	int rc = 0;

	arch_spin_lock(&sb->lock);

	if (state->act_server == server && state->act_priority == priority &&
	    state->eisn == eisn)
		goto unlock;

	pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
		 priority, server, masked, state->act_server,
		 state->act_priority);

	kvmppc_xive_select_irq(state, &hw_num, NULL);

	if (priority != MASKED && !masked) {
		rc = kvmppc_xive_select_target(kvm, &server, priority);
		if (rc)
			goto unlock;

		state->act_priority = priority;
		state->act_server = server;
		state->eisn = eisn;

		rc = xive_native_configure_irq(hw_num,
					       kvmppc_xive_vp(xive, server),
					       priority, eisn);
	} else {
		state->act_priority = MASKED;
		state->act_server = 0;
		state->eisn = 0;

		rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
	}

unlock:
	arch_spin_unlock(&sb->lock);
	return rc;
}

static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
						long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u16 src;
	u64 kvm_cfg;
	u32 server;
	u8 priority;
	bool masked;
	u32 eisn;

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[src];

	if (!state->valid)
		return -EINVAL;

	if (get_user(kvm_cfg, ubufp))
		return -EFAULT;

	pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);

	priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
		KVM_XIVE_SOURCE_PRIORITY_SHIFT;
	server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
		KVM_XIVE_SOURCE_SERVER_SHIFT;
	masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
		KVM_XIVE_SOURCE_MASKED_SHIFT;
	eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
		KVM_XIVE_SOURCE_EISN_SHIFT;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("invalid priority for queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}

	return kvmppc_xive_native_update_source_config(xive, sb, state, server,
						       priority, masked, eisn);
}
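
/*
 * The 64-bit configuration word decoded above is built by the VMM with
 * the matching KVM_XIVE_SOURCE_* shift/mask macros. An illustrative
 * encoding on the userspace side might look like:
 *
 *	__u64 kvm_cfg = ((__u64)priority << KVM_XIVE_SOURCE_PRIORITY_SHIFT) |
 *			((__u64)server << KVM_XIVE_SOURCE_SERVER_SHIFT) |
 *			((__u64)masked << KVM_XIVE_SOURCE_MASKED_SHIFT) |
 *			((__u64)eisn << KVM_XIVE_SOURCE_EISN_SHIFT);
 *
 * and is passed through the KVM_DEV_XIVE_GRP_SOURCE_CONFIG attribute
 * group, with attr->attr set to the guest interrupt number.
 */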

static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
					  long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct xive_irq_data *xd;
	u32 hw_num;
	u16 src;
	int rc = 0;

	pr_devel("%s irq=0x%lx\n", __func__, irq);

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[src];

	rc = -EINVAL;

	arch_spin_lock(&sb->lock);

	if (state->valid) {
		kvmppc_xive_select_irq(state, &hw_num, &xd);
		xive_native_sync_source(hw_num);
		rc = 0;
	}

	arch_spin_unlock(&sb->lock);
	return rc;
}

static int xive_native_validate_queue_size(u32 qshift)
{
	/*
	 * We only support 64K pages for the moment. This is also
	 * advertised in the DT property "ibm,xive-eq-sizes"
	 */
	switch (qshift) {
	case 0: /* EQ reset */
	case 16:
		return 0;
	case 12:
	case 21:
	case 24:
	default:
		return -EINVAL;
	}
}

static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
					       long eq_idx, u64 addr)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	void __user *ubufp = (void __user *) addr;
	u32 server;
	u8 priority;
	struct kvm_ppc_xive_eq kvm_eq;
	int rc;
	__be32 *qaddr = NULL;
	struct page *page;
	struct xive_q *q;
	gfn_t gfn;
	unsigned long page_size;
	int srcu_idx;

	/*
	 * Demangle priority/server tuple from the EQ identifier
	 */
	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
		KVM_XIVE_EQ_PRIORITY_SHIFT;
	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
		KVM_XIVE_EQ_SERVER_SHIFT;

	if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
		return -EFAULT;

	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_err("Can't find server %d\n", server);
		return -ENOENT;
	}
	xc = vcpu->arch.xive_vcpu;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("Trying to restore invalid queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}
	q = &xc->queues[priority];

	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
		 __func__, server, priority, kvm_eq.flags,
		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

	/* reset queue and disable queueing */
	if (!kvm_eq.qshift) {
		q->guest_qaddr  = 0;
		q->guest_qshift = 0;

		rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
							NULL, 0, true);
		if (rc) {
			pr_err("Failed to reset queue %d for VCPU %d: %d\n",
			       priority, xc->server_num, rc);
			return rc;
		}

		return 0;
	}

	/*
	 * sPAPR specifies an "Unconditional Notify (n) flag" for the
	 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
	 * without using the coalescing mechanisms provided by the
	 * XIVE END ESBs. This is required on KVM as notification
	 * using the END ESBs is not supported.
	 */
	if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
		pr_err("invalid flags %d\n", kvm_eq.flags);
		return -EINVAL;
	}

	rc = xive_native_validate_queue_size(kvm_eq.qshift);
	if (rc) {
		pr_err("invalid queue size %d\n", kvm_eq.qshift);
		return rc;
	}

	if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
		pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
		       1ull << kvm_eq.qshift);
		return -EINVAL;
	}

	srcu_idx = srcu_read_lock(&kvm->srcu);
	gfn = gpa_to_gfn(kvm_eq.qaddr);

	page_size = kvm_host_page_size(vcpu, gfn);
	if (1ull << kvm_eq.qshift > page_size) {
		srcu_read_unlock(&kvm->srcu, srcu_idx);
		pr_warn("Incompatible host page size %lx!\n", page_size);
		return -EINVAL;
	}

	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page)) {
		srcu_read_unlock(&kvm->srcu, srcu_idx);
		pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
		return -EINVAL;
	}

	qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	/*
	 * Back up the guest address of the queue page so that the EQ
	 * page can be marked dirty for migration.
	 */
	q->guest_qaddr  = kvm_eq.qaddr;
	q->guest_qshift = kvm_eq.qshift;

	/*
	 * Unconditional Notification is forced by default at the
	 * OPAL level because the use of END ESBs is not supported by
	 * Linux.
	 */
	rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
					(__be32 *) qaddr, kvm_eq.qshift, true);
	if (rc) {
		pr_err("Failed to configure queue %d for VCPU %d: %d\n",
		       priority, xc->server_num, rc);
		put_page(page);
		return rc;
	}

	/*
	 * Only restore the queue state (qtoggle, qindex) when needed.
	 * When servicing a guest H_INT_SET_QUEUE_CONFIG hcall, it
	 * should not be restored.
	 */
	if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
		rc = xive_native_set_queue_state(xc->vp_id, priority,
						 kvm_eq.qtoggle,
						 kvm_eq.qindex);
		if (rc)
			goto error;
	}

	rc = kvmppc_xive_attach_escalation(vcpu, priority,
					   xive->single_escalation);
error:
	if (rc)
		kvmppc_xive_native_cleanup_queue(vcpu, priority);
	return rc;
}
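
/*
 * EQ configuration is driven by the KVM_DEV_XIVE_GRP_EQ_CONFIG group.
 * The attribute value packs the target with the KVM_XIVE_EQ_* macros
 * and the payload is a struct kvm_ppc_xive_eq. A rough userspace
 * sketch (illustrative only):
 *
 *	struct kvm_ppc_xive_eq eq = {
 *		.flags  = KVM_XIVE_EQ_ALWAYS_NOTIFY,
 *		.qshift = 16,			// 64K queue page
 *		.qaddr  = guest_queue_addr,	// 64K-aligned guest address
 *	};
 *	struct kvm_device_attr attr = {
 *		.group = KVM_DEV_XIVE_GRP_EQ_CONFIG,
 *		.attr  = ((__u64)server << KVM_XIVE_EQ_SERVER_SHIFT) |
 *			 ((__u64)priority << KVM_XIVE_EQ_PRIORITY_SHIFT),
 *		.addr  = (__u64)(uintptr_t)&eq,
 *	};
 *	ioctl(xive_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
 */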

static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
					       long eq_idx, u64 addr)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	struct xive_q *q;
	void __user *ubufp = (void __user *) addr;
	u32 server;
	u8 priority;
	struct kvm_ppc_xive_eq kvm_eq;
	u64 qaddr;
	u64 qshift;
	u64 qeoi_page;
	u32 escalate_irq;
	u64 qflags;
	int rc;

	/*
	 * Demangle priority/server tuple from the EQ identifier
	 */
	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
		KVM_XIVE_EQ_PRIORITY_SHIFT;
	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
		KVM_XIVE_EQ_SERVER_SHIFT;

	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_err("Can't find server %d\n", server);
		return -ENOENT;
	}
	xc = vcpu->arch.xive_vcpu;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("invalid priority for queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}
	q = &xc->queues[priority];

	memset(&kvm_eq, 0, sizeof(kvm_eq));

	if (!q->qpage)
		return 0;

	rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
					&qeoi_page, &escalate_irq, &qflags);
	if (rc)
		return rc;

	kvm_eq.flags = 0;
	if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
		kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;

	kvm_eq.qshift = q->guest_qshift;
	kvm_eq.qaddr  = q->guest_qaddr;

	rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
					 &kvm_eq.qindex);
	if (rc)
		return rc;

	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
		 __func__, server, priority, kvm_eq.flags,
		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

	if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
		return -EFAULT;

	return 0;
}

static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
{
	int i;

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];

		if (!state->valid)
			continue;

		if (state->act_priority == MASKED)
			continue;

		state->eisn = 0;
		state->act_server = 0;
		state->act_priority = MASKED;
		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
		xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
		if (state->pt_number) {
			xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
			xive_native_configure_irq(state->pt_number,
						  0, MASKED, 0);
		}
	}
}

static int kvmppc_xive_reset(struct kvmppc_xive *xive)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	pr_devel("%s\n", __func__);

	mutex_lock(&xive->lock);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
		unsigned int prio;

		if (!xc)
			continue;

		kvmppc_xive_disable_vcpu_interrupts(vcpu);

		for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {

			/* Single escalation, no queue 7 */
			if (prio == 7 && xive->single_escalation)
				break;

			if (xc->esc_virq[prio]) {
				free_irq(xc->esc_virq[prio], vcpu);
				irq_dispose_mapping(xc->esc_virq[prio]);
				kfree(xc->esc_virq_names[prio]);
				xc->esc_virq[prio] = 0;
			}

			kvmppc_xive_native_cleanup_queue(vcpu, prio);
		}
	}

	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_reset_sources(sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	mutex_unlock(&xive->lock);

	return 0;
}

static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
{
	int j;

	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
		struct xive_irq_data *xd;
		u32 hw_num;

		if (!state->valid)
			continue;

		/*
		 * The struct kvmppc_xive_irq_state reflects the state
		 * of the EAS configuration and not the state of the
		 * source. The source is masked by setting the PQ bits
		 * to '-Q', which is what is being done before calling
		 * the KVM_DEV_XIVE_EQ_SYNC control.
		 *
		 * If a source EAS is configured, OPAL syncs the XIVE
		 * IC of the source and the XIVE IC of the previous
		 * target if any.
		 *
		 * So it should be fine ignoring MASKED sources as
		 * they have been synced already.
		 */
		if (state->act_priority == MASKED)
			continue;

		kvmppc_xive_select_irq(state, &hw_num, &xd);
		xive_native_sync_source(hw_num);
		xive_native_sync_queue(hw_num);
	}
}

static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	unsigned int prio;
	int srcu_idx;

	if (!xc)
		return -ENOENT;

	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
		struct xive_q *q = &xc->queues[prio];

		if (!q->qpage)
			continue;

		/* Mark EQ page dirty for migration */
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
	}
	return 0;
}

static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	pr_devel("%s\n", __func__);

	mutex_lock(&xive->lock);
	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_native_sync_sources(sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvmppc_xive_native_vcpu_eq_sync(vcpu);
	}
	mutex_unlock(&xive->lock);

	return 0;
}
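
/*
 * The EQ sync above backs the KVM_DEV_XIVE_EQ_SYNC control. It is meant
 * to be invoked by the VMM during migration, after the sources have
 * been masked, so that in-flight interrupts are flushed to the
 * in-memory queues and the EQ pages are marked dirty. An illustrative
 * sketch of the control call:
 *
 *	struct kvm_device_attr attr = {
 *		.group = KVM_DEV_XIVE_GRP_CTRL,
 *		.attr  = KVM_DEV_XIVE_EQ_SYNC,
 *	};
 *	ioctl(xive_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
 */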

static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XIVE_RESET:
			return kvmppc_xive_reset(xive);
		case KVM_DEV_XIVE_EQ_SYNC:
			return kvmppc_xive_native_eq_sync(xive);
		case KVM_DEV_XIVE_NR_SERVERS:
			return kvmppc_xive_set_nr_servers(xive, attr->addr);
		}
		break;
	case KVM_DEV_XIVE_GRP_SOURCE:
		return kvmppc_xive_native_set_source(xive, attr->attr,
						     attr->addr);
	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
		return kvmppc_xive_native_set_source_config(xive, attr->attr,
							    attr->addr);
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return kvmppc_xive_native_set_queue_config(xive, attr->attr,
							   attr->addr);
	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
		return kvmppc_xive_native_sync_source(xive, attr->attr,
						      attr->addr);
	}
	return -ENXIO;
}
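
/*
 * Summary of the device attribute interface dispatched above, as seen
 * from userspace through KVM_SET_DEVICE_ATTR / KVM_GET_DEVICE_ATTR on
 * the device fd (a rough map, derived from the handlers in this file):
 *
 *	KVM_DEV_XIVE_GRP_CTRL		global controls (reset, EQ sync,
 *					number of servers)
 *	KVM_DEV_XIVE_GRP_SOURCE		create/restore a source
 *	KVM_DEV_XIVE_GRP_SOURCE_CONFIG	target a source (EAS configuration)
 *	KVM_DEV_XIVE_GRP_EQ_CONFIG	configure/save an event queue
 *	KVM_DEV_XIVE_GRP_SOURCE_SYNC	sync a source with the XIVE IC
 */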

static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return kvmppc_xive_native_get_queue_config(xive, attr->attr,
							   attr->addr);
	}
	return -ENXIO;
}

static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XIVE_RESET:
		case KVM_DEV_XIVE_EQ_SYNC:
		case KVM_DEV_XIVE_NR_SERVERS:
			return 0;
		}
		break;
	case KVM_DEV_XIVE_GRP_SOURCE:
	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
		    attr->attr < KVMPPC_XIVE_NR_IRQS)
			return 0;
		break;
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return 0;
	}
	return -ENXIO;
}

/*
 * Called when device fd is closed.  kvm->lock is held.
 */
static void kvmppc_xive_native_release(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	int i;

	pr_devel("Releasing xive native device\n");

	/*
	 * Clear the KVM device file address_space which is used to
	 * unmap the ESB pages when a device is passed through.
	 */
	mutex_lock(&xive->mapping_lock);
	xive->mapping = NULL;
	mutex_unlock(&xive->mapping_lock);

	/*
	 * Since this is the device release function, we know that
	 * userspace does not have any open fd or mmap referring to
	 * the device.  Therefore there can not be any of the
	 * device attribute set/get, mmap, or page fault functions
	 * being executed concurrently, and similarly, the
	 * connect_vcpu and set/clr_mapped functions cannot be
	 * running either.
	 */

	debugfs_remove(xive->dentry);

	/*
	 * We should clean up the vCPU interrupt presenters first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/*
		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
		 * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
		 * Holding the vcpu->mutex also means that the vcpu cannot
		 * be executing the KVM_RUN ioctl, and therefore it cannot
		 * be executing the XIVE push or pull code or accessing
		 * the XIVE MMIO regions.
		 */
		mutex_lock(&vcpu->mutex);
		kvmppc_xive_native_cleanup_vcpu(vcpu);
		mutex_unlock(&vcpu->mutex);
	}

	/*
	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
	 * against xive code getting called during vcpu execution or
	 * set/get one_reg operations.
	 */
	kvm->arch.xive = NULL;

	for (i = 0; i <= xive->max_sbid; i++) {
		if (xive->src_blocks[i])
			kvmppc_xive_free_sources(xive->src_blocks[i]);
		kfree(xive->src_blocks[i]);
		xive->src_blocks[i] = NULL;
	}

	if (xive->vp_base != XIVE_INVALID_VP)
		xive_native_free_vp_block(xive->vp_base);

	/*
	 * A reference to the kvmppc_xive structure is kept under the
	 * xive_devices struct of the machine for reuse. For now it is
	 * only freed when the VM is destroyed, until all the execution
	 * paths are fixed.
	 */

	kfree(dev);
}

/*
 * Create a XIVE device.  kvm->lock is held.
 */
static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
{
	struct kvmppc_xive *xive;
	struct kvm *kvm = dev->kvm;

	pr_devel("Creating xive native device\n");

	if (kvm->arch.xive)
		return -EEXIST;

	xive = kvmppc_xive_get_device(kvm, type);
	if (!xive)
		return -ENOMEM;

	dev->private = xive;
	xive->dev = dev;
	xive->kvm = kvm;
	mutex_init(&xive->mapping_lock);
	mutex_init(&xive->lock);

	/* VP allocation is delayed to the first call to connect_vcpu */
	xive->vp_base = XIVE_INVALID_VP;
	/*
	 * Defaulting to KVM_MAX_VCPUS servers limits the number of VMs
	 * to roughly 64 per socket on a POWER9 system.
	 */
	xive->nr_servers = KVM_MAX_VCPUS;

	xive->single_escalation = xive_native_has_single_escalation();
	xive->ops = &kvmppc_xive_native_ops;

	kvm->arch.xive = xive;
	return 0;
}

/*
 * Interrupt Pending Buffer (IPB) offset
 */
#define TM_IPB_SHIFT 40
#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)

int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	u64 opal_state;
	int rc;

	if (!kvmppc_xive_enabled(vcpu))
		return -EPERM;

	if (!xc)
		return -ENOENT;

	/* Thread context registers. We only care about IPB and CPPR */
	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;

	/* Get the VP state from OPAL */
	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
	if (rc)
		return rc;

	/*
	 * Capture the backup of the IPB register in the NVT structure
	 * and merge it into our KVM VP state.
	 */
	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);

	pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
		 __func__,
		 vcpu->arch.xive_saved_state.nsr,
		 vcpu->arch.xive_saved_state.cppr,
		 vcpu->arch.xive_saved_state.ipb,
		 vcpu->arch.xive_saved_state.pipr,
		 vcpu->arch.xive_saved_state.w01,
		 (u32) vcpu->arch.xive_cam_word, opal_state);

	return 0;
}

int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;

	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
		 val->xive_timaval[0], val->xive_timaval[1]);

	if (!kvmppc_xive_enabled(vcpu))
		return -EPERM;

	if (!xc || !xive)
		return -ENOENT;

	/* We can't update the state of a "pushed" VCPU */
	if (WARN_ON(vcpu->arch.xive_pushed))
		return -EBUSY;

	/*
	 * Restore the thread context registers. IPB and CPPR should
	 * be the only ones that matter.
	 */
	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];

	/*
	 * There is no need to restore the XIVE internal state (IPB
	 * stored in the NVT) as the IPB register was merged in KVM VP
	 * state when captured.
	 */
	return 0;
}
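
/*
 * The get/set VP helpers above are wired to a vCPU one_reg register
 * (KVM_REG_PPC_VP_STATE) so that the VMM can save and restore the
 * per-vCPU interrupt presenter state across migration. A rough sketch
 * of the userspace side (illustrative only):
 *
 *	__u64 timaval[2];
 *	struct kvm_one_reg reg = {
 *		.id   = KVM_REG_PPC_VP_STATE,
 *		.addr = (__u64)(uintptr_t)timaval,
 *	};
 *	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);	// save
 *	ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);	// restore
 */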

bool kvmppc_xive_native_supported(void)
{
	return xive_native_has_queue_state_support();
}

static int xive_native_debug_show(struct seq_file *m, void *private)
{
	struct kvmppc_xive *xive = m->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	if (!kvm)
		return 0;

	seq_puts(m, "=========\nVCPU state\n=========\n");

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

		if (!xc)
			continue;

		seq_printf(m, "cpu server %#x VP=%#x NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
			   xc->server_num, xc->vp_id,
			   vcpu->arch.xive_saved_state.nsr,
			   vcpu->arch.xive_saved_state.cppr,
			   vcpu->arch.xive_saved_state.ipb,
			   vcpu->arch.xive_saved_state.pipr,
			   vcpu->arch.xive_saved_state.w01,
			   (u32) vcpu->arch.xive_cam_word);

		kvmppc_xive_debug_show_queues(m, vcpu);
	}

	return 0;
}

static int xive_native_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, xive_native_debug_show, inode->i_private);
}

static const struct file_operations xive_native_debug_fops = {
	.open = xive_native_debug_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
	char *name;

	name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
	if (!name) {
		pr_err("%s: no memory for name\n", __func__);
		return;
	}

	xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
					   xive, &xive_native_debug_fops);

	pr_debug("%s: created %s\n", __func__, name);
	kfree(name);
}

static void kvmppc_xive_native_init(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;

	/* Register some debug interfaces */
	xive_native_debugfs_init(xive);
}

struct kvm_device_ops kvm_xive_native_ops = {
	.name = "kvm-xive-native",
	.create = kvmppc_xive_native_create,
	.init = kvmppc_xive_native_init,
	.release = kvmppc_xive_native_release,
	.set_attr = kvmppc_xive_native_set_attr,
	.get_attr = kvmppc_xive_native_get_attr,
	.has_attr = kvmppc_xive_native_has_attr,
	.mmap = kvmppc_xive_native_mmap,
};

void kvmppc_xive_native_init_module(void)
{
	;
}

void kvmppc_xive_native_exit_module(void)
{
	;
}