xref: /openbmc/linux/arch/x86/kvm/svm/avic.c (revision ecefa105)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * AMD SVM support
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *   Avi Kivity   <avi@qumranet.com>
13  */
14 
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16 
17 #include <linux/kvm_types.h>
18 #include <linux/hashtable.h>
19 #include <linux/amd-iommu.h>
20 #include <linux/kvm_host.h>
21 
22 #include <asm/irq_remapping.h>
23 
24 #include "trace.h"
25 #include "lapic.h"
26 #include "x86.h"
27 #include "irq.h"
28 #include "svm.h"
29 
30 /* AVIC GATAG is encoded using VM and VCPU IDs */
31 #define AVIC_VCPU_ID_BITS		8
32 #define AVIC_VCPU_ID_MASK		((1 << AVIC_VCPU_ID_BITS) - 1)
33 
34 #define AVIC_VM_ID_BITS			24
35 #define AVIC_VM_ID_NR			(1 << AVIC_VM_ID_BITS)
36 #define AVIC_VM_ID_MASK			((1 << AVIC_VM_ID_BITS) - 1)
37 
38 #define AVIC_GATAG(x, y)		((((x) & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
39 						((y) & AVIC_VCPU_ID_MASK))
40 #define AVIC_GATAG_TO_VMID(x)		(((x) >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
41 #define AVIC_GATAG_TO_VCPUID(x)		((x) & AVIC_VCPU_ID_MASK)
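/*
 * e.g. AVIC_GATAG(0x123456, 0x78) encodes to 0x12345678, from which
 * AVIC_GATAG_TO_VMID() and AVIC_GATAG_TO_VCPUID() recover 0x123456
 * and 0x78 respectively.
 */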
42 
43 static bool force_avic;
44 module_param_unsafe(force_avic, bool, 0444);
45 
46 /* Note:
47  * This hash table is used to map VM_ID to a struct kvm_svm,
48  * when handling AMD IOMMU GALOG notification to schedule in
49  * a particular vCPU.
50  */
51 #define SVM_VM_DATA_HASH_BITS	8
52 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
53 static u32 next_vm_id;
54 static bool next_vm_id_wrapped;
55 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
56 bool x2avic_enabled;
57 
58 /*
59  * This is a wrapper of struct amd_ir_data.
60  */
61 struct amd_svm_iommu_ir {
62 	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
63 	void *data;		/* Storing pointer to struct amd_ir_data */
64 };
65 
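/*
 * Configure vmcb01 for AVIC: set AVIC_ENABLE and, when x2AVIC is supported
 * and the guest APIC is in x2APIC mode, switch to x2AVIC and stop
 * intercepting the x2APIC MSRs; otherwise program the xAVIC limit and keep
 * the x2APIC MSR intercepts in place.
 */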
66 static void avic_activate_vmcb(struct vcpu_svm *svm)
67 {
68 	struct vmcb *vmcb = svm->vmcb01.ptr;
69 
70 	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
71 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
72 
73 	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
74 
75 	/*
76 	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
77 	 * accesses, while interrupt injection to a running vCPU can be
78 	 * achieved using AVIC doorbell.  KVM disables the APIC access page
79 	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
80 	 * AVIC in hybrid mode activates only the doorbell mechanism.
81 	 */
82 	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
83 		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
84 		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
85 		/* Disabling MSR intercept for x2APIC registers */
86 		svm_set_x2apic_msr_interception(svm, false);
87 	} else {
88 		/*
89 		 * Flush the TLB, the guest may have inserted a non-APIC
90 		 * mapping into the TLB while AVIC was disabled.
91 		 */
92 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
93 
94 		/* For xAVIC and hybrid-xAVIC modes */
95 		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
96 		/* Enabling MSR intercept for x2APIC registers */
97 		svm_set_x2apic_msr_interception(svm, true);
98 	}
99 }
100 
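/*
 * Disable AVIC/x2AVIC in vmcb01 and re-enable interception of the x2APIC
 * MSRs, unless L2 is active and L1 provides its own MSR bitmap, in which
 * case L0's bitmap does not need updating.
 */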
101 static void avic_deactivate_vmcb(struct vcpu_svm *svm)
102 {
103 	struct vmcb *vmcb = svm->vmcb01.ptr;
104 
105 	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
106 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
107 
108 	/*
109 	 * If running nested and the guest uses its own MSR bitmap, there
110 	 * is no need to update L0's msr bitmap
111 	 */
112 	if (is_guest_mode(&svm->vcpu) &&
113 	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
114 		return;
115 
116 	/* Enabling MSR intercept for x2APIC registers */
117 	svm_set_x2apic_msr_interception(svm, true);
118 }
119 
120 /* Note:
121  * This function is called from the IOMMU driver to notify
122  * SVM to schedule in a particular vCPU of a particular VM.
123  */
124 int avic_ga_log_notifier(u32 ga_tag)
125 {
126 	unsigned long flags;
127 	struct kvm_svm *kvm_svm;
128 	struct kvm_vcpu *vcpu = NULL;
129 	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
130 	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
131 
132 	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
133 	trace_kvm_avic_ga_log(vm_id, vcpu_id);
134 
135 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
136 	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
137 		if (kvm_svm->avic_vm_id != vm_id)
138 			continue;
139 		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
140 		break;
141 	}
142 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
143 
144 	/* Note:
145 	 * At this point, the IOMMU should have already set the pending
146 	 * bit in the vAPIC backing page. So, we just need to schedule
147 	 * in the vcpu.
148 	 */
149 	if (vcpu)
150 		kvm_vcpu_wake_up(vcpu);
151 
152 	return 0;
153 }
154 
155 void avic_vm_destroy(struct kvm *kvm)
156 {
157 	unsigned long flags;
158 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
159 
160 	if (!enable_apicv)
161 		return;
162 
163 	if (kvm_svm->avic_logical_id_table_page)
164 		__free_page(kvm_svm->avic_logical_id_table_page);
165 	if (kvm_svm->avic_physical_id_table_page)
166 		__free_page(kvm_svm->avic_physical_id_table_page);
167 
168 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
169 	hash_del(&kvm_svm->hnode);
170 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
171 }
172 
173 int avic_vm_init(struct kvm *kvm)
174 {
175 	unsigned long flags;
176 	int err = -ENOMEM;
177 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
178 	struct kvm_svm *k2;
179 	struct page *p_page;
180 	struct page *l_page;
181 	u32 vm_id;
182 
183 	if (!enable_apicv)
184 		return 0;
185 
186 	/* Allocating physical APIC ID table (4KB) */
187 	p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
188 	if (!p_page)
189 		goto free_avic;
190 
191 	kvm_svm->avic_physical_id_table_page = p_page;
192 
193 	/* Allocating logical APIC ID table (4KB) */
194 	l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
195 	if (!l_page)
196 		goto free_avic;
197 
198 	kvm_svm->avic_logical_id_table_page = l_page;
199 
200 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
201  again:
202 	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
203 	if (vm_id == 0) { /* id is 1-based, zero is not okay */
204 		next_vm_id_wrapped = 1;
205 		goto again;
206 	}
207 	/* Is it still in use? Only possible if wrapped at least once */
208 	if (next_vm_id_wrapped) {
209 		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
210 			if (k2->avic_vm_id == vm_id)
211 				goto again;
212 		}
213 	}
214 	kvm_svm->avic_vm_id = vm_id;
215 	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
216 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
217 
218 	return 0;
219 
220 free_avic:
221 	avic_vm_destroy(kvm);
222 	return err;
223 }
224 
225 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
226 {
227 	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
228 	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
229 	phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
230 	phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
231 
232 	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
233 	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
234 	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
235 	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
236 
237 	if (kvm_apicv_activated(svm->vcpu.kvm))
238 		avic_activate_vmcb(svm);
239 	else
240 		avic_deactivate_vmcb(svm);
241 }
242 
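/*
 * Return a pointer to the @index'th entry of the per-VM physical APIC ID
 * table, or NULL if the index exceeds the xAVIC/x2AVIC limit.
 */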
243 static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
244 				       unsigned int index)
245 {
246 	u64 *avic_physical_id_table;
247 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
248 
249 	if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
250 	    (index > X2AVIC_MAX_PHYSICAL_ID))
251 		return NULL;
252 
253 	avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
254 
255 	return &avic_physical_id_table[index];
256 }
257 
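/*
 * Use the vCPU's APIC register page as the AVIC backing page and publish
 * its physical address (with the SME mask applied) in the physical APIC ID
 * table entry matching the vCPU ID, marking that entry valid.
 */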
258 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
259 {
260 	u64 *entry, new_entry;
261 	int id = vcpu->vcpu_id;
262 	struct vcpu_svm *svm = to_svm(vcpu);
263 
264 	if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
265 	    (id > X2AVIC_MAX_PHYSICAL_ID))
266 		return -EINVAL;
267 
268 	if (!vcpu->arch.apic->regs)
269 		return -EINVAL;
270 
271 	if (kvm_apicv_activated(vcpu->kvm)) {
272 		int ret;
273 
274 		/*
275 		 * Note, AVIC hardware walks the nested page table to check
276 		 * permissions, but does not use the SPA address specified in
277 		 * the leaf SPTE since it uses the address in the AVIC_BACKING_PAGE
278 		 * pointer field of the VMCB.
279 		 */
280 		ret = kvm_alloc_apic_access_page(vcpu->kvm);
281 		if (ret)
282 			return ret;
283 	}
284 
285 	svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
286 
287 	/* Set the AVIC backing page address in the physical APIC ID table */
288 	entry = avic_get_physical_id_entry(vcpu, id);
289 	if (!entry)
290 		return -EINVAL;
291 
292 	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
293 			      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
294 			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
295 	WRITE_ONCE(*entry, new_entry);
296 
297 	svm->avic_physical_id_cache = entry;
298 
299 	return 0;
300 }
301 
302 void avic_ring_doorbell(struct kvm_vcpu *vcpu)
303 {
304 	/*
305 	 * Note, the vCPU could get migrated to a different pCPU at any point,
306 	 * which could result in signalling the wrong/previous pCPU.  But if
307 	 * that happens the vCPU is guaranteed to do a VMRUN (after being
308 	 * migrated) and thus will process pending interrupts, i.e. a doorbell
309 	 * is not needed (and the spurious one is harmless).
310 	 */
311 	int cpu = READ_ONCE(vcpu->cpu);
312 
313 	if (cpu != get_cpu()) {
314 		wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
315 		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
316 	}
317 	put_cpu();
318 }
319 
320 
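/*
 * Note the pending interrupt in the vAPIC state and hand off to
 * svm_complete_interrupt_delivery() to finish delivery (doorbell or wakeup).
 */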
321 static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
322 {
323 	vcpu->arch.apic->irr_pending = true;
324 	svm_complete_interrupt_delivery(vcpu,
325 					icrl & APIC_MODE_MASK,
326 					icrl & APIC_INT_LEVELTRIG,
327 					icrl & APIC_VECTOR_MASK);
328 }
329 
330 static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
331 					  u32 icrl)
332 {
333 	/*
334 	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
335 	 * i.e. APIC ID == vCPU ID.
336 	 */
337 	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
338 
339 	/* Once again, nothing to do if the target vCPU doesn't exist. */
340 	if (unlikely(!target_vcpu))
341 		return;
342 
343 	avic_kick_vcpu(target_vcpu, icrl);
344 }
345 
346 static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
347 					 u32 logid_index, u32 icrl)
348 {
349 	u32 physical_id;
350 
351 	if (avic_logical_id_table) {
352 		u32 logid_entry = avic_logical_id_table[logid_index];
353 
354 		/* Nothing to do if the logical destination is invalid. */
355 		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
356 			return;
357 
358 		physical_id = logid_entry &
359 			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
360 	} else {
361 		/*
362 		 * For x2APIC, the logical APIC ID is a read-only value that is
363 		 * derived from the x2APIC ID, thus the x2APIC ID can be found
364 		 * by reversing the calculation (stored in logid_index).  Note,
365 		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
366 		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
367 		 */
368 		physical_id = logid_index;
369 	}
370 
371 	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
372 }
373 
374 /*
375  * A fast-path version of avic_kick_target_vcpus(), which attempts to match
376  * destination APIC ID to vCPU without looping through all vCPUs.
377  */
378 static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
379 				       u32 icrl, u32 icrh, u32 index)
380 {
381 	int dest_mode = icrl & APIC_DEST_MASK;
382 	int shorthand = icrl & APIC_SHORT_MASK;
383 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
384 	u32 dest;
385 
386 	if (shorthand != APIC_DEST_NOSHORT)
387 		return -EINVAL;
388 
389 	if (apic_x2apic_mode(source))
390 		dest = icrh;
391 	else
392 		dest = GET_XAPIC_DEST_FIELD(icrh);
393 
394 	if (dest_mode == APIC_DEST_PHYSICAL) {
395 		/* broadcast destination, use slow path */
396 		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
397 			return -EINVAL;
398 		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
399 			return -EINVAL;
400 
401 		if (WARN_ON_ONCE(dest != index))
402 			return -EINVAL;
403 
404 		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
405 	} else {
406 		u32 *avic_logical_id_table;
407 		unsigned long bitmap, i;
408 		u32 cluster;
409 
410 		if (apic_x2apic_mode(source)) {
411 			/* 16 bit dest mask, 16 bit cluster id */
412 			bitmap = dest & 0xFFFF;
413 			cluster = (dest >> 16) << 4;
414 		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
415 			/* 8 bit dest mask */
416 			bitmap = dest;
417 			cluster = 0;
418 		} else {
419 			/* 4 bit dest mask, 4 bit cluster id */
420 			bitmap = dest & 0xF;
421 			cluster = (dest >> 4) << 2;
422 		}
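
		/*
		 * e.g. an xAPIC cluster-mode dest of 0x53 yields cluster 0x14
		 * (the base index into the logical ID table) and bitmap 0x3,
		 * i.e. logical CPUs 0 and 1 within cluster 5.
		 */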
423 
424 		/* Nothing to do if there are no destinations in the cluster. */
425 		if (unlikely(!bitmap))
426 			return 0;
427 
428 		if (apic_x2apic_mode(source))
429 			avic_logical_id_table = NULL;
430 		else
431 			avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
432 
433 		/*
434 		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
435 		 * IDs, thus each bit in the destination is guaranteed to map
436 		 * to at most one vCPU.
437 		 */
438 		for_each_set_bit(i, &bitmap, 16)
439 			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
440 						     cluster + i, icrl);
441 	}
442 
443 	return 0;
444 }
445 
446 static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
447 				   u32 icrl, u32 icrh, u32 index)
448 {
449 	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
450 	unsigned long i;
451 	struct kvm_vcpu *vcpu;
452 
453 	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
454 		return;
455 
456 	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
457 
458 	/*
459 	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
460 	 * event.  There's no need to signal doorbells, as hardware has handled
461 	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
462 	 * since entered the guest will have processed pending IRQs at VMRUN.
463 	 */
464 	kvm_for_each_vcpu(i, vcpu, kvm) {
465 		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
466 					dest, icrl & APIC_DEST_MASK))
467 			avic_kick_vcpu(vcpu, icrl);
468 	}
469 }
470 
471 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
472 {
473 	struct vcpu_svm *svm = to_svm(vcpu);
474 	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
475 	u32 icrl = svm->vmcb->control.exit_info_1;
476 	u32 id = svm->vmcb->control.exit_info_2 >> 32;
477 	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
478 	struct kvm_lapic *apic = vcpu->arch.apic;
479 
480 	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
481 
482 	switch (id) {
483 	case AVIC_IPI_FAILURE_INVALID_TARGET:
484 	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
485 		/*
486 		 * Emulate IPIs that are not handled by AVIC hardware, which
487 		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
488 		 * if _any_ targets are invalid, e.g. if the logical mode mask
489 		 * is a superset of running vCPUs.
490 		 *
491 		 * The exit is a trap, i.e. ICR holds the correct value and RIP
492 		 * has been advanced; KVM is responsible only for emulating the
493 		 * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
494 		 * in which case KVM needs to emulate the ICR write as well in
495 		 * order to clear the BUSY flag.
496 		 */
497 		if (icrl & APIC_ICR_BUSY)
498 			kvm_apic_write_nodecode(vcpu, APIC_ICR);
499 		else
500 			kvm_apic_send_ipi(apic, icrl, icrh);
501 		break;
502 	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
503 		/*
504 		 * At this point, we expect that the AVIC HW has already
505 		 * set the appropriate IRR bits on the valid target
506 		 * vcpus. So, we just need to kick the appropriate vcpu.
507 		 */
508 		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
509 		break;
510 	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
511 		WARN_ONCE(1, "Invalid backing page\n");
512 		break;
513 	default:
514 		pr_err("Unknown IPI interception\n");
515 	}
516 
517 	return 1;
518 }
519 
520 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
521 {
522 	if (is_guest_mode(vcpu))
523 		return APICV_INHIBIT_REASON_NESTED;
524 	return 0;
525 }
526 
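/*
 * Return a pointer to the logical APIC ID table entry for @ldr, or NULL if
 * the logical ID is zero, has multiple bits set, or names an invalid
 * cluster.  Flat mode uses the bit position directly (e.g. logical ID 0x08
 * maps to index 3); cluster mode uses (cluster << 2) + bit position.
 */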
527 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
528 {
529 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
530 	u32 *logical_apic_id_table;
531 	u32 cluster, index;
532 
533 	ldr = GET_APIC_LOGICAL_ID(ldr);
534 
535 	if (flat) {
536 		cluster = 0;
537 	} else {
538 		cluster = (ldr >> 4);
539 		if (cluster >= 0xf)
540 			return NULL;
541 		ldr &= 0xf;
542 	}
543 	if (!ldr || !is_power_of_2(ldr))
544 		return NULL;
545 
546 	index = __ffs(ldr);
547 	if (WARN_ON_ONCE(index > 7))
548 		return NULL;
549 	index += (cluster << 2);
550 
551 	logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
552 
553 	return &logical_apic_id_table[index];
554 }
555 
556 static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
557 {
558 	bool flat;
559 	u32 *entry, new_entry;
560 
561 	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
562 	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
563 	if (!entry)
564 		return;
565 
566 	new_entry = READ_ONCE(*entry);
567 	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
568 	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
569 	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
570 	WRITE_ONCE(*entry, new_entry);
571 }
572 
573 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
574 {
575 	struct vcpu_svm *svm = to_svm(vcpu);
576 	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
577 	u32 *entry;
578 
579 	/* Note: x2AVIC does not use logical APIC ID table */
580 	if (apic_x2apic_mode(vcpu->arch.apic))
581 		return;
582 
583 	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
584 	if (entry)
585 		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
586 }
587 
588 static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
589 {
590 	struct vcpu_svm *svm = to_svm(vcpu);
591 	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
592 	u32 id = kvm_xapic_id(vcpu->arch.apic);
593 
594 	/* AVIC does not support LDR update for x2APIC */
595 	if (apic_x2apic_mode(vcpu->arch.apic))
596 		return;
597 
598 	if (ldr == svm->ldr_reg)
599 		return;
600 
601 	avic_invalidate_logical_id_entry(vcpu);
602 
603 	svm->ldr_reg = ldr;
604 	avic_ldr_write(vcpu, id, ldr);
605 }
606 
607 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
608 {
609 	struct vcpu_svm *svm = to_svm(vcpu);
610 	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
611 
612 	if (svm->dfr_reg == dfr)
613 		return;
614 
615 	avic_invalidate_logical_id_entry(vcpu);
616 	svm->dfr_reg = dfr;
617 }
618 
619 static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
620 {
621 	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
622 				AVIC_UNACCEL_ACCESS_OFFSET_MASK;
623 
624 	switch (offset) {
625 	case APIC_LDR:
626 		avic_handle_ldr_update(vcpu);
627 		break;
628 	case APIC_DFR:
629 		avic_handle_dfr_update(vcpu);
630 		break;
631 	case APIC_RRR:
632 		/* Ignore writes to Read Remote Data, it's read-only. */
633 		return 1;
634 	default:
635 		break;
636 	}
637 
638 	kvm_apic_write_nodecode(vcpu, offset);
639 	return 1;
640 }
641 
642 static bool is_avic_unaccelerated_access_trap(u32 offset)
643 {
644 	bool ret = false;
645 
646 	switch (offset) {
647 	case APIC_ID:
648 	case APIC_EOI:
649 	case APIC_RRR:
650 	case APIC_LDR:
651 	case APIC_DFR:
652 	case APIC_SPIV:
653 	case APIC_ESR:
654 	case APIC_ICR:
655 	case APIC_LVTT:
656 	case APIC_LVTTHMR:
657 	case APIC_LVTPC:
658 	case APIC_LVT0:
659 	case APIC_LVT1:
660 	case APIC_LVTERR:
661 	case APIC_TMICT:
662 	case APIC_TDCR:
663 		ret = true;
664 		break;
665 	default:
666 		break;
667 	}
668 	return ret;
669 }
670 
671 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
672 {
673 	struct vcpu_svm *svm = to_svm(vcpu);
674 	int ret = 0;
675 	u32 offset = svm->vmcb->control.exit_info_1 &
676 		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
677 	u32 vector = svm->vmcb->control.exit_info_2 &
678 		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
679 	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
680 		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
681 	bool trap = is_avic_unaccelerated_access_trap(offset);
682 
683 	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
684 					    trap, write, vector);
685 	if (trap) {
686 		/* Handling Trap */
687 		WARN_ONCE(!write, "svm: Handling trap read.\n");
688 		ret = avic_unaccel_trap_write(vcpu);
689 	} else {
690 		/* Handling Fault */
691 		ret = kvm_emulate_instruction(vcpu, 0);
692 	}
693 
694 	return ret;
695 }
696 
697 int avic_init_vcpu(struct vcpu_svm *svm)
698 {
699 	int ret;
700 	struct kvm_vcpu *vcpu = &svm->vcpu;
701 
702 	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
703 		return 0;
704 
705 	ret = avic_init_backing_page(vcpu);
706 	if (ret)
707 		return ret;
708 
709 	INIT_LIST_HEAD(&svm->ir_list);
710 	spin_lock_init(&svm->ir_list_lock);
711 	svm->dfr_reg = APIC_DFR_FLAT;
712 
713 	return ret;
714 }
715 
716 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
717 {
718 	avic_handle_dfr_update(vcpu);
719 	avic_handle_ldr_update(vcpu);
720 }
721 
722 static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
723 {
724 	int ret = 0;
725 	unsigned long flags;
726 	struct amd_svm_iommu_ir *ir;
727 	struct vcpu_svm *svm = to_svm(vcpu);
728 
729 	if (!kvm_arch_has_assigned_device(vcpu->kvm))
730 		return 0;
731 
732 	/*
733 	 * Here, we go through the per-vcpu ir_list to update all existing
734 	 * interrupt remapping table entries targeting this vcpu.
735 	 */
736 	spin_lock_irqsave(&svm->ir_list_lock, flags);
737 
738 	if (list_empty(&svm->ir_list))
739 		goto out;
740 
741 	list_for_each_entry(ir, &svm->ir_list, node) {
742 		if (activate)
743 			ret = amd_iommu_activate_guest_mode(ir->data);
744 		else
745 			ret = amd_iommu_deactivate_guest_mode(ir->data);
746 		if (ret)
747 			break;
748 	}
749 out:
750 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
751 	return ret;
752 }
753 
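/* Remove the ir_list entry, if any, that tracks this IRTE (pi->ir_data). */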
754 static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
755 {
756 	unsigned long flags;
757 	struct amd_svm_iommu_ir *cur;
758 
759 	spin_lock_irqsave(&svm->ir_list_lock, flags);
760 	list_for_each_entry(cur, &svm->ir_list, node) {
761 		if (cur->data != pi->ir_data)
762 			continue;
763 		list_del(&cur->node);
764 		kfree(cur);
765 		break;
766 	}
767 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
768 }
769 
770 static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
771 {
772 	int ret = 0;
773 	unsigned long flags;
774 	struct amd_svm_iommu_ir *ir;
775 
776 	/**
777 	 * In some cases, the existing irte is updated and re-set,
778 	 * so we need to check here if it's already been added
779 	 * to the ir_list.
780 	 */
781 	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
782 		struct kvm *kvm = svm->vcpu.kvm;
783 		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
784 		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
785 		struct vcpu_svm *prev_svm;
786 
787 		if (!prev_vcpu) {
788 			ret = -EINVAL;
789 			goto out;
790 		}
791 
792 		prev_svm = to_svm(prev_vcpu);
793 		svm_ir_list_del(prev_svm, pi);
794 	}
795 
796 	/**
797 	 * Allocate a new amd_svm_iommu_ir entry, which will be
798 	 * added to the per-vcpu ir_list.
799 	 */
800 	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
801 	if (!ir) {
802 		ret = -ENOMEM;
803 		goto out;
804 	}
805 	ir->data = pi->ir_data;
806 
807 	spin_lock_irqsave(&svm->ir_list_lock, flags);
808 	list_add(&ir->node, &svm->ir_list);
809 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
810 out:
811 	return ret;
812 }
813 
814 /*
815  * Note:
816  * The HW cannot support posting multicast/broadcast
817  * interrupts to a vCPU. So, we still use legacy interrupt
818  * remapping for these kinds of interrupts.
819  *
820  * For lowest-priority interrupts, we only support
821  * those with a single CPU as the destination, e.g. the user
822  * configures the interrupts via /proc/irq or uses
823  * irqbalance to make the interrupts single-CPU.
824  */
825 static int
826 get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
827 		 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
828 {
829 	struct kvm_lapic_irq irq;
830 	struct kvm_vcpu *vcpu = NULL;
831 
832 	kvm_set_msi_irq(kvm, e, &irq);
833 
834 	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
835 	    !kvm_irq_is_postable(&irq)) {
836 		pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
837 			 __func__, irq.vector);
838 		return -1;
839 	}
840 
841 	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
842 		 irq.vector);
843 	*svm = to_svm(vcpu);
844 	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
845 	vcpu_info->vector = irq.vector;
846 
847 	return 0;
848 }
849 
850 /*
851  * avic_pi_update_irte - set IRTE for Posted-Interrupts
852  *
853  * @kvm: kvm
854  * @host_irq: host irq of the interrupt
855  * @guest_irq: gsi of the interrupt
856  * @set: set or unset PI
857  * returns 0 on success, < 0 on failure
858  */
859 int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
860 			uint32_t guest_irq, bool set)
861 {
862 	struct kvm_kernel_irq_routing_entry *e;
863 	struct kvm_irq_routing_table *irq_rt;
864 	int idx, ret = 0;
865 
866 	if (!kvm_arch_has_assigned_device(kvm) ||
867 	    !irq_remapping_cap(IRQ_POSTING_CAP))
868 		return 0;
869 
870 	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
871 		 __func__, host_irq, guest_irq, set);
872 
873 	idx = srcu_read_lock(&kvm->irq_srcu);
874 	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
875 
876 	if (guest_irq >= irq_rt->nr_rt_entries ||
877 		hlist_empty(&irq_rt->map[guest_irq])) {
878 		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
879 			     guest_irq, irq_rt->nr_rt_entries);
880 		goto out;
881 	}
882 
883 	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
884 		struct vcpu_data vcpu_info;
885 		struct vcpu_svm *svm = NULL;
886 
887 		if (e->type != KVM_IRQ_ROUTING_MSI)
888 			continue;
889 
890 		/**
891 		 * Here, we set up legacy mode in the following cases:
892 		 * 1. The interrupt cannot be targeted to a specific vcpu.
893 		 * 2. The posted interrupt is being unset.
894 		 * 3. APIC virtualization is disabled for the vcpu.
895 		 * 4. The IRQ has an incompatible delivery mode (SMI, INIT, etc.)
896 		 */
897 		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
898 		    kvm_vcpu_apicv_active(&svm->vcpu)) {
899 			struct amd_iommu_pi_data pi;
900 
901 			/* Try to enable guest_mode in IRTE */
902 			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
903 					    AVIC_HPA_MASK);
904 			pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
905 						     svm->vcpu.vcpu_id);
906 			pi.is_guest_mode = true;
907 			pi.vcpu_data = &vcpu_info;
908 			ret = irq_set_vcpu_affinity(host_irq, &pi);
909 
910 			/**
911 			 * Here, we have successfully set up vcpu affinity in
912 			 * IOMMU guest mode. Now, we need to store the posted
913 			 * interrupt information in the per-vcpu ir_list so that
914 			 * we can reference it directly when we update the vcpu
915 			 * scheduling information in the IOMMU irte.
916 			 */
917 			if (!ret && pi.is_guest_mode)
918 				svm_ir_list_add(svm, &pi);
919 		} else {
920 			/* Use legacy mode in IRTE */
921 			struct amd_iommu_pi_data pi;
922 
923 			/**
924 			 * Here, pi is used to:
925 			 * - Tell IOMMU to use legacy mode for this interrupt.
926 			 * - Retrieve ga_tag of prior interrupt remapping data.
927 			 */
928 			pi.prev_ga_tag = 0;
929 			pi.is_guest_mode = false;
930 			ret = irq_set_vcpu_affinity(host_irq, &pi);
931 
932 			/**
933 			 * Check if the posted interrupt was previously
934 			 * set up with guest_mode by checking whether the ga_tag
935 			 * was cached. If so, we need to clean up the per-vcpu
936 			 * ir_list.
937 			 */
938 			if (!ret && pi.prev_ga_tag) {
939 				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
940 				struct kvm_vcpu *vcpu;
941 
942 				vcpu = kvm_get_vcpu_by_id(kvm, id);
943 				if (vcpu)
944 					svm_ir_list_del(to_svm(vcpu), &pi);
945 			}
946 		}
947 
948 		if (!ret && svm) {
949 			trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
950 						 e->gsi, vcpu_info.vector,
951 						 vcpu_info.pi_desc_addr, set);
952 		}
953 
954 		if (ret < 0) {
955 			pr_err("%s: failed to update PI IRTE\n", __func__);
956 			goto out;
957 		}
958 	}
959 
960 	ret = 0;
961 out:
962 	srcu_read_unlock(&kvm->irq_srcu, idx);
963 	return ret;
964 }
965 
966 static inline int
967 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
968 {
969 	int ret = 0;
970 	unsigned long flags;
971 	struct amd_svm_iommu_ir *ir;
972 	struct vcpu_svm *svm = to_svm(vcpu);
973 
974 	if (!kvm_arch_has_assigned_device(vcpu->kvm))
975 		return 0;
976 
977 	/*
978 	 * Here, we go through the per-vcpu ir_list to update all existing
979 	 * interrupt remapping table entries targeting this vcpu.
980 	 */
981 	spin_lock_irqsave(&svm->ir_list_lock, flags);
982 
983 	if (list_empty(&svm->ir_list))
984 		goto out;
985 
986 	list_for_each_entry(ir, &svm->ir_list, node) {
987 		ret = amd_iommu_update_ga(cpu, r, ir->data);
988 		if (ret)
989 			break;
990 	}
991 out:
992 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
993 	return ret;
994 }
995 
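/*
 * Record the new physical CPU in the vCPU's physical APIC ID table entry,
 * set IsRunning, and let avic_update_iommu_vcpu_affinity() point any
 * posted-interrupt IRTEs at the new CPU.
 */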
996 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
997 {
998 	u64 entry;
999 	int h_physical_id = kvm_cpu_get_apicid(cpu);
1000 	struct vcpu_svm *svm = to_svm(vcpu);
1001 
1002 	lockdep_assert_preemption_disabled();
1003 
1004 	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
1005 		return;
1006 
1007 	/*
1008 	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
1009 	 * is being scheduled in after being preempted.  The CPU entries in the
1010 	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
1011 	 * If the vCPU was migrated, its new CPU value will be stuffed when the
1012 	 * vCPU unblocks.
1013 	 */
1014 	if (kvm_vcpu_is_blocking(vcpu))
1015 		return;
1016 
1017 	entry = READ_ONCE(*(svm->avic_physical_id_cache));
1018 	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
1019 
1020 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
1021 	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
1022 	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1023 
1024 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1025 	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
1026 }
1027 
1028 void avic_vcpu_put(struct kvm_vcpu *vcpu)
1029 {
1030 	u64 entry;
1031 	struct vcpu_svm *svm = to_svm(vcpu);
1032 
1033 	lockdep_assert_preemption_disabled();
1034 
1035 	entry = READ_ONCE(*(svm->avic_physical_id_cache));
1036 
1037 	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
1038 	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
1039 		return;
1040 
1041 	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
1042 
1043 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1044 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1045 }
1046 
1047 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
1048 {
1049 	struct vcpu_svm *svm = to_svm(vcpu);
1050 	struct vmcb *vmcb = svm->vmcb01.ptr;
1051 
1052 	if (!lapic_in_kernel(vcpu) || !enable_apicv)
1053 		return;
1054 
1055 	if (kvm_vcpu_apicv_active(vcpu)) {
1056 		/**
1057 		 * During AVIC temporary deactivation, guest could update
1058 		 * APIC ID, DFR and LDR registers, which would not be trapped
1059 		 * by avic_unaccelerated_access_interception(). In this case,
1060 		 * we need to check and update the AVIC logical APIC ID table
1061 		 * accordingly before re-activating.
1062 		 */
1063 		avic_apicv_post_state_restore(vcpu);
1064 		avic_activate_vmcb(svm);
1065 	} else {
1066 		avic_deactivate_vmcb(svm);
1067 	}
1068 	vmcb_mark_dirty(vmcb, VMCB_AVIC);
1069 }
1070 
1071 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
1072 {
1073 	bool activated = kvm_vcpu_apicv_active(vcpu);
1074 
1075 	if (!enable_apicv)
1076 		return;
1077 
1078 	avic_refresh_virtual_apic_mode(vcpu);
1079 
1080 	if (activated)
1081 		avic_vcpu_load(vcpu, vcpu->cpu);
1082 	else
1083 		avic_vcpu_put(vcpu);
1084 
1085 	avic_set_pi_irte_mode(vcpu, activated);
1086 }
1087 
1088 void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
1089 {
1090 	if (!kvm_vcpu_apicv_active(vcpu))
1091 		return;
1092 
1093 	/*
1094 	 * Unload the AVIC when the vCPU is about to block, _before_
1095 	 * the vCPU actually blocks.
1096 	 *
1097 	 * Any IRQs that arrive before IsRunning=0 will not cause an
1098 	 * incomplete IPI vmexit on the source, therefore vIRR will also
1099 	 * be checked by kvm_vcpu_check_block() before blocking.  The
1100 	 * memory barrier implicit in set_current_state orders writing
1101 	 * IsRunning=0 before reading the vIRR.  The processor needs a
1102 	 * matching memory barrier on interrupt delivery between writing
1103 	 * IRR and reading IsRunning; the lack of this barrier might be
1104 	 * the cause of erratum #1235.
1105 	 */
1106 	avic_vcpu_put(vcpu);
1107 }
1108 
1109 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
1110 {
1111 	if (!kvm_vcpu_apicv_active(vcpu))
1112 		return;
1113 
1114 	avic_vcpu_load(vcpu, vcpu->cpu);
1115 }
1116 
1117 /*
1118  * Note:
1119  * - The module param avic enables both xAPIC and x2APIC mode.
1120  * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
1121  * - The mode can be switched at run-time.
1122  */
1123 bool avic_hardware_setup(void)
1124 {
1125 	if (!npt_enabled)
1126 		return false;
1127 
1128 	/* AVIC is a prerequisite for x2AVIC. */
1129 	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
1130 		if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
1131 			pr_warn(FW_BUG "Cannot support x2AVIC because AVIC is disabled\n");
1132 			pr_warn(FW_BUG "Try enabling AVIC using the force_avic option\n");
1133 		}
1134 		return false;
1135 	}
1136 
1137 	if (boot_cpu_has(X86_FEATURE_AVIC)) {
1138 		pr_info("AVIC enabled\n");
1139 	} else if (force_avic) {
1140 		/*
1141 		 * Some older systems do not advertise AVIC support.
1142 		 * See the Revision Guide for the specific AMD processor for more detail.
1143 		 */
1144 		pr_warn("AVIC is not supported in CPUID but force enabled\n");
1145 		pr_warn("Your system might crash and burn\n");
1146 	}
1147 
1148 	/* AVIC is a prerequisite for x2AVIC. */
1149 	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
1150 	if (x2avic_enabled)
1151 		pr_info("x2AVIC enabled\n");
1152 
1153 	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1154 
1155 	return true;
1156 }
1157