1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * AMD SVM support 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * Avi Kivity <avi@qumranet.com> 13 */ 14 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/kvm_types.h> 18 #include <linux/hashtable.h> 19 #include <linux/amd-iommu.h> 20 #include <linux/kvm_host.h> 21 22 #include <asm/irq_remapping.h> 23 24 #include "trace.h" 25 #include "lapic.h" 26 #include "x86.h" 27 #include "irq.h" 28 #include "svm.h" 29 30 /* 31 * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID, 32 * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry 33 * if an interrupt can't be delivered, e.g. because the vCPU isn't running. 34 * 35 * For the vCPU ID, use however many bits are currently allowed for the max 36 * guest physical APIC ID (limited by the size of the physical ID table), and 37 * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the 38 * size of the GATag is defined by hardware (32 bits), but is an opaque value 39 * as far as hardware is concerned. 40 */ 41 #define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK 42 43 #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) 44 #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) 45 46 #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) 47 #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) 48 49 #define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ 50 ((vcpu_id) & AVIC_VCPU_ID_MASK)) 51 #define AVIC_GATAG(vm_id, vcpu_id) \ 52 ({ \ 53 u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \ 54 \ 55 WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \ 56 WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ 57 ga_tag; \ 58 }) 59 60 static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u); 61 62 static bool force_avic; 63 module_param_unsafe(force_avic, bool, 0444); 64 65 /* Note: 66 * This hash table is used to map VM_ID to a struct kvm_svm, 67 * when handling AMD IOMMU GALOG notification to schedule in 68 * a particular vCPU. 69 */ 70 #define SVM_VM_DATA_HASH_BITS 8 71 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); 72 static u32 next_vm_id = 0; 73 static bool next_vm_id_wrapped = 0; 74 static DEFINE_SPINLOCK(svm_vm_data_hash_lock); 75 bool x2avic_enabled; 76 77 /* 78 * This is a wrapper of struct amd_iommu_ir_data. 79 */ 80 struct amd_svm_iommu_ir { 81 struct list_head node; /* Used by SVM for per-vcpu ir_list */ 82 void *data; /* Storing pointer to struct amd_ir_data */ 83 }; 84 85 static void avic_activate_vmcb(struct vcpu_svm *svm) 86 { 87 struct vmcb *vmcb = svm->vmcb01.ptr; 88 89 vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); 90 vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; 91 92 vmcb->control.int_ctl |= AVIC_ENABLE_MASK; 93 94 /* 95 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR 96 * accesses, while interrupt injection to a running vCPU can be 97 * achieved using AVIC doorbell. KVM disables the APIC access page 98 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling 99 * AVIC in hybrid mode activates only the doorbell mechanism. 100 */ 101 if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { 102 vmcb->control.int_ctl |= X2APIC_MODE_MASK; 103 vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; 104 /* Disabling MSR intercept for x2APIC registers */ 105 svm_set_x2apic_msr_interception(svm, false); 106 } else { 107 /* 108 * Flush the TLB, the guest may have inserted a non-APIC 109 * mapping into the TLB while AVIC was disabled. 110 */ 111 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); 112 113 /* For xAVIC and hybrid-xAVIC modes */ 114 vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; 115 /* Enabling MSR intercept for x2APIC registers */ 116 svm_set_x2apic_msr_interception(svm, true); 117 } 118 } 119 120 static void avic_deactivate_vmcb(struct vcpu_svm *svm) 121 { 122 struct vmcb *vmcb = svm->vmcb01.ptr; 123 124 vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); 125 vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; 126 127 /* 128 * If running nested and the guest uses its own MSR bitmap, there 129 * is no need to update L0's msr bitmap 130 */ 131 if (is_guest_mode(&svm->vcpu) && 132 vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) 133 return; 134 135 /* Enabling MSR intercept for x2APIC registers */ 136 svm_set_x2apic_msr_interception(svm, true); 137 } 138 139 /* Note: 140 * This function is called from IOMMU driver to notify 141 * SVM to schedule in a particular vCPU of a particular VM. 142 */ 143 int avic_ga_log_notifier(u32 ga_tag) 144 { 145 unsigned long flags; 146 struct kvm_svm *kvm_svm; 147 struct kvm_vcpu *vcpu = NULL; 148 u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); 149 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); 150 151 pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); 152 trace_kvm_avic_ga_log(vm_id, vcpu_id); 153 154 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 155 hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { 156 if (kvm_svm->avic_vm_id != vm_id) 157 continue; 158 vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); 159 break; 160 } 161 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 162 163 /* Note: 164 * At this point, the IOMMU should have already set the pending 165 * bit in the vAPIC backing page. So, we just need to schedule 166 * in the vcpu. 167 */ 168 if (vcpu) 169 kvm_vcpu_wake_up(vcpu); 170 171 return 0; 172 } 173 174 void avic_vm_destroy(struct kvm *kvm) 175 { 176 unsigned long flags; 177 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 178 179 if (!enable_apicv) 180 return; 181 182 if (kvm_svm->avic_logical_id_table_page) 183 __free_page(kvm_svm->avic_logical_id_table_page); 184 if (kvm_svm->avic_physical_id_table_page) 185 __free_page(kvm_svm->avic_physical_id_table_page); 186 187 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 188 hash_del(&kvm_svm->hnode); 189 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 190 } 191 192 int avic_vm_init(struct kvm *kvm) 193 { 194 unsigned long flags; 195 int err = -ENOMEM; 196 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 197 struct kvm_svm *k2; 198 struct page *p_page; 199 struct page *l_page; 200 u32 vm_id; 201 202 if (!enable_apicv) 203 return 0; 204 205 /* Allocating physical APIC ID table (4KB) */ 206 p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 207 if (!p_page) 208 goto free_avic; 209 210 kvm_svm->avic_physical_id_table_page = p_page; 211 212 /* Allocating logical APIC ID table (4KB) */ 213 l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 214 if (!l_page) 215 goto free_avic; 216 217 kvm_svm->avic_logical_id_table_page = l_page; 218 219 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 220 again: 221 vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; 222 if (vm_id == 0) { /* id is 1-based, zero is not okay */ 223 next_vm_id_wrapped = 1; 224 goto again; 225 } 226 /* Is it still in use? Only possible if wrapped at least once */ 227 if (next_vm_id_wrapped) { 228 hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) { 229 if (k2->avic_vm_id == vm_id) 230 goto again; 231 } 232 } 233 kvm_svm->avic_vm_id = vm_id; 234 hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id); 235 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 236 237 return 0; 238 239 free_avic: 240 avic_vm_destroy(kvm); 241 return err; 242 } 243 244 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) 245 { 246 struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); 247 phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); 248 phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); 249 phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); 250 251 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; 252 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; 253 vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; 254 vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; 255 256 if (kvm_apicv_activated(svm->vcpu.kvm)) 257 avic_activate_vmcb(svm); 258 else 259 avic_deactivate_vmcb(svm); 260 } 261 262 static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, 263 unsigned int index) 264 { 265 u64 *avic_physical_id_table; 266 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 267 268 if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) || 269 (index > X2AVIC_MAX_PHYSICAL_ID)) 270 return NULL; 271 272 avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); 273 274 return &avic_physical_id_table[index]; 275 } 276 277 static int avic_init_backing_page(struct kvm_vcpu *vcpu) 278 { 279 u64 *entry, new_entry; 280 int id = vcpu->vcpu_id; 281 struct vcpu_svm *svm = to_svm(vcpu); 282 283 if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || 284 (id > X2AVIC_MAX_PHYSICAL_ID)) 285 return -EINVAL; 286 287 if (!vcpu->arch.apic->regs) 288 return -EINVAL; 289 290 if (kvm_apicv_activated(vcpu->kvm)) { 291 int ret; 292 293 /* 294 * Note, AVIC hardware walks the nested page table to check 295 * permissions, but does not use the SPA address specified in 296 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE 297 * pointer field of the VMCB. 298 */ 299 ret = kvm_alloc_apic_access_page(vcpu->kvm); 300 if (ret) 301 return ret; 302 } 303 304 svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); 305 306 /* Setting AVIC backing page address in the phy APIC ID table */ 307 entry = avic_get_physical_id_entry(vcpu, id); 308 if (!entry) 309 return -EINVAL; 310 311 new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & 312 AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | 313 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); 314 WRITE_ONCE(*entry, new_entry); 315 316 svm->avic_physical_id_cache = entry; 317 318 return 0; 319 } 320 321 void avic_ring_doorbell(struct kvm_vcpu *vcpu) 322 { 323 /* 324 * Note, the vCPU could get migrated to a different pCPU at any point, 325 * which could result in signalling the wrong/previous pCPU. But if 326 * that happens the vCPU is guaranteed to do a VMRUN (after being 327 * migrated) and thus will process pending interrupts, i.e. a doorbell 328 * is not needed (and the spurious one is harmless). 329 */ 330 int cpu = READ_ONCE(vcpu->cpu); 331 332 if (cpu != get_cpu()) { 333 wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); 334 trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); 335 } 336 put_cpu(); 337 } 338 339 340 static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl) 341 { 342 vcpu->arch.apic->irr_pending = true; 343 svm_complete_interrupt_delivery(vcpu, 344 icrl & APIC_MODE_MASK, 345 icrl & APIC_INT_LEVELTRIG, 346 icrl & APIC_VECTOR_MASK); 347 } 348 349 static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id, 350 u32 icrl) 351 { 352 /* 353 * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID, 354 * i.e. APIC ID == vCPU ID. 355 */ 356 struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id); 357 358 /* Once again, nothing to do if the target vCPU doesn't exist. */ 359 if (unlikely(!target_vcpu)) 360 return; 361 362 avic_kick_vcpu(target_vcpu, icrl); 363 } 364 365 static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table, 366 u32 logid_index, u32 icrl) 367 { 368 u32 physical_id; 369 370 if (avic_logical_id_table) { 371 u32 logid_entry = avic_logical_id_table[logid_index]; 372 373 /* Nothing to do if the logical destination is invalid. */ 374 if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) 375 return; 376 377 physical_id = logid_entry & 378 AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; 379 } else { 380 /* 381 * For x2APIC, the logical APIC ID is a read-only value that is 382 * derived from the x2APIC ID, thus the x2APIC ID can be found 383 * by reversing the calculation (stored in logid_index). Note, 384 * bits 31:20 of the x2APIC ID aren't propagated to the logical 385 * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS. 386 */ 387 physical_id = logid_index; 388 } 389 390 avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl); 391 } 392 393 /* 394 * A fast-path version of avic_kick_target_vcpus(), which attempts to match 395 * destination APIC ID to vCPU without looping through all vCPUs. 396 */ 397 static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, 398 u32 icrl, u32 icrh, u32 index) 399 { 400 int dest_mode = icrl & APIC_DEST_MASK; 401 int shorthand = icrl & APIC_SHORT_MASK; 402 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 403 u32 dest; 404 405 if (shorthand != APIC_DEST_NOSHORT) 406 return -EINVAL; 407 408 if (apic_x2apic_mode(source)) 409 dest = icrh; 410 else 411 dest = GET_XAPIC_DEST_FIELD(icrh); 412 413 if (dest_mode == APIC_DEST_PHYSICAL) { 414 /* broadcast destination, use slow path */ 415 if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST) 416 return -EINVAL; 417 if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST) 418 return -EINVAL; 419 420 if (WARN_ON_ONCE(dest != index)) 421 return -EINVAL; 422 423 avic_kick_vcpu_by_physical_id(kvm, dest, icrl); 424 } else { 425 u32 *avic_logical_id_table; 426 unsigned long bitmap, i; 427 u32 cluster; 428 429 if (apic_x2apic_mode(source)) { 430 /* 16 bit dest mask, 16 bit cluster id */ 431 bitmap = dest & 0xFFFF; 432 cluster = (dest >> 16) << 4; 433 } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) { 434 /* 8 bit dest mask*/ 435 bitmap = dest; 436 cluster = 0; 437 } else { 438 /* 4 bit desk mask, 4 bit cluster id */ 439 bitmap = dest & 0xF; 440 cluster = (dest >> 4) << 2; 441 } 442 443 /* Nothing to do if there are no destinations in the cluster. */ 444 if (unlikely(!bitmap)) 445 return 0; 446 447 if (apic_x2apic_mode(source)) 448 avic_logical_id_table = NULL; 449 else 450 avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); 451 452 /* 453 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical 454 * IDs, thus each bit in the destination is guaranteed to map 455 * to at most one vCPU. 456 */ 457 for_each_set_bit(i, &bitmap, 16) 458 avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table, 459 cluster + i, icrl); 460 } 461 462 return 0; 463 } 464 465 static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, 466 u32 icrl, u32 icrh, u32 index) 467 { 468 u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh); 469 unsigned long i; 470 struct kvm_vcpu *vcpu; 471 472 if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index)) 473 return; 474 475 trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index); 476 477 /* 478 * Wake any target vCPUs that are blocking, i.e. waiting for a wake 479 * event. There's no need to signal doorbells, as hardware has handled 480 * vCPUs that were in guest at the time of the IPI, and vCPUs that have 481 * since entered the guest will have processed pending IRQs at VMRUN. 482 */ 483 kvm_for_each_vcpu(i, vcpu, kvm) { 484 if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, 485 dest, icrl & APIC_DEST_MASK)) 486 avic_kick_vcpu(vcpu, icrl); 487 } 488 } 489 490 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) 491 { 492 struct vcpu_svm *svm = to_svm(vcpu); 493 u32 icrh = svm->vmcb->control.exit_info_1 >> 32; 494 u32 icrl = svm->vmcb->control.exit_info_1; 495 u32 id = svm->vmcb->control.exit_info_2 >> 32; 496 u32 index = svm->vmcb->control.exit_info_2 & 0x1FF; 497 struct kvm_lapic *apic = vcpu->arch.apic; 498 499 trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); 500 501 switch (id) { 502 case AVIC_IPI_FAILURE_INVALID_TARGET: 503 case AVIC_IPI_FAILURE_INVALID_INT_TYPE: 504 /* 505 * Emulate IPIs that are not handled by AVIC hardware, which 506 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over 507 * if _any_ targets are invalid, e.g. if the logical mode mask 508 * is a superset of running vCPUs. 509 * 510 * The exit is a trap, e.g. ICR holds the correct value and RIP 511 * has been advanced, KVM is responsible only for emulating the 512 * IPI. Sadly, hardware may sometimes leave the BUSY flag set, 513 * in which case KVM needs to emulate the ICR write as well in 514 * order to clear the BUSY flag. 515 */ 516 if (icrl & APIC_ICR_BUSY) 517 kvm_apic_write_nodecode(vcpu, APIC_ICR); 518 else 519 kvm_apic_send_ipi(apic, icrl, icrh); 520 break; 521 case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: 522 /* 523 * At this point, we expect that the AVIC HW has already 524 * set the appropriate IRR bits on the valid target 525 * vcpus. So, we just need to kick the appropriate vcpu. 526 */ 527 avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index); 528 break; 529 case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: 530 WARN_ONCE(1, "Invalid backing page\n"); 531 break; 532 default: 533 pr_err("Unknown IPI interception\n"); 534 } 535 536 return 1; 537 } 538 539 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) 540 { 541 if (is_guest_mode(vcpu)) 542 return APICV_INHIBIT_REASON_NESTED; 543 return 0; 544 } 545 546 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) 547 { 548 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 549 u32 *logical_apic_id_table; 550 u32 cluster, index; 551 552 ldr = GET_APIC_LOGICAL_ID(ldr); 553 554 if (flat) { 555 cluster = 0; 556 } else { 557 cluster = (ldr >> 4); 558 if (cluster >= 0xf) 559 return NULL; 560 ldr &= 0xf; 561 } 562 if (!ldr || !is_power_of_2(ldr)) 563 return NULL; 564 565 index = __ffs(ldr); 566 if (WARN_ON_ONCE(index > 7)) 567 return NULL; 568 index += (cluster << 2); 569 570 logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); 571 572 return &logical_apic_id_table[index]; 573 } 574 575 static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) 576 { 577 bool flat; 578 u32 *entry, new_entry; 579 580 flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; 581 entry = avic_get_logical_id_entry(vcpu, ldr, flat); 582 if (!entry) 583 return; 584 585 new_entry = READ_ONCE(*entry); 586 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; 587 new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); 588 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; 589 WRITE_ONCE(*entry, new_entry); 590 } 591 592 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) 593 { 594 struct vcpu_svm *svm = to_svm(vcpu); 595 bool flat = svm->dfr_reg == APIC_DFR_FLAT; 596 u32 *entry; 597 598 /* Note: x2AVIC does not use logical APIC ID table */ 599 if (apic_x2apic_mode(vcpu->arch.apic)) 600 return; 601 602 entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); 603 if (entry) 604 clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); 605 } 606 607 static void avic_handle_ldr_update(struct kvm_vcpu *vcpu) 608 { 609 struct vcpu_svm *svm = to_svm(vcpu); 610 u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); 611 u32 id = kvm_xapic_id(vcpu->arch.apic); 612 613 /* AVIC does not support LDR update for x2APIC */ 614 if (apic_x2apic_mode(vcpu->arch.apic)) 615 return; 616 617 if (ldr == svm->ldr_reg) 618 return; 619 620 avic_invalidate_logical_id_entry(vcpu); 621 622 svm->ldr_reg = ldr; 623 avic_ldr_write(vcpu, id, ldr); 624 } 625 626 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) 627 { 628 struct vcpu_svm *svm = to_svm(vcpu); 629 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); 630 631 if (svm->dfr_reg == dfr) 632 return; 633 634 avic_invalidate_logical_id_entry(vcpu); 635 svm->dfr_reg = dfr; 636 } 637 638 static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) 639 { 640 u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 & 641 AVIC_UNACCEL_ACCESS_OFFSET_MASK; 642 643 switch (offset) { 644 case APIC_LDR: 645 avic_handle_ldr_update(vcpu); 646 break; 647 case APIC_DFR: 648 avic_handle_dfr_update(vcpu); 649 break; 650 case APIC_RRR: 651 /* Ignore writes to Read Remote Data, it's read-only. */ 652 return 1; 653 default: 654 break; 655 } 656 657 kvm_apic_write_nodecode(vcpu, offset); 658 return 1; 659 } 660 661 static bool is_avic_unaccelerated_access_trap(u32 offset) 662 { 663 bool ret = false; 664 665 switch (offset) { 666 case APIC_ID: 667 case APIC_EOI: 668 case APIC_RRR: 669 case APIC_LDR: 670 case APIC_DFR: 671 case APIC_SPIV: 672 case APIC_ESR: 673 case APIC_ICR: 674 case APIC_LVTT: 675 case APIC_LVTTHMR: 676 case APIC_LVTPC: 677 case APIC_LVT0: 678 case APIC_LVT1: 679 case APIC_LVTERR: 680 case APIC_TMICT: 681 case APIC_TDCR: 682 ret = true; 683 break; 684 default: 685 break; 686 } 687 return ret; 688 } 689 690 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu) 691 { 692 struct vcpu_svm *svm = to_svm(vcpu); 693 int ret = 0; 694 u32 offset = svm->vmcb->control.exit_info_1 & 695 AVIC_UNACCEL_ACCESS_OFFSET_MASK; 696 u32 vector = svm->vmcb->control.exit_info_2 & 697 AVIC_UNACCEL_ACCESS_VECTOR_MASK; 698 bool write = (svm->vmcb->control.exit_info_1 >> 32) & 699 AVIC_UNACCEL_ACCESS_WRITE_MASK; 700 bool trap = is_avic_unaccelerated_access_trap(offset); 701 702 trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset, 703 trap, write, vector); 704 if (trap) { 705 /* Handling Trap */ 706 WARN_ONCE(!write, "svm: Handling trap read.\n"); 707 ret = avic_unaccel_trap_write(vcpu); 708 } else { 709 /* Handling Fault */ 710 ret = kvm_emulate_instruction(vcpu, 0); 711 } 712 713 return ret; 714 } 715 716 int avic_init_vcpu(struct vcpu_svm *svm) 717 { 718 int ret; 719 struct kvm_vcpu *vcpu = &svm->vcpu; 720 721 if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) 722 return 0; 723 724 ret = avic_init_backing_page(vcpu); 725 if (ret) 726 return ret; 727 728 INIT_LIST_HEAD(&svm->ir_list); 729 spin_lock_init(&svm->ir_list_lock); 730 svm->dfr_reg = APIC_DFR_FLAT; 731 732 return ret; 733 } 734 735 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) 736 { 737 avic_handle_dfr_update(vcpu); 738 avic_handle_ldr_update(vcpu); 739 } 740 741 static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) 742 { 743 int ret = 0; 744 unsigned long flags; 745 struct amd_svm_iommu_ir *ir; 746 struct vcpu_svm *svm = to_svm(vcpu); 747 748 if (!kvm_arch_has_assigned_device(vcpu->kvm)) 749 return 0; 750 751 /* 752 * Here, we go through the per-vcpu ir_list to update all existing 753 * interrupt remapping table entry targeting this vcpu. 754 */ 755 spin_lock_irqsave(&svm->ir_list_lock, flags); 756 757 if (list_empty(&svm->ir_list)) 758 goto out; 759 760 list_for_each_entry(ir, &svm->ir_list, node) { 761 if (activate) 762 ret = amd_iommu_activate_guest_mode(ir->data); 763 else 764 ret = amd_iommu_deactivate_guest_mode(ir->data); 765 if (ret) 766 break; 767 } 768 out: 769 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 770 return ret; 771 } 772 773 static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) 774 { 775 unsigned long flags; 776 struct amd_svm_iommu_ir *cur; 777 778 spin_lock_irqsave(&svm->ir_list_lock, flags); 779 list_for_each_entry(cur, &svm->ir_list, node) { 780 if (cur->data != pi->ir_data) 781 continue; 782 list_del(&cur->node); 783 kfree(cur); 784 break; 785 } 786 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 787 } 788 789 static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) 790 { 791 int ret = 0; 792 unsigned long flags; 793 struct amd_svm_iommu_ir *ir; 794 795 /** 796 * In some cases, the existing irte is updated and re-set, 797 * so we need to check here if it's already been * added 798 * to the ir_list. 799 */ 800 if (pi->ir_data && (pi->prev_ga_tag != 0)) { 801 struct kvm *kvm = svm->vcpu.kvm; 802 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); 803 struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); 804 struct vcpu_svm *prev_svm; 805 806 if (!prev_vcpu) { 807 ret = -EINVAL; 808 goto out; 809 } 810 811 prev_svm = to_svm(prev_vcpu); 812 svm_ir_list_del(prev_svm, pi); 813 } 814 815 /** 816 * Allocating new amd_iommu_pi_data, which will get 817 * add to the per-vcpu ir_list. 818 */ 819 ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); 820 if (!ir) { 821 ret = -ENOMEM; 822 goto out; 823 } 824 ir->data = pi->ir_data; 825 826 spin_lock_irqsave(&svm->ir_list_lock, flags); 827 list_add(&ir->node, &svm->ir_list); 828 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 829 out: 830 return ret; 831 } 832 833 /* 834 * Note: 835 * The HW cannot support posting multicast/broadcast 836 * interrupts to a vCPU. So, we still use legacy interrupt 837 * remapping for these kind of interrupts. 838 * 839 * For lowest-priority interrupts, we only support 840 * those with single CPU as the destination, e.g. user 841 * configures the interrupts via /proc/irq or uses 842 * irqbalance to make the interrupts single-CPU. 843 */ 844 static int 845 get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, 846 struct vcpu_data *vcpu_info, struct vcpu_svm **svm) 847 { 848 struct kvm_lapic_irq irq; 849 struct kvm_vcpu *vcpu = NULL; 850 851 kvm_set_msi_irq(kvm, e, &irq); 852 853 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 854 !kvm_irq_is_postable(&irq)) { 855 pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", 856 __func__, irq.vector); 857 return -1; 858 } 859 860 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, 861 irq.vector); 862 *svm = to_svm(vcpu); 863 vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); 864 vcpu_info->vector = irq.vector; 865 866 return 0; 867 } 868 869 /* 870 * avic_pi_update_irte - set IRTE for Posted-Interrupts 871 * 872 * @kvm: kvm 873 * @host_irq: host irq of the interrupt 874 * @guest_irq: gsi of the interrupt 875 * @set: set or unset PI 876 * returns 0 on success, < 0 on failure 877 */ 878 int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, 879 uint32_t guest_irq, bool set) 880 { 881 struct kvm_kernel_irq_routing_entry *e; 882 struct kvm_irq_routing_table *irq_rt; 883 int idx, ret = 0; 884 885 if (!kvm_arch_has_assigned_device(kvm) || 886 !irq_remapping_cap(IRQ_POSTING_CAP)) 887 return 0; 888 889 pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", 890 __func__, host_irq, guest_irq, set); 891 892 idx = srcu_read_lock(&kvm->irq_srcu); 893 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 894 895 if (guest_irq >= irq_rt->nr_rt_entries || 896 hlist_empty(&irq_rt->map[guest_irq])) { 897 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", 898 guest_irq, irq_rt->nr_rt_entries); 899 goto out; 900 } 901 902 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 903 struct vcpu_data vcpu_info; 904 struct vcpu_svm *svm = NULL; 905 906 if (e->type != KVM_IRQ_ROUTING_MSI) 907 continue; 908 909 /** 910 * Here, we setup with legacy mode in the following cases: 911 * 1. When cannot target interrupt to a specific vcpu. 912 * 2. Unsetting posted interrupt. 913 * 3. APIC virtualization is disabled for the vcpu. 914 * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) 915 */ 916 if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && 917 kvm_vcpu_apicv_active(&svm->vcpu)) { 918 struct amd_iommu_pi_data pi; 919 920 /* Try to enable guest_mode in IRTE */ 921 pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & 922 AVIC_HPA_MASK); 923 pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, 924 svm->vcpu.vcpu_id); 925 pi.is_guest_mode = true; 926 pi.vcpu_data = &vcpu_info; 927 ret = irq_set_vcpu_affinity(host_irq, &pi); 928 929 /** 930 * Here, we successfully setting up vcpu affinity in 931 * IOMMU guest mode. Now, we need to store the posted 932 * interrupt information in a per-vcpu ir_list so that 933 * we can reference to them directly when we update vcpu 934 * scheduling information in IOMMU irte. 935 */ 936 if (!ret && pi.is_guest_mode) 937 svm_ir_list_add(svm, &pi); 938 } else { 939 /* Use legacy mode in IRTE */ 940 struct amd_iommu_pi_data pi; 941 942 /** 943 * Here, pi is used to: 944 * - Tell IOMMU to use legacy mode for this interrupt. 945 * - Retrieve ga_tag of prior interrupt remapping data. 946 */ 947 pi.prev_ga_tag = 0; 948 pi.is_guest_mode = false; 949 ret = irq_set_vcpu_affinity(host_irq, &pi); 950 951 /** 952 * Check if the posted interrupt was previously 953 * setup with the guest_mode by checking if the ga_tag 954 * was cached. If so, we need to clean up the per-vcpu 955 * ir_list. 956 */ 957 if (!ret && pi.prev_ga_tag) { 958 int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); 959 struct kvm_vcpu *vcpu; 960 961 vcpu = kvm_get_vcpu_by_id(kvm, id); 962 if (vcpu) 963 svm_ir_list_del(to_svm(vcpu), &pi); 964 } 965 } 966 967 if (!ret && svm) { 968 trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, 969 e->gsi, vcpu_info.vector, 970 vcpu_info.pi_desc_addr, set); 971 } 972 973 if (ret < 0) { 974 pr_err("%s: failed to update PI IRTE\n", __func__); 975 goto out; 976 } 977 } 978 979 ret = 0; 980 out: 981 srcu_read_unlock(&kvm->irq_srcu, idx); 982 return ret; 983 } 984 985 static inline int 986 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) 987 { 988 int ret = 0; 989 unsigned long flags; 990 struct amd_svm_iommu_ir *ir; 991 struct vcpu_svm *svm = to_svm(vcpu); 992 993 if (!kvm_arch_has_assigned_device(vcpu->kvm)) 994 return 0; 995 996 /* 997 * Here, we go through the per-vcpu ir_list to update all existing 998 * interrupt remapping table entry targeting this vcpu. 999 */ 1000 spin_lock_irqsave(&svm->ir_list_lock, flags); 1001 1002 if (list_empty(&svm->ir_list)) 1003 goto out; 1004 1005 list_for_each_entry(ir, &svm->ir_list, node) { 1006 ret = amd_iommu_update_ga(cpu, r, ir->data); 1007 if (ret) 1008 break; 1009 } 1010 out: 1011 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 1012 return ret; 1013 } 1014 1015 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1016 { 1017 u64 entry; 1018 int h_physical_id = kvm_cpu_get_apicid(cpu); 1019 struct vcpu_svm *svm = to_svm(vcpu); 1020 1021 lockdep_assert_preemption_disabled(); 1022 1023 if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) 1024 return; 1025 1026 /* 1027 * No need to update anything if the vCPU is blocking, i.e. if the vCPU 1028 * is being scheduled in after being preempted. The CPU entries in the 1029 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. 1030 * If the vCPU was migrated, its new CPU value will be stuffed when the 1031 * vCPU unblocks. 1032 */ 1033 if (kvm_vcpu_is_blocking(vcpu)) 1034 return; 1035 1036 entry = READ_ONCE(*(svm->avic_physical_id_cache)); 1037 WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); 1038 1039 entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; 1040 entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); 1041 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 1042 1043 WRITE_ONCE(*(svm->avic_physical_id_cache), entry); 1044 avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); 1045 } 1046 1047 void avic_vcpu_put(struct kvm_vcpu *vcpu) 1048 { 1049 u64 entry; 1050 struct vcpu_svm *svm = to_svm(vcpu); 1051 1052 lockdep_assert_preemption_disabled(); 1053 1054 entry = READ_ONCE(*(svm->avic_physical_id_cache)); 1055 1056 /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ 1057 if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) 1058 return; 1059 1060 avic_update_iommu_vcpu_affinity(vcpu, -1, 0); 1061 1062 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 1063 WRITE_ONCE(*(svm->avic_physical_id_cache), entry); 1064 } 1065 1066 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) 1067 { 1068 struct vcpu_svm *svm = to_svm(vcpu); 1069 struct vmcb *vmcb = svm->vmcb01.ptr; 1070 1071 if (!lapic_in_kernel(vcpu) || !enable_apicv) 1072 return; 1073 1074 if (kvm_vcpu_apicv_active(vcpu)) { 1075 /** 1076 * During AVIC temporary deactivation, guest could update 1077 * APIC ID, DFR and LDR registers, which would not be trapped 1078 * by avic_unaccelerated_access_interception(). In this case, 1079 * we need to check and update the AVIC logical APIC ID table 1080 * accordingly before re-activating. 1081 */ 1082 avic_apicv_post_state_restore(vcpu); 1083 avic_activate_vmcb(svm); 1084 } else { 1085 avic_deactivate_vmcb(svm); 1086 } 1087 vmcb_mark_dirty(vmcb, VMCB_AVIC); 1088 } 1089 1090 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 1091 { 1092 bool activated = kvm_vcpu_apicv_active(vcpu); 1093 1094 if (!enable_apicv) 1095 return; 1096 1097 avic_refresh_virtual_apic_mode(vcpu); 1098 1099 if (activated) 1100 avic_vcpu_load(vcpu, vcpu->cpu); 1101 else 1102 avic_vcpu_put(vcpu); 1103 1104 avic_set_pi_irte_mode(vcpu, activated); 1105 } 1106 1107 void avic_vcpu_blocking(struct kvm_vcpu *vcpu) 1108 { 1109 if (!kvm_vcpu_apicv_active(vcpu)) 1110 return; 1111 1112 /* 1113 * Unload the AVIC when the vCPU is about to block, _before_ 1114 * the vCPU actually blocks. 1115 * 1116 * Any IRQs that arrive before IsRunning=0 will not cause an 1117 * incomplete IPI vmexit on the source, therefore vIRR will also 1118 * be checked by kvm_vcpu_check_block() before blocking. The 1119 * memory barrier implicit in set_current_state orders writing 1120 * IsRunning=0 before reading the vIRR. The processor needs a 1121 * matching memory barrier on interrupt delivery between writing 1122 * IRR and reading IsRunning; the lack of this barrier might be 1123 * the cause of errata #1235). 1124 */ 1125 avic_vcpu_put(vcpu); 1126 } 1127 1128 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) 1129 { 1130 if (!kvm_vcpu_apicv_active(vcpu)) 1131 return; 1132 1133 avic_vcpu_load(vcpu, vcpu->cpu); 1134 } 1135 1136 /* 1137 * Note: 1138 * - The module param avic enable both xAPIC and x2APIC mode. 1139 * - Hypervisor can support both xAVIC and x2AVIC in the same guest. 1140 * - The mode can be switched at run-time. 1141 */ 1142 bool avic_hardware_setup(void) 1143 { 1144 if (!npt_enabled) 1145 return false; 1146 1147 /* AVIC is a prerequisite for x2AVIC. */ 1148 if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { 1149 if (boot_cpu_has(X86_FEATURE_X2AVIC)) { 1150 pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); 1151 pr_warn(FW_BUG "Try enable AVIC using force_avic option"); 1152 } 1153 return false; 1154 } 1155 1156 if (boot_cpu_has(X86_FEATURE_AVIC)) { 1157 pr_info("AVIC enabled\n"); 1158 } else if (force_avic) { 1159 /* 1160 * Some older systems does not advertise AVIC support. 1161 * See Revision Guide for specific AMD processor for more detail. 1162 */ 1163 pr_warn("AVIC is not supported in CPUID but force enabled"); 1164 pr_warn("Your system might crash and burn"); 1165 } 1166 1167 /* AVIC is a prerequisite for x2AVIC. */ 1168 x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); 1169 if (x2avic_enabled) 1170 pr_info("x2AVIC enabled\n"); 1171 1172 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 1173 1174 return true; 1175 } 1176