1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/kvm_host.h> 3 4 #include <asm/irq_remapping.h> 5 #include <asm/cpu.h> 6 7 #include "lapic.h" 8 #include "posted_intr.h" 9 #include "trace.h" 10 #include "vmx.h" 11 12 /* 13 * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we 14 * can find which vCPU should be waken up. 15 */ 16 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); 17 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); 18 19 static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) 20 { 21 return &(to_vmx(vcpu)->pi_desc); 22 } 23 24 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) 25 { 26 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 27 struct pi_desc old, new; 28 unsigned int dest; 29 30 /* 31 * In case of hot-plug or hot-unplug, we may have to undo 32 * vmx_vcpu_pi_put even if there is no assigned device. And we 33 * always keep PI.NDST up to date for simplicity: it makes the 34 * code easier, and CPU migration is not a fast path. 35 */ 36 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) 37 return; 38 39 /* 40 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change 41 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the 42 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that 43 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up 44 * correctly. 45 */ 46 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { 47 pi_clear_sn(pi_desc); 48 goto after_clear_sn; 49 } 50 51 /* The full case. */ 52 do { 53 old.control = new.control = pi_desc->control; 54 55 dest = cpu_physical_id(cpu); 56 57 if (x2apic_mode) 58 new.ndst = dest; 59 else 60 new.ndst = (dest << 8) & 0xFF00; 61 62 new.sn = 0; 63 } while (cmpxchg64(&pi_desc->control, old.control, 64 new.control) != old.control); 65 66 after_clear_sn: 67 68 /* 69 * Clear SN before reading the bitmap. The VT-d firmware 70 * writes the bitmap and reads SN atomically (5.2.3 in the 71 * spec), so it doesn't really have a memory barrier that 72 * pairs with this, but we cannot do that and we need one. 73 */ 74 smp_mb__after_atomic(); 75 76 if (!pi_is_pir_empty(pi_desc)) 77 pi_set_on(pi_desc); 78 } 79 80 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) 81 { 82 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 83 84 if (!kvm_arch_has_assigned_device(vcpu->kvm) || 85 !irq_remapping_cap(IRQ_POSTING_CAP) || 86 !kvm_vcpu_apicv_active(vcpu)) 87 return; 88 89 /* Set SN when the vCPU is preempted */ 90 if (vcpu->preempted) 91 pi_set_sn(pi_desc); 92 } 93 94 static void __pi_post_block(struct kvm_vcpu *vcpu) 95 { 96 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 97 struct pi_desc old, new; 98 unsigned int dest; 99 100 do { 101 old.control = new.control = pi_desc->control; 102 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, 103 "Wakeup handler not enabled while the VCPU is blocked\n"); 104 105 dest = cpu_physical_id(vcpu->cpu); 106 107 if (x2apic_mode) 108 new.ndst = dest; 109 else 110 new.ndst = (dest << 8) & 0xFF00; 111 112 /* set 'NV' to 'notification vector' */ 113 new.nv = POSTED_INTR_VECTOR; 114 } while (cmpxchg64(&pi_desc->control, old.control, 115 new.control) != old.control); 116 117 if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { 118 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 119 list_del(&vcpu->blocked_vcpu_list); 120 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 121 vcpu->pre_pcpu = -1; 122 } 123 } 124 125 /* 126 * This routine does the following things for vCPU which is going 127 * to be blocked if VT-d PI is enabled. 128 * - Store the vCPU to the wakeup list, so when interrupts happen 129 * we can find the right vCPU to wake up. 130 * - Change the Posted-interrupt descriptor as below: 131 * 'NDST' <-- vcpu->pre_pcpu 132 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR 133 * - If 'ON' is set during this process, which means at least one 134 * interrupt is posted for this vCPU, we cannot block it, in 135 * this case, return 1, otherwise, return 0. 136 * 137 */ 138 int pi_pre_block(struct kvm_vcpu *vcpu) 139 { 140 unsigned int dest; 141 struct pi_desc old, new; 142 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 143 144 if (!kvm_arch_has_assigned_device(vcpu->kvm) || 145 !irq_remapping_cap(IRQ_POSTING_CAP) || 146 !kvm_vcpu_apicv_active(vcpu)) 147 return 0; 148 149 WARN_ON(irqs_disabled()); 150 local_irq_disable(); 151 if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { 152 vcpu->pre_pcpu = vcpu->cpu; 153 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 154 list_add_tail(&vcpu->blocked_vcpu_list, 155 &per_cpu(blocked_vcpu_on_cpu, 156 vcpu->pre_pcpu)); 157 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 158 } 159 160 do { 161 old.control = new.control = pi_desc->control; 162 163 WARN((pi_desc->sn == 1), 164 "Warning: SN field of posted-interrupts " 165 "is set before blocking\n"); 166 167 /* 168 * Since vCPU can be preempted during this process, 169 * vcpu->cpu could be different with pre_pcpu, we 170 * need to set pre_pcpu as the destination of wakeup 171 * notification event, then we can find the right vCPU 172 * to wakeup in wakeup handler if interrupts happen 173 * when the vCPU is in blocked state. 174 */ 175 dest = cpu_physical_id(vcpu->pre_pcpu); 176 177 if (x2apic_mode) 178 new.ndst = dest; 179 else 180 new.ndst = (dest << 8) & 0xFF00; 181 182 /* set 'NV' to 'wakeup vector' */ 183 new.nv = POSTED_INTR_WAKEUP_VECTOR; 184 } while (cmpxchg64(&pi_desc->control, old.control, 185 new.control) != old.control); 186 187 /* We should not block the vCPU if an interrupt is posted for it. */ 188 if (pi_test_on(pi_desc) == 1) 189 __pi_post_block(vcpu); 190 191 local_irq_enable(); 192 return (vcpu->pre_pcpu == -1); 193 } 194 195 void pi_post_block(struct kvm_vcpu *vcpu) 196 { 197 if (vcpu->pre_pcpu == -1) 198 return; 199 200 WARN_ON(irqs_disabled()); 201 local_irq_disable(); 202 __pi_post_block(vcpu); 203 local_irq_enable(); 204 } 205 206 /* 207 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 208 */ 209 void pi_wakeup_handler(void) 210 { 211 struct kvm_vcpu *vcpu; 212 int cpu = smp_processor_id(); 213 214 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 215 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), 216 blocked_vcpu_list) { 217 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 218 219 if (pi_test_on(pi_desc) == 1) 220 kvm_vcpu_kick(vcpu); 221 } 222 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 223 } 224 225 void __init pi_init_cpu(int cpu) 226 { 227 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); 228 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 229 } 230 231 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) 232 { 233 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 234 235 return pi_test_on(pi_desc) || 236 (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); 237 } 238 239 240 /* 241 * pi_update_irte - set IRTE for Posted-Interrupts 242 * 243 * @kvm: kvm 244 * @host_irq: host irq of the interrupt 245 * @guest_irq: gsi of the interrupt 246 * @set: set or unset PI 247 * returns 0 on success, < 0 on failure 248 */ 249 int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, 250 bool set) 251 { 252 struct kvm_kernel_irq_routing_entry *e; 253 struct kvm_irq_routing_table *irq_rt; 254 struct kvm_lapic_irq irq; 255 struct kvm_vcpu *vcpu; 256 struct vcpu_data vcpu_info; 257 int idx, ret = 0; 258 259 if (!kvm_arch_has_assigned_device(kvm) || 260 !irq_remapping_cap(IRQ_POSTING_CAP) || 261 !kvm_vcpu_apicv_active(kvm->vcpus[0])) 262 return 0; 263 264 idx = srcu_read_lock(&kvm->irq_srcu); 265 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 266 if (guest_irq >= irq_rt->nr_rt_entries || 267 hlist_empty(&irq_rt->map[guest_irq])) { 268 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", 269 guest_irq, irq_rt->nr_rt_entries); 270 goto out; 271 } 272 273 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 274 if (e->type != KVM_IRQ_ROUTING_MSI) 275 continue; 276 /* 277 * VT-d PI cannot support posting multicast/broadcast 278 * interrupts to a vCPU, we still use interrupt remapping 279 * for these kind of interrupts. 280 * 281 * For lowest-priority interrupts, we only support 282 * those with single CPU as the destination, e.g. user 283 * configures the interrupts via /proc/irq or uses 284 * irqbalance to make the interrupts single-CPU. 285 * 286 * We will support full lowest-priority interrupt later. 287 * 288 * In addition, we can only inject generic interrupts using 289 * the PI mechanism, refuse to route others through it. 290 */ 291 292 kvm_set_msi_irq(kvm, e, &irq); 293 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 294 !kvm_irq_is_postable(&irq)) { 295 /* 296 * Make sure the IRTE is in remapped mode if 297 * we don't handle it in posted mode. 298 */ 299 ret = irq_set_vcpu_affinity(host_irq, NULL); 300 if (ret < 0) { 301 printk(KERN_INFO 302 "failed to back to remapped mode, irq: %u\n", 303 host_irq); 304 goto out; 305 } 306 307 continue; 308 } 309 310 vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc); 311 vcpu_info.vector = irq.vector; 312 313 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, 314 vcpu_info.vector, vcpu_info.pi_desc_addr, set); 315 316 if (set) 317 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); 318 else 319 ret = irq_set_vcpu_affinity(host_irq, NULL); 320 321 if (ret < 0) { 322 printk(KERN_INFO "%s: failed to update PI IRTE\n", 323 __func__); 324 goto out; 325 } 326 } 327 328 ret = 0; 329 out: 330 srcu_read_unlock(&kvm->irq_srcu, idx); 331 return ret; 332 } 333