1 /* 2 * vfio based device assignment support 3 * 4 * Copyright Red Hat, Inc. 2012 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Based on qemu-kvm device-assignment: 13 * Adapted for KVM by Qumranet. 14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) 15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) 16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) 17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) 18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) 19 */ 20 21 #include <linux/vfio.h> 22 #include <sys/ioctl.h> 23 #include <sys/mman.h> 24 #include <sys/stat.h> 25 #include <sys/types.h> 26 #include <unistd.h> 27 28 #include "config.h" 29 #include "hw/pci/msi.h" 30 #include "hw/pci/msix.h" 31 #include "hw/pci/pci_bridge.h" 32 #include "qemu/error-report.h" 33 #include "qemu/range.h" 34 #include "sysemu/kvm.h" 35 #include "sysemu/sysemu.h" 36 #include "pci.h" 37 #include "trace.h" 38 39 #define MSIX_CAP_LENGTH 12 40 41 static void vfio_disable_interrupts(VFIOPCIDevice *vdev); 42 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); 43 44 /* 45 * Disabling BAR mmaping can be slow, but toggling it around INTx can 46 * also be a huge overhead. We try to get the best of both worlds by 47 * waiting until an interrupt to disable mmaps (subsequent transitions 48 * to the same state are effectively no overhead). If the interrupt has 49 * been serviced and the time gap is long enough, we re-enable mmaps for 50 * performance. This works well for things like graphics cards, which 51 * may not use their interrupt at all and are penalized to an unusable 52 * level by read/write BAR traps. Other devices, like NICs, have more 53 * regular interrupts and see much better latency by staying in non-mmap 54 * mode. We therefore set the default mmap_timeout such that a ping 55 * is just enough to keep the mmap disabled. Users can experiment with 56 * other options with the x-intx-mmap-timeout-ms parameter (a value of 57 * zero disables the timer). 58 */ 59 static void vfio_intx_mmap_enable(void *opaque) 60 { 61 VFIOPCIDevice *vdev = opaque; 62 63 if (vdev->intx.pending) { 64 timer_mod(vdev->intx.mmap_timer, 65 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout); 66 return; 67 } 68 69 vfio_mmap_set_enabled(vdev, true); 70 } 71 72 static void vfio_intx_interrupt(void *opaque) 73 { 74 VFIOPCIDevice *vdev = opaque; 75 76 if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) { 77 return; 78 } 79 80 trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin); 81 82 vdev->intx.pending = true; 83 pci_irq_assert(&vdev->pdev); 84 vfio_mmap_set_enabled(vdev, false); 85 if (vdev->intx.mmap_timeout) { 86 timer_mod(vdev->intx.mmap_timer, 87 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout); 88 } 89 } 90 91 static void vfio_intx_eoi(VFIODevice *vbasedev) 92 { 93 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 94 95 if (!vdev->intx.pending) { 96 return; 97 } 98 99 trace_vfio_intx_eoi(vbasedev->name); 100 101 vdev->intx.pending = false; 102 pci_irq_deassert(&vdev->pdev); 103 vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); 104 } 105 106 static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev) 107 { 108 #ifdef CONFIG_KVM 109 struct kvm_irqfd irqfd = { 110 .fd = event_notifier_get_fd(&vdev->intx.interrupt), 111 .gsi = vdev->intx.route.irq, 112 .flags = KVM_IRQFD_FLAG_RESAMPLE, 113 }; 114 struct vfio_irq_set *irq_set; 115 int ret, argsz; 116 int32_t *pfd; 117 118 if (vdev->no_kvm_intx || !kvm_irqfds_enabled() || 119 vdev->intx.route.mode != PCI_INTX_ENABLED || 120 !kvm_resamplefds_enabled()) { 121 return; 122 } 123 124 /* Get to a known interrupt state */ 125 qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev); 126 vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); 127 vdev->intx.pending = false; 128 pci_irq_deassert(&vdev->pdev); 129 130 /* Get an eventfd for resample/unmask */ 131 if (event_notifier_init(&vdev->intx.unmask, 0)) { 132 error_report("vfio: Error: event_notifier_init failed eoi"); 133 goto fail; 134 } 135 136 /* KVM triggers it, VFIO listens for it */ 137 irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask); 138 139 if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) { 140 error_report("vfio: Error: Failed to setup resample irqfd: %m"); 141 goto fail_irqfd; 142 } 143 144 argsz = sizeof(*irq_set) + sizeof(*pfd); 145 146 irq_set = g_malloc0(argsz); 147 irq_set->argsz = argsz; 148 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK; 149 irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; 150 irq_set->start = 0; 151 irq_set->count = 1; 152 pfd = (int32_t *)&irq_set->data; 153 154 *pfd = irqfd.resamplefd; 155 156 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 157 g_free(irq_set); 158 if (ret) { 159 error_report("vfio: Error: Failed to setup INTx unmask fd: %m"); 160 goto fail_vfio; 161 } 162 163 /* Let'em rip */ 164 vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); 165 166 vdev->intx.kvm_accel = true; 167 168 trace_vfio_intx_enable_kvm(vdev->vbasedev.name); 169 170 return; 171 172 fail_vfio: 173 irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN; 174 kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd); 175 fail_irqfd: 176 event_notifier_cleanup(&vdev->intx.unmask); 177 fail: 178 qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev); 179 vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); 180 #endif 181 } 182 183 static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) 184 { 185 #ifdef CONFIG_KVM 186 struct kvm_irqfd irqfd = { 187 .fd = event_notifier_get_fd(&vdev->intx.interrupt), 188 .gsi = vdev->intx.route.irq, 189 .flags = KVM_IRQFD_FLAG_DEASSIGN, 190 }; 191 192 if (!vdev->intx.kvm_accel) { 193 return; 194 } 195 196 /* 197 * Get to a known state, hardware masked, QEMU ready to accept new 198 * interrupts, QEMU IRQ de-asserted. 199 */ 200 vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); 201 vdev->intx.pending = false; 202 pci_irq_deassert(&vdev->pdev); 203 204 /* Tell KVM to stop listening for an INTx irqfd */ 205 if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) { 206 error_report("vfio: Error: Failed to disable INTx irqfd: %m"); 207 } 208 209 /* We only need to close the eventfd for VFIO to cleanup the kernel side */ 210 event_notifier_cleanup(&vdev->intx.unmask); 211 212 /* QEMU starts listening for interrupt events. */ 213 qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev); 214 215 vdev->intx.kvm_accel = false; 216 217 /* If we've missed an event, let it re-fire through QEMU */ 218 vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); 219 220 trace_vfio_intx_disable_kvm(vdev->vbasedev.name); 221 #endif 222 } 223 224 static void vfio_intx_update(PCIDevice *pdev) 225 { 226 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 227 PCIINTxRoute route; 228 229 if (vdev->interrupt != VFIO_INT_INTx) { 230 return; 231 } 232 233 route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin); 234 235 if (!pci_intx_route_changed(&vdev->intx.route, &route)) { 236 return; /* Nothing changed */ 237 } 238 239 trace_vfio_intx_update(vdev->vbasedev.name, 240 vdev->intx.route.irq, route.irq); 241 242 vfio_intx_disable_kvm(vdev); 243 244 vdev->intx.route = route; 245 246 if (route.mode != PCI_INTX_ENABLED) { 247 return; 248 } 249 250 vfio_intx_enable_kvm(vdev); 251 252 /* Re-enable the interrupt in cased we missed an EOI */ 253 vfio_intx_eoi(&vdev->vbasedev); 254 } 255 256 static int vfio_intx_enable(VFIOPCIDevice *vdev) 257 { 258 uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); 259 int ret, argsz; 260 struct vfio_irq_set *irq_set; 261 int32_t *pfd; 262 263 if (!pin) { 264 return 0; 265 } 266 267 vfio_disable_interrupts(vdev); 268 269 vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ 270 pci_config_set_interrupt_pin(vdev->pdev.config, pin); 271 272 #ifdef CONFIG_KVM 273 /* 274 * Only conditional to avoid generating error messages on platforms 275 * where we won't actually use the result anyway. 276 */ 277 if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) { 278 vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev, 279 vdev->intx.pin); 280 } 281 #endif 282 283 ret = event_notifier_init(&vdev->intx.interrupt, 0); 284 if (ret) { 285 error_report("vfio: Error: event_notifier_init failed"); 286 return ret; 287 } 288 289 argsz = sizeof(*irq_set) + sizeof(*pfd); 290 291 irq_set = g_malloc0(argsz); 292 irq_set->argsz = argsz; 293 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; 294 irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; 295 irq_set->start = 0; 296 irq_set->count = 1; 297 pfd = (int32_t *)&irq_set->data; 298 299 *pfd = event_notifier_get_fd(&vdev->intx.interrupt); 300 qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev); 301 302 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 303 g_free(irq_set); 304 if (ret) { 305 error_report("vfio: Error: Failed to setup INTx fd: %m"); 306 qemu_set_fd_handler(*pfd, NULL, NULL, vdev); 307 event_notifier_cleanup(&vdev->intx.interrupt); 308 return -errno; 309 } 310 311 vfio_intx_enable_kvm(vdev); 312 313 vdev->interrupt = VFIO_INT_INTx; 314 315 trace_vfio_intx_enable(vdev->vbasedev.name); 316 317 return 0; 318 } 319 320 static void vfio_intx_disable(VFIOPCIDevice *vdev) 321 { 322 int fd; 323 324 timer_del(vdev->intx.mmap_timer); 325 vfio_intx_disable_kvm(vdev); 326 vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); 327 vdev->intx.pending = false; 328 pci_irq_deassert(&vdev->pdev); 329 vfio_mmap_set_enabled(vdev, true); 330 331 fd = event_notifier_get_fd(&vdev->intx.interrupt); 332 qemu_set_fd_handler(fd, NULL, NULL, vdev); 333 event_notifier_cleanup(&vdev->intx.interrupt); 334 335 vdev->interrupt = VFIO_INT_NONE; 336 337 trace_vfio_intx_disable(vdev->vbasedev.name); 338 } 339 340 /* 341 * MSI/X 342 */ 343 static void vfio_msi_interrupt(void *opaque) 344 { 345 VFIOMSIVector *vector = opaque; 346 VFIOPCIDevice *vdev = vector->vdev; 347 MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector); 348 void (*notify)(PCIDevice *dev, unsigned vector); 349 MSIMessage msg; 350 int nr = vector - vdev->msi_vectors; 351 352 if (!event_notifier_test_and_clear(&vector->interrupt)) { 353 return; 354 } 355 356 if (vdev->interrupt == VFIO_INT_MSIX) { 357 get_msg = msix_get_message; 358 notify = msix_notify; 359 } else if (vdev->interrupt == VFIO_INT_MSI) { 360 get_msg = msi_get_message; 361 notify = msi_notify; 362 } else { 363 abort(); 364 } 365 366 msg = get_msg(&vdev->pdev, nr); 367 trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data); 368 notify(&vdev->pdev, nr); 369 } 370 371 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) 372 { 373 struct vfio_irq_set *irq_set; 374 int ret = 0, i, argsz; 375 int32_t *fds; 376 377 argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds)); 378 379 irq_set = g_malloc0(argsz); 380 irq_set->argsz = argsz; 381 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; 382 irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX; 383 irq_set->start = 0; 384 irq_set->count = vdev->nr_vectors; 385 fds = (int32_t *)&irq_set->data; 386 387 for (i = 0; i < vdev->nr_vectors; i++) { 388 int fd = -1; 389 390 /* 391 * MSI vs MSI-X - The guest has direct access to MSI mask and pending 392 * bits, therefore we always use the KVM signaling path when setup. 393 * MSI-X mask and pending bits are emulated, so we want to use the 394 * KVM signaling path only when configured and unmasked. 395 */ 396 if (vdev->msi_vectors[i].use) { 397 if (vdev->msi_vectors[i].virq < 0 || 398 (msix && msix_is_masked(&vdev->pdev, i))) { 399 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); 400 } else { 401 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt); 402 } 403 } 404 405 fds[i] = fd; 406 } 407 408 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 409 410 g_free(irq_set); 411 412 return ret; 413 } 414 415 static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, 416 MSIMessage *msg, bool msix) 417 { 418 int virq; 419 420 if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi) || !msg) { 421 return; 422 } 423 424 if (event_notifier_init(&vector->kvm_interrupt, 0)) { 425 return; 426 } 427 428 virq = kvm_irqchip_add_msi_route(kvm_state, *msg, &vdev->pdev); 429 if (virq < 0) { 430 event_notifier_cleanup(&vector->kvm_interrupt); 431 return; 432 } 433 434 if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, 435 NULL, virq) < 0) { 436 kvm_irqchip_release_virq(kvm_state, virq); 437 event_notifier_cleanup(&vector->kvm_interrupt); 438 return; 439 } 440 441 vector->virq = virq; 442 } 443 444 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) 445 { 446 kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, 447 vector->virq); 448 kvm_irqchip_release_virq(kvm_state, vector->virq); 449 vector->virq = -1; 450 event_notifier_cleanup(&vector->kvm_interrupt); 451 } 452 453 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, 454 PCIDevice *pdev) 455 { 456 kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev); 457 } 458 459 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, 460 MSIMessage *msg, IOHandler *handler) 461 { 462 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 463 VFIOMSIVector *vector; 464 int ret; 465 466 trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr); 467 468 vector = &vdev->msi_vectors[nr]; 469 470 if (!vector->use) { 471 vector->vdev = vdev; 472 vector->virq = -1; 473 if (event_notifier_init(&vector->interrupt, 0)) { 474 error_report("vfio: Error: event_notifier_init failed"); 475 } 476 vector->use = true; 477 msix_vector_use(pdev, nr); 478 } 479 480 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), 481 handler, NULL, vector); 482 483 /* 484 * Attempt to enable route through KVM irqchip, 485 * default to userspace handling if unavailable. 486 */ 487 if (vector->virq >= 0) { 488 if (!msg) { 489 vfio_remove_kvm_msi_virq(vector); 490 } else { 491 vfio_update_kvm_msi_virq(vector, *msg, pdev); 492 } 493 } else { 494 vfio_add_kvm_msi_virq(vdev, vector, msg, true); 495 } 496 497 /* 498 * We don't want to have the host allocate all possible MSI vectors 499 * for a device if they're not in use, so we shutdown and incrementally 500 * increase them as needed. 501 */ 502 if (vdev->nr_vectors < nr + 1) { 503 vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); 504 vdev->nr_vectors = nr + 1; 505 ret = vfio_enable_vectors(vdev, true); 506 if (ret) { 507 error_report("vfio: failed to enable vectors, %d", ret); 508 } 509 } else { 510 int argsz; 511 struct vfio_irq_set *irq_set; 512 int32_t *pfd; 513 514 argsz = sizeof(*irq_set) + sizeof(*pfd); 515 516 irq_set = g_malloc0(argsz); 517 irq_set->argsz = argsz; 518 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 519 VFIO_IRQ_SET_ACTION_TRIGGER; 520 irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; 521 irq_set->start = nr; 522 irq_set->count = 1; 523 pfd = (int32_t *)&irq_set->data; 524 525 if (vector->virq >= 0) { 526 *pfd = event_notifier_get_fd(&vector->kvm_interrupt); 527 } else { 528 *pfd = event_notifier_get_fd(&vector->interrupt); 529 } 530 531 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 532 g_free(irq_set); 533 if (ret) { 534 error_report("vfio: failed to modify vector, %d", ret); 535 } 536 } 537 538 return 0; 539 } 540 541 static int vfio_msix_vector_use(PCIDevice *pdev, 542 unsigned int nr, MSIMessage msg) 543 { 544 return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt); 545 } 546 547 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) 548 { 549 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 550 VFIOMSIVector *vector = &vdev->msi_vectors[nr]; 551 552 trace_vfio_msix_vector_release(vdev->vbasedev.name, nr); 553 554 /* 555 * There are still old guests that mask and unmask vectors on every 556 * interrupt. If we're using QEMU bypass with a KVM irqfd, leave all of 557 * the KVM setup in place, simply switch VFIO to use the non-bypass 558 * eventfd. We'll then fire the interrupt through QEMU and the MSI-X 559 * core will mask the interrupt and set pending bits, allowing it to 560 * be re-asserted on unmask. Nothing to do if already using QEMU mode. 561 */ 562 if (vector->virq >= 0) { 563 int argsz; 564 struct vfio_irq_set *irq_set; 565 int32_t *pfd; 566 567 argsz = sizeof(*irq_set) + sizeof(*pfd); 568 569 irq_set = g_malloc0(argsz); 570 irq_set->argsz = argsz; 571 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 572 VFIO_IRQ_SET_ACTION_TRIGGER; 573 irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; 574 irq_set->start = nr; 575 irq_set->count = 1; 576 pfd = (int32_t *)&irq_set->data; 577 578 *pfd = event_notifier_get_fd(&vector->interrupt); 579 580 ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 581 582 g_free(irq_set); 583 } 584 } 585 586 static void vfio_msix_enable(VFIOPCIDevice *vdev) 587 { 588 vfio_disable_interrupts(vdev); 589 590 vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries); 591 592 vdev->interrupt = VFIO_INT_MSIX; 593 594 /* 595 * Some communication channels between VF & PF or PF & fw rely on the 596 * physical state of the device and expect that enabling MSI-X from the 597 * guest enables the same on the host. When our guest is Linux, the 598 * guest driver call to pci_enable_msix() sets the enabling bit in the 599 * MSI-X capability, but leaves the vector table masked. We therefore 600 * can't rely on a vector_use callback (from request_irq() in the guest) 601 * to switch the physical device into MSI-X mode because that may come a 602 * long time after pci_enable_msix(). This code enables vector 0 with 603 * triggering to userspace, then immediately release the vector, leaving 604 * the physical device with no vectors enabled, but MSI-X enabled, just 605 * like the guest view. 606 */ 607 vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL); 608 vfio_msix_vector_release(&vdev->pdev, 0); 609 610 if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, 611 vfio_msix_vector_release, NULL)) { 612 error_report("vfio: msix_set_vector_notifiers failed"); 613 } 614 615 trace_vfio_msix_enable(vdev->vbasedev.name); 616 } 617 618 static void vfio_msi_enable(VFIOPCIDevice *vdev) 619 { 620 int ret, i; 621 622 vfio_disable_interrupts(vdev); 623 624 vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); 625 retry: 626 vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); 627 628 for (i = 0; i < vdev->nr_vectors; i++) { 629 VFIOMSIVector *vector = &vdev->msi_vectors[i]; 630 MSIMessage msg = msi_get_message(&vdev->pdev, i); 631 632 vector->vdev = vdev; 633 vector->virq = -1; 634 vector->use = true; 635 636 if (event_notifier_init(&vector->interrupt, 0)) { 637 error_report("vfio: Error: event_notifier_init failed"); 638 } 639 640 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), 641 vfio_msi_interrupt, NULL, vector); 642 643 /* 644 * Attempt to enable route through KVM irqchip, 645 * default to userspace handling if unavailable. 646 */ 647 vfio_add_kvm_msi_virq(vdev, vector, &msg, false); 648 } 649 650 /* Set interrupt type prior to possible interrupts */ 651 vdev->interrupt = VFIO_INT_MSI; 652 653 ret = vfio_enable_vectors(vdev, false); 654 if (ret) { 655 if (ret < 0) { 656 error_report("vfio: Error: Failed to setup MSI fds: %m"); 657 } else if (ret != vdev->nr_vectors) { 658 error_report("vfio: Error: Failed to enable %d " 659 "MSI vectors, retry with %d", vdev->nr_vectors, ret); 660 } 661 662 for (i = 0; i < vdev->nr_vectors; i++) { 663 VFIOMSIVector *vector = &vdev->msi_vectors[i]; 664 if (vector->virq >= 0) { 665 vfio_remove_kvm_msi_virq(vector); 666 } 667 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), 668 NULL, NULL, NULL); 669 event_notifier_cleanup(&vector->interrupt); 670 } 671 672 g_free(vdev->msi_vectors); 673 674 if (ret > 0 && ret != vdev->nr_vectors) { 675 vdev->nr_vectors = ret; 676 goto retry; 677 } 678 vdev->nr_vectors = 0; 679 680 /* 681 * Failing to setup MSI doesn't really fall within any specification. 682 * Let's try leaving interrupts disabled and hope the guest figures 683 * out to fall back to INTx for this device. 684 */ 685 error_report("vfio: Error: Failed to enable MSI"); 686 vdev->interrupt = VFIO_INT_NONE; 687 688 return; 689 } 690 691 trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors); 692 } 693 694 static void vfio_msi_disable_common(VFIOPCIDevice *vdev) 695 { 696 int i; 697 698 for (i = 0; i < vdev->nr_vectors; i++) { 699 VFIOMSIVector *vector = &vdev->msi_vectors[i]; 700 if (vdev->msi_vectors[i].use) { 701 if (vector->virq >= 0) { 702 vfio_remove_kvm_msi_virq(vector); 703 } 704 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), 705 NULL, NULL, NULL); 706 event_notifier_cleanup(&vector->interrupt); 707 } 708 } 709 710 g_free(vdev->msi_vectors); 711 vdev->msi_vectors = NULL; 712 vdev->nr_vectors = 0; 713 vdev->interrupt = VFIO_INT_NONE; 714 715 vfio_intx_enable(vdev); 716 } 717 718 static void vfio_msix_disable(VFIOPCIDevice *vdev) 719 { 720 int i; 721 722 msix_unset_vector_notifiers(&vdev->pdev); 723 724 /* 725 * MSI-X will only release vectors if MSI-X is still enabled on the 726 * device, check through the rest and release it ourselves if necessary. 727 */ 728 for (i = 0; i < vdev->nr_vectors; i++) { 729 if (vdev->msi_vectors[i].use) { 730 vfio_msix_vector_release(&vdev->pdev, i); 731 msix_vector_unuse(&vdev->pdev, i); 732 } 733 } 734 735 if (vdev->nr_vectors) { 736 vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); 737 } 738 739 vfio_msi_disable_common(vdev); 740 741 trace_vfio_msix_disable(vdev->vbasedev.name); 742 } 743 744 static void vfio_msi_disable(VFIOPCIDevice *vdev) 745 { 746 vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); 747 vfio_msi_disable_common(vdev); 748 749 trace_vfio_msi_disable(vdev->vbasedev.name); 750 } 751 752 static void vfio_update_msi(VFIOPCIDevice *vdev) 753 { 754 int i; 755 756 for (i = 0; i < vdev->nr_vectors; i++) { 757 VFIOMSIVector *vector = &vdev->msi_vectors[i]; 758 MSIMessage msg; 759 760 if (!vector->use || vector->virq < 0) { 761 continue; 762 } 763 764 msg = msi_get_message(&vdev->pdev, i); 765 vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev); 766 } 767 } 768 769 static void vfio_pci_load_rom(VFIOPCIDevice *vdev) 770 { 771 struct vfio_region_info reg_info = { 772 .argsz = sizeof(reg_info), 773 .index = VFIO_PCI_ROM_REGION_INDEX 774 }; 775 uint64_t size; 776 off_t off = 0; 777 ssize_t bytes; 778 779 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info)) { 780 error_report("vfio: Error getting ROM info: %m"); 781 return; 782 } 783 784 trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info.size, 785 (unsigned long)reg_info.offset, 786 (unsigned long)reg_info.flags); 787 788 vdev->rom_size = size = reg_info.size; 789 vdev->rom_offset = reg_info.offset; 790 791 if (!vdev->rom_size) { 792 vdev->rom_read_failed = true; 793 error_report("vfio-pci: Cannot read device rom at " 794 "%s", vdev->vbasedev.name); 795 error_printf("Device option ROM contents are probably invalid " 796 "(check dmesg).\nSkip option ROM probe with rombar=0, " 797 "or load from file with romfile=\n"); 798 return; 799 } 800 801 vdev->rom = g_malloc(size); 802 memset(vdev->rom, 0xff, size); 803 804 while (size) { 805 bytes = pread(vdev->vbasedev.fd, vdev->rom + off, 806 size, vdev->rom_offset + off); 807 if (bytes == 0) { 808 break; 809 } else if (bytes > 0) { 810 off += bytes; 811 size -= bytes; 812 } else { 813 if (errno == EINTR || errno == EAGAIN) { 814 continue; 815 } 816 error_report("vfio: Error reading device ROM: %m"); 817 break; 818 } 819 } 820 } 821 822 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) 823 { 824 VFIOPCIDevice *vdev = opaque; 825 union { 826 uint8_t byte; 827 uint16_t word; 828 uint32_t dword; 829 uint64_t qword; 830 } val; 831 uint64_t data = 0; 832 833 /* Load the ROM lazily when the guest tries to read it */ 834 if (unlikely(!vdev->rom && !vdev->rom_read_failed)) { 835 vfio_pci_load_rom(vdev); 836 } 837 838 memcpy(&val, vdev->rom + addr, 839 (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0); 840 841 switch (size) { 842 case 1: 843 data = val.byte; 844 break; 845 case 2: 846 data = le16_to_cpu(val.word); 847 break; 848 case 4: 849 data = le32_to_cpu(val.dword); 850 break; 851 default: 852 hw_error("vfio: unsupported read size, %d bytes\n", size); 853 break; 854 } 855 856 trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data); 857 858 return data; 859 } 860 861 static void vfio_rom_write(void *opaque, hwaddr addr, 862 uint64_t data, unsigned size) 863 { 864 } 865 866 static const MemoryRegionOps vfio_rom_ops = { 867 .read = vfio_rom_read, 868 .write = vfio_rom_write, 869 .endianness = DEVICE_LITTLE_ENDIAN, 870 }; 871 872 static void vfio_pci_size_rom(VFIOPCIDevice *vdev) 873 { 874 uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); 875 off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; 876 DeviceState *dev = DEVICE(vdev); 877 char name[32]; 878 int fd = vdev->vbasedev.fd; 879 880 if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { 881 /* Since pci handles romfile, just print a message and return */ 882 if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) { 883 error_printf("Warning : Device at %04x:%02x:%02x.%x " 884 "is known to cause system instability issues during " 885 "option rom execution. " 886 "Proceeding anyway since user specified romfile\n", 887 vdev->host.domain, vdev->host.bus, vdev->host.slot, 888 vdev->host.function); 889 } 890 return; 891 } 892 893 /* 894 * Use the same size ROM BAR as the physical device. The contents 895 * will get filled in later when the guest tries to read it. 896 */ 897 if (pread(fd, &orig, 4, offset) != 4 || 898 pwrite(fd, &size, 4, offset) != 4 || 899 pread(fd, &size, 4, offset) != 4 || 900 pwrite(fd, &orig, 4, offset) != 4) { 901 error_report("%s(%04x:%02x:%02x.%x) failed: %m", 902 __func__, vdev->host.domain, vdev->host.bus, 903 vdev->host.slot, vdev->host.function); 904 return; 905 } 906 907 size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1; 908 909 if (!size) { 910 return; 911 } 912 913 if (vfio_blacklist_opt_rom(vdev)) { 914 if (dev->opts && qemu_opt_get(dev->opts, "rombar")) { 915 error_printf("Warning : Device at %04x:%02x:%02x.%x " 916 "is known to cause system instability issues during " 917 "option rom execution. " 918 "Proceeding anyway since user specified non zero value for " 919 "rombar\n", 920 vdev->host.domain, vdev->host.bus, vdev->host.slot, 921 vdev->host.function); 922 } else { 923 error_printf("Warning : Rom loading for device at " 924 "%04x:%02x:%02x.%x has been disabled due to " 925 "system instability issues. " 926 "Specify rombar=1 or romfile to force\n", 927 vdev->host.domain, vdev->host.bus, vdev->host.slot, 928 vdev->host.function); 929 return; 930 } 931 } 932 933 trace_vfio_pci_size_rom(vdev->vbasedev.name, size); 934 935 snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom", 936 vdev->host.domain, vdev->host.bus, vdev->host.slot, 937 vdev->host.function); 938 939 memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev), 940 &vfio_rom_ops, vdev, name, size); 941 942 pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 943 PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom); 944 945 vdev->pdev.has_rom = true; 946 vdev->rom_read_failed = false; 947 } 948 949 void vfio_vga_write(void *opaque, hwaddr addr, 950 uint64_t data, unsigned size) 951 { 952 VFIOVGARegion *region = opaque; 953 VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]); 954 union { 955 uint8_t byte; 956 uint16_t word; 957 uint32_t dword; 958 uint64_t qword; 959 } buf; 960 off_t offset = vga->fd_offset + region->offset + addr; 961 962 switch (size) { 963 case 1: 964 buf.byte = data; 965 break; 966 case 2: 967 buf.word = cpu_to_le16(data); 968 break; 969 case 4: 970 buf.dword = cpu_to_le32(data); 971 break; 972 default: 973 hw_error("vfio: unsupported write size, %d bytes", size); 974 break; 975 } 976 977 if (pwrite(vga->fd, &buf, size, offset) != size) { 978 error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", 979 __func__, region->offset + addr, data, size); 980 } 981 982 trace_vfio_vga_write(region->offset + addr, data, size); 983 } 984 985 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size) 986 { 987 VFIOVGARegion *region = opaque; 988 VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]); 989 union { 990 uint8_t byte; 991 uint16_t word; 992 uint32_t dword; 993 uint64_t qword; 994 } buf; 995 uint64_t data = 0; 996 off_t offset = vga->fd_offset + region->offset + addr; 997 998 if (pread(vga->fd, &buf, size, offset) != size) { 999 error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m", 1000 __func__, region->offset + addr, size); 1001 return (uint64_t)-1; 1002 } 1003 1004 switch (size) { 1005 case 1: 1006 data = buf.byte; 1007 break; 1008 case 2: 1009 data = le16_to_cpu(buf.word); 1010 break; 1011 case 4: 1012 data = le32_to_cpu(buf.dword); 1013 break; 1014 default: 1015 hw_error("vfio: unsupported read size, %d bytes", size); 1016 break; 1017 } 1018 1019 trace_vfio_vga_read(region->offset + addr, size, data); 1020 1021 return data; 1022 } 1023 1024 static const MemoryRegionOps vfio_vga_ops = { 1025 .read = vfio_vga_read, 1026 .write = vfio_vga_write, 1027 .endianness = DEVICE_LITTLE_ENDIAN, 1028 }; 1029 1030 /* 1031 * PCI config space 1032 */ 1033 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) 1034 { 1035 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 1036 uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; 1037 1038 memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); 1039 emu_bits = le32_to_cpu(emu_bits); 1040 1041 if (emu_bits) { 1042 emu_val = pci_default_read_config(pdev, addr, len); 1043 } 1044 1045 if (~emu_bits & (0xffffffffU >> (32 - len * 8))) { 1046 ssize_t ret; 1047 1048 ret = pread(vdev->vbasedev.fd, &phys_val, len, 1049 vdev->config_offset + addr); 1050 if (ret != len) { 1051 error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m", 1052 __func__, vdev->host.domain, vdev->host.bus, 1053 vdev->host.slot, vdev->host.function, addr, len); 1054 return -errno; 1055 } 1056 phys_val = le32_to_cpu(phys_val); 1057 } 1058 1059 val = (emu_val & emu_bits) | (phys_val & ~emu_bits); 1060 1061 trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val); 1062 1063 return val; 1064 } 1065 1066 void vfio_pci_write_config(PCIDevice *pdev, 1067 uint32_t addr, uint32_t val, int len) 1068 { 1069 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 1070 uint32_t val_le = cpu_to_le32(val); 1071 1072 trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); 1073 1074 /* Write everything to VFIO, let it filter out what we can't write */ 1075 if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) 1076 != len) { 1077 error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m", 1078 __func__, vdev->host.domain, vdev->host.bus, 1079 vdev->host.slot, vdev->host.function, addr, val, len); 1080 } 1081 1082 /* MSI/MSI-X Enabling/Disabling */ 1083 if (pdev->cap_present & QEMU_PCI_CAP_MSI && 1084 ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) { 1085 int is_enabled, was_enabled = msi_enabled(pdev); 1086 1087 pci_default_write_config(pdev, addr, val, len); 1088 1089 is_enabled = msi_enabled(pdev); 1090 1091 if (!was_enabled) { 1092 if (is_enabled) { 1093 vfio_msi_enable(vdev); 1094 } 1095 } else { 1096 if (!is_enabled) { 1097 vfio_msi_disable(vdev); 1098 } else { 1099 vfio_update_msi(vdev); 1100 } 1101 } 1102 } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX && 1103 ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) { 1104 int is_enabled, was_enabled = msix_enabled(pdev); 1105 1106 pci_default_write_config(pdev, addr, val, len); 1107 1108 is_enabled = msix_enabled(pdev); 1109 1110 if (!was_enabled && is_enabled) { 1111 vfio_msix_enable(vdev); 1112 } else if (was_enabled && !is_enabled) { 1113 vfio_msix_disable(vdev); 1114 } 1115 } else { 1116 /* Write everything to QEMU to keep emulated bits correct */ 1117 pci_default_write_config(pdev, addr, val, len); 1118 } 1119 } 1120 1121 /* 1122 * Interrupt setup 1123 */ 1124 static void vfio_disable_interrupts(VFIOPCIDevice *vdev) 1125 { 1126 /* 1127 * More complicated than it looks. Disabling MSI/X transitions the 1128 * device to INTx mode (if supported). Therefore we need to first 1129 * disable MSI/X and then cleanup by disabling INTx. 1130 */ 1131 if (vdev->interrupt == VFIO_INT_MSIX) { 1132 vfio_msix_disable(vdev); 1133 } else if (vdev->interrupt == VFIO_INT_MSI) { 1134 vfio_msi_disable(vdev); 1135 } 1136 1137 if (vdev->interrupt == VFIO_INT_INTx) { 1138 vfio_intx_disable(vdev); 1139 } 1140 } 1141 1142 static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos) 1143 { 1144 uint16_t ctrl; 1145 bool msi_64bit, msi_maskbit; 1146 int ret, entries; 1147 1148 if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), 1149 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { 1150 return -errno; 1151 } 1152 ctrl = le16_to_cpu(ctrl); 1153 1154 msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT); 1155 msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT); 1156 entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1); 1157 1158 trace_vfio_msi_setup(vdev->vbasedev.name, pos); 1159 1160 ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit); 1161 if (ret < 0) { 1162 if (ret == -ENOTSUP) { 1163 return 0; 1164 } 1165 error_report("vfio: msi_init failed"); 1166 return ret; 1167 } 1168 vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0); 1169 1170 return 0; 1171 } 1172 1173 /* 1174 * We don't have any control over how pci_add_capability() inserts 1175 * capabilities into the chain. In order to setup MSI-X we need a 1176 * MemoryRegion for the BAR. In order to setup the BAR and not 1177 * attempt to mmap the MSI-X table area, which VFIO won't allow, we 1178 * need to first look for where the MSI-X table lives. So we 1179 * unfortunately split MSI-X setup across two functions. 1180 */ 1181 static int vfio_msix_early_setup(VFIOPCIDevice *vdev) 1182 { 1183 uint8_t pos; 1184 uint16_t ctrl; 1185 uint32_t table, pba; 1186 int fd = vdev->vbasedev.fd; 1187 VFIOMSIXInfo *msix; 1188 1189 pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); 1190 if (!pos) { 1191 return 0; 1192 } 1193 1194 if (pread(fd, &ctrl, sizeof(ctrl), 1195 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { 1196 return -errno; 1197 } 1198 1199 if (pread(fd, &table, sizeof(table), 1200 vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { 1201 return -errno; 1202 } 1203 1204 if (pread(fd, &pba, sizeof(pba), 1205 vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { 1206 return -errno; 1207 } 1208 1209 ctrl = le16_to_cpu(ctrl); 1210 table = le32_to_cpu(table); 1211 pba = le32_to_cpu(pba); 1212 1213 msix = g_malloc0(sizeof(*msix)); 1214 msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK; 1215 msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; 1216 msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK; 1217 msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; 1218 msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; 1219 1220 /* 1221 * Test the size of the pba_offset variable and catch if it extends outside 1222 * of the specified BAR. If it is the case, we need to apply a hardware 1223 * specific quirk if the device is known or we have a broken configuration. 1224 */ 1225 if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) { 1226 /* 1227 * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5 1228 * adapters. The T5 hardware returns an incorrect value of 0x8000 for 1229 * the VF PBA offset while the BAR itself is only 8k. The correct value 1230 * is 0x1000, so we hard code that here. 1231 */ 1232 if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO && 1233 (vdev->device_id & 0xff00) == 0x5800) { 1234 msix->pba_offset = 0x1000; 1235 } else { 1236 error_report("vfio: Hardware reports invalid configuration, " 1237 "MSIX PBA outside of specified BAR"); 1238 g_free(msix); 1239 return -EINVAL; 1240 } 1241 } 1242 1243 trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar, 1244 msix->table_offset, msix->entries); 1245 vdev->msix = msix; 1246 1247 return 0; 1248 } 1249 1250 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos) 1251 { 1252 int ret; 1253 1254 ret = msix_init(&vdev->pdev, vdev->msix->entries, 1255 &vdev->bars[vdev->msix->table_bar].region.mem, 1256 vdev->msix->table_bar, vdev->msix->table_offset, 1257 &vdev->bars[vdev->msix->pba_bar].region.mem, 1258 vdev->msix->pba_bar, vdev->msix->pba_offset, pos); 1259 if (ret < 0) { 1260 if (ret == -ENOTSUP) { 1261 return 0; 1262 } 1263 error_report("vfio: msix_init failed"); 1264 return ret; 1265 } 1266 1267 return 0; 1268 } 1269 1270 static void vfio_teardown_msi(VFIOPCIDevice *vdev) 1271 { 1272 msi_uninit(&vdev->pdev); 1273 1274 if (vdev->msix) { 1275 msix_uninit(&vdev->pdev, 1276 &vdev->bars[vdev->msix->table_bar].region.mem, 1277 &vdev->bars[vdev->msix->pba_bar].region.mem); 1278 } 1279 } 1280 1281 /* 1282 * Resource setup 1283 */ 1284 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled) 1285 { 1286 int i; 1287 1288 for (i = 0; i < PCI_ROM_SLOT; i++) { 1289 VFIOBAR *bar = &vdev->bars[i]; 1290 1291 if (!bar->region.size) { 1292 continue; 1293 } 1294 1295 memory_region_set_enabled(&bar->region.mmap_mem, enabled); 1296 if (vdev->msix && vdev->msix->table_bar == i) { 1297 memory_region_set_enabled(&vdev->msix->mmap_mem, enabled); 1298 } 1299 } 1300 } 1301 1302 static void vfio_unregister_bar(VFIOPCIDevice *vdev, int nr) 1303 { 1304 VFIOBAR *bar = &vdev->bars[nr]; 1305 1306 if (!bar->region.size) { 1307 return; 1308 } 1309 1310 vfio_bar_quirk_teardown(vdev, nr); 1311 1312 memory_region_del_subregion(&bar->region.mem, &bar->region.mmap_mem); 1313 1314 if (vdev->msix && vdev->msix->table_bar == nr) { 1315 memory_region_del_subregion(&bar->region.mem, &vdev->msix->mmap_mem); 1316 } 1317 } 1318 1319 static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr) 1320 { 1321 VFIOBAR *bar = &vdev->bars[nr]; 1322 1323 if (!bar->region.size) { 1324 return; 1325 } 1326 1327 vfio_bar_quirk_free(vdev, nr); 1328 1329 munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem)); 1330 1331 if (vdev->msix && vdev->msix->table_bar == nr) { 1332 munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem)); 1333 } 1334 } 1335 1336 static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) 1337 { 1338 VFIOBAR *bar = &vdev->bars[nr]; 1339 uint64_t size = bar->region.size; 1340 char name[64]; 1341 uint32_t pci_bar; 1342 uint8_t type; 1343 int ret; 1344 1345 /* Skip both unimplemented BARs and the upper half of 64bit BARS. */ 1346 if (!size) { 1347 return; 1348 } 1349 1350 snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d", 1351 vdev->host.domain, vdev->host.bus, vdev->host.slot, 1352 vdev->host.function, nr); 1353 1354 /* Determine what type of BAR this is for registration */ 1355 ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), 1356 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); 1357 if (ret != sizeof(pci_bar)) { 1358 error_report("vfio: Failed to read BAR %d (%m)", nr); 1359 return; 1360 } 1361 1362 pci_bar = le32_to_cpu(pci_bar); 1363 bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO); 1364 bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64); 1365 type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : 1366 ~PCI_BASE_ADDRESS_MEM_MASK); 1367 1368 /* A "slow" read/write mapping underlies all BARs */ 1369 memory_region_init_io(&bar->region.mem, OBJECT(vdev), &vfio_region_ops, 1370 bar, name, size); 1371 pci_register_bar(&vdev->pdev, nr, type, &bar->region.mem); 1372 1373 /* 1374 * We can't mmap areas overlapping the MSIX vector table, so we 1375 * potentially insert a direct-mapped subregion before and after it. 1376 */ 1377 if (vdev->msix && vdev->msix->table_bar == nr) { 1378 size = vdev->msix->table_offset & qemu_real_host_page_mask; 1379 } 1380 1381 strncat(name, " mmap", sizeof(name) - strlen(name) - 1); 1382 if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem, 1383 &bar->region.mmap_mem, &bar->region.mmap, 1384 size, 0, name)) { 1385 error_report("%s unsupported. Performance may be slow", name); 1386 } 1387 1388 if (vdev->msix && vdev->msix->table_bar == nr) { 1389 uint64_t start; 1390 1391 start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + 1392 (vdev->msix->entries * 1393 PCI_MSIX_ENTRY_SIZE)); 1394 1395 size = start < bar->region.size ? bar->region.size - start : 0; 1396 strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); 1397 /* VFIOMSIXInfo contains another MemoryRegion for this mapping */ 1398 if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem, 1399 &vdev->msix->mmap_mem, 1400 &vdev->msix->mmap, size, start, name)) { 1401 error_report("%s unsupported. Performance may be slow", name); 1402 } 1403 } 1404 1405 vfio_bar_quirk_setup(vdev, nr); 1406 } 1407 1408 static void vfio_map_bars(VFIOPCIDevice *vdev) 1409 { 1410 int i; 1411 1412 for (i = 0; i < PCI_ROM_SLOT; i++) { 1413 vfio_map_bar(vdev, i); 1414 } 1415 1416 if (vdev->has_vga) { 1417 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem, 1418 OBJECT(vdev), &vfio_vga_ops, 1419 &vdev->vga.region[QEMU_PCI_VGA_MEM], 1420 "vfio-vga-mmio@0xa0000", 1421 QEMU_PCI_VGA_MEM_SIZE); 1422 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem, 1423 OBJECT(vdev), &vfio_vga_ops, 1424 &vdev->vga.region[QEMU_PCI_VGA_IO_LO], 1425 "vfio-vga-io@0x3b0", 1426 QEMU_PCI_VGA_IO_LO_SIZE); 1427 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, 1428 OBJECT(vdev), &vfio_vga_ops, 1429 &vdev->vga.region[QEMU_PCI_VGA_IO_HI], 1430 "vfio-vga-io@0x3c0", 1431 QEMU_PCI_VGA_IO_HI_SIZE); 1432 1433 pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem, 1434 &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem, 1435 &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem); 1436 vfio_vga_quirk_setup(vdev); 1437 } 1438 } 1439 1440 static void vfio_unregister_bars(VFIOPCIDevice *vdev) 1441 { 1442 int i; 1443 1444 for (i = 0; i < PCI_ROM_SLOT; i++) { 1445 vfio_unregister_bar(vdev, i); 1446 } 1447 1448 if (vdev->has_vga) { 1449 vfio_vga_quirk_teardown(vdev); 1450 pci_unregister_vga(&vdev->pdev); 1451 } 1452 } 1453 1454 static void vfio_unmap_bars(VFIOPCIDevice *vdev) 1455 { 1456 int i; 1457 1458 for (i = 0; i < PCI_ROM_SLOT; i++) { 1459 vfio_unmap_bar(vdev, i); 1460 } 1461 1462 if (vdev->has_vga) { 1463 vfio_vga_quirk_free(vdev); 1464 } 1465 } 1466 1467 /* 1468 * General setup 1469 */ 1470 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos) 1471 { 1472 uint8_t tmp, next = 0xff; 1473 1474 for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp; 1475 tmp = pdev->config[tmp + 1]) { 1476 if (tmp > pos && tmp < next) { 1477 next = tmp; 1478 } 1479 } 1480 1481 return next - pos; 1482 } 1483 1484 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask) 1485 { 1486 pci_set_word(buf, (pci_get_word(buf) & ~mask) | val); 1487 } 1488 1489 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos, 1490 uint16_t val, uint16_t mask) 1491 { 1492 vfio_set_word_bits(vdev->pdev.config + pos, val, mask); 1493 vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask); 1494 vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask); 1495 } 1496 1497 static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask) 1498 { 1499 pci_set_long(buf, (pci_get_long(buf) & ~mask) | val); 1500 } 1501 1502 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos, 1503 uint32_t val, uint32_t mask) 1504 { 1505 vfio_set_long_bits(vdev->pdev.config + pos, val, mask); 1506 vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask); 1507 vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask); 1508 } 1509 1510 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size) 1511 { 1512 uint16_t flags; 1513 uint8_t type; 1514 1515 flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS); 1516 type = (flags & PCI_EXP_FLAGS_TYPE) >> 4; 1517 1518 if (type != PCI_EXP_TYPE_ENDPOINT && 1519 type != PCI_EXP_TYPE_LEG_END && 1520 type != PCI_EXP_TYPE_RC_END) { 1521 1522 error_report("vfio: Assignment of PCIe type 0x%x " 1523 "devices is not currently supported", type); 1524 return -EINVAL; 1525 } 1526 1527 if (!pci_bus_is_express(vdev->pdev.bus)) { 1528 PCIBus *bus = vdev->pdev.bus; 1529 PCIDevice *bridge; 1530 1531 /* 1532 * Traditionally PCI device assignment exposes the PCIe capability 1533 * as-is on non-express buses. The reason being that some drivers 1534 * simply assume that it's there, for example tg3. However when 1535 * we're running on a native PCIe machine type, like Q35, we need 1536 * to hide the PCIe capability. The reason for this is twofold; 1537 * first Windows guests get a Code 10 error when the PCIe capability 1538 * is exposed in this configuration. Therefore express devices won't 1539 * work at all unless they're attached to express buses in the VM. 1540 * Second, a native PCIe machine introduces the possibility of fine 1541 * granularity IOMMUs supporting both translation and isolation. 1542 * Guest code to discover the IOMMU visibility of a device, such as 1543 * IOMMU grouping code on Linux, is very aware of device types and 1544 * valid transitions between bus types. An express device on a non- 1545 * express bus is not a valid combination on bare metal systems. 1546 * 1547 * Drivers that require a PCIe capability to make the device 1548 * functional are simply going to need to have their devices placed 1549 * on a PCIe bus in the VM. 1550 */ 1551 while (!pci_bus_is_root(bus)) { 1552 bridge = pci_bridge_get_device(bus); 1553 bus = bridge->bus; 1554 } 1555 1556 if (pci_bus_is_express(bus)) { 1557 return 0; 1558 } 1559 1560 } else if (pci_bus_is_root(vdev->pdev.bus)) { 1561 /* 1562 * On a Root Complex bus Endpoints become Root Complex Integrated 1563 * Endpoints, which changes the type and clears the LNK & LNK2 fields. 1564 */ 1565 if (type == PCI_EXP_TYPE_ENDPOINT) { 1566 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS, 1567 PCI_EXP_TYPE_RC_END << 4, 1568 PCI_EXP_FLAGS_TYPE); 1569 1570 /* Link Capabilities, Status, and Control goes away */ 1571 if (size > PCI_EXP_LNKCTL) { 1572 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0); 1573 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0); 1574 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0); 1575 1576 #ifndef PCI_EXP_LNKCAP2 1577 #define PCI_EXP_LNKCAP2 44 1578 #endif 1579 #ifndef PCI_EXP_LNKSTA2 1580 #define PCI_EXP_LNKSTA2 50 1581 #endif 1582 /* Link 2 Capabilities, Status, and Control goes away */ 1583 if (size > PCI_EXP_LNKCAP2) { 1584 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0); 1585 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0); 1586 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0); 1587 } 1588 } 1589 1590 } else if (type == PCI_EXP_TYPE_LEG_END) { 1591 /* 1592 * Legacy endpoints don't belong on the root complex. Windows 1593 * seems to be happier with devices if we skip the capability. 1594 */ 1595 return 0; 1596 } 1597 1598 } else { 1599 /* 1600 * Convert Root Complex Integrated Endpoints to regular endpoints. 1601 * These devices don't support LNK/LNK2 capabilities, so make them up. 1602 */ 1603 if (type == PCI_EXP_TYPE_RC_END) { 1604 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS, 1605 PCI_EXP_TYPE_ENDPOINT << 4, 1606 PCI_EXP_FLAGS_TYPE); 1607 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 1608 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0); 1609 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0); 1610 } 1611 1612 /* Mark the Link Status bits as emulated to allow virtual negotiation */ 1613 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 1614 pci_get_word(vdev->pdev.config + pos + 1615 PCI_EXP_LNKSTA), 1616 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS); 1617 } 1618 1619 pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size); 1620 if (pos >= 0) { 1621 vdev->pdev.exp.exp_cap = pos; 1622 } 1623 1624 return pos; 1625 } 1626 1627 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) 1628 { 1629 uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); 1630 1631 if (cap & PCI_EXP_DEVCAP_FLR) { 1632 trace_vfio_check_pcie_flr(vdev->vbasedev.name); 1633 vdev->has_flr = true; 1634 } 1635 } 1636 1637 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos) 1638 { 1639 uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL); 1640 1641 if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) { 1642 trace_vfio_check_pm_reset(vdev->vbasedev.name); 1643 vdev->has_pm_reset = true; 1644 } 1645 } 1646 1647 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) 1648 { 1649 uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP); 1650 1651 if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) { 1652 trace_vfio_check_af_flr(vdev->vbasedev.name); 1653 vdev->has_flr = true; 1654 } 1655 } 1656 1657 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) 1658 { 1659 PCIDevice *pdev = &vdev->pdev; 1660 uint8_t cap_id, next, size; 1661 int ret; 1662 1663 cap_id = pdev->config[pos]; 1664 next = pdev->config[pos + 1]; 1665 1666 /* 1667 * If it becomes important to configure capabilities to their actual 1668 * size, use this as the default when it's something we don't recognize. 1669 * Since QEMU doesn't actually handle many of the config accesses, 1670 * exact size doesn't seem worthwhile. 1671 */ 1672 size = vfio_std_cap_max_size(pdev, pos); 1673 1674 /* 1675 * pci_add_capability always inserts the new capability at the head 1676 * of the chain. Therefore to end up with a chain that matches the 1677 * physical device, we insert from the end by making this recursive. 1678 * This is also why we pre-caclulate size above as cached config space 1679 * will be changed as we unwind the stack. 1680 */ 1681 if (next) { 1682 ret = vfio_add_std_cap(vdev, next); 1683 if (ret) { 1684 return ret; 1685 } 1686 } else { 1687 /* Begin the rebuild, use QEMU emulated list bits */ 1688 pdev->config[PCI_CAPABILITY_LIST] = 0; 1689 vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff; 1690 vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST; 1691 } 1692 1693 /* Use emulated next pointer to allow dropping caps */ 1694 pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff); 1695 1696 switch (cap_id) { 1697 case PCI_CAP_ID_MSI: 1698 ret = vfio_msi_setup(vdev, pos); 1699 break; 1700 case PCI_CAP_ID_EXP: 1701 vfio_check_pcie_flr(vdev, pos); 1702 ret = vfio_setup_pcie_cap(vdev, pos, size); 1703 break; 1704 case PCI_CAP_ID_MSIX: 1705 ret = vfio_msix_setup(vdev, pos); 1706 break; 1707 case PCI_CAP_ID_PM: 1708 vfio_check_pm_reset(vdev, pos); 1709 vdev->pm_cap = pos; 1710 ret = pci_add_capability(pdev, cap_id, pos, size); 1711 break; 1712 case PCI_CAP_ID_AF: 1713 vfio_check_af_flr(vdev, pos); 1714 ret = pci_add_capability(pdev, cap_id, pos, size); 1715 break; 1716 default: 1717 ret = pci_add_capability(pdev, cap_id, pos, size); 1718 break; 1719 } 1720 1721 if (ret < 0) { 1722 error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability " 1723 "0x%x[0x%x]@0x%x: %d", vdev->host.domain, 1724 vdev->host.bus, vdev->host.slot, vdev->host.function, 1725 cap_id, size, pos, ret); 1726 return ret; 1727 } 1728 1729 return 0; 1730 } 1731 1732 static int vfio_add_capabilities(VFIOPCIDevice *vdev) 1733 { 1734 PCIDevice *pdev = &vdev->pdev; 1735 1736 if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) || 1737 !pdev->config[PCI_CAPABILITY_LIST]) { 1738 return 0; /* Nothing to add */ 1739 } 1740 1741 return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); 1742 } 1743 1744 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) 1745 { 1746 PCIDevice *pdev = &vdev->pdev; 1747 uint16_t cmd; 1748 1749 vfio_disable_interrupts(vdev); 1750 1751 /* Make sure the device is in D0 */ 1752 if (vdev->pm_cap) { 1753 uint16_t pmcsr; 1754 uint8_t state; 1755 1756 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); 1757 state = pmcsr & PCI_PM_CTRL_STATE_MASK; 1758 if (state) { 1759 pmcsr &= ~PCI_PM_CTRL_STATE_MASK; 1760 vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); 1761 /* vfio handles the necessary delay here */ 1762 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); 1763 state = pmcsr & PCI_PM_CTRL_STATE_MASK; 1764 if (state) { 1765 error_report("vfio: Unable to power on device, stuck in D%d", 1766 state); 1767 } 1768 } 1769 } 1770 1771 /* 1772 * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master. 1773 * Also put INTx Disable in known state. 1774 */ 1775 cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); 1776 cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | 1777 PCI_COMMAND_INTX_DISABLE); 1778 vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); 1779 } 1780 1781 static void vfio_pci_post_reset(VFIOPCIDevice *vdev) 1782 { 1783 vfio_intx_enable(vdev); 1784 } 1785 1786 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, 1787 PCIHostDeviceAddress *host2) 1788 { 1789 return (host1->domain == host2->domain && host1->bus == host2->bus && 1790 host1->slot == host2->slot && host1->function == host2->function); 1791 } 1792 1793 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) 1794 { 1795 VFIOGroup *group; 1796 struct vfio_pci_hot_reset_info *info; 1797 struct vfio_pci_dependent_device *devices; 1798 struct vfio_pci_hot_reset *reset; 1799 int32_t *fds; 1800 int ret, i, count; 1801 bool multi = false; 1802 1803 trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); 1804 1805 vfio_pci_pre_reset(vdev); 1806 vdev->vbasedev.needs_reset = false; 1807 1808 info = g_malloc0(sizeof(*info)); 1809 info->argsz = sizeof(*info); 1810 1811 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); 1812 if (ret && errno != ENOSPC) { 1813 ret = -errno; 1814 if (!vdev->has_pm_reset) { 1815 error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, " 1816 "no available reset mechanism.", vdev->host.domain, 1817 vdev->host.bus, vdev->host.slot, vdev->host.function); 1818 } 1819 goto out_single; 1820 } 1821 1822 count = info->count; 1823 info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); 1824 info->argsz = sizeof(*info) + (count * sizeof(*devices)); 1825 devices = &info->devices[0]; 1826 1827 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); 1828 if (ret) { 1829 ret = -errno; 1830 error_report("vfio: hot reset info failed: %m"); 1831 goto out_single; 1832 } 1833 1834 trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); 1835 1836 /* Verify that we have all the groups required */ 1837 for (i = 0; i < info->count; i++) { 1838 PCIHostDeviceAddress host; 1839 VFIOPCIDevice *tmp; 1840 VFIODevice *vbasedev_iter; 1841 1842 host.domain = devices[i].segment; 1843 host.bus = devices[i].bus; 1844 host.slot = PCI_SLOT(devices[i].devfn); 1845 host.function = PCI_FUNC(devices[i].devfn); 1846 1847 trace_vfio_pci_hot_reset_dep_devices(host.domain, 1848 host.bus, host.slot, host.function, devices[i].group_id); 1849 1850 if (vfio_pci_host_match(&host, &vdev->host)) { 1851 continue; 1852 } 1853 1854 QLIST_FOREACH(group, &vfio_group_list, next) { 1855 if (group->groupid == devices[i].group_id) { 1856 break; 1857 } 1858 } 1859 1860 if (!group) { 1861 if (!vdev->has_pm_reset) { 1862 error_report("vfio: Cannot reset device %s, " 1863 "depends on group %d which is not owned.", 1864 vdev->vbasedev.name, devices[i].group_id); 1865 } 1866 ret = -EPERM; 1867 goto out; 1868 } 1869 1870 /* Prep dependent devices for reset and clear our marker. */ 1871 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 1872 if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { 1873 continue; 1874 } 1875 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); 1876 if (vfio_pci_host_match(&host, &tmp->host)) { 1877 if (single) { 1878 ret = -EINVAL; 1879 goto out_single; 1880 } 1881 vfio_pci_pre_reset(tmp); 1882 tmp->vbasedev.needs_reset = false; 1883 multi = true; 1884 break; 1885 } 1886 } 1887 } 1888 1889 if (!single && !multi) { 1890 ret = -EINVAL; 1891 goto out_single; 1892 } 1893 1894 /* Determine how many group fds need to be passed */ 1895 count = 0; 1896 QLIST_FOREACH(group, &vfio_group_list, next) { 1897 for (i = 0; i < info->count; i++) { 1898 if (group->groupid == devices[i].group_id) { 1899 count++; 1900 break; 1901 } 1902 } 1903 } 1904 1905 reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); 1906 reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); 1907 fds = &reset->group_fds[0]; 1908 1909 /* Fill in group fds */ 1910 QLIST_FOREACH(group, &vfio_group_list, next) { 1911 for (i = 0; i < info->count; i++) { 1912 if (group->groupid == devices[i].group_id) { 1913 fds[reset->count++] = group->fd; 1914 break; 1915 } 1916 } 1917 } 1918 1919 /* Bus reset! */ 1920 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); 1921 g_free(reset); 1922 1923 trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, 1924 ret ? "%m" : "Success"); 1925 1926 out: 1927 /* Re-enable INTx on affected devices */ 1928 for (i = 0; i < info->count; i++) { 1929 PCIHostDeviceAddress host; 1930 VFIOPCIDevice *tmp; 1931 VFIODevice *vbasedev_iter; 1932 1933 host.domain = devices[i].segment; 1934 host.bus = devices[i].bus; 1935 host.slot = PCI_SLOT(devices[i].devfn); 1936 host.function = PCI_FUNC(devices[i].devfn); 1937 1938 if (vfio_pci_host_match(&host, &vdev->host)) { 1939 continue; 1940 } 1941 1942 QLIST_FOREACH(group, &vfio_group_list, next) { 1943 if (group->groupid == devices[i].group_id) { 1944 break; 1945 } 1946 } 1947 1948 if (!group) { 1949 break; 1950 } 1951 1952 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 1953 if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { 1954 continue; 1955 } 1956 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); 1957 if (vfio_pci_host_match(&host, &tmp->host)) { 1958 vfio_pci_post_reset(tmp); 1959 break; 1960 } 1961 } 1962 } 1963 out_single: 1964 vfio_pci_post_reset(vdev); 1965 g_free(info); 1966 1967 return ret; 1968 } 1969 1970 /* 1971 * We want to differentiate hot reset of mulitple in-use devices vs hot reset 1972 * of a single in-use device. VFIO_DEVICE_RESET will already handle the case 1973 * of doing hot resets when there is only a single device per bus. The in-use 1974 * here refers to how many VFIODevices are affected. A hot reset that affects 1975 * multiple devices, but only a single in-use device, means that we can call 1976 * it from our bus ->reset() callback since the extent is effectively a single 1977 * device. This allows us to make use of it in the hotplug path. When there 1978 * are multiple in-use devices, we can only trigger the hot reset during a 1979 * system reset and thus from our reset handler. We separate _one vs _multi 1980 * here so that we don't overlap and do a double reset on the system reset 1981 * path where both our reset handler and ->reset() callback are used. Calling 1982 * _one() will only do a hot reset for the one in-use devices case, calling 1983 * _multi() will do nothing if a _one() would have been sufficient. 1984 */ 1985 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev) 1986 { 1987 return vfio_pci_hot_reset(vdev, true); 1988 } 1989 1990 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev) 1991 { 1992 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 1993 return vfio_pci_hot_reset(vdev, false); 1994 } 1995 1996 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) 1997 { 1998 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 1999 if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) { 2000 vbasedev->needs_reset = true; 2001 } 2002 } 2003 2004 static VFIODeviceOps vfio_pci_ops = { 2005 .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, 2006 .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, 2007 .vfio_eoi = vfio_intx_eoi, 2008 }; 2009 2010 static int vfio_populate_device(VFIOPCIDevice *vdev) 2011 { 2012 VFIODevice *vbasedev = &vdev->vbasedev; 2013 struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; 2014 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; 2015 int i, ret = -1; 2016 2017 /* Sanity check device */ 2018 if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) { 2019 error_report("vfio: Um, this isn't a PCI device"); 2020 goto error; 2021 } 2022 2023 if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) { 2024 error_report("vfio: unexpected number of io regions %u", 2025 vbasedev->num_regions); 2026 goto error; 2027 } 2028 2029 if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) { 2030 error_report("vfio: unexpected number of irqs %u", vbasedev->num_irqs); 2031 goto error; 2032 } 2033 2034 for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { 2035 reg_info.index = i; 2036 2037 ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); 2038 if (ret) { 2039 error_report("vfio: Error getting region %d info: %m", i); 2040 goto error; 2041 } 2042 2043 trace_vfio_populate_device_region(vbasedev->name, i, 2044 (unsigned long)reg_info.size, 2045 (unsigned long)reg_info.offset, 2046 (unsigned long)reg_info.flags); 2047 2048 vdev->bars[i].region.vbasedev = vbasedev; 2049 vdev->bars[i].region.flags = reg_info.flags; 2050 vdev->bars[i].region.size = reg_info.size; 2051 vdev->bars[i].region.fd_offset = reg_info.offset; 2052 vdev->bars[i].region.nr = i; 2053 QLIST_INIT(&vdev->bars[i].quirks); 2054 } 2055 2056 reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX; 2057 2058 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); 2059 if (ret) { 2060 error_report("vfio: Error getting config info: %m"); 2061 goto error; 2062 } 2063 2064 trace_vfio_populate_device_config(vdev->vbasedev.name, 2065 (unsigned long)reg_info.size, 2066 (unsigned long)reg_info.offset, 2067 (unsigned long)reg_info.flags); 2068 2069 vdev->config_size = reg_info.size; 2070 if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) { 2071 vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS; 2072 } 2073 vdev->config_offset = reg_info.offset; 2074 2075 if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) && 2076 vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) { 2077 struct vfio_region_info vga_info = { 2078 .argsz = sizeof(vga_info), 2079 .index = VFIO_PCI_VGA_REGION_INDEX, 2080 }; 2081 2082 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info); 2083 if (ret) { 2084 error_report( 2085 "vfio: Device does not support requested feature x-vga"); 2086 goto error; 2087 } 2088 2089 if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) || 2090 !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) || 2091 vga_info.size < 0xbffff + 1) { 2092 error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx", 2093 (unsigned long)vga_info.flags, 2094 (unsigned long)vga_info.size); 2095 goto error; 2096 } 2097 2098 vdev->vga.fd_offset = vga_info.offset; 2099 vdev->vga.fd = vdev->vbasedev.fd; 2100 2101 vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; 2102 vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; 2103 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks); 2104 2105 vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; 2106 vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; 2107 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks); 2108 2109 vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; 2110 vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; 2111 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks); 2112 2113 vdev->has_vga = true; 2114 } 2115 2116 irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; 2117 2118 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); 2119 if (ret) { 2120 /* This can fail for an old kernel or legacy PCI dev */ 2121 trace_vfio_populate_device_get_irq_info_failure(); 2122 ret = 0; 2123 } else if (irq_info.count == 1) { 2124 vdev->pci_aer = true; 2125 } else { 2126 error_report("vfio: %s " 2127 "Could not enable error recovery for the device", 2128 vbasedev->name); 2129 } 2130 2131 error: 2132 return ret; 2133 } 2134 2135 static void vfio_put_device(VFIOPCIDevice *vdev) 2136 { 2137 g_free(vdev->vbasedev.name); 2138 if (vdev->msix) { 2139 object_unparent(OBJECT(&vdev->msix->mmap_mem)); 2140 g_free(vdev->msix); 2141 vdev->msix = NULL; 2142 } 2143 vfio_put_base_device(&vdev->vbasedev); 2144 } 2145 2146 static void vfio_err_notifier_handler(void *opaque) 2147 { 2148 VFIOPCIDevice *vdev = opaque; 2149 2150 if (!event_notifier_test_and_clear(&vdev->err_notifier)) { 2151 return; 2152 } 2153 2154 /* 2155 * TBD. Retrieve the error details and decide what action 2156 * needs to be taken. One of the actions could be to pass 2157 * the error to the guest and have the guest driver recover 2158 * from the error. This requires that PCIe capabilities be 2159 * exposed to the guest. For now, we just terminate the 2160 * guest to contain the error. 2161 */ 2162 2163 error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " 2164 "Please collect any data possible and then kill the guest", 2165 __func__, vdev->host.domain, vdev->host.bus, 2166 vdev->host.slot, vdev->host.function); 2167 2168 vm_stop(RUN_STATE_INTERNAL_ERROR); 2169 } 2170 2171 /* 2172 * Registers error notifier for devices supporting error recovery. 2173 * If we encounter a failure in this function, we report an error 2174 * and continue after disabling error recovery support for the 2175 * device. 2176 */ 2177 static void vfio_register_err_notifier(VFIOPCIDevice *vdev) 2178 { 2179 int ret; 2180 int argsz; 2181 struct vfio_irq_set *irq_set; 2182 int32_t *pfd; 2183 2184 if (!vdev->pci_aer) { 2185 return; 2186 } 2187 2188 if (event_notifier_init(&vdev->err_notifier, 0)) { 2189 error_report("vfio: Unable to init event notifier for error detection"); 2190 vdev->pci_aer = false; 2191 return; 2192 } 2193 2194 argsz = sizeof(*irq_set) + sizeof(*pfd); 2195 2196 irq_set = g_malloc0(argsz); 2197 irq_set->argsz = argsz; 2198 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 2199 VFIO_IRQ_SET_ACTION_TRIGGER; 2200 irq_set->index = VFIO_PCI_ERR_IRQ_INDEX; 2201 irq_set->start = 0; 2202 irq_set->count = 1; 2203 pfd = (int32_t *)&irq_set->data; 2204 2205 *pfd = event_notifier_get_fd(&vdev->err_notifier); 2206 qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev); 2207 2208 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 2209 if (ret) { 2210 error_report("vfio: Failed to set up error notification"); 2211 qemu_set_fd_handler(*pfd, NULL, NULL, vdev); 2212 event_notifier_cleanup(&vdev->err_notifier); 2213 vdev->pci_aer = false; 2214 } 2215 g_free(irq_set); 2216 } 2217 2218 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) 2219 { 2220 int argsz; 2221 struct vfio_irq_set *irq_set; 2222 int32_t *pfd; 2223 int ret; 2224 2225 if (!vdev->pci_aer) { 2226 return; 2227 } 2228 2229 argsz = sizeof(*irq_set) + sizeof(*pfd); 2230 2231 irq_set = g_malloc0(argsz); 2232 irq_set->argsz = argsz; 2233 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 2234 VFIO_IRQ_SET_ACTION_TRIGGER; 2235 irq_set->index = VFIO_PCI_ERR_IRQ_INDEX; 2236 irq_set->start = 0; 2237 irq_set->count = 1; 2238 pfd = (int32_t *)&irq_set->data; 2239 *pfd = -1; 2240 2241 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 2242 if (ret) { 2243 error_report("vfio: Failed to de-assign error fd: %m"); 2244 } 2245 g_free(irq_set); 2246 qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), 2247 NULL, NULL, vdev); 2248 event_notifier_cleanup(&vdev->err_notifier); 2249 } 2250 2251 static void vfio_req_notifier_handler(void *opaque) 2252 { 2253 VFIOPCIDevice *vdev = opaque; 2254 2255 if (!event_notifier_test_and_clear(&vdev->req_notifier)) { 2256 return; 2257 } 2258 2259 qdev_unplug(&vdev->pdev.qdev, NULL); 2260 } 2261 2262 static void vfio_register_req_notifier(VFIOPCIDevice *vdev) 2263 { 2264 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), 2265 .index = VFIO_PCI_REQ_IRQ_INDEX }; 2266 int argsz; 2267 struct vfio_irq_set *irq_set; 2268 int32_t *pfd; 2269 2270 if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { 2271 return; 2272 } 2273 2274 if (ioctl(vdev->vbasedev.fd, 2275 VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { 2276 return; 2277 } 2278 2279 if (event_notifier_init(&vdev->req_notifier, 0)) { 2280 error_report("vfio: Unable to init event notifier for device request"); 2281 return; 2282 } 2283 2284 argsz = sizeof(*irq_set) + sizeof(*pfd); 2285 2286 irq_set = g_malloc0(argsz); 2287 irq_set->argsz = argsz; 2288 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 2289 VFIO_IRQ_SET_ACTION_TRIGGER; 2290 irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; 2291 irq_set->start = 0; 2292 irq_set->count = 1; 2293 pfd = (int32_t *)&irq_set->data; 2294 2295 *pfd = event_notifier_get_fd(&vdev->req_notifier); 2296 qemu_set_fd_handler(*pfd, vfio_req_notifier_handler, NULL, vdev); 2297 2298 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) { 2299 error_report("vfio: Failed to set up device request notification"); 2300 qemu_set_fd_handler(*pfd, NULL, NULL, vdev); 2301 event_notifier_cleanup(&vdev->req_notifier); 2302 } else { 2303 vdev->req_enabled = true; 2304 } 2305 2306 g_free(irq_set); 2307 } 2308 2309 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) 2310 { 2311 int argsz; 2312 struct vfio_irq_set *irq_set; 2313 int32_t *pfd; 2314 2315 if (!vdev->req_enabled) { 2316 return; 2317 } 2318 2319 argsz = sizeof(*irq_set) + sizeof(*pfd); 2320 2321 irq_set = g_malloc0(argsz); 2322 irq_set->argsz = argsz; 2323 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 2324 VFIO_IRQ_SET_ACTION_TRIGGER; 2325 irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; 2326 irq_set->start = 0; 2327 irq_set->count = 1; 2328 pfd = (int32_t *)&irq_set->data; 2329 *pfd = -1; 2330 2331 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) { 2332 error_report("vfio: Failed to de-assign device request fd: %m"); 2333 } 2334 g_free(irq_set); 2335 qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), 2336 NULL, NULL, vdev); 2337 event_notifier_cleanup(&vdev->req_notifier); 2338 2339 vdev->req_enabled = false; 2340 } 2341 2342 static int vfio_initfn(PCIDevice *pdev) 2343 { 2344 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 2345 VFIODevice *vbasedev_iter; 2346 VFIOGroup *group; 2347 char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; 2348 ssize_t len; 2349 struct stat st; 2350 int groupid; 2351 int ret; 2352 2353 /* Check that the host device exists */ 2354 snprintf(path, sizeof(path), 2355 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", 2356 vdev->host.domain, vdev->host.bus, vdev->host.slot, 2357 vdev->host.function); 2358 if (stat(path, &st) < 0) { 2359 error_report("vfio: error: no such host device: %s", path); 2360 return -errno; 2361 } 2362 2363 vdev->vbasedev.ops = &vfio_pci_ops; 2364 2365 vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; 2366 vdev->vbasedev.name = g_strdup_printf("%04x:%02x:%02x.%01x", 2367 vdev->host.domain, vdev->host.bus, 2368 vdev->host.slot, vdev->host.function); 2369 2370 strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); 2371 2372 len = readlink(path, iommu_group_path, sizeof(path)); 2373 if (len <= 0 || len >= sizeof(path)) { 2374 error_report("vfio: error no iommu_group for device"); 2375 return len < 0 ? -errno : -ENAMETOOLONG; 2376 } 2377 2378 iommu_group_path[len] = 0; 2379 group_name = basename(iommu_group_path); 2380 2381 if (sscanf(group_name, "%d", &groupid) != 1) { 2382 error_report("vfio: error reading %s: %m", path); 2383 return -errno; 2384 } 2385 2386 trace_vfio_initfn(vdev->vbasedev.name, groupid); 2387 2388 group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev)); 2389 if (!group) { 2390 error_report("vfio: failed to get group %d", groupid); 2391 return -ENOENT; 2392 } 2393 2394 snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x", 2395 vdev->host.domain, vdev->host.bus, vdev->host.slot, 2396 vdev->host.function); 2397 2398 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 2399 if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) { 2400 error_report("vfio: error: device %s is already attached", path); 2401 vfio_put_group(group); 2402 return -EBUSY; 2403 } 2404 } 2405 2406 ret = vfio_get_device(group, path, &vdev->vbasedev); 2407 if (ret) { 2408 error_report("vfio: failed to get device %s", path); 2409 vfio_put_group(group); 2410 return ret; 2411 } 2412 2413 ret = vfio_populate_device(vdev); 2414 if (ret) { 2415 return ret; 2416 } 2417 2418 /* Get a copy of config space */ 2419 ret = pread(vdev->vbasedev.fd, vdev->pdev.config, 2420 MIN(pci_config_size(&vdev->pdev), vdev->config_size), 2421 vdev->config_offset); 2422 if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { 2423 ret = ret < 0 ? -errno : -EFAULT; 2424 error_report("vfio: Failed to read device config space"); 2425 return ret; 2426 } 2427 2428 /* vfio emulates a lot for us, but some bits need extra love */ 2429 vdev->emulated_config_bits = g_malloc0(vdev->config_size); 2430 2431 /* QEMU can choose to expose the ROM or not */ 2432 memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4); 2433 2434 /* 2435 * The PCI spec reserves vendor ID 0xffff as an invalid value. The 2436 * device ID is managed by the vendor and need only be a 16-bit value. 2437 * Allow any 16-bit value for subsystem so they can be hidden or changed. 2438 */ 2439 if (vdev->vendor_id != PCI_ANY_ID) { 2440 if (vdev->vendor_id >= 0xffff) { 2441 error_report("vfio: Invalid PCI vendor ID provided"); 2442 return -EINVAL; 2443 } 2444 vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); 2445 trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id); 2446 } else { 2447 vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID); 2448 } 2449 2450 if (vdev->device_id != PCI_ANY_ID) { 2451 if (vdev->device_id > 0xffff) { 2452 error_report("vfio: Invalid PCI device ID provided"); 2453 return -EINVAL; 2454 } 2455 vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); 2456 trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id); 2457 } else { 2458 vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID); 2459 } 2460 2461 if (vdev->sub_vendor_id != PCI_ANY_ID) { 2462 if (vdev->sub_vendor_id > 0xffff) { 2463 error_report("vfio: Invalid PCI subsystem vendor ID provided"); 2464 return -EINVAL; 2465 } 2466 vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, 2467 vdev->sub_vendor_id, ~0); 2468 trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name, 2469 vdev->sub_vendor_id); 2470 } 2471 2472 if (vdev->sub_device_id != PCI_ANY_ID) { 2473 if (vdev->sub_device_id > 0xffff) { 2474 error_report("vfio: Invalid PCI subsystem device ID provided"); 2475 return -EINVAL; 2476 } 2477 vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); 2478 trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name, 2479 vdev->sub_device_id); 2480 } 2481 2482 /* QEMU can change multi-function devices to single function, or reverse */ 2483 vdev->emulated_config_bits[PCI_HEADER_TYPE] = 2484 PCI_HEADER_TYPE_MULTI_FUNCTION; 2485 2486 /* Restore or clear multifunction, this is always controlled by QEMU */ 2487 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { 2488 vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; 2489 } else { 2490 vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION; 2491 } 2492 2493 /* 2494 * Clear host resource mapping info. If we choose not to register a 2495 * BAR, such as might be the case with the option ROM, we can get 2496 * confusing, unwritable, residual addresses from the host here. 2497 */ 2498 memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); 2499 memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); 2500 2501 vfio_pci_size_rom(vdev); 2502 2503 ret = vfio_msix_early_setup(vdev); 2504 if (ret) { 2505 return ret; 2506 } 2507 2508 vfio_map_bars(vdev); 2509 2510 ret = vfio_add_capabilities(vdev); 2511 if (ret) { 2512 goto out_teardown; 2513 } 2514 2515 /* QEMU emulates all of MSI & MSIX */ 2516 if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { 2517 memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, 2518 MSIX_CAP_LENGTH); 2519 } 2520 2521 if (pdev->cap_present & QEMU_PCI_CAP_MSI) { 2522 memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff, 2523 vdev->msi_cap_size); 2524 } 2525 2526 if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { 2527 vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, 2528 vfio_intx_mmap_enable, vdev); 2529 pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update); 2530 ret = vfio_intx_enable(vdev); 2531 if (ret) { 2532 goto out_teardown; 2533 } 2534 } 2535 2536 vfio_register_err_notifier(vdev); 2537 vfio_register_req_notifier(vdev); 2538 vfio_setup_resetfn_quirk(vdev); 2539 2540 return 0; 2541 2542 out_teardown: 2543 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); 2544 vfio_teardown_msi(vdev); 2545 vfio_unregister_bars(vdev); 2546 return ret; 2547 } 2548 2549 static void vfio_instance_finalize(Object *obj) 2550 { 2551 PCIDevice *pci_dev = PCI_DEVICE(obj); 2552 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pci_dev); 2553 VFIOGroup *group = vdev->vbasedev.group; 2554 2555 vfio_unmap_bars(vdev); 2556 g_free(vdev->emulated_config_bits); 2557 g_free(vdev->rom); 2558 vfio_put_device(vdev); 2559 vfio_put_group(group); 2560 } 2561 2562 static void vfio_exitfn(PCIDevice *pdev) 2563 { 2564 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 2565 2566 vfio_unregister_req_notifier(vdev); 2567 vfio_unregister_err_notifier(vdev); 2568 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); 2569 vfio_disable_interrupts(vdev); 2570 if (vdev->intx.mmap_timer) { 2571 timer_free(vdev->intx.mmap_timer); 2572 } 2573 vfio_teardown_msi(vdev); 2574 vfio_unregister_bars(vdev); 2575 } 2576 2577 static void vfio_pci_reset(DeviceState *dev) 2578 { 2579 PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev); 2580 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 2581 2582 trace_vfio_pci_reset(vdev->vbasedev.name); 2583 2584 vfio_pci_pre_reset(vdev); 2585 2586 if (vdev->resetfn && !vdev->resetfn(vdev)) { 2587 goto post_reset; 2588 } 2589 2590 if (vdev->vbasedev.reset_works && 2591 (vdev->has_flr || !vdev->has_pm_reset) && 2592 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) { 2593 trace_vfio_pci_reset_flr(vdev->vbasedev.name); 2594 goto post_reset; 2595 } 2596 2597 /* See if we can do our own bus reset */ 2598 if (!vfio_pci_hot_reset_one(vdev)) { 2599 goto post_reset; 2600 } 2601 2602 /* If nothing else works and the device supports PM reset, use it */ 2603 if (vdev->vbasedev.reset_works && vdev->has_pm_reset && 2604 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) { 2605 trace_vfio_pci_reset_pm(vdev->vbasedev.name); 2606 goto post_reset; 2607 } 2608 2609 post_reset: 2610 vfio_pci_post_reset(vdev); 2611 } 2612 2613 static void vfio_instance_init(Object *obj) 2614 { 2615 PCIDevice *pci_dev = PCI_DEVICE(obj); 2616 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(obj)); 2617 2618 device_add_bootindex_property(obj, &vdev->bootindex, 2619 "bootindex", NULL, 2620 &pci_dev->qdev, NULL); 2621 } 2622 2623 static Property vfio_pci_dev_properties[] = { 2624 DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), 2625 DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice, 2626 intx.mmap_timeout, 1100), 2627 DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features, 2628 VFIO_FEATURE_ENABLE_VGA_BIT, false), 2629 DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, 2630 VFIO_FEATURE_ENABLE_REQ_BIT, true), 2631 DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), 2632 DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false), 2633 DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false), 2634 DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false), 2635 DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID), 2636 DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID), 2637 DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice, 2638 sub_vendor_id, PCI_ANY_ID), 2639 DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, 2640 sub_device_id, PCI_ANY_ID), 2641 /* 2642 * TODO - support passed fds... is this necessary? 2643 * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), 2644 * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), 2645 */ 2646 DEFINE_PROP_END_OF_LIST(), 2647 }; 2648 2649 static const VMStateDescription vfio_pci_vmstate = { 2650 .name = "vfio-pci", 2651 .unmigratable = 1, 2652 }; 2653 2654 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) 2655 { 2656 DeviceClass *dc = DEVICE_CLASS(klass); 2657 PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); 2658 2659 dc->reset = vfio_pci_reset; 2660 dc->props = vfio_pci_dev_properties; 2661 dc->vmsd = &vfio_pci_vmstate; 2662 dc->desc = "VFIO-based PCI device assignment"; 2663 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 2664 pdc->init = vfio_initfn; 2665 pdc->exit = vfio_exitfn; 2666 pdc->config_read = vfio_pci_read_config; 2667 pdc->config_write = vfio_pci_write_config; 2668 pdc->is_express = 1; /* We might be */ 2669 } 2670 2671 static const TypeInfo vfio_pci_dev_info = { 2672 .name = "vfio-pci", 2673 .parent = TYPE_PCI_DEVICE, 2674 .instance_size = sizeof(VFIOPCIDevice), 2675 .class_init = vfio_pci_dev_class_init, 2676 .instance_init = vfio_instance_init, 2677 .instance_finalize = vfio_instance_finalize, 2678 }; 2679 2680 static void register_vfio_pci_dev_type(void) 2681 { 2682 type_register_static(&vfio_pci_dev_info); 2683 } 2684 2685 type_init(register_vfio_pci_dev_type) 2686