/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_bridge.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/option.h"
#include "qemu/range.h"
#include "qemu/units.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "pci.h"
#include "trace.h"
#include "qapi/error.h"

#define TYPE_VFIO_PCI "vfio-pci"
#define PCI_VFIO(obj) OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)

#define TYPE_VIFO_PCI_NOHOTPLUG "vfio-pci-nohotplug"

static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);

/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead. We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead). If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance. This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps. Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode. We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled. Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (vdev->intx.pending) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}

static void vfio_intx_interrupt(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    pci_irq_assert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
    }
}

static void vfio_intx_eoi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    if (!vdev->intx.pending) {
        return;
    }

    trace_vfio_intx_eoi(vbasedev->name);

    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
}

static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    Error *err = NULL;

    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_setg(errp, "event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_setg_errno(errp, errno, "failed to setup resample irqfd");
        goto fail_irqfd;
    }

    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
                               VFIO_IRQ_SET_ACTION_UNMASK,
                               irqfd.resamplefd, &err)) {
        error_propagate(errp, err);
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
#endif
}

static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
}

static void vfio_intx_update(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    PCIINTxRoute route;
    Error *err = NULL;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    trace_vfio_intx_update(vdev->vbasedev.name,
                           vdev->intx.route.irq, route.irq);

    vfio_intx_disable_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_intx_enable_kvm(vdev, &err);
    if (err) {
        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_intx_eoi(&vdev->vbasedev);
}

static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    Error *err = NULL;
    int32_t fd;
    int ret;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    pci_config_set_interrupt_pin(vdev->pdev.config, pin);

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
270 */ 271 if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) { 272 vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev, 273 vdev->intx.pin); 274 } 275 #endif 276 277 ret = event_notifier_init(&vdev->intx.interrupt, 0); 278 if (ret) { 279 error_setg_errno(errp, -ret, "event_notifier_init failed"); 280 return ret; 281 } 282 fd = event_notifier_get_fd(&vdev->intx.interrupt); 283 qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); 284 285 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, 286 VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { 287 error_propagate(errp, err); 288 qemu_set_fd_handler(fd, NULL, NULL, vdev); 289 event_notifier_cleanup(&vdev->intx.interrupt); 290 return -errno; 291 } 292 293 vfio_intx_enable_kvm(vdev, &err); 294 if (err) { 295 warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 296 } 297 298 vdev->interrupt = VFIO_INT_INTx; 299 300 trace_vfio_intx_enable(vdev->vbasedev.name); 301 return 0; 302 } 303 304 static void vfio_intx_disable(VFIOPCIDevice *vdev) 305 { 306 int fd; 307 308 timer_del(vdev->intx.mmap_timer); 309 vfio_intx_disable_kvm(vdev); 310 vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); 311 vdev->intx.pending = false; 312 pci_irq_deassert(&vdev->pdev); 313 vfio_mmap_set_enabled(vdev, true); 314 315 fd = event_notifier_get_fd(&vdev->intx.interrupt); 316 qemu_set_fd_handler(fd, NULL, NULL, vdev); 317 event_notifier_cleanup(&vdev->intx.interrupt); 318 319 vdev->interrupt = VFIO_INT_NONE; 320 321 trace_vfio_intx_disable(vdev->vbasedev.name); 322 } 323 324 /* 325 * MSI/X 326 */ 327 static void vfio_msi_interrupt(void *opaque) 328 { 329 VFIOMSIVector *vector = opaque; 330 VFIOPCIDevice *vdev = vector->vdev; 331 MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector); 332 void (*notify)(PCIDevice *dev, unsigned vector); 333 MSIMessage msg; 334 int nr = vector - vdev->msi_vectors; 335 336 if (!event_notifier_test_and_clear(&vector->interrupt)) { 337 return; 338 } 339 340 if (vdev->interrupt == VFIO_INT_MSIX) { 341 get_msg = msix_get_message; 342 notify = msix_notify; 343 344 /* A masked vector firing needs to use the PBA, enable it */ 345 if (msix_is_masked(&vdev->pdev, nr)) { 346 set_bit(nr, vdev->msix->pending); 347 memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true); 348 trace_vfio_msix_pba_enable(vdev->vbasedev.name); 349 } 350 } else if (vdev->interrupt == VFIO_INT_MSI) { 351 get_msg = msi_get_message; 352 notify = msi_notify; 353 } else { 354 abort(); 355 } 356 357 msg = get_msg(&vdev->pdev, nr); 358 trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data); 359 notify(&vdev->pdev, nr); 360 } 361 362 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) 363 { 364 struct vfio_irq_set *irq_set; 365 int ret = 0, i, argsz; 366 int32_t *fds; 367 368 argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds)); 369 370 irq_set = g_malloc0(argsz); 371 irq_set->argsz = argsz; 372 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; 373 irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX; 374 irq_set->start = 0; 375 irq_set->count = vdev->nr_vectors; 376 fds = (int32_t *)&irq_set->data; 377 378 for (i = 0; i < vdev->nr_vectors; i++) { 379 int fd = -1; 380 381 /* 382 * MSI vs MSI-X - The guest has direct access to MSI mask and pending 383 * bits, therefore we always use the KVM signaling path when setup. 384 * MSI-X mask and pending bits are emulated, so we want to use the 385 * KVM signaling path only when configured and unmasked. 
386 */ 387 if (vdev->msi_vectors[i].use) { 388 if (vdev->msi_vectors[i].virq < 0 || 389 (msix && msix_is_masked(&vdev->pdev, i))) { 390 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); 391 } else { 392 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt); 393 } 394 } 395 396 fds[i] = fd; 397 } 398 399 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 400 401 g_free(irq_set); 402 403 return ret; 404 } 405 406 static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, 407 int vector_n, bool msix) 408 { 409 int virq; 410 411 if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { 412 return; 413 } 414 415 if (event_notifier_init(&vector->kvm_interrupt, 0)) { 416 return; 417 } 418 419 virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev); 420 if (virq < 0) { 421 event_notifier_cleanup(&vector->kvm_interrupt); 422 return; 423 } 424 425 if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, 426 NULL, virq) < 0) { 427 kvm_irqchip_release_virq(kvm_state, virq); 428 event_notifier_cleanup(&vector->kvm_interrupt); 429 return; 430 } 431 432 vector->virq = virq; 433 } 434 435 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) 436 { 437 kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, 438 vector->virq); 439 kvm_irqchip_release_virq(kvm_state, vector->virq); 440 vector->virq = -1; 441 event_notifier_cleanup(&vector->kvm_interrupt); 442 } 443 444 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, 445 PCIDevice *pdev) 446 { 447 kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev); 448 kvm_irqchip_commit_routes(kvm_state); 449 } 450 451 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, 452 MSIMessage *msg, IOHandler *handler) 453 { 454 VFIOPCIDevice *vdev = PCI_VFIO(pdev); 455 VFIOMSIVector *vector; 456 int ret; 457 458 trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr); 459 460 vector = &vdev->msi_vectors[nr]; 461 462 if (!vector->use) { 463 vector->vdev = vdev; 464 vector->virq = -1; 465 if (event_notifier_init(&vector->interrupt, 0)) { 466 error_report("vfio: Error: event_notifier_init failed"); 467 } 468 vector->use = true; 469 msix_vector_use(pdev, nr); 470 } 471 472 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), 473 handler, NULL, vector); 474 475 /* 476 * Attempt to enable route through KVM irqchip, 477 * default to userspace handling if unavailable. 478 */ 479 if (vector->virq >= 0) { 480 if (!msg) { 481 vfio_remove_kvm_msi_virq(vector); 482 } else { 483 vfio_update_kvm_msi_virq(vector, *msg, pdev); 484 } 485 } else { 486 if (msg) { 487 vfio_add_kvm_msi_virq(vdev, vector, nr, true); 488 } 489 } 490 491 /* 492 * We don't want to have the host allocate all possible MSI vectors 493 * for a device if they're not in use, so we shutdown and incrementally 494 * increase them as needed. 
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        Error *err = NULL;
        int32_t fd;

        if (vector->virq >= 0) {
            fd = event_notifier_get_fd(&vector->kvm_interrupt);
        } else {
            fd = event_notifier_get_fd(&vector->interrupt);
        }

        if (vfio_set_irq_signaling(&vdev->vbasedev,
                                   VFIO_PCI_MSIX_IRQ_INDEX, nr,
                                   VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
        }
    }

    /* Disable PBA emulation when nothing more is pending. */
    clear_bit(nr, vdev->msix->pending);
    if (find_first_bit(vdev->msix->pending,
                       vdev->nr_vectors) == vdev->nr_vectors) {
        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
    }

    return 0;
}

static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];

    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);

    /*
     * There are still old guests that mask and unmask vectors on every
     * interrupt. If we're using QEMU bypass with a KVM irqfd, leave all of
     * the KVM setup in place, simply switch VFIO to use the non-bypass
     * eventfd. We'll then fire the interrupt through QEMU and the MSI-X
     * core will mask the interrupt and set pending bits, allowing it to
     * be re-asserted on unmask. Nothing to do if already using QEMU mode.
     */
    if (vector->virq >= 0) {
        int32_t fd = event_notifier_get_fd(&vector->interrupt);
        Error *err = NULL;

        if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
                                   VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
        }
    }
}

static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host. When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked. We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix(). This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
583 */ 584 vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL); 585 vfio_msix_vector_release(&vdev->pdev, 0); 586 587 if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, 588 vfio_msix_vector_release, NULL)) { 589 error_report("vfio: msix_set_vector_notifiers failed"); 590 } 591 592 trace_vfio_msix_enable(vdev->vbasedev.name); 593 } 594 595 static void vfio_msi_enable(VFIOPCIDevice *vdev) 596 { 597 int ret, i; 598 599 vfio_disable_interrupts(vdev); 600 601 vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); 602 retry: 603 vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); 604 605 for (i = 0; i < vdev->nr_vectors; i++) { 606 VFIOMSIVector *vector = &vdev->msi_vectors[i]; 607 608 vector->vdev = vdev; 609 vector->virq = -1; 610 vector->use = true; 611 612 if (event_notifier_init(&vector->interrupt, 0)) { 613 error_report("vfio: Error: event_notifier_init failed"); 614 } 615 616 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), 617 vfio_msi_interrupt, NULL, vector); 618 619 /* 620 * Attempt to enable route through KVM irqchip, 621 * default to userspace handling if unavailable. 622 */ 623 vfio_add_kvm_msi_virq(vdev, vector, i, false); 624 } 625 626 /* Set interrupt type prior to possible interrupts */ 627 vdev->interrupt = VFIO_INT_MSI; 628 629 ret = vfio_enable_vectors(vdev, false); 630 if (ret) { 631 if (ret < 0) { 632 error_report("vfio: Error: Failed to setup MSI fds: %m"); 633 } else if (ret != vdev->nr_vectors) { 634 error_report("vfio: Error: Failed to enable %d " 635 "MSI vectors, retry with %d", vdev->nr_vectors, ret); 636 } 637 638 for (i = 0; i < vdev->nr_vectors; i++) { 639 VFIOMSIVector *vector = &vdev->msi_vectors[i]; 640 if (vector->virq >= 0) { 641 vfio_remove_kvm_msi_virq(vector); 642 } 643 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), 644 NULL, NULL, NULL); 645 event_notifier_cleanup(&vector->interrupt); 646 } 647 648 g_free(vdev->msi_vectors); 649 650 if (ret > 0 && ret != vdev->nr_vectors) { 651 vdev->nr_vectors = ret; 652 goto retry; 653 } 654 vdev->nr_vectors = 0; 655 656 /* 657 * Failing to setup MSI doesn't really fall within any specification. 658 * Let's try leaving interrupts disabled and hope the guest figures 659 * out to fall back to INTx for this device. 660 */ 661 error_report("vfio: Error: Failed to enable MSI"); 662 vdev->interrupt = VFIO_INT_NONE; 663 664 return; 665 } 666 667 trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors); 668 } 669 670 static void vfio_msi_disable_common(VFIOPCIDevice *vdev) 671 { 672 Error *err = NULL; 673 int i; 674 675 for (i = 0; i < vdev->nr_vectors; i++) { 676 VFIOMSIVector *vector = &vdev->msi_vectors[i]; 677 if (vdev->msi_vectors[i].use) { 678 if (vector->virq >= 0) { 679 vfio_remove_kvm_msi_virq(vector); 680 } 681 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), 682 NULL, NULL, NULL); 683 event_notifier_cleanup(&vector->interrupt); 684 } 685 } 686 687 g_free(vdev->msi_vectors); 688 vdev->msi_vectors = NULL; 689 vdev->nr_vectors = 0; 690 vdev->interrupt = VFIO_INT_NONE; 691 692 vfio_intx_enable(vdev, &err); 693 if (err) { 694 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 695 } 696 } 697 698 static void vfio_msix_disable(VFIOPCIDevice *vdev) 699 { 700 int i; 701 702 msix_unset_vector_notifiers(&vdev->pdev); 703 704 /* 705 * MSI-X will only release vectors if MSI-X is still enabled on the 706 * device, check through the rest and release it ourselves if necessary. 
707 */ 708 for (i = 0; i < vdev->nr_vectors; i++) { 709 if (vdev->msi_vectors[i].use) { 710 vfio_msix_vector_release(&vdev->pdev, i); 711 msix_vector_unuse(&vdev->pdev, i); 712 } 713 } 714 715 if (vdev->nr_vectors) { 716 vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); 717 } 718 719 vfio_msi_disable_common(vdev); 720 721 memset(vdev->msix->pending, 0, 722 BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long)); 723 724 trace_vfio_msix_disable(vdev->vbasedev.name); 725 } 726 727 static void vfio_msi_disable(VFIOPCIDevice *vdev) 728 { 729 vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); 730 vfio_msi_disable_common(vdev); 731 732 trace_vfio_msi_disable(vdev->vbasedev.name); 733 } 734 735 static void vfio_update_msi(VFIOPCIDevice *vdev) 736 { 737 int i; 738 739 for (i = 0; i < vdev->nr_vectors; i++) { 740 VFIOMSIVector *vector = &vdev->msi_vectors[i]; 741 MSIMessage msg; 742 743 if (!vector->use || vector->virq < 0) { 744 continue; 745 } 746 747 msg = msi_get_message(&vdev->pdev, i); 748 vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev); 749 } 750 } 751 752 static void vfio_pci_load_rom(VFIOPCIDevice *vdev) 753 { 754 struct vfio_region_info *reg_info; 755 uint64_t size; 756 off_t off = 0; 757 ssize_t bytes; 758 759 if (vfio_get_region_info(&vdev->vbasedev, 760 VFIO_PCI_ROM_REGION_INDEX, ®_info)) { 761 error_report("vfio: Error getting ROM info: %m"); 762 return; 763 } 764 765 trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size, 766 (unsigned long)reg_info->offset, 767 (unsigned long)reg_info->flags); 768 769 vdev->rom_size = size = reg_info->size; 770 vdev->rom_offset = reg_info->offset; 771 772 g_free(reg_info); 773 774 if (!vdev->rom_size) { 775 vdev->rom_read_failed = true; 776 error_report("vfio-pci: Cannot read device rom at " 777 "%s", vdev->vbasedev.name); 778 error_printf("Device option ROM contents are probably invalid " 779 "(check dmesg).\nSkip option ROM probe with rombar=0, " 780 "or load from file with romfile=\n"); 781 return; 782 } 783 784 vdev->rom = g_malloc(size); 785 memset(vdev->rom, 0xff, size); 786 787 while (size) { 788 bytes = pread(vdev->vbasedev.fd, vdev->rom + off, 789 size, vdev->rom_offset + off); 790 if (bytes == 0) { 791 break; 792 } else if (bytes > 0) { 793 off += bytes; 794 size -= bytes; 795 } else { 796 if (errno == EINTR || errno == EAGAIN) { 797 continue; 798 } 799 error_report("vfio: Error reading device ROM: %m"); 800 break; 801 } 802 } 803 804 /* 805 * Test the ROM signature against our device, if the vendor is correct 806 * but the device ID doesn't match, store the correct device ID and 807 * recompute the checksum. Intel IGD devices need this and are known 808 * to have bogus checksums so we can't simply adjust the checksum. 
809 */ 810 if (pci_get_word(vdev->rom) == 0xaa55 && 811 pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size && 812 !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) { 813 uint16_t vid, did; 814 815 vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4); 816 did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6); 817 818 if (vid == vdev->vendor_id && did != vdev->device_id) { 819 int i; 820 uint8_t csum, *data = vdev->rom; 821 822 pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6, 823 vdev->device_id); 824 data[6] = 0; 825 826 for (csum = 0, i = 0; i < vdev->rom_size; i++) { 827 csum += data[i]; 828 } 829 830 data[6] = -csum; 831 } 832 } 833 } 834 835 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) 836 { 837 VFIOPCIDevice *vdev = opaque; 838 union { 839 uint8_t byte; 840 uint16_t word; 841 uint32_t dword; 842 uint64_t qword; 843 } val; 844 uint64_t data = 0; 845 846 /* Load the ROM lazily when the guest tries to read it */ 847 if (unlikely(!vdev->rom && !vdev->rom_read_failed)) { 848 vfio_pci_load_rom(vdev); 849 } 850 851 memcpy(&val, vdev->rom + addr, 852 (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0); 853 854 switch (size) { 855 case 1: 856 data = val.byte; 857 break; 858 case 2: 859 data = le16_to_cpu(val.word); 860 break; 861 case 4: 862 data = le32_to_cpu(val.dword); 863 break; 864 default: 865 hw_error("vfio: unsupported read size, %d bytes\n", size); 866 break; 867 } 868 869 trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data); 870 871 return data; 872 } 873 874 static void vfio_rom_write(void *opaque, hwaddr addr, 875 uint64_t data, unsigned size) 876 { 877 } 878 879 static const MemoryRegionOps vfio_rom_ops = { 880 .read = vfio_rom_read, 881 .write = vfio_rom_write, 882 .endianness = DEVICE_LITTLE_ENDIAN, 883 }; 884 885 static void vfio_pci_size_rom(VFIOPCIDevice *vdev) 886 { 887 uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); 888 off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; 889 DeviceState *dev = DEVICE(vdev); 890 char *name; 891 int fd = vdev->vbasedev.fd; 892 893 if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { 894 /* Since pci handles romfile, just print a message and return */ 895 if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) { 896 warn_report("Device at %s is known to cause system instability" 897 " issues during option rom execution", 898 vdev->vbasedev.name); 899 error_printf("Proceeding anyway since user specified romfile\n"); 900 } 901 return; 902 } 903 904 /* 905 * Use the same size ROM BAR as the physical device. The contents 906 * will get filled in later when the guest tries to read it. 
907 */ 908 if (pread(fd, &orig, 4, offset) != 4 || 909 pwrite(fd, &size, 4, offset) != 4 || 910 pread(fd, &size, 4, offset) != 4 || 911 pwrite(fd, &orig, 4, offset) != 4) { 912 error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); 913 return; 914 } 915 916 size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1; 917 918 if (!size) { 919 return; 920 } 921 922 if (vfio_blacklist_opt_rom(vdev)) { 923 if (dev->opts && qemu_opt_get(dev->opts, "rombar")) { 924 warn_report("Device at %s is known to cause system instability" 925 " issues during option rom execution", 926 vdev->vbasedev.name); 927 error_printf("Proceeding anyway since user specified" 928 " non zero value for rombar\n"); 929 } else { 930 warn_report("Rom loading for device at %s has been disabled" 931 " due to system instability issues", 932 vdev->vbasedev.name); 933 error_printf("Specify rombar=1 or romfile to force\n"); 934 return; 935 } 936 } 937 938 trace_vfio_pci_size_rom(vdev->vbasedev.name, size); 939 940 name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name); 941 942 memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev), 943 &vfio_rom_ops, vdev, name, size); 944 g_free(name); 945 946 pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 947 PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom); 948 949 vdev->rom_read_failed = false; 950 } 951 952 void vfio_vga_write(void *opaque, hwaddr addr, 953 uint64_t data, unsigned size) 954 { 955 VFIOVGARegion *region = opaque; 956 VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]); 957 union { 958 uint8_t byte; 959 uint16_t word; 960 uint32_t dword; 961 uint64_t qword; 962 } buf; 963 off_t offset = vga->fd_offset + region->offset + addr; 964 965 switch (size) { 966 case 1: 967 buf.byte = data; 968 break; 969 case 2: 970 buf.word = cpu_to_le16(data); 971 break; 972 case 4: 973 buf.dword = cpu_to_le32(data); 974 break; 975 default: 976 hw_error("vfio: unsupported write size, %d bytes", size); 977 break; 978 } 979 980 if (pwrite(vga->fd, &buf, size, offset) != size) { 981 error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", 982 __func__, region->offset + addr, data, size); 983 } 984 985 trace_vfio_vga_write(region->offset + addr, data, size); 986 } 987 988 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size) 989 { 990 VFIOVGARegion *region = opaque; 991 VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]); 992 union { 993 uint8_t byte; 994 uint16_t word; 995 uint32_t dword; 996 uint64_t qword; 997 } buf; 998 uint64_t data = 0; 999 off_t offset = vga->fd_offset + region->offset + addr; 1000 1001 if (pread(vga->fd, &buf, size, offset) != size) { 1002 error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m", 1003 __func__, region->offset + addr, size); 1004 return (uint64_t)-1; 1005 } 1006 1007 switch (size) { 1008 case 1: 1009 data = buf.byte; 1010 break; 1011 case 2: 1012 data = le16_to_cpu(buf.word); 1013 break; 1014 case 4: 1015 data = le32_to_cpu(buf.dword); 1016 break; 1017 default: 1018 hw_error("vfio: unsupported read size, %d bytes", size); 1019 break; 1020 } 1021 1022 trace_vfio_vga_read(region->offset + addr, size, data); 1023 1024 return data; 1025 } 1026 1027 static const MemoryRegionOps vfio_vga_ops = { 1028 .read = vfio_vga_read, 1029 .write = vfio_vga_write, 1030 .endianness = DEVICE_LITTLE_ENDIAN, 1031 }; 1032 1033 /* 1034 * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page 1035 * size if the BAR is in an exclusive page in host so that we could map 1036 * this BAR to guest. 
But this sub-page BAR may not occupy an exclusive
 * page in guest. So we should set the priority of the expanded memory
 * region to zero in case of overlap with BARs which share the same page
 * with the sub-page BAR in guest. Besides, we should also recover the
 * size of this sub-page BAR when its base address is changed in guest
 * and not page aligned any more.
 */
static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIORegion *region = &vdev->bars[bar].region;
    MemoryRegion *mmap_mr, *region_mr, *base_mr;
    PCIIORegion *r;
    pcibus_t bar_addr;
    uint64_t size = region->size;

    /* Make sure that the whole region is allowed to be mmapped */
    if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
        region->mmaps[0].size != region->size) {
        return;
    }

    r = &pdev->io_regions[bar];
    bar_addr = r->addr;
    base_mr = vdev->bars[bar].mr;
    region_mr = region->mem;
    mmap_mr = &region->mmaps[0].mem;

    /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
    if (bar_addr != PCI_BAR_UNMAPPED &&
        !(bar_addr & ~qemu_real_host_page_mask)) {
        size = qemu_real_host_page_size;
    }

    memory_region_transaction_begin();

    if (vdev->bars[bar].size < size) {
        memory_region_set_size(base_mr, size);
    }
    memory_region_set_size(region_mr, size);
    memory_region_set_size(mmap_mr, size);
    if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
        memory_region_del_subregion(r->address_space, base_mr);
        memory_region_add_subregion_overlap(r->address_space,
                                            bar_addr, base_mr, 0);
    }

    memory_region_transaction_commit();
}

/*
 * PCI config space
 */
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;

    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
    emu_bits = le32_to_cpu(emu_bits);

    if (emu_bits) {
        emu_val = pci_default_read_config(pdev, addr, len);
    }

    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
        ssize_t ret;

        ret = pread(vdev->vbasedev.fd, &phys_val, len,
                    vdev->config_offset + addr);
        if (ret != len) {
            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
                         __func__, vdev->vbasedev.name, addr, len);
            return -errno;
        }
        phys_val = le32_to_cpu(phys_val);
    }

    val = (emu_val & emu_bits) | (phys_val & ~emu_bits);

    trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);

    return val;
}

void vfio_pci_write_config(PCIDevice *pdev,
                           uint32_t addr, uint32_t val, int len)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    uint32_t val_le = cpu_to_le32(val);

    trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);

    /* Write everything to VFIO, let it filter out what we can't write */
    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
        != len) {
        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
                     __func__, vdev->vbasedev.name, addr, val, len);
    }

    /* MSI/MSI-X Enabling/Disabling */
    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        int is_enabled, was_enabled = msi_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msi_enabled(pdev);

        if (!was_enabled) {
            if (is_enabled) {
                vfio_msi_enable(vdev);
            }
        } else {
            if (!is_enabled) {
                vfio_msi_disable(vdev);
            } else {
                vfio_update_msi(vdev);
            }
        }
    } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
               ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        int is_enabled, was_enabled = msix_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msix_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_msix_enable(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_msix_disable(vdev);
        }
    } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
               range_covers_byte(addr, len, PCI_COMMAND)) {
        pcibus_t old_addr[PCI_NUM_REGIONS - 1];
        int bar;

        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
            old_addr[bar] = pdev->io_regions[bar].addr;
        }

        pci_default_write_config(pdev, addr, val, len);

        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
            if (old_addr[bar] != pdev->io_regions[bar].addr &&
                vdev->bars[bar].region.size > 0 &&
                vdev->bars[bar].region.size < qemu_real_host_page_size) {
                vfio_sub_page_bar_update_mapping(pdev, bar);
            }
        }
    } else {
        /* Write everything to QEMU to keep emulated bits correct */
        pci_default_write_config(pdev, addr, val, len);
    }
}

/*
 * Interrupt setup
 */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
{
    /*
     * More complicated than it looks. Disabling MSI/X transitions the
     * device to INTx mode (if supported). Therefore we need to first
     * disable MSI/X and then cleanup by disabling INTx.
     */
    if (vdev->interrupt == VFIO_INT_MSIX) {
        vfio_msix_disable(vdev);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        vfio_msi_disable(vdev);
    }

    if (vdev->interrupt == VFIO_INT_INTx) {
        vfio_intx_disable(vdev);
    }
}

static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
{
    uint16_t ctrl;
    bool msi_64bit, msi_maskbit;
    int ret, entries;
    Error *err = NULL;

    if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
        return -errno;
    }
    ctrl = le16_to_cpu(ctrl);

    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);

    trace_vfio_msi_setup(vdev->vbasedev.name, pos);

    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_propagate_prepend(errp, err, "msi_init failed: ");
        return ret;
    }
    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);

    return 0;
}

static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
{
    off_t start, end;
    VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;

    /*
     * If the host driver allows mapping of the MSI-X data, we are going to
     * map the entire BAR and emulate the MSI-X table on top of it.
     */
    if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
                            VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
        return;
    }

    /*
     * We expect to find a single mmap covering the whole BAR, anything else
     * means it's either unsupported or already setup.
     */
    if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
        region->size != region->mmaps[0].size) {
        return;
    }

    /* MSI-X table start and end aligned to host page size */
    start = vdev->msix->table_offset & qemu_real_host_page_mask;
    end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
                               (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

    /*
     * Does the MSI-X table cover the beginning of the BAR? The whole BAR?
     * NB - Host page size is necessarily a power of two and so is the PCI
     * BAR (not counting EA yet), therefore if we have host page aligned
     * @start and @end, then any remainder of the BAR before or after those
     * must be at least host page sized and therefore mmap'able.
     */
    if (!start) {
        if (end >= region->size) {
            region->nr_mmaps = 0;
            g_free(region->mmaps);
            region->mmaps = NULL;
            trace_vfio_msix_fixup(vdev->vbasedev.name,
                                  vdev->msix->table_bar, 0, 0);
        } else {
            region->mmaps[0].offset = end;
            region->mmaps[0].size = region->size - end;
            trace_vfio_msix_fixup(vdev->vbasedev.name,
                                  vdev->msix->table_bar, region->mmaps[0].offset,
                                  region->mmaps[0].offset + region->mmaps[0].size);
        }

    /* Maybe it's aligned at the end of the BAR */
    } else if (end >= region->size) {
        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);

    /* Otherwise it must split the BAR */
    } else {
        region->nr_mmaps = 2;
        region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);

        memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));

        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);

        region->mmaps[1].offset = end;
        region->mmaps[1].size = region->size - end;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[1].offset,
                              region->mmaps[1].offset + region->mmaps[1].size);
    }
}

static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
{
    int target_bar = -1;
    size_t msix_sz;

    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
        return;
    }

    /* The actual minimum size of MSI-X structures */
    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
    /* Round up to host pages, we don't want to share a page */
    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
    /* PCI BARs must be a power of 2 */
    msix_sz = pow2ceil(msix_sz);

    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
        /*
         * TODO: Lookup table for known devices.
         *
         * Logically we might use an algorithm here to select the BAR adding
         * the least additional MMIO space, but we cannot programmatically
         * predict the driver dependency on BAR ordering or sizing, therefore
         * 'auto' becomes a lookup for combinations reported to work.
1349 */ 1350 if (target_bar < 0) { 1351 error_setg(errp, "No automatic MSI-X relocation available for " 1352 "device %04x:%04x", vdev->vendor_id, vdev->device_id); 1353 return; 1354 } 1355 } else { 1356 target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0); 1357 } 1358 1359 /* I/O port BARs cannot host MSI-X structures */ 1360 if (vdev->bars[target_bar].ioport) { 1361 error_setg(errp, "Invalid MSI-X relocation BAR %d, " 1362 "I/O port BAR", target_bar); 1363 return; 1364 } 1365 1366 /* Cannot use a BAR in the "shadow" of a 64-bit BAR */ 1367 if (!vdev->bars[target_bar].size && 1368 target_bar > 0 && vdev->bars[target_bar - 1].mem64) { 1369 error_setg(errp, "Invalid MSI-X relocation BAR %d, " 1370 "consumed by 64-bit BAR %d", target_bar, target_bar - 1); 1371 return; 1372 } 1373 1374 /* 2GB max size for 32-bit BARs, cannot double if already > 1G */ 1375 if (vdev->bars[target_bar].size > 1 * GiB && 1376 !vdev->bars[target_bar].mem64) { 1377 error_setg(errp, "Invalid MSI-X relocation BAR %d, " 1378 "no space to extend 32-bit BAR", target_bar); 1379 return; 1380 } 1381 1382 /* 1383 * If adding a new BAR, test if we can make it 64bit. We make it 1384 * prefetchable since QEMU MSI-X emulation has no read side effects 1385 * and doing so makes mapping more flexible. 1386 */ 1387 if (!vdev->bars[target_bar].size) { 1388 if (target_bar < (PCI_ROM_SLOT - 1) && 1389 !vdev->bars[target_bar + 1].size) { 1390 vdev->bars[target_bar].mem64 = true; 1391 vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64; 1392 } 1393 vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH; 1394 vdev->bars[target_bar].size = msix_sz; 1395 vdev->msix->table_offset = 0; 1396 } else { 1397 vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2, 1398 msix_sz * 2); 1399 /* 1400 * Due to above size calc, MSI-X always starts halfway into the BAR, 1401 * which will always be a separate host page. 1402 */ 1403 vdev->msix->table_offset = vdev->bars[target_bar].size / 2; 1404 } 1405 1406 vdev->msix->table_bar = target_bar; 1407 vdev->msix->pba_bar = target_bar; 1408 /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */ 1409 vdev->msix->pba_offset = vdev->msix->table_offset + 1410 (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE); 1411 1412 trace_vfio_msix_relo(vdev->vbasedev.name, 1413 vdev->msix->table_bar, vdev->msix->table_offset); 1414 } 1415 1416 /* 1417 * We don't have any control over how pci_add_capability() inserts 1418 * capabilities into the chain. In order to setup MSI-X we need a 1419 * MemoryRegion for the BAR. In order to setup the BAR and not 1420 * attempt to mmap the MSI-X table area, which VFIO won't allow, we 1421 * need to first look for where the MSI-X table lives. So we 1422 * unfortunately split MSI-X setup across two functions. 
1423 */ 1424 static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) 1425 { 1426 uint8_t pos; 1427 uint16_t ctrl; 1428 uint32_t table, pba; 1429 int fd = vdev->vbasedev.fd; 1430 VFIOMSIXInfo *msix; 1431 1432 pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); 1433 if (!pos) { 1434 return; 1435 } 1436 1437 if (pread(fd, &ctrl, sizeof(ctrl), 1438 vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) { 1439 error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS"); 1440 return; 1441 } 1442 1443 if (pread(fd, &table, sizeof(table), 1444 vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { 1445 error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE"); 1446 return; 1447 } 1448 1449 if (pread(fd, &pba, sizeof(pba), 1450 vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { 1451 error_setg_errno(errp, errno, "failed to read PCI MSIX PBA"); 1452 return; 1453 } 1454 1455 ctrl = le16_to_cpu(ctrl); 1456 table = le32_to_cpu(table); 1457 pba = le32_to_cpu(pba); 1458 1459 msix = g_malloc0(sizeof(*msix)); 1460 msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK; 1461 msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; 1462 msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK; 1463 msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; 1464 msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; 1465 1466 /* 1467 * Test the size of the pba_offset variable and catch if it extends outside 1468 * of the specified BAR. If it is the case, we need to apply a hardware 1469 * specific quirk if the device is known or we have a broken configuration. 1470 */ 1471 if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) { 1472 /* 1473 * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5 1474 * adapters. The T5 hardware returns an incorrect value of 0x8000 for 1475 * the VF PBA offset while the BAR itself is only 8k. The correct value 1476 * is 0x1000, so we hard code that here. 1477 */ 1478 if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO && 1479 (vdev->device_id & 0xff00) == 0x5800) { 1480 msix->pba_offset = 0x1000; 1481 } else if (vdev->msix_relo == OFF_AUTOPCIBAR_OFF) { 1482 error_setg(errp, "hardware reports invalid configuration, " 1483 "MSIX PBA outside of specified BAR"); 1484 g_free(msix); 1485 return; 1486 } 1487 } 1488 1489 trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar, 1490 msix->table_offset, msix->entries); 1491 vdev->msix = msix; 1492 1493 vfio_pci_fixup_msix_region(vdev); 1494 1495 vfio_pci_relocate_msix(vdev, errp); 1496 } 1497 1498 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) 1499 { 1500 int ret; 1501 Error *err = NULL; 1502 1503 vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) * 1504 sizeof(unsigned long)); 1505 ret = msix_init(&vdev->pdev, vdev->msix->entries, 1506 vdev->bars[vdev->msix->table_bar].mr, 1507 vdev->msix->table_bar, vdev->msix->table_offset, 1508 vdev->bars[vdev->msix->pba_bar].mr, 1509 vdev->msix->pba_bar, vdev->msix->pba_offset, pos, 1510 &err); 1511 if (ret < 0) { 1512 if (ret == -ENOTSUP) { 1513 warn_report_err(err); 1514 return 0; 1515 } 1516 1517 error_propagate(errp, err); 1518 return ret; 1519 } 1520 1521 /* 1522 * The PCI spec suggests that devices provide additional alignment for 1523 * MSI-X structures and avoid overlapping non-MSI-X related registers. 1524 * For an assigned device, this hopefully means that emulation of MSI-X 1525 * structures does not affect the performance of the device. 
If devices 1526 * fail to provide that alignment, a significant performance penalty may 1527 * result, for instance Mellanox MT27500 VFs: 1528 * http://www.spinics.net/lists/kvm/msg125881.html 1529 * 1530 * The PBA is simply not that important for such a serious regression and 1531 * most drivers do not appear to look at it. The solution for this is to 1532 * disable the PBA MemoryRegion unless it's being used. We disable it 1533 * here and only enable it if a masked vector fires through QEMU. As the 1534 * vector-use notifier is called, which occurs on unmask, we test whether 1535 * PBA emulation is needed and again disable if not. 1536 */ 1537 memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false); 1538 1539 /* 1540 * The emulated machine may provide a paravirt interface for MSIX setup 1541 * so it is not strictly necessary to emulate MSIX here. This becomes 1542 * helpful when frequently accessed MMIO registers are located in 1543 * subpages adjacent to the MSIX table but the MSIX data containing page 1544 * cannot be mapped because of a host page size bigger than the MSIX table 1545 * alignment. 1546 */ 1547 if (object_property_get_bool(OBJECT(qdev_get_machine()), 1548 "vfio-no-msix-emulation", NULL)) { 1549 memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false); 1550 } 1551 1552 return 0; 1553 } 1554 1555 static void vfio_teardown_msi(VFIOPCIDevice *vdev) 1556 { 1557 msi_uninit(&vdev->pdev); 1558 1559 if (vdev->msix) { 1560 msix_uninit(&vdev->pdev, 1561 vdev->bars[vdev->msix->table_bar].mr, 1562 vdev->bars[vdev->msix->pba_bar].mr); 1563 g_free(vdev->msix->pending); 1564 } 1565 } 1566 1567 /* 1568 * Resource setup 1569 */ 1570 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled) 1571 { 1572 int i; 1573 1574 for (i = 0; i < PCI_ROM_SLOT; i++) { 1575 vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled); 1576 } 1577 } 1578 1579 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) 1580 { 1581 VFIOBAR *bar = &vdev->bars[nr]; 1582 1583 uint32_t pci_bar; 1584 int ret; 1585 1586 /* Skip both unimplemented BARs and the upper half of 64bit BARS. */ 1587 if (!bar->region.size) { 1588 return; 1589 } 1590 1591 /* Determine what type of BAR this is for registration */ 1592 ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), 1593 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); 1594 if (ret != sizeof(pci_bar)) { 1595 error_report("vfio: Failed to read BAR %d (%m)", nr); 1596 return; 1597 } 1598 1599 pci_bar = le32_to_cpu(pci_bar); 1600 bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO); 1601 bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64); 1602 bar->type = pci_bar & (bar->ioport ? 
~PCI_BASE_ADDRESS_IO_MASK : 1603 ~PCI_BASE_ADDRESS_MEM_MASK); 1604 bar->size = bar->region.size; 1605 } 1606 1607 static void vfio_bars_prepare(VFIOPCIDevice *vdev) 1608 { 1609 int i; 1610 1611 for (i = 0; i < PCI_ROM_SLOT; i++) { 1612 vfio_bar_prepare(vdev, i); 1613 } 1614 } 1615 1616 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) 1617 { 1618 VFIOBAR *bar = &vdev->bars[nr]; 1619 char *name; 1620 1621 if (!bar->size) { 1622 return; 1623 } 1624 1625 bar->mr = g_new0(MemoryRegion, 1); 1626 name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr); 1627 memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size); 1628 g_free(name); 1629 1630 if (bar->region.size) { 1631 memory_region_add_subregion(bar->mr, 0, bar->region.mem); 1632 1633 if (vfio_region_mmap(&bar->region)) { 1634 error_report("Failed to mmap %s BAR %d. Performance may be slow", 1635 vdev->vbasedev.name, nr); 1636 } 1637 } 1638 1639 pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr); 1640 } 1641 1642 static void vfio_bars_register(VFIOPCIDevice *vdev) 1643 { 1644 int i; 1645 1646 for (i = 0; i < PCI_ROM_SLOT; i++) { 1647 vfio_bar_register(vdev, i); 1648 } 1649 } 1650 1651 static void vfio_bars_exit(VFIOPCIDevice *vdev) 1652 { 1653 int i; 1654 1655 for (i = 0; i < PCI_ROM_SLOT; i++) { 1656 VFIOBAR *bar = &vdev->bars[i]; 1657 1658 vfio_bar_quirk_exit(vdev, i); 1659 vfio_region_exit(&bar->region); 1660 if (bar->region.size) { 1661 memory_region_del_subregion(bar->mr, bar->region.mem); 1662 } 1663 } 1664 1665 if (vdev->vga) { 1666 pci_unregister_vga(&vdev->pdev); 1667 vfio_vga_quirk_exit(vdev); 1668 } 1669 } 1670 1671 static void vfio_bars_finalize(VFIOPCIDevice *vdev) 1672 { 1673 int i; 1674 1675 for (i = 0; i < PCI_ROM_SLOT; i++) { 1676 VFIOBAR *bar = &vdev->bars[i]; 1677 1678 vfio_bar_quirk_finalize(vdev, i); 1679 vfio_region_finalize(&bar->region); 1680 if (bar->size) { 1681 object_unparent(OBJECT(bar->mr)); 1682 g_free(bar->mr); 1683 } 1684 } 1685 1686 if (vdev->vga) { 1687 vfio_vga_quirk_finalize(vdev); 1688 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1689 object_unparent(OBJECT(&vdev->vga->region[i].mem)); 1690 } 1691 g_free(vdev->vga); 1692 } 1693 } 1694 1695 /* 1696 * General setup 1697 */ 1698 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos) 1699 { 1700 uint8_t tmp; 1701 uint16_t next = PCI_CONFIG_SPACE_SIZE; 1702 1703 for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp; 1704 tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) { 1705 if (tmp > pos && tmp < next) { 1706 next = tmp; 1707 } 1708 } 1709 1710 return next - pos; 1711 } 1712 1713 1714 static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos) 1715 { 1716 uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE; 1717 1718 for (tmp = PCI_CONFIG_SPACE_SIZE; tmp; 1719 tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) { 1720 if (tmp > pos && tmp < next) { 1721 next = tmp; 1722 } 1723 } 1724 1725 return next - pos; 1726 } 1727 1728 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask) 1729 { 1730 pci_set_word(buf, (pci_get_word(buf) & ~mask) | val); 1731 } 1732 1733 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos, 1734 uint16_t val, uint16_t mask) 1735 { 1736 vfio_set_word_bits(vdev->pdev.config + pos, val, mask); 1737 vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask); 1738 vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask); 1739 } 1740 1741 static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask) 1742 { 1743 pci_set_long(buf, 
(pci_get_long(buf) & ~mask) | val); 1744 } 1745 1746 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos, 1747 uint32_t val, uint32_t mask) 1748 { 1749 vfio_set_long_bits(vdev->pdev.config + pos, val, mask); 1750 vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask); 1751 vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask); 1752 } 1753 1754 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size, 1755 Error **errp) 1756 { 1757 uint16_t flags; 1758 uint8_t type; 1759 1760 flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS); 1761 type = (flags & PCI_EXP_FLAGS_TYPE) >> 4; 1762 1763 if (type != PCI_EXP_TYPE_ENDPOINT && 1764 type != PCI_EXP_TYPE_LEG_END && 1765 type != PCI_EXP_TYPE_RC_END) { 1766 1767 error_setg(errp, "assignment of PCIe type 0x%x " 1768 "devices is not currently supported", type); 1769 return -EINVAL; 1770 } 1771 1772 if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) { 1773 PCIBus *bus = pci_get_bus(&vdev->pdev); 1774 PCIDevice *bridge; 1775 1776 /* 1777 * Traditionally PCI device assignment exposes the PCIe capability 1778 * as-is on non-express buses. The reason being that some drivers 1779 * simply assume that it's there, for example tg3. However when 1780 * we're running on a native PCIe machine type, like Q35, we need 1781 * to hide the PCIe capability. The reason for this is twofold; 1782 * first Windows guests get a Code 10 error when the PCIe capability 1783 * is exposed in this configuration. Therefore express devices won't 1784 * work at all unless they're attached to express buses in the VM. 1785 * Second, a native PCIe machine introduces the possibility of fine 1786 * granularity IOMMUs supporting both translation and isolation. 1787 * Guest code to discover the IOMMU visibility of a device, such as 1788 * IOMMU grouping code on Linux, is very aware of device types and 1789 * valid transitions between bus types. An express device on a non- 1790 * express bus is not a valid combination on bare metal systems. 1791 * 1792 * Drivers that require a PCIe capability to make the device 1793 * functional are simply going to need to have their devices placed 1794 * on a PCIe bus in the VM. 1795 */ 1796 while (!pci_bus_is_root(bus)) { 1797 bridge = pci_bridge_get_device(bus); 1798 bus = pci_get_bus(bridge); 1799 } 1800 1801 if (pci_bus_is_express(bus)) { 1802 return 0; 1803 } 1804 1805 } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) { 1806 /* 1807 * On a Root Complex bus Endpoints become Root Complex Integrated 1808 * Endpoints, which changes the type and clears the LNK & LNK2 fields. 
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control go away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control go away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex. Windows
             * seems to be happier with devices if we skip the capability.
             */
            return 0;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                           QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
                           QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }
    }

    /*
     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
     * (Niantic errata #35) causing Windows to error with a Code 10 for the
     * device on Q35. Fixup any such devices to report version 1. If we
     * were to remove the capability entirely the guest would lose extended
     * config space.
1865 */ 1866 if ((flags & PCI_EXP_FLAGS_VERS) == 0) { 1867 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS, 1868 1, PCI_EXP_FLAGS_VERS); 1869 } 1870 1871 pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size, 1872 errp); 1873 if (pos < 0) { 1874 return pos; 1875 } 1876 1877 vdev->pdev.exp.exp_cap = pos; 1878 1879 return pos; 1880 } 1881 1882 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) 1883 { 1884 uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); 1885 1886 if (cap & PCI_EXP_DEVCAP_FLR) { 1887 trace_vfio_check_pcie_flr(vdev->vbasedev.name); 1888 vdev->has_flr = true; 1889 } 1890 } 1891 1892 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos) 1893 { 1894 uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL); 1895 1896 if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) { 1897 trace_vfio_check_pm_reset(vdev->vbasedev.name); 1898 vdev->has_pm_reset = true; 1899 } 1900 } 1901 1902 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) 1903 { 1904 uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP); 1905 1906 if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) { 1907 trace_vfio_check_af_flr(vdev->vbasedev.name); 1908 vdev->has_flr = true; 1909 } 1910 } 1911 1912 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) 1913 { 1914 PCIDevice *pdev = &vdev->pdev; 1915 uint8_t cap_id, next, size; 1916 int ret; 1917 1918 cap_id = pdev->config[pos]; 1919 next = pdev->config[pos + PCI_CAP_LIST_NEXT]; 1920 1921 /* 1922 * If it becomes important to configure capabilities to their actual 1923 * size, use this as the default when it's something we don't recognize. 1924 * Since QEMU doesn't actually handle many of the config accesses, 1925 * exact size doesn't seem worthwhile. 1926 */ 1927 size = vfio_std_cap_max_size(pdev, pos); 1928 1929 /* 1930 * pci_add_capability always inserts the new capability at the head 1931 * of the chain. Therefore to end up with a chain that matches the 1932 * physical device, we insert from the end by making this recursive. 1933 * This is also why we pre-calculate size above as cached config space 1934 * will be changed as we unwind the stack. 
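* For example, a physical chain A -> B -> C is rebuilt by recursing to C first; pci_add_capability() then makes C, then B, then A the head of the emulated list, so the guest again reads the chain as A -> B -> C.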
1935 */ 1936 if (next) { 1937 ret = vfio_add_std_cap(vdev, next, errp); 1938 if (ret) { 1939 return ret; 1940 } 1941 } else { 1942 /* Begin the rebuild, use QEMU emulated list bits */ 1943 pdev->config[PCI_CAPABILITY_LIST] = 0; 1944 vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff; 1945 vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST; 1946 1947 ret = vfio_add_virt_caps(vdev, errp); 1948 if (ret) { 1949 return ret; 1950 } 1951 } 1952 1953 /* Scale down size, esp in case virt caps were added above */ 1954 size = MIN(size, vfio_std_cap_max_size(pdev, pos)); 1955 1956 /* Use emulated next pointer to allow dropping caps */ 1957 pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff); 1958 1959 switch (cap_id) { 1960 case PCI_CAP_ID_MSI: 1961 ret = vfio_msi_setup(vdev, pos, errp); 1962 break; 1963 case PCI_CAP_ID_EXP: 1964 vfio_check_pcie_flr(vdev, pos); 1965 ret = vfio_setup_pcie_cap(vdev, pos, size, errp); 1966 break; 1967 case PCI_CAP_ID_MSIX: 1968 ret = vfio_msix_setup(vdev, pos, errp); 1969 break; 1970 case PCI_CAP_ID_PM: 1971 vfio_check_pm_reset(vdev, pos); 1972 vdev->pm_cap = pos; 1973 ret = pci_add_capability(pdev, cap_id, pos, size, errp); 1974 break; 1975 case PCI_CAP_ID_AF: 1976 vfio_check_af_flr(vdev, pos); 1977 ret = pci_add_capability(pdev, cap_id, pos, size, errp); 1978 break; 1979 default: 1980 ret = pci_add_capability(pdev, cap_id, pos, size, errp); 1981 break; 1982 } 1983 1984 if (ret < 0) { 1985 error_prepend(errp, 1986 "failed to add PCI capability 0x%x[0x%x]@0x%x: ", 1987 cap_id, size, pos); 1988 return ret; 1989 } 1990 1991 return 0; 1992 } 1993 1994 static void vfio_add_ext_cap(VFIOPCIDevice *vdev) 1995 { 1996 PCIDevice *pdev = &vdev->pdev; 1997 uint32_t header; 1998 uint16_t cap_id, next, size; 1999 uint8_t cap_ver; 2000 uint8_t *config; 2001 2002 /* Only add extended caps if we have them and the guest can see them */ 2003 if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) || 2004 !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) { 2005 return; 2006 } 2007 2008 /* 2009 * pcie_add_capability always inserts the new capability at the tail 2010 * of the chain. Therefore to end up with a chain that matches the 2011 * physical device, we cache the config space to avoid overwriting 2012 * the original config space when we parse the extended capabilities. 2013 */ 2014 config = g_memdup(pdev->config, vdev->config_size); 2015 2016 /* 2017 * Extended capabilities are chained with each pointing to the next, so we 2018 * can drop anything other than the head of the chain simply by modifying 2019 * the previous next pointer. Seed the head of the chain here such that 2020 * we can simply skip any capabilities we want to drop below, regardless 2021 * of their position in the chain. If this stub capability still exists 2022 * after we add the capabilities we want to expose, update the capability 2023 * ID to zero. Note that we cannot seed with the capability header being 2024 * zero as this conflicts with definition of an absent capability chain 2025 * and prevents capabilities beyond the head of the list from being added. 2026 * By replacing the dummy capability ID with zero after walking the device 2027 * chain, we also transparently mark extended capabilities as absent if 2028 * no capabilities were added. Note that the PCIe spec defines an absence 2029 * of extended capabilities to be determined by a value of zero for the 2030 * capability ID, version, AND next pointer. 
A non-zero next pointer 2031 * should be sufficient to indicate additional capabilities are present, 2032 * which will occur if we call pcie_add_capability() below. The entire 2033 * first dword is emulated to support this. 2034 * 2035 * NB. The kernel side does similar masking, so be prepared that our 2036 * view of the device may also contain a capability ID zero in the head 2037 * of the chain. Skip it for the same reason that we cannot seed the 2038 * chain with a zero capability. 2039 */ 2040 pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE, 2041 PCI_EXT_CAP(0xFFFF, 0, 0)); 2042 pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0); 2043 pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0); 2044 2045 for (next = PCI_CONFIG_SPACE_SIZE; next; 2046 next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) { 2047 header = pci_get_long(config + next); 2048 cap_id = PCI_EXT_CAP_ID(header); 2049 cap_ver = PCI_EXT_CAP_VER(header); 2050 2051 /* 2052 * If it becomes important to configure extended capabilities to their 2053 * actual size, use this as the default when it's something we don't 2054 * recognize. Since QEMU doesn't actually handle many of the config 2055 * accesses, exact size doesn't seem worthwhile. 2056 */ 2057 size = vfio_ext_cap_max_size(config, next); 2058 2059 /* Use emulated next pointer to allow dropping extended caps */ 2060 pci_long_test_and_set_mask(vdev->emulated_config_bits + next, 2061 PCI_EXT_CAP_NEXT_MASK); 2062 2063 switch (cap_id) { 2064 case 0: /* kernel masked capability */ 2065 case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */ 2066 case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */ 2067 case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */ 2068 trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next); 2069 break; 2070 default: 2071 pcie_add_capability(pdev, cap_id, cap_ver, next, size); 2072 } 2073 2074 } 2075 2076 /* Clean up chain head ID if necessary */ 2077 if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { 2078 pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); 2079 } 2080 2081 g_free(config); 2082 return; 2083 } 2084 2085 static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) 2086 { 2087 PCIDevice *pdev = &vdev->pdev; 2088 int ret; 2089 2090 if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) || 2091 !pdev->config[PCI_CAPABILITY_LIST]) { 2092 return 0; /* Nothing to add */ 2093 } 2094 2095 ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp); 2096 if (ret) { 2097 return ret; 2098 } 2099 2100 vfio_add_ext_cap(vdev); 2101 return 0; 2102 } 2103 2104 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) 2105 { 2106 PCIDevice *pdev = &vdev->pdev; 2107 uint16_t cmd; 2108 2109 vfio_disable_interrupts(vdev); 2110 2111 /* Make sure the device is in D0 */ 2112 if (vdev->pm_cap) { 2113 uint16_t pmcsr; 2114 uint8_t state; 2115 2116 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); 2117 state = pmcsr & PCI_PM_CTRL_STATE_MASK; 2118 if (state) { 2119 pmcsr &= ~PCI_PM_CTRL_STATE_MASK; 2120 vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); 2121 /* vfio handles the necessary delay here */ 2122 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); 2123 state = pmcsr & PCI_PM_CTRL_STATE_MASK; 2124 if (state) { 2125 error_report("vfio: Unable to power on device, stuck in D%d", 2126 state); 2127 } 2128 } 2129 } 2130 2131 /* 2132 * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2133 * Also put INTx Disable in known state. 2134 */ 2135 cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); 2136 cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | 2137 PCI_COMMAND_INTX_DISABLE); 2138 vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); 2139 } 2140 2141 static void vfio_pci_post_reset(VFIOPCIDevice *vdev) 2142 { 2143 Error *err = NULL; 2144 int nr; 2145 2146 vfio_intx_enable(vdev, &err); 2147 if (err) { 2148 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2149 } 2150 2151 for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) { 2152 off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); 2153 uint32_t val = 0; 2154 uint32_t len = sizeof(val); 2155 2156 if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { 2157 error_report("%s(%s) reset bar %d failed: %m", __func__, 2158 vdev->vbasedev.name, nr); 2159 } 2160 } 2161 2162 vfio_quirk_reset(vdev); 2163 } 2164 2165 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) 2166 { 2167 char tmp[13]; 2168 2169 sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain, 2170 addr->bus, addr->slot, addr->function); 2171 2172 return (strcmp(tmp, name) == 0); 2173 } 2174 2175 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) 2176 { 2177 VFIOGroup *group; 2178 struct vfio_pci_hot_reset_info *info; 2179 struct vfio_pci_dependent_device *devices; 2180 struct vfio_pci_hot_reset *reset; 2181 int32_t *fds; 2182 int ret, i, count; 2183 bool multi = false; 2184 2185 trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); 2186 2187 if (!single) { 2188 vfio_pci_pre_reset(vdev); 2189 } 2190 vdev->vbasedev.needs_reset = false; 2191 2192 info = g_malloc0(sizeof(*info)); 2193 info->argsz = sizeof(*info); 2194 2195 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); 2196 if (ret && errno != ENOSPC) { 2197 ret = -errno; 2198 if (!vdev->has_pm_reset) { 2199 error_report("vfio: Cannot reset device %s, " 2200 "no available reset mechanism.", vdev->vbasedev.name); 2201 } 2202 goto out_single; 2203 } 2204 2205 count = info->count; 2206 info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); 2207 info->argsz = sizeof(*info) + (count * sizeof(*devices)); 2208 devices = &info->devices[0]; 2209 2210 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); 2211 if (ret) { 2212 ret = -errno; 2213 error_report("vfio: hot reset info failed: %m"); 2214 goto out_single; 2215 } 2216 2217 trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); 2218 2219 /* Verify that we have all the groups required */ 2220 for (i = 0; i < info->count; i++) { 2221 PCIHostDeviceAddress host; 2222 VFIOPCIDevice *tmp; 2223 VFIODevice *vbasedev_iter; 2224 2225 host.domain = devices[i].segment; 2226 host.bus = devices[i].bus; 2227 host.slot = PCI_SLOT(devices[i].devfn); 2228 host.function = PCI_FUNC(devices[i].devfn); 2229 2230 trace_vfio_pci_hot_reset_dep_devices(host.domain, 2231 host.bus, host.slot, host.function, devices[i].group_id); 2232 2233 if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { 2234 continue; 2235 } 2236 2237 QLIST_FOREACH(group, &vfio_group_list, next) { 2238 if (group->groupid == devices[i].group_id) { 2239 break; 2240 } 2241 } 2242 2243 if (!group) { 2244 if (!vdev->has_pm_reset) { 2245 error_report("vfio: Cannot reset device %s, " 2246 "depends on group %d which is not owned.", 2247 vdev->vbasedev.name, devices[i].group_id); 2248 } 2249 ret = -EPERM; 2250 goto out; 2251 } 2252 2253 /* Prep dependent devices for reset and clear our 
marker. */ 2254 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 2255 if (!vbasedev_iter->dev->realized || 2256 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { 2257 continue; 2258 } 2259 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); 2260 if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { 2261 if (single) { 2262 ret = -EINVAL; 2263 goto out_single; 2264 } 2265 vfio_pci_pre_reset(tmp); 2266 tmp->vbasedev.needs_reset = false; 2267 multi = true; 2268 break; 2269 } 2270 } 2271 } 2272 2273 if (!single && !multi) { 2274 ret = -EINVAL; 2275 goto out_single; 2276 } 2277 2278 /* Determine how many group fds need to be passed */ 2279 count = 0; 2280 QLIST_FOREACH(group, &vfio_group_list, next) { 2281 for (i = 0; i < info->count; i++) { 2282 if (group->groupid == devices[i].group_id) { 2283 count++; 2284 break; 2285 } 2286 } 2287 } 2288 2289 reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); 2290 reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); 2291 fds = &reset->group_fds[0]; 2292 2293 /* Fill in group fds */ 2294 QLIST_FOREACH(group, &vfio_group_list, next) { 2295 for (i = 0; i < info->count; i++) { 2296 if (group->groupid == devices[i].group_id) { 2297 fds[reset->count++] = group->fd; 2298 break; 2299 } 2300 } 2301 } 2302 2303 /* Bus reset! */ 2304 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); 2305 g_free(reset); 2306 2307 trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, 2308 ret ? "%m" : "Success"); 2309 2310 out: 2311 /* Re-enable INTx on affected devices */ 2312 for (i = 0; i < info->count; i++) { 2313 PCIHostDeviceAddress host; 2314 VFIOPCIDevice *tmp; 2315 VFIODevice *vbasedev_iter; 2316 2317 host.domain = devices[i].segment; 2318 host.bus = devices[i].bus; 2319 host.slot = PCI_SLOT(devices[i].devfn); 2320 host.function = PCI_FUNC(devices[i].devfn); 2321 2322 if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { 2323 continue; 2324 } 2325 2326 QLIST_FOREACH(group, &vfio_group_list, next) { 2327 if (group->groupid == devices[i].group_id) { 2328 break; 2329 } 2330 } 2331 2332 if (!group) { 2333 break; 2334 } 2335 2336 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 2337 if (!vbasedev_iter->dev->realized || 2338 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { 2339 continue; 2340 } 2341 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); 2342 if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { 2343 vfio_pci_post_reset(tmp); 2344 break; 2345 } 2346 } 2347 } 2348 out_single: 2349 if (!single) { 2350 vfio_pci_post_reset(vdev); 2351 } 2352 g_free(info); 2353 2354 return ret; 2355 } 2356 2357 /* 2358 * We want to differentiate hot reset of multiple in-use devices vs hot reset 2359 * of a single in-use device. VFIO_DEVICE_RESET will already handle the case 2360 * of doing hot resets when there is only a single device per bus. In-use 2361 * here refers to how many VFIODevices are affected. A hot reset that affects 2362 * multiple devices, but only a single in-use device, means that we can call 2363 * it from our bus ->reset() callback since the extent is effectively a single 2364 * device. This allows us to make use of it in the hotplug path. When there 2365 * are multiple in-use devices, we can only trigger the hot reset during a 2366 * system reset and thus from our reset handler. We separate _one vs _multi 2367 * here so that we don't overlap and do a double reset on the system reset 2368 * path where both our reset handler and ->reset() callback are used.
Calling 2369 * _one() will only do a hot reset for the one in-use device case; calling 2370 * _multi() will do nothing if a _one() would have been sufficient. 2371 */ 2372 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev) 2373 { 2374 return vfio_pci_hot_reset(vdev, true); 2375 } 2376 2377 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev) 2378 { 2379 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 2380 return vfio_pci_hot_reset(vdev, false); 2381 } 2382 2383 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) 2384 { 2385 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 2386 if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) { 2387 vbasedev->needs_reset = true; 2388 } 2389 } 2390 2391 static VFIODeviceOps vfio_pci_ops = { 2392 .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, 2393 .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, 2394 .vfio_eoi = vfio_intx_eoi, 2395 }; 2396 2397 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) 2398 { 2399 VFIODevice *vbasedev = &vdev->vbasedev; 2400 struct vfio_region_info *reg_info; 2401 int ret; 2402 2403 ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info); 2404 if (ret) { 2405 error_setg_errno(errp, -ret, 2406 "failed getting region info for VGA region index %d", 2407 VFIO_PCI_VGA_REGION_INDEX); 2408 return ret; 2409 } 2410 2411 if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) || 2412 !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) || 2413 reg_info->size < 0xbffff + 1) { 2414 error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx", 2415 (unsigned long)reg_info->flags, 2416 (unsigned long)reg_info->size); 2417 g_free(reg_info); 2418 return -EINVAL; 2419 } 2420 2421 vdev->vga = g_new0(VFIOVGA, 1); 2422 2423 vdev->vga->fd_offset = reg_info->offset; 2424 vdev->vga->fd = vdev->vbasedev.fd; 2425 2426 g_free(reg_info); 2427 2428 vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; 2429 vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; 2430 QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks); 2431 2432 memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem, 2433 OBJECT(vdev), &vfio_vga_ops, 2434 &vdev->vga->region[QEMU_PCI_VGA_MEM], 2435 "vfio-vga-mmio@0xa0000", 2436 QEMU_PCI_VGA_MEM_SIZE); 2437 2438 vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; 2439 vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; 2440 QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks); 2441 2442 memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, 2443 OBJECT(vdev), &vfio_vga_ops, 2444 &vdev->vga->region[QEMU_PCI_VGA_IO_LO], 2445 "vfio-vga-io@0x3b0", 2446 QEMU_PCI_VGA_IO_LO_SIZE); 2447 2448 vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; 2449 vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; 2450 QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks); 2451 2452 memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 2453 OBJECT(vdev), &vfio_vga_ops, 2454 &vdev->vga->region[QEMU_PCI_VGA_IO_HI], 2455 "vfio-vga-io@0x3c0", 2456 QEMU_PCI_VGA_IO_HI_SIZE); 2457 2458 pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, 2459 &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, 2460 &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); 2461 2462 return 0; 2463 } 2464 2465 static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) 2466 { 2467 VFIODevice *vbasedev = &vdev->vbasedev; 2468 struct vfio_region_info
*reg_info; 2469 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; 2470 int i, ret = -1; 2471 2472 /* Sanity check device */ 2473 if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) { 2474 error_setg(errp, "this isn't a PCI device"); 2475 return; 2476 } 2477 2478 if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) { 2479 error_setg(errp, "unexpected number of io regions %u", 2480 vbasedev->num_regions); 2481 return; 2482 } 2483 2484 if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) { 2485 error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs); 2486 return; 2487 } 2488 2489 for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { 2490 char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i); 2491 2492 ret = vfio_region_setup(OBJECT(vdev), vbasedev, 2493 &vdev->bars[i].region, i, name); 2494 g_free(name); 2495 2496 if (ret) { 2497 error_setg_errno(errp, -ret, "failed to get region %d info", i); 2498 return; 2499 } 2500 2501 QLIST_INIT(&vdev->bars[i].quirks); 2502 } 2503 2504 ret = vfio_get_region_info(vbasedev, 2505 VFIO_PCI_CONFIG_REGION_INDEX, &reg_info); 2506 if (ret) { 2507 error_setg_errno(errp, -ret, "failed to get config info"); 2508 return; 2509 } 2510 2511 trace_vfio_populate_device_config(vdev->vbasedev.name, 2512 (unsigned long)reg_info->size, 2513 (unsigned long)reg_info->offset, 2514 (unsigned long)reg_info->flags); 2515 2516 vdev->config_size = reg_info->size; 2517 if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) { 2518 vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS; 2519 } 2520 vdev->config_offset = reg_info->offset; 2521 2522 g_free(reg_info); 2523 2524 if (vdev->features & VFIO_FEATURE_ENABLE_VGA) { 2525 ret = vfio_populate_vga(vdev, errp); 2526 if (ret) { 2527 error_append_hint(errp, "device does not support " 2528 "requested feature x-vga\n"); 2529 return; 2530 } 2531 } 2532 2533 irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; 2534 2535 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); 2536 if (ret) { 2537 /* This can fail for an old kernel or legacy PCI dev */ 2538 trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); 2539 } else if (irq_info.count == 1) { 2540 vdev->pci_aer = true; 2541 } else { 2542 warn_report(VFIO_MSG_PREFIX 2543 "Could not enable error recovery for the device", 2544 vbasedev->name); 2545 } 2546 } 2547 2548 static void vfio_put_device(VFIOPCIDevice *vdev) 2549 { 2550 g_free(vdev->vbasedev.name); 2551 g_free(vdev->msix); 2552 2553 vfio_put_base_device(&vdev->vbasedev); 2554 } 2555 2556 static void vfio_err_notifier_handler(void *opaque) 2557 { 2558 VFIOPCIDevice *vdev = opaque; 2559 2560 if (!event_notifier_test_and_clear(&vdev->err_notifier)) { 2561 return; 2562 } 2563 2564 /* 2565 * TBD. Retrieve the error details and decide what action 2566 * needs to be taken. One of the actions could be to pass 2567 * the error to the guest and have the guest driver recover 2568 * from the error. This requires that PCIe capabilities be 2569 * exposed to the guest. For now, we just terminate the 2570 * guest to contain the error. 2571 */ 2572 2573 error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name); 2574 2575 vm_stop(RUN_STATE_INTERNAL_ERROR); 2576 } 2577 2578 /* 2579 * Registers error notifier for devices supporting error recovery. 2580 * If we encounter a failure in this function, we report an error 2581 * and continue after disabling error recovery support for the 2582 * device.
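* The flow below is: allocate an eventfd with event_notifier_init(), watch its file descriptor from the main loop via qemu_set_fd_handler(), and ask the kernel to signal it for VFIO_PCI_ERR_IRQ_INDEX through vfio_set_irq_signaling().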
2583 */ 2584 static void vfio_register_err_notifier(VFIOPCIDevice *vdev) 2585 { 2586 Error *err = NULL; 2587 int32_t fd; 2588 2589 if (!vdev->pci_aer) { 2590 return; 2591 } 2592 2593 if (event_notifier_init(&vdev->err_notifier, 0)) { 2594 error_report("vfio: Unable to init event notifier for error detection"); 2595 vdev->pci_aer = false; 2596 return; 2597 } 2598 2599 fd = event_notifier_get_fd(&vdev->err_notifier); 2600 qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); 2601 2602 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, 2603 VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { 2604 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2605 qemu_set_fd_handler(fd, NULL, NULL, vdev); 2606 event_notifier_cleanup(&vdev->err_notifier); 2607 vdev->pci_aer = false; 2608 } 2609 } 2610 2611 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) 2612 { 2613 Error *err = NULL; 2614 2615 if (!vdev->pci_aer) { 2616 return; 2617 } 2618 2619 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, 2620 VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { 2621 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2622 } 2623 qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), 2624 NULL, NULL, vdev); 2625 event_notifier_cleanup(&vdev->err_notifier); 2626 } 2627 2628 static void vfio_req_notifier_handler(void *opaque) 2629 { 2630 VFIOPCIDevice *vdev = opaque; 2631 Error *err = NULL; 2632 2633 if (!event_notifier_test_and_clear(&vdev->req_notifier)) { 2634 return; 2635 } 2636 2637 qdev_unplug(DEVICE(vdev), &err); 2638 if (err) { 2639 warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2640 } 2641 } 2642 2643 static void vfio_register_req_notifier(VFIOPCIDevice *vdev) 2644 { 2645 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), 2646 .index = VFIO_PCI_REQ_IRQ_INDEX }; 2647 Error *err = NULL; 2648 int32_t fd; 2649 2650 if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { 2651 return; 2652 } 2653 2654 if (ioctl(vdev->vbasedev.fd, 2655 VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { 2656 return; 2657 } 2658 2659 if (event_notifier_init(&vdev->req_notifier, 0)) { 2660 error_report("vfio: Unable to init event notifier for device request"); 2661 return; 2662 } 2663 2664 fd = event_notifier_get_fd(&vdev->req_notifier); 2665 qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); 2666 2667 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, 2668 VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { 2669 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2670 qemu_set_fd_handler(fd, NULL, NULL, vdev); 2671 event_notifier_cleanup(&vdev->req_notifier); 2672 } else { 2673 vdev->req_enabled = true; 2674 } 2675 } 2676 2677 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) 2678 { 2679 Error *err = NULL; 2680 2681 if (!vdev->req_enabled) { 2682 return; 2683 } 2684 2685 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, 2686 VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { 2687 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2688 } 2689 qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), 2690 NULL, NULL, vdev); 2691 event_notifier_cleanup(&vdev->req_notifier); 2692 2693 vdev->req_enabled = false; 2694 } 2695 2696 static void vfio_realize(PCIDevice *pdev, Error **errp) 2697 { 2698 VFIOPCIDevice *vdev = PCI_VFIO(pdev); 2699 VFIODevice *vbasedev_iter; 2700 VFIOGroup *group; 2701 char *tmp, *subsys, group_path[PATH_MAX], *group_name; 2702 Error *err 
= NULL; 2703 ssize_t len; 2704 struct stat st; 2705 int groupid; 2706 int i, ret; 2707 bool is_mdev; 2708 2709 if (!vdev->vbasedev.sysfsdev) { 2710 if (!(~vdev->host.domain || ~vdev->host.bus || 2711 ~vdev->host.slot || ~vdev->host.function)) { 2712 error_setg(errp, "No provided host device"); 2713 error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " 2714 "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); 2715 return; 2716 } 2717 vdev->vbasedev.sysfsdev = 2718 g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", 2719 vdev->host.domain, vdev->host.bus, 2720 vdev->host.slot, vdev->host.function); 2721 } 2722 2723 if (stat(vdev->vbasedev.sysfsdev, &st) < 0) { 2724 error_setg_errno(errp, errno, "no such host device"); 2725 error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.sysfsdev); 2726 return; 2727 } 2728 2729 vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev); 2730 vdev->vbasedev.ops = &vfio_pci_ops; 2731 vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; 2732 vdev->vbasedev.dev = DEVICE(vdev); 2733 2734 tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev); 2735 len = readlink(tmp, group_path, sizeof(group_path)); 2736 g_free(tmp); 2737 2738 if (len <= 0 || len >= sizeof(group_path)) { 2739 error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG, 2740 "no iommu_group found"); 2741 goto error; 2742 } 2743 2744 group_path[len] = 0; 2745 2746 group_name = basename(group_path); 2747 if (sscanf(group_name, "%d", &groupid) != 1) { 2748 error_setg_errno(errp, errno, "failed to read %s", group_path); 2749 goto error; 2750 } 2751 2752 trace_vfio_realize(vdev->vbasedev.name, groupid); 2753 2754 group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp); 2755 if (!group) { 2756 goto error; 2757 } 2758 2759 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 2760 if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) { 2761 error_setg(errp, "device is already attached"); 2762 vfio_put_group(group); 2763 goto error; 2764 } 2765 } 2766 2767 /* 2768 * Mediated devices *might* operate compatibly with memory ballooning, but 2769 * we cannot know for certain, it depends on whether the mdev vendor driver 2770 * stays in sync with the active working set of the guest driver. Prevent 2771 * the x-balloon-allowed option unless this is minimally an mdev device. 2772 */ 2773 tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev); 2774 subsys = realpath(tmp, NULL); 2775 g_free(tmp); 2776 is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); 2777 free(subsys); 2778 2779 trace_vfio_mdev(vdev->vbasedev.name, is_mdev); 2780 2781 if (vdev->vbasedev.balloon_allowed && !is_mdev) { 2782 error_setg(errp, "x-balloon-allowed only potentially compatible " 2783 "with mdev devices"); 2784 vfio_put_group(group); 2785 goto error; 2786 } 2787 2788 ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp); 2789 if (ret) { 2790 vfio_put_group(group); 2791 goto error; 2792 } 2793 2794 vfio_populate_device(vdev, &err); 2795 if (err) { 2796 error_propagate(errp, err); 2797 goto error; 2798 } 2799 2800 /* Get a copy of config space */ 2801 ret = pread(vdev->vbasedev.fd, vdev->pdev.config, 2802 MIN(pci_config_size(&vdev->pdev), vdev->config_size), 2803 vdev->config_offset); 2804 if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { 2805 ret = ret < 0 ? 
-errno : -EFAULT; 2806 error_setg_errno(errp, -ret, "failed to read device config space"); 2807 goto error; 2808 } 2809 2810 /* vfio emulates a lot for us, but some bits need extra love */ 2811 vdev->emulated_config_bits = g_malloc0(vdev->config_size); 2812 2813 /* QEMU can choose to expose the ROM or not */ 2814 memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4); 2815 /* QEMU can also add or extend BARs */ 2816 memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4); 2817 2818 /* 2819 * The PCI spec reserves vendor ID 0xffff as an invalid value. The 2820 * device ID is managed by the vendor and need only be a 16-bit value. 2821 * Allow any 16-bit value for subsystem so they can be hidden or changed. 2822 */ 2823 if (vdev->vendor_id != PCI_ANY_ID) { 2824 if (vdev->vendor_id >= 0xffff) { 2825 error_setg(errp, "invalid PCI vendor ID provided"); 2826 goto error; 2827 } 2828 vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); 2829 trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id); 2830 } else { 2831 vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID); 2832 } 2833 2834 if (vdev->device_id != PCI_ANY_ID) { 2835 if (vdev->device_id > 0xffff) { 2836 error_setg(errp, "invalid PCI device ID provided"); 2837 goto error; 2838 } 2839 vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); 2840 trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id); 2841 } else { 2842 vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID); 2843 } 2844 2845 if (vdev->sub_vendor_id != PCI_ANY_ID) { 2846 if (vdev->sub_vendor_id > 0xffff) { 2847 error_setg(errp, "invalid PCI subsystem vendor ID provided"); 2848 goto error; 2849 } 2850 vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, 2851 vdev->sub_vendor_id, ~0); 2852 trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name, 2853 vdev->sub_vendor_id); 2854 } 2855 2856 if (vdev->sub_device_id != PCI_ANY_ID) { 2857 if (vdev->sub_device_id > 0xffff) { 2858 error_setg(errp, "invalid PCI subsystem device ID provided"); 2859 goto error; 2860 } 2861 vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); 2862 trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name, 2863 vdev->sub_device_id); 2864 } 2865 2866 /* QEMU can change multi-function devices to single function, or reverse */ 2867 vdev->emulated_config_bits[PCI_HEADER_TYPE] = 2868 PCI_HEADER_TYPE_MULTI_FUNCTION; 2869 2870 /* Restore or clear multifunction, this is always controlled by QEMU */ 2871 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { 2872 vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; 2873 } else { 2874 vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION; 2875 } 2876 2877 /* 2878 * Clear host resource mapping info. If we choose not to register a 2879 * BAR, such as might be the case with the option ROM, we can get 2880 * confusing, unwritable, residual addresses from the host here. 
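* The memset() calls below zero the six 32-bit BARs starting at PCI_BASE_ADDRESS_0 (24 bytes) and the 4-byte expansion ROM address, leaving the guest to program its own BAR values.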
2881 */ 2882 memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); 2883 memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); 2884 2885 vfio_pci_size_rom(vdev); 2886 2887 vfio_bars_prepare(vdev); 2888 2889 vfio_msix_early_setup(vdev, &err); 2890 if (err) { 2891 error_propagate(errp, err); 2892 goto error; 2893 } 2894 2895 vfio_bars_register(vdev); 2896 2897 ret = vfio_add_capabilities(vdev, errp); 2898 if (ret) { 2899 goto out_teardown; 2900 } 2901 2902 if (vdev->vga) { 2903 vfio_vga_quirk_setup(vdev); 2904 } 2905 2906 for (i = 0; i < PCI_ROM_SLOT; i++) { 2907 vfio_bar_quirk_setup(vdev, i); 2908 } 2909 2910 if (!vdev->igd_opregion && 2911 vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) { 2912 struct vfio_region_info *opregion; 2913 2914 if (vdev->pdev.qdev.hotplugged) { 2915 error_setg(errp, 2916 "cannot support IGD OpRegion feature on hotplugged " 2917 "device"); 2918 goto out_teardown; 2919 } 2920 2921 ret = vfio_get_dev_region_info(&vdev->vbasedev, 2922 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, 2923 VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); 2924 if (ret) { 2925 error_setg_errno(errp, -ret, 2926 "does not support requested IGD OpRegion feature"); 2927 goto out_teardown; 2928 } 2929 2930 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); 2931 g_free(opregion); 2932 if (ret) { 2933 goto out_teardown; 2934 } 2935 } 2936 2937 /* QEMU emulates all of MSI & MSIX */ 2938 if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { 2939 memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, 2940 MSIX_CAP_LENGTH); 2941 } 2942 2943 if (pdev->cap_present & QEMU_PCI_CAP_MSI) { 2944 memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff, 2945 vdev->msi_cap_size); 2946 } 2947 2948 if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { 2949 vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, 2950 vfio_intx_mmap_enable, vdev); 2951 pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update); 2952 ret = vfio_intx_enable(vdev, errp); 2953 if (ret) { 2954 goto out_teardown; 2955 } 2956 } 2957 2958 if (vdev->display != ON_OFF_AUTO_OFF) { 2959 ret = vfio_display_probe(vdev, errp); 2960 if (ret) { 2961 goto out_teardown; 2962 } 2963 } 2964 if (vdev->enable_ramfb && vdev->dpy == NULL) { 2965 error_setg(errp, "ramfb=on requires display=on"); 2966 goto out_teardown; 2967 } 2968 if (vdev->display_xres || vdev->display_yres) { 2969 if (vdev->dpy == NULL) { 2970 error_setg(errp, "xres and yres properties require display=on"); 2971 goto out_teardown; 2972 } 2973 if (vdev->dpy->edid_regs == NULL) { 2974 error_setg(errp, "xres and yres properties need edid support"); 2975 goto out_teardown; 2976 } 2977 } 2978 2979 if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) { 2980 ret = vfio_pci_nvidia_v100_ram_init(vdev, errp); 2981 if (ret && ret != -ENODEV) { 2982 error_report("Failed to setup NVIDIA V100 GPU RAM"); 2983 } 2984 } 2985 2986 if (vdev->vendor_id == PCI_VENDOR_ID_IBM) { 2987 ret = vfio_pci_nvlink2_init(vdev, errp); 2988 if (ret && ret != -ENODEV) { 2989 error_report("Failed to setup NVlink2 bridge"); 2990 } 2991 } 2992 2993 vfio_register_err_notifier(vdev); 2994 vfio_register_req_notifier(vdev); 2995 vfio_setup_resetfn_quirk(vdev); 2996 2997 return; 2998 2999 out_teardown: 3000 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); 3001 vfio_teardown_msi(vdev); 3002 vfio_bars_exit(vdev); 3003 error: 3004 error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); 3005 } 3006 3007 static void vfio_instance_finalize(Object *obj) 3008 { 3009 VFIOPCIDevice *vdev = PCI_VFIO(obj); 
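/* Cache the group before tearing the device down; vfio_put_group() must run only after vfio_put_device() has released the device. */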
3010 VFIOGroup *group = vdev->vbasedev.group; 3011 3012 vfio_display_finalize(vdev); 3013 vfio_bars_finalize(vdev); 3014 g_free(vdev->emulated_config_bits); 3015 g_free(vdev->rom); 3016 /* 3017 * XXX Leaking igd_opregion is not an oversight, we can't remove the 3018 * fw_cfg entry therefore leaking this allocation seems like the safest 3019 * option. 3020 * 3021 * g_free(vdev->igd_opregion); 3022 */ 3023 vfio_put_device(vdev); 3024 vfio_put_group(group); 3025 } 3026 3027 static void vfio_exitfn(PCIDevice *pdev) 3028 { 3029 VFIOPCIDevice *vdev = PCI_VFIO(pdev); 3030 3031 vfio_unregister_req_notifier(vdev); 3032 vfio_unregister_err_notifier(vdev); 3033 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); 3034 vfio_disable_interrupts(vdev); 3035 if (vdev->intx.mmap_timer) { 3036 timer_free(vdev->intx.mmap_timer); 3037 } 3038 vfio_teardown_msi(vdev); 3039 vfio_bars_exit(vdev); 3040 } 3041 3042 static void vfio_pci_reset(DeviceState *dev) 3043 { 3044 VFIOPCIDevice *vdev = PCI_VFIO(dev); 3045 3046 trace_vfio_pci_reset(vdev->vbasedev.name); 3047 3048 vfio_pci_pre_reset(vdev); 3049 3050 if (vdev->display != ON_OFF_AUTO_OFF) { 3051 vfio_display_reset(vdev); 3052 } 3053 3054 if (vdev->resetfn && !vdev->resetfn(vdev)) { 3055 goto post_reset; 3056 } 3057 3058 if (vdev->vbasedev.reset_works && 3059 (vdev->has_flr || !vdev->has_pm_reset) && 3060 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) { 3061 trace_vfio_pci_reset_flr(vdev->vbasedev.name); 3062 goto post_reset; 3063 } 3064 3065 /* See if we can do our own bus reset */ 3066 if (!vfio_pci_hot_reset_one(vdev)) { 3067 goto post_reset; 3068 } 3069 3070 /* If nothing else works and the device supports PM reset, use it */ 3071 if (vdev->vbasedev.reset_works && vdev->has_pm_reset && 3072 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) { 3073 trace_vfio_pci_reset_pm(vdev->vbasedev.name); 3074 goto post_reset; 3075 } 3076 3077 post_reset: 3078 vfio_pci_post_reset(vdev); 3079 } 3080 3081 static void vfio_instance_init(Object *obj) 3082 { 3083 PCIDevice *pci_dev = PCI_DEVICE(obj); 3084 VFIOPCIDevice *vdev = PCI_VFIO(obj); 3085 3086 device_add_bootindex_property(obj, &vdev->bootindex, 3087 "bootindex", NULL, 3088 &pci_dev->qdev, NULL); 3089 vdev->host.domain = ~0U; 3090 vdev->host.bus = ~0U; 3091 vdev->host.slot = ~0U; 3092 vdev->host.function = ~0U; 3093 3094 vdev->nv_gpudirect_clique = 0xFF; 3095 3096 /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command 3097 * line, therefore, no need to wait to realize like other devices */ 3098 pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; 3099 } 3100 3101 static Property vfio_pci_dev_properties[] = { 3102 DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), 3103 DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), 3104 DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, 3105 display, ON_OFF_AUTO_OFF), 3106 DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), 3107 DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0), 3108 DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice, 3109 intx.mmap_timeout, 1100), 3110 DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features, 3111 VFIO_FEATURE_ENABLE_VGA_BIT, false), 3112 DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, 3113 VFIO_FEATURE_ENABLE_REQ_BIT, true), 3114 DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, 3115 VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), 3116 DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), 3117 DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice, 3118 vbasedev.balloon_allowed, 
false), 3119 DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false), 3120 DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false), 3121 DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false), 3122 DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice, 3123 no_geforce_quirks, false), 3124 DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd, 3125 false), 3126 DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd, 3127 false), 3128 DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID), 3129 DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID), 3130 DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice, 3131 sub_vendor_id, PCI_ANY_ID), 3132 DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, 3133 sub_device_id, PCI_ANY_ID), 3134 DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0), 3135 DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice, 3136 nv_gpudirect_clique, 3137 qdev_prop_nv_gpudirect_clique, uint8_t), 3138 DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, 3139 OFF_AUTOPCIBAR_OFF), 3140 /* 3141 * TODO - support passed fds... is this necessary? 3142 * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), 3143 * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), 3144 */ 3145 DEFINE_PROP_END_OF_LIST(), 3146 }; 3147 3148 static const VMStateDescription vfio_pci_vmstate = { 3149 .name = "vfio-pci", 3150 .unmigratable = 1, 3151 }; 3152 3153 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) 3154 { 3155 DeviceClass *dc = DEVICE_CLASS(klass); 3156 PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); 3157 3158 dc->reset = vfio_pci_reset; 3159 dc->props = vfio_pci_dev_properties; 3160 dc->vmsd = &vfio_pci_vmstate; 3161 dc->desc = "VFIO-based PCI device assignment"; 3162 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 3163 pdc->realize = vfio_realize; 3164 pdc->exit = vfio_exitfn; 3165 pdc->config_read = vfio_pci_read_config; 3166 pdc->config_write = vfio_pci_write_config; 3167 } 3168 3169 static const TypeInfo vfio_pci_dev_info = { 3170 .name = TYPE_VFIO_PCI, 3171 .parent = TYPE_PCI_DEVICE, 3172 .instance_size = sizeof(VFIOPCIDevice), 3173 .class_init = vfio_pci_dev_class_init, 3174 .instance_init = vfio_instance_init, 3175 .instance_finalize = vfio_instance_finalize, 3176 .interfaces = (InterfaceInfo[]) { 3177 { INTERFACE_PCIE_DEVICE }, 3178 { INTERFACE_CONVENTIONAL_PCI_DEVICE }, 3179 { } 3180 }, 3181 }; 3182 3183 static Property vfio_pci_dev_nohotplug_properties[] = { 3184 DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), 3185 DEFINE_PROP_END_OF_LIST(), 3186 }; 3187 3188 static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) 3189 { 3190 DeviceClass *dc = DEVICE_CLASS(klass); 3191 3192 dc->props = vfio_pci_dev_nohotplug_properties; 3193 dc->hotpluggable = false; 3194 } 3195 3196 static const TypeInfo vfio_pci_nohotplug_dev_info = { 3197 .name = TYPE_VIFO_PCI_NOHOTPLUG, 3198 .parent = TYPE_VFIO_PCI, 3199 .instance_size = sizeof(VFIOPCIDevice), 3200 .class_init = vfio_pci_nohotplug_dev_class_init, 3201 }; 3202 3203 static void register_vfio_pci_dev_type(void) 3204 { 3205 type_register_static(&vfio_pci_dev_info); 3206 type_register_static(&vfio_pci_nohotplug_dev_info); 3207 } 3208 3209 type_init(register_vfio_pci_dev_type) 3210
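/*
 * Illustrative usage (not part of the driver; the host address below is an
 * example): with a device at 0000:01:00.0 already bound to the vfio-pci host
 * driver, it can be assigned to a guest with either of:
 *
 *   -device vfio-pci,host=0000:01:00.0
 *   -device vfio-pci,sysfsdev=/sys/bus/pci/devices/0000:01:00.0,x-intx-mmap-timeout-ms=1100
 */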