/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_bridge.h"
#include "migration/vmstate.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/option.h"
#include "qemu/range.h"
#include "qemu/units.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "pci.h"
#include "trace.h"
#include "qapi/error.h"

#define TYPE_VFIO_PCI "vfio-pci"
#define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)

#define TYPE_VIFO_PCI_NOHOTPLUG "vfio-pci-nohotplug"

static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);

/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead.  We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead).  If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance.  This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps.  Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode.  We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled.  Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
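
/*
 * Illustrative usage only (the host address and value below are examples,
 * not recommendations): the trade-off described above can be tuned per
 * device on the command line, e.g.
 *
 *   -device vfio-pci,host=0000:01:00.0,x-intx-mmap-timeout-ms=0
 *
 * disables the re-enable timer entirely, while a larger value keeps BAR
 * accesses trapped for longer after each INTx.
 */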
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (vdev->intx.pending) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}

static void vfio_intx_interrupt(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    pci_irq_assert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
    }
}

static void vfio_intx_eoi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    if (!vdev->intx.pending) {
        return;
    }

    trace_vfio_intx_eoi(vbasedev->name);

    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
}

static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    Error *err = NULL;

    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_setg(errp, "event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_setg_errno(errp, errno, "failed to setup resample irqfd");
        goto fail_irqfd;
    }

    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
                               VFIO_IRQ_SET_ACTION_UNMASK,
                               irqfd.resamplefd, &err)) {
        error_propagate(errp, err);
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
#endif
}

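/*
 * Rough sketch of the accelerated INTx path set up above (assumes an
 * in-kernel irqchip with resample support; not a normative diagram):
 *
 *   device INTx -> vfio-pci (kernel) -> intx.interrupt eventfd -> KVM irqfd
 *       -> guest injection
 *   guest EOI -> KVM resamplefd (intx.unmask eventfd) -> VFIO ACTION_UNMASK
 *       -> interrupt re-enabled at the device
 *
 * When this path is unavailable we fall back to the QEMU handler
 * (vfio_intx_interrupt) plus vfio_intx_eoi(), which is the state the
 * function below tears things back down to.
 */
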
static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
}

static void vfio_intx_update(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    PCIINTxRoute route;
    Error *err = NULL;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    trace_vfio_intx_update(vdev->vbasedev.name,
                           vdev->intx.route.irq, route.irq);

    vfio_intx_disable_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_intx_enable_kvm(vdev, &err);
    if (err) {
        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_intx_eoi(&vdev->vbasedev);
}

static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    Error *err = NULL;
    int32_t fd;
    int ret;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    pci_config_set_interrupt_pin(vdev->pdev.config, pin);

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_setg_errno(errp, -ret, "event_notifier_init failed");
        return ret;
    }
    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);

    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
        error_propagate(errp, err);
        qemu_set_fd_handler(fd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        return -errno;
    }

    vfio_intx_enable_kvm(vdev, &err);
    if (err) {
        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    vdev->interrupt = VFIO_INT_INTx;

    trace_vfio_intx_enable(vdev->vbasedev.name);
    return 0;
}

static void vfio_intx_disable(VFIOPCIDevice *vdev)
{
    int fd;

    timer_del(vdev->intx.mmap_timer);
    vfio_intx_disable_kvm(vdev);
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    trace_vfio_intx_disable(vdev->vbasedev.name);
}

/*
 * MSI/X
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIOPCIDevice *vdev = vector->vdev;
    MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
    void (*notify)(PCIDevice *dev, unsigned vector);
    MSIMessage msg;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

    if (vdev->interrupt == VFIO_INT_MSIX) {
        get_msg = msix_get_message;
        notify = msix_notify;

        /* A masked vector firing needs to use the PBA, enable it */
        if (msix_is_masked(&vdev->pdev, nr)) {
            set_bit(nr, vdev->msix->pending);
            memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
            trace_vfio_msix_pba_enable(vdev->vbasedev.name);
        }
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        get_msg = msi_get_message;
        notify = msi_notify;
    } else {
        abort();
    }

    msg = get_msg(&vdev->pdev, nr);
    trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
    notify(&vdev->pdev, nr);
}

static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        int fd = -1;

        /*
         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
         * bits, therefore we always use the KVM signaling path when setting
         * up MSI.  MSI-X mask and pending bits are emulated, so we want to
         * use the KVM signaling path only when configured and unmasked.
         */
        if (vdev->msi_vectors[i].use) {
            if (vdev->msi_vectors[i].virq < 0 ||
                (msix && msix_is_masked(&vdev->pdev, i))) {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
            } else {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
            }
        }

        fds[i] = fd;
    }

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}

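/*
 * For reference, a sketch of the VFIO_DEVICE_SET_IRQS payload built above
 * for a hypothetical device using three MSI-X vectors, where vector 1 is
 * currently masked and therefore routed through QEMU:
 *
 *   struct vfio_irq_set {
 *       .argsz = sizeof(struct vfio_irq_set) + 3 * sizeof(int32_t),
 *       .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
 *       .index = VFIO_PCI_MSIX_IRQ_INDEX,
 *       .start = 0,
 *       .count = 3,
 *   };
 *   data[] = { kvm_interrupt fd, interrupt fd, kvm_interrupt fd };
 *
 * An fd of -1 leaves a vector unsignalled; the kernel tears down any
 * previous trigger for it.
 */
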
static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
                                  int vector_n, bool msix)
{
    int virq;

    if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
        return;
    }

    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
        return;
    }

    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
    if (virq < 0) {
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                           NULL, virq) < 0) {
        kvm_irqchip_release_virq(kvm_state, virq);
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    vector->virq = virq;
}

static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
{
    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                          vector->virq);
    kvm_irqchip_release_virq(kvm_state, vector->virq);
    vector->virq = -1;
    event_notifier_cleanup(&vector->kvm_interrupt);
}

static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
                                     PCIDevice *pdev)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
    kvm_irqchip_commit_routes(kvm_state);
}

static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIOMSIVector *vector;
    int ret;

    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);

    vector = &vdev->msi_vectors[nr];

    if (!vector->use) {
        vector->vdev = vdev;
        vector->virq = -1;
        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }
        vector->use = true;
        msix_vector_use(pdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vector);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg, pdev);
        }
    } else {
        if (msg) {
            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
        }
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shut them down and
     * incrementally increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        Error *err = NULL;
        int32_t fd;

        if (vector->virq >= 0) {
            fd = event_notifier_get_fd(&vector->kvm_interrupt);
        } else {
            fd = event_notifier_get_fd(&vector->interrupt);
        }

        if (vfio_set_irq_signaling(&vdev->vbasedev,
                                   VFIO_PCI_MSIX_IRQ_INDEX, nr,
                                   VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
        }
    }

    /* Disable PBA emulation when nothing more is pending. */
    clear_bit(nr, vdev->msix->pending);
    if (find_first_bit(vdev->msix->pending,
                       vdev->nr_vectors) == vdev->nr_vectors) {
        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
    }

    return 0;
}

static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];

    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);

    /*
     * There are still old guests that mask and unmask vectors on every
     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
     * the KVM setup in place, simply switch VFIO to use the non-bypass
     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
     * core will mask the interrupt and set pending bits, allowing it to
     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
     */
    if (vector->virq >= 0) {
        int32_t fd = event_notifier_get_fd(&vector->interrupt);
        Error *err = NULL;

        if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
                                   VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
        }
    }
}

static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host.  When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked.  We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix().  This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    trace_vfio_msix_enable(vdev->vbasedev.name);
}

static void vfio_msi_enable(VFIOPCIDevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        vector->vdev = vdev;
        vector->virq = -1;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }

        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            vfio_msi_interrupt, NULL, vector);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vfio_add_kvm_msi_virq(vdev, vector, i, false);
    }

    /* Set interrupt type prior to possible interrupts */
    vdev->interrupt = VFIO_INT_MSI;

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }
        vdev->nr_vectors = 0;

        /*
         * Failing to setup MSI doesn't really fall within any specification.
         * Let's try leaving interrupts disabled and hope the guest figures
         * out to fall back to INTx for this device.
         */
        error_report("vfio: Error: Failed to enable MSI");
        vdev->interrupt = VFIO_INT_NONE;

        return;
    }

    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
}

static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
{
    Error *err = NULL;
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        if (vdev->msi_vectors[i].use) {
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }
    }

    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_intx_enable(vdev, &err);
    if (err) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }
}

static void vfio_msix_disable(VFIOPCIDevice *vdev)
{
    int i;

    msix_unset_vector_notifiers(&vdev->pdev);

    /*
     * MSI-X will only release vectors if MSI-X is still enabled on the
     * device, check through the rest and release them ourselves if necessary.
     */
    for (i = 0; i < vdev->nr_vectors; i++) {
        if (vdev->msi_vectors[i].use) {
            vfio_msix_vector_release(&vdev->pdev, i);
            msix_vector_unuse(&vdev->pdev, i);
        }
    }

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_msi_disable_common(vdev);

    memset(vdev->msix->pending, 0,
           BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));

    trace_vfio_msix_disable(vdev->vbasedev.name);
}

static void vfio_msi_disable(VFIOPCIDevice *vdev)
{
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_msi_disable_common(vdev);

    trace_vfio_msi_disable(vdev->vbasedev.name);
}

static void vfio_update_msi(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg;

        if (!vector->use || vector->virq < 0) {
            continue;
        }

        msg = msi_get_message(&vdev->pdev, i);
        vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
    }
}

static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
{
    struct vfio_region_info *reg_info;
    uint64_t size;
    off_t off = 0;
    ssize_t bytes;

    if (vfio_get_region_info(&vdev->vbasedev,
                             VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
        error_report("vfio: Error getting ROM info: %m");
        return;
    }

    trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
                            (unsigned long)reg_info->offset,
                            (unsigned long)reg_info->flags);

    vdev->rom_size = size = reg_info->size;
    vdev->rom_offset = reg_info->offset;

    g_free(reg_info);

    if (!vdev->rom_size) {
        vdev->rom_read_failed = true;
        error_report("vfio-pci: Cannot read device rom at "
                     "%s", vdev->vbasedev.name);
        error_printf("Device option ROM contents are probably invalid "
                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
                     "or load from file with romfile=\n");
        return;
    }

    vdev->rom = g_malloc(size);
    memset(vdev->rom, 0xff, size);

    while (size) {
        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
                      size, vdev->rom_offset + off);
        if (bytes == 0) {
            break;
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            break;
        }
    }

    /*
     * Test the ROM signature against our device; if the vendor is correct
     * but the device ID doesn't match, store the correct device ID and
     * recompute the checksum.  Intel IGD devices need this and are known
     * to have bogus checksums so we can't simply adjust the checksum.
     */
    if (pci_get_word(vdev->rom) == 0xaa55 &&
        pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
        !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
        uint16_t vid, did;

        vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
        did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);

        if (vid == vdev->vendor_id && did != vdev->device_id) {
            int i;
            uint8_t csum, *data = vdev->rom;

            pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
                         vdev->device_id);
            data[6] = 0;

            for (csum = 0, i = 0; i < vdev->rom_size; i++) {
                csum += data[i];
            }

            data[6] = -csum;
        }
    }
}

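/*
 * Worked example of the checksum fixup above (illustrative numbers): a
 * legacy option ROM is expected to sum to zero modulo 256, and the code
 * above reuses the byte at offset 6 of the image as the adjustment byte.
 * If patching the PCIR device ID leaves the byte sum at, say, 0x3c with
 * data[6] cleared, storing data[6] = -csum (0x100 - 0x3c = 0xc4) restores
 * a zero total.
 */
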
static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOPCIDevice *vdev = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } val;
    uint64_t data = 0;

    /* Load the ROM lazily when the guest tries to read it */
    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
        vfio_pci_load_rom(vdev);
    }

    memcpy(&val, vdev->rom + addr,
           (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);

    switch (size) {
    case 1:
        data = val.byte;
        break;
    case 2:
        data = le16_to_cpu(val.word);
        break;
    case 4:
        data = le32_to_cpu(val.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);

    return data;
}

static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}

static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
{
    uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    DeviceState *dev = DEVICE(vdev);
    char *name;
    int fd = vdev->vbasedev.fd;

    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
        /* Since pci handles romfile, just print a message and return */
        if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
            warn_report("Device at %s is known to cause system instability"
                        " issues during option rom execution",
                        vdev->vbasedev.name);
            error_printf("Proceeding anyway since user specified romfile\n");
        }
        return;
    }

    /*
     * Use the same size ROM BAR as the physical device.  The contents
     * will get filled in later when the guest tries to read it.
     */
    if (pread(fd, &orig, 4, offset) != 4 ||
        pwrite(fd, &size, 4, offset) != 4 ||
        pread(fd, &size, 4, offset) != 4 ||
        pwrite(fd, &orig, 4, offset) != 4) {
        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
        return;
    }

    size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;

    if (!size) {
        return;
    }

    if (vfio_blacklist_opt_rom(vdev)) {
        if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
            warn_report("Device at %s is known to cause system instability"
                        " issues during option rom execution",
                        vdev->vbasedev.name);
            error_printf("Proceeding anyway since user specified"
                         " non zero value for rombar\n");
        } else {
            warn_report("Rom loading for device at %s has been disabled"
                        " due to system instability issues",
                        vdev->vbasedev.name);
            error_printf("Specify rombar=1 or romfile to force\n");
            return;
        }
    }

    trace_vfio_pci_size_rom(vdev->vbasedev.name, size);

    name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);

    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
                          &vfio_rom_ops, vdev, name, size);
    g_free(name);

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);

    vdev->rom_read_failed = false;
}

void vfio_vga_write(void *opaque, hwaddr addr,
                    uint64_t data, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    off_t offset = vga->fd_offset + region->offset + addr;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes", size);
        break;
    }

    if (pwrite(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, region->offset + addr, data, size);
    }

    trace_vfio_vga_write(region->offset + addr, data, size);
}

uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;
    off_t offset = vga->fd_offset + region->offset + addr;

    if (pread(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, region->offset + addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes", size);
        break;
    }

    trace_vfio_vga_read(region->offset + addr, size, data);

    return data;
}

static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Expand the memory region of a sub-page (size < PAGE_SIZE) MMIO BAR to
 * page size if the BAR occupies an exclusive page on the host, so that we
 * can mmap this BAR into the guest.  The sub-page BAR may not occupy an
 * exclusive page in the guest, however, so the expanded memory region is
 * given priority zero in case it overlaps BARs that share the same guest
 * page.  We also restore the original size of the sub-page BAR when its
 * base address is changed by the guest and is no longer page aligned.
 */
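
/*
 * Hypothetical example (4KiB host pages): a device exposes a 1KiB BAR that
 * sits alone in a host page, so the host mmap can cover a full 4KiB.  If
 * the guest programs the BAR to 0xfe001000 (page aligned) we grow the
 * regions to 4KiB so the mmap satisfies every guest access; if it later
 * moves to 0xfe001400 (not page aligned) the sizes shrink back to 1KiB.
 * The expanded region is re-added with priority 0 so a neighbouring BAR
 * emulated in the same guest page still wins overlap resolution.
 */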
static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIORegion *region = &vdev->bars[bar].region;
    MemoryRegion *mmap_mr, *region_mr, *base_mr;
    PCIIORegion *r;
    pcibus_t bar_addr;
    uint64_t size = region->size;

    /* Make sure that the whole region is allowed to be mmapped */
    if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
        region->mmaps[0].size != region->size) {
        return;
    }

    r = &pdev->io_regions[bar];
    bar_addr = r->addr;
    base_mr = vdev->bars[bar].mr;
    region_mr = region->mem;
    mmap_mr = &region->mmaps[0].mem;

    /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
    if (bar_addr != PCI_BAR_UNMAPPED &&
        !(bar_addr & ~qemu_real_host_page_mask)) {
        size = qemu_real_host_page_size;
    }

    memory_region_transaction_begin();

    if (vdev->bars[bar].size < size) {
        memory_region_set_size(base_mr, size);
    }
    memory_region_set_size(region_mr, size);
    memory_region_set_size(mmap_mr, size);
    if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
        memory_region_del_subregion(r->address_space, base_mr);
        memory_region_add_subregion_overlap(r->address_space,
                                            bar_addr, base_mr, 0);
    }

    memory_region_transaction_commit();
}

/*
 * PCI config space
 */
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;

    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
    emu_bits = le32_to_cpu(emu_bits);

    if (emu_bits) {
        emu_val = pci_default_read_config(pdev, addr, len);
    }

    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
        ssize_t ret;

        ret = pread(vdev->vbasedev.fd, &phys_val, len,
                    vdev->config_offset + addr);
        if (ret != len) {
            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
                         __func__, vdev->vbasedev.name, addr, len);
            return -errno;
        }
        phys_val = le32_to_cpu(phys_val);
    }

    val = (emu_val & emu_bits) | (phys_val & ~emu_bits);

    trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);

    return val;
}

void vfio_pci_write_config(PCIDevice *pdev,
                           uint32_t addr, uint32_t val, int len)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    uint32_t val_le = cpu_to_le32(val);

    trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);

    /* Write everything to VFIO, let it filter out what we can't write */
    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
        != len) {
        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
                     __func__, vdev->vbasedev.name, addr, val, len);
    }

    /* MSI/MSI-X Enabling/Disabling */
    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        int is_enabled, was_enabled = msi_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msi_enabled(pdev);

        if (!was_enabled) {
            if (is_enabled) {
                vfio_msi_enable(vdev);
            }
        } else {
            if (!is_enabled) {
                vfio_msi_disable(vdev);
            } else {
                vfio_update_msi(vdev);
            }
        }
    } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
               ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        int is_enabled, was_enabled = msix_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msix_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_msix_enable(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_msix_disable(vdev);
        }
    } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
               range_covers_byte(addr, len, PCI_COMMAND)) {
        pcibus_t old_addr[PCI_NUM_REGIONS - 1];
        int bar;

        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
            old_addr[bar] = pdev->io_regions[bar].addr;
        }

        pci_default_write_config(pdev, addr, val, len);

        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
            if (old_addr[bar] != pdev->io_regions[bar].addr &&
                vdev->bars[bar].region.size > 0 &&
                vdev->bars[bar].region.size < qemu_real_host_page_size) {
                vfio_sub_page_bar_update_mapping(pdev, bar);
            }
        }
    } else {
        /* Write everything to QEMU to keep emulated bits correct */
        pci_default_write_config(pdev, addr, val, len);
    }
}

/*
 * Interrupt setup
 */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
{
    /*
     * More complicated than it looks.  Disabling MSI/X transitions the
     * device to INTx mode (if supported).  Therefore we need to first
     * disable MSI/X and then cleanup by disabling INTx.
     */
    if (vdev->interrupt == VFIO_INT_MSIX) {
        vfio_msix_disable(vdev);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        vfio_msi_disable(vdev);
    }

    if (vdev->interrupt == VFIO_INT_INTx) {
        vfio_intx_disable(vdev);
    }
}

static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
{
    uint16_t ctrl;
    bool msi_64bit, msi_maskbit;
    int ret, entries;
    Error *err = NULL;

    if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
        return -errno;
    }
    ctrl = le16_to_cpu(ctrl);

    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);

    trace_vfio_msi_setup(vdev->vbasedev.name, pos);

    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_propagate_prepend(errp, err, "msi_init failed: ");
        return ret;
    }
    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);

    return 0;
}

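/*
 * Sizing sanity check for the msi_cap_size computation above (example
 * values): a capability with 64-bit addressing and per-vector masking is
 * 0xa + 0xa + 0x4 = 0x18 (24) bytes, 0xa for the base layout, another 0xa
 * once the mask and pending registers are present, and 0x4 for the upper
 * address dword; the minimal 32-bit, non-maskable layout is just 0xa bytes.
 */
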
static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
{
    off_t start, end;
    VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;

    /*
     * If the host driver allows mapping of the MSI-X data, we are going to
     * map the entire BAR and emulate the MSI-X table on top of it.
     */
    if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
                            VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
        return;
    }

    /*
     * We expect to find a single mmap covering the whole BAR, anything else
     * means it's either unsupported or already setup.
     */
    if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
        region->size != region->mmaps[0].size) {
        return;
    }

    /* MSI-X table start and end aligned to host page size */
    start = vdev->msix->table_offset & qemu_real_host_page_mask;
    end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
                               (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

    /*
     * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
     * NB - Host page size is necessarily a power of two and so is the PCI
     * BAR (not counting EA yet), therefore if we have host page aligned
     * @start and @end, then any remainder of the BAR before or after those
     * must be at least host page sized and therefore mmap'able.
     */
    if (!start) {
        if (end >= region->size) {
            region->nr_mmaps = 0;
            g_free(region->mmaps);
            region->mmaps = NULL;
            trace_vfio_msix_fixup(vdev->vbasedev.name,
                                  vdev->msix->table_bar, 0, 0);
        } else {
            region->mmaps[0].offset = end;
            region->mmaps[0].size = region->size - end;
            trace_vfio_msix_fixup(vdev->vbasedev.name,
                                  vdev->msix->table_bar, region->mmaps[0].offset,
                                  region->mmaps[0].offset + region->mmaps[0].size);
        }

    /* Maybe it's aligned at the end of the BAR */
    } else if (end >= region->size) {
        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);

    /* Otherwise it must split the BAR */
    } else {
        region->nr_mmaps = 2;
        region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);

        memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));

        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);

        region->mmaps[1].offset = end;
        region->mmaps[1].size = region->size - end;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[1].offset,
                              region->mmaps[1].offset + region->mmaps[1].size);
    }
}

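/*
 * Worked example of the split above, assuming 4KiB host pages: a 64KiB BAR
 * with a 16-entry MSI-X table at offset 0x1000 gives start = 0x1000 and
 * end = REAL_HOST_PAGE_ALIGN(0x1000 + 16 * 16) = 0x2000, so the single
 * mmap is split into [0x0, 0x1000) and [0x2000, 0x10000) and only the
 * page containing the table is left to trap-and-emulate.
 */
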
static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
{
    int target_bar = -1;
    size_t msix_sz;

    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
        return;
    }

    /* The actual minimum size of MSI-X structures */
    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
    /* Round up to host pages, we don't want to share a page */
    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
    /* PCI BARs must be a power of 2 */
    msix_sz = pow2ceil(msix_sz);

    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
        /*
         * TODO: Lookup table for known devices.
         *
         * Logically we might use an algorithm here to select the BAR adding
         * the least additional MMIO space, but we cannot programmatically
         * predict the driver dependency on BAR ordering or sizing, therefore
         * 'auto' becomes a lookup for combinations reported to work.
         */
        if (target_bar < 0) {
            error_setg(errp, "No automatic MSI-X relocation available for "
                       "device %04x:%04x", vdev->vendor_id, vdev->device_id);
            return;
        }
    } else {
        target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
    }

    /* I/O port BARs cannot host MSI-X structures */
    if (vdev->bars[target_bar].ioport) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "I/O port BAR", target_bar);
        return;
    }

    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
    if (!vdev->bars[target_bar].size &&
        target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
        return;
    }

    /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
    if (vdev->bars[target_bar].size > 1 * GiB &&
        !vdev->bars[target_bar].mem64) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "no space to extend 32-bit BAR", target_bar);
        return;
    }

    /*
     * If adding a new BAR, test if we can make it 64bit.  We make it
     * prefetchable since QEMU MSI-X emulation has no read side effects
     * and doing so makes mapping more flexible.
     */
    if (!vdev->bars[target_bar].size) {
        if (target_bar < (PCI_ROM_SLOT - 1) &&
            !vdev->bars[target_bar + 1].size) {
            vdev->bars[target_bar].mem64 = true;
            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
        }
        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
        vdev->bars[target_bar].size = msix_sz;
        vdev->msix->table_offset = 0;
    } else {
        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
                                          msix_sz * 2);
        /*
         * Due to above size calc, MSI-X always starts halfway into the BAR,
         * which will always be a separate host page.
         */
        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
    }

    vdev->msix->table_bar = target_bar;
    vdev->msix->pba_bar = target_bar;
    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
    vdev->msix->pba_offset = vdev->msix->table_offset +
                             (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);

    trace_vfio_msix_relo(vdev->vbasedev.name,
                         vdev->msix->table_bar, vdev->msix->table_offset);
}

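/*
 * Example of the sizing logic above (illustrative numbers, 4KiB host
 * pages): 96 MSI-X entries need 96 * 16 = 1536 bytes of table plus a
 * 16-byte PBA, which rounds up to one 4KiB page and is already a power of
 * two.  Relocating into an existing 16MiB BAR doubles it to 32MiB with the
 * table placed at the 16MiB mark; relocating into an unused BAR creates a
 * new 4KiB, 64-bit, prefetchable BAR with the table at offset 0.
 */
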
/*
 * We don't have any control over how pci_add_capability() inserts
 * capabilities into the chain.  In order to setup MSI-X we need a
 * MemoryRegion for the BAR.  In order to setup the BAR and not
 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
 * need to first look for where the MSI-X table lives.  So we
 * unfortunately split MSI-X setup across two functions.
 */
static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
{
    uint8_t pos;
    uint16_t ctrl;
    uint32_t table, pba;
    int fd = vdev->vbasedev.fd;
    VFIOMSIXInfo *msix;

    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
    if (!pos) {
        return;
    }

    if (pread(fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
        return;
    }

    if (pread(fd, &table, sizeof(table),
              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
        return;
    }

    if (pread(fd, &pba, sizeof(pba),
              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
        return;
    }

    ctrl = le16_to_cpu(ctrl);
    table = le32_to_cpu(table);
    pba = le32_to_cpu(pba);

    msix = g_malloc0(sizeof(*msix));
    msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
    msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
    msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
    msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
    msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

    /*
     * Test the size of the pba_offset variable and catch if it extends outside
     * of the specified BAR.  If it is the case, we need to apply a hardware
     * specific quirk if the device is known or we have a broken configuration.
     */
    if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
        /*
         * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
         * adapters.  The T5 hardware returns an incorrect value of 0x8000 for
         * the VF PBA offset while the BAR itself is only 8k.  The correct
         * value is 0x1000, so we hard code that here.
         */
        if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
            (vdev->device_id & 0xff00) == 0x5800) {
            msix->pba_offset = 0x1000;
        } else if (vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
            error_setg(errp, "hardware reports invalid configuration, "
                       "MSIX PBA outside of specified BAR");
            g_free(msix);
            return;
        }
    }

    trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
                                msix->table_offset, msix->entries);
    vdev->msix = msix;

    vfio_pci_fixup_msix_region(vdev);

    vfio_pci_relocate_msix(vdev, errp);
}

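/*
 * Decoding example for the capability registers read above (hypothetical
 * values): table = 0x00002003 means BIR 3 with the table at offset 0x2000,
 * pba = 0x00003003 puts the PBA in the same BAR at offset 0x3000, and a
 * ctrl QSIZE field of 0x1f encodes 0x1f + 1 = 32 table entries.
 */
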
static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
{
    int ret;
    Error *err = NULL;

    vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
                                    sizeof(unsigned long));
    ret = msix_init(&vdev->pdev, vdev->msix->entries,
                    vdev->bars[vdev->msix->table_bar].mr,
                    vdev->msix->table_bar, vdev->msix->table_offset,
                    vdev->bars[vdev->msix->pba_bar].mr,
                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
                    &err);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            warn_report_err(err);
            return 0;
        }

        error_propagate(errp, err);
        return ret;
    }

    /*
     * The PCI spec suggests that devices provide additional alignment for
     * MSI-X structures and avoid overlapping non-MSI-X related registers.
     * For an assigned device, this hopefully means that emulation of MSI-X
     * structures does not affect the performance of the device.  If devices
     * fail to provide that alignment, a significant performance penalty may
     * result, for instance Mellanox MT27500 VFs:
     * http://www.spinics.net/lists/kvm/msg125881.html
     *
     * The PBA is simply not that important for such a serious regression and
     * most drivers do not appear to look at it.  The solution for this is to
     * disable the PBA MemoryRegion unless it's being used.  We disable it
     * here and only enable it if a masked vector fires through QEMU.  As the
     * vector-use notifier is called, which occurs on unmask, we test whether
     * PBA emulation is needed and again disable if not.
     */
    memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);

    /*
     * The emulated machine may provide a paravirt interface for MSIX setup
     * so it is not strictly necessary to emulate MSIX here.  This becomes
     * helpful when frequently accessed MMIO registers are located in
     * subpages adjacent to the MSIX table but the MSIX data containing page
     * cannot be mapped because of a host page size bigger than the MSIX table
     * alignment.
     */
    if (object_property_get_bool(OBJECT(qdev_get_machine()),
                                 "vfio-no-msix-emulation", NULL)) {
        memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
    }

    return 0;
}

static void vfio_teardown_msi(VFIOPCIDevice *vdev)
{
    msi_uninit(&vdev->pdev);

    if (vdev->msix) {
        msix_uninit(&vdev->pdev,
                    vdev->bars[vdev->msix->table_bar].mr,
                    vdev->bars[vdev->msix->pba_bar].mr);
        g_free(vdev->msix->pending);
    }
}

/*
 * Resource setup
 */
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
    }
}

static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    uint32_t pci_bar;
    int ret;

    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
    if (!bar->region.size) {
        return;
    }

    /* Determine what type of BAR this is for registration */
    ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (ret != sizeof(pci_bar)) {
        error_report("vfio: Failed to read BAR %d (%m)", nr);
        return;
    }

    pci_bar = le32_to_cpu(pci_bar);
    bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
    bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
    bar->type = pci_bar & (bar->ioport ?
                           ~PCI_BASE_ADDRESS_IO_MASK :
                           ~PCI_BASE_ADDRESS_MEM_MASK);
    bar->size = bar->region.size;
}

static void vfio_bars_prepare(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_bar_prepare(vdev, i);
    }
}

static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    char *name;

    if (!bar->size) {
        return;
    }

    bar->mr = g_new0(MemoryRegion, 1);
    name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
    g_free(name);

    if (bar->region.size) {
        memory_region_add_subregion(bar->mr, 0, bar->region.mem);

        if (vfio_region_mmap(&bar->region)) {
            error_report("Failed to mmap %s BAR %d. Performance may be slow",
                         vdev->vbasedev.name, nr);
        }
    }

    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
}

static void vfio_bars_register(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_bar_register(vdev, i);
    }
}

static void vfio_bars_exit(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOBAR *bar = &vdev->bars[i];

        vfio_bar_quirk_exit(vdev, i);
        vfio_region_exit(&bar->region);
        if (bar->region.size) {
            memory_region_del_subregion(bar->mr, bar->region.mem);
        }
    }

    if (vdev->vga) {
        pci_unregister_vga(&vdev->pdev);
        vfio_vga_quirk_exit(vdev);
    }
}

static void vfio_bars_finalize(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOBAR *bar = &vdev->bars[i];

        vfio_bar_quirk_finalize(vdev, i);
        vfio_region_finalize(&bar->region);
        if (bar->size) {
            object_unparent(OBJECT(bar->mr));
            g_free(bar->mr);
        }
    }

    if (vdev->vga) {
        vfio_vga_quirk_finalize(vdev);
        for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
            object_unparent(OBJECT(&vdev->vga->region[i].mem));
        }
        g_free(vdev->vga);
    }
}

/*
 * General setup
 */
static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
{
    uint8_t tmp;
    uint16_t next = PCI_CONFIG_SPACE_SIZE;

    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
         tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
        if (tmp > pos && tmp < next) {
            next = tmp;
        }
    }

    return next - pos;
}

static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
{
    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;

    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
        if (tmp > pos && tmp < next) {
            next = tmp;
        }
    }

    return next - pos;
}

static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
}

static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
                                   uint16_t val, uint16_t mask)
{
    vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
    vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
    vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
}

static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    pci_set_long(buf,
                 (pci_get_long(buf) & ~mask) | val);
}

static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
                                   uint32_t val, uint32_t mask)
{
    vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
    vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
    vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
}

static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
                               Error **errp)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {

        error_setg(errp, "assignment of PCIe type 0x%x "
                   "devices is not currently supported", type);
        return -EINVAL;
    }

    if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
        PCIBus *bus = pci_get_bus(&vdev->pdev);
        PCIDevice *bridge;

        /*
         * Traditionally PCI device assignment exposes the PCIe capability
         * as-is on non-express buses.  The reason being that some drivers
         * simply assume that it's there, for example tg3.  However when
         * we're running on a native PCIe machine type, like Q35, we need
         * to hide the PCIe capability.  The reason for this is twofold:
         * first, Windows guests get a Code 10 error when the PCIe capability
         * is exposed in this configuration.  Therefore express devices won't
         * work at all unless they're attached to express buses in the VM.
         * Second, a native PCIe machine introduces the possibility of fine
         * granularity IOMMUs supporting both translation and isolation.
         * Guest code to discover the IOMMU visibility of a device, such as
         * IOMMU grouping code on Linux, is very aware of device types and
         * valid transitions between bus types.  An express device on a non-
         * express bus is not a valid combination on bare metal systems.
         *
         * Drivers that require a PCIe capability to make the device
         * functional are simply going to need to have their devices placed
         * on a PCIe bus in the VM.
         */
        while (!pci_bus_is_root(bus)) {
            bridge = pci_bridge_get_device(bus);
            bus = pci_get_bus(bridge);
        }

        if (pci_bus_is_express(bus)) {
            return 0;
        }

    } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control go away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control go away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return 0;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                                   QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
                                   QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT),
                                   ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }
    }

    /*
     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
     * (Niantic errata #35) causing Windows to error with a Code 10 for the
     * device on Q35.  Fixup any such devices to report version 1.  If we
     * were to remove the capability entirely the guest would lose extended
     * config space.
1866 */ 1867 if ((flags & PCI_EXP_FLAGS_VERS) == 0) { 1868 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS, 1869 1, PCI_EXP_FLAGS_VERS); 1870 } 1871 1872 pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size, 1873 errp); 1874 if (pos < 0) { 1875 return pos; 1876 } 1877 1878 vdev->pdev.exp.exp_cap = pos; 1879 1880 return pos; 1881 } 1882 1883 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) 1884 { 1885 uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); 1886 1887 if (cap & PCI_EXP_DEVCAP_FLR) { 1888 trace_vfio_check_pcie_flr(vdev->vbasedev.name); 1889 vdev->has_flr = true; 1890 } 1891 } 1892 1893 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos) 1894 { 1895 uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL); 1896 1897 if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) { 1898 trace_vfio_check_pm_reset(vdev->vbasedev.name); 1899 vdev->has_pm_reset = true; 1900 } 1901 } 1902 1903 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) 1904 { 1905 uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP); 1906 1907 if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) { 1908 trace_vfio_check_af_flr(vdev->vbasedev.name); 1909 vdev->has_flr = true; 1910 } 1911 } 1912 1913 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) 1914 { 1915 PCIDevice *pdev = &vdev->pdev; 1916 uint8_t cap_id, next, size; 1917 int ret; 1918 1919 cap_id = pdev->config[pos]; 1920 next = pdev->config[pos + PCI_CAP_LIST_NEXT]; 1921 1922 /* 1923 * If it becomes important to configure capabilities to their actual 1924 * size, use this as the default when it's something we don't recognize. 1925 * Since QEMU doesn't actually handle many of the config accesses, 1926 * exact size doesn't seem worthwhile. 1927 */ 1928 size = vfio_std_cap_max_size(pdev, pos); 1929 1930 /* 1931 * pci_add_capability always inserts the new capability at the head 1932 * of the chain. Therefore to end up with a chain that matches the 1933 * physical device, we insert from the end by making this recursive. 1934 * This is also why we pre-calculate size above as cached config space 1935 * will be changed as we unwind the stack. 
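 * For example, if the physical chain is PM -> MSI -> EXP, the recursion
 * bottoms out at EXP, so pci_add_capability() is called for EXP first,
 * then MSI, then PM; since each call inserts at the head, the guest-visible
 * chain ends up in the original PM -> MSI -> EXP order.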
1936 */ 1937 if (next) { 1938 ret = vfio_add_std_cap(vdev, next, errp); 1939 if (ret) { 1940 return ret; 1941 } 1942 } else { 1943 /* Begin the rebuild, use QEMU emulated list bits */ 1944 pdev->config[PCI_CAPABILITY_LIST] = 0; 1945 vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff; 1946 vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST; 1947 1948 ret = vfio_add_virt_caps(vdev, errp); 1949 if (ret) { 1950 return ret; 1951 } 1952 } 1953 1954 /* Scale down size, esp in case virt caps were added above */ 1955 size = MIN(size, vfio_std_cap_max_size(pdev, pos)); 1956 1957 /* Use emulated next pointer to allow dropping caps */ 1958 pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff); 1959 1960 switch (cap_id) { 1961 case PCI_CAP_ID_MSI: 1962 ret = vfio_msi_setup(vdev, pos, errp); 1963 break; 1964 case PCI_CAP_ID_EXP: 1965 vfio_check_pcie_flr(vdev, pos); 1966 ret = vfio_setup_pcie_cap(vdev, pos, size, errp); 1967 break; 1968 case PCI_CAP_ID_MSIX: 1969 ret = vfio_msix_setup(vdev, pos, errp); 1970 break; 1971 case PCI_CAP_ID_PM: 1972 vfio_check_pm_reset(vdev, pos); 1973 vdev->pm_cap = pos; 1974 ret = pci_add_capability(pdev, cap_id, pos, size, errp); 1975 break; 1976 case PCI_CAP_ID_AF: 1977 vfio_check_af_flr(vdev, pos); 1978 ret = pci_add_capability(pdev, cap_id, pos, size, errp); 1979 break; 1980 default: 1981 ret = pci_add_capability(pdev, cap_id, pos, size, errp); 1982 break; 1983 } 1984 1985 if (ret < 0) { 1986 error_prepend(errp, 1987 "failed to add PCI capability 0x%x[0x%x]@0x%x: ", 1988 cap_id, size, pos); 1989 return ret; 1990 } 1991 1992 return 0; 1993 } 1994 1995 static void vfio_add_ext_cap(VFIOPCIDevice *vdev) 1996 { 1997 PCIDevice *pdev = &vdev->pdev; 1998 uint32_t header; 1999 uint16_t cap_id, next, size; 2000 uint8_t cap_ver; 2001 uint8_t *config; 2002 2003 /* Only add extended caps if we have them and the guest can see them */ 2004 if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) || 2005 !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) { 2006 return; 2007 } 2008 2009 /* 2010 * pcie_add_capability always inserts the new capability at the tail 2011 * of the chain. Therefore to end up with a chain that matches the 2012 * physical device, we cache the config space to avoid overwriting 2013 * the original config space when we parse the extended capabilities. 2014 */ 2015 config = g_memdup(pdev->config, vdev->config_size); 2016 2017 /* 2018 * Extended capabilities are chained with each pointing to the next, so we 2019 * can drop anything other than the head of the chain simply by modifying 2020 * the previous next pointer. Seed the head of the chain here such that 2021 * we can simply skip any capabilities we want to drop below, regardless 2022 * of their position in the chain. If this stub capability still exists 2023 * after we add the capabilities we want to expose, update the capability 2024 * ID to zero. Note that we cannot seed with the capability header being 2025 * zero as this conflicts with definition of an absent capability chain 2026 * and prevents capabilities beyond the head of the list from being added. 2027 * By replacing the dummy capability ID with zero after walking the device 2028 * chain, we also transparently mark extended capabilities as absent if 2029 * no capabilities were added. Note that the PCIe spec defines an absence 2030 * of extended capabilities to be determined by a value of zero for the 2031 * capability ID, version, AND next pointer. 
A non-zero next pointer 2032 * should be sufficient to indicate additional capabilities are present, 2033 * which will occur if we call pcie_add_capability() below. The entire 2034 * first dword is emulated to support this. 2035 * 2036 * NB. The kernel side does similar masking, so be prepared that our 2037 * view of the device may also contain a capability ID zero in the head 2038 * of the chain. Skip it for the same reason that we cannot seed the 2039 * chain with a zero capability. 2040 */ 2041 pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE, 2042 PCI_EXT_CAP(0xFFFF, 0, 0)); 2043 pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0); 2044 pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0); 2045 2046 for (next = PCI_CONFIG_SPACE_SIZE; next; 2047 next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) { 2048 header = pci_get_long(config + next); 2049 cap_id = PCI_EXT_CAP_ID(header); 2050 cap_ver = PCI_EXT_CAP_VER(header); 2051 2052 /* 2053 * If it becomes important to configure extended capabilities to their 2054 * actual size, use this as the default when it's something we don't 2055 * recognize. Since QEMU doesn't actually handle many of the config 2056 * accesses, exact size doesn't seem worthwhile. 2057 */ 2058 size = vfio_ext_cap_max_size(config, next); 2059 2060 /* Use emulated next pointer to allow dropping extended caps */ 2061 pci_long_test_and_set_mask(vdev->emulated_config_bits + next, 2062 PCI_EXT_CAP_NEXT_MASK); 2063 2064 switch (cap_id) { 2065 case 0: /* kernel masked capability */ 2066 case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */ 2067 case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */ 2068 case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */ 2069 trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next); 2070 break; 2071 default: 2072 pcie_add_capability(pdev, cap_id, cap_ver, next, size); 2073 } 2074 2075 } 2076 2077 /* Clean up chain head ID if necessary */ 2078 if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { 2079 pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); 2080 } 2081 2082 g_free(config); 2083 return; 2084 } 2085 2086 static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) 2087 { 2088 PCIDevice *pdev = &vdev->pdev; 2089 int ret; 2090 2091 if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) || 2092 !pdev->config[PCI_CAPABILITY_LIST]) { 2093 return 0; /* Nothing to add */ 2094 } 2095 2096 ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp); 2097 if (ret) { 2098 return ret; 2099 } 2100 2101 vfio_add_ext_cap(vdev); 2102 return 0; 2103 } 2104 2105 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) 2106 { 2107 PCIDevice *pdev = &vdev->pdev; 2108 uint16_t cmd; 2109 2110 vfio_disable_interrupts(vdev); 2111 2112 /* Make sure the device is in D0 */ 2113 if (vdev->pm_cap) { 2114 uint16_t pmcsr; 2115 uint8_t state; 2116 2117 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); 2118 state = pmcsr & PCI_PM_CTRL_STATE_MASK; 2119 if (state) { 2120 pmcsr &= ~PCI_PM_CTRL_STATE_MASK; 2121 vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); 2122 /* vfio handles the necessary delay here */ 2123 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); 2124 state = pmcsr & PCI_PM_CTRL_STATE_MASK; 2125 if (state) { 2126 error_report("vfio: Unable to power on device, stuck in D%d", 2127 state); 2128 } 2129 } 2130 } 2131 2132 /* 2133 * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2134 * Also put INTx Disable in known state. 2135 */ 2136 cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); 2137 cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | 2138 PCI_COMMAND_INTX_DISABLE); 2139 vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); 2140 } 2141 2142 static void vfio_pci_post_reset(VFIOPCIDevice *vdev) 2143 { 2144 Error *err = NULL; 2145 int nr; 2146 2147 vfio_intx_enable(vdev, &err); 2148 if (err) { 2149 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2150 } 2151 2152 for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) { 2153 off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); 2154 uint32_t val = 0; 2155 uint32_t len = sizeof(val); 2156 2157 if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { 2158 error_report("%s(%s) reset bar %d failed: %m", __func__, 2159 vdev->vbasedev.name, nr); 2160 } 2161 } 2162 2163 vfio_quirk_reset(vdev); 2164 } 2165 2166 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) 2167 { 2168 char tmp[13]; 2169 2170 sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain, 2171 addr->bus, addr->slot, addr->function); 2172 2173 return (strcmp(tmp, name) == 0); 2174 } 2175 2176 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) 2177 { 2178 VFIOGroup *group; 2179 struct vfio_pci_hot_reset_info *info; 2180 struct vfio_pci_dependent_device *devices; 2181 struct vfio_pci_hot_reset *reset; 2182 int32_t *fds; 2183 int ret, i, count; 2184 bool multi = false; 2185 2186 trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); 2187 2188 if (!single) { 2189 vfio_pci_pre_reset(vdev); 2190 } 2191 vdev->vbasedev.needs_reset = false; 2192 2193 info = g_malloc0(sizeof(*info)); 2194 info->argsz = sizeof(*info); 2195 2196 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); 2197 if (ret && errno != ENOSPC) { 2198 ret = -errno; 2199 if (!vdev->has_pm_reset) { 2200 error_report("vfio: Cannot reset device %s, " 2201 "no available reset mechanism.", vdev->vbasedev.name); 2202 } 2203 goto out_single; 2204 } 2205 2206 count = info->count; 2207 info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); 2208 info->argsz = sizeof(*info) + (count * sizeof(*devices)); 2209 devices = &info->devices[0]; 2210 2211 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); 2212 if (ret) { 2213 ret = -errno; 2214 error_report("vfio: hot reset info failed: %m"); 2215 goto out_single; 2216 } 2217 2218 trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); 2219 2220 /* Verify that we have all the groups required */ 2221 for (i = 0; i < info->count; i++) { 2222 PCIHostDeviceAddress host; 2223 VFIOPCIDevice *tmp; 2224 VFIODevice *vbasedev_iter; 2225 2226 host.domain = devices[i].segment; 2227 host.bus = devices[i].bus; 2228 host.slot = PCI_SLOT(devices[i].devfn); 2229 host.function = PCI_FUNC(devices[i].devfn); 2230 2231 trace_vfio_pci_hot_reset_dep_devices(host.domain, 2232 host.bus, host.slot, host.function, devices[i].group_id); 2233 2234 if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { 2235 continue; 2236 } 2237 2238 QLIST_FOREACH(group, &vfio_group_list, next) { 2239 if (group->groupid == devices[i].group_id) { 2240 break; 2241 } 2242 } 2243 2244 if (!group) { 2245 if (!vdev->has_pm_reset) { 2246 error_report("vfio: Cannot reset device %s, " 2247 "depends on group %d which is not owned.", 2248 vdev->vbasedev.name, devices[i].group_id); 2249 } 2250 ret = -EPERM; 2251 goto out; 2252 } 2253 2254 /* Prep dependent devices for reset and clear our 
marker. */ 2255 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 2256 if (!vbasedev_iter->dev->realized || 2257 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { 2258 continue; 2259 } 2260 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); 2261 if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { 2262 if (single) { 2263 ret = -EINVAL; 2264 goto out_single; 2265 } 2266 vfio_pci_pre_reset(tmp); 2267 tmp->vbasedev.needs_reset = false; 2268 multi = true; 2269 break; 2270 } 2271 } 2272 } 2273 2274 if (!single && !multi) { 2275 ret = -EINVAL; 2276 goto out_single; 2277 } 2278 2279 /* Determine how many group fds need to be passed */ 2280 count = 0; 2281 QLIST_FOREACH(group, &vfio_group_list, next) { 2282 for (i = 0; i < info->count; i++) { 2283 if (group->groupid == devices[i].group_id) { 2284 count++; 2285 break; 2286 } 2287 } 2288 } 2289 2290 reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); 2291 reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); 2292 fds = &reset->group_fds[0]; 2293 2294 /* Fill in group fds */ 2295 QLIST_FOREACH(group, &vfio_group_list, next) { 2296 for (i = 0; i < info->count; i++) { 2297 if (group->groupid == devices[i].group_id) { 2298 fds[reset->count++] = group->fd; 2299 break; 2300 } 2301 } 2302 } 2303 2304 /* Bus reset! */ 2305 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); 2306 g_free(reset); 2307 2308 trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, 2309 ret ? "%m" : "Success"); 2310 2311 out: 2312 /* Re-enable INTx on affected devices */ 2313 for (i = 0; i < info->count; i++) { 2314 PCIHostDeviceAddress host; 2315 VFIOPCIDevice *tmp; 2316 VFIODevice *vbasedev_iter; 2317 2318 host.domain = devices[i].segment; 2319 host.bus = devices[i].bus; 2320 host.slot = PCI_SLOT(devices[i].devfn); 2321 host.function = PCI_FUNC(devices[i].devfn); 2322 2323 if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { 2324 continue; 2325 } 2326 2327 QLIST_FOREACH(group, &vfio_group_list, next) { 2328 if (group->groupid == devices[i].group_id) { 2329 break; 2330 } 2331 } 2332 2333 if (!group) { 2334 break; 2335 } 2336 2337 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 2338 if (!vbasedev_iter->dev->realized || 2339 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { 2340 continue; 2341 } 2342 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); 2343 if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { 2344 vfio_pci_post_reset(tmp); 2345 break; 2346 } 2347 } 2348 } 2349 out_single: 2350 if (!single) { 2351 vfio_pci_post_reset(vdev); 2352 } 2353 g_free(info); 2354 2355 return ret; 2356 } 2357 2358 /* 2359 * We want to differentiate hot reset of multiple in-use devices vs hot reset 2360 * of a single in-use device. VFIO_DEVICE_RESET will already handle the case 2361 * of doing hot resets when there is only a single device per bus. The in-use 2362 * here refers to how many VFIODevices are affected. A hot reset that affects 2363 * multiple devices, but only a single in-use device, means that we can call 2364 * it from our bus ->reset() callback since the extent is effectively a single 2365 * device. This allows us to make use of it in the hotplug path. When there 2366 * are multiple in-use devices, we can only trigger the hot reset during a 2367 * system reset and thus from our reset handler. We separate _one vs _multi 2368 * here so that we don't overlap and do a double reset on the system reset 2369 * path where both our reset handler and ->reset() callback are used.
Calling 2370 * _one() will only do a hot reset for the one in-use devices case, calling 2371 * _multi() will do nothing if a _one() would have been sufficient. 2372 */ 2373 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev) 2374 { 2375 return vfio_pci_hot_reset(vdev, true); 2376 } 2377 2378 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev) 2379 { 2380 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 2381 return vfio_pci_hot_reset(vdev, false); 2382 } 2383 2384 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) 2385 { 2386 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 2387 if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) { 2388 vbasedev->needs_reset = true; 2389 } 2390 } 2391 2392 static VFIODeviceOps vfio_pci_ops = { 2393 .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, 2394 .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, 2395 .vfio_eoi = vfio_intx_eoi, 2396 }; 2397 2398 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) 2399 { 2400 VFIODevice *vbasedev = &vdev->vbasedev; 2401 struct vfio_region_info *reg_info; 2402 int ret; 2403 2404 ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, ®_info); 2405 if (ret) { 2406 error_setg_errno(errp, -ret, 2407 "failed getting region info for VGA region index %d", 2408 VFIO_PCI_VGA_REGION_INDEX); 2409 return ret; 2410 } 2411 2412 if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) || 2413 !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) || 2414 reg_info->size < 0xbffff + 1) { 2415 error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx", 2416 (unsigned long)reg_info->flags, 2417 (unsigned long)reg_info->size); 2418 g_free(reg_info); 2419 return -EINVAL; 2420 } 2421 2422 vdev->vga = g_new0(VFIOVGA, 1); 2423 2424 vdev->vga->fd_offset = reg_info->offset; 2425 vdev->vga->fd = vdev->vbasedev.fd; 2426 2427 g_free(reg_info); 2428 2429 vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; 2430 vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; 2431 QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks); 2432 2433 memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem, 2434 OBJECT(vdev), &vfio_vga_ops, 2435 &vdev->vga->region[QEMU_PCI_VGA_MEM], 2436 "vfio-vga-mmio@0xa0000", 2437 QEMU_PCI_VGA_MEM_SIZE); 2438 2439 vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; 2440 vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; 2441 QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks); 2442 2443 memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, 2444 OBJECT(vdev), &vfio_vga_ops, 2445 &vdev->vga->region[QEMU_PCI_VGA_IO_LO], 2446 "vfio-vga-io@0x3b0", 2447 QEMU_PCI_VGA_IO_LO_SIZE); 2448 2449 vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; 2450 vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; 2451 QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks); 2452 2453 memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 2454 OBJECT(vdev), &vfio_vga_ops, 2455 &vdev->vga->region[QEMU_PCI_VGA_IO_HI], 2456 "vfio-vga-io@0x3c0", 2457 QEMU_PCI_VGA_IO_HI_SIZE); 2458 2459 pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, 2460 &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, 2461 &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); 2462 2463 return 0; 2464 } 2465 2466 static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) 2467 { 2468 VFIODevice *vbasedev = &vdev->vbasedev; 2469 struct vfio_region_info 
*reg_info; 2470 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; 2471 int i, ret = -1; 2472 2473 /* Sanity check device */ 2474 if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) { 2475 error_setg(errp, "this isn't a PCI device"); 2476 return; 2477 } 2478 2479 if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) { 2480 error_setg(errp, "unexpected number of io regions %u", 2481 vbasedev->num_regions); 2482 return; 2483 } 2484 2485 if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) { 2486 error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs); 2487 return; 2488 } 2489 2490 for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { 2491 char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i); 2492 2493 ret = vfio_region_setup(OBJECT(vdev), vbasedev, 2494 &vdev->bars[i].region, i, name); 2495 g_free(name); 2496 2497 if (ret) { 2498 error_setg_errno(errp, -ret, "failed to get region %d info", i); 2499 return; 2500 } 2501 2502 QLIST_INIT(&vdev->bars[i].quirks); 2503 } 2504 2505 ret = vfio_get_region_info(vbasedev, 2506 VFIO_PCI_CONFIG_REGION_INDEX, ®_info); 2507 if (ret) { 2508 error_setg_errno(errp, -ret, "failed to get config info"); 2509 return; 2510 } 2511 2512 trace_vfio_populate_device_config(vdev->vbasedev.name, 2513 (unsigned long)reg_info->size, 2514 (unsigned long)reg_info->offset, 2515 (unsigned long)reg_info->flags); 2516 2517 vdev->config_size = reg_info->size; 2518 if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) { 2519 vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS; 2520 } 2521 vdev->config_offset = reg_info->offset; 2522 2523 g_free(reg_info); 2524 2525 if (vdev->features & VFIO_FEATURE_ENABLE_VGA) { 2526 ret = vfio_populate_vga(vdev, errp); 2527 if (ret) { 2528 error_append_hint(errp, "device does not support " 2529 "requested feature x-vga\n"); 2530 return; 2531 } 2532 } 2533 2534 irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; 2535 2536 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); 2537 if (ret) { 2538 /* This can fail for an old kernel or legacy PCI dev */ 2539 trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); 2540 } else if (irq_info.count == 1) { 2541 vdev->pci_aer = true; 2542 } else { 2543 warn_report(VFIO_MSG_PREFIX 2544 "Could not enable error recovery for the device", 2545 vbasedev->name); 2546 } 2547 } 2548 2549 static void vfio_put_device(VFIOPCIDevice *vdev) 2550 { 2551 g_free(vdev->vbasedev.name); 2552 g_free(vdev->msix); 2553 2554 vfio_put_base_device(&vdev->vbasedev); 2555 } 2556 2557 static void vfio_err_notifier_handler(void *opaque) 2558 { 2559 VFIOPCIDevice *vdev = opaque; 2560 2561 if (!event_notifier_test_and_clear(&vdev->err_notifier)) { 2562 return; 2563 } 2564 2565 /* 2566 * TBD. Retrieve the error details and decide what action 2567 * needs to be taken. One of the actions could be to pass 2568 * the error to the guest and have the guest driver recover 2569 * from the error. This requires that PCIe capabilities be 2570 * exposed to the guest. For now, we just terminate the 2571 * guest to contain the error. 2572 */ 2573 2574 error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name); 2575 2576 vm_stop(RUN_STATE_INTERNAL_ERROR); 2577 } 2578 2579 /* 2580 * Registers error notifier for devices supporting error recovery. 2581 * If we encounter a failure in this function, we report an error 2582 * and continue after disabling error recovery support for the 2583 * device. 
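 * The notifier below is an eventfd wired to VFIO_PCI_ERR_IRQ_INDEX via
 * vfio_set_irq_signaling(), so the host kernel signals QEMU when it detects
 * an error on the device rather than QEMU having to poll for it.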
2584 */ 2585 static void vfio_register_err_notifier(VFIOPCIDevice *vdev) 2586 { 2587 Error *err = NULL; 2588 int32_t fd; 2589 2590 if (!vdev->pci_aer) { 2591 return; 2592 } 2593 2594 if (event_notifier_init(&vdev->err_notifier, 0)) { 2595 error_report("vfio: Unable to init event notifier for error detection"); 2596 vdev->pci_aer = false; 2597 return; 2598 } 2599 2600 fd = event_notifier_get_fd(&vdev->err_notifier); 2601 qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); 2602 2603 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, 2604 VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { 2605 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2606 qemu_set_fd_handler(fd, NULL, NULL, vdev); 2607 event_notifier_cleanup(&vdev->err_notifier); 2608 vdev->pci_aer = false; 2609 } 2610 } 2611 2612 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) 2613 { 2614 Error *err = NULL; 2615 2616 if (!vdev->pci_aer) { 2617 return; 2618 } 2619 2620 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, 2621 VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { 2622 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2623 } 2624 qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), 2625 NULL, NULL, vdev); 2626 event_notifier_cleanup(&vdev->err_notifier); 2627 } 2628 2629 static void vfio_req_notifier_handler(void *opaque) 2630 { 2631 VFIOPCIDevice *vdev = opaque; 2632 Error *err = NULL; 2633 2634 if (!event_notifier_test_and_clear(&vdev->req_notifier)) { 2635 return; 2636 } 2637 2638 qdev_unplug(DEVICE(vdev), &err); 2639 if (err) { 2640 warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2641 } 2642 } 2643 2644 static void vfio_register_req_notifier(VFIOPCIDevice *vdev) 2645 { 2646 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), 2647 .index = VFIO_PCI_REQ_IRQ_INDEX }; 2648 Error *err = NULL; 2649 int32_t fd; 2650 2651 if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { 2652 return; 2653 } 2654 2655 if (ioctl(vdev->vbasedev.fd, 2656 VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { 2657 return; 2658 } 2659 2660 if (event_notifier_init(&vdev->req_notifier, 0)) { 2661 error_report("vfio: Unable to init event notifier for device request"); 2662 return; 2663 } 2664 2665 fd = event_notifier_get_fd(&vdev->req_notifier); 2666 qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); 2667 2668 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, 2669 VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { 2670 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2671 qemu_set_fd_handler(fd, NULL, NULL, vdev); 2672 event_notifier_cleanup(&vdev->req_notifier); 2673 } else { 2674 vdev->req_enabled = true; 2675 } 2676 } 2677 2678 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) 2679 { 2680 Error *err = NULL; 2681 2682 if (!vdev->req_enabled) { 2683 return; 2684 } 2685 2686 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, 2687 VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { 2688 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 2689 } 2690 qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), 2691 NULL, NULL, vdev); 2692 event_notifier_cleanup(&vdev->req_notifier); 2693 2694 vdev->req_enabled = false; 2695 } 2696 2697 static void vfio_realize(PCIDevice *pdev, Error **errp) 2698 { 2699 VFIOPCIDevice *vdev = PCI_VFIO(pdev); 2700 VFIODevice *vbasedev_iter; 2701 VFIOGroup *group; 2702 char *tmp, *subsys, group_path[PATH_MAX], *group_name; 2703 Error *err 
= NULL; 2704 ssize_t len; 2705 struct stat st; 2706 int groupid; 2707 int i, ret; 2708 bool is_mdev; 2709 2710 if (!vdev->vbasedev.sysfsdev) { 2711 if (!(~vdev->host.domain || ~vdev->host.bus || 2712 ~vdev->host.slot || ~vdev->host.function)) { 2713 error_setg(errp, "No provided host device"); 2714 error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " 2715 "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); 2716 return; 2717 } 2718 vdev->vbasedev.sysfsdev = 2719 g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", 2720 vdev->host.domain, vdev->host.bus, 2721 vdev->host.slot, vdev->host.function); 2722 } 2723 2724 if (stat(vdev->vbasedev.sysfsdev, &st) < 0) { 2725 error_setg_errno(errp, errno, "no such host device"); 2726 error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.sysfsdev); 2727 return; 2728 } 2729 2730 vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev); 2731 vdev->vbasedev.ops = &vfio_pci_ops; 2732 vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; 2733 vdev->vbasedev.dev = DEVICE(vdev); 2734 2735 tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev); 2736 len = readlink(tmp, group_path, sizeof(group_path)); 2737 g_free(tmp); 2738 2739 if (len <= 0 || len >= sizeof(group_path)) { 2740 error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG, 2741 "no iommu_group found"); 2742 goto error; 2743 } 2744 2745 group_path[len] = 0; 2746 2747 group_name = basename(group_path); 2748 if (sscanf(group_name, "%d", &groupid) != 1) { 2749 error_setg_errno(errp, errno, "failed to read %s", group_path); 2750 goto error; 2751 } 2752 2753 trace_vfio_realize(vdev->vbasedev.name, groupid); 2754 2755 group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp); 2756 if (!group) { 2757 goto error; 2758 } 2759 2760 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { 2761 if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) { 2762 error_setg(errp, "device is already attached"); 2763 vfio_put_group(group); 2764 goto error; 2765 } 2766 } 2767 2768 /* 2769 * Mediated devices *might* operate compatibly with memory ballooning, but 2770 * we cannot know for certain, it depends on whether the mdev vendor driver 2771 * stays in sync with the active working set of the guest driver. Prevent 2772 * the x-balloon-allowed option unless this is minimally an mdev device. 2773 */ 2774 tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev); 2775 subsys = realpath(tmp, NULL); 2776 g_free(tmp); 2777 is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); 2778 free(subsys); 2779 2780 trace_vfio_mdev(vdev->vbasedev.name, is_mdev); 2781 2782 if (vdev->vbasedev.balloon_allowed && !is_mdev) { 2783 error_setg(errp, "x-balloon-allowed only potentially compatible " 2784 "with mdev devices"); 2785 vfio_put_group(group); 2786 goto error; 2787 } 2788 2789 ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp); 2790 if (ret) { 2791 vfio_put_group(group); 2792 goto error; 2793 } 2794 2795 vfio_populate_device(vdev, &err); 2796 if (err) { 2797 error_propagate(errp, err); 2798 goto error; 2799 } 2800 2801 /* Get a copy of config space */ 2802 ret = pread(vdev->vbasedev.fd, vdev->pdev.config, 2803 MIN(pci_config_size(&vdev->pdev), vdev->config_size), 2804 vdev->config_offset); 2805 if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { 2806 ret = ret < 0 ? 
-errno : -EFAULT; 2807 error_setg_errno(errp, -ret, "failed to read device config space"); 2808 goto error; 2809 } 2810 2811 /* vfio emulates a lot for us, but some bits need extra love */ 2812 vdev->emulated_config_bits = g_malloc0(vdev->config_size); 2813 2814 /* QEMU can choose to expose the ROM or not */ 2815 memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4); 2816 /* QEMU can also add or extend BARs */ 2817 memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4); 2818 2819 /* 2820 * The PCI spec reserves vendor ID 0xffff as an invalid value. The 2821 * device ID is managed by the vendor and need only be a 16-bit value. 2822 * Allow any 16-bit value for subsystem so they can be hidden or changed. 2823 */ 2824 if (vdev->vendor_id != PCI_ANY_ID) { 2825 if (vdev->vendor_id >= 0xffff) { 2826 error_setg(errp, "invalid PCI vendor ID provided"); 2827 goto error; 2828 } 2829 vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); 2830 trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id); 2831 } else { 2832 vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID); 2833 } 2834 2835 if (vdev->device_id != PCI_ANY_ID) { 2836 if (vdev->device_id > 0xffff) { 2837 error_setg(errp, "invalid PCI device ID provided"); 2838 goto error; 2839 } 2840 vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); 2841 trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id); 2842 } else { 2843 vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID); 2844 } 2845 2846 if (vdev->sub_vendor_id != PCI_ANY_ID) { 2847 if (vdev->sub_vendor_id > 0xffff) { 2848 error_setg(errp, "invalid PCI subsystem vendor ID provided"); 2849 goto error; 2850 } 2851 vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, 2852 vdev->sub_vendor_id, ~0); 2853 trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name, 2854 vdev->sub_vendor_id); 2855 } 2856 2857 if (vdev->sub_device_id != PCI_ANY_ID) { 2858 if (vdev->sub_device_id > 0xffff) { 2859 error_setg(errp, "invalid PCI subsystem device ID provided"); 2860 goto error; 2861 } 2862 vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); 2863 trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name, 2864 vdev->sub_device_id); 2865 } 2866 2867 /* QEMU can change multi-function devices to single function, or reverse */ 2868 vdev->emulated_config_bits[PCI_HEADER_TYPE] = 2869 PCI_HEADER_TYPE_MULTI_FUNCTION; 2870 2871 /* Restore or clear multifunction, this is always controlled by QEMU */ 2872 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { 2873 vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; 2874 } else { 2875 vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION; 2876 } 2877 2878 /* 2879 * Clear host resource mapping info. If we choose not to register a 2880 * BAR, such as might be the case with the option ROM, we can get 2881 * confusing, unwritable, residual addresses from the host here. 
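 * The BAR registers the guest sees are emulated by QEMU (the
 * emulated_config_bits for the BAR and ROM ranges were set to 0xff above),
 * so zeroing the stale host values in our config copy below has no effect
 * on the physical device.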
2882 */ 2883 memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); 2884 memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); 2885 2886 vfio_pci_size_rom(vdev); 2887 2888 vfio_bars_prepare(vdev); 2889 2890 vfio_msix_early_setup(vdev, &err); 2891 if (err) { 2892 error_propagate(errp, err); 2893 goto error; 2894 } 2895 2896 vfio_bars_register(vdev); 2897 2898 ret = vfio_add_capabilities(vdev, errp); 2899 if (ret) { 2900 goto out_teardown; 2901 } 2902 2903 if (vdev->vga) { 2904 vfio_vga_quirk_setup(vdev); 2905 } 2906 2907 for (i = 0; i < PCI_ROM_SLOT; i++) { 2908 vfio_bar_quirk_setup(vdev, i); 2909 } 2910 2911 if (!vdev->igd_opregion && 2912 vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) { 2913 struct vfio_region_info *opregion; 2914 2915 if (vdev->pdev.qdev.hotplugged) { 2916 error_setg(errp, 2917 "cannot support IGD OpRegion feature on hotplugged " 2918 "device"); 2919 goto out_teardown; 2920 } 2921 2922 ret = vfio_get_dev_region_info(&vdev->vbasedev, 2923 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, 2924 VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); 2925 if (ret) { 2926 error_setg_errno(errp, -ret, 2927 "does not support requested IGD OpRegion feature"); 2928 goto out_teardown; 2929 } 2930 2931 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); 2932 g_free(opregion); 2933 if (ret) { 2934 goto out_teardown; 2935 } 2936 } 2937 2938 /* QEMU emulates all of MSI & MSIX */ 2939 if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { 2940 memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, 2941 MSIX_CAP_LENGTH); 2942 } 2943 2944 if (pdev->cap_present & QEMU_PCI_CAP_MSI) { 2945 memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff, 2946 vdev->msi_cap_size); 2947 } 2948 2949 if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { 2950 vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, 2951 vfio_intx_mmap_enable, vdev); 2952 pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update); 2953 ret = vfio_intx_enable(vdev, errp); 2954 if (ret) { 2955 goto out_teardown; 2956 } 2957 } 2958 2959 if (vdev->display != ON_OFF_AUTO_OFF) { 2960 ret = vfio_display_probe(vdev, errp); 2961 if (ret) { 2962 goto out_teardown; 2963 } 2964 } 2965 if (vdev->enable_ramfb && vdev->dpy == NULL) { 2966 error_setg(errp, "ramfb=on requires display=on"); 2967 goto out_teardown; 2968 } 2969 if (vdev->display_xres || vdev->display_yres) { 2970 if (vdev->dpy == NULL) { 2971 error_setg(errp, "xres and yres properties require display=on"); 2972 goto out_teardown; 2973 } 2974 if (vdev->dpy->edid_regs == NULL) { 2975 error_setg(errp, "xres and yres properties need edid support"); 2976 goto out_teardown; 2977 } 2978 } 2979 2980 if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) { 2981 ret = vfio_pci_nvidia_v100_ram_init(vdev, errp); 2982 if (ret && ret != -ENODEV) { 2983 error_report("Failed to setup NVIDIA V100 GPU RAM"); 2984 } 2985 } 2986 2987 if (vdev->vendor_id == PCI_VENDOR_ID_IBM) { 2988 ret = vfio_pci_nvlink2_init(vdev, errp); 2989 if (ret && ret != -ENODEV) { 2990 error_report("Failed to setup NVlink2 bridge"); 2991 } 2992 } 2993 2994 vfio_register_err_notifier(vdev); 2995 vfio_register_req_notifier(vdev); 2996 vfio_setup_resetfn_quirk(vdev); 2997 2998 return; 2999 3000 out_teardown: 3001 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); 3002 vfio_teardown_msi(vdev); 3003 vfio_bars_exit(vdev); 3004 error: 3005 error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); 3006 } 3007 3008 static void vfio_instance_finalize(Object *obj) 3009 { 3010 VFIOPCIDevice *vdev = PCI_VFIO(obj); 
3011 VFIOGroup *group = vdev->vbasedev.group; 3012 3013 vfio_display_finalize(vdev); 3014 vfio_bars_finalize(vdev); 3015 g_free(vdev->emulated_config_bits); 3016 g_free(vdev->rom); 3017 /* 3018 * XXX Leaking igd_opregion is not an oversight, we can't remove the 3019 * fw_cfg entry therefore leaking this allocation seems like the safest 3020 * option. 3021 * 3022 * g_free(vdev->igd_opregion); 3023 */ 3024 vfio_put_device(vdev); 3025 vfio_put_group(group); 3026 } 3027 3028 static void vfio_exitfn(PCIDevice *pdev) 3029 { 3030 VFIOPCIDevice *vdev = PCI_VFIO(pdev); 3031 3032 vfio_unregister_req_notifier(vdev); 3033 vfio_unregister_err_notifier(vdev); 3034 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); 3035 vfio_disable_interrupts(vdev); 3036 if (vdev->intx.mmap_timer) { 3037 timer_free(vdev->intx.mmap_timer); 3038 } 3039 vfio_teardown_msi(vdev); 3040 vfio_bars_exit(vdev); 3041 } 3042 3043 static void vfio_pci_reset(DeviceState *dev) 3044 { 3045 VFIOPCIDevice *vdev = PCI_VFIO(dev); 3046 3047 trace_vfio_pci_reset(vdev->vbasedev.name); 3048 3049 vfio_pci_pre_reset(vdev); 3050 3051 if (vdev->display != ON_OFF_AUTO_OFF) { 3052 vfio_display_reset(vdev); 3053 } 3054 3055 if (vdev->resetfn && !vdev->resetfn(vdev)) { 3056 goto post_reset; 3057 } 3058 3059 if (vdev->vbasedev.reset_works && 3060 (vdev->has_flr || !vdev->has_pm_reset) && 3061 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) { 3062 trace_vfio_pci_reset_flr(vdev->vbasedev.name); 3063 goto post_reset; 3064 } 3065 3066 /* See if we can do our own bus reset */ 3067 if (!vfio_pci_hot_reset_one(vdev)) { 3068 goto post_reset; 3069 } 3070 3071 /* If nothing else works and the device supports PM reset, use it */ 3072 if (vdev->vbasedev.reset_works && vdev->has_pm_reset && 3073 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) { 3074 trace_vfio_pci_reset_pm(vdev->vbasedev.name); 3075 goto post_reset; 3076 } 3077 3078 post_reset: 3079 vfio_pci_post_reset(vdev); 3080 } 3081 3082 static void vfio_instance_init(Object *obj) 3083 { 3084 PCIDevice *pci_dev = PCI_DEVICE(obj); 3085 VFIOPCIDevice *vdev = PCI_VFIO(obj); 3086 3087 device_add_bootindex_property(obj, &vdev->bootindex, 3088 "bootindex", NULL, 3089 &pci_dev->qdev, NULL); 3090 vdev->host.domain = ~0U; 3091 vdev->host.bus = ~0U; 3092 vdev->host.slot = ~0U; 3093 vdev->host.function = ~0U; 3094 3095 vdev->nv_gpudirect_clique = 0xFF; 3096 3097 /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command 3098 * line, therefore, no need to wait to realize like other devices */ 3099 pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; 3100 } 3101 3102 static Property vfio_pci_dev_properties[] = { 3103 DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), 3104 DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), 3105 DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, 3106 display, ON_OFF_AUTO_OFF), 3107 DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), 3108 DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0), 3109 DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice, 3110 intx.mmap_timeout, 1100), 3111 DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features, 3112 VFIO_FEATURE_ENABLE_VGA_BIT, false), 3113 DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, 3114 VFIO_FEATURE_ENABLE_REQ_BIT, true), 3115 DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, 3116 VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), 3117 DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), 3118 DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice, 3119 vbasedev.balloon_allowed, 
false), 3120 DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false), 3121 DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false), 3122 DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false), 3123 DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice, 3124 no_geforce_quirks, false), 3125 DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd, 3126 false), 3127 DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd, 3128 false), 3129 DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID), 3130 DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID), 3131 DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice, 3132 sub_vendor_id, PCI_ANY_ID), 3133 DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, 3134 sub_device_id, PCI_ANY_ID), 3135 DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0), 3136 DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice, 3137 nv_gpudirect_clique, 3138 qdev_prop_nv_gpudirect_clique, uint8_t), 3139 DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, 3140 OFF_AUTOPCIBAR_OFF), 3141 /* 3142 * TODO - support passed fds... is this necessary? 3143 * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), 3144 * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), 3145 */ 3146 DEFINE_PROP_END_OF_LIST(), 3147 }; 3148 3149 static const VMStateDescription vfio_pci_vmstate = { 3150 .name = "vfio-pci", 3151 .unmigratable = 1, 3152 }; 3153 3154 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) 3155 { 3156 DeviceClass *dc = DEVICE_CLASS(klass); 3157 PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); 3158 3159 dc->reset = vfio_pci_reset; 3160 dc->props = vfio_pci_dev_properties; 3161 dc->vmsd = &vfio_pci_vmstate; 3162 dc->desc = "VFIO-based PCI device assignment"; 3163 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 3164 pdc->realize = vfio_realize; 3165 pdc->exit = vfio_exitfn; 3166 pdc->config_read = vfio_pci_read_config; 3167 pdc->config_write = vfio_pci_write_config; 3168 } 3169 3170 static const TypeInfo vfio_pci_dev_info = { 3171 .name = TYPE_VFIO_PCI, 3172 .parent = TYPE_PCI_DEVICE, 3173 .instance_size = sizeof(VFIOPCIDevice), 3174 .class_init = vfio_pci_dev_class_init, 3175 .instance_init = vfio_instance_init, 3176 .instance_finalize = vfio_instance_finalize, 3177 .interfaces = (InterfaceInfo[]) { 3178 { INTERFACE_PCIE_DEVICE }, 3179 { INTERFACE_CONVENTIONAL_PCI_DEVICE }, 3180 { } 3181 }, 3182 }; 3183 3184 static Property vfio_pci_dev_nohotplug_properties[] = { 3185 DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), 3186 DEFINE_PROP_END_OF_LIST(), 3187 }; 3188 3189 static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) 3190 { 3191 DeviceClass *dc = DEVICE_CLASS(klass); 3192 3193 dc->props = vfio_pci_dev_nohotplug_properties; 3194 dc->hotpluggable = false; 3195 } 3196 3197 static const TypeInfo vfio_pci_nohotplug_dev_info = { 3198 .name = TYPE_VIFO_PCI_NOHOTPLUG, 3199 .parent = TYPE_VFIO_PCI, 3200 .instance_size = sizeof(VFIOPCIDevice), 3201 .class_init = vfio_pci_nohotplug_dev_class_init, 3202 }; 3203 3204 static void register_vfio_pci_dev_type(void) 3205 { 3206 type_register_static(&vfio_pci_dev_info); 3207 type_register_static(&vfio_pci_nohotplug_dev_info); 3208 } 3209 3210 type_init(register_vfio_pci_dev_type) 3211
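/*
 * Minimal usage sketch, based on the hint text in vfio_realize() above.
 * The host address is illustrative only; substitute the DDDD:BB:DD.F
 * address or sysfs path of the device bound to vfio-pci on the host:
 *
 *   qemu-system-x86_64 ... -device vfio-pci,host=0000:06:0d.0
 *   qemu-system-x86_64 ... -device vfio-pci,sysfsdev=/sys/bus/pci/devices/0000:06:0d.0
 *
 * Properties prefixed with "x-" in the list above are experimental tunables
 * and may change or disappear in later releases.
 */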