/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_bridge.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/option.h"
#include "qemu/range.h"
#include "qemu/units.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "pci.h"
#include "trace.h"
#include "qapi/error.h"

#define TYPE_VFIO_PCI "vfio-pci"
#define PCI_VFIO(obj) OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)

#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"

static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);

/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead.  We try to get the best of both worlds by
 * waiting until an interrupt occurs to disable mmaps (subsequent
 * transitions to the same state are effectively no overhead).  If the
 * interrupt has been serviced and the time gap is long enough, we
 * re-enable mmaps for performance.  This works well for things like
 * graphics cards, which may not use their interrupt at all and are
 * penalized to an unusable level by read/write BAR traps.  Other devices,
 * like NICs, have more regular interrupts and see much better latency by
 * staying in non-mmap mode.  We therefore set the default mmap_timeout
 * such that a ping is just enough to keep the mmap disabled.  Users can
 * experiment with other options with the x-intx-mmap-timeout-ms parameter
 * (a value of zero disables the timer).
 */
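
/*
 * Illustrative usage (hypothetical device address): the timeout described
 * above is tunable per device on the QEMU command line, e.g.
 *
 *   -device vfio-pci,host=0000:02:00.0,x-intx-mmap-timeout-ms=2000
 *
 * Per the comment above, a value of zero disables the re-enable timer.
 */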
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (vdev->intx.pending) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}

static void vfio_intx_interrupt(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    pci_irq_assert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
    }
}

static void vfio_intx_eoi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    if (!vdev->intx.pending) {
        return;
    }

    trace_vfio_intx_eoi(vbasedev->name);

    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
}

static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_setg(errp, "event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_setg_errno(errp, errno, "failed to setup resample irqfd");
        goto fail_irqfd;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_setg_errno(errp, -ret, "failed to setup INTx unmask fd");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
#endif
}
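
/*
 * Informal sketch of the accelerated INTx path configured above:
 *
 *   device INTx --> VFIO --> intx.interrupt eventfd --> KVM irqfd --> guest
 *   guest EOI   --> KVM resamplefd (intx.unmask)    --> VFIO unmask
 *
 * Once the irqfd pair is registered, interrupt delivery and the EOI/unmask
 * round trip both bypass QEMU userspace.
 */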
static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
}

static void vfio_intx_update(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    PCIINTxRoute route;
    Error *err = NULL;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    trace_vfio_intx_update(vdev->vbasedev.name,
                           vdev->intx.route.irq, route.irq);

    vfio_intx_disable_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_intx_enable_kvm(vdev, &err);
    if (err) {
        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_intx_eoi(&vdev->vbasedev);
}
static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    int ret, argsz, retval = 0;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;
    Error *err = NULL;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    pci_config_set_interrupt_pin(vdev->pdev.config, pin);

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_setg_errno(errp, -ret, "event_notifier_init failed");
        return ret;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    if (ret) {
        error_setg_errno(errp, -ret, "failed to setup INTx fd");
        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        retval = -errno;
        goto cleanup;
    }

    vfio_intx_enable_kvm(vdev, &err);
    if (err) {
        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    vdev->interrupt = VFIO_INT_INTx;

    trace_vfio_intx_enable(vdev->vbasedev.name);

cleanup:
    g_free(irq_set);

    return retval;
}

static void vfio_intx_disable(VFIOPCIDevice *vdev)
{
    int fd;

    timer_del(vdev->intx.mmap_timer);
    vfio_intx_disable_kvm(vdev);
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    trace_vfio_intx_disable(vdev->vbasedev.name);
}

/*
 * MSI/X
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIOPCIDevice *vdev = vector->vdev;
    MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
    void (*notify)(PCIDevice *dev, unsigned vector);
    MSIMessage msg;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

    if (vdev->interrupt == VFIO_INT_MSIX) {
        get_msg = msix_get_message;
        notify = msix_notify;

        /* A masked vector firing needs to use the PBA, enable it */
        if (msix_is_masked(&vdev->pdev, nr)) {
            set_bit(nr, vdev->msix->pending);
            memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
            trace_vfio_msix_pba_enable(vdev->vbasedev.name);
        }
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        get_msg = msi_get_message;
        notify = msi_notify;
    } else {
        abort();
    }

    msg = get_msg(&vdev->pdev, nr);
    trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
    notify(&vdev->pdev, nr);
}
static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        int fd = -1;

        /*
         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
         * bits, therefore we always use the KVM signaling path once it is
         * set up.  MSI-X mask and pending bits are emulated, so we want to
         * use the KVM signaling path only when configured and unmasked.
         */
        if (vdev->msi_vectors[i].use) {
            if (vdev->msi_vectors[i].virq < 0 ||
                (msix && msix_is_masked(&vdev->pdev, i))) {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
            } else {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
            }
        }

        fds[i] = fd;
    }

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}
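
/*
 * Worked example (hypothetical numbers): with nr_vectors == 3 the ioctl
 * payload built above is a struct vfio_irq_set followed inline by three
 * int32_t eventfds; with the 20-byte Linux UAPI header that gives
 *
 *   argsz = sizeof(struct vfio_irq_set) + 3 * sizeof(int32_t)
 *         = 20 + 12 = 32 bytes
 *   data  = { fd0, -1, fd2 }
 *
 * where an fd of -1 asks the kernel not to wire up a trigger for that
 * vector.
 */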
static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
                                  int vector_n, bool msix)
{
    int virq;

    if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
        return;
    }

    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
        return;
    }

    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
    if (virq < 0) {
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                           NULL, virq) < 0) {
        kvm_irqchip_release_virq(kvm_state, virq);
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    vector->virq = virq;
}

static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
{
    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                          vector->virq);
    kvm_irqchip_release_virq(kvm_state, vector->virq);
    vector->virq = -1;
    event_notifier_cleanup(&vector->kvm_interrupt);
}

static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
                                     PCIDevice *pdev)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
    kvm_irqchip_commit_routes(kvm_state);
}

static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIOMSIVector *vector;
    int ret;

    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);

    vector = &vdev->msi_vectors[nr];

    if (!vector->use) {
        vector->vdev = vdev;
        vector->virq = -1;
        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }
        vector->use = true;
        msix_vector_use(pdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vector);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg, pdev);
        }
    } else {
        if (msg) {
            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
        }
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shut them down and
     * incrementally increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        if (vector->virq >= 0) {
            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
        } else {
            *pfd = event_notifier_get_fd(&vector->interrupt);
        }

        ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d", ret);
        }
    }

    /* Disable PBA emulation when nothing more is pending. */
    clear_bit(nr, vdev->msix->pending);
    if (find_first_bit(vdev->msix->pending,
                       vdev->nr_vectors) == vdev->nr_vectors) {
        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
    }

    return 0;
}
static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];

    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);

    /*
     * There are still old guests that mask and unmask vectors on every
     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
     * the KVM setup in place, simply switch VFIO to use the non-bypass
     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
     * core will mask the interrupt and set pending bits, allowing it to
     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
     */
    if (vector->virq >= 0) {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        *pfd = event_notifier_get_fd(&vector->interrupt);

        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);

        g_free(irq_set);
    }
}

static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host.  When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked.  We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix().  This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    trace_vfio_msix_enable(vdev->vbasedev.name);
}
static void vfio_msi_enable(VFIOPCIDevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        vector->vdev = vdev;
        vector->virq = -1;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }

        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            vfio_msi_interrupt, NULL, vector);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vfio_add_kvm_msi_virq(vdev, vector, i, false);
    }

    /* Set interrupt type prior to possible interrupts */
    vdev->interrupt = VFIO_INT_MSI;

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }
        vdev->nr_vectors = 0;

        /*
         * Failing to set up MSI doesn't really fall within any specification.
         * Let's try leaving interrupts disabled and hope the guest figures
         * out how to fall back to INTx for this device.
         */
        error_report("vfio: Error: Failed to enable MSI");
        vdev->interrupt = VFIO_INT_NONE;

        return;
    }

    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
}
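
/*
 * Example of the retry path above (hypothetical counts): if the guest
 * programs 8 MSI vectors but VFIO_DEVICE_SET_IRQS only manages to enable
 * 4, the ioctl returns 4; we tear down all 8 notifiers, set nr_vectors to
 * 4, and retry with the smaller allocation.
 */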
static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
{
    Error *err = NULL;
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        if (vdev->msi_vectors[i].use) {
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }
    }

    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_intx_enable(vdev, &err);
    if (err) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }
}

static void vfio_msix_disable(VFIOPCIDevice *vdev)
{
    int i;

    msix_unset_vector_notifiers(&vdev->pdev);

    /*
     * MSI-X will only release vectors if MSI-X is still enabled on the
     * device, check through the rest and release it ourselves if necessary.
     */
    for (i = 0; i < vdev->nr_vectors; i++) {
        if (vdev->msi_vectors[i].use) {
            vfio_msix_vector_release(&vdev->pdev, i);
            msix_vector_unuse(&vdev->pdev, i);
        }
    }

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_msi_disable_common(vdev);

    memset(vdev->msix->pending, 0,
           BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));

    trace_vfio_msix_disable(vdev->vbasedev.name);
}

static void vfio_msi_disable(VFIOPCIDevice *vdev)
{
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_msi_disable_common(vdev);

    trace_vfio_msi_disable(vdev->vbasedev.name);
}

static void vfio_update_msi(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg;

        if (!vector->use || vector->virq < 0) {
            continue;
        }

        msg = msi_get_message(&vdev->pdev, i);
        vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
    }
}

static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
{
    struct vfio_region_info *reg_info;
    uint64_t size;
    off_t off = 0;
    ssize_t bytes;

    if (vfio_get_region_info(&vdev->vbasedev,
                             VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
        error_report("vfio: Error getting ROM info: %m");
        return;
    }

    trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
                            (unsigned long)reg_info->offset,
                            (unsigned long)reg_info->flags);

    vdev->rom_size = size = reg_info->size;
    vdev->rom_offset = reg_info->offset;

    g_free(reg_info);

    if (!vdev->rom_size) {
        vdev->rom_read_failed = true;
        error_report("vfio-pci: Cannot read device rom at "
                     "%s", vdev->vbasedev.name);
        error_printf("Device option ROM contents are probably invalid "
                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
                     "or load from file with romfile=\n");
        return;
    }

    vdev->rom = g_malloc(size);
    memset(vdev->rom, 0xff, size);

    while (size) {
        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
                      size, vdev->rom_offset + off);
        if (bytes == 0) {
            break;
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            break;
        }
    }

    /*
     * Test the ROM signature against our device, if the vendor is correct
     * but the device ID doesn't match, store the correct device ID and
     * recompute the checksum.  Intel IGD devices need this and are known
     * to have bogus checksums so we can't simply adjust the checksum.
     */
    if (pci_get_word(vdev->rom) == 0xaa55 &&
        pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
        !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
        uint16_t vid, did;

        vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
        did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);

        if (vid == vdev->vendor_id && did != vdev->device_id) {
            int i;
            uint8_t csum, *data = vdev->rom;

            pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
                         vdev->device_id);
            data[6] = 0;

            for (csum = 0, i = 0; i < vdev->rom_size; i++) {
                csum += data[i];
            }

            data[6] = -csum;
        }
    }
}
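
/*
 * Checksum note (informal): PCI option ROM bytes must sum to zero mod 256.
 * After patching the device ID, the loop above sums every byte with the
 * checksum byte cleared; storing the two's complement (-csum) makes the
 * total zero again.  E.g. if the bytes sum to 0x37, the stored byte is
 * 0xc9, since 0x37 + 0xc9 == 0x100 == 0x00 (mod 256).
 */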
static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOPCIDevice *vdev = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } val;
    uint64_t data = 0;

    /* Load the ROM lazily when the guest tries to read it */
    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
        vfio_pci_load_rom(vdev);
    }

    memcpy(&val, vdev->rom + addr,
           (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);

    switch (size) {
    case 1:
        data = val.byte;
        break;
    case 2:
        data = le16_to_cpu(val.word);
        break;
    case 4:
        data = le32_to_cpu(val.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);

    return data;
}

static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}

static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
{
    uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    DeviceState *dev = DEVICE(vdev);
    char *name;
    int fd = vdev->vbasedev.fd;

    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
        /* Since pci handles romfile, just print a message and return */
        if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
            warn_report("Device at %s is known to cause system instability"
                        " issues during option rom execution",
                        vdev->vbasedev.name);
            error_printf("Proceeding anyway since user specified romfile\n");
        }
        return;
    }

    /*
     * Use the same size ROM BAR as the physical device.  The contents
     * will get filled in later when the guest tries to read it.
     */
    if (pread(fd, &orig, 4, offset) != 4 ||
        pwrite(fd, &size, 4, offset) != 4 ||
        pread(fd, &size, 4, offset) != 4 ||
        pwrite(fd, &orig, 4, offset) != 4) {
        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
        return;
    }

    size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;

    if (!size) {
        return;
    }

    if (vfio_blacklist_opt_rom(vdev)) {
        if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
            warn_report("Device at %s is known to cause system instability"
                        " issues during option rom execution",
                        vdev->vbasedev.name);
            error_printf("Proceeding anyway since user specified"
                         " non-zero value for rombar\n");
        } else {
            warn_report("Rom loading for device at %s has been disabled"
                        " due to system instability issues",
                        vdev->vbasedev.name);
            error_printf("Specify rombar=1 or romfile to force\n");
            return;
        }
    }

    trace_vfio_pci_size_rom(vdev->vbasedev.name, size);

    name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);

    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
                          &vfio_rom_ops, vdev, name, size);
    g_free(name);

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);

    vdev->rom_read_failed = false;
}
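
/*
 * BAR-sizing arithmetic used above, with a hypothetical read-back value:
 * writing all 1s to the ROM BAR and reading it back returns the address
 * mask; e.g. a masked read-back of 0xfffe0000 gives
 *
 *   size = ~0xfffe0000 + 1 = 0x20000   (128 KiB)
 */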
void vfio_vga_write(void *opaque, hwaddr addr,
                    uint64_t data, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    off_t offset = vga->fd_offset + region->offset + addr;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes", size);
        break;
    }

    if (pwrite(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, region->offset + addr, data, size);
    }

    trace_vfio_vga_write(region->offset + addr, data, size);
}

uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;
    off_t offset = vga->fd_offset + region->offset + addr;

    if (pread(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, region->offset + addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes", size);
        break;
    }

    trace_vfio_vga_read(region->offset + addr, size, data);

    return data;
}

static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Expand the memory region of a sub-page (size < PAGE_SIZE) MMIO BAR to
 * page size if the BAR sits in an exclusive host page, so that we can mmap
 * this BAR into the guest.  The sub-page BAR may not occupy an exclusive
 * page in the guest, however, so we set the priority of the expanded memory
 * region to zero in case it overlaps BARs that share the same guest page
 * with the sub-page BAR.  We must also restore the size of this sub-page
 * BAR when its base address is changed in the guest and is no longer page
 * aligned.
 */
static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIORegion *region = &vdev->bars[bar].region;
    MemoryRegion *mmap_mr, *region_mr, *base_mr;
    PCIIORegion *r;
    pcibus_t bar_addr;
    uint64_t size = region->size;

    /* Make sure that the whole region is allowed to be mmapped */
    if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
        region->mmaps[0].size != region->size) {
        return;
    }

    r = &pdev->io_regions[bar];
    bar_addr = r->addr;
    base_mr = vdev->bars[bar].mr;
    region_mr = region->mem;
    mmap_mr = &region->mmaps[0].mem;

    /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
    if (bar_addr != PCI_BAR_UNMAPPED &&
        !(bar_addr & ~qemu_real_host_page_mask)) {
        size = qemu_real_host_page_size;
    }

    memory_region_transaction_begin();

    if (vdev->bars[bar].size < size) {
        memory_region_set_size(base_mr, size);
    }
    memory_region_set_size(region_mr, size);
    memory_region_set_size(mmap_mr, size);
    if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
        memory_region_del_subregion(r->address_space, base_mr);
        memory_region_add_subregion_overlap(r->address_space,
                                            bar_addr, base_mr, 0);
    }

    memory_region_transaction_commit();
}

/*
 * PCI config space
 */
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;

    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
    emu_bits = le32_to_cpu(emu_bits);

    if (emu_bits) {
        emu_val = pci_default_read_config(pdev, addr, len);
    }

    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
        ssize_t ret;

        ret = pread(vdev->vbasedev.fd, &phys_val, len,
                    vdev->config_offset + addr);
        if (ret != len) {
            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
                         __func__, vdev->vbasedev.name, addr, len);
            return -errno;
        }
        phys_val = le32_to_cpu(phys_val);
    }

    val = (emu_val & emu_bits) | (phys_val & ~emu_bits);

    trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);

    return val;
}
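
/*
 * Merge example (hypothetical values): a 2-byte read at an offset whose
 * emulated_config_bits are 0x00ff takes the low byte from QEMU's emulated
 * config space and the high byte from the physical device:
 *
 *   emu_val = 0x1234, emu_bits = 0x00ff, phys_val = 0xabcd
 *   val     = (0x1234 & 0x00ff) | (0xabcd & 0xff00) = 0xab34
 */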
void vfio_pci_write_config(PCIDevice *pdev,
                           uint32_t addr, uint32_t val, int len)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    uint32_t val_le = cpu_to_le32(val);

    trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);

    /* Write everything to VFIO, let it filter out what we can't write */
    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
        != len) {
        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
                     __func__, vdev->vbasedev.name, addr, val, len);
    }

    /* MSI/MSI-X Enabling/Disabling */
    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        int is_enabled, was_enabled = msi_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msi_enabled(pdev);

        if (!was_enabled) {
            if (is_enabled) {
                vfio_msi_enable(vdev);
            }
        } else {
            if (!is_enabled) {
                vfio_msi_disable(vdev);
            } else {
                vfio_update_msi(vdev);
            }
        }
    } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
               ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        int is_enabled, was_enabled = msix_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msix_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_msix_enable(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_msix_disable(vdev);
        }
    } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
               range_covers_byte(addr, len, PCI_COMMAND)) {
        pcibus_t old_addr[PCI_NUM_REGIONS - 1];
        int bar;

        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
            old_addr[bar] = pdev->io_regions[bar].addr;
        }

        pci_default_write_config(pdev, addr, val, len);

        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
            if (old_addr[bar] != pdev->io_regions[bar].addr &&
                vdev->bars[bar].region.size > 0 &&
                vdev->bars[bar].region.size < qemu_real_host_page_size) {
                vfio_sub_page_bar_update_mapping(pdev, bar);
            }
        }
    } else {
        /* Write everything to QEMU to keep emulated bits correct */
        pci_default_write_config(pdev, addr, val, len);
    }
}

/*
 * Interrupt setup
 */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
{
    /*
     * More complicated than it looks.  Disabling MSI/X transitions the
     * device to INTx mode (if supported).  Therefore we need to first
     * disable MSI/X and then cleanup by disabling INTx.
     */
    if (vdev->interrupt == VFIO_INT_MSIX) {
        vfio_msix_disable(vdev);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        vfio_msi_disable(vdev);
    }

    if (vdev->interrupt == VFIO_INT_INTx) {
        vfio_intx_disable(vdev);
    }
}

static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
{
    uint16_t ctrl;
    bool msi_64bit, msi_maskbit;
    int ret, entries;
    Error *err = NULL;

    if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
        return -errno;
    }
    ctrl = le16_to_cpu(ctrl);

    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);

    trace_vfio_msi_setup(vdev->vbasedev.name, pos);

    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_propagate_prepend(errp, err, "msi_init failed: ");
        return ret;
    }
    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);

    return 0;
}
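
/*
 * Capability-size arithmetic above, spelled out: the base MSI capability
 * is 0xa bytes; a 64-bit address field adds 0x4 and per-vector masking
 * adds another 0xa.  A 64-bit, maskable function therefore has
 *
 *   msi_cap_size = 0xa + 0xa + 0x4 = 0x18 (24 bytes)
 */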
static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
{
    off_t start, end;
    VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;

    /*
     * If the host driver allows mapping of the MSI-X data, we map the
     * entire BAR and emulate the MSI-X table on top of that.
     */
    if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
                            VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
        return;
    }

    /*
     * We expect to find a single mmap covering the whole BAR, anything else
     * means it's either unsupported or already setup.
     */
    if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
        region->size != region->mmaps[0].size) {
        return;
    }

    /* MSI-X table start and end aligned to host page size */
    start = vdev->msix->table_offset & qemu_real_host_page_mask;
    end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
                               (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

    /*
     * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
     * NB - Host page size is necessarily a power of two and so is the PCI
     * BAR (not counting EA yet), therefore if we have host page aligned
     * @start and @end, then any remainder of the BAR before or after those
     * must be at least host page sized and therefore mmap'able.
     */
    if (!start) {
        if (end >= region->size) {
            region->nr_mmaps = 0;
            g_free(region->mmaps);
            region->mmaps = NULL;
            trace_vfio_msix_fixup(vdev->vbasedev.name,
                                  vdev->msix->table_bar, 0, 0);
        } else {
            region->mmaps[0].offset = end;
            region->mmaps[0].size = region->size - end;
            trace_vfio_msix_fixup(vdev->vbasedev.name,
                                  vdev->msix->table_bar,
                                  region->mmaps[0].offset,
                                  region->mmaps[0].offset +
                                  region->mmaps[0].size);
        }

    /* Maybe it's aligned at the end of the BAR */
    } else if (end >= region->size) {
        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar,
                              region->mmaps[0].offset,
                              region->mmaps[0].offset +
                              region->mmaps[0].size);

    /* Otherwise it must split the BAR */
    } else {
        region->nr_mmaps = 2;
        region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);

        memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));

        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar,
                              region->mmaps[0].offset,
                              region->mmaps[0].offset +
                              region->mmaps[0].size);

        region->mmaps[1].offset = end;
        region->mmaps[1].size = region->size - end;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar,
                              region->mmaps[1].offset,
                              region->mmaps[1].offset +
                              region->mmaps[1].size);
    }
}
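
/*
 * Split example (hypothetical layout, 4 KiB host pages): a 16 KiB BAR with
 * an 8-entry MSI-X table at offset 0x2000 gives start = 0x2000 and
 * end = REAL_HOST_PAGE_ALIGN(0x2000 + 8 * 16) = 0x3000, so the BAR is
 * mmapped as two chunks, [0x0000, 0x2000) and [0x3000, 0x4000), with the
 * table page in between left to trapped access.
 */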
static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
{
    int target_bar = -1;
    size_t msix_sz;

    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
        return;
    }

    /* The actual minimum size of MSI-X structures */
    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
    /* Round up to host pages, we don't want to share a page */
    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
    /* PCI BARs must be a power of 2 */
    msix_sz = pow2ceil(msix_sz);

    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
        /*
         * TODO: Lookup table for known devices.
         *
         * Logically we might use an algorithm here to select the BAR adding
         * the least additional MMIO space, but we cannot programmatically
         * predict the driver dependency on BAR ordering or sizing, therefore
         * 'auto' becomes a lookup for combinations reported to work.
         */
        if (target_bar < 0) {
            error_setg(errp, "No automatic MSI-X relocation available for "
                       "device %04x:%04x", vdev->vendor_id, vdev->device_id);
            return;
        }
    } else {
        target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
    }

    /* I/O port BARs cannot host MSI-X structures */
    if (vdev->bars[target_bar].ioport) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "I/O port BAR", target_bar);
        return;
    }

    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
    if (!vdev->bars[target_bar].size &&
        target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
        return;
    }

    /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
    if (vdev->bars[target_bar].size > 1 * GiB &&
        !vdev->bars[target_bar].mem64) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "no space to extend 32-bit BAR", target_bar);
        return;
    }

    /*
     * If adding a new BAR, test if we can make it 64bit.  We make it
     * prefetchable since QEMU MSI-X emulation has no read side effects
     * and doing so makes mapping more flexible.
     */
    if (!vdev->bars[target_bar].size) {
        if (target_bar < (PCI_ROM_SLOT - 1) &&
            !vdev->bars[target_bar + 1].size) {
            vdev->bars[target_bar].mem64 = true;
            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
        }
        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
        vdev->bars[target_bar].size = msix_sz;
        vdev->msix->table_offset = 0;
    } else {
        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
                                          msix_sz * 2);
        /*
         * Due to above size calc, MSI-X always starts halfway into the BAR,
         * which will always be a separate host page.
         */
        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
    }

    vdev->msix->table_bar = target_bar;
    vdev->msix->pba_bar = target_bar;
    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
    vdev->msix->pba_offset = vdev->msix->table_offset +
                             (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);

    trace_vfio_msix_relo(vdev->vbasedev.name,
                         vdev->msix->table_bar, vdev->msix->table_offset);
}
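
/*
 * Sizing example (hypothetical entry count, 4 KiB host pages): for 96
 * MSI-X entries,
 *
 *   table   = 96 * PCI_MSIX_ENTRY_SIZE = 96 * 16 = 1536 bytes
 *   PBA     = QEMU_ALIGN_UP(96, 64) / 8 = 128 / 8 = 16 bytes
 *   msix_sz = pow2ceil(REAL_HOST_PAGE_ALIGN(1552)) = 4096
 *
 * so a newly added BAR would be 4 KiB, while an existing 4 KiB BAR would
 * double to 8 KiB with the table placed at offset 0x1000.
 */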
/*
 * We don't have any control over how pci_add_capability() inserts
 * capabilities into the chain.  In order to set up MSI-X we need a
 * MemoryRegion for the BAR.  In order to set up the BAR and not
 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
 * need to first look for where the MSI-X table lives.  So we
 * unfortunately split MSI-X setup across two functions.
 */
static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
{
    uint8_t pos;
    uint16_t ctrl;
    uint32_t table, pba;
    int fd = vdev->vbasedev.fd;
    VFIOMSIXInfo *msix;

    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
    if (!pos) {
        return;
    }

    if (pread(fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
        return;
    }

    if (pread(fd, &table, sizeof(table),
              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
        return;
    }

    if (pread(fd, &pba, sizeof(pba),
              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
        return;
    }

    ctrl = le16_to_cpu(ctrl);
    table = le32_to_cpu(table);
    pba = le32_to_cpu(pba);

    msix = g_malloc0(sizeof(*msix));
    msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
    msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
    msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
    msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
    msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

    /*
     * Test the size of the pba_offset variable and catch if it extends
     * outside of the specified BAR.  If it is the case, we need to apply a
     * hardware specific quirk if the device is known or we have a broken
     * configuration.
     */
    if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
        /*
         * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
         * adapters.  The T5 hardware returns an incorrect value of 0x8000
         * for the VF PBA offset while the BAR itself is only 8k.  The
         * correct value is 0x1000, so we hard code that here.
         */
        if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
            (vdev->device_id & 0xff00) == 0x5800) {
            msix->pba_offset = 0x1000;
        } else {
            error_setg(errp, "hardware reports invalid configuration, "
                       "MSIX PBA outside of specified BAR");
            g_free(msix);
            return;
        }
    }

    trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
                                msix->table_offset, msix->entries);
    vdev->msix = msix;

    vfio_pci_fixup_msix_region(vdev);

    vfio_pci_relocate_msix(vdev, errp);
}
static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
{
    int ret;
    Error *err = NULL;

    vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
                                    sizeof(unsigned long));
    ret = msix_init(&vdev->pdev, vdev->msix->entries,
                    vdev->bars[vdev->msix->table_bar].mr,
                    vdev->msix->table_bar, vdev->msix->table_offset,
                    vdev->bars[vdev->msix->pba_bar].mr,
                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
                    &err);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            warn_report_err(err);
            return 0;
        }

        error_propagate(errp, err);
        return ret;
    }

    /*
     * The PCI spec suggests that devices provide additional alignment for
     * MSI-X structures and avoid overlapping non-MSI-X related registers.
     * For an assigned device, this hopefully means that emulation of MSI-X
     * structures does not affect the performance of the device.  If devices
     * fail to provide that alignment, a significant performance penalty may
     * result, for instance Mellanox MT27500 VFs:
     * http://www.spinics.net/lists/kvm/msg125881.html
     *
     * The PBA is simply not that important for such a serious regression and
     * most drivers do not appear to look at it.  The solution for this is to
     * disable the PBA MemoryRegion unless it's being used.  We disable it
     * here and only enable it if a masked vector fires through QEMU.  As the
     * vector-use notifier is called, which occurs on unmask, we test whether
     * PBA emulation is needed and again disable if not.
     */
    memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);

    /*
     * The emulated machine may provide a paravirt interface for MSIX setup
     * so it is not strictly necessary to emulate MSIX here.  This becomes
     * helpful when frequently accessed MMIO registers are located in
     * subpages adjacent to the MSIX table but the MSIX data containing page
     * cannot be mapped because of a host page size bigger than the MSIX
     * table alignment.
     */
    if (object_property_get_bool(OBJECT(qdev_get_machine()),
                                 "vfio-no-msix-emulation", NULL)) {
        memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
    }

    return 0;
}

static void vfio_teardown_msi(VFIOPCIDevice *vdev)
{
    msi_uninit(&vdev->pdev);

    if (vdev->msix) {
        msix_uninit(&vdev->pdev,
                    vdev->bars[vdev->msix->table_bar].mr,
                    vdev->bars[vdev->msix->pba_bar].mr);
        g_free(vdev->msix->pending);
    }
}

/*
 * Resource setup
 */
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
    }
}
static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];

    uint32_t pci_bar;
    int ret;

    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
    if (!bar->region.size) {
        return;
    }

    /* Determine what type of BAR this is for registration */
    ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (ret != sizeof(pci_bar)) {
        error_report("vfio: Failed to read BAR %d (%m)", nr);
        return;
    }

    pci_bar = le32_to_cpu(pci_bar);
    bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
    bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
    bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
                                         ~PCI_BASE_ADDRESS_MEM_MASK);
    bar->size = bar->region.size;
}

static void vfio_bars_prepare(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_bar_prepare(vdev, i);
    }
}

static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    char *name;

    if (!bar->size) {
        return;
    }

    bar->mr = g_new0(MemoryRegion, 1);
    name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
    g_free(name);

    if (bar->region.size) {
        memory_region_add_subregion(bar->mr, 0, bar->region.mem);

        if (vfio_region_mmap(&bar->region)) {
            error_report("Failed to mmap %s BAR %d. Performance may be slow",
                         vdev->vbasedev.name, nr);
        }
    }

    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
}

static void vfio_bars_register(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_bar_register(vdev, i);
    }
}

static void vfio_bars_exit(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOBAR *bar = &vdev->bars[i];

        vfio_bar_quirk_exit(vdev, i);
        vfio_region_exit(&bar->region);
        if (bar->region.size) {
            memory_region_del_subregion(bar->mr, bar->region.mem);
        }
    }

    if (vdev->vga) {
        pci_unregister_vga(&vdev->pdev);
        vfio_vga_quirk_exit(vdev);
    }
}

static void vfio_bars_finalize(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOBAR *bar = &vdev->bars[i];

        vfio_bar_quirk_finalize(vdev, i);
        vfio_region_finalize(&bar->region);
        if (bar->size) {
            object_unparent(OBJECT(bar->mr));
            g_free(bar->mr);
        }
    }

    if (vdev->vga) {
        vfio_vga_quirk_finalize(vdev);
        for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
            object_unparent(OBJECT(&vdev->vga->region[i].mem));
        }
        g_free(vdev->vga);
    }
}

/*
 * General setup
 */
static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
{
    uint8_t tmp;
    uint16_t next = PCI_CONFIG_SPACE_SIZE;

    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
         tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
        if (tmp > pos && tmp < next) {
            next = tmp;
        }
    }

    return next - pos;
}


static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
{
    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;

    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
        if (tmp > pos && tmp < next) {
            next = tmp;
        }
    }

    return next - pos;
}

static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
}

static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
                                   uint16_t val, uint16_t mask)
{
    vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
    vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
    vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
}
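
/*
 * Note on the triple write above (informal): an emulated field is tracked
 * in three parallel maps -- pdev.config holds the value the guest sees,
 * the masked bits of pdev.wmask are cleared so the guest cannot rewrite
 * them, and emulated_config_bits marks which bits vfio_pci_read_config()
 * should serve from QEMU rather than from the physical device.
 */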
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
}

static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
                                   uint32_t val, uint32_t mask)
{
    vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
    vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
    vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
}
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control go away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control go away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex. Windows
             * seems to be happier with devices if we skip the capability.
             */
            return 0;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them
         * up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                                   QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
                                   QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT),
                                   ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }
    }

    /*
     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
     * (Niantic errata #35) causing Windows to error with a Code 10 for the
     * device on Q35. Fix up any such devices to report version 1. If we
     * were to remove the capability entirely the guest would lose extended
     * config space.
     */
    if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
        vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                               1, PCI_EXP_FLAGS_VERS);
    }

    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
                             errp);
    if (pos < 0) {
        return pos;
    }

    vdev->pdev.exp.exp_cap = pos;

    return pos;
}

static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
{
    uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);

    if (cap & PCI_EXP_DEVCAP_FLR) {
        trace_vfio_check_pcie_flr(vdev->vbasedev.name);
        vdev->has_flr = true;
    }
}

static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
{
    uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);

    if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
        trace_vfio_check_pm_reset(vdev->vbasedev.name);
        vdev->has_pm_reset = true;
    }
}

static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
{
    uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);

    if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
        trace_vfio_check_af_flr(vdev->vbasedev.name);
        vdev->has_flr = true;
    }
}

static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
{
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    int ret;

    cap_id = pdev->config[pos];
    next = pdev->config[pos + PCI_CAP_LIST_NEXT];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't
     * recognize. Since QEMU doesn't actually handle many of the config
     * accesses, exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain. Therefore, to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above, as cached config
     * space will be changed as we unwind the stack.
     */
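    /*
     * Example of the resulting order, assuming a physical chain
     * A -> B -> C: the recursion below descends to C first, so C is
     * added first and A last, and pci_add_capability()'s head insertion
     * yields A -> B -> C again in the guest-visible list.
     */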
    if (next) {
        ret = vfio_add_std_cap(vdev, next, errp);
        if (ret) {
            return ret;
        }
    } else {
        /* Begin the rebuild, use QEMU emulated list bits */
        pdev->config[PCI_CAPABILITY_LIST] = 0;
        vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
        vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;

        ret = vfio_add_virt_caps(vdev, errp);
        if (ret) {
            return ret;
        }
    }

    /* Scale down size, especially in case virt caps were added above */
    size = MIN(size, vfio_std_cap_max_size(pdev, pos));

    /* Use emulated next pointer to allow dropping caps */
    pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);

    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_msi_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_EXP:
        vfio_check_pcie_flr(vdev, pos);
        ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_msix_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_PM:
        vfio_check_pm_reset(vdev, pos);
        vdev->pm_cap = pos;
        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
        break;
    case PCI_CAP_ID_AF:
        vfio_check_af_flr(vdev, pos);
        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
        break;
    default:
        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
        break;
    }

    if (ret < 0) {
        error_prepend(errp,
                      "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
                      cap_id, size, pos);
        return ret;
    }

    return 0;
}

static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint32_t header;
    uint16_t cap_id, next, size;
    uint8_t cap_ver;
    uint8_t *config;

    /* Only add extended caps if we have them and the guest can see them */
    if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
        return;
    }

    /*
     * pcie_add_capability always inserts the new capability at the tail
     * of the chain. Therefore, to end up with a chain that matches the
     * physical device, we cache the config space to avoid overwriting
     * the original config space when we parse the extended capabilities.
     */
    config = g_memdup(pdev->config, vdev->config_size);

    /*
     * Extended capabilities are chained with each pointing to the next,
     * so we can drop anything other than the head of the chain simply by
     * modifying the previous next pointer. Seed the head of the chain
     * here such that we can simply skip any capabilities we want to drop
     * below, regardless of their position in the chain. If this stub
     * capability still exists after we add the capabilities we want to
     * expose, update the capability ID to zero. Note that we cannot seed
     * with the capability header being zero as this conflicts with the
     * definition of an absent capability chain and prevents capabilities
     * beyond the head of the list from being added. By replacing the
     * dummy capability ID with zero after walking the device chain, we
     * also transparently mark extended capabilities as absent if no
     * capabilities were added. Note that the PCIe spec defines an absence
     * of extended capabilities to be determined by a value of zero for
     * the capability ID, version, AND next pointer. A non-zero next
     * pointer should be sufficient to indicate additional capabilities
     * are present, which will occur if we call pcie_add_capability()
     * below. The entire first dword is emulated to support this.
     *
     * NB. The kernel side does similar masking, so be prepared that our
     * view of the device may also contain a capability ID zero in the
     * head of the chain. Skip it for the same reason that we cannot seed
     * the chain with a zero capability.
     */
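    /*
     * For reference, an extended capability header is one dword: bits
     * 15:0 are the capability ID, 19:16 the version, and 31:20 the next
     * pointer, so the PCI_EXT_CAP(0xFFFF, 0, 0) seed below reads as ID
     * 0xffff with a zero version and a zero next pointer.
     */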
    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
                 PCI_EXT_CAP(0xFFFF, 0, 0));
    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);

    for (next = PCI_CONFIG_SPACE_SIZE; next;
         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
        header = pci_get_long(config + next);
        cap_id = PCI_EXT_CAP_ID(header);
        cap_ver = PCI_EXT_CAP_VER(header);

        /*
         * If it becomes important to configure extended capabilities to
         * their actual size, use this as the default when it's something
         * we don't recognize. Since QEMU doesn't actually handle many of
         * the config accesses, exact size doesn't seem worthwhile.
         */
        size = vfio_ext_cap_max_size(config, next);

        /* Use emulated next pointer to allow dropping extended caps */
        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
                                   PCI_EXT_CAP_NEXT_MASK);

        switch (cap_id) {
        case 0: /* kernel masked capability */
        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
        case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
            break;
        default:
            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
        }
    }

    /* Clean up the chain head ID if necessary */
    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
    }

    g_free(config);
    return;
}

static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
{
    PCIDevice *pdev = &vdev->pdev;
    int ret;

    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
        !pdev->config[PCI_CAPABILITY_LIST]) {
        return 0; /* Nothing to add */
    }

    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp);
    if (ret) {
        return ret;
    }

    vfio_add_ext_cap(vdev);
    return 0;
}

static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint16_t cmd;

    vfio_disable_interrupts(vdev);

    /* Make sure the device is in D0 */
    if (vdev->pm_cap) {
        uint16_t pmcsr;
        uint8_t state;

        pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
        state = pmcsr & PCI_PM_CTRL_STATE_MASK;
        if (state) {
            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
            vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
            /* vfio handles the necessary delay here */
            pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
            if (state) {
                error_report("vfio: Unable to power on device, stuck in D%d",
                             state);
            }
        }
    }

    /*
     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
     * Also put INTx Disable in a known state.
     */
    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
             PCI_COMMAND_INTX_DISABLE);
    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
}

static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
{
    Error *err = NULL;
    int nr;

    vfio_intx_enable(vdev, &err);
    if (err) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
        off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
        uint32_t val = 0;
        uint32_t len = sizeof(val);

        if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
            error_report("%s(%s) reset bar %d failed: %m", __func__,
                         vdev->vbasedev.name, nr);
        }
    }

    vfio_quirk_reset(vdev);
}

static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
{
    char tmp[13];

    /* snprintf rather than sprintf to bound the 12-character BDF string */
    snprintf(tmp, sizeof(tmp), "%04x:%02x:%02x.%1x", addr->domain,
             addr->bus, addr->slot, addr->function);

    return (strcmp(tmp, name) == 0);
}

static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
{
    VFIOGroup *group;
    struct vfio_pci_hot_reset_info *info;
    struct vfio_pci_dependent_device *devices;
    struct vfio_pci_hot_reset *reset;
    int32_t *fds;
    int ret, i, count;
    bool multi = false;

    trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");

    if (!single) {
        vfio_pci_pre_reset(vdev);
    }
    vdev->vbasedev.needs_reset = false;

    /*
     * Probe with a zero-sized device array first to learn the number of
     * dependent devices, then size the array and ask again.
     */
    info = g_malloc0(sizeof(*info));
    info->argsz = sizeof(*info);

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret && errno != ENOSPC) {
        ret = -errno;
        if (!vdev->has_pm_reset) {
            error_report("vfio: Cannot reset device %s, "
                         "no available reset mechanism.", vdev->vbasedev.name);
        }
        goto out_single;
    }

    count = info->count;
    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
    info->argsz = sizeof(*info) + (count * sizeof(*devices));
    devices = &info->devices[0];

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret) {
        ret = -errno;
        error_report("vfio: hot reset info failed: %m");
        goto out_single;
    }

    trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);

    /* Verify that we have all the groups required */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        trace_vfio_pci_hot_reset_dep_devices(host.domain,
                host.bus, host.slot, host.function, devices[i].group_id);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            if (!vdev->has_pm_reset) {
                error_report("vfio: Cannot reset device %s, "
                             "depends on group %d which is not owned.",
                             vdev->vbasedev.name, devices[i].group_id);
            }
            ret = -EPERM;
            goto out;
        }

        /* Prep dependent devices for reset and clear our marker. */
        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                if (single) {
                    ret = -EINVAL;
                    goto out_single;
                }
                vfio_pci_pre_reset(tmp);
                tmp->vbasedev.needs_reset = false;
                multi = true;
                break;
            }
        }
    }

    if (!single && !multi) {
        ret = -EINVAL;
        goto out_single;
    }

    /* Determine how many group fds need to be passed */
    count = 0;
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                count++;
                break;
            }
        }
    }

    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
    fds = &reset->group_fds[0];

    /* Fill in group fds */
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                fds[reset->count++] = group->fd;
                break;
            }
        }
    }

    /* Bus reset! */
    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
    g_free(reset);

    trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
                                    ret ? "%m" : "Success");

out:
    /* Re-enable INTx on affected devices */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            break;
        }

        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                vfio_pci_post_reset(tmp);
                break;
            }
        }
    }
out_single:
    if (!single) {
        vfio_pci_post_reset(vdev);
    }
    g_free(info);

    return ret;
}

/*
 * We want to differentiate a hot reset of multiple in-use devices from a
 * hot reset of a single in-use device. VFIO_DEVICE_RESET will already
 * handle the case of doing hot resets when there is only a single device
 * per bus. "In-use" here refers to how many VFIODevices are affected. A
 * hot reset that affects multiple devices, but only a single in-use
 * device, means that we can call it from our bus ->reset() callback since
 * the extent is effectively a single device. This allows us to make use
 * of it in the hotplug path. When there are multiple in-use devices, we
 * can only trigger the hot reset during a system reset and thus from our
 * reset handler. We separate _one vs _multi here so that we don't overlap
 * and do a double reset on the system reset path where both our reset
 * handler and ->reset() callback are used. Calling _one() will only do a
 * hot reset for the single in-use device case; calling _multi() will do
 * nothing if a _one() would have been sufficient.
 */
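/*
 * Example, assuming a dual-function card with both functions assigned to
 * this VM: resetting either function requires a bus reset affecting two
 * in-use devices, so vfio_pci_hot_reset_one() fails with -EINVAL and the
 * reset is instead performed by _multi() on the system reset path.
 */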
static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
{
    return vfio_pci_hot_reset(vdev, true);
}

static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    return vfio_pci_hot_reset(vdev, false);
}

static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
        vbasedev->needs_reset = true;
    }
}

static VFIODeviceOps vfio_pci_ops = {
    .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
    .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
    .vfio_eoi = vfio_intx_eoi,
};

int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
{
    VFIODevice *vbasedev = &vdev->vbasedev;
    struct vfio_region_info *reg_info;
    int ret;

    ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
    if (ret) {
        error_setg_errno(errp, -ret,
                         "failed getting region info for VGA region index %d",
                         VFIO_PCI_VGA_REGION_INDEX);
        return ret;
    }

    if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
        !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
        reg_info->size < 0xbffff + 1) {
        error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
                   (unsigned long)reg_info->flags,
                   (unsigned long)reg_info->size);
        g_free(reg_info);
        return -EINVAL;
    }

    vdev->vga = g_new0(VFIOVGA, 1);

    vdev->vga->fd_offset = reg_info->offset;
    vdev->vga->fd = vdev->vbasedev.fd;

    g_free(reg_info);

    vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
    vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);

    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
                          OBJECT(vdev), &vfio_vga_ops,
                          &vdev->vga->region[QEMU_PCI_VGA_MEM],
                          "vfio-vga-mmio@0xa0000",
                          QEMU_PCI_VGA_MEM_SIZE);

    vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
    vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);

    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
                          OBJECT(vdev), &vfio_vga_ops,
                          &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
                          "vfio-vga-io@0x3b0",
                          QEMU_PCI_VGA_IO_LO_SIZE);

    vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
    vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);

    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                          OBJECT(vdev), &vfio_vga_ops,
                          &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                          "vfio-vga-io@0x3c0",
                          QEMU_PCI_VGA_IO_HI_SIZE);

    pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
                     &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
                     &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);

    return 0;
}
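
/*
 * Note: the three regions registered above are the legacy VGA windows,
 * i.e. the 0xa0000-0xbffff memory aperture and the 0x3b0 and 0x3c0 I/O
 * port ranges, all served from the single VFIO VGA region through
 * fd_offset.
 */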

static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
{
    VFIODevice *vbasedev = &vdev->vbasedev;
    struct vfio_region_info *reg_info;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
    int i, ret = -1;

    /* Sanity check device */
    if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
        error_setg(errp, "this isn't a PCI device");
        return;
    }

    if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
        error_setg(errp, "unexpected number of io regions %u",
                   vbasedev->num_regions);
        return;
    }

    if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
        error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
        return;
    }

    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
        char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);

        ret = vfio_region_setup(OBJECT(vdev), vbasedev,
                                &vdev->bars[i].region, i, name);
        g_free(name);

        if (ret) {
            error_setg_errno(errp, -ret, "failed to get region %d info", i);
            return;
        }

        QLIST_INIT(&vdev->bars[i].quirks);
    }

    ret = vfio_get_region_info(vbasedev,
                               VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
    if (ret) {
        error_setg_errno(errp, -ret, "failed to get config info");
        return;
    }

    trace_vfio_populate_device_config(vdev->vbasedev.name,
                                      (unsigned long)reg_info->size,
                                      (unsigned long)reg_info->offset,
                                      (unsigned long)reg_info->flags);

    vdev->config_size = reg_info->size;
    if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
        vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
    }
    vdev->config_offset = reg_info->offset;

    g_free(reg_info);

    if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
        ret = vfio_populate_vga(vdev, errp);
        if (ret) {
            error_append_hint(errp, "device does not support "
                              "requested feature x-vga\n");
            return;
        }
    }

    irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
    if (ret) {
        /* This can fail for an old kernel or a legacy PCI device */
        trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
    } else if (irq_info.count == 1) {
        vdev->pci_aer = true;
    } else {
        warn_report(VFIO_MSG_PREFIX
                    "Could not enable error recovery for the device",
                    vbasedev->name);
    }
}

static void vfio_put_device(VFIOPCIDevice *vdev)
{
    g_free(vdev->vbasedev.name);
    g_free(vdev->msix);

    vfio_put_base_device(&vdev->vbasedev);
}

static void vfio_err_notifier_handler(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
        return;
    }

    /*
     * TBD. Retrieve the error details and decide what action
     * needs to be taken. One of the actions could be to pass
     * the error to the guest and have the guest driver recover
     * from the error. This requires that PCIe capabilities be
     * exposed to the guest. For now, we just terminate the
     * guest to contain the error.
     */

    error_report("%s(%s) Unrecoverable error detected. Please collect any "
                 "data possible and then kill the guest", __func__,
                 vdev->vbasedev.name);

    vm_stop(RUN_STATE_INTERNAL_ERROR);
}
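
/*
 * A minimal sketch of the VFIO_DEVICE_SET_IRQS eventfd wiring used by
 * the notifier registration below (error handling omitted): the eventfd
 * is appended to a variable-length vfio_irq_set and handed to the
 * kernel, which then signals the fd when the chosen IRQ index fires.
 *
 *     struct vfio_irq_set *s = g_malloc0(sizeof(*s) + sizeof(int32_t));
 *
 *     s->argsz = sizeof(*s) + sizeof(int32_t);
 *     s->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 *     s->index = VFIO_PCI_ERR_IRQ_INDEX;
 *     s->start = 0;
 *     s->count = 1;
 *     *(int32_t *)&s->data = event_notifier_get_fd(notifier);
 *     ioctl(device_fd, VFIO_DEVICE_SET_IRQS, s);
 *     g_free(s);
 *
 * "notifier" and "device_fd" stand in for vdev->err_notifier and
 * vdev->vbasedev.fd.
 */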

/*
 * Registers the error notifier for devices supporting error recovery.
 * If we encounter a failure in this function, we report an error and
 * continue after disabling error recovery support for the device.
 */
static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
{
    int ret;
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!vdev->pci_aer) {
        return;
    }

    if (event_notifier_init(&vdev->err_notifier, 0)) {
        error_report("vfio: Unable to init event notifier for error detection");
        vdev->pci_aer = false;
        return;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->err_notifier);
    qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    if (ret) {
        error_report("vfio: Failed to set up error notification");
        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->err_notifier);
        vdev->pci_aer = false;
    }
    g_free(irq_set);
}

static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
{
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;
    int ret;

    if (!vdev->pci_aer) {
        return;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    /* Passing an fd of -1 de-assigns the previously registered eventfd */
    *pfd = -1;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    if (ret) {
        error_report("vfio: Failed to de-assign error fd: %m");
    }
    g_free(irq_set);
    qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
                        NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->err_notifier);
}

static void vfio_req_notifier_handler(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;
    Error *err = NULL;

    if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
        return;
    }

    /* The host is requesting the device back; try to unplug it */
    qdev_unplug(DEVICE(vdev), &err);
    if (err) {
        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }
}

static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
{
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
                                      .index = VFIO_PCI_REQ_IRQ_INDEX };
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
        return;
    }

    if (ioctl(vdev->vbasedev.fd,
              VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
        return;
    }

    if (event_notifier_init(&vdev->req_notifier, 0)) {
        error_report("vfio: Unable to init event notifier for device request");
        return;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->req_notifier);
    qemu_set_fd_handler(*pfd, vfio_req_notifier_handler, NULL, vdev);

    if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
        error_report("vfio: Failed to set up device request notification");
        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->req_notifier);
    } else {
        vdev->req_enabled = true;
    }

    g_free(irq_set);
}

static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
{
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!vdev->req_enabled) {
        return;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = -1;

    if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
        error_report("vfio: Failed to de-assign device request fd: %m");
    }
    g_free(irq_set);
    qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
                        NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->req_notifier);

    vdev->req_enabled = false;
}

static void vfio_realize(PCIDevice *pdev, Error **errp)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
    VFIODevice *vbasedev_iter;
    VFIOGroup *group;
    char *tmp, *subsys, group_path[PATH_MAX], *group_name;
    Error *err = NULL;
    ssize_t len;
    struct stat st;
    int groupid;
    int i, ret;
    bool is_mdev;

    if (!vdev->vbasedev.sysfsdev) {
        if (!(~vdev->host.domain || ~vdev->host.bus ||
              ~vdev->host.slot || ~vdev->host.function)) {
            error_setg(errp, "no host device provided");
            error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
                              "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
            return;
        }
        vdev->vbasedev.sysfsdev =
            g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
                            vdev->host.domain, vdev->host.bus,
                            vdev->host.slot, vdev->host.function);
    }

    if (stat(vdev->vbasedev.sysfsdev, &st) < 0) {
        error_setg_errno(errp, errno, "no such host device");
        error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.sysfsdev);
        return;
    }

    vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev);
    vdev->vbasedev.ops = &vfio_pci_ops;
    vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
    vdev->vbasedev.dev = DEVICE(vdev);

    tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
    len = readlink(tmp, group_path, sizeof(group_path));
    g_free(tmp);
    if (len <= 0 || len >= sizeof(group_path)) {
        error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG,
                         "no iommu_group found");
        goto error;
    }

    group_path[len] = 0;

    group_name = basename(group_path);
    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_setg_errno(errp, errno, "failed to read %s", group_path);
        goto error;
    }

    trace_vfio_realize(vdev->vbasedev.name, groupid);

    group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp);
    if (!group) {
        goto error;
    }

    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
        if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
            error_setg(errp, "device is already attached");
            vfio_put_group(group);
            goto error;
        }
    }

    /*
     * Mediated devices *might* operate compatibly with memory ballooning,
     * but we cannot know for certain, it depends on whether the mdev
     * vendor driver stays in sync with the active working set of the
     * guest driver. Prevent the x-balloon-allowed option unless this is
     * minimally an mdev device.
     */
    tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev);
    subsys = realpath(tmp, NULL);
    g_free(tmp);
    is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
    free(subsys);

    trace_vfio_mdev(vdev->vbasedev.name, is_mdev);

    if (vdev->vbasedev.balloon_allowed && !is_mdev) {
        error_setg(errp, "x-balloon-allowed only potentially compatible "
                   "with mdev devices");
        vfio_put_group(group);
        goto error;
    }

    ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
    if (ret) {
        vfio_put_group(group);
        goto error;
    }

    vfio_populate_device(vdev, &err);
    if (err) {
        error_propagate(errp, err);
        goto error;
    }

    /* Get a copy of config space */
    ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
                vdev->config_offset);
    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
        ret = ret < 0 ? -errno : -EFAULT;
        error_setg_errno(errp, -ret, "failed to read device config space");
        goto error;
    }

    /* vfio emulates a lot for us, but some bits need extra love */
    vdev->emulated_config_bits = g_malloc0(vdev->config_size);

    /* QEMU can choose to expose the ROM or not */
    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
    /* QEMU can also add or extend BARs */
    memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);

    /*
     * The PCI spec reserves vendor ID 0xffff as an invalid value. The
     * device ID is managed by the vendor and need only be a 16-bit value.
     * Allow any 16-bit value for the subsystem IDs so they can be hidden
     * or changed.
     */
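    /*
     * Usage sketch (IDs hypothetical):
     *
     *   -device vfio-pci,host=01:00.0,x-pci-vendor-id=0x8086,
     *           x-pci-sub-device-id=0x1234
     *
     * Each override below then reads back from guest config space as the
     * given value, read-only.
     */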
    if (vdev->vendor_id != PCI_ANY_ID) {
        if (vdev->vendor_id >= 0xffff) {
            error_setg(errp, "invalid PCI vendor ID provided");
            goto error;
        }
        vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
        trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id);
    } else {
        vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
    }

    if (vdev->device_id != PCI_ANY_ID) {
        if (vdev->device_id > 0xffff) {
            error_setg(errp, "invalid PCI device ID provided");
            goto error;
        }
        vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
        trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id);
    } else {
        vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
    }

    if (vdev->sub_vendor_id != PCI_ANY_ID) {
        if (vdev->sub_vendor_id > 0xffff) {
            error_setg(errp, "invalid PCI subsystem vendor ID provided");
            goto error;
        }
        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
                               vdev->sub_vendor_id, ~0);
        trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name,
                                              vdev->sub_vendor_id);
    }

    if (vdev->sub_device_id != PCI_ANY_ID) {
        if (vdev->sub_device_id > 0xffff) {
            error_setg(errp, "invalid PCI subsystem device ID provided");
            goto error;
        }
        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
        trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name,
                                              vdev->sub_device_id);
    }

    /* QEMU can change multi-function devices to single function, or reverse */
    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
                                              PCI_HEADER_TYPE_MULTI_FUNCTION;

    /* Restore or clear multifunction, this is always controlled by QEMU */
    if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
        vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
    } else {
        vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
    }

    /*
     * Clear host resource mapping info. If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); /* six BARs */
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);

    vfio_pci_size_rom(vdev);

    vfio_bars_prepare(vdev);

    vfio_msix_early_setup(vdev, &err);
    if (err) {
        error_propagate(errp, err);
        goto error;
    }

    vfio_bars_register(vdev);

    ret = vfio_add_capabilities(vdev, errp);
    if (ret) {
        goto out_teardown;
    }

    if (vdev->vga) {
        vfio_vga_quirk_setup(vdev);
    }

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_bar_quirk_setup(vdev, i);
    }

    if (!vdev->igd_opregion &&
        vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
        struct vfio_region_info *opregion;

        if (vdev->pdev.qdev.hotplugged) {
            error_setg(errp,
                       "cannot support IGD OpRegion feature on hotplugged "
                       "device");
            goto out_teardown;
        }

        ret = vfio_get_dev_region_info(&vdev->vbasedev,
                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "does not support requested IGD OpRegion feature");
            goto out_teardown;
        }

        ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
        g_free(opregion);
        if (ret) {
            goto out_teardown;
        }
    }

    /* QEMU emulates all of MSI & MSI-X */
    if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
        memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
               MSIX_CAP_LENGTH);
    }

    if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
        memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
               vdev->msi_cap_size);
    }

    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
        vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
                                             vfio_intx_mmap_enable, vdev);
        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update);
        ret = vfio_intx_enable(vdev, errp);
        if (ret) {
            goto out_teardown;
        }
    }

    if (vdev->display != ON_OFF_AUTO_OFF) {
        ret = vfio_display_probe(vdev, errp);
        if (ret) {
            goto out_teardown;
        }
    }
    if (vdev->enable_ramfb && vdev->dpy == NULL) {
        error_setg(errp, "ramfb=on requires display=on");
        goto out_teardown;
    }
    if (vdev->display_xres || vdev->display_yres) {
        if (vdev->dpy == NULL) {
            error_setg(errp, "xres and yres properties require display=on");
            goto out_teardown;
        }
        if (vdev->dpy->edid_regs == NULL) {
            error_setg(errp, "xres and yres properties need EDID support");
            goto out_teardown;
        }
    }

    if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) {
        ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
        if (ret && ret != -ENODEV) {
            error_report("Failed to setup NVIDIA V100 GPU RAM");
        }
    }

    if (vdev->vendor_id == PCI_VENDOR_ID_IBM) {
        ret = vfio_pci_nvlink2_init(vdev, errp);
        if (ret && ret != -ENODEV) {
            error_report("Failed to setup NVlink2 bridge");
        }
    }

    vfio_register_err_notifier(vdev);
    vfio_register_req_notifier(vdev);
    vfio_setup_resetfn_quirk(vdev);

    return;

out_teardown:
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_teardown_msi(vdev);
    vfio_bars_exit(vdev);
error:
    error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
}
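
/*
 * For reference, a typical invocation that reaches vfio_realize(),
 * assuming a host device at 0000:01:00.0 already bound to the vfio-pci
 * host driver:
 *
 *     -device vfio-pci,host=0000:01:00.0
 *
 * or, equivalently:
 *
 *     -device vfio-pci,sysfsdev=/sys/bus/pci/devices/0000:01:00.0
 */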

static void vfio_instance_finalize(Object *obj)
{
    VFIOPCIDevice *vdev = PCI_VFIO(obj);
    VFIOGroup *group = vdev->vbasedev.group;

    vfio_display_finalize(vdev);
    vfio_bars_finalize(vdev);
    g_free(vdev->emulated_config_bits);
    g_free(vdev->rom);
    /*
     * XXX Leaking igd_opregion is not an oversight; we can't remove the
     * fw_cfg entry, therefore leaking this allocation seems like the
     * safest option.
     *
     * g_free(vdev->igd_opregion);
     */
    vfio_put_device(vdev);
    vfio_put_group(group);
}

static void vfio_exitfn(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = PCI_VFIO(pdev);

    vfio_unregister_req_notifier(vdev);
    vfio_unregister_err_notifier(vdev);
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_disable_interrupts(vdev);
    if (vdev->intx.mmap_timer) {
        timer_free(vdev->intx.mmap_timer);
    }
    vfio_teardown_msi(vdev);
    vfio_bars_exit(vdev);
}

static void vfio_pci_reset(DeviceState *dev)
{
    VFIOPCIDevice *vdev = PCI_VFIO(dev);

    trace_vfio_pci_reset(vdev->vbasedev.name);

    vfio_pci_pre_reset(vdev);

    if (vdev->display != ON_OFF_AUTO_OFF) {
        vfio_display_reset(vdev);
    }

    if (vdev->resetfn && !vdev->resetfn(vdev)) {
        goto post_reset;
    }

    if (vdev->vbasedev.reset_works &&
        (vdev->has_flr || !vdev->has_pm_reset) &&
        !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
        trace_vfio_pci_reset_flr(vdev->vbasedev.name);
        goto post_reset;
    }

    /* See if we can do our own bus reset */
    if (!vfio_pci_hot_reset_one(vdev)) {
        goto post_reset;
    }

    /* If nothing else works and the device supports PM reset, use it */
    if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
        !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
        trace_vfio_pci_reset_pm(vdev->vbasedev.name);
        goto post_reset;
    }

post_reset:
    vfio_pci_post_reset(vdev);
}

static void vfio_instance_init(Object *obj)
{
    PCIDevice *pci_dev = PCI_DEVICE(obj);
    VFIOPCIDevice *vdev = PCI_VFIO(obj);

    device_add_bootindex_property(obj, &vdev->bootindex,
                                  "bootindex", NULL,
                                  &pci_dev->qdev, NULL);
    vdev->host.domain = ~0U;
    vdev->host.bus = ~0U;
    vdev->host.slot = ~0U;
    vdev->host.function = ~0U;

    vdev->nv_gpudirect_clique = 0xFF;

    /*
     * QEMU_PCI_CAP_EXPRESS initialization does not depend on the QEMU
     * command line; therefore, there is no need to wait until realize
     * as other devices do.
     */
    pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
}

static Property vfio_pci_dev_properties[] = {
    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
    DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
    DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
                            display, ON_OFF_AUTO_OFF),
    DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
    DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
                       intx.mmap_timeout, 1100),
    DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
    DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_REQ_BIT, true),
    DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
    DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
    DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
                     vbasedev.balloon_allowed, false),
    DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
    DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
    DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
    DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
                     no_geforce_quirks, false),
    DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
                     false),
    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
                     false),
    DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
                       sub_vendor_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                       sub_device_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
    DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
                                   nv_gpudirect_clique,
                                   qdev_prop_nv_gpudirect_clique, uint8_t),
    DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
                                OFF_AUTOPCIBAR_OFF),
    /*
     * TODO - support passed fds... is this necessary?
     * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
     * DEFINE_PROP_STRING("vfiogroupfd", VFIOPCIDevice, vfiogroupfd_name),
     */
    DEFINE_PROP_END_OF_LIST(),
};

static const VMStateDescription vfio_pci_vmstate = {
    .name = "vfio-pci",
    .unmigratable = 1,
};

static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);

    dc->reset = vfio_pci_reset;
    dc->props = vfio_pci_dev_properties;
    dc->vmsd = &vfio_pci_vmstate;
    dc->desc = "VFIO-based PCI device assignment";
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    pdc->realize = vfio_realize;
    pdc->exit = vfio_exitfn;
    pdc->config_read = vfio_pci_read_config;
    pdc->config_write = vfio_pci_write_config;
}

static const TypeInfo vfio_pci_dev_info = {
    .name = TYPE_VFIO_PCI,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(VFIOPCIDevice),
    .class_init = vfio_pci_dev_class_init,
    .instance_init = vfio_instance_init,
    .instance_finalize = vfio_instance_finalize,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_PCIE_DEVICE },
        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
        { }
    },
};

static Property vfio_pci_dev_nohotplug_properties[] = {
    DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
    DEFINE_PROP_END_OF_LIST(),
};

static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);

    dc->props = vfio_pci_dev_nohotplug_properties;
    dc->hotpluggable = false;
}

static const TypeInfo vfio_pci_nohotplug_dev_info = {
    .name = TYPE_VIFO_PCI_NOHOTPLUG,
    .parent = TYPE_VFIO_PCI,
    .instance_size = sizeof(VFIOPCIDevice),
    .class_init = vfio_pci_nohotplug_dev_class_init,
};

static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
    type_register_static(&vfio_pci_nohotplug_dev_info);
}

type_init(register_vfio_pci_dev_type)
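
/*
 * Host-side setup sketch (not QEMU code; the device address is
 * hypothetical): before the types above can claim a device, it must be
 * bound to the vfio-pci host driver, e.g.:
 *
 *   echo vfio-pci > /sys/bus/pci/devices/0000:01:00.0/driver_override
 *   echo 0000:01:00.0 > /sys/bus/pci/devices/0000:01:00.0/driver/unbind
 *   echo 0000:01:00.0 > /sys/bus/pci/drivers_probe
 */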