/*
 * device quirks for PCI devices
 *
 * Copyright Red Hat, Inc. 2012-2015
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "exec/memop.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/range.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include <sys/ioctl.h>
#include "hw/hw.h"
#include "hw/nvram/fw_cfg.h"
#include "hw/qdev-properties.h"
#include "pci.h"
#include "trace.h"

/*
 * List of device ids/vendor ids for which to disable
 * option rom loading. This avoids guest hangs during rom
 * execution, as noticed with the BCM 57810 card, for lack of a
 * better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang. When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and information that could
 * help in debugging to the bug tracking this issue.
 */
static const struct {
    uint32_t vendor;
    uint32_t device;
} romblacklist[] = {
    { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
};

bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(romblacklist); i++) {
        if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) {
            trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name,
                                             romblacklist[i].vendor,
                                             romblacklist[i].device);
            return true;
        }
    }
    return false;
}

/*
 * Device specific region quirks (mostly backdoors to PCI config space)
 */

/*
 * The generic window quirks operate on an address and data register,
 * vfio_generic_window_address_quirk handles the address register and
 * vfio_generic_window_data_quirk handles the data register. These ops
 * pass reads and writes through to hardware until a value matching the
 * stored address match/mask is written. When this occurs, the data
 * register accesses emulated PCI config space for the device rather than
 * passing through accesses. This enables devices where PCI config space
 * is accessible behind a window register to maintain the virtualization
 * provided through vfio.
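 *
 * Illustrative flow (a sketch; the register offsets, match values and
 * masks are device specific, see the probe functions below): with an
 * ATI-style window using match 0x4000, a config-size mask and a data
 * register at offset 4, a guest write of 0x4010 to the address register
 * arms the window, so the next access to the data register is serviced
 * from emulated config space offset 0x10 instead of being forwarded to
 * hardware. Any non-matching address write disarms the window again.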
 */
typedef struct VFIOConfigWindowMatch {
    uint32_t match;
    uint32_t mask;
} VFIOConfigWindowMatch;

typedef struct VFIOConfigWindowQuirk {
    struct VFIOPCIDevice *vdev;

    uint32_t address_val;

    uint32_t address_offset;
    uint32_t data_offset;

    bool window_enabled;
    uint8_t bar;

    MemoryRegion *addr_mem;
    MemoryRegion *data_mem;

    uint32_t nr_matches;
    VFIOConfigWindowMatch matches[];
} VFIOConfigWindowQuirk;

static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
                                                       hwaddr addr,
                                                       unsigned size)
{
    VFIOConfigWindowQuirk *window = opaque;
    VFIOPCIDevice *vdev = window->vdev;

    return vfio_region_read(&vdev->bars[window->bar].region,
                            addr + window->address_offset, size);
}

static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
                                                    uint64_t data,
                                                    unsigned size)
{
    VFIOConfigWindowQuirk *window = opaque;
    VFIOPCIDevice *vdev = window->vdev;
    int i;

    window->window_enabled = false;

    vfio_region_write(&vdev->bars[window->bar].region,
                      addr + window->address_offset, data, size);

    for (i = 0; i < window->nr_matches; i++) {
        if ((data & ~window->matches[i].mask) == window->matches[i].match) {
            window->window_enabled = true;
            window->address_val = data & window->matches[i].mask;
            trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
                                    memory_region_name(window->addr_mem), data);
            break;
        }
    }
}

static const MemoryRegionOps vfio_generic_window_address_quirk = {
    .read = vfio_generic_window_quirk_address_read,
    .write = vfio_generic_window_quirk_address_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
                                                    hwaddr addr, unsigned size)
{
    VFIOConfigWindowQuirk *window = opaque;
    VFIOPCIDevice *vdev = window->vdev;
    uint64_t data;

    /* Always read data reg, discard if window enabled */
    data = vfio_region_read(&vdev->bars[window->bar].region,
                            addr + window->data_offset, size);

    if (window->window_enabled) {
        data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
        trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
                                    memory_region_name(window->data_mem), data);
    }

    return data;
}

static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
                                                 uint64_t data, unsigned size)
{
    VFIOConfigWindowQuirk *window = opaque;
    VFIOPCIDevice *vdev = window->vdev;

    if (window->window_enabled) {
        vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
        trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
                                    memory_region_name(window->data_mem), data);
        return;
    }

    vfio_region_write(&vdev->bars[window->bar].region,
                      addr + window->data_offset, data, size);
}

static const MemoryRegionOps vfio_generic_window_data_quirk = {
    .read = vfio_generic_window_quirk_data_read,
    .write = vfio_generic_window_quirk_data_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * The generic mirror quirk handles devices which expose PCI config space
 * through a region within a BAR. When enabled, reads and writes are
 * redirected through to emulated PCI config space. XXX if PCI config space
 * used memory regions, this could just be an alias.
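 *
 * For example (illustrative, using the ATI BAR2 quirk below with its
 * mirror offset of 0x4000): a 4-byte guest read at BAR2 + 0x4000 returns
 * the emulated vendor/device ID from config offset 0x0 rather than
 * whatever the hardware would have returned at that BAR offset.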
 */
typedef struct VFIOConfigMirrorQuirk {
    struct VFIOPCIDevice *vdev;
    uint32_t offset;
    uint8_t bar;
    MemoryRegion *mem;
    uint8_t data[];
} VFIOConfigMirrorQuirk;

static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
                                               hwaddr addr, unsigned size)
{
    VFIOConfigMirrorQuirk *mirror = opaque;
    VFIOPCIDevice *vdev = mirror->vdev;
    uint64_t data;

    /* Read and discard in case the hardware cares */
    (void)vfio_region_read(&vdev->bars[mirror->bar].region,
                           addr + mirror->offset, size);

    data = vfio_pci_read_config(&vdev->pdev, addr, size);
    trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
                                         memory_region_name(mirror->mem),
                                         addr, data);
    return data;
}

static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
                                            uint64_t data, unsigned size)
{
    VFIOConfigMirrorQuirk *mirror = opaque;
    VFIOPCIDevice *vdev = mirror->vdev;

    vfio_pci_write_config(&vdev->pdev, addr, data, size);
    trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
                                          memory_region_name(mirror->mem),
                                          addr, data);
}

static const MemoryRegionOps vfio_generic_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_generic_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/* Is range1 fully contained within range2? */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2)
{
    return (first1 >= first2 && first1 + len1 <= first2 + len2);
}

#define PCI_VENDOR_ID_ATI 0x1002

/*
 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always
 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the
 * I/O port BAR address. Originally this was coded to return the virtual BAR
 * address only if the physical register read returns the actual BAR address,
 * but users have reported greater success if we return the virtual address
 * unconditionally.
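 *
 * Illustrative example (hypothetical BAR value): if the guest programs the
 * emulated I/O BAR4 to 0xc000, a guest read of VGA register 0x3c3 through
 * this quirk returns 0xc0, i.e. byte 1 of the virtual BAR4 value.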
 */
static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOPCIDevice *vdev = opaque;
    uint64_t data = vfio_pci_read_config(&vdev->pdev,
                                         PCI_BASE_ADDRESS_4 + 1, size);

    trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);

    return data;
}

static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

VFIOQuirk *vfio_quirk_alloc(int nr_mem)
{
    VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
    QLIST_INIT(&quirk->ioeventfds);
    quirk->mem = g_new0(MemoryRegion, nr_mem);
    quirk->nr_mem = nr_mem;

    return quirk;
}

static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
{
    QLIST_REMOVE(ioeventfd, next);
    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);

    if (ioeventfd->vfio) {
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        vfio_ioeventfd.fd = -1;

        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
            error_report("Failed to remove vfio ioeventfd for %s+0x%"
                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
                         ioeventfd->size, ioeventfd->data);
        }
    } else {
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            NULL, NULL, NULL);
    }

    event_notifier_cleanup(&ioeventfd->e);
    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
                              (uint64_t)ioeventfd->addr, ioeventfd->size,
                              ioeventfd->data);
    g_free(ioeventfd);
}

static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
{
    VFIOIOEventFD *ioeventfd, *tmp;

    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
        if (ioeventfd->dynamic) {
            vfio_ioeventfd_exit(vdev, ioeventfd);
        }
    }
}

static void vfio_ioeventfd_handler(void *opaque)
{
    VFIOIOEventFD *ioeventfd = opaque;

    if (event_notifier_test_and_clear(&ioeventfd->e)) {
        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
                          ioeventfd->data, ioeventfd->size);
        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
                                     (uint64_t)ioeventfd->addr,
                                     ioeventfd->size, ioeventfd->data);
    }
}

static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
                                          MemoryRegion *mr, hwaddr addr,
                                          unsigned size, uint64_t data,
                                          VFIORegion *region,
                                          hwaddr region_addr, bool dynamic)
{
    VFIOIOEventFD *ioeventfd;

    if (vdev->no_kvm_ioeventfd) {
        return NULL;
    }

    ioeventfd = g_malloc0(sizeof(*ioeventfd));

    if (event_notifier_init(&ioeventfd->e, 0)) {
        g_free(ioeventfd);
        return NULL;
    }

    /*
     * MemoryRegion and relative offset, plus additional ioeventfd setup
     * parameters for configuring and later tearing down KVM ioeventfd.
     */
    ioeventfd->mr = mr;
    ioeventfd->addr = addr;
    ioeventfd->size = size;
    ioeventfd->data = data;
    ioeventfd->dynamic = dynamic;
    /*
     * VFIORegion and relative offset for implementing the userspace
     * handler. data & size fields shared for both uses.
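     *
     * For example (illustrative): the NVIDIA BAR0 mirror quirk below passes
     * its mirror MemoryRegion and mirror-relative addr as mr/addr, and the
     * underlying BAR region with mirror->offset + addr as region/region_addr,
     * so the userspace fallback in vfio_ioeventfd_handler() writes to the
     * same hardware offset the guest originally targeted.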
     */
    ioeventfd->region = region;
    ioeventfd->region_addr = region_addr;

    if (!vdev->no_vfio_ioeventfd) {
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);

        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
    }

    if (!ioeventfd->vfio) {
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            vfio_ioeventfd_handler, NULL, ioeventfd);
    }

    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);
    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
                              size, data, ioeventfd->vfio);

    return ioeventfd;
}

static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;

    /*
     * As long as the BAR is >= 256 bytes it will be aligned such that the
     * lower byte is always zero. Filter out anything else, if it exists.
     */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
        return;
    }

    quirk = vfio_quirk_alloc(1);

    memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
                          "vfio-ati-3c3-quirk", 1);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                3 /* offset 3 bytes from 0x3c0 */, quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
}

/*
 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
 * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access
 * the MMIO space directly, but a window to this space is provided through
 * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the
 * data register. When the address is programmed to a range of 0x4000-0x4fff,
 * PCI configuration space is available. Experimentation seems to indicate
 * that read-only may be provided by hardware.
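 *
 * Concretely (a sketch of the guest-visible sequence trapped below): the
 * guest writes 0x4000 + <config offset> to BAR4 offset 0x0, then reads or
 * writes BAR4 offset 0x4; with the window armed, that data access is
 * serviced from emulated config space at <config offset>.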
 */
static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigWindowQuirk *window;

    /* This window doesn't seem to be used except by legacy VGA code */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->vga || nr != 4) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    window = quirk->data = g_malloc0(sizeof(*window) +
                                     sizeof(VFIOConfigWindowMatch));
    window->vdev = vdev;
    window->address_offset = 0;
    window->data_offset = 4;
    window->nr_matches = 1;
    window->matches[0].match = 0x4000;
    window->matches[0].mask = vdev->config_size - 1;
    window->bar = nr;
    window->addr_mem = &quirk->mem[0];
    window->data_mem = &quirk->mem[1];

    memory_region_init_io(window->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, window,
                          "vfio-ati-bar4-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->address_offset,
                                        window->addr_mem, 1);

    memory_region_init_io(window->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, window,
                          "vfio-ati-bar4-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->data_offset,
                                        window->data_mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
}

/*
 * Trap the BAR2 MMIO mirror to config space as well.
 */
static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;

    /* Only enable on newer devices where BAR2 is 64bit */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
        return;
    }

    quirk = vfio_quirk_alloc(1);
    mirror = quirk->data = g_malloc0(sizeof(*mirror));
    mirror->mem = quirk->mem;
    mirror->vdev = vdev;
    mirror->offset = 0x4000;
    mirror->bar = nr;

    memory_region_init_io(mirror->mem, OBJECT(vdev),
                          &vfio_generic_mirror_quirk, mirror,
                          "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        mirror->offset, mirror->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
}

/*
 * Older ATI/AMD cards like the X550 have a similar window to that above.
 * I/O port BAR1 provides a window to a mirror of PCI config space located
 * in BAR2 at offset 0xf00. We don't care to support such older cards, but
 * note it for future reference.
 */

/*
 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
 * https://github.com/pathscale/envytools/tree/master/hwdocs
 *
 * The first quirk is actually not documented in envytools and is found
 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an
 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access
 * the mirror of PCI config space found at BAR0 offset 0x1800. The access
 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is
 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738
 * is written for a write to 0x3d4. The BAR0 offset is then accessible
 * through 0x3d0. This quirk doesn't seem to be necessary on newer cards
 * that use the I/O port BAR5 window, but it doesn't hurt to leave it.
 */
typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
static const char *nv3d0_states[] = { "NONE", "SELECT",
                                      "WINDOW", "READ", "WRITE" };

typedef struct VFIONvidia3d0Quirk {
    VFIOPCIDevice *vdev;
    VFIONvidia3d0State state;
    uint32_t offset;
} VFIONvidia3d0Quirk;

static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    quirk->state = NONE;

    return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                         addr + 0x14, size);
}

static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;

    quirk->state = NONE;

    switch (data) {
    case 0x338:
        if (old_state == NONE) {
            quirk->state = SELECT;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    case 0x538:
        if (old_state == WINDOW) {
            quirk->state = READ;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    case 0x738:
        if (old_state == WINDOW) {
            quirk->state = WRITE;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    }

    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                   addr + 0x14, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
    .read = vfio_nvidia_3d4_quirk_read,
    .write = vfio_nvidia_3d4_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;
    uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                                  addr + 0x10, size);

    quirk->state = NONE;

    if (old_state == READ &&
        (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
        uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);

        data = vfio_pci_read_config(&vdev->pdev, offset, size);
        trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
                                         offset, size, data);
    }

    return data;
}

static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;

    quirk->state = NONE;

    if (old_state == SELECT) {
        quirk->offset = (uint32_t)data;
        quirk->state = WINDOW;
        trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                          nv3d0_states[quirk->state]);
    } else if (old_state == WRITE) {
        if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
            uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);

            vfio_pci_write_config(&vdev->pdev, offset, data, size);
            trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
                                              offset, data, size);
            return;
        }
    }

    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                   addr + 0x10, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;
    VFIONvidia3d0Quirk *data;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vdev->bars[1].region.size) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    quirk->data = data = g_malloc0(sizeof(*data));
    data->vdev = vdev;

    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
                          data, "vfio-nvidia-3d4-quirk", 2);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);

    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
                          data, "vfio-nvidia-3d0-quirk", 2);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);

    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
}

/*
 * The second quirk is documented in envytools. The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs. The BAR we care about is
 * again BAR0. This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
 * space, including extended space, which is available at the 4k @0x88000.
 */
typedef struct VFIONvidiaBAR5Quirk {
    uint32_t master;
    uint32_t enable;
    MemoryRegion *addr_mem;
    MemoryRegion *data_mem;
    bool enabled;
    VFIOConfigWindowQuirk window; /* last for match data */
} VFIONvidiaBAR5Quirk;

static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
{
    VFIOPCIDevice *vdev = bar5->window.vdev;

    if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
        return;
    }

    bar5->enabled = !bar5->enabled;
    trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
                                       bar5->enabled ? "Enable" : "Disable");
"Enable" : "Disable"); 714 memory_region_set_enabled(bar5->addr_mem, bar5->enabled); 715 memory_region_set_enabled(bar5->data_mem, bar5->enabled); 716 } 717 718 static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque, 719 hwaddr addr, unsigned size) 720 { 721 VFIONvidiaBAR5Quirk *bar5 = opaque; 722 VFIOPCIDevice *vdev = bar5->window.vdev; 723 724 return vfio_region_read(&vdev->bars[5].region, addr, size); 725 } 726 727 static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr, 728 uint64_t data, unsigned size) 729 { 730 VFIONvidiaBAR5Quirk *bar5 = opaque; 731 VFIOPCIDevice *vdev = bar5->window.vdev; 732 733 vfio_region_write(&vdev->bars[5].region, addr, data, size); 734 735 bar5->master = data; 736 vfio_nvidia_bar5_enable(bar5); 737 } 738 739 static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = { 740 .read = vfio_nvidia_bar5_quirk_master_read, 741 .write = vfio_nvidia_bar5_quirk_master_write, 742 .endianness = DEVICE_LITTLE_ENDIAN, 743 }; 744 745 static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque, 746 hwaddr addr, unsigned size) 747 { 748 VFIONvidiaBAR5Quirk *bar5 = opaque; 749 VFIOPCIDevice *vdev = bar5->window.vdev; 750 751 return vfio_region_read(&vdev->bars[5].region, addr + 4, size); 752 } 753 754 static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr, 755 uint64_t data, unsigned size) 756 { 757 VFIONvidiaBAR5Quirk *bar5 = opaque; 758 VFIOPCIDevice *vdev = bar5->window.vdev; 759 760 vfio_region_write(&vdev->bars[5].region, addr + 4, data, size); 761 762 bar5->enable = data; 763 vfio_nvidia_bar5_enable(bar5); 764 } 765 766 static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = { 767 .read = vfio_nvidia_bar5_quirk_enable_read, 768 .write = vfio_nvidia_bar5_quirk_enable_write, 769 .endianness = DEVICE_LITTLE_ENDIAN, 770 }; 771 772 static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) 773 { 774 VFIOQuirk *quirk; 775 VFIONvidiaBAR5Quirk *bar5; 776 VFIOConfigWindowQuirk *window; 777 778 if (vdev->no_geforce_quirks || 779 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 780 !vdev->vga || nr != 5 || !vdev->bars[5].ioport) { 781 return; 782 } 783 784 quirk = vfio_quirk_alloc(4); 785 bar5 = quirk->data = g_malloc0(sizeof(*bar5) + 786 (sizeof(VFIOConfigWindowMatch) * 2)); 787 window = &bar5->window; 788 789 window->vdev = vdev; 790 window->address_offset = 0x8; 791 window->data_offset = 0xc; 792 window->nr_matches = 2; 793 window->matches[0].match = 0x1800; 794 window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1; 795 window->matches[1].match = 0x88000; 796 window->matches[1].mask = vdev->config_size - 1; 797 window->bar = nr; 798 window->addr_mem = bar5->addr_mem = &quirk->mem[0]; 799 window->data_mem = bar5->data_mem = &quirk->mem[1]; 800 801 memory_region_init_io(window->addr_mem, OBJECT(vdev), 802 &vfio_generic_window_address_quirk, window, 803 "vfio-nvidia-bar5-window-address-quirk", 4); 804 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 805 window->address_offset, 806 window->addr_mem, 1); 807 memory_region_set_enabled(window->addr_mem, false); 808 809 memory_region_init_io(window->data_mem, OBJECT(vdev), 810 &vfio_generic_window_data_quirk, window, 811 "vfio-nvidia-bar5-window-data-quirk", 4); 812 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 813 window->data_offset, 814 window->data_mem, 1); 815 memory_region_set_enabled(window->data_mem, false); 816 817 memory_region_init_io(&quirk->mem[2], OBJECT(vdev), 818 &vfio_nvidia_bar5_quirk_master, bar5, 819 
"vfio-nvidia-bar5-master-quirk", 4); 820 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 821 0, &quirk->mem[2], 1); 822 823 memory_region_init_io(&quirk->mem[3], OBJECT(vdev), 824 &vfio_nvidia_bar5_quirk_enable, bar5, 825 "vfio-nvidia-bar5-enable-quirk", 4); 826 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 827 4, &quirk->mem[3], 1); 828 829 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 830 831 trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name); 832 } 833 834 typedef struct LastDataSet { 835 VFIOQuirk *quirk; 836 hwaddr addr; 837 uint64_t data; 838 unsigned size; 839 int hits; 840 int added; 841 } LastDataSet; 842 843 #define MAX_DYN_IOEVENTFD 10 844 #define HITS_FOR_IOEVENTFD 10 845 846 /* 847 * Finally, BAR0 itself. We want to redirect any accesses to either 848 * 0x1800 or 0x88000 through the PCI config space access functions. 849 */ 850 static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, 851 uint64_t data, unsigned size) 852 { 853 VFIOConfigMirrorQuirk *mirror = opaque; 854 VFIOPCIDevice *vdev = mirror->vdev; 855 PCIDevice *pdev = &vdev->pdev; 856 LastDataSet *last = (LastDataSet *)&mirror->data; 857 858 vfio_generic_quirk_mirror_write(opaque, addr, data, size); 859 860 /* 861 * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the 862 * MSI capability ID register. Both the ID and next register are 863 * read-only, so we allow writes covering either of those to real hw. 864 */ 865 if ((pdev->cap_present & QEMU_PCI_CAP_MSI) && 866 vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) { 867 vfio_region_write(&vdev->bars[mirror->bar].region, 868 addr + mirror->offset, data, size); 869 trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name); 870 } 871 872 /* 873 * Automatically add an ioeventfd to handle any repeated write with the 874 * same data and size above the standard PCI config space header. This is 875 * primarily expected to accelerate the MSI-ACK behavior, such as noted 876 * above. Current hardware/drivers should trigger an ioeventfd at config 877 * offset 0x704 (region offset 0x88704), with data 0x0, size 4. 878 * 879 * The criteria of 10 successive hits is arbitrary but reliably adds the 880 * MSI-ACK region. Note that as some writes are bypassed via the ioeventfd, 881 * the remaining ones have a greater chance of being seen successively. 882 * To avoid the pathological case of burning up all of QEMU's open file 883 * handles, arbitrarily limit this algorithm from adding no more than 10 884 * ioeventfds, print an error if we would have added an 11th, and then 885 * stop counting. 
     */
    if (!vdev->no_kvm_ioeventfd &&
        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
        if (addr != last->addr || data != last->data || size != last->size) {
            last->addr = addr;
            last->data = data;
            last->size = size;
            last->hits = 1;
        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
            if (last->added < MAX_DYN_IOEVENTFD) {
                VFIOIOEventFD *ioeventfd;
                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
                                        data, &vdev->bars[mirror->bar].region,
                                        mirror->offset + addr, true);
                if (ioeventfd) {
                    VFIOQuirk *quirk = last->quirk;

                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
                    last->added++;
                }
            } else {
                last->added++;
                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
                            "size %u", vdev->vbasedev.name, addr, data, size);
            }
        }
    }
}

static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_nvidia_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
{
    VFIOConfigMirrorQuirk *mirror = quirk->data;
    LastDataSet *last = (LastDataSet *)&mirror->data;

    last->addr = last->data = last->size = last->hits = last->added = 0;

    vfio_drop_dynamic_eventfds(vdev, quirk);
}

static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;
    LastDataSet *last;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vfio_is_vga(vdev) || nr != 0) {
        return;
    }

    quirk = vfio_quirk_alloc(1);
    quirk->reset = vfio_nvidia_bar0_quirk_reset;
    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
    mirror->mem = quirk->mem;
    mirror->vdev = vdev;
    mirror->offset = 0x88000;
    mirror->bar = nr;
    last = (LastDataSet *)&mirror->data;
    last->quirk = quirk;

    memory_region_init_io(mirror->mem, OBJECT(vdev),
                          &vfio_nvidia_mirror_quirk, mirror,
                          "vfio-nvidia-bar0-88000-mirror-quirk",
                          vdev->config_size);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        mirror->offset, mirror->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    /* The 0x1800 offset mirror only seems to get used by legacy VGA */
    if (vdev->vga) {
        quirk = vfio_quirk_alloc(1);
        quirk->reset = vfio_nvidia_bar0_quirk_reset;
        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
        mirror->mem = quirk->mem;
        mirror->vdev = vdev;
        mirror->offset = 0x1800;
        mirror->bar = nr;
        last = (LastDataSet *)&mirror->data;
        last->quirk = quirk;

        memory_region_init_io(mirror->mem, OBJECT(vdev),
                              &vfio_nvidia_mirror_quirk, mirror,
                              "vfio-nvidia-bar0-1800-mirror-quirk",
                              PCI_CONFIG_SPACE_SIZE);
        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                            mirror->offset, mirror->mem, 1);

        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
    }

    trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
}

/*
 * TODO - Some Nvidia devices provide config access to their companion HDA
 * device and even to their parent bridge via these config space mirrors.
 * Add quirks for those regions.
 */

#define PCI_VENDOR_ID_REALTEK 0x10ec

/*
 * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2
 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
 * register. According to the Linux r8169 driver, the MSI-X table is addressed
 * when the "type" portion of the address register is set to 0x1. This appears
 * to be bits 16:30. Bit 31 is both a write indicator and some sort of
 * "address latched" indicator. Bits 12:15 are a mask field, which we can
 * ignore because the MSI-X table should always be accessed as a dword (full
 * mask). Bits 0:11 are the offset within the type.
 *
 * Example trace:
 *
 * Read from MSI-X table offset 0
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
 *
 * Write 0xfee00000 to MSI-X table offset 0
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
 */
typedef struct VFIOrtl8168Quirk {
    VFIOPCIDevice *vdev;
    uint32_t addr;
    uint32_t data;
    bool enabled;
} VFIOrtl8168Quirk;

static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
                                                hwaddr addr, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;
    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);

    if (rtl->enabled) {
        data = rtl->addr ^ 0x80000000U; /* latch/complete */
        trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
    }

    return data;
}

static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
                                             uint64_t data, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;

    rtl->enabled = false;

    if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
        rtl->enabled = true;
        rtl->addr = (uint32_t)data;

        if (data & 0x80000000U) { /* Do write */
            if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
                hwaddr offset = data & 0xfff;
                uint64_t val = rtl->data;

                trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
                                                    (uint16_t)offset, val);

                /* Write to the proper guest MSI-X table instead */
                memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
                                             offset, val,
                                             size_memop(size) | MO_LE,
                                             MEMTXATTRS_UNSPECIFIED);
            }
            return; /* Do not write guest MSI-X data to hardware */
        }
    }

    vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
}

static const MemoryRegionOps vfio_rtl_address_quirk = {
    .read = vfio_rtl8168_quirk_address_read,
    .write = vfio_rtl8168_quirk_address_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
                                             hwaddr addr, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;
    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);

    if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
        hwaddr offset = rtl->addr & 0xfff;
        memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
                                    &data, size_memop(size) | MO_LE,
                                    MEMTXATTRS_UNSPECIFIED);
        trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
    }

    return data;
}

static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
                                          uint64_t data, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;

    rtl->data = (uint32_t)data;

    vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
}

static const MemoryRegionOps vfio_rtl_data_quirk = {
    .read = vfio_rtl8168_quirk_data_read,
    .write = vfio_rtl8168_quirk_data_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOrtl8168Quirk *rtl;

    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    quirk->data = rtl = g_malloc0(sizeof(*rtl));
    rtl->vdev = vdev;

    memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                          &vfio_rtl_address_quirk, rtl,
                          "vfio-rtl8168-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0x74, &quirk->mem[0], 1);

    memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
                          &vfio_rtl_data_quirk, rtl,
                          "vfio-rtl8168-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0x70, &quirk->mem[1], 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
}

#define IGD_ASLS 0xfc /* ASL Storage Register */

/*
 * The OpRegion includes the Video BIOS Table, which seems important for
 * telling the driver what sort of outputs it has. Without this, the device
 * may work in the guest, but we may not get output. This also requires BIOS
 * support to reserve and populate a section of guest memory sufficient for
 * the table and to write the base address of that memory to the ASLS register
 * of the IGD device.
 */
int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
                               struct vfio_region_info *info, Error **errp)
{
    int ret;

    vdev->igd_opregion = g_malloc0(info->size);
    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
                info->size, info->offset);
    if (ret != info->size) {
        error_setg(errp, "failed to read IGD OpRegion");
        g_free(vdev->igd_opregion);
        vdev->igd_opregion = NULL;
        return -EINVAL;
    }

    /*
     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
     * allocate 32bit reserved memory for, copy these contents into, and write
     * the reserved memory base address to the device ASLS register at 0xFC.
     * Alignment of this reserved region seems flexible, but using a 4k page
     * alignment seems to work well. This interface assumes a single IGD
     * device, which may be at VM address 00:02.0 in legacy mode or another
     * address in UPT mode.
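     *
     * Illustrative guest-side flow (a sketch, not prescribed here): firmware
     * fetches the "etc/igd-opregion" fw_cfg file, copies it into reserved
     * 32-bit RAM, and writes that base address to config offset 0xFC (ASLS),
     * which the emulated config bits below expose as a guest-writable field.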
     *
     * NB, there may be future use cases discovered where the VM should have
     * direct interaction with the host OpRegion, in which case the write to
     * the ASLS register would trigger MemoryRegion setup to enable that.
     */
    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
                    vdev->igd_opregion, info->size);

    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);

    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);

    return 0;
}

/*
 * Common quirk probe entry points.
 */
void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}

void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;
    int i, j;

    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
        QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
            for (j = 0; j < quirk->nr_mem; j++) {
                memory_region_del_subregion(&vdev->vga->region[i].mem,
                                            &quirk->mem[j]);
            }
        }
    }
}

void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
{
    int i, j;

    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
        while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
            VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
            QLIST_REMOVE(quirk, next);
            for (j = 0; j < quirk->nr_mem; j++) {
                object_unparent(OBJECT(&quirk->mem[j]));
            }
            g_free(quirk->mem);
            g_free(quirk->data);
            g_free(quirk);
        }
    }
}

void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
{
    vfio_probe_ati_bar4_quirk(vdev, nr);
    vfio_probe_ati_bar2_quirk(vdev, nr);
    vfio_probe_nvidia_bar5_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_quirk(vdev, nr);
    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
#ifdef CONFIG_VFIO_IGD
    vfio_probe_igd_bar4_quirk(vdev, nr);
#endif
}

void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    VFIOQuirk *quirk;
    int i;

    QLIST_FOREACH(quirk, &bar->quirks, next) {
        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
        }

        for (i = 0; i < quirk->nr_mem; i++) {
            memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
        }
    }
}

void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    int i;

    while (!QLIST_EMPTY(&bar->quirks)) {
        VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
        QLIST_REMOVE(quirk, next);
        for (i = 0; i < quirk->nr_mem; i++) {
            object_unparent(OBJECT(&quirk->mem[i]));
        }
        g_free(quirk->mem);
        g_free(quirk->data);
        g_free(quirk);
    }
}

/*
 * Reset quirks
 */
void vfio_quirk_reset(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOQuirk *quirk;
        VFIOBAR *bar = &vdev->bars[i];

        QLIST_FOREACH(quirk, &bar->quirks, next) {
            if (quirk->reset) {
                quirk->reset(vdev, quirk);
            }
        }
    }
}

/*
 * AMD Radeon PCI config reset, based on Linux:
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
 *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
 * IDs: include/drm/drm_pciids.h
 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
 *
 * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the
 * hardware that should be fixed on future ASICs. The symptom of this is that
 * once the accelerated driver loads, Windows guests will BSOD on subsequent
 * attempts to load the driver, such as after VM reset or shutdown/restart. To
 * work around this, we do an AMD specific PCI config reset, followed by an SMC
 * reset. The PCI config reset only works if SMC firmware is running, so we
 * have a dependency on the state of the device as to whether this reset will
 * be effective. There are still cases where we won't be able to kick the
 * device into working, but this greatly improves the usability overall. The
 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
 * poking is largely ASIC specific.
 */
static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
{
    uint32_t clk, pc_c;

    /*
     * Registers 200h and 204h are index and data registers for accessing
     * indirect configuration registers within the device.
     */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);

    return (!(clk & 1) && (0x20100 <= pc_c));
}

/*
 * The scope of a config reset is controlled by a mode bit in the misc register
 * and a fuse, exposed as a bit in another register. The fuse is the default
 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
 * the fuse. A truth table therefore tells us that if misc == fuse, we need
 * to flip the value of the bit in the misc register.
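 *
 * Worked out from scope = !(misc ^ fuse), with scope encoded like the fuse
 * (0 = GFX only, 1 = whole GPU):
 *
 *   misc fuse -> scope
 *    0    0      1  (whole GPU, flip misc to get GFX only)
 *    0    1      0  (GFX only)
 *    1    0      0  (GFX only)
 *    1    1      1  (whole GPU, flip misc to get GFX only)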
 */
static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
{
    uint32_t misc, fuse;
    bool a, b;

    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    b = fuse & 64;

    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    a = misc & 2;

    if (a == b) {
        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
    }
}

static int vfio_radeon_reset(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    int i, ret = 0;
    uint32_t data;

    /* Defer to a kernel implemented reset */
    if (vdev->vbasedev.reset_works) {
        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
        return -ENODEV;
    }

    /* Enable only memory BAR access */
    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);

    /* Reset only works if SMC firmware is loaded and running */
    if (!vfio_radeon_smc_is_running(vdev)) {
        ret = -EINVAL;
        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
        goto out;
    }

    /* Make sure only the GFX function is reset */
    vfio_radeon_set_gfx_only_reset(vdev);

    /* AMD PCI config reset */
    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
    usleep(100);

    /* Read back the memory size to make sure we're out of reset */
    for (i = 0; i < 100000; i++) {
        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
            goto reset_smc;
        }
        usleep(1);
    }

    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);

reset_smc:
    /* Reset SMC */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    /* Disable SMC clock */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);

out:
    /* Restore PCI command register */
    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);

    return ret;
}

void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
{
    switch (vdev->vendor_id) {
    case 0x1002:
        switch (vdev->device_id) {
        /* Bonaire */
        case 0x6649: /* Bonaire [FirePro W5100] */
        case 0x6650:
        case 0x6651:
        case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
        case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
        case 0x665d: /* Bonaire [Radeon R7 200 Series] */
        /* Hawaii */
        case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
        case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
        case 0x67A2:
        case 0x67A8:
        case 0x67A9:
        case 0x67AA:
        case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
        case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
        case 0x67B8:
        case 0x67B9:
        case 0x67BA:
        case 0x67BE:
            vdev->resetfn = vfio_radeon_reset;
            trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
            break;
        }
        break;
    }
}

/*
 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
 * devices as a member of a clique. Devices within the same clique ID
 * are capable of direct P2P. It's the user's responsibility that this
 * is correct. The spec says that this may reside at any unused config
 * offset, but reserves and recommends hypervisors place this at C8h.
 * The spec also states that the hypervisor should place this capability
 * at the end of the capability list, thus next is defined as 0h.
 *
 * +----------------+----------------+----------------+----------------+
 * | sig 7:0 ('P')  | vndr len (8h)  |    next (0h)   |   cap id (9h)  |
 * +----------------+----------------+----------------+----------------+
 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
 * +---------------------------------+---------------------------------+
 *
 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
 */
static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
                                       Error **errp)
{
    DeviceState *dev = DEVICE(obj);
    Property *prop = opaque;
    uint8_t *ptr = qdev_get_prop_ptr(dev, prop);

    visit_type_uint8(v, name, ptr, errp);
}

static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
                                       Error **errp)
{
    DeviceState *dev = DEVICE(obj);
    Property *prop = opaque;
    uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop);
    Error *local_err = NULL;

    if (dev->realized) {
        qdev_prop_set_after_realize(dev, name, errp);
        return;
    }

    visit_type_uint8(v, name, &value, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    if (value & ~0xF) {
        error_setg(errp, "Property %s: valid range 0-15", name);
        return;
    }

    *ptr = value;
}

const PropertyInfo qdev_prop_nv_gpudirect_clique = {
    .name = "uint4",
    .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
    .get = get_nv_gpudirect_clique_id,
    .set = set_nv_gpudirect_clique_id,
};

static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
{
    PCIDevice *pdev = &vdev->pdev;
    int ret, pos = 0xC8;

    if (vdev->nv_gpudirect_clique == 0xFF) {
        return 0;
    }

    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
        return -EINVAL;
    }

    if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
        PCI_BASE_CLASS_DISPLAY) {
        error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
        return -EINVAL;
    }

    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
        return ret;
    }

    memset(vdev->emulated_config_bits + pos, 0xFF, 8);
    pos += PCI_CAP_FLAGS;
    pci_set_byte(pdev->config + pos++, 8);
    pci_set_byte(pdev->config + pos++, 'P');
    pci_set_byte(pdev->config + pos++, '2');
    pci_set_byte(pdev->config + pos++, 'P');
    pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
    pci_set_byte(pdev->config + pos, 0);

    return 0;
}

int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
{
    int ret;

    ret = vfio_add_nv_gpudirect_cap(vdev, errp);
    if (ret) {
        return ret;
    }

    return 0;
}

static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
                                     const char *name,
                                     void *opaque, Error **errp)
{
    uint64_t tgt = (uintptr_t) opaque;
    visit_type_uint64(v, name, &tgt, errp);
}

static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
                                            const char *name,
                                            void *opaque, Error **errp)
{
    uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
    visit_type_uint32(v, name, &link_speed, errp);
}

int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
{
    int ret;
    void *p;
    struct vfio_region_info *nv2reg = NULL;
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_nvlink2_ssatgt *cap;
    VFIOQuirk *quirk;

    ret = vfio_get_dev_region_info(&vdev->vbasedev,
                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
                                   PCI_VENDOR_ID_NVIDIA,
                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
                                   &nv2reg);
    if (ret) {
        return ret;
    }

    hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    cap = (void *) hdr;

    p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
    if (p == MAP_FAILED) {
        ret = -errno;
        goto free_exit;
    }

    quirk = vfio_quirk_alloc(1);
    memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
                               nv2reg->size, p);
    QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);

    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
                        (void *) (uintptr_t) cap->tgt);
    trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
                                          nv2reg->size);
free_exit:
    g_free(nv2reg);

    return ret;
}

int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
{
    int ret;
    void *p;
    struct vfio_region_info *atsdreg = NULL;
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
    struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
    VFIOQuirk *quirk;

    ret = vfio_get_dev_region_info(&vdev->vbasedev,
                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
                                   PCI_VENDOR_ID_IBM,
                                   VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
                                   &atsdreg);
    if (ret) {
        return ret;
    }

    hdr = vfio_get_region_info_cap(atsdreg,
                                   VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    captgt = (void *) hdr;

    hdr = vfio_get_region_info_cap(atsdreg,
                                   VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    capspeed = (void *) hdr;

    /* Some NVLink bridges may not have assigned ATSD */
    if (atsdreg->size) {
        p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
                 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
        if (p == MAP_FAILED) {
            ret = -errno;
            goto free_exit;
        }

        quirk = vfio_quirk_alloc(1);
        memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
                                          "nvlink2-atsd-mr", atsdreg->size, p);
        QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
    }

    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
                        (void *) (uintptr_t) captgt->tgt);
    trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
                                              atsdreg->size);

    object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
                        vfio_pci_nvlink2_get_link_speed, NULL, NULL,
                        (void *) (uintptr_t) capspeed->link_speed);
    trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
                                              capspeed->link_speed);
free_exit:
    g_free(atsdreg);

    return ret;
}