1 /* 2 * device quirks for PCI devices 3 * 4 * Copyright Red Hat, Inc. 2012-2015 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include "qemu/units.h" 15 #include "qemu/error-report.h" 16 #include "qemu/main-loop.h" 17 #include "qemu/range.h" 18 #include "qapi/error.h" 19 #include "qapi/visitor.h" 20 #include <sys/ioctl.h> 21 #include "hw/nvram/fw_cfg.h" 22 #include "pci.h" 23 #include "trace.h" 24 25 /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */ 26 static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device) 27 { 28 return (vendor == PCI_ANY_ID || vendor == vdev->vendor_id) && 29 (device == PCI_ANY_ID || device == vdev->device_id); 30 } 31 32 static bool vfio_is_vga(VFIOPCIDevice *vdev) 33 { 34 PCIDevice *pdev = &vdev->pdev; 35 uint16_t class = pci_get_word(pdev->config + PCI_CLASS_DEVICE); 36 37 return class == PCI_CLASS_DISPLAY_VGA; 38 } 39 40 /* 41 * List of device ids/vendor ids for which to disable 42 * option rom loading. This avoids the guest hangs during rom 43 * execution as noticed with the BCM 57810 card for lack of a 44 * more better way to handle such issues. 45 * The user can still override by specifying a romfile or 46 * rombar=1. 47 * Please see https://bugs.launchpad.net/qemu/+bug/1284874 48 * for an analysis of the 57810 card hang. 
When adding 49 * a new vendor id/device id combination below, please also add 50 * your card/environment details and information that could 51 * help in debugging to the bug tracking this issue 52 */ 53 static const struct { 54 uint32_t vendor; 55 uint32_t device; 56 } romblacklist[] = { 57 { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */ 58 }; 59 60 bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev) 61 { 62 int i; 63 64 for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) { 65 if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) { 66 trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name, 67 romblacklist[i].vendor, 68 romblacklist[i].device); 69 return true; 70 } 71 } 72 return false; 73 } 74 75 /* 76 * Device specific region quirks (mostly backdoors to PCI config space) 77 */ 78 79 /* 80 * The generic window quirks operate on an address and data register, 81 * vfio_generic_window_address_quirk handles the address register and 82 * vfio_generic_window_data_quirk handles the data register. These ops 83 * pass reads and writes through to hardware until a value matching the 84 * stored address match/mask is written. When this occurs, the data 85 * register access emulated PCI config space for the device rather than 86 * passing through accesses. This enables devices where PCI config space 87 * is accessible behind a window register to maintain the virtualization 88 * provided through vfio. 
89 */ 90 typedef struct VFIOConfigWindowMatch { 91 uint32_t match; 92 uint32_t mask; 93 } VFIOConfigWindowMatch; 94 95 typedef struct VFIOConfigWindowQuirk { 96 struct VFIOPCIDevice *vdev; 97 98 uint32_t address_val; 99 100 uint32_t address_offset; 101 uint32_t data_offset; 102 103 bool window_enabled; 104 uint8_t bar; 105 106 MemoryRegion *addr_mem; 107 MemoryRegion *data_mem; 108 109 uint32_t nr_matches; 110 VFIOConfigWindowMatch matches[]; 111 } VFIOConfigWindowQuirk; 112 113 static uint64_t vfio_generic_window_quirk_address_read(void *opaque, 114 hwaddr addr, 115 unsigned size) 116 { 117 VFIOConfigWindowQuirk *window = opaque; 118 VFIOPCIDevice *vdev = window->vdev; 119 120 return vfio_region_read(&vdev->bars[window->bar].region, 121 addr + window->address_offset, size); 122 } 123 124 static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr, 125 uint64_t data, 126 unsigned size) 127 { 128 VFIOConfigWindowQuirk *window = opaque; 129 VFIOPCIDevice *vdev = window->vdev; 130 int i; 131 132 window->window_enabled = false; 133 134 vfio_region_write(&vdev->bars[window->bar].region, 135 addr + window->address_offset, data, size); 136 137 for (i = 0; i < window->nr_matches; i++) { 138 if ((data & ~window->matches[i].mask) == window->matches[i].match) { 139 window->window_enabled = true; 140 window->address_val = data & window->matches[i].mask; 141 trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name, 142 memory_region_name(window->addr_mem), data); 143 break; 144 } 145 } 146 } 147 148 static const MemoryRegionOps vfio_generic_window_address_quirk = { 149 .read = vfio_generic_window_quirk_address_read, 150 .write = vfio_generic_window_quirk_address_write, 151 .endianness = DEVICE_LITTLE_ENDIAN, 152 }; 153 154 static uint64_t vfio_generic_window_quirk_data_read(void *opaque, 155 hwaddr addr, unsigned size) 156 { 157 VFIOConfigWindowQuirk *window = opaque; 158 VFIOPCIDevice *vdev = window->vdev; 159 uint64_t data; 160 161 /* Always read 
data reg, discard if window enabled */ 162 data = vfio_region_read(&vdev->bars[window->bar].region, 163 addr + window->data_offset, size); 164 165 if (window->window_enabled) { 166 data = vfio_pci_read_config(&vdev->pdev, window->address_val, size); 167 trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name, 168 memory_region_name(window->data_mem), data); 169 } 170 171 return data; 172 } 173 174 static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr, 175 uint64_t data, unsigned size) 176 { 177 VFIOConfigWindowQuirk *window = opaque; 178 VFIOPCIDevice *vdev = window->vdev; 179 180 if (window->window_enabled) { 181 vfio_pci_write_config(&vdev->pdev, window->address_val, data, size); 182 trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name, 183 memory_region_name(window->data_mem), data); 184 return; 185 } 186 187 vfio_region_write(&vdev->bars[window->bar].region, 188 addr + window->data_offset, data, size); 189 } 190 191 static const MemoryRegionOps vfio_generic_window_data_quirk = { 192 .read = vfio_generic_window_quirk_data_read, 193 .write = vfio_generic_window_quirk_data_write, 194 .endianness = DEVICE_LITTLE_ENDIAN, 195 }; 196 197 /* 198 * The generic mirror quirk handles devices which expose PCI config space 199 * through a region within a BAR. When enabled, reads and writes are 200 * redirected through to emulated PCI config space. XXX if PCI config space 201 * used memory regions, this could just be an alias. 
202 */ 203 typedef struct VFIOConfigMirrorQuirk { 204 struct VFIOPCIDevice *vdev; 205 uint32_t offset; 206 uint8_t bar; 207 MemoryRegion *mem; 208 uint8_t data[]; 209 } VFIOConfigMirrorQuirk; 210 211 static uint64_t vfio_generic_quirk_mirror_read(void *opaque, 212 hwaddr addr, unsigned size) 213 { 214 VFIOConfigMirrorQuirk *mirror = opaque; 215 VFIOPCIDevice *vdev = mirror->vdev; 216 uint64_t data; 217 218 /* Read and discard in case the hardware cares */ 219 (void)vfio_region_read(&vdev->bars[mirror->bar].region, 220 addr + mirror->offset, size); 221 222 data = vfio_pci_read_config(&vdev->pdev, addr, size); 223 trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name, 224 memory_region_name(mirror->mem), 225 addr, data); 226 return data; 227 } 228 229 static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr, 230 uint64_t data, unsigned size) 231 { 232 VFIOConfigMirrorQuirk *mirror = opaque; 233 VFIOPCIDevice *vdev = mirror->vdev; 234 235 vfio_pci_write_config(&vdev->pdev, addr, data, size); 236 trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name, 237 memory_region_name(mirror->mem), 238 addr, data); 239 } 240 241 static const MemoryRegionOps vfio_generic_mirror_quirk = { 242 .read = vfio_generic_quirk_mirror_read, 243 .write = vfio_generic_quirk_mirror_write, 244 .endianness = DEVICE_LITTLE_ENDIAN, 245 }; 246 247 /* Is range1 fully contained within range2? */ 248 static bool vfio_range_contained(uint64_t first1, uint64_t len1, 249 uint64_t first2, uint64_t len2) { 250 return (first1 >= first2 && first1 + len1 <= first2 + len2); 251 } 252 253 #define PCI_VENDOR_ID_ATI 0x1002 254 255 /* 256 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR 257 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always 258 * BAR4 (older cards like the X550 used BAR1, but we don't care to support 259 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the 260 * I/O port BAR address. 
Originally this was coded to return the virtual BAR 261 * address only if the physical register read returns the actual BAR address, 262 * but users have reported greater success if we return the virtual address 263 * unconditionally. 264 */ 265 static uint64_t vfio_ati_3c3_quirk_read(void *opaque, 266 hwaddr addr, unsigned size) 267 { 268 VFIOPCIDevice *vdev = opaque; 269 uint64_t data = vfio_pci_read_config(&vdev->pdev, 270 PCI_BASE_ADDRESS_4 + 1, size); 271 272 trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data); 273 274 return data; 275 } 276 277 static const MemoryRegionOps vfio_ati_3c3_quirk = { 278 .read = vfio_ati_3c3_quirk_read, 279 .endianness = DEVICE_LITTLE_ENDIAN, 280 }; 281 282 static VFIOQuirk *vfio_quirk_alloc(int nr_mem) 283 { 284 VFIOQuirk *quirk = g_new0(VFIOQuirk, 1); 285 QLIST_INIT(&quirk->ioeventfds); 286 quirk->mem = g_new0(MemoryRegion, nr_mem); 287 quirk->nr_mem = nr_mem; 288 289 return quirk; 290 } 291 292 static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd) 293 { 294 QLIST_REMOVE(ioeventfd, next); 295 memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size, 296 true, ioeventfd->data, &ioeventfd->e); 297 298 if (ioeventfd->vfio) { 299 struct vfio_device_ioeventfd vfio_ioeventfd; 300 301 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd); 302 vfio_ioeventfd.flags = ioeventfd->size; 303 vfio_ioeventfd.data = ioeventfd->data; 304 vfio_ioeventfd.offset = ioeventfd->region->fd_offset + 305 ioeventfd->region_addr; 306 vfio_ioeventfd.fd = -1; 307 308 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) { 309 error_report("Failed to remove vfio ioeventfd for %s+0x%" 310 HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)", 311 memory_region_name(ioeventfd->mr), ioeventfd->addr, 312 ioeventfd->size, ioeventfd->data); 313 } 314 } else { 315 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), 316 NULL, NULL, NULL); 317 } 318 319 event_notifier_cleanup(&ioeventfd->e); 320 
trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr), 321 (uint64_t)ioeventfd->addr, ioeventfd->size, 322 ioeventfd->data); 323 g_free(ioeventfd); 324 } 325 326 static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk) 327 { 328 VFIOIOEventFD *ioeventfd, *tmp; 329 330 QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) { 331 if (ioeventfd->dynamic) { 332 vfio_ioeventfd_exit(vdev, ioeventfd); 333 } 334 } 335 } 336 337 static void vfio_ioeventfd_handler(void *opaque) 338 { 339 VFIOIOEventFD *ioeventfd = opaque; 340 341 if (event_notifier_test_and_clear(&ioeventfd->e)) { 342 vfio_region_write(ioeventfd->region, ioeventfd->region_addr, 343 ioeventfd->data, ioeventfd->size); 344 trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr), 345 (uint64_t)ioeventfd->addr, ioeventfd->size, 346 ioeventfd->data); 347 } 348 } 349 350 static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev, 351 MemoryRegion *mr, hwaddr addr, 352 unsigned size, uint64_t data, 353 VFIORegion *region, 354 hwaddr region_addr, bool dynamic) 355 { 356 VFIOIOEventFD *ioeventfd; 357 358 if (vdev->no_kvm_ioeventfd) { 359 return NULL; 360 } 361 362 ioeventfd = g_malloc0(sizeof(*ioeventfd)); 363 364 if (event_notifier_init(&ioeventfd->e, 0)) { 365 g_free(ioeventfd); 366 return NULL; 367 } 368 369 /* 370 * MemoryRegion and relative offset, plus additional ioeventfd setup 371 * parameters for configuring and later tearing down KVM ioeventfd. 372 */ 373 ioeventfd->mr = mr; 374 ioeventfd->addr = addr; 375 ioeventfd->size = size; 376 ioeventfd->data = data; 377 ioeventfd->dynamic = dynamic; 378 /* 379 * VFIORegion and relative offset for implementing the userspace 380 * handler. data & size fields shared for both uses. 
381 */ 382 ioeventfd->region = region; 383 ioeventfd->region_addr = region_addr; 384 385 if (!vdev->no_vfio_ioeventfd) { 386 struct vfio_device_ioeventfd vfio_ioeventfd; 387 388 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd); 389 vfio_ioeventfd.flags = ioeventfd->size; 390 vfio_ioeventfd.data = ioeventfd->data; 391 vfio_ioeventfd.offset = ioeventfd->region->fd_offset + 392 ioeventfd->region_addr; 393 vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e); 394 395 ioeventfd->vfio = !ioctl(vdev->vbasedev.fd, 396 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd); 397 } 398 399 if (!ioeventfd->vfio) { 400 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), 401 vfio_ioeventfd_handler, NULL, ioeventfd); 402 } 403 404 memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size, 405 true, ioeventfd->data, &ioeventfd->e); 406 trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr, 407 size, data, ioeventfd->vfio); 408 409 return ioeventfd; 410 } 411 412 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) 413 { 414 VFIOQuirk *quirk; 415 416 /* 417 * As long as the BAR is >= 256 bytes it will be aligned such that the 418 * lower byte is always zero. Filter out anything else, if it exists. 419 */ 420 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 421 !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) { 422 return; 423 } 424 425 quirk = vfio_quirk_alloc(1); 426 427 memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev, 428 "vfio-ati-3c3-quirk", 1); 429 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 430 3 /* offset 3 bytes from 0x3c0 */, quirk->mem); 431 432 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, 433 quirk, next); 434 435 trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name); 436 } 437 438 /* 439 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI 440 * config space through MMIO BAR2 at offset 0x4000. 
Nothing seems to access
 * the MMIO space directly, but a window to this space is provided through
 * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
 * data register.  When the address is programmed to a range of 0x4000-0x4fff
 * PCI configuration space is available.  Experimentation seems to indicate
 * that read-only may be provided by hardware.
 */
static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigWindowQuirk *window;
    MemoryRegion *bar_mem;

    /* This window doesn't seem to be used except by legacy VGA code */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID)) {
        return;
    }
    if (!vdev->vga || nr != 4) {
        return;
    }

    quirk = vfio_quirk_alloc(2);

    /* Window state plus room for a single match entry */
    window = g_malloc0(sizeof(*window) + sizeof(VFIOConfigWindowMatch));
    quirk->data = window;

    window->vdev = vdev;
    window->bar = nr;
    window->address_offset = 0;
    window->data_offset = 4;
    window->nr_matches = 1;
    window->matches[0].match = 0x4000;
    window->matches[0].mask = vdev->config_size - 1;
    window->addr_mem = &quirk->mem[0];
    window->data_mem = &quirk->mem[1];

    bar_mem = vdev->bars[nr].region.mem;

    /* Address register overlay */
    memory_region_init_io(window->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, window,
                          "vfio-ati-bar4-window-address-quirk", 4);
    memory_region_add_subregion_overlap(bar_mem, window->address_offset,
                                        window->addr_mem, 1);

    /* Data register overlay */
    memory_region_init_io(window->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, window,
                          "vfio-ati-bar4-window-data-quirk", 4);
    memory_region_add_subregion_overlap(bar_mem, window->data_offset,
                                        window->data_mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
}

/*
 * Trap the BAR2 MMIO mirror to config space as well.
492 */ 493 static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr) 494 { 495 VFIOQuirk *quirk; 496 VFIOConfigMirrorQuirk *mirror; 497 498 /* Only enable on newer devices where BAR2 is 64bit */ 499 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 500 !vdev->vga || nr != 2 || !vdev->bars[2].mem64) { 501 return; 502 } 503 504 quirk = vfio_quirk_alloc(1); 505 mirror = quirk->data = g_malloc0(sizeof(*mirror)); 506 mirror->mem = quirk->mem; 507 mirror->vdev = vdev; 508 mirror->offset = 0x4000; 509 mirror->bar = nr; 510 511 memory_region_init_io(mirror->mem, OBJECT(vdev), 512 &vfio_generic_mirror_quirk, mirror, 513 "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE); 514 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 515 mirror->offset, mirror->mem, 1); 516 517 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 518 519 trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name); 520 } 521 522 /* 523 * Older ATI/AMD cards like the X550 have a similar window to that above. 524 * I/O port BAR1 provides a window to a mirror of PCI config space located 525 * in BAR2 at offset 0xf00. We don't care to support such older cards, but 526 * note it for future reference. 527 */ 528 529 /* 530 * Nvidia has several different methods to get to config space, the 531 * nouveu project has several of these documented here: 532 * https://github.com/pathscale/envytools/tree/master/hwdocs 533 * 534 * The first quirk is actually not documented in envytools and is found 535 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an 536 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access 537 * the mirror of PCI config space found at BAR0 offset 0x1800. The access 538 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is 539 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738 540 * is written for a write to 0x3d4. The BAR0 offset is then accessible 541 * through 0x3d0. 
This quirk doesn't seem to be necessary on newer cards 542 * that use the I/O port BAR5 window but it doesn't hurt to leave it. 543 */ 544 typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State; 545 static const char *nv3d0_states[] = { "NONE", "SELECT", 546 "WINDOW", "READ", "WRITE" }; 547 548 typedef struct VFIONvidia3d0Quirk { 549 VFIOPCIDevice *vdev; 550 VFIONvidia3d0State state; 551 uint32_t offset; 552 } VFIONvidia3d0Quirk; 553 554 static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque, 555 hwaddr addr, unsigned size) 556 { 557 VFIONvidia3d0Quirk *quirk = opaque; 558 VFIOPCIDevice *vdev = quirk->vdev; 559 560 quirk->state = NONE; 561 562 return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 563 addr + 0x14, size); 564 } 565 566 static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr, 567 uint64_t data, unsigned size) 568 { 569 VFIONvidia3d0Quirk *quirk = opaque; 570 VFIOPCIDevice *vdev = quirk->vdev; 571 VFIONvidia3d0State old_state = quirk->state; 572 573 quirk->state = NONE; 574 575 switch (data) { 576 case 0x338: 577 if (old_state == NONE) { 578 quirk->state = SELECT; 579 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 580 nv3d0_states[quirk->state]); 581 } 582 break; 583 case 0x538: 584 if (old_state == WINDOW) { 585 quirk->state = READ; 586 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 587 nv3d0_states[quirk->state]); 588 } 589 break; 590 case 0x738: 591 if (old_state == WINDOW) { 592 quirk->state = WRITE; 593 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 594 nv3d0_states[quirk->state]); 595 } 596 break; 597 } 598 599 vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 600 addr + 0x14, data, size); 601 } 602 603 static const MemoryRegionOps vfio_nvidia_3d4_quirk = { 604 .read = vfio_nvidia_3d4_quirk_read, 605 .write = vfio_nvidia_3d4_quirk_write, 606 .endianness = DEVICE_LITTLE_ENDIAN, 607 }; 608 609 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque, 610 hwaddr addr, unsigned size) 611 
{ 612 VFIONvidia3d0Quirk *quirk = opaque; 613 VFIOPCIDevice *vdev = quirk->vdev; 614 VFIONvidia3d0State old_state = quirk->state; 615 uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 616 addr + 0x10, size); 617 618 quirk->state = NONE; 619 620 if (old_state == READ && 621 (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { 622 uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); 623 624 data = vfio_pci_read_config(&vdev->pdev, offset, size); 625 trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name, 626 offset, size, data); 627 } 628 629 return data; 630 } 631 632 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr, 633 uint64_t data, unsigned size) 634 { 635 VFIONvidia3d0Quirk *quirk = opaque; 636 VFIOPCIDevice *vdev = quirk->vdev; 637 VFIONvidia3d0State old_state = quirk->state; 638 639 quirk->state = NONE; 640 641 if (old_state == SELECT) { 642 quirk->offset = (uint32_t)data; 643 quirk->state = WINDOW; 644 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 645 nv3d0_states[quirk->state]); 646 } else if (old_state == WRITE) { 647 if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { 648 uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); 649 650 vfio_pci_write_config(&vdev->pdev, offset, data, size); 651 trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name, 652 offset, data, size); 653 return; 654 } 655 } 656 657 vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 658 addr + 0x10, data, size); 659 } 660 661 static const MemoryRegionOps vfio_nvidia_3d0_quirk = { 662 .read = vfio_nvidia_3d0_quirk_read, 663 .write = vfio_nvidia_3d0_quirk_write, 664 .endianness = DEVICE_LITTLE_ENDIAN, 665 }; 666 667 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev) 668 { 669 VFIOQuirk *quirk; 670 VFIONvidia3d0Quirk *data; 671 672 if (vdev->no_geforce_quirks || 673 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 674 !vdev->bars[1].region.size) { 675 return; 676 } 677 678 quirk = 
vfio_quirk_alloc(2); 679 quirk->data = data = g_malloc0(sizeof(*data)); 680 data->vdev = vdev; 681 682 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk, 683 data, "vfio-nvidia-3d4-quirk", 2); 684 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 685 0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]); 686 687 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk, 688 data, "vfio-nvidia-3d0-quirk", 2); 689 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 690 0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]); 691 692 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, 693 quirk, next); 694 695 trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name); 696 } 697 698 /* 699 * The second quirk is documented in envytools. The I/O port BAR5 is just 700 * a set of address/data ports to the MMIO BARs. The BAR we care about is 701 * again BAR0. This backdoor is apparently a bit newer than the one above 702 * so we need to not only trap 256 bytes @0x1800, but all of PCI config 703 * space, including extended space is available at the 4k @0x88000. 704 */ 705 typedef struct VFIONvidiaBAR5Quirk { 706 uint32_t master; 707 uint32_t enable; 708 MemoryRegion *addr_mem; 709 MemoryRegion *data_mem; 710 bool enabled; 711 VFIOConfigWindowQuirk window; /* last for match data */ 712 } VFIONvidiaBAR5Quirk; 713 714 static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5) 715 { 716 VFIOPCIDevice *vdev = bar5->window.vdev; 717 718 if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) { 719 return; 720 } 721 722 bar5->enabled = !bar5->enabled; 723 trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name, 724 bar5->enabled ? 
"Enable" : "Disable"); 725 memory_region_set_enabled(bar5->addr_mem, bar5->enabled); 726 memory_region_set_enabled(bar5->data_mem, bar5->enabled); 727 } 728 729 static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque, 730 hwaddr addr, unsigned size) 731 { 732 VFIONvidiaBAR5Quirk *bar5 = opaque; 733 VFIOPCIDevice *vdev = bar5->window.vdev; 734 735 return vfio_region_read(&vdev->bars[5].region, addr, size); 736 } 737 738 static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr, 739 uint64_t data, unsigned size) 740 { 741 VFIONvidiaBAR5Quirk *bar5 = opaque; 742 VFIOPCIDevice *vdev = bar5->window.vdev; 743 744 vfio_region_write(&vdev->bars[5].region, addr, data, size); 745 746 bar5->master = data; 747 vfio_nvidia_bar5_enable(bar5); 748 } 749 750 static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = { 751 .read = vfio_nvidia_bar5_quirk_master_read, 752 .write = vfio_nvidia_bar5_quirk_master_write, 753 .endianness = DEVICE_LITTLE_ENDIAN, 754 }; 755 756 static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque, 757 hwaddr addr, unsigned size) 758 { 759 VFIONvidiaBAR5Quirk *bar5 = opaque; 760 VFIOPCIDevice *vdev = bar5->window.vdev; 761 762 return vfio_region_read(&vdev->bars[5].region, addr + 4, size); 763 } 764 765 static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr, 766 uint64_t data, unsigned size) 767 { 768 VFIONvidiaBAR5Quirk *bar5 = opaque; 769 VFIOPCIDevice *vdev = bar5->window.vdev; 770 771 vfio_region_write(&vdev->bars[5].region, addr + 4, data, size); 772 773 bar5->enable = data; 774 vfio_nvidia_bar5_enable(bar5); 775 } 776 777 static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = { 778 .read = vfio_nvidia_bar5_quirk_enable_read, 779 .write = vfio_nvidia_bar5_quirk_enable_write, 780 .endianness = DEVICE_LITTLE_ENDIAN, 781 }; 782 783 static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) 784 { 785 VFIOQuirk *quirk; 786 VFIONvidiaBAR5Quirk *bar5; 787 VFIOConfigWindowQuirk *window; 
788 789 if (vdev->no_geforce_quirks || 790 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 791 !vdev->vga || nr != 5 || !vdev->bars[5].ioport) { 792 return; 793 } 794 795 quirk = vfio_quirk_alloc(4); 796 bar5 = quirk->data = g_malloc0(sizeof(*bar5) + 797 (sizeof(VFIOConfigWindowMatch) * 2)); 798 window = &bar5->window; 799 800 window->vdev = vdev; 801 window->address_offset = 0x8; 802 window->data_offset = 0xc; 803 window->nr_matches = 2; 804 window->matches[0].match = 0x1800; 805 window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1; 806 window->matches[1].match = 0x88000; 807 window->matches[1].mask = vdev->config_size - 1; 808 window->bar = nr; 809 window->addr_mem = bar5->addr_mem = &quirk->mem[0]; 810 window->data_mem = bar5->data_mem = &quirk->mem[1]; 811 812 memory_region_init_io(window->addr_mem, OBJECT(vdev), 813 &vfio_generic_window_address_quirk, window, 814 "vfio-nvidia-bar5-window-address-quirk", 4); 815 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 816 window->address_offset, 817 window->addr_mem, 1); 818 memory_region_set_enabled(window->addr_mem, false); 819 820 memory_region_init_io(window->data_mem, OBJECT(vdev), 821 &vfio_generic_window_data_quirk, window, 822 "vfio-nvidia-bar5-window-data-quirk", 4); 823 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 824 window->data_offset, 825 window->data_mem, 1); 826 memory_region_set_enabled(window->data_mem, false); 827 828 memory_region_init_io(&quirk->mem[2], OBJECT(vdev), 829 &vfio_nvidia_bar5_quirk_master, bar5, 830 "vfio-nvidia-bar5-master-quirk", 4); 831 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 832 0, &quirk->mem[2], 1); 833 834 memory_region_init_io(&quirk->mem[3], OBJECT(vdev), 835 &vfio_nvidia_bar5_quirk_enable, bar5, 836 "vfio-nvidia-bar5-enable-quirk", 4); 837 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 838 4, &quirk->mem[3], 1); 839 840 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 841 842 
trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name); 843 } 844 845 typedef struct LastDataSet { 846 VFIOQuirk *quirk; 847 hwaddr addr; 848 uint64_t data; 849 unsigned size; 850 int hits; 851 int added; 852 } LastDataSet; 853 854 #define MAX_DYN_IOEVENTFD 10 855 #define HITS_FOR_IOEVENTFD 10 856 857 /* 858 * Finally, BAR0 itself. We want to redirect any accesses to either 859 * 0x1800 or 0x88000 through the PCI config space access functions. 860 */ 861 static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, 862 uint64_t data, unsigned size) 863 { 864 VFIOConfigMirrorQuirk *mirror = opaque; 865 VFIOPCIDevice *vdev = mirror->vdev; 866 PCIDevice *pdev = &vdev->pdev; 867 LastDataSet *last = (LastDataSet *)&mirror->data; 868 869 vfio_generic_quirk_mirror_write(opaque, addr, data, size); 870 871 /* 872 * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the 873 * MSI capability ID register. Both the ID and next register are 874 * read-only, so we allow writes covering either of those to real hw. 875 */ 876 if ((pdev->cap_present & QEMU_PCI_CAP_MSI) && 877 vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) { 878 vfio_region_write(&vdev->bars[mirror->bar].region, 879 addr + mirror->offset, data, size); 880 trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name); 881 } 882 883 /* 884 * Automatically add an ioeventfd to handle any repeated write with the 885 * same data and size above the standard PCI config space header. This is 886 * primarily expected to accelerate the MSI-ACK behavior, such as noted 887 * above. Current hardware/drivers should trigger an ioeventfd at config 888 * offset 0x704 (region offset 0x88704), with data 0x0, size 4. 889 * 890 * The criteria of 10 successive hits is arbitrary but reliably adds the 891 * MSI-ACK region. Note that as some writes are bypassed via the ioeventfd, 892 * the remaining ones have a greater chance of being seen successively. 
893 * To avoid the pathological case of burning up all of QEMU's open file 894 * handles, arbitrarily limit this algorithm from adding no more than 10 895 * ioeventfds, print an error if we would have added an 11th, and then 896 * stop counting. 897 */ 898 if (!vdev->no_kvm_ioeventfd && 899 addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) { 900 if (addr != last->addr || data != last->data || size != last->size) { 901 last->addr = addr; 902 last->data = data; 903 last->size = size; 904 last->hits = 1; 905 } else if (++last->hits >= HITS_FOR_IOEVENTFD) { 906 if (last->added < MAX_DYN_IOEVENTFD) { 907 VFIOIOEventFD *ioeventfd; 908 ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size, 909 data, &vdev->bars[mirror->bar].region, 910 mirror->offset + addr, true); 911 if (ioeventfd) { 912 VFIOQuirk *quirk = last->quirk; 913 914 QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next); 915 last->added++; 916 } 917 } else { 918 last->added++; 919 warn_report("NVIDIA ioeventfd queue full for %s, unable to " 920 "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", " 921 "size %u", vdev->vbasedev.name, addr, data, size); 922 } 923 } 924 } 925 } 926 927 static const MemoryRegionOps vfio_nvidia_mirror_quirk = { 928 .read = vfio_generic_quirk_mirror_read, 929 .write = vfio_nvidia_quirk_mirror_write, 930 .endianness = DEVICE_LITTLE_ENDIAN, 931 }; 932 933 static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk) 934 { 935 VFIOConfigMirrorQuirk *mirror = quirk->data; 936 LastDataSet *last = (LastDataSet *)&mirror->data; 937 938 last->addr = last->data = last->size = last->hits = last->added = 0; 939 940 vfio_drop_dynamic_eventfds(vdev, quirk); 941 } 942 943 static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) 944 { 945 VFIOQuirk *quirk; 946 VFIOConfigMirrorQuirk *mirror; 947 LastDataSet *last; 948 949 if (vdev->no_geforce_quirks || 950 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 951 !vfio_is_vga(vdev) || nr != 0) 
{ 952 return; 953 } 954 955 quirk = vfio_quirk_alloc(1); 956 quirk->reset = vfio_nvidia_bar0_quirk_reset; 957 mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); 958 mirror->mem = quirk->mem; 959 mirror->vdev = vdev; 960 mirror->offset = 0x88000; 961 mirror->bar = nr; 962 last = (LastDataSet *)&mirror->data; 963 last->quirk = quirk; 964 965 memory_region_init_io(mirror->mem, OBJECT(vdev), 966 &vfio_nvidia_mirror_quirk, mirror, 967 "vfio-nvidia-bar0-88000-mirror-quirk", 968 vdev->config_size); 969 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 970 mirror->offset, mirror->mem, 1); 971 972 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 973 974 /* The 0x1800 offset mirror only seems to get used by legacy VGA */ 975 if (vdev->vga) { 976 quirk = vfio_quirk_alloc(1); 977 quirk->reset = vfio_nvidia_bar0_quirk_reset; 978 mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); 979 mirror->mem = quirk->mem; 980 mirror->vdev = vdev; 981 mirror->offset = 0x1800; 982 mirror->bar = nr; 983 last = (LastDataSet *)&mirror->data; 984 last->quirk = quirk; 985 986 memory_region_init_io(mirror->mem, OBJECT(vdev), 987 &vfio_nvidia_mirror_quirk, mirror, 988 "vfio-nvidia-bar0-1800-mirror-quirk", 989 PCI_CONFIG_SPACE_SIZE); 990 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 991 mirror->offset, mirror->mem, 1); 992 993 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 994 } 995 996 trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name); 997 } 998 999 /* 1000 * TODO - Some Nvidia devices provide config access to their companion HDA 1001 * device and even to their parent bridge via these config space mirrors. 1002 * Add quirks for those regions. 1003 */ 1004 1005 #define PCI_VENDOR_ID_REALTEK 0x10ec 1006 1007 /* 1008 * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2 1009 * offset 0x70 there is a dword data register, offset 0x74 is a dword address 1010 * register. 
According to the Linux r8169 driver, the MSI-X table is addressed 1011 * when the "type" portion of the address register is set to 0x1. This appears 1012 * to be bits 16:30. Bit 31 is both a write indicator and some sort of 1013 * "address latched" indicator. Bits 12:15 are a mask field, which we can 1014 * ignore because the MSI-X table should always be accessed as a dword (full 1015 * mask). Bits 0:11 is offset within the type. 1016 * 1017 * Example trace: 1018 * 1019 * Read from MSI-X table offset 0 1020 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr 1021 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch 1022 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data 1023 * 1024 * Write 0xfee00000 to MSI-X table offset 0 1025 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data 1026 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write 1027 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete 1028 */ 1029 typedef struct VFIOrtl8168Quirk { 1030 VFIOPCIDevice *vdev; 1031 uint32_t addr; 1032 uint32_t data; 1033 bool enabled; 1034 } VFIOrtl8168Quirk; 1035 1036 static uint64_t vfio_rtl8168_quirk_address_read(void *opaque, 1037 hwaddr addr, unsigned size) 1038 { 1039 VFIOrtl8168Quirk *rtl = opaque; 1040 VFIOPCIDevice *vdev = rtl->vdev; 1041 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size); 1042 1043 if (rtl->enabled) { 1044 data = rtl->addr ^ 0x80000000U; /* latch/complete */ 1045 trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data); 1046 } 1047 1048 return data; 1049 } 1050 1051 static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr, 1052 uint64_t data, unsigned size) 1053 { 1054 VFIOrtl8168Quirk *rtl = opaque; 1055 VFIOPCIDevice *vdev = rtl->vdev; 1056 1057 rtl->enabled = false; 1058 1059 if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */ 1060 rtl->enabled = true; 1061 rtl->addr = 
(uint32_t)data; 1062 1063 if (data & 0x80000000U) { /* Do write */ 1064 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { 1065 hwaddr offset = data & 0xfff; 1066 uint64_t val = rtl->data; 1067 1068 trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name, 1069 (uint16_t)offset, val); 1070 1071 /* Write to the proper guest MSI-X table instead */ 1072 memory_region_dispatch_write(&vdev->pdev.msix_table_mmio, 1073 offset, val, size, 1074 MEMTXATTRS_UNSPECIFIED); 1075 } 1076 return; /* Do not write guest MSI-X data to hardware */ 1077 } 1078 } 1079 1080 vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size); 1081 } 1082 1083 static const MemoryRegionOps vfio_rtl_address_quirk = { 1084 .read = vfio_rtl8168_quirk_address_read, 1085 .write = vfio_rtl8168_quirk_address_write, 1086 .valid = { 1087 .min_access_size = 4, 1088 .max_access_size = 4, 1089 .unaligned = false, 1090 }, 1091 .endianness = DEVICE_LITTLE_ENDIAN, 1092 }; 1093 1094 static uint64_t vfio_rtl8168_quirk_data_read(void *opaque, 1095 hwaddr addr, unsigned size) 1096 { 1097 VFIOrtl8168Quirk *rtl = opaque; 1098 VFIOPCIDevice *vdev = rtl->vdev; 1099 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size); 1100 1101 if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { 1102 hwaddr offset = rtl->addr & 0xfff; 1103 memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset, 1104 &data, size, MEMTXATTRS_UNSPECIFIED); 1105 trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data); 1106 } 1107 1108 return data; 1109 } 1110 1111 static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr, 1112 uint64_t data, unsigned size) 1113 { 1114 VFIOrtl8168Quirk *rtl = opaque; 1115 VFIOPCIDevice *vdev = rtl->vdev; 1116 1117 rtl->data = (uint32_t)data; 1118 1119 vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size); 1120 } 1121 1122 static const MemoryRegionOps vfio_rtl_data_quirk = { 1123 .read = vfio_rtl8168_quirk_data_read, 1124 .write = 
vfio_rtl8168_quirk_data_write, 1125 .valid = { 1126 .min_access_size = 4, 1127 .max_access_size = 4, 1128 .unaligned = false, 1129 }, 1130 .endianness = DEVICE_LITTLE_ENDIAN, 1131 }; 1132 1133 static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) 1134 { 1135 VFIOQuirk *quirk; 1136 VFIOrtl8168Quirk *rtl; 1137 1138 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) { 1139 return; 1140 } 1141 1142 quirk = vfio_quirk_alloc(2); 1143 quirk->data = rtl = g_malloc0(sizeof(*rtl)); 1144 rtl->vdev = vdev; 1145 1146 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), 1147 &vfio_rtl_address_quirk, rtl, 1148 "vfio-rtl8168-window-address-quirk", 4); 1149 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1150 0x74, &quirk->mem[0], 1); 1151 1152 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), 1153 &vfio_rtl_data_quirk, rtl, 1154 "vfio-rtl8168-window-data-quirk", 4); 1155 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1156 0x70, &quirk->mem[1], 1); 1157 1158 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 1159 1160 trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); 1161 } 1162 1163 /* 1164 * Intel IGD support 1165 * 1166 * Obviously IGD is not a discrete device, this is evidenced not only by it 1167 * being integrated into the CPU, but by the various chipset and BIOS 1168 * dependencies that it brings along with it. Intel is trying to move away 1169 * from this and Broadwell and newer devices can run in what Intel calls 1170 * "Universal Pass-Through" mode, or UPT. Theoretically in UPT mode, nothing 1171 * more is required beyond assigning the IGD device to a VM. There are 1172 * however support limitations to this mode. It only supports IGD as a 1173 * secondary graphics device in the VM and it doesn't officially support any 1174 * physical outputs. 
 *
 * The code here attempts to enable what we'll call legacy mode assignment,
 * in which IGD retains most of the capabilities we expect it to have on bare
 * metal.  To enable this mode, the IGD device must be assigned to the VM
 * at PCI address 00:02.0, it must have a ROM, it very likely needs VGA
 * support, we must have VM BIOS support for reserving and populating some
 * of the required tables, and we need to tweak the chipset with revisions
 * and IDs and an LPC/ISA bridge device.  The intention is to make all of
 * this happen automatically by installing the device at the correct VM PCI
 * bus address.  If any of the conditions are not met, we cross our fingers
 * and hope the user knows better.
 *
 * NB - It is possible to enable physical outputs in UPT mode by supplying
 * an OpRegion table.  We don't do this by default because the guest driver
 * behaves differently if an OpRegion is provided and no monitor is attached
 * vs no OpRegion and a monitor being attached or not.  Effectively, if a
 * headless setup is desired, the OpRegion gets in the way of that.
 */

/*
 * This presumes the device is already known to be an Intel VGA device, so we
 * take liberties in which device ID bits match which generation.  This should
 * not be taken as an indication that all the devices are supported, or even
 * supportable; some of them don't even support VT-d.
 * See linux:include/drm/i915_pciids.h for IDs.
1200 */ 1201 static int igd_gen(VFIOPCIDevice *vdev) 1202 { 1203 if ((vdev->device_id & 0xfff) == 0xa84) { 1204 return 8; /* Broxton */ 1205 } 1206 1207 switch (vdev->device_id & 0xff00) { 1208 /* Old, untested, unavailable, unknown */ 1209 case 0x0000: 1210 case 0x2500: 1211 case 0x2700: 1212 case 0x2900: 1213 case 0x2a00: 1214 case 0x2e00: 1215 case 0x3500: 1216 case 0xa000: 1217 return -1; 1218 /* SandyBridge, IvyBridge, ValleyView, Haswell */ 1219 case 0x0100: 1220 case 0x0400: 1221 case 0x0a00: 1222 case 0x0c00: 1223 case 0x0d00: 1224 case 0x0f00: 1225 return 6; 1226 /* BroadWell, CherryView, SkyLake, KabyLake */ 1227 case 0x1600: 1228 case 0x1900: 1229 case 0x2200: 1230 case 0x5900: 1231 return 8; 1232 } 1233 1234 return 8; /* Assume newer is compatible */ 1235 } 1236 1237 typedef struct VFIOIGDQuirk { 1238 struct VFIOPCIDevice *vdev; 1239 uint32_t index; 1240 uint32_t bdsm; 1241 } VFIOIGDQuirk; 1242 1243 #define IGD_GMCH 0x50 /* Graphics Control Register */ 1244 #define IGD_BDSM 0x5c /* Base Data of Stolen Memory */ 1245 #define IGD_ASLS 0xfc /* ASL Storage Register */ 1246 1247 /* 1248 * The OpRegion includes the Video BIOS Table, which seems important for 1249 * telling the driver what sort of outputs it has. Without this, the device 1250 * may work in the guest, but we may not get output. This also requires BIOS 1251 * support to reserve and populate a section of guest memory sufficient for 1252 * the table and to write the base address of that memory to the ASLS register 1253 * of the IGD device. 
1254 */ 1255 int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, 1256 struct vfio_region_info *info, Error **errp) 1257 { 1258 int ret; 1259 1260 vdev->igd_opregion = g_malloc0(info->size); 1261 ret = pread(vdev->vbasedev.fd, vdev->igd_opregion, 1262 info->size, info->offset); 1263 if (ret != info->size) { 1264 error_setg(errp, "failed to read IGD OpRegion"); 1265 g_free(vdev->igd_opregion); 1266 vdev->igd_opregion = NULL; 1267 return -EINVAL; 1268 } 1269 1270 /* 1271 * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to 1272 * allocate 32bit reserved memory for, copy these contents into, and write 1273 * the reserved memory base address to the device ASLS register at 0xFC. 1274 * Alignment of this reserved region seems flexible, but using a 4k page 1275 * alignment seems to work well. This interface assumes a single IGD 1276 * device, which may be at VM address 00:02.0 in legacy mode or another 1277 * address in UPT mode. 1278 * 1279 * NB, there may be future use cases discovered where the VM should have 1280 * direct interaction with the host OpRegion, in which case the write to 1281 * the ASLS register would trigger MemoryRegion setup to enable that. 1282 */ 1283 fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion", 1284 vdev->igd_opregion, info->size); 1285 1286 trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); 1287 1288 pci_set_long(vdev->pdev.config + IGD_ASLS, 0); 1289 pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); 1290 pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); 1291 1292 return 0; 1293 } 1294 1295 /* 1296 * The rather short list of registers that we copy from the host devices. 1297 * The LPC/ISA bridge values are definitely needed to support the vBIOS, the 1298 * host bridge values may or may not be needed depending on the guest OS. 1299 * Since we're only munging revision and subsystem values on the host bridge, 1300 * we don't require our own device. The LPC/ISA bridge needs to be our very 1301 * own though. 
1302 */ 1303 typedef struct { 1304 uint8_t offset; 1305 uint8_t len; 1306 } IGDHostInfo; 1307 1308 static const IGDHostInfo igd_host_bridge_infos[] = { 1309 {PCI_REVISION_ID, 2}, 1310 {PCI_SUBSYSTEM_VENDOR_ID, 2}, 1311 {PCI_SUBSYSTEM_ID, 2}, 1312 }; 1313 1314 static const IGDHostInfo igd_lpc_bridge_infos[] = { 1315 {PCI_VENDOR_ID, 2}, 1316 {PCI_DEVICE_ID, 2}, 1317 {PCI_REVISION_ID, 2}, 1318 {PCI_SUBSYSTEM_VENDOR_ID, 2}, 1319 {PCI_SUBSYSTEM_ID, 2}, 1320 }; 1321 1322 static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev, 1323 struct vfio_region_info *info, 1324 const IGDHostInfo *list, int len) 1325 { 1326 int i, ret; 1327 1328 for (i = 0; i < len; i++) { 1329 ret = pread(vdev->vbasedev.fd, pdev->config + list[i].offset, 1330 list[i].len, info->offset + list[i].offset); 1331 if (ret != list[i].len) { 1332 error_report("IGD copy failed: %m"); 1333 return -errno; 1334 } 1335 } 1336 1337 return 0; 1338 } 1339 1340 /* 1341 * Stuff a few values into the host bridge. 1342 */ 1343 static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev, 1344 struct vfio_region_info *info) 1345 { 1346 PCIBus *bus; 1347 PCIDevice *host_bridge; 1348 int ret; 1349 1350 bus = pci_device_root_bus(&vdev->pdev); 1351 host_bridge = pci_find_device(bus, 0, PCI_DEVFN(0, 0)); 1352 1353 if (!host_bridge) { 1354 error_report("Can't find host bridge"); 1355 return -ENODEV; 1356 } 1357 1358 ret = vfio_pci_igd_copy(vdev, host_bridge, info, igd_host_bridge_infos, 1359 ARRAY_SIZE(igd_host_bridge_infos)); 1360 if (!ret) { 1361 trace_vfio_pci_igd_host_bridge_enabled(vdev->vbasedev.name); 1362 } 1363 1364 return ret; 1365 } 1366 1367 /* 1368 * IGD LPC/ISA bridge support code. The vBIOS needs this, but we can't write 1369 * arbitrary values into just any bridge, so we must create our own. We try 1370 * to handle if the user has created it for us, which they might want to do 1371 * to enable multifunction so we don't occupy the whole PCI slot. 
1372 */ 1373 static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp) 1374 { 1375 if (pdev->devfn != PCI_DEVFN(0x1f, 0)) { 1376 error_setg(errp, "VFIO dummy ISA/LPC bridge must have address 1f.0"); 1377 } 1378 } 1379 1380 static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data) 1381 { 1382 DeviceClass *dc = DEVICE_CLASS(klass); 1383 PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); 1384 1385 set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); 1386 dc->desc = "VFIO dummy ISA/LPC bridge for IGD assignment"; 1387 dc->hotpluggable = false; 1388 k->realize = vfio_pci_igd_lpc_bridge_realize; 1389 k->class_id = PCI_CLASS_BRIDGE_ISA; 1390 } 1391 1392 static TypeInfo vfio_pci_igd_lpc_bridge_info = { 1393 .name = "vfio-pci-igd-lpc-bridge", 1394 .parent = TYPE_PCI_DEVICE, 1395 .class_init = vfio_pci_igd_lpc_bridge_class_init, 1396 .interfaces = (InterfaceInfo[]) { 1397 { INTERFACE_CONVENTIONAL_PCI_DEVICE }, 1398 { }, 1399 }, 1400 }; 1401 1402 static void vfio_pci_igd_register_types(void) 1403 { 1404 type_register_static(&vfio_pci_igd_lpc_bridge_info); 1405 } 1406 1407 type_init(vfio_pci_igd_register_types) 1408 1409 static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, 1410 struct vfio_region_info *info) 1411 { 1412 PCIDevice *lpc_bridge; 1413 int ret; 1414 1415 lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), 1416 0, PCI_DEVFN(0x1f, 0)); 1417 if (!lpc_bridge) { 1418 lpc_bridge = pci_create_simple(pci_device_root_bus(&vdev->pdev), 1419 PCI_DEVFN(0x1f, 0), "vfio-pci-igd-lpc-bridge"); 1420 } 1421 1422 ret = vfio_pci_igd_copy(vdev, lpc_bridge, info, igd_lpc_bridge_infos, 1423 ARRAY_SIZE(igd_lpc_bridge_infos)); 1424 if (!ret) { 1425 trace_vfio_pci_igd_lpc_bridge_enabled(vdev->vbasedev.name); 1426 } 1427 1428 return ret; 1429 } 1430 1431 /* 1432 * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE 1433 * entry, older IGDs use 2MB and 32bit. Each PTE maps a 4k page. 
Therefore 1434 * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index 1435 * for programming the GTT. 1436 * 1437 * See linux:include/drm/i915_drm.h for shift and mask values. 1438 */ 1439 static int vfio_igd_gtt_max(VFIOPCIDevice *vdev) 1440 { 1441 uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch)); 1442 int ggms, gen = igd_gen(vdev); 1443 1444 gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch)); 1445 ggms = (gmch >> (gen < 8 ? 8 : 6)) & 0x3; 1446 if (gen > 6) { 1447 ggms = 1 << ggms; 1448 } 1449 1450 ggms *= MiB; 1451 1452 return (ggms / (4 * KiB)) * (gen < 8 ? 4 : 8); 1453 } 1454 1455 /* 1456 * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes. 1457 * Somehow the host stolen memory range is used for this, but how the ROM gets 1458 * it is a mystery, perhaps it's hardcoded into the ROM. Thankfully though, it 1459 * reprograms the GTT through the IOBAR where we can trap it and transpose the 1460 * programming to the VM allocated buffer. That buffer gets reserved by the VM 1461 * firmware via the fw_cfg entry added below. Here we're just monitoring the 1462 * IOBAR address and data registers to detect a write sequence targeting the 1463 * GTTADR. This code is developed by observed behavior and doesn't have a 1464 * direct spec reference, unfortunately. 1465 */ 1466 static uint64_t vfio_igd_quirk_data_read(void *opaque, 1467 hwaddr addr, unsigned size) 1468 { 1469 VFIOIGDQuirk *igd = opaque; 1470 VFIOPCIDevice *vdev = igd->vdev; 1471 1472 igd->index = ~0; 1473 1474 return vfio_region_read(&vdev->bars[4].region, addr + 4, size); 1475 } 1476 1477 static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr, 1478 uint64_t data, unsigned size) 1479 { 1480 VFIOIGDQuirk *igd = opaque; 1481 VFIOPCIDevice *vdev = igd->vdev; 1482 uint64_t val = data; 1483 int gen = igd_gen(vdev); 1484 1485 /* 1486 * Programming the GGMS starts at index 0x1 and uses every 4th index (ie. 
1487 * 0x1, 0x5, 0x9, 0xd,...). For pre-Gen8 each 4-byte write is a whole PTE 1488 * entry, with 0th bit enable set. For Gen8 and up, PTEs are 64bit, so 1489 * entries 0x5 & 0xd are the high dword, in our case zero. Each PTE points 1490 * to a 4k page, which we translate to a page from the VM allocated region, 1491 * pointed to by the BDSM register. If this is not set, we fail. 1492 * 1493 * We trap writes to the full configured GTT size, but we typically only 1494 * see the vBIOS writing up to (nearly) the 1MB barrier. In fact it often 1495 * seems to miss the last entry for an even 1MB GTT. Doing a gratuitous 1496 * write of that last entry does work, but is hopefully unnecessary since 1497 * we clear the previous GTT on initialization. 1498 */ 1499 if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) { 1500 if (gen < 8 || (igd->index % 8 == 1)) { 1501 uint32_t base; 1502 1503 base = pci_get_long(vdev->pdev.config + IGD_BDSM); 1504 if (!base) { 1505 hw_error("vfio-igd: Guest attempted to program IGD GTT before " 1506 "BIOS reserved stolen memory. 
Unsupported BIOS?"); 1507 } 1508 1509 val = data - igd->bdsm + base; 1510 } else { 1511 val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */ 1512 } 1513 1514 trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name, 1515 igd->index, data, val); 1516 } 1517 1518 vfio_region_write(&vdev->bars[4].region, addr + 4, val, size); 1519 1520 igd->index = ~0; 1521 } 1522 1523 static const MemoryRegionOps vfio_igd_data_quirk = { 1524 .read = vfio_igd_quirk_data_read, 1525 .write = vfio_igd_quirk_data_write, 1526 .endianness = DEVICE_LITTLE_ENDIAN, 1527 }; 1528 1529 static uint64_t vfio_igd_quirk_index_read(void *opaque, 1530 hwaddr addr, unsigned size) 1531 { 1532 VFIOIGDQuirk *igd = opaque; 1533 VFIOPCIDevice *vdev = igd->vdev; 1534 1535 igd->index = ~0; 1536 1537 return vfio_region_read(&vdev->bars[4].region, addr, size); 1538 } 1539 1540 static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr, 1541 uint64_t data, unsigned size) 1542 { 1543 VFIOIGDQuirk *igd = opaque; 1544 VFIOPCIDevice *vdev = igd->vdev; 1545 1546 igd->index = data; 1547 1548 vfio_region_write(&vdev->bars[4].region, addr, data, size); 1549 } 1550 1551 static const MemoryRegionOps vfio_igd_index_quirk = { 1552 .read = vfio_igd_quirk_index_read, 1553 .write = vfio_igd_quirk_index_write, 1554 .endianness = DEVICE_LITTLE_ENDIAN, 1555 }; 1556 1557 static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) 1558 { 1559 struct vfio_region_info *rom = NULL, *opregion = NULL, 1560 *host = NULL, *lpc = NULL; 1561 VFIOQuirk *quirk; 1562 VFIOIGDQuirk *igd; 1563 PCIDevice *lpc_bridge; 1564 int i, ret, ggms_mb, gms_mb = 0, gen; 1565 uint64_t *bdsm_size; 1566 uint32_t gmch; 1567 uint16_t cmd_orig, cmd; 1568 Error *err = NULL; 1569 1570 /* 1571 * This must be an Intel VGA device at address 00:02.0 for us to even 1572 * consider enabling legacy mode. The vBIOS has dependencies on the 1573 * PCI bus address. 
1574 */ 1575 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || 1576 !vfio_is_vga(vdev) || nr != 4 || 1577 &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev), 1578 0, PCI_DEVFN(0x2, 0))) { 1579 return; 1580 } 1581 1582 /* 1583 * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we 1584 * can stuff host values into, so if there's already one there and it's not 1585 * one we can hack on, legacy mode is no-go. Sorry Q35. 1586 */ 1587 lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), 1588 0, PCI_DEVFN(0x1f, 0)); 1589 if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge), 1590 "vfio-pci-igd-lpc-bridge")) { 1591 error_report("IGD device %s cannot support legacy mode due to existing " 1592 "devices at address 1f.0", vdev->vbasedev.name); 1593 return; 1594 } 1595 1596 /* 1597 * IGD is not a standard, they like to change their specs often. We 1598 * only attempt to support back to SandBridge and we hope that newer 1599 * devices maintain compatibility with generation 8. 1600 */ 1601 gen = igd_gen(vdev); 1602 if (gen != 6 && gen != 8) { 1603 error_report("IGD device %s is unsupported in legacy mode, " 1604 "try SandyBridge or newer", vdev->vbasedev.name); 1605 return; 1606 } 1607 1608 /* 1609 * Most of what we're doing here is to enable the ROM to run, so if 1610 * there's no ROM, there's no point in setting up this quirk. 1611 * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support. 1612 */ 1613 ret = vfio_get_region_info(&vdev->vbasedev, 1614 VFIO_PCI_ROM_REGION_INDEX, &rom); 1615 if ((ret || !rom->size) && !vdev->pdev.romfile) { 1616 error_report("IGD device %s has no ROM, legacy mode disabled", 1617 vdev->vbasedev.name); 1618 goto out; 1619 } 1620 1621 /* 1622 * Ignore the hotplug corner case, mark the ROM failed, we can't 1623 * create the devices we need for legacy mode in the hotplug scenario. 
1624 */ 1625 if (vdev->pdev.qdev.hotplugged) { 1626 error_report("IGD device %s hotplugged, ROM disabled, " 1627 "legacy mode disabled", vdev->vbasedev.name); 1628 vdev->rom_read_failed = true; 1629 goto out; 1630 } 1631 1632 /* 1633 * Check whether we have all the vfio device specific regions to 1634 * support legacy mode (added in Linux v4.6). If not, bail. 1635 */ 1636 ret = vfio_get_dev_region_info(&vdev->vbasedev, 1637 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, 1638 VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); 1639 if (ret) { 1640 error_report("IGD device %s does not support OpRegion access," 1641 "legacy mode disabled", vdev->vbasedev.name); 1642 goto out; 1643 } 1644 1645 ret = vfio_get_dev_region_info(&vdev->vbasedev, 1646 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, 1647 VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host); 1648 if (ret) { 1649 error_report("IGD device %s does not support host bridge access," 1650 "legacy mode disabled", vdev->vbasedev.name); 1651 goto out; 1652 } 1653 1654 ret = vfio_get_dev_region_info(&vdev->vbasedev, 1655 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, 1656 VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc); 1657 if (ret) { 1658 error_report("IGD device %s does not support LPC bridge access," 1659 "legacy mode disabled", vdev->vbasedev.name); 1660 goto out; 1661 } 1662 1663 gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4); 1664 1665 /* 1666 * If IGD VGA Disable is clear (expected) and VGA is not already enabled, 1667 * try to enable it. Probably shouldn't be using legacy mode without VGA, 1668 * but also no point in us enabling VGA if disabled in hardware. 
1669 */ 1670 if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev, &err)) { 1671 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 1672 error_report("IGD device %s failed to enable VGA access, " 1673 "legacy mode disabled", vdev->vbasedev.name); 1674 goto out; 1675 } 1676 1677 /* Create our LPC/ISA bridge */ 1678 ret = vfio_pci_igd_lpc_init(vdev, lpc); 1679 if (ret) { 1680 error_report("IGD device %s failed to create LPC bridge, " 1681 "legacy mode disabled", vdev->vbasedev.name); 1682 goto out; 1683 } 1684 1685 /* Stuff some host values into the VM PCI host bridge */ 1686 ret = vfio_pci_igd_host_init(vdev, host); 1687 if (ret) { 1688 error_report("IGD device %s failed to modify host bridge, " 1689 "legacy mode disabled", vdev->vbasedev.name); 1690 goto out; 1691 } 1692 1693 /* Setup OpRegion access */ 1694 ret = vfio_pci_igd_opregion_init(vdev, opregion, &err); 1695 if (ret) { 1696 error_append_hint(&err, "IGD legacy mode disabled\n"); 1697 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); 1698 goto out; 1699 } 1700 1701 /* Setup our quirk to munge GTT addresses to the VM allocated buffer */ 1702 quirk = vfio_quirk_alloc(2); 1703 igd = quirk->data = g_malloc0(sizeof(*igd)); 1704 igd->vdev = vdev; 1705 igd->index = ~0; 1706 igd->bdsm = vfio_pci_read_config(&vdev->pdev, IGD_BDSM, 4); 1707 igd->bdsm &= ~((1 * MiB) - 1); /* 1MB aligned */ 1708 1709 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk, 1710 igd, "vfio-igd-index-quirk", 4); 1711 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1712 0, &quirk->mem[0], 1); 1713 1714 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk, 1715 igd, "vfio-igd-data-quirk", 4); 1716 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1717 4, &quirk->mem[1], 1); 1718 1719 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 1720 1721 /* Determine the size of stolen memory needed for GTT */ 1722 ggms_mb = (gmch >> (gen < 8 ? 
8 : 6)) & 0x3; 1723 if (gen > 6) { 1724 ggms_mb = 1 << ggms_mb; 1725 } 1726 1727 /* 1728 * Assume we have no GMS memory, but allow it to be overrided by device 1729 * option (experimental). The spec doesn't actually allow zero GMS when 1730 * when IVD (IGD VGA Disable) is clear, but the claim is that it's unused, 1731 * so let's not waste VM memory for it. 1732 */ 1733 gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8)); 1734 1735 if (vdev->igd_gms) { 1736 if (vdev->igd_gms <= 0x10) { 1737 gms_mb = vdev->igd_gms * 32; 1738 gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8); 1739 } else { 1740 error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms); 1741 vdev->igd_gms = 0; 1742 } 1743 } 1744 1745 /* 1746 * Request reserved memory for stolen memory via fw_cfg. VM firmware 1747 * must allocate a 1MB aligned reserved memory region below 4GB with 1748 * the requested size (in bytes) for use by the Intel PCI class VGA 1749 * device at VM address 00:02.0. The base address of this reserved 1750 * memory region must be written to the device BDSM regsiter at PCI 1751 * config offset 0x5C. 1752 */ 1753 bdsm_size = g_malloc(sizeof(*bdsm_size)); 1754 *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * MiB); 1755 fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size", 1756 bdsm_size, sizeof(*bdsm_size)); 1757 1758 /* GMCH is read-only, emulated */ 1759 pci_set_long(vdev->pdev.config + IGD_GMCH, gmch); 1760 pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0); 1761 pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0); 1762 1763 /* BDSM is read-write, emulated. The BIOS needs to be able to write it */ 1764 pci_set_long(vdev->pdev.config + IGD_BDSM, 0); 1765 pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0); 1766 pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0); 1767 1768 /* 1769 * This IOBAR gives us access to GTTADR, which allows us to write to 1770 * the GTT itself. So let's go ahead and write zero to all the GTT 1771 * entries to avoid spurious DMA faults. 
Be sure I/O access is enabled 1772 * before talking to the device. 1773 */ 1774 if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig), 1775 vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) { 1776 error_report("IGD device %s - failed to read PCI command register", 1777 vdev->vbasedev.name); 1778 } 1779 1780 cmd = cmd_orig | PCI_COMMAND_IO; 1781 1782 if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd), 1783 vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) { 1784 error_report("IGD device %s - failed to write PCI command register", 1785 vdev->vbasedev.name); 1786 } 1787 1788 for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) { 1789 vfio_region_write(&vdev->bars[4].region, 0, i, 4); 1790 vfio_region_write(&vdev->bars[4].region, 4, 0, 4); 1791 } 1792 1793 if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig), 1794 vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) { 1795 error_report("IGD device %s - failed to restore PCI command register", 1796 vdev->vbasedev.name); 1797 } 1798 1799 trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb); 1800 1801 out: 1802 g_free(rom); 1803 g_free(opregion); 1804 g_free(host); 1805 g_free(lpc); 1806 } 1807 1808 /* 1809 * Common quirk probe entry points. 
1810 */ 1811 void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) 1812 { 1813 vfio_vga_probe_ati_3c3_quirk(vdev); 1814 vfio_vga_probe_nvidia_3d0_quirk(vdev); 1815 } 1816 1817 void vfio_vga_quirk_exit(VFIOPCIDevice *vdev) 1818 { 1819 VFIOQuirk *quirk; 1820 int i, j; 1821 1822 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1823 QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) { 1824 for (j = 0; j < quirk->nr_mem; j++) { 1825 memory_region_del_subregion(&vdev->vga->region[i].mem, 1826 &quirk->mem[j]); 1827 } 1828 } 1829 } 1830 } 1831 1832 void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev) 1833 { 1834 int i, j; 1835 1836 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1837 while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) { 1838 VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks); 1839 QLIST_REMOVE(quirk, next); 1840 for (j = 0; j < quirk->nr_mem; j++) { 1841 object_unparent(OBJECT(&quirk->mem[j])); 1842 } 1843 g_free(quirk->mem); 1844 g_free(quirk->data); 1845 g_free(quirk); 1846 } 1847 } 1848 } 1849 1850 void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) 1851 { 1852 vfio_probe_ati_bar4_quirk(vdev, nr); 1853 vfio_probe_ati_bar2_quirk(vdev, nr); 1854 vfio_probe_nvidia_bar5_quirk(vdev, nr); 1855 vfio_probe_nvidia_bar0_quirk(vdev, nr); 1856 vfio_probe_rtl8168_bar2_quirk(vdev, nr); 1857 vfio_probe_igd_bar4_quirk(vdev, nr); 1858 } 1859 1860 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) 1861 { 1862 VFIOBAR *bar = &vdev->bars[nr]; 1863 VFIOQuirk *quirk; 1864 int i; 1865 1866 QLIST_FOREACH(quirk, &bar->quirks, next) { 1867 while (!QLIST_EMPTY(&quirk->ioeventfds)) { 1868 vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds)); 1869 } 1870 1871 for (i = 0; i < quirk->nr_mem; i++) { 1872 memory_region_del_subregion(bar->region.mem, &quirk->mem[i]); 1873 } 1874 } 1875 } 1876 1877 void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr) 1878 { 1879 VFIOBAR *bar = &vdev->bars[nr]; 1880 int i; 1881 1882 while (!QLIST_EMPTY(&bar->quirks)) { 
1883 VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); 1884 QLIST_REMOVE(quirk, next); 1885 for (i = 0; i < quirk->nr_mem; i++) { 1886 object_unparent(OBJECT(&quirk->mem[i])); 1887 } 1888 g_free(quirk->mem); 1889 g_free(quirk->data); 1890 g_free(quirk); 1891 } 1892 } 1893 1894 /* 1895 * Reset quirks 1896 */ 1897 void vfio_quirk_reset(VFIOPCIDevice *vdev) 1898 { 1899 int i; 1900 1901 for (i = 0; i < PCI_ROM_SLOT; i++) { 1902 VFIOQuirk *quirk; 1903 VFIOBAR *bar = &vdev->bars[i]; 1904 1905 QLIST_FOREACH(quirk, &bar->quirks, next) { 1906 if (quirk->reset) { 1907 quirk->reset(vdev, quirk); 1908 } 1909 } 1910 } 1911 } 1912 1913 /* 1914 * AMD Radeon PCI config reset, based on Linux: 1915 * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running() 1916 * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset 1917 * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc() 1918 * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock() 1919 * IDs: include/drm/drm_pciids.h 1920 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0 1921 * 1922 * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the 1923 * hardware that should be fixed on future ASICs. The symptom of this is that 1924 * once the accerlated driver loads, Windows guests will bsod on subsequent 1925 * attmpts to load the driver, such as after VM reset or shutdown/restart. To 1926 * work around this, we do an AMD specific PCI config reset, followed by an SMC 1927 * reset. The PCI config reset only works if SMC firmware is running, so we 1928 * have a dependency on the state of the device as to whether this reset will 1929 * be effective. There are still cases where we won't be able to kick the 1930 * device into working, but this greatly improves the usability overall. The 1931 * config reset magic is relatively common on AMD GPUs, but the setup and SMC 1932 * poking is largely ASIC specific. 
1933 */ 1934 static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev) 1935 { 1936 uint32_t clk, pc_c; 1937 1938 /* 1939 * Registers 200h and 204h are index and data registers for accessing 1940 * indirect configuration registers within the device. 1941 */ 1942 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); 1943 clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1944 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4); 1945 pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1946 1947 return (!(clk & 1) && (0x20100 <= pc_c)); 1948 } 1949 1950 /* 1951 * The scope of a config reset is controlled by a mode bit in the misc register 1952 * and a fuse, exposed as a bit in another register. The fuse is the default 1953 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the forumula 1954 * scope = !(misc ^ fuse), where the resulting scope is defined the same as 1955 * the fuse. A truth table therefore tells us that if misc == fuse, we need 1956 * to flip the value of the bit in the misc register. 
1957 */ 1958 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev) 1959 { 1960 uint32_t misc, fuse; 1961 bool a, b; 1962 1963 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4); 1964 fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1965 b = fuse & 64; 1966 1967 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4); 1968 misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1969 a = misc & 2; 1970 1971 if (a == b) { 1972 vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4); 1973 vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */ 1974 } 1975 } 1976 1977 static int vfio_radeon_reset(VFIOPCIDevice *vdev) 1978 { 1979 PCIDevice *pdev = &vdev->pdev; 1980 int i, ret = 0; 1981 uint32_t data; 1982 1983 /* Defer to a kernel implemented reset */ 1984 if (vdev->vbasedev.reset_works) { 1985 trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name); 1986 return -ENODEV; 1987 } 1988 1989 /* Enable only memory BAR access */ 1990 vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2); 1991 1992 /* Reset only works if SMC firmware is loaded and running */ 1993 if (!vfio_radeon_smc_is_running(vdev)) { 1994 ret = -EINVAL; 1995 trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name); 1996 goto out; 1997 } 1998 1999 /* Make sure only the GFX function is reset */ 2000 vfio_radeon_set_gfx_only_reset(vdev); 2001 2002 /* AMD PCI config reset */ 2003 vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4); 2004 usleep(100); 2005 2006 /* Read back the memory size to make sure we're out of reset */ 2007 for (i = 0; i < 100000; i++) { 2008 if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) { 2009 goto reset_smc; 2010 } 2011 usleep(1); 2012 } 2013 2014 trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name); 2015 2016 reset_smc: 2017 /* Reset SMC */ 2018 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4); 2019 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 2020 data |= 
1; 2021 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); 2022 2023 /* Disable SMC clock */ 2024 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); 2025 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 2026 data |= 1; 2027 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); 2028 2029 trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name); 2030 2031 out: 2032 /* Restore PCI command register */ 2033 vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2); 2034 2035 return ret; 2036 } 2037 2038 void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev) 2039 { 2040 switch (vdev->vendor_id) { 2041 case 0x1002: 2042 switch (vdev->device_id) { 2043 /* Bonaire */ 2044 case 0x6649: /* Bonaire [FirePro W5100] */ 2045 case 0x6650: 2046 case 0x6651: 2047 case 0x6658: /* Bonaire XTX [Radeon R7 260X] */ 2048 case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */ 2049 case 0x665d: /* Bonaire [Radeon R7 200 Series] */ 2050 /* Hawaii */ 2051 case 0x67A0: /* Hawaii XT GL [FirePro W9100] */ 2052 case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */ 2053 case 0x67A2: 2054 case 0x67A8: 2055 case 0x67A9: 2056 case 0x67AA: 2057 case 0x67B0: /* Hawaii XT [Radeon R9 290X] */ 2058 case 0x67B1: /* Hawaii PRO [Radeon R9 290] */ 2059 case 0x67B8: 2060 case 0x67B9: 2061 case 0x67BA: 2062 case 0x67BE: 2063 vdev->resetfn = vfio_radeon_reset; 2064 trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name); 2065 break; 2066 } 2067 break; 2068 } 2069 } 2070 2071 /* 2072 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify 2073 * devices as a member of a clique. Devices within the same clique ID 2074 * are capable of direct P2P. It's the user's responsibility that this 2075 * is correct. The spec says that this may reside at any unused config 2076 * offset, but reserves and recommends hypervisors place this at C8h. 
2077 * The spec also states that the hypervisor should place this capability 2078 * at the end of the capability list, thus next is defined as 0h. 2079 * 2080 * +----------------+----------------+----------------+----------------+ 2081 * | sig 7:0 ('P') | vndr len (8h) | next (0h) | cap id (9h) | 2082 * +----------------+----------------+----------------+----------------+ 2083 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)| sig 23:8 ('P2') | 2084 * +---------------------------------+---------------------------------+ 2085 * 2086 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf 2087 */ 2088 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v, 2089 const char *name, void *opaque, 2090 Error **errp) 2091 { 2092 DeviceState *dev = DEVICE(obj); 2093 Property *prop = opaque; 2094 uint8_t *ptr = qdev_get_prop_ptr(dev, prop); 2095 2096 visit_type_uint8(v, name, ptr, errp); 2097 } 2098 2099 static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v, 2100 const char *name, void *opaque, 2101 Error **errp) 2102 { 2103 DeviceState *dev = DEVICE(obj); 2104 Property *prop = opaque; 2105 uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop); 2106 Error *local_err = NULL; 2107 2108 if (dev->realized) { 2109 qdev_prop_set_after_realize(dev, name, errp); 2110 return; 2111 } 2112 2113 visit_type_uint8(v, name, &value, &local_err); 2114 if (local_err) { 2115 error_propagate(errp, local_err); 2116 return; 2117 } 2118 2119 if (value & ~0xF) { 2120 error_setg(errp, "Property %s: valid range 0-15", name); 2121 return; 2122 } 2123 2124 *ptr = value; 2125 } 2126 2127 const PropertyInfo qdev_prop_nv_gpudirect_clique = { 2128 .name = "uint4", 2129 .description = "NVIDIA GPUDirect Clique ID (0 - 15)", 2130 .get = get_nv_gpudirect_clique_id, 2131 .set = set_nv_gpudirect_clique_id, 2132 }; 2133 2134 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) 2135 { 2136 PCIDevice *pdev = &vdev->pdev; 2137 int ret, pos = 0xC8; 2138 2139 if 
(vdev->nv_gpudirect_clique == 0xFF) { 2140 return 0; 2141 } 2142 2143 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) { 2144 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor"); 2145 return -EINVAL; 2146 } 2147 2148 if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) != 2149 PCI_BASE_CLASS_DISPLAY) { 2150 error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class"); 2151 return -EINVAL; 2152 } 2153 2154 ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp); 2155 if (ret < 0) { 2156 error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: "); 2157 return ret; 2158 } 2159 2160 memset(vdev->emulated_config_bits + pos, 0xFF, 8); 2161 pos += PCI_CAP_FLAGS; 2162 pci_set_byte(pdev->config + pos++, 8); 2163 pci_set_byte(pdev->config + pos++, 'P'); 2164 pci_set_byte(pdev->config + pos++, '2'); 2165 pci_set_byte(pdev->config + pos++, 'P'); 2166 pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3); 2167 pci_set_byte(pdev->config + pos, 0); 2168 2169 return 0; 2170 } 2171 2172 int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp) 2173 { 2174 int ret; 2175 2176 ret = vfio_add_nv_gpudirect_cap(vdev, errp); 2177 if (ret) { 2178 return ret; 2179 } 2180 2181 return 0; 2182 } 2183