/*
 * device quirks for PCI devices
 *
 * Copyright Red Hat, Inc. 2012-2015
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "config-devices.h"
#include "exec/memop.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/range.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include <sys/ioctl.h>
#include "hw/hw.h"
#include "hw/nvram/fw_cfg.h"
#include "hw/qdev-properties.h"
#include "pci.h"
#include "trace.h"

/*
 * List of device ids/vendor ids for which to disable
 * option rom loading. This avoids the guest hangs during rom
 * execution as noticed with the BCM 57810 card for lack of a
 * better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang. When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and information that could
 * help in debugging to the bug tracking this issue
 */
static const struct {
    uint32_t vendor;
    uint32_t device;
} romblacklist[] = {
    { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
};

/*
 * Return true if this device's vendor/device ID pair appears in the
 * romblacklist table above, in which case the option ROM should not be
 * exposed to the guest by default.
 */
bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) {
        if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) {
            trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name,
                                             romblacklist[i].vendor,
                                             romblacklist[i].device);
            return true;
        }
    }
    return false;
}

/*
 * Device specific region quirks (mostly backdoors to PCI config space)
 */

/*
 * The generic window quirks operate on an address and data register,
 * vfio_generic_window_address_quirk handles the address register and
 * vfio_generic_window_data_quirk handles the data register. These ops
 * pass reads and writes through to hardware until a value matching the
 * stored address match/mask is written. When this occurs, the data
 * register accesses emulated PCI config space for the device rather than
 * passing through accesses. This enables devices where PCI config space
 * is accessible behind a window register to maintain the virtualization
 * provided through vfio.
 */
/* One acceptance pattern for the window address register: addr & ~mask
 * must equal match for the window to be considered enabled. */
typedef struct VFIOConfigWindowMatch {
    uint32_t match;
    uint32_t mask;
} VFIOConfigWindowMatch;

typedef struct VFIOConfigWindowQuirk {
    struct VFIOPCIDevice *vdev;

    uint32_t address_val;   /* config-space offset latched while enabled */

    uint32_t address_offset; /* offset of address register within the BAR */
    uint32_t data_offset;    /* offset of data register within the BAR */

    bool window_enabled;     /* last address write matched a pattern below */
    uint8_t bar;

    MemoryRegion *addr_mem;
    MemoryRegion *data_mem;

    uint32_t nr_matches;
    VFIOConfigWindowMatch matches[]; /* flexible array, nr_matches entries */
} VFIOConfigWindowQuirk;

/* Reads of the address register always pass straight through to hardware. */
static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
                                                       hwaddr addr,
                                                       unsigned size)
{
    VFIOConfigWindowQuirk *window = opaque;
    VFIOPCIDevice *vdev = window->vdev;

    return vfio_region_read(&vdev->bars[window->bar].region,
                            addr + window->address_offset, size);
}

/*
 * Writes to the address register pass through to hardware, then enable the
 * window (and latch the config-space offset) if the written value matches
 * one of the registered match/mask patterns.
 */
static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
                                                    uint64_t data,
                                                    unsigned size)
{
    VFIOConfigWindowQuirk *window = opaque;
    VFIOPCIDevice *vdev = window->vdev;
    int i;

    /* Any non-matching write disables the window */
    window->window_enabled = false;

    vfio_region_write(&vdev->bars[window->bar].region,
                      addr + window->address_offset, data, size);

    for (i = 0; i < window->nr_matches; i++) {
        if ((data & ~window->matches[i].mask) == window->matches[i].match) {
            window->window_enabled = true;
            window->address_val = data & window->matches[i].mask;
            trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
                                    memory_region_name(window->addr_mem), data);
            break;
        }
    }
}

static const MemoryRegionOps vfio_generic_window_address_quirk = {
    .read = vfio_generic_window_quirk_address_read,
    .write = vfio_generic_window_quirk_address_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Data register read: when the window is enabled, return emulated config
 * space at the latched offset instead of the hardware value.
 */
static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
                                                    hwaddr addr, unsigned size)
{
    VFIOConfigWindowQuirk *window = opaque;
    VFIOPCIDevice *vdev = window->vdev;
    uint64_t data;

    /* Always read data reg, discard if window enabled */
    data = vfio_region_read(&vdev->bars[window->bar].region,
                            addr + window->data_offset, size);

    if (window->window_enabled) {
        data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
        trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
                                    memory_region_name(window->data_mem), data);
    }

    return data;
}

/*
 * Data register write: when the window is enabled, redirect to emulated
 * config space (hardware is NOT written in that case); otherwise pass
 * through.
 */
static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
                                                 uint64_t data, unsigned size)
{
    VFIOConfigWindowQuirk *window = opaque;
    VFIOPCIDevice *vdev = window->vdev;

    if (window->window_enabled) {
        vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
        trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
                                    memory_region_name(window->data_mem), data);
        return;
    }

    vfio_region_write(&vdev->bars[window->bar].region,
                      addr + window->data_offset, data, size);
}

static const MemoryRegionOps vfio_generic_window_data_quirk = {
    .read = vfio_generic_window_quirk_data_read,
    .write = vfio_generic_window_quirk_data_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * The generic mirror quirk handles devices which expose PCI config space
 * through a region within a BAR. When enabled, reads and writes are
 * redirected through to emulated PCI config space. XXX if PCI config space
 * used memory regions, this could just be an alias.
 */
typedef struct VFIOConfigMirrorQuirk {
    struct VFIOPCIDevice *vdev;
    uint32_t offset;    /* offset of the mirror within the BAR */
    uint8_t bar;
    MemoryRegion *mem;
    uint8_t data[];     /* scratch space for users of the quirk (flexible) */
} VFIOConfigMirrorQuirk;

/* Mirror read: return emulated config space; hardware read is discarded. */
static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
                                               hwaddr addr, unsigned size)
{
    VFIOConfigMirrorQuirk *mirror = opaque;
    VFIOPCIDevice *vdev = mirror->vdev;
    uint64_t data;

    /* Read and discard in case the hardware cares */
    (void)vfio_region_read(&vdev->bars[mirror->bar].region,
                           addr + mirror->offset, size);

    data = vfio_pci_read_config(&vdev->pdev, addr, size);
    trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
                                         memory_region_name(mirror->mem),
                                         addr, data);
    return data;
}

/* Mirror write: redirect to emulated config space only. */
static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
                                            uint64_t data, unsigned size)
{
    VFIOConfigMirrorQuirk *mirror = opaque;
    VFIOPCIDevice *vdev = mirror->vdev;

    vfio_pci_write_config(&vdev->pdev, addr, data, size);
    trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
                                          memory_region_name(mirror->mem),
                                          addr, data);
}

static const MemoryRegionOps vfio_generic_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_generic_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/* Is range1 fully contained within range2? */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2) {
    return (first1 >= first2 && first1 + len1 <= first2 + len2);
}

#define PCI_VENDOR_ID_ATI 0x1002

/*
 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always
 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the
 * I/O port BAR address. Originally this was coded to return the virtual BAR
 * address only if the physical register read returns the actual BAR address,
 * but users have reported greater success if we return the virtual address
 * unconditionally.
 */
static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOPCIDevice *vdev = opaque;
    /* Upper byte of the (emulated) I/O port BAR4 address */
    uint64_t data = vfio_pci_read_config(&vdev->pdev,
                                         PCI_BASE_ADDRESS_4 + 1, size);

    trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);

    return data;
}

static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Allocate a VFIOQuirk with nr_mem zero-initialized MemoryRegions and an
 * empty ioeventfd list. Caller owns the returned quirk.
 */
VFIOQuirk *vfio_quirk_alloc(int nr_mem)
{
    VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
    QLIST_INIT(&quirk->ioeventfds);
    quirk->mem = g_new0(MemoryRegion, nr_mem);
    quirk->nr_mem = nr_mem;

    return quirk;
}

/*
 * Tear down an ioeventfd: unlink it, remove the KVM eventfd binding, undo
 * either the in-kernel vfio acceleration (fd = -1 unregisters) or the
 * userspace fd handler, and free it.
 */
static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
{
    QLIST_REMOVE(ioeventfd, next);
    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);

    if (ioeventfd->vfio) {
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        vfio_ioeventfd.fd = -1; /* -1 asks the kernel to remove the binding */

        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
            error_report("Failed to remove vfio ioeventfd for %s+0x%"
                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
                         ioeventfd->size, ioeventfd->data);
        }
    } else {
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            NULL, NULL, NULL);
    }

    event_notifier_cleanup(&ioeventfd->e);
    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
                              (uint64_t)ioeventfd->addr, ioeventfd->size,
                              ioeventfd->data);
    g_free(ioeventfd);
}

/* Remove only the dynamically-added ioeventfds on a quirk (used on reset). */
static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
{
    VFIOIOEventFD *ioeventfd, *tmp;

    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
        if (ioeventfd->dynamic) {
            vfio_ioeventfd_exit(vdev, ioeventfd);
        }
    }
}

/*
 * Userspace fallback handler: replay the matched write to the device region
 * when the kernel-side vfio ioeventfd acceleration is unavailable.
 */
static void vfio_ioeventfd_handler(void *opaque)
{
    VFIOIOEventFD *ioeventfd = opaque;

    if (event_notifier_test_and_clear(&ioeventfd->e)) {
        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
                          ioeventfd->data, ioeventfd->size);
        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
                                     (uint64_t)ioeventfd->addr, ioeventfd->size,
                                     ioeventfd->data);
    }
}

/*
 * Create an ioeventfd matching a specific (addr, data, size) write in mr,
 * preferring in-kernel vfio handling with a userspace handler as fallback.
 * Returns NULL if KVM ioeventfds are disabled or notifier init fails.
 */
static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
                                          MemoryRegion *mr, hwaddr addr,
                                          unsigned size, uint64_t data,
                                          VFIORegion *region,
                                          hwaddr region_addr, bool dynamic)
{
    VFIOIOEventFD *ioeventfd;

    if (vdev->no_kvm_ioeventfd) {
        return NULL;
    }

    ioeventfd = g_malloc0(sizeof(*ioeventfd));

    if (event_notifier_init(&ioeventfd->e, 0)) {
        g_free(ioeventfd);
        return NULL;
    }

    /*
     * MemoryRegion and relative offset, plus additional ioeventfd setup
     * parameters for configuring and later tearing down KVM ioeventfd.
     */
    ioeventfd->mr = mr;
    ioeventfd->addr = addr;
    ioeventfd->size = size;
    ioeventfd->data = data;
    ioeventfd->dynamic = dynamic;
    /*
     * VFIORegion and relative offset for implementing the userspace
     * handler. data & size fields shared for both uses.
     */
    ioeventfd->region = region;
    ioeventfd->region_addr = region_addr;

    if (!vdev->no_vfio_ioeventfd) {
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);

        /* ->vfio records whether the kernel accepted the acceleration */
        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
    }

    if (!ioeventfd->vfio) {
        /* Fall back to handling the eventfd in QEMU userspace */
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            vfio_ioeventfd_handler, NULL, ioeventfd);
    }

    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);
    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
                              size, data, ioeventfd->vfio);

    return ioeventfd;
}

/* Install the 0x3c3 quirk for ATI cards with an I/O port BAR4. */
static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;

    /*
     * As long as the BAR is >= 256 bytes it will be aligned such that the
     * lower byte is always zero. Filter out anything else, if it exists.
     */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
        return;
    }

    quirk = vfio_quirk_alloc(1);

    memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
                          "vfio-ati-3c3-quirk", 1);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                3 /* offset 3 bytes from 0x3c0 */, quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
}

/*
 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
 * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access
 * the MMIO space directly, but a window to this space is provided through
 * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the
 * data register. When the address is programmed to a range of 0x4000-0x4fff
 * PCI configuration space is available. Experimentation seems to indicate
 * that read-only may be provided by hardware.
 */
static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigWindowQuirk *window;

    /* This window doesn't seem to be used except by legacy VGA code */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->vga || nr != 4) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    /* One trailing VFIOConfigWindowMatch for the single match entry below */
    window = quirk->data = g_malloc0(sizeof(*window) +
                                     sizeof(VFIOConfigWindowMatch));
    window->vdev = vdev;
    window->address_offset = 0;
    window->data_offset = 4;
    window->nr_matches = 1;
    window->matches[0].match = 0x4000;
    window->matches[0].mask = vdev->config_size - 1;
    window->bar = nr;
    window->addr_mem = &quirk->mem[0];
    window->data_mem = &quirk->mem[1];

    memory_region_init_io(window->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, window,
                          "vfio-ati-bar4-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->address_offset,
                                        window->addr_mem, 1);

    memory_region_init_io(window->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, window,
                          "vfio-ati-bar4-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->data_offset,
                                        window->data_mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
}

/*
 * Trap the BAR2 MMIO mirror to config space as well.
 */
static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;

    /* Only enable on newer devices where BAR2 is 64bit */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
        return;
    }

    quirk = vfio_quirk_alloc(1);
    mirror = quirk->data = g_malloc0(sizeof(*mirror));
    mirror->mem = quirk->mem;
    mirror->vdev = vdev;
    mirror->offset = 0x4000;
    mirror->bar = nr;

    memory_region_init_io(mirror->mem, OBJECT(vdev),
                          &vfio_generic_mirror_quirk, mirror,
                          "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        mirror->offset, mirror->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
}

/*
 * Older ATI/AMD cards like the X550 have a similar window to that above.
 * I/O port BAR1 provides a window to a mirror of PCI config space located
 * in BAR2 at offset 0xf00. We don't care to support such older cards, but
 * note it for future reference.
 */

/*
 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
 * https://github.com/pathscale/envytools/tree/master/hwdocs
 *
 * The first quirk is actually not documented in envytools and is found
 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an
 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access
 * the mirror of PCI config space found at BAR0 offset 0x1800. The access
 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is
 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738
 * is written for a write to 0x3d4. The BAR0 offset is then accessible
 * through 0x3d0.
This quirk doesn't seem to be necessary on newer cards
 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 */
/* State machine for the 3d4/3d0 backdoor: SELECT -> WINDOW -> READ/WRITE */
typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
static const char *nv3d0_states[] = { "NONE", "SELECT",
                                      "WINDOW", "READ", "WRITE" };

typedef struct VFIONvidia3d0Quirk {
    VFIOPCIDevice *vdev;
    VFIONvidia3d0State state;
    uint32_t offset;    /* BAR0 offset latched via the 3d0 port */
} VFIONvidia3d0Quirk;

/* Any read of 0x3d4 resets the backdoor state machine; pass through. */
static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    quirk->state = NONE;

    return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                         addr + 0x14, size);
}

/*
 * 0x3d4 writes drive the state machine: 0x338 starts a sequence, 0x538
 * arms a read, 0x738 arms a write. The write always passes through.
 */
static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;

    quirk->state = NONE;

    switch (data) {
    case 0x338:
        if (old_state == NONE) {
            quirk->state = SELECT;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    case 0x538:
        if (old_state == WINDOW) {
            quirk->state = READ;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    case 0x738:
        if (old_state == WINDOW) {
            quirk->state = WRITE;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    }

    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                   addr + 0x14, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
    .read = vfio_nvidia_3d4_quirk_read,
    .write = vfio_nvidia_3d4_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * 0x3d0 read: in READ state with a latched offset in the 0x1800 config
 * mirror, return emulated config space instead of the hardware value.
 */
static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;
    uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                                  addr + 0x10, size);

    quirk->state = NONE;

    if (old_state == READ &&
        (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
        uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);

        data = vfio_pci_read_config(&vdev->pdev, offset, size);
        trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
                                         offset, size, data);
    }

    return data;
}

/*
 * 0x3d0 write: in SELECT state, latch the target offset; in WRITE state
 * with an offset in the 0x1800 mirror, redirect to emulated config space
 * (and suppress the hardware write); otherwise pass through.
 */
static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;

    quirk->state = NONE;

    if (old_state == SELECT) {
        quirk->offset = (uint32_t)data;
        quirk->state = WINDOW;
        trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                          nv3d0_states[quirk->state]);
    } else if (old_state == WRITE) {
        if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
            uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);

            vfio_pci_write_config(&vdev->pdev, offset, data, size);
            trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
                                              offset, data, size);
            return;
        }
    }

    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                   addr + 0x10, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/* Install the 3d4/3d0 VGA backdoor quirk on NVIDIA VGA devices. */
static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;
    VFIONvidia3d0Quirk *data;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vdev->bars[1].region.size) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    quirk->data = data = g_malloc0(sizeof(*data));
    data->vdev = vdev;

    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
                          data, "vfio-nvidia-3d4-quirk", 2);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);

    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
                          data, "vfio-nvidia-3d0-quirk", 2);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);

    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
}

/*
 * The second quirk is documented in envytools. The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs. The BAR we care about is
 * again BAR0. This backdoor is apparently a bit newer than the one above
 * so we need to not only trap 256 bytes @0x1800, but all of PCI config
 * space, including extended space is available at the 4k @0x88000.
 */
typedef struct VFIONvidiaBAR5Quirk {
    uint32_t master;    /* last value written to the master register (0x0) */
    uint32_t enable;    /* last value written to the enable register (0x4) */
    MemoryRegion *addr_mem;
    MemoryRegion *data_mem;
    bool enabled;       /* window subregions currently enabled */
    VFIOConfigWindowQuirk window; /* last for match data */
} VFIONvidiaBAR5Quirk;

/*
 * Enable/disable the BAR5 window subregions when bit 0 of (master & enable)
 * changes relative to the current enabled state.
 */
static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
{
    VFIOPCIDevice *vdev = bar5->window.vdev;

    if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
        return;
    }

    bar5->enabled = !bar5->enabled;
    trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
                                       bar5->enabled ?  "Enable" : "Disable");
    memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
    memory_region_set_enabled(bar5->data_mem, bar5->enabled);
}

static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
                                                   hwaddr addr, unsigned size)
{
    VFIONvidiaBAR5Quirk *bar5 = opaque;
    VFIOPCIDevice *vdev = bar5->window.vdev;

    return vfio_region_read(&vdev->bars[5].region, addr, size);
}

/* Pass through, then track the master value for window enablement. */
static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
                                                uint64_t data, unsigned size)
{
    VFIONvidiaBAR5Quirk *bar5 = opaque;
    VFIOPCIDevice *vdev = bar5->window.vdev;

    vfio_region_write(&vdev->bars[5].region, addr, data, size);

    bar5->master = data;
    vfio_nvidia_bar5_enable(bar5);
}

static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
    .read = vfio_nvidia_bar5_quirk_master_read,
    .write = vfio_nvidia_bar5_quirk_master_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
                                                   hwaddr addr, unsigned size)
{
    VFIONvidiaBAR5Quirk *bar5 = opaque;
    VFIOPCIDevice *vdev = bar5->window.vdev;

    return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
}

/* Pass through, then track the enable value for window enablement. */
static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
                                                uint64_t data, unsigned size)
{
    VFIONvidiaBAR5Quirk *bar5 = opaque;
    VFIOPCIDevice *vdev = bar5->window.vdev;

    vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);

    bar5->enable = data;
    vfio_nvidia_bar5_enable(bar5);
}

static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
    .read = vfio_nvidia_bar5_quirk_enable_read,
    .write = vfio_nvidia_bar5_quirk_enable_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/* Install the BAR5 address/data window quirk on NVIDIA VGA devices. */
static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIONvidiaBAR5Quirk *bar5;
    VFIOConfigWindowQuirk *window;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
        return;
    }

    quirk = vfio_quirk_alloc(4);
    /* Two trailing match entries for window (it must be last in bar5) */
    bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
                                   (sizeof(VFIOConfigWindowMatch) * 2));
    window = &bar5->window;

    window->vdev = vdev;
    window->address_offset = 0x8;
    window->data_offset = 0xc;
    window->nr_matches = 2;
    window->matches[0].match = 0x1800;
    window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
    window->matches[1].match = 0x88000;
    window->matches[1].mask = vdev->config_size - 1;
    window->bar = nr;
    window->addr_mem = bar5->addr_mem = &quirk->mem[0];
    window->data_mem = bar5->data_mem = &quirk->mem[1];

    memory_region_init_io(window->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, window,
                          "vfio-nvidia-bar5-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->address_offset,
                                        window->addr_mem, 1);
    /* Windows start disabled until the master/enable registers allow them */
    memory_region_set_enabled(window->addr_mem, false);

    memory_region_init_io(window->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, window,
                          "vfio-nvidia-bar5-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->data_offset,
                                        window->data_mem, 1);
    memory_region_set_enabled(window->data_mem, false);

    memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
                          &vfio_nvidia_bar5_quirk_master, bar5,
                          "vfio-nvidia-bar5-master-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0, &quirk->mem[2], 1);

    memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
                          &vfio_nvidia_bar5_quirk_enable, bar5,
                          "vfio-nvidia-bar5-enable-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        4, &quirk->mem[3], 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
}

/* Tracks repeated identical writes so we can promote them to ioeventfds */
typedef struct LastDataSet {
    VFIOQuirk *quirk;
    hwaddr addr;
    uint64_t data;
    unsigned size;
    int hits;
    int added;
} LastDataSet;

#define MAX_DYN_IOEVENTFD 10
#define HITS_FOR_IOEVENTFD 10

/*
 * Finally, BAR0 itself. We want to redirect any accesses to either
 * 0x1800 or 0x88000 through the PCI config space access functions.
 */
static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
                                           uint64_t data, unsigned size)
{
    VFIOConfigMirrorQuirk *mirror = opaque;
    VFIOPCIDevice *vdev = mirror->vdev;
    PCIDevice *pdev = &vdev->pdev;
    LastDataSet *last = (LastDataSet *)&mirror->data;

    vfio_generic_quirk_mirror_write(opaque, addr, data, size);

    /*
     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
     * MSI capability ID register. Both the ID and next register are
     * read-only, so we allow writes covering either of those to real hw.
     */
    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
        vfio_region_write(&vdev->bars[mirror->bar].region,
                          addr + mirror->offset, data, size);
        trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
    }

    /*
     * Automatically add an ioeventfd to handle any repeated write with the
     * same data and size above the standard PCI config space header. This is
     * primarily expected to accelerate the MSI-ACK behavior, such as noted
     * above. Current hardware/drivers should trigger an ioeventfd at config
     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
     *
     * The criteria of 10 successive hits is arbitrary but reliably adds the
     * MSI-ACK region. Note that as some writes are bypassed via the ioeventfd,
     * the remaining ones have a greater chance of being seen successively.
     * To avoid the pathological case of burning up all of QEMU's open file
     * handles, arbitrarily limit this algorithm from adding no more than 10
     * ioeventfds, print an error if we would have added an 11th, and then
     * stop counting.
     */
    if (!vdev->no_kvm_ioeventfd &&
        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
        if (addr != last->addr || data != last->data || size != last->size) {
            /* Different write; restart the hit counter */
            last->addr = addr;
            last->data = data;
            last->size = size;
            last->hits = 1;
        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
            if (last->added < MAX_DYN_IOEVENTFD) {
                VFIOIOEventFD *ioeventfd;
                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
                                        data, &vdev->bars[mirror->bar].region,
                                        mirror->offset + addr, true);
                if (ioeventfd) {
                    VFIOQuirk *quirk = last->quirk;

                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
                    last->added++;
                }
            } else {
                /* Bump past the limit so we warn exactly once */
                last->added++;
                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
                            "size %u", vdev->vbasedev.name, addr, data, size);
            }
        }
    }
}

static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_nvidia_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/* Device reset: clear write tracking and drop dynamically-added ioeventfds */
static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
{
    VFIOConfigMirrorQuirk *mirror = quirk->data;
    LastDataSet *last = (LastDataSet *)&mirror->data;

    last->addr = last->data = last->size = last->hits = last->added = 0;

    vfio_drop_dynamic_eventfds(vdev, quirk);
}

/* Install the BAR0 config space mirror quirks (0x88000, plus 0x1800 w/ VGA) */
static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;
    LastDataSet *last;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vfio_is_vga(vdev) || nr != 0) {
        return;
    }

    quirk = vfio_quirk_alloc(1);
    quirk->reset = vfio_nvidia_bar0_quirk_reset;
    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
    mirror->mem = quirk->mem;
    mirror->vdev = vdev;
    mirror->offset = 0x88000;
    mirror->bar = nr;
    last = (LastDataSet *)&mirror->data;
    last->quirk = quirk;

    memory_region_init_io(mirror->mem, OBJECT(vdev),
                          &vfio_nvidia_mirror_quirk, mirror,
                          "vfio-nvidia-bar0-88000-mirror-quirk",
                          vdev->config_size);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        mirror->offset, mirror->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    /* The 0x1800 offset mirror only seems to get used by legacy VGA */
    if (vdev->vga) {
        quirk = vfio_quirk_alloc(1);
        quirk->reset = vfio_nvidia_bar0_quirk_reset;
        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
        mirror->mem = quirk->mem;
        mirror->vdev = vdev;
        mirror->offset = 0x1800;
        mirror->bar = nr;
        last = (LastDataSet *)&mirror->data;
        last->quirk = quirk;

        memory_region_init_io(mirror->mem, OBJECT(vdev),
                              &vfio_nvidia_mirror_quirk, mirror,
                              "vfio-nvidia-bar0-1800-mirror-quirk",
                              PCI_CONFIG_SPACE_SIZE);
        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                            mirror->offset, mirror->mem, 1);

        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
    }

    trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
}

/*
 * TODO - Some Nvidia devices provide config access to their companion HDA
 * device and even to their parent bridge via these config space mirrors.
 * Add quirks for those regions.
 */

#define PCI_VENDOR_ID_REALTEK 0x10ec

/*
 * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2
 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
 * register.
According to the Linux r8169 driver, the MSI-X table is addressed 1001 * when the "type" portion of the address register is set to 0x1. This appears 1002 * to be bits 16:30. Bit 31 is both a write indicator and some sort of 1003 * "address latched" indicator. Bits 12:15 are a mask field, which we can 1004 * ignore because the MSI-X table should always be accessed as a dword (full 1005 * mask). Bits 0:11 is offset within the type. 1006 * 1007 * Example trace: 1008 * 1009 * Read from MSI-X table offset 0 1010 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr 1011 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch 1012 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data 1013 * 1014 * Write 0xfee00000 to MSI-X table offset 0 1015 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data 1016 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write 1017 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete 1018 */ 1019 typedef struct VFIOrtl8168Quirk { 1020 VFIOPCIDevice *vdev; 1021 uint32_t addr; 1022 uint32_t data; 1023 bool enabled; 1024 } VFIOrtl8168Quirk; 1025 1026 static uint64_t vfio_rtl8168_quirk_address_read(void *opaque, 1027 hwaddr addr, unsigned size) 1028 { 1029 VFIOrtl8168Quirk *rtl = opaque; 1030 VFIOPCIDevice *vdev = rtl->vdev; 1031 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size); 1032 1033 if (rtl->enabled) { 1034 data = rtl->addr ^ 0x80000000U; /* latch/complete */ 1035 trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data); 1036 } 1037 1038 return data; 1039 } 1040 1041 static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr, 1042 uint64_t data, unsigned size) 1043 { 1044 VFIOrtl8168Quirk *rtl = opaque; 1045 VFIOPCIDevice *vdev = rtl->vdev; 1046 1047 rtl->enabled = false; 1048 1049 if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */ 1050 rtl->enabled = true; 1051 rtl->addr = 
(uint32_t)data; 1052 1053 if (data & 0x80000000U) { /* Do write */ 1054 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { 1055 hwaddr offset = data & 0xfff; 1056 uint64_t val = rtl->data; 1057 1058 trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name, 1059 (uint16_t)offset, val); 1060 1061 /* Write to the proper guest MSI-X table instead */ 1062 memory_region_dispatch_write(&vdev->pdev.msix_table_mmio, 1063 offset, val, 1064 size_memop(size) | MO_LE, 1065 MEMTXATTRS_UNSPECIFIED); 1066 } 1067 return; /* Do not write guest MSI-X data to hardware */ 1068 } 1069 } 1070 1071 vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size); 1072 } 1073 1074 static const MemoryRegionOps vfio_rtl_address_quirk = { 1075 .read = vfio_rtl8168_quirk_address_read, 1076 .write = vfio_rtl8168_quirk_address_write, 1077 .valid = { 1078 .min_access_size = 4, 1079 .max_access_size = 4, 1080 .unaligned = false, 1081 }, 1082 .endianness = DEVICE_LITTLE_ENDIAN, 1083 }; 1084 1085 static uint64_t vfio_rtl8168_quirk_data_read(void *opaque, 1086 hwaddr addr, unsigned size) 1087 { 1088 VFIOrtl8168Quirk *rtl = opaque; 1089 VFIOPCIDevice *vdev = rtl->vdev; 1090 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size); 1091 1092 if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { 1093 hwaddr offset = rtl->addr & 0xfff; 1094 memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset, 1095 &data, size_memop(size) | MO_LE, 1096 MEMTXATTRS_UNSPECIFIED); 1097 trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data); 1098 } 1099 1100 return data; 1101 } 1102 1103 static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr, 1104 uint64_t data, unsigned size) 1105 { 1106 VFIOrtl8168Quirk *rtl = opaque; 1107 VFIOPCIDevice *vdev = rtl->vdev; 1108 1109 rtl->data = (uint32_t)data; 1110 1111 vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size); 1112 } 1113 1114 static const MemoryRegionOps vfio_rtl_data_quirk = { 1115 .read = 
vfio_rtl8168_quirk_data_read, 1116 .write = vfio_rtl8168_quirk_data_write, 1117 .valid = { 1118 .min_access_size = 4, 1119 .max_access_size = 4, 1120 .unaligned = false, 1121 }, 1122 .endianness = DEVICE_LITTLE_ENDIAN, 1123 }; 1124 1125 static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) 1126 { 1127 VFIOQuirk *quirk; 1128 VFIOrtl8168Quirk *rtl; 1129 1130 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) { 1131 return; 1132 } 1133 1134 quirk = vfio_quirk_alloc(2); 1135 quirk->data = rtl = g_malloc0(sizeof(*rtl)); 1136 rtl->vdev = vdev; 1137 1138 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), 1139 &vfio_rtl_address_quirk, rtl, 1140 "vfio-rtl8168-window-address-quirk", 4); 1141 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1142 0x74, &quirk->mem[0], 1); 1143 1144 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), 1145 &vfio_rtl_data_quirk, rtl, 1146 "vfio-rtl8168-window-data-quirk", 4); 1147 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1148 0x70, &quirk->mem[1], 1); 1149 1150 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 1151 1152 trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); 1153 } 1154 1155 #define IGD_ASLS 0xfc /* ASL Storage Register */ 1156 1157 /* 1158 * The OpRegion includes the Video BIOS Table, which seems important for 1159 * telling the driver what sort of outputs it has. Without this, the device 1160 * may work in the guest, but we may not get output. This also requires BIOS 1161 * support to reserve and populate a section of guest memory sufficient for 1162 * the table and to write the base address of that memory to the ASLS register 1163 * of the IGD device. 
1164 */ 1165 int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, 1166 struct vfio_region_info *info, Error **errp) 1167 { 1168 int ret; 1169 1170 vdev->igd_opregion = g_malloc0(info->size); 1171 ret = pread(vdev->vbasedev.fd, vdev->igd_opregion, 1172 info->size, info->offset); 1173 if (ret != info->size) { 1174 error_setg(errp, "failed to read IGD OpRegion"); 1175 g_free(vdev->igd_opregion); 1176 vdev->igd_opregion = NULL; 1177 return -EINVAL; 1178 } 1179 1180 /* 1181 * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to 1182 * allocate 32bit reserved memory for, copy these contents into, and write 1183 * the reserved memory base address to the device ASLS register at 0xFC. 1184 * Alignment of this reserved region seems flexible, but using a 4k page 1185 * alignment seems to work well. This interface assumes a single IGD 1186 * device, which may be at VM address 00:02.0 in legacy mode or another 1187 * address in UPT mode. 1188 * 1189 * NB, there may be future use cases discovered where the VM should have 1190 * direct interaction with the host OpRegion, in which case the write to 1191 * the ASLS register would trigger MemoryRegion setup to enable that. 1192 */ 1193 fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion", 1194 vdev->igd_opregion, info->size); 1195 1196 trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); 1197 1198 pci_set_long(vdev->pdev.config + IGD_ASLS, 0); 1199 pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); 1200 pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); 1201 1202 return 0; 1203 } 1204 1205 /* 1206 * Common quirk probe entry points. 
 */
/* Probe all VGA-space quirks for this device */
void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}

/*
 * Detach VGA quirk MemoryRegions from their parents.  The quirk objects
 * themselves stay on the lists until vfio_vga_quirk_finalize().
 */
void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;
    int i, j;

    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
        QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
            for (j = 0; j < quirk->nr_mem; j++) {
                memory_region_del_subregion(&vdev->vga->region[i].mem,
                                            &quirk->mem[j]);
            }
        }
    }
}

/* Free all VGA quirks: unparent the regions, then release the memory */
void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
{
    int i, j;

    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
        while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
            VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
            QLIST_REMOVE(quirk, next);
            for (j = 0; j < quirk->nr_mem; j++) {
                object_unparent(OBJECT(&quirk->mem[j]));
            }
            g_free(quirk->mem);
            g_free(quirk->data);
            g_free(quirk);
        }
    }
}

/*
 * Probe all BAR quirks for BAR @nr; each probe function checks vendor,
 * device, and BAR number itself and is a no-op when it doesn't apply.
 */
void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
{
    vfio_probe_ati_bar4_quirk(vdev, nr);
    vfio_probe_ati_bar2_quirk(vdev, nr);
    vfio_probe_nvidia_bar5_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_quirk(vdev, nr);
    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
#ifdef CONFIG_VFIO_IGD
    vfio_probe_igd_bar4_quirk(vdev, nr);
#endif
}

/*
 * Tear down ioeventfds and detach quirk MemoryRegions for BAR @nr.
 * Quirk structures remain allocated until vfio_bar_quirk_finalize().
 */
void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    VFIOQuirk *quirk;
    int i;

    QLIST_FOREACH(quirk, &bar->quirks, next) {
        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
        }

        for (i = 0; i < quirk->nr_mem; i++) {
            memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
        }
    }
}

/* Free all quirks attached to BAR @nr */
void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    int i;

    while (!QLIST_EMPTY(&bar->quirks)) {
        VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
        QLIST_REMOVE(quirk, next);
        for (i = 0; i < quirk->nr_mem; i++) {
            object_unparent(OBJECT(&quirk->mem[i]));
        }
        g_free(quirk->mem);
        g_free(quirk->data);
        g_free(quirk);
    }
}

/*
 * Reset quirks
 */
/* Invoke the per-quirk reset hook, if any, for every BAR quirk */
void vfio_quirk_reset(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOQuirk *quirk;
        VFIOBAR *bar = &vdev->bars[i];

        QLIST_FOREACH(quirk, &bar->quirks, next) {
            if (quirk->reset) {
                quirk->reset(vdev, quirk);
            }
        }
    }
}

/*
 * AMD Radeon PCI config reset, based on Linux:
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
 *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
 * IDs: include/drm/drm_pciids.h
 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
 *
 * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
 * hardware that should be fixed on future ASICs.  The symptom of this is that
 * once the accelerated driver loads, Windows guests will bsod on subsequent
 * attempts to load the driver, such as after VM reset or shutdown/restart.  To
 * work around this, we do an AMD specific PCI config reset, followed by an SMC
 * reset.  The PCI config reset only works if SMC firmware is running, so we
 * have a dependency on the state of the device as to whether this reset will
 * be effective.  There are still cases where we won't be able to kick the
 * device into working, but this greatly improves the usability overall.  The
 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
 * poking is largely ASIC specific.
 */
/*
 * Probe the SMC via BAR5 indirect registers: clock must be ungated
 * (bit 0 of reg 0x80000004 clear) and the program counter must be past
 * the bootstrap range (>= 0x20100) for firmware to be considered running.
 */
static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
{
    uint32_t clk, pc_c;

    /*
     * Registers 200h and 204h are index and data registers for accessing
     * indirect configuration registers within the device.
     */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);

    return (!(clk & 1) && (0x20100 <= pc_c));
}

/*
 * The scope of a config reset is controlled by a mode bit in the misc register
 * and a fuse, exposed as a bit in another register.  The fuse is the default
 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
 * the fuse.  A truth table therefore tells us that if misc == fuse, we need
 * to flip the value of the bit in the misc register.
 */
/*
 * Ensure the upcoming config reset is scoped to the GFX function only,
 * per the misc/fuse truth table described above.
 */
static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
{
    uint32_t misc, fuse;
    bool a, b;

    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    b = fuse & 64;

    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    a = misc & 2;

    if (a == b) {
        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
    }
}

/*
 * Device reset handler for Bonaire/Hawaii: AMD-specific PCI config reset
 * followed by an SMC reset.  Returns 0 on success, -ENODEV when a kernel
 * reset exists (preferred), -EINVAL when SMC firmware isn't running.
 */
static int vfio_radeon_reset(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    int i, ret = 0;
    uint32_t data;

    /* Defer to a kernel implemented reset */
    if (vdev->vbasedev.reset_works) {
        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
        return -ENODEV;
    }

    /* Enable only memory BAR access */
    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);

    /* Reset only works if SMC firmware is loaded and running */
    if (!vfio_radeon_smc_is_running(vdev)) {
        ret = -EINVAL;
        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
        goto out;
    }

    /* Make sure only the GFX function is reset */
    vfio_radeon_set_gfx_only_reset(vdev);

    /* AMD PCI config reset */
    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
    usleep(100);

    /* Read back the memory size to make sure we're out of reset */
    for (i = 0; i < 100000; i++) {
        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
            goto reset_smc;
        }
        usleep(1);
    }

    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);

reset_smc:
    /* Reset SMC */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    /* Disable SMC clock */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);

out:
    /* Restore PCI command register */
    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);

    return ret;
}

/*
 * Register a device-specific reset function where one is needed.
 * Currently only Bonaire/Hawaii AMD GPUs, which don't respond to
 * bus reset (see comment above vfio_radeon_reset).
 */
void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
{
    switch (vdev->vendor_id) {
    case 0x1002:
        switch (vdev->device_id) {
        /* Bonaire */
        case 0x6649: /* Bonaire [FirePro W5100] */
        case 0x6650:
        case 0x6651:
        case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
        case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
        case 0x665d: /* Bonaire [Radeon R7 200 Series] */
        /* Hawaii */
        case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
        case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
        case 0x67A2:
        case 0x67A8:
        case 0x67A9:
        case 0x67AA:
        case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
        case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
        case 0x67B8:
        case 0x67B9:
        case 0x67BA:
        case 0x67BE:
            vdev->resetfn = vfio_radeon_reset;
            trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
            break;
        }
        break;
    }
}

/*
 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
 * devices as a member of a clique.  Devices within the same clique ID
 * are capable of direct P2P.  It's the user's responsibility that this
 * is correct.  The spec says that this may reside at any unused config
 * offset, but reserves and recommends hypervisors place this at C8h.
 * The spec also states that the hypervisor should place this capability
 * at the end of the capability list, thus next is defined as 0h.
 *
 * +----------------+----------------+----------------+----------------+
 * | sig 7:0 ('P')  | vndr len (8h)  |    next (0h)   |   cap id (9h)  |
 * +----------------+----------------+----------------+----------------+
 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
 * +---------------------------------+---------------------------------+
 *
 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
 */
/* QOM property getter for x-nv-gpudirect-clique */
static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
                                       Error **errp)
{
    DeviceState *dev = DEVICE(obj);
    Property *prop = opaque;
    uint8_t *ptr = qdev_get_prop_ptr(dev, prop);

    visit_type_uint8(v, name, ptr, errp);
}

/* QOM property setter for x-nv-gpudirect-clique; valid values are 0-15 */
static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
                                       Error **errp)
{
    DeviceState *dev = DEVICE(obj);
    Property *prop = opaque;
    uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop);
    Error *local_err = NULL;

    if (dev->realized) {
        qdev_prop_set_after_realize(dev, name, errp);
        return;
    }

    if (!visit_type_uint8(v, name, &value, &local_err)) {
        error_propagate(errp, local_err);
        return;
    }

    /* Only a 4-bit clique ID fits in the capability layout */
    if (value & ~0xF) {
        error_setg(errp, "Property %s: valid range 0-15", name);
        return;
    }

    *ptr = value;
}

const PropertyInfo qdev_prop_nv_gpudirect_clique = {
    .name = "uint4",
    .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
    .get = get_nv_gpudirect_clique_id,
    .set = set_nv_gpudirect_clique_id,
};

/*
 * Synthesize the GPUDirect P2P vendor capability at C8h for NVIDIA
 * display-class devices when the user set a clique ID.  The capability
 * is fully emulated (never passed through to hardware).
 */
static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
{
    PCIDevice *pdev = &vdev->pdev;
    int ret, pos = 0xC8;

    /* 0xFF means the property was left unset: nothing to do */
    if (vdev->nv_gpudirect_clique == 0xFF) {
        return 0;
    }

    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
        return -EINVAL;
    }

    if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
        PCI_BASE_CLASS_DISPLAY) {
        error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
        return -EINVAL;
    }

    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
        return ret;
    }

    /* Fill the 8-byte body per the layout diagram above */
    memset(vdev->emulated_config_bits + pos, 0xFF, 8);
    pos += PCI_CAP_FLAGS;
    pci_set_byte(pdev->config + pos++, 8);
    pci_set_byte(pdev->config + pos++, 'P');
    pci_set_byte(pdev->config + pos++, '2');
    pci_set_byte(pdev->config + pos++, 'P');
    pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
    pci_set_byte(pdev->config + pos, 0);

    return 0;
}

/* QOM getter exposing the NVLink2 target address stashed in @opaque */
static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
                                     const char *name,
                                     void *opaque, Error **errp)
{
    uint64_t tgt = (uintptr_t) opaque;
    visit_type_uint64(v, name, &tgt, errp);
}

/* QOM getter exposing the NVLink2 link speed stashed in @opaque */
static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
                                            const char *name,
                                            void *opaque, Error **errp)
{
    uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
    visit_type_uint32(v, name, &link_speed, errp);
}

/*
 * Map the NVIDIA V100 NVLink2 RAM region and wrap it in a quirk
 * MemoryRegion, exposing the SSA target address as a QOM property.
 * Returns 0 on success, negative errno on failure.
 */
int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
{
    int ret;
    void *p;
    struct vfio_region_info *nv2reg = NULL;
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_nvlink2_ssatgt *cap;
    VFIOQuirk *quirk;

    ret = vfio_get_dev_region_info(&vdev->vbasedev,
                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
                                   PCI_VENDOR_ID_NVIDIA,
                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
                                   &nv2reg);
    if (ret) {
        return ret;
    }

    hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    cap = (void *) hdr;

    p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE,
             MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
    if (p == MAP_FAILED) {
        ret = -errno;
        goto free_exit;
    }

    quirk = vfio_quirk_alloc(1);
    memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
                               nv2reg->size, p);
    QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);

    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
                        (void *) (uintptr_t) cap->tgt);
    trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
                                          nv2reg->size);
free_exit:
    g_free(nv2reg);

    return ret;
}

/*
 * Map the IBM NVLink2 ATSD region (if present) and export the target
 * address and link speed as QOM properties for the machine to consume.
 * Returns 0 on success, negative errno on failure.
 */
int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
{
    int ret;
    void *p;
    struct vfio_region_info *atsdreg = NULL;
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
    struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
    VFIOQuirk *quirk;

    ret = vfio_get_dev_region_info(&vdev->vbasedev,
                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
                                   PCI_VENDOR_ID_IBM,
                                   VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
                                   &atsdreg);
    if (ret) {
        return ret;
    }

    hdr = vfio_get_region_info_cap(atsdreg,
                                   VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    captgt = (void *) hdr;

    hdr = vfio_get_region_info_cap(atsdreg,
                                   VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    capspeed = (void *) hdr;

    /* Some NVLink bridges may not have assigned ATSD */
    if (atsdreg->size) {
        p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE,
                 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
        if (p == MAP_FAILED) {
            ret = -errno;
            goto free_exit;
        }

        quirk = vfio_quirk_alloc(1);
        memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
                                          "nvlink2-atsd-mr", atsdreg->size, p);
        QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
    }

    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
                        (void *) (uintptr_t) captgt->tgt);
    trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
                                              atsdreg->size);

    object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
                        vfio_pci_nvlink2_get_link_speed, NULL, NULL,
                        (void *) (uintptr_t) capspeed->link_speed);
    trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
                                              capspeed->link_speed);
free_exit:
    g_free(atsdreg);

    return ret;
}

/*
 * The VMD endpoint provides a real PCIe domain to the guest and the guest
 * kernel performs enumeration of the VMD sub-device domain.  Guest transactions
 * to VMD sub-devices go through MMU translation from guest addresses to
 * physical addresses.  When MMIO goes to an endpoint after being translated to
 * physical addresses, the bridge rejects the transaction because the window
 * has been programmed with guest addresses.
 *
 * VMD can use the Host Physical Address in order to correctly program the
 * bridge windows in its PCIe domain.  VMD device 28C0 has HPA shadow registers
 * located at offset 0x2000 in MEMBAR2 (BAR 4).  This quirk provides the HPA
 * shadow registers in a vendor-specific capability register for devices
 * without native support.  The position of 0xE8-0xFF is in the reserved range
 * of the VMD device capability space following the Power Management
 * Capability.
1716 */ 1717 #define VMD_SHADOW_CAP_VER 1 1718 #define VMD_SHADOW_CAP_LEN 24 1719 static int vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp) 1720 { 1721 uint8_t membar_phys[16]; 1722 int ret, pos = 0xE8; 1723 1724 if (!(vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x201D) || 1725 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x467F) || 1726 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x4C3D) || 1727 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x9A0B))) { 1728 return 0; 1729 } 1730 1731 ret = pread(vdev->vbasedev.fd, membar_phys, 16, 1732 vdev->config_offset + PCI_BASE_ADDRESS_2); 1733 if (ret != 16) { 1734 error_report("VMD %s cannot read MEMBARs (%d)", 1735 vdev->vbasedev.name, ret); 1736 return -EFAULT; 1737 } 1738 1739 ret = pci_add_capability(&vdev->pdev, PCI_CAP_ID_VNDR, pos, 1740 VMD_SHADOW_CAP_LEN, errp); 1741 if (ret < 0) { 1742 error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: "); 1743 return ret; 1744 } 1745 1746 memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN); 1747 pos += PCI_CAP_FLAGS; 1748 pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_LEN); 1749 pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_VER); 1750 pci_set_long(vdev->pdev.config + pos, 0x53484457); /* SHDW */ 1751 memcpy(vdev->pdev.config + pos + 4, membar_phys, 16); 1752 1753 return 0; 1754 } 1755 1756 int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp) 1757 { 1758 int ret; 1759 1760 ret = vfio_add_nv_gpudirect_cap(vdev, errp); 1761 if (ret) { 1762 return ret; 1763 } 1764 1765 ret = vfio_add_vmd_shadow_cap(vdev, errp); 1766 if (ret) { 1767 return ret; 1768 } 1769 1770 return 0; 1771 } 1772