1 /* 2 * device quirks for PCI devices 3 * 4 * Copyright Red Hat, Inc. 2012-2015 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include CONFIG_DEVICES 15 #include "exec/memop.h" 16 #include "qemu/units.h" 17 #include "qemu/log.h" 18 #include "qemu/error-report.h" 19 #include "qemu/main-loop.h" 20 #include "qemu/module.h" 21 #include "qemu/range.h" 22 #include "qapi/error.h" 23 #include "qapi/visitor.h" 24 #include <sys/ioctl.h> 25 #include "hw/hw.h" 26 #include "hw/nvram/fw_cfg.h" 27 #include "hw/qdev-properties.h" 28 #include "pci.h" 29 #include "trace.h" 30 31 /* 32 * List of device ids/vendor ids for which to disable 33 * option rom loading. This avoids the guest hangs during rom 34 * execution as noticed with the BCM 57810 card for lack of a 35 * more better way to handle such issues. 36 * The user can still override by specifying a romfile or 37 * rombar=1. 38 * Please see https://bugs.launchpad.net/qemu/+bug/1284874 39 * for an analysis of the 57810 card hang. 
When adding 40 * a new vendor id/device id combination below, please also add 41 * your card/environment details and information that could 42 * help in debugging to the bug tracking this issue 43 */ 44 static const struct { 45 uint32_t vendor; 46 uint32_t device; 47 } rom_denylist[] = { 48 { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */ 49 }; 50 51 bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev) 52 { 53 int i; 54 55 for (i = 0 ; i < ARRAY_SIZE(rom_denylist); i++) { 56 if (vfio_pci_is(vdev, rom_denylist[i].vendor, rom_denylist[i].device)) { 57 trace_vfio_quirk_rom_in_denylist(vdev->vbasedev.name, 58 rom_denylist[i].vendor, 59 rom_denylist[i].device); 60 return true; 61 } 62 } 63 return false; 64 } 65 66 /* 67 * Device specific region quirks (mostly backdoors to PCI config space) 68 */ 69 70 /* 71 * The generic window quirks operate on an address and data register, 72 * vfio_generic_window_address_quirk handles the address register and 73 * vfio_generic_window_data_quirk handles the data register. These ops 74 * pass reads and writes through to hardware until a value matching the 75 * stored address match/mask is written. When this occurs, the data 76 * register access emulated PCI config space for the device rather than 77 * passing through accesses. This enables devices where PCI config space 78 * is accessible behind a window register to maintain the virtualization 79 * provided through vfio. 
80 */ 81 typedef struct VFIOConfigWindowMatch { 82 uint32_t match; 83 uint32_t mask; 84 } VFIOConfigWindowMatch; 85 86 typedef struct VFIOConfigWindowQuirk { 87 struct VFIOPCIDevice *vdev; 88 89 uint32_t address_val; 90 91 uint32_t address_offset; 92 uint32_t data_offset; 93 94 bool window_enabled; 95 uint8_t bar; 96 97 MemoryRegion *addr_mem; 98 MemoryRegion *data_mem; 99 100 uint32_t nr_matches; 101 VFIOConfigWindowMatch matches[]; 102 } VFIOConfigWindowQuirk; 103 104 static uint64_t vfio_generic_window_quirk_address_read(void *opaque, 105 hwaddr addr, 106 unsigned size) 107 { 108 VFIOConfigWindowQuirk *window = opaque; 109 VFIOPCIDevice *vdev = window->vdev; 110 111 return vfio_region_read(&vdev->bars[window->bar].region, 112 addr + window->address_offset, size); 113 } 114 115 static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr, 116 uint64_t data, 117 unsigned size) 118 { 119 VFIOConfigWindowQuirk *window = opaque; 120 VFIOPCIDevice *vdev = window->vdev; 121 int i; 122 123 window->window_enabled = false; 124 125 vfio_region_write(&vdev->bars[window->bar].region, 126 addr + window->address_offset, data, size); 127 128 for (i = 0; i < window->nr_matches; i++) { 129 if ((data & ~window->matches[i].mask) == window->matches[i].match) { 130 window->window_enabled = true; 131 window->address_val = data & window->matches[i].mask; 132 trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name, 133 memory_region_name(window->addr_mem), data); 134 break; 135 } 136 } 137 } 138 139 static const MemoryRegionOps vfio_generic_window_address_quirk = { 140 .read = vfio_generic_window_quirk_address_read, 141 .write = vfio_generic_window_quirk_address_write, 142 .endianness = DEVICE_LITTLE_ENDIAN, 143 }; 144 145 static uint64_t vfio_generic_window_quirk_data_read(void *opaque, 146 hwaddr addr, unsigned size) 147 { 148 VFIOConfigWindowQuirk *window = opaque; 149 VFIOPCIDevice *vdev = window->vdev; 150 uint64_t data; 151 152 /* Always read data reg, 
discard if window enabled */ 153 data = vfio_region_read(&vdev->bars[window->bar].region, 154 addr + window->data_offset, size); 155 156 if (window->window_enabled) { 157 data = vfio_pci_read_config(&vdev->pdev, window->address_val, size); 158 trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name, 159 memory_region_name(window->data_mem), data); 160 } 161 162 return data; 163 } 164 165 static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr, 166 uint64_t data, unsigned size) 167 { 168 VFIOConfigWindowQuirk *window = opaque; 169 VFIOPCIDevice *vdev = window->vdev; 170 171 if (window->window_enabled) { 172 vfio_pci_write_config(&vdev->pdev, window->address_val, data, size); 173 trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name, 174 memory_region_name(window->data_mem), data); 175 return; 176 } 177 178 vfio_region_write(&vdev->bars[window->bar].region, 179 addr + window->data_offset, data, size); 180 } 181 182 static const MemoryRegionOps vfio_generic_window_data_quirk = { 183 .read = vfio_generic_window_quirk_data_read, 184 .write = vfio_generic_window_quirk_data_write, 185 .endianness = DEVICE_LITTLE_ENDIAN, 186 }; 187 188 /* 189 * The generic mirror quirk handles devices which expose PCI config space 190 * through a region within a BAR. When enabled, reads and writes are 191 * redirected through to emulated PCI config space. XXX if PCI config space 192 * used memory regions, this could just be an alias. 
193 */ 194 typedef struct VFIOConfigMirrorQuirk { 195 struct VFIOPCIDevice *vdev; 196 uint32_t offset; 197 uint8_t bar; 198 MemoryRegion *mem; 199 uint8_t data[]; 200 } VFIOConfigMirrorQuirk; 201 202 static uint64_t vfio_generic_quirk_mirror_read(void *opaque, 203 hwaddr addr, unsigned size) 204 { 205 VFIOConfigMirrorQuirk *mirror = opaque; 206 VFIOPCIDevice *vdev = mirror->vdev; 207 uint64_t data; 208 209 /* Read and discard in case the hardware cares */ 210 (void)vfio_region_read(&vdev->bars[mirror->bar].region, 211 addr + mirror->offset, size); 212 213 data = vfio_pci_read_config(&vdev->pdev, addr, size); 214 trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name, 215 memory_region_name(mirror->mem), 216 addr, data); 217 return data; 218 } 219 220 static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr, 221 uint64_t data, unsigned size) 222 { 223 VFIOConfigMirrorQuirk *mirror = opaque; 224 VFIOPCIDevice *vdev = mirror->vdev; 225 226 vfio_pci_write_config(&vdev->pdev, addr, data, size); 227 trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name, 228 memory_region_name(mirror->mem), 229 addr, data); 230 } 231 232 static const MemoryRegionOps vfio_generic_mirror_quirk = { 233 .read = vfio_generic_quirk_mirror_read, 234 .write = vfio_generic_quirk_mirror_write, 235 .endianness = DEVICE_LITTLE_ENDIAN, 236 }; 237 238 /* Is range1 fully contained within range2? */ 239 static bool vfio_range_contained(uint64_t first1, uint64_t len1, 240 uint64_t first2, uint64_t len2) { 241 return (first1 >= first2 && first1 + len1 <= first2 + len2); 242 } 243 244 #define PCI_VENDOR_ID_ATI 0x1002 245 246 /* 247 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR 248 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always 249 * BAR4 (older cards like the X550 used BAR1, but we don't care to support 250 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the 251 * I/O port BAR address. 
Originally this was coded to return the virtual BAR 252 * address only if the physical register read returns the actual BAR address, 253 * but users have reported greater success if we return the virtual address 254 * unconditionally. 255 */ 256 static uint64_t vfio_ati_3c3_quirk_read(void *opaque, 257 hwaddr addr, unsigned size) 258 { 259 VFIOPCIDevice *vdev = opaque; 260 uint64_t data = vfio_pci_read_config(&vdev->pdev, 261 PCI_BASE_ADDRESS_4 + 1, size); 262 263 trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data); 264 265 return data; 266 } 267 268 static void vfio_ati_3c3_quirk_write(void *opaque, hwaddr addr, 269 uint64_t data, unsigned size) 270 { 271 qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__); 272 } 273 274 static const MemoryRegionOps vfio_ati_3c3_quirk = { 275 .read = vfio_ati_3c3_quirk_read, 276 .write = vfio_ati_3c3_quirk_write, 277 .endianness = DEVICE_LITTLE_ENDIAN, 278 }; 279 280 VFIOQuirk *vfio_quirk_alloc(int nr_mem) 281 { 282 VFIOQuirk *quirk = g_new0(VFIOQuirk, 1); 283 QLIST_INIT(&quirk->ioeventfds); 284 quirk->mem = g_new0(MemoryRegion, nr_mem); 285 quirk->nr_mem = nr_mem; 286 287 return quirk; 288 } 289 290 static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd) 291 { 292 QLIST_REMOVE(ioeventfd, next); 293 memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size, 294 true, ioeventfd->data, &ioeventfd->e); 295 296 if (ioeventfd->vfio) { 297 struct vfio_device_ioeventfd vfio_ioeventfd; 298 299 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd); 300 vfio_ioeventfd.flags = ioeventfd->size; 301 vfio_ioeventfd.data = ioeventfd->data; 302 vfio_ioeventfd.offset = ioeventfd->region->fd_offset + 303 ioeventfd->region_addr; 304 vfio_ioeventfd.fd = -1; 305 306 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) { 307 error_report("Failed to remove vfio ioeventfd for %s+0x%" 308 HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)", 309 memory_region_name(ioeventfd->mr), ioeventfd->addr, 310 
ioeventfd->size, ioeventfd->data); 311 } 312 } else { 313 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), 314 NULL, NULL, NULL); 315 } 316 317 event_notifier_cleanup(&ioeventfd->e); 318 trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr), 319 (uint64_t)ioeventfd->addr, ioeventfd->size, 320 ioeventfd->data); 321 g_free(ioeventfd); 322 } 323 324 static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk) 325 { 326 VFIOIOEventFD *ioeventfd, *tmp; 327 328 QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) { 329 if (ioeventfd->dynamic) { 330 vfio_ioeventfd_exit(vdev, ioeventfd); 331 } 332 } 333 } 334 335 static void vfio_ioeventfd_handler(void *opaque) 336 { 337 VFIOIOEventFD *ioeventfd = opaque; 338 339 if (event_notifier_test_and_clear(&ioeventfd->e)) { 340 vfio_region_write(ioeventfd->region, ioeventfd->region_addr, 341 ioeventfd->data, ioeventfd->size); 342 trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr), 343 (uint64_t)ioeventfd->addr, ioeventfd->size, 344 ioeventfd->data); 345 } 346 } 347 348 static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev, 349 MemoryRegion *mr, hwaddr addr, 350 unsigned size, uint64_t data, 351 VFIORegion *region, 352 hwaddr region_addr, bool dynamic) 353 { 354 VFIOIOEventFD *ioeventfd; 355 356 if (vdev->no_kvm_ioeventfd) { 357 return NULL; 358 } 359 360 ioeventfd = g_malloc0(sizeof(*ioeventfd)); 361 362 if (event_notifier_init(&ioeventfd->e, 0)) { 363 g_free(ioeventfd); 364 return NULL; 365 } 366 367 /* 368 * MemoryRegion and relative offset, plus additional ioeventfd setup 369 * parameters for configuring and later tearing down KVM ioeventfd. 370 */ 371 ioeventfd->mr = mr; 372 ioeventfd->addr = addr; 373 ioeventfd->size = size; 374 ioeventfd->data = data; 375 ioeventfd->dynamic = dynamic; 376 /* 377 * VFIORegion and relative offset for implementing the userspace 378 * handler. data & size fields shared for both uses. 
379 */ 380 ioeventfd->region = region; 381 ioeventfd->region_addr = region_addr; 382 383 if (!vdev->no_vfio_ioeventfd) { 384 struct vfio_device_ioeventfd vfio_ioeventfd; 385 386 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd); 387 vfio_ioeventfd.flags = ioeventfd->size; 388 vfio_ioeventfd.data = ioeventfd->data; 389 vfio_ioeventfd.offset = ioeventfd->region->fd_offset + 390 ioeventfd->region_addr; 391 vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e); 392 393 ioeventfd->vfio = !ioctl(vdev->vbasedev.fd, 394 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd); 395 } 396 397 if (!ioeventfd->vfio) { 398 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), 399 vfio_ioeventfd_handler, NULL, ioeventfd); 400 } 401 402 memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size, 403 true, ioeventfd->data, &ioeventfd->e); 404 trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr, 405 size, data, ioeventfd->vfio); 406 407 return ioeventfd; 408 } 409 410 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) 411 { 412 VFIOQuirk *quirk; 413 414 /* 415 * As long as the BAR is >= 256 bytes it will be aligned such that the 416 * lower byte is always zero. Filter out anything else, if it exists. 417 */ 418 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 419 !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) { 420 return; 421 } 422 423 quirk = vfio_quirk_alloc(1); 424 425 memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev, 426 "vfio-ati-3c3-quirk", 1); 427 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 428 3 /* offset 3 bytes from 0x3c0 */, quirk->mem); 429 430 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, 431 quirk, next); 432 433 trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name); 434 } 435 436 /* 437 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI 438 * config space through MMIO BAR2 at offset 0x4000. 
Nothing seems to access
 * the MMIO space directly, but a window to this space is provided through
 * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
 * data register.  When the address is programmed to a range of 0x4000-0x4fff
 * PCI configuration space is available.  Experimentation seems to indicate
 * that read-only may be provided by hardware.
 */
static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigWindowQuirk *win;
    MemoryRegion *bar_mem;

    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID)) {
        return;
    }

    /* This window doesn't seem to be used except by legacy VGA code */
    if (!vdev->vga || nr != 4) {
        return;
    }

    /* Two regions: [0] address register, [1] data register */
    quirk = vfio_quirk_alloc(2);
    win = g_malloc0(sizeof(*win) + sizeof(VFIOConfigWindowMatch));
    quirk->data = win;

    win->vdev = vdev;
    win->bar = nr;
    win->address_offset = 0;
    win->data_offset = 4;
    win->addr_mem = &quirk->mem[0];
    win->data_mem = &quirk->mem[1];
    win->nr_matches = 1;
    win->matches[0].match = 0x4000;
    win->matches[0].mask = vdev->config_size - 1;

    bar_mem = vdev->bars[nr].region.mem;

    memory_region_init_io(win->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, win,
                          "vfio-ati-bar4-window-address-quirk", 4);
    memory_region_add_subregion_overlap(bar_mem, win->address_offset,
                                        win->addr_mem, 1);

    memory_region_init_io(win->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, win,
                          "vfio-ati-bar4-window-data-quirk", 4);
    memory_region_add_subregion_overlap(bar_mem, win->data_offset,
                                        win->data_mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
}

/*
 * Trap the BAR2 MMIO mirror to config space as well.
490 */ 491 static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr) 492 { 493 VFIOQuirk *quirk; 494 VFIOConfigMirrorQuirk *mirror; 495 496 /* Only enable on newer devices where BAR2 is 64bit */ 497 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 498 !vdev->vga || nr != 2 || !vdev->bars[2].mem64) { 499 return; 500 } 501 502 quirk = vfio_quirk_alloc(1); 503 mirror = quirk->data = g_malloc0(sizeof(*mirror)); 504 mirror->mem = quirk->mem; 505 mirror->vdev = vdev; 506 mirror->offset = 0x4000; 507 mirror->bar = nr; 508 509 memory_region_init_io(mirror->mem, OBJECT(vdev), 510 &vfio_generic_mirror_quirk, mirror, 511 "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE); 512 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 513 mirror->offset, mirror->mem, 1); 514 515 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 516 517 trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name); 518 } 519 520 /* 521 * Older ATI/AMD cards like the X550 have a similar window to that above. 522 * I/O port BAR1 provides a window to a mirror of PCI config space located 523 * in BAR2 at offset 0xf00. We don't care to support such older cards, but 524 * note it for future reference. 525 */ 526 527 /* 528 * Nvidia has several different methods to get to config space, the 529 * nouveu project has several of these documented here: 530 * https://github.com/pathscale/envytools/tree/master/hwdocs 531 * 532 * The first quirk is actually not documented in envytools and is found 533 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an 534 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access 535 * the mirror of PCI config space found at BAR0 offset 0x1800. The access 536 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is 537 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738 538 * is written for a write to 0x3d4. The BAR0 offset is then accessible 539 * through 0x3d0. 
This quirk doesn't seem to be necessary on newer cards
 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 */
/*
 * States of the 0x3d4/0x3d0 access sequence.  nv3d0_states[] holds the
 * matching names, indexed by state, for tracing.
 */
typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
static const char *nv3d0_states[] = { "NONE", "SELECT",
                                      "WINDOW", "READ", "WRITE" };

/* State shared by the 0x3d4 and 0x3d0 handlers of one device. */
typedef struct VFIONvidia3d0Quirk {
    VFIOPCIDevice *vdev;
    VFIONvidia3d0State state;   /* current position in the access sequence */
    uint32_t offset;            /* value written to 0x3d0 in SELECT state */
} VFIONvidia3d0Quirk;

/*
 * Read of 0x3d4: any read resets the sequence, then the access is passed
 * through to the VGA region (quirk region starts 0x14 into it).
 */
static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    quirk->state = NONE;

    return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                         addr + 0x14, size);
}

/*
 * Write of 0x3d4: advance the sequence when the value fits the current
 * state, otherwise fall back to NONE.  The write is always forwarded to
 * the VGA region afterwards.
 */
static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;

    /* Default to a reset; only a valid transition below overrides it */
    quirk->state = NONE;

    switch (data) {
    case 0x338:
        /* NONE -> SELECT: next 0x3d0 write supplies the offset */
        if (old_state == NONE) {
            quirk->state = SELECT;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    case 0x538:
        /* WINDOW -> READ: next 0x3d0 read may hit config space */
        if (old_state == WINDOW) {
            quirk->state = READ;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    case 0x738:
        /* WINDOW -> WRITE: next 0x3d0 write may hit config space */
        if (old_state == WINDOW) {
            quirk->state = WRITE;
            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                              nv3d0_states[quirk->state]);
        }
        break;
    }

    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                   addr + 0x14, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
    .read = vfio_nvidia_3d4_quirk_read,
    .write = vfio_nvidia_3d4_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Read of 0x3d0: the VGA region is always read.  If the sequence was in
 * READ state and the stored offset lies in the 0x1800 mirror, the value
 * returned is taken from emulated config space instead.  The sequence is
 * reset in every case.
 */
static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;
    uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                                  addr + 0x10, size);

    quirk->state = NONE;

    if (old_state == READ &&
        (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
        /* Low bits of the stored offset select the config register */
        uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);

        data = vfio_pci_read_config(&vdev->pdev, offset, size);
        trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
                                         offset, size, data);
    }

    return data;
}

/*
 * Write of 0x3d0.  In SELECT state the value is latched as the target
 * offset and the sequence moves to WINDOW (the write is still forwarded).
 * In WRITE state with an offset inside the 0x1800 mirror the value goes to
 * emulated config space only.  Everything else is passed to the VGA region.
 */
static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIONvidia3d0Quirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    VFIONvidia3d0State old_state = quirk->state;

    quirk->state = NONE;

    if (old_state == SELECT) {
        quirk->offset = (uint32_t)data;
        quirk->state = WINDOW;
        trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
                                          nv3d0_states[quirk->state]);
    } else if (old_state == WRITE) {
        if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
            uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);

            vfio_pci_write_config(&vdev->pdev, offset, data, size);
            trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
                                              offset, data, size);
            return; /* Config space write is not forwarded to hardware */
        }
    }

    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
                   addr + 0x10, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Install the 0x3d4/0x3d0 quirk regions (two bytes each) on Nvidia devices
 * unless no_geforce_quirks is set or BAR1 has zero size.  Both regions share
 * one VFIONvidia3d0Quirk.
 */
static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;
    VFIONvidia3d0Quirk *data;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vdev->bars[1].region.size) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    quirk->data = data = g_malloc0(sizeof(*data));
    data->vdev = vdev;

    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
                          data, "vfio-nvidia-3d4-quirk", 2);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);

    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
                          data, "vfio-nvidia-3d0-quirk", 2);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);

    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
}

/*
 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above
 * so we need to not only trap 256 bytes @0x1800, but all of PCI config
 * space, including extended space is available at the 4k @0x88000.
 */
typedef struct VFIONvidiaBAR5Quirk {
    uint32_t master;            /* last value written at BAR5 offset 0 */
    uint32_t enable;            /* last value written at BAR5 offset 4 */
    MemoryRegion *addr_mem;
    MemoryRegion *data_mem;
    bool enabled;               /* window regions currently enabled */
    VFIOConfigWindowQuirk window; /* last for match data */
} VFIONvidiaBAR5Quirk;

/*
 * Bring the window regions in line with bit 0 of (master & enable): when
 * that bit differs from the current state, flip the state and enable or
 * disable both the address and the data region.
 */
static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
{
    VFIOPCIDevice *vdev = bar5->window.vdev;

    if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
        return;
    }

    bar5->enabled = !bar5->enabled;
    trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
                                       bar5->enabled ?  "Enable" : "Disable");
    memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
    memory_region_set_enabled(bar5->data_mem, bar5->enabled);
}

/* Master register (BAR5 offset 0) read: plain pass-through. */
static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
                                                   hwaddr addr, unsigned size)
{
    VFIONvidiaBAR5Quirk *bar5 = opaque;
    VFIOPCIDevice *vdev = bar5->window.vdev;

    return vfio_region_read(&vdev->bars[5].region, addr, size);
}

/*
 * Master register write: forward to the device, record the value and
 * re-evaluate whether the window should be enabled.
 */
static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
                                                uint64_t data, unsigned size)
{
    VFIONvidiaBAR5Quirk *bar5 = opaque;
    VFIOPCIDevice *vdev = bar5->window.vdev;

    vfio_region_write(&vdev->bars[5].region, addr, data, size);

    bar5->master = data;
    vfio_nvidia_bar5_enable(bar5);
}

static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
    .read = vfio_nvidia_bar5_quirk_master_read,
    .write = vfio_nvidia_bar5_quirk_master_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/* Enable register (BAR5 offset 4) read: plain pass-through. */
static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
                                                   hwaddr addr, unsigned size)
{
    VFIONvidiaBAR5Quirk *bar5 = opaque;
    VFIOPCIDevice *vdev = bar5->window.vdev;

    return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
}

/*
 * Enable register write: forward to the device, record the value and
 * re-evaluate whether the window should be enabled.
 */
static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
                                                uint64_t data, unsigned size)
{
    VFIONvidiaBAR5Quirk *bar5 = opaque;
    VFIOPCIDevice *vdev = bar5->window.vdev;

    vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);

    bar5->enable = data;
    vfio_nvidia_bar5_enable(bar5);
}

static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
    .read = vfio_nvidia_bar5_quirk_enable_read,
    .write = vfio_nvidia_bar5_quirk_enable_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Install the BAR5 quirk: four 4-byte regions over I/O port BAR5.
 *   mem[0] @0x8  window address register (initially disabled)
 *   mem[1] @0xc  window data register    (initially disabled)
 *   mem[2] @0x0  master register
 *   mem[3] @0x4  enable register
 * The window matches addresses in the 0x1800 and 0x88000 mirrors.
 */
static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIONvidiaBAR5Quirk *bar5;
    VFIOConfigWindowQuirk *window;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
        return;
    }

    quirk = vfio_quirk_alloc(4);
    /* Extra space holds the two trailing window match entries */
    bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
                                   (sizeof(VFIOConfigWindowMatch) * 2));
    window = &bar5->window;

    window->vdev = vdev;
    window->address_offset = 0x8;
    window->data_offset = 0xc;
    window->nr_matches = 2;
    window->matches[0].match = 0x1800;
    window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
    window->matches[1].match = 0x88000;
    window->matches[1].mask = vdev->config_size - 1;
    window->bar = nr;
    window->addr_mem = bar5->addr_mem = &quirk->mem[0];
    window->data_mem = bar5->data_mem = &quirk->mem[1];

    memory_region_init_io(window->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, window,
                          "vfio-nvidia-bar5-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->address_offset,
                                        window->addr_mem, 1);
    memory_region_set_enabled(window->addr_mem, false);

    memory_region_init_io(window->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, window,
                          "vfio-nvidia-bar5-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->data_offset,
                                        window->data_mem, 1);
    memory_region_set_enabled(window->data_mem, false);

    memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
                          &vfio_nvidia_bar5_quirk_master, bar5,
                          "vfio-nvidia-bar5-master-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0, &quirk->mem[2], 1);

    memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
                          &vfio_nvidia_bar5_quirk_enable, bar5,
                          "vfio-nvidia-bar5-enable-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        4, &quirk->mem[3], 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
}

/*
 * Bookkeeping for the dynamic ioeventfd heuristic of the BAR0 mirror quirk.
 * Stored in the trailing data[] of the VFIOConfigMirrorQuirk.
 */
typedef struct LastDataSet {
    VFIOQuirk *quirk;   /* quirk whose ioeventfd list receives new entries */
    hwaddr addr;        /* last write seen: address ... */
    uint64_t data;      /* ... value ... */
    unsigned size;      /* ... and access size */
    int hits;           /* consecutive identical writes seen */
    int added;          /* ioeventfds added (plus one once the limit is hit) */
} LastDataSet;

#define MAX_DYN_IOEVENTFD 10
#define HITS_FOR_IOEVENTFD 10

/*
 * Finally, BAR0 itself.  We want to redirect any accesses to either
 * 0x1800 or 0x88000 through the PCI config space access functions.
 */
static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
                                           uint64_t data, unsigned size)
{
    VFIOConfigMirrorQuirk *mirror = opaque;
    VFIOPCIDevice *vdev = mirror->vdev;
    PCIDevice *pdev = &vdev->pdev;
    LastDataSet *last = (LastDataSet *)&mirror->data;

    /* Emulated config space is always updated first */
    vfio_generic_quirk_mirror_write(opaque, addr, data, size);

    /*
     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
     * MSI capability ID register.  Both the ID and next register are
     * read-only, so we allow writes covering either of those to real hw.
     */
    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
        vfio_region_write(&vdev->bars[mirror->bar].region,
                          addr + mirror->offset, data, size);
        trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
    }

    /*
     * Automatically add an ioeventfd to handle any repeated write with the
     * same data and size above the standard PCI config space header.  This is
     * primarily expected to accelerate the MSI-ACK behavior, such as noted
     * above.  Current hardware/drivers should trigger an ioeventfd at config
     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
     *
     * The criteria of 10 successive hits is arbitrary but reliably adds the
     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
     * the remaining ones have a greater chance of being seen successively.
     * To avoid the pathological case of burning up all of QEMU's open file
     * handles, arbitrarily limit this algorithm from adding no more than 10
     * ioeventfds, print an error if we would have added an 11th, and then
     * stop counting.
     */
    if (!vdev->no_kvm_ioeventfd &&
        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
        if (addr != last->addr || data != last->data || size != last->size) {
            /* Different write than last time: restart the hit count */
            last->addr = addr;
            last->data = data;
            last->size = size;
            last->hits = 1;
        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
            if (last->added < MAX_DYN_IOEVENTFD) {
                VFIOIOEventFD *ioeventfd;
                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
                                        data, &vdev->bars[mirror->bar].region,
                                        mirror->offset + addr, true);
                if (ioeventfd) {
                    VFIOQuirk *quirk = last->quirk;

                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
                    last->added++;
                }
            } else {
                /* Pushes added past the limit so this path runs only once */
                last->added++;
                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
                            "size %u", vdev->vbasedev.name, addr, data, size);
            }
        }
    }
}

static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_nvidia_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Reset hook for the BAR0 mirror quirks: clear the write-tracking state
 * (the quirk back pointer is kept) and drop all dynamic ioeventfds.
 */
static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
{
    VFIOConfigMirrorQuirk *mirror = quirk->data;
    LastDataSet *last = (LastDataSet *)&mirror->data;

    last->addr = last->data = last->size = last->hits = last->added = 0;

    vfio_drop_dynamic_eventfds(vdev, quirk);
}

/*
 * Install the BAR0 config space mirror quirks on Nvidia VGA devices unless
 * no_geforce_quirks is set.  The 0x88000 mirror covers vdev->config_size
 * bytes; the 0x1800 mirror (PCI_CONFIG_SPACE_SIZE bytes) is added only when
 * vdev->vga is set.  Each mirror carries its own LastDataSet.
 */
static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;
    LastDataSet *last;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vfio_is_vga(vdev) || nr != 0) {
        return;
    }

    quirk = vfio_quirk_alloc(1);
    quirk->reset = vfio_nvidia_bar0_quirk_reset;
    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
    mirror->mem = quirk->mem;
    mirror->vdev = vdev;
    mirror->offset = 0x88000;
    mirror->bar = nr;
    last = (LastDataSet *)&mirror->data;
    last->quirk = quirk;

    memory_region_init_io(mirror->mem, OBJECT(vdev),
                          &vfio_nvidia_mirror_quirk, mirror,
                          "vfio-nvidia-bar0-88000-mirror-quirk",
                          vdev->config_size);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        mirror->offset, mirror->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    /* The 0x1800 offset mirror only seems to get used by legacy VGA */
    if (vdev->vga) {
        quirk = vfio_quirk_alloc(1);
        quirk->reset = vfio_nvidia_bar0_quirk_reset;
        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
        mirror->mem = quirk->mem;
        mirror->vdev = vdev;
        mirror->offset = 0x1800;
        mirror->bar = nr;
        last = (LastDataSet *)&mirror->data;
        last->quirk = quirk;

        memory_region_init_io(mirror->mem, OBJECT(vdev),
                              &vfio_nvidia_mirror_quirk, mirror,
                              "vfio-nvidia-bar0-1800-mirror-quirk",
                              PCI_CONFIG_SPACE_SIZE);
        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                            mirror->offset, mirror->mem, 1);

        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
    }

    trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
}

/*
 * TODO - Some Nvidia devices provide config access to their companion HDA
 * device and even to their parent bridge via these config space mirrors.
 * Add quirks for those regions.
 */

#define PCI_VENDOR_ID_REALTEK 0x10ec

/*
 * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
 * register.
According to the Linux r8169 driver, the MSI-X table is addressed 1009 * when the "type" portion of the address register is set to 0x1. This appears 1010 * to be bits 16:30. Bit 31 is both a write indicator and some sort of 1011 * "address latched" indicator. Bits 12:15 are a mask field, which we can 1012 * ignore because the MSI-X table should always be accessed as a dword (full 1013 * mask). Bits 0:11 is offset within the type. 1014 * 1015 * Example trace: 1016 * 1017 * Read from MSI-X table offset 0 1018 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr 1019 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch 1020 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data 1021 * 1022 * Write 0xfee00000 to MSI-X table offset 0 1023 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data 1024 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write 1025 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete 1026 */ 1027 typedef struct VFIOrtl8168Quirk { 1028 VFIOPCIDevice *vdev; 1029 uint32_t addr; 1030 uint32_t data; 1031 bool enabled; 1032 } VFIOrtl8168Quirk; 1033 1034 static uint64_t vfio_rtl8168_quirk_address_read(void *opaque, 1035 hwaddr addr, unsigned size) 1036 { 1037 VFIOrtl8168Quirk *rtl = opaque; 1038 VFIOPCIDevice *vdev = rtl->vdev; 1039 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size); 1040 1041 if (rtl->enabled) { 1042 data = rtl->addr ^ 0x80000000U; /* latch/complete */ 1043 trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data); 1044 } 1045 1046 return data; 1047 } 1048 1049 static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr, 1050 uint64_t data, unsigned size) 1051 { 1052 VFIOrtl8168Quirk *rtl = opaque; 1053 VFIOPCIDevice *vdev = rtl->vdev; 1054 1055 rtl->enabled = false; 1056 1057 if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */ 1058 rtl->enabled = true; 1059 rtl->addr = 
(uint32_t)data; 1060 1061 if (data & 0x80000000U) { /* Do write */ 1062 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { 1063 hwaddr offset = data & 0xfff; 1064 uint64_t val = rtl->data; 1065 1066 trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name, 1067 (uint16_t)offset, val); 1068 1069 /* Write to the proper guest MSI-X table instead */ 1070 memory_region_dispatch_write(&vdev->pdev.msix_table_mmio, 1071 offset, val, 1072 size_memop(size) | MO_LE, 1073 MEMTXATTRS_UNSPECIFIED); 1074 } 1075 return; /* Do not write guest MSI-X data to hardware */ 1076 } 1077 } 1078 1079 vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size); 1080 } 1081 1082 static const MemoryRegionOps vfio_rtl_address_quirk = { 1083 .read = vfio_rtl8168_quirk_address_read, 1084 .write = vfio_rtl8168_quirk_address_write, 1085 .valid = { 1086 .min_access_size = 4, 1087 .max_access_size = 4, 1088 .unaligned = false, 1089 }, 1090 .endianness = DEVICE_LITTLE_ENDIAN, 1091 }; 1092 1093 static uint64_t vfio_rtl8168_quirk_data_read(void *opaque, 1094 hwaddr addr, unsigned size) 1095 { 1096 VFIOrtl8168Quirk *rtl = opaque; 1097 VFIOPCIDevice *vdev = rtl->vdev; 1098 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size); 1099 1100 if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { 1101 hwaddr offset = rtl->addr & 0xfff; 1102 memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset, 1103 &data, size_memop(size) | MO_LE, 1104 MEMTXATTRS_UNSPECIFIED); 1105 trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data); 1106 } 1107 1108 return data; 1109 } 1110 1111 static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr, 1112 uint64_t data, unsigned size) 1113 { 1114 VFIOrtl8168Quirk *rtl = opaque; 1115 VFIOPCIDevice *vdev = rtl->vdev; 1116 1117 rtl->data = (uint32_t)data; 1118 1119 vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size); 1120 } 1121 1122 static const MemoryRegionOps vfio_rtl_data_quirk = { 1123 .read = 
vfio_rtl8168_quirk_data_read, 1124 .write = vfio_rtl8168_quirk_data_write, 1125 .valid = { 1126 .min_access_size = 4, 1127 .max_access_size = 4, 1128 .unaligned = false, 1129 }, 1130 .endianness = DEVICE_LITTLE_ENDIAN, 1131 }; 1132 1133 static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) 1134 { 1135 VFIOQuirk *quirk; 1136 VFIOrtl8168Quirk *rtl; 1137 1138 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) { 1139 return; 1140 } 1141 1142 quirk = vfio_quirk_alloc(2); 1143 quirk->data = rtl = g_malloc0(sizeof(*rtl)); 1144 rtl->vdev = vdev; 1145 1146 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), 1147 &vfio_rtl_address_quirk, rtl, 1148 "vfio-rtl8168-window-address-quirk", 4); 1149 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1150 0x74, &quirk->mem[0], 1); 1151 1152 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), 1153 &vfio_rtl_data_quirk, rtl, 1154 "vfio-rtl8168-window-data-quirk", 4); 1155 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1156 0x70, &quirk->mem[1], 1); 1157 1158 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 1159 1160 trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); 1161 } 1162 1163 #define IGD_ASLS 0xfc /* ASL Storage Register */ 1164 1165 /* 1166 * The OpRegion includes the Video BIOS Table, which seems important for 1167 * telling the driver what sort of outputs it has. Without this, the device 1168 * may work in the guest, but we may not get output. This also requires BIOS 1169 * support to reserve and populate a section of guest memory sufficient for 1170 * the table and to write the base address of that memory to the ASLS register 1171 * of the IGD device. 
1172 */ 1173 int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, 1174 struct vfio_region_info *info, Error **errp) 1175 { 1176 int ret; 1177 1178 vdev->igd_opregion = g_malloc0(info->size); 1179 ret = pread(vdev->vbasedev.fd, vdev->igd_opregion, 1180 info->size, info->offset); 1181 if (ret != info->size) { 1182 error_setg(errp, "failed to read IGD OpRegion"); 1183 g_free(vdev->igd_opregion); 1184 vdev->igd_opregion = NULL; 1185 return -EINVAL; 1186 } 1187 1188 /* 1189 * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to 1190 * allocate 32bit reserved memory for, copy these contents into, and write 1191 * the reserved memory base address to the device ASLS register at 0xFC. 1192 * Alignment of this reserved region seems flexible, but using a 4k page 1193 * alignment seems to work well. This interface assumes a single IGD 1194 * device, which may be at VM address 00:02.0 in legacy mode or another 1195 * address in UPT mode. 1196 * 1197 * NB, there may be future use cases discovered where the VM should have 1198 * direct interaction with the host OpRegion, in which case the write to 1199 * the ASLS register would trigger MemoryRegion setup to enable that. 1200 */ 1201 fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion", 1202 vdev->igd_opregion, info->size); 1203 1204 trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); 1205 1206 pci_set_long(vdev->pdev.config + IGD_ASLS, 0); 1207 pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); 1208 pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); 1209 1210 return 0; 1211 } 1212 1213 /* 1214 * Common quirk probe entry points. 
1215 */ 1216 void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) 1217 { 1218 vfio_vga_probe_ati_3c3_quirk(vdev); 1219 vfio_vga_probe_nvidia_3d0_quirk(vdev); 1220 } 1221 1222 void vfio_vga_quirk_exit(VFIOPCIDevice *vdev) 1223 { 1224 VFIOQuirk *quirk; 1225 int i, j; 1226 1227 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1228 QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) { 1229 for (j = 0; j < quirk->nr_mem; j++) { 1230 memory_region_del_subregion(&vdev->vga->region[i].mem, 1231 &quirk->mem[j]); 1232 } 1233 } 1234 } 1235 } 1236 1237 void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev) 1238 { 1239 int i, j; 1240 1241 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1242 while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) { 1243 VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks); 1244 QLIST_REMOVE(quirk, next); 1245 for (j = 0; j < quirk->nr_mem; j++) { 1246 object_unparent(OBJECT(&quirk->mem[j])); 1247 } 1248 g_free(quirk->mem); 1249 g_free(quirk->data); 1250 g_free(quirk); 1251 } 1252 } 1253 } 1254 1255 void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) 1256 { 1257 vfio_probe_ati_bar4_quirk(vdev, nr); 1258 vfio_probe_ati_bar2_quirk(vdev, nr); 1259 vfio_probe_nvidia_bar5_quirk(vdev, nr); 1260 vfio_probe_nvidia_bar0_quirk(vdev, nr); 1261 vfio_probe_rtl8168_bar2_quirk(vdev, nr); 1262 #ifdef CONFIG_VFIO_IGD 1263 vfio_probe_igd_bar4_quirk(vdev, nr); 1264 #endif 1265 } 1266 1267 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) 1268 { 1269 VFIOBAR *bar = &vdev->bars[nr]; 1270 VFIOQuirk *quirk; 1271 int i; 1272 1273 QLIST_FOREACH(quirk, &bar->quirks, next) { 1274 while (!QLIST_EMPTY(&quirk->ioeventfds)) { 1275 vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds)); 1276 } 1277 1278 for (i = 0; i < quirk->nr_mem; i++) { 1279 memory_region_del_subregion(bar->region.mem, &quirk->mem[i]); 1280 } 1281 } 1282 } 1283 1284 void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr) 1285 { 1286 VFIOBAR *bar = &vdev->bars[nr]; 1287 int i; 1288 1289 
while (!QLIST_EMPTY(&bar->quirks)) { 1290 VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); 1291 QLIST_REMOVE(quirk, next); 1292 for (i = 0; i < quirk->nr_mem; i++) { 1293 object_unparent(OBJECT(&quirk->mem[i])); 1294 } 1295 g_free(quirk->mem); 1296 g_free(quirk->data); 1297 g_free(quirk); 1298 } 1299 } 1300 1301 /* 1302 * Reset quirks 1303 */ 1304 void vfio_quirk_reset(VFIOPCIDevice *vdev) 1305 { 1306 int i; 1307 1308 for (i = 0; i < PCI_ROM_SLOT; i++) { 1309 VFIOQuirk *quirk; 1310 VFIOBAR *bar = &vdev->bars[i]; 1311 1312 QLIST_FOREACH(quirk, &bar->quirks, next) { 1313 if (quirk->reset) { 1314 quirk->reset(vdev, quirk); 1315 } 1316 } 1317 } 1318 } 1319 1320 /* 1321 * AMD Radeon PCI config reset, based on Linux: 1322 * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running() 1323 * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset 1324 * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc() 1325 * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock() 1326 * IDs: include/drm/drm_pciids.h 1327 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0 1328 * 1329 * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the 1330 * hardware that should be fixed on future ASICs. The symptom of this is that 1331 * once the accerlated driver loads, Windows guests will bsod on subsequent 1332 * attmpts to load the driver, such as after VM reset or shutdown/restart. To 1333 * work around this, we do an AMD specific PCI config reset, followed by an SMC 1334 * reset. The PCI config reset only works if SMC firmware is running, so we 1335 * have a dependency on the state of the device as to whether this reset will 1336 * be effective. There are still cases where we won't be able to kick the 1337 * device into working, but this greatly improves the usability overall. The 1338 * config reset magic is relatively common on AMD GPUs, but the setup and SMC 1339 * poking is largely ASIC specific. 
1340 */ 1341 static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev) 1342 { 1343 uint32_t clk, pc_c; 1344 1345 /* 1346 * Registers 200h and 204h are index and data registers for accessing 1347 * indirect configuration registers within the device. 1348 */ 1349 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); 1350 clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1351 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4); 1352 pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1353 1354 return (!(clk & 1) && (0x20100 <= pc_c)); 1355 } 1356 1357 /* 1358 * The scope of a config reset is controlled by a mode bit in the misc register 1359 * and a fuse, exposed as a bit in another register. The fuse is the default 1360 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the forumula 1361 * scope = !(misc ^ fuse), where the resulting scope is defined the same as 1362 * the fuse. A truth table therefore tells us that if misc == fuse, we need 1363 * to flip the value of the bit in the misc register. 
1364 */ 1365 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev) 1366 { 1367 uint32_t misc, fuse; 1368 bool a, b; 1369 1370 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4); 1371 fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1372 b = fuse & 64; 1373 1374 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4); 1375 misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1376 a = misc & 2; 1377 1378 if (a == b) { 1379 vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4); 1380 vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */ 1381 } 1382 } 1383 1384 static int vfio_radeon_reset(VFIOPCIDevice *vdev) 1385 { 1386 PCIDevice *pdev = &vdev->pdev; 1387 int i, ret = 0; 1388 uint32_t data; 1389 1390 /* Defer to a kernel implemented reset */ 1391 if (vdev->vbasedev.reset_works) { 1392 trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name); 1393 return -ENODEV; 1394 } 1395 1396 /* Enable only memory BAR access */ 1397 vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2); 1398 1399 /* Reset only works if SMC firmware is loaded and running */ 1400 if (!vfio_radeon_smc_is_running(vdev)) { 1401 ret = -EINVAL; 1402 trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name); 1403 goto out; 1404 } 1405 1406 /* Make sure only the GFX function is reset */ 1407 vfio_radeon_set_gfx_only_reset(vdev); 1408 1409 /* AMD PCI config reset */ 1410 vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4); 1411 usleep(100); 1412 1413 /* Read back the memory size to make sure we're out of reset */ 1414 for (i = 0; i < 100000; i++) { 1415 if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) { 1416 goto reset_smc; 1417 } 1418 usleep(1); 1419 } 1420 1421 trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name); 1422 1423 reset_smc: 1424 /* Reset SMC */ 1425 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4); 1426 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1427 data |= 
1; 1428 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); 1429 1430 /* Disable SMC clock */ 1431 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); 1432 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1433 data |= 1; 1434 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); 1435 1436 trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name); 1437 1438 out: 1439 /* Restore PCI command register */ 1440 vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2); 1441 1442 return ret; 1443 } 1444 1445 void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev) 1446 { 1447 switch (vdev->vendor_id) { 1448 case 0x1002: 1449 switch (vdev->device_id) { 1450 /* Bonaire */ 1451 case 0x6649: /* Bonaire [FirePro W5100] */ 1452 case 0x6650: 1453 case 0x6651: 1454 case 0x6658: /* Bonaire XTX [Radeon R7 260X] */ 1455 case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */ 1456 case 0x665d: /* Bonaire [Radeon R7 200 Series] */ 1457 /* Hawaii */ 1458 case 0x67A0: /* Hawaii XT GL [FirePro W9100] */ 1459 case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */ 1460 case 0x67A2: 1461 case 0x67A8: 1462 case 0x67A9: 1463 case 0x67AA: 1464 case 0x67B0: /* Hawaii XT [Radeon R9 290X] */ 1465 case 0x67B1: /* Hawaii PRO [Radeon R9 290] */ 1466 case 0x67B8: 1467 case 0x67B9: 1468 case 0x67BA: 1469 case 0x67BE: 1470 vdev->resetfn = vfio_radeon_reset; 1471 trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name); 1472 break; 1473 } 1474 break; 1475 } 1476 } 1477 1478 /* 1479 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify 1480 * devices as a member of a clique. Devices within the same clique ID 1481 * are capable of direct P2P. It's the user's responsibility that this 1482 * is correct. The spec says that this may reside at any unused config 1483 * offset, but reserves and recommends hypervisors place this at C8h. 
1484 * The spec also states that the hypervisor should place this capability 1485 * at the end of the capability list, thus next is defined as 0h. 1486 * 1487 * +----------------+----------------+----------------+----------------+ 1488 * | sig 7:0 ('P') | vndr len (8h) | next (0h) | cap id (9h) | 1489 * +----------------+----------------+----------------+----------------+ 1490 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)| sig 23:8 ('P2') | 1491 * +---------------------------------+---------------------------------+ 1492 * 1493 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf 1494 */ 1495 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v, 1496 const char *name, void *opaque, 1497 Error **errp) 1498 { 1499 Property *prop = opaque; 1500 uint8_t *ptr = object_field_prop_ptr(obj, prop); 1501 1502 visit_type_uint8(v, name, ptr, errp); 1503 } 1504 1505 static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v, 1506 const char *name, void *opaque, 1507 Error **errp) 1508 { 1509 Property *prop = opaque; 1510 uint8_t value, *ptr = object_field_prop_ptr(obj, prop); 1511 1512 if (!visit_type_uint8(v, name, &value, errp)) { 1513 return; 1514 } 1515 1516 if (value & ~0xF) { 1517 error_setg(errp, "Property %s: valid range 0-15", name); 1518 return; 1519 } 1520 1521 *ptr = value; 1522 } 1523 1524 const PropertyInfo qdev_prop_nv_gpudirect_clique = { 1525 .name = "uint4", 1526 .description = "NVIDIA GPUDirect Clique ID (0 - 15)", 1527 .get = get_nv_gpudirect_clique_id, 1528 .set = set_nv_gpudirect_clique_id, 1529 }; 1530 1531 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) 1532 { 1533 PCIDevice *pdev = &vdev->pdev; 1534 int ret, pos = 0xC8; 1535 1536 if (vdev->nv_gpudirect_clique == 0xFF) { 1537 return 0; 1538 } 1539 1540 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) { 1541 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor"); 1542 return -EINVAL; 1543 } 1544 1545 if (pci_get_byte(pdev->config + 
PCI_CLASS_DEVICE + 1) != 1546 PCI_BASE_CLASS_DISPLAY) { 1547 error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class"); 1548 return -EINVAL; 1549 } 1550 1551 ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp); 1552 if (ret < 0) { 1553 error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: "); 1554 return ret; 1555 } 1556 1557 memset(vdev->emulated_config_bits + pos, 0xFF, 8); 1558 pos += PCI_CAP_FLAGS; 1559 pci_set_byte(pdev->config + pos++, 8); 1560 pci_set_byte(pdev->config + pos++, 'P'); 1561 pci_set_byte(pdev->config + pos++, '2'); 1562 pci_set_byte(pdev->config + pos++, 'P'); 1563 pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3); 1564 pci_set_byte(pdev->config + pos, 0); 1565 1566 return 0; 1567 } 1568 1569 static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v, 1570 const char *name, 1571 void *opaque, Error **errp) 1572 { 1573 uint64_t tgt = (uintptr_t) opaque; 1574 visit_type_uint64(v, name, &tgt, errp); 1575 } 1576 1577 static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v, 1578 const char *name, 1579 void *opaque, Error **errp) 1580 { 1581 uint32_t link_speed = (uint32_t)(uintptr_t) opaque; 1582 visit_type_uint32(v, name, &link_speed, errp); 1583 } 1584 1585 int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp) 1586 { 1587 int ret; 1588 void *p; 1589 struct vfio_region_info *nv2reg = NULL; 1590 struct vfio_info_cap_header *hdr; 1591 struct vfio_region_info_cap_nvlink2_ssatgt *cap; 1592 VFIOQuirk *quirk; 1593 1594 ret = vfio_get_dev_region_info(&vdev->vbasedev, 1595 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | 1596 PCI_VENDOR_ID_NVIDIA, 1597 VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, 1598 &nv2reg); 1599 if (ret) { 1600 return ret; 1601 } 1602 1603 hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT); 1604 if (!hdr) { 1605 ret = -ENODEV; 1606 goto free_exit; 1607 } 1608 cap = (void *) hdr; 1609 1610 p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE, 1611 MAP_SHARED, 
vdev->vbasedev.fd, nv2reg->offset); 1612 if (p == MAP_FAILED) { 1613 ret = -errno; 1614 goto free_exit; 1615 } 1616 1617 quirk = vfio_quirk_alloc(1); 1618 memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr", 1619 nv2reg->size, p); 1620 QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next); 1621 1622 object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64", 1623 vfio_pci_nvlink2_get_tgt, NULL, NULL, 1624 (void *) (uintptr_t) cap->tgt); 1625 trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt, 1626 nv2reg->size); 1627 free_exit: 1628 g_free(nv2reg); 1629 1630 return ret; 1631 } 1632 1633 int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp) 1634 { 1635 int ret; 1636 void *p; 1637 struct vfio_region_info *atsdreg = NULL; 1638 struct vfio_info_cap_header *hdr; 1639 struct vfio_region_info_cap_nvlink2_ssatgt *captgt; 1640 struct vfio_region_info_cap_nvlink2_lnkspd *capspeed; 1641 VFIOQuirk *quirk; 1642 1643 ret = vfio_get_dev_region_info(&vdev->vbasedev, 1644 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | 1645 PCI_VENDOR_ID_IBM, 1646 VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, 1647 &atsdreg); 1648 if (ret) { 1649 return ret; 1650 } 1651 1652 hdr = vfio_get_region_info_cap(atsdreg, 1653 VFIO_REGION_INFO_CAP_NVLINK2_SSATGT); 1654 if (!hdr) { 1655 ret = -ENODEV; 1656 goto free_exit; 1657 } 1658 captgt = (void *) hdr; 1659 1660 hdr = vfio_get_region_info_cap(atsdreg, 1661 VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD); 1662 if (!hdr) { 1663 ret = -ENODEV; 1664 goto free_exit; 1665 } 1666 capspeed = (void *) hdr; 1667 1668 /* Some NVLink bridges may not have assigned ATSD */ 1669 if (atsdreg->size) { 1670 p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE, 1671 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset); 1672 if (p == MAP_FAILED) { 1673 ret = -errno; 1674 goto free_exit; 1675 } 1676 1677 quirk = vfio_quirk_alloc(1); 1678 memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev), 1679 "nvlink2-atsd-mr", atsdreg->size, p); 1680 
QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next); 1681 } 1682 1683 object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64", 1684 vfio_pci_nvlink2_get_tgt, NULL, NULL, 1685 (void *) (uintptr_t) captgt->tgt); 1686 trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt, 1687 atsdreg->size); 1688 1689 object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32", 1690 vfio_pci_nvlink2_get_link_speed, NULL, NULL, 1691 (void *) (uintptr_t) capspeed->link_speed); 1692 trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name, 1693 capspeed->link_speed); 1694 free_exit: 1695 g_free(atsdreg); 1696 1697 return ret; 1698 } 1699 1700 /* 1701 * The VMD endpoint provides a real PCIe domain to the guest and the guest 1702 * kernel performs enumeration of the VMD sub-device domain. Guest transactions 1703 * to VMD sub-devices go through MMU translation from guest addresses to 1704 * physical addresses. When MMIO goes to an endpoint after being translated to 1705 * physical addresses, the bridge rejects the transaction because the window 1706 * has been programmed with guest addresses. 1707 * 1708 * VMD can use the Host Physical Address in order to correctly program the 1709 * bridge windows in its PCIe domain. VMD device 28C0 has HPA shadow registers 1710 * located at offset 0x2000 in MEMBAR2 (BAR 4). This quirk provides the HPA 1711 * shadow registers in a vendor-specific capability register for devices 1712 * without native support. The position of 0xE8-0xFF is in the reserved range 1713 * of the VMD device capability space following the Power Management 1714 * Capability. 
1715 */ 1716 #define VMD_SHADOW_CAP_VER 1 1717 #define VMD_SHADOW_CAP_LEN 24 1718 static int vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp) 1719 { 1720 uint8_t membar_phys[16]; 1721 int ret, pos = 0xE8; 1722 1723 if (!(vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x201D) || 1724 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x467F) || 1725 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x4C3D) || 1726 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x9A0B))) { 1727 return 0; 1728 } 1729 1730 ret = pread(vdev->vbasedev.fd, membar_phys, 16, 1731 vdev->config_offset + PCI_BASE_ADDRESS_2); 1732 if (ret != 16) { 1733 error_report("VMD %s cannot read MEMBARs (%d)", 1734 vdev->vbasedev.name, ret); 1735 return -EFAULT; 1736 } 1737 1738 ret = pci_add_capability(&vdev->pdev, PCI_CAP_ID_VNDR, pos, 1739 VMD_SHADOW_CAP_LEN, errp); 1740 if (ret < 0) { 1741 error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: "); 1742 return ret; 1743 } 1744 1745 memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN); 1746 pos += PCI_CAP_FLAGS; 1747 pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_LEN); 1748 pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_VER); 1749 pci_set_long(vdev->pdev.config + pos, 0x53484457); /* SHDW */ 1750 memcpy(vdev->pdev.config + pos + 4, membar_phys, 16); 1751 1752 return 0; 1753 } 1754 1755 int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp) 1756 { 1757 int ret; 1758 1759 ret = vfio_add_nv_gpudirect_cap(vdev, errp); 1760 if (ret) { 1761 return ret; 1762 } 1763 1764 ret = vfio_add_vmd_shadow_cap(vdev, errp); 1765 if (ret) { 1766 return ret; 1767 } 1768 1769 return 0; 1770 } 1771