1 /* 2 * generic functions used by VFIO devices 3 * 4 * Copyright Red Hat, Inc. 2012 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Based on qemu-kvm device-assignment: 13 * Adapted for KVM by Qumranet. 14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) 15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) 16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) 17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) 18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) 19 */ 20 21 #include "qemu/osdep.h" 22 #include <sys/ioctl.h> 23 #ifdef CONFIG_KVM 24 #include <linux/kvm.h> 25 #endif 26 #include <linux/vfio.h> 27 28 #include "hw/vfio/vfio-common.h" 29 #include "hw/vfio/vfio.h" 30 #include "exec/address-spaces.h" 31 #include "exec/memory.h" 32 #include "exec/ram_addr.h" 33 #include "hw/hw.h" 34 #include "qemu/error-report.h" 35 #include "qemu/main-loop.h" 36 #include "qemu/range.h" 37 #include "sysemu/kvm.h" 38 #include "sysemu/reset.h" 39 #include "sysemu/runstate.h" 40 #include "trace.h" 41 #include "qapi/error.h" 42 #include "migration/migration.h" 43 #include "migration/misc.h" 44 #include "migration/blocker.h" 45 #include "migration/qemu-file.h" 46 #include "sysemu/tpm.h" 47 48 VFIOGroupList vfio_group_list = 49 QLIST_HEAD_INITIALIZER(vfio_group_list); 50 static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = 51 QLIST_HEAD_INITIALIZER(vfio_address_spaces); 52 53 #ifdef CONFIG_KVM 54 /* 55 * We have a single VFIO pseudo device per KVM VM. Once created it lives 56 * for the life of the VM. Closing the file descriptor only drops our 57 * reference to it and the device's reference to kvm. Therefore once 58 * initialized, this file descriptor is only released on QEMU exit and 59 * we'll re-use it should another vfio device be attached before then. 
60 */ 61 static int vfio_kvm_device_fd = -1; 62 #endif 63 64 /* 65 * Common VFIO interrupt disable 66 */ 67 void vfio_disable_irqindex(VFIODevice *vbasedev, int index) 68 { 69 struct vfio_irq_set irq_set = { 70 .argsz = sizeof(irq_set), 71 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, 72 .index = index, 73 .start = 0, 74 .count = 0, 75 }; 76 77 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); 78 } 79 80 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) 81 { 82 struct vfio_irq_set irq_set = { 83 .argsz = sizeof(irq_set), 84 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, 85 .index = index, 86 .start = 0, 87 .count = 1, 88 }; 89 90 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); 91 } 92 93 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) 94 { 95 struct vfio_irq_set irq_set = { 96 .argsz = sizeof(irq_set), 97 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK, 98 .index = index, 99 .start = 0, 100 .count = 1, 101 }; 102 103 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); 104 } 105 106 static inline const char *action_to_str(int action) 107 { 108 switch (action) { 109 case VFIO_IRQ_SET_ACTION_MASK: 110 return "MASK"; 111 case VFIO_IRQ_SET_ACTION_UNMASK: 112 return "UNMASK"; 113 case VFIO_IRQ_SET_ACTION_TRIGGER: 114 return "TRIGGER"; 115 default: 116 return "UNKNOWN ACTION"; 117 } 118 } 119 120 static const char *index_to_str(VFIODevice *vbasedev, int index) 121 { 122 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { 123 return NULL; 124 } 125 126 switch (index) { 127 case VFIO_PCI_INTX_IRQ_INDEX: 128 return "INTX"; 129 case VFIO_PCI_MSI_IRQ_INDEX: 130 return "MSI"; 131 case VFIO_PCI_MSIX_IRQ_INDEX: 132 return "MSIX"; 133 case VFIO_PCI_ERR_IRQ_INDEX: 134 return "ERR"; 135 case VFIO_PCI_REQ_IRQ_INDEX: 136 return "REQ"; 137 default: 138 return NULL; 139 } 140 } 141 142 static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) 143 { 144 switch (container->iommu_type) { 145 case VFIO_TYPE1v2_IOMMU: 146 case VFIO_TYPE1_IOMMU: 147 /* 148 * We support coordinated discarding of RAM via the RamDiscardManager. 149 */ 150 return ram_block_uncoordinated_discard_disable(state); 151 default: 152 /* 153 * VFIO_SPAPR_TCE_IOMMU most probably works just fine with 154 * RamDiscardManager, however, it is completely untested. 155 * 156 * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does 157 * completely the opposite of managing mapping/pinning dynamically as 158 * required by RamDiscardManager. We would have to special-case sections 159 * with a RamDiscardManager. 
160 */ 161 return ram_block_discard_disable(state); 162 } 163 } 164 165 int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, 166 int action, int fd, Error **errp) 167 { 168 struct vfio_irq_set *irq_set; 169 int argsz, ret = 0; 170 const char *name; 171 int32_t *pfd; 172 173 argsz = sizeof(*irq_set) + sizeof(*pfd); 174 175 irq_set = g_malloc0(argsz); 176 irq_set->argsz = argsz; 177 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action; 178 irq_set->index = index; 179 irq_set->start = subindex; 180 irq_set->count = 1; 181 pfd = (int32_t *)&irq_set->data; 182 *pfd = fd; 183 184 if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { 185 ret = -errno; 186 } 187 g_free(irq_set); 188 189 if (!ret) { 190 return 0; 191 } 192 193 error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure"); 194 195 name = index_to_str(vbasedev, index); 196 if (name) { 197 error_prepend(errp, "%s-%d: ", name, subindex); 198 } else { 199 error_prepend(errp, "index %d-%d: ", index, subindex); 200 } 201 error_prepend(errp, 202 "Failed to %s %s eventfd signaling for interrupt ", 203 fd < 0 ? "tear down" : "set up", action_to_str(action)); 204 return ret; 205 } 206 207 /* 208 * IO Port/MMIO - Beware of the endians, VFIO is always little endian 209 */ 210 void vfio_region_write(void *opaque, hwaddr addr, 211 uint64_t data, unsigned size) 212 { 213 VFIORegion *region = opaque; 214 VFIODevice *vbasedev = region->vbasedev; 215 union { 216 uint8_t byte; 217 uint16_t word; 218 uint32_t dword; 219 uint64_t qword; 220 } buf; 221 222 switch (size) { 223 case 1: 224 buf.byte = data; 225 break; 226 case 2: 227 buf.word = cpu_to_le16(data); 228 break; 229 case 4: 230 buf.dword = cpu_to_le32(data); 231 break; 232 case 8: 233 buf.qword = cpu_to_le64(data); 234 break; 235 default: 236 hw_error("vfio: unsupported write size, %u bytes", size); 237 break; 238 } 239 240 if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { 241 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 242 ",%d) failed: %m", 243 __func__, vbasedev->name, region->nr, 244 addr, data, size); 245 } 246 247 trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); 248 249 /* 250 * A read or write to a BAR always signals an INTx EOI. This will 251 * do nothing if not pending (including not in INTx mode). We assume 252 * that a BAR access is in response to an interrupt and that BAR 253 * accesses will service the interrupt. Unfortunately, we don't know 254 * which access will service the interrupt, so we're potentially 255 * getting quite a few host interrupts per guest interrupt. 
256 */ 257 vbasedev->ops->vfio_eoi(vbasedev); 258 } 259 260 uint64_t vfio_region_read(void *opaque, 261 hwaddr addr, unsigned size) 262 { 263 VFIORegion *region = opaque; 264 VFIODevice *vbasedev = region->vbasedev; 265 union { 266 uint8_t byte; 267 uint16_t word; 268 uint32_t dword; 269 uint64_t qword; 270 } buf; 271 uint64_t data = 0; 272 273 if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { 274 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", 275 __func__, vbasedev->name, region->nr, 276 addr, size); 277 return (uint64_t)-1; 278 } 279 switch (size) { 280 case 1: 281 data = buf.byte; 282 break; 283 case 2: 284 data = le16_to_cpu(buf.word); 285 break; 286 case 4: 287 data = le32_to_cpu(buf.dword); 288 break; 289 case 8: 290 data = le64_to_cpu(buf.qword); 291 break; 292 default: 293 hw_error("vfio: unsupported read size, %u bytes", size); 294 break; 295 } 296 297 trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data); 298 299 /* Same as write above */ 300 vbasedev->ops->vfio_eoi(vbasedev); 301 302 return data; 303 } 304 305 const MemoryRegionOps vfio_region_ops = { 306 .read = vfio_region_read, 307 .write = vfio_region_write, 308 .endianness = DEVICE_LITTLE_ENDIAN, 309 .valid = { 310 .min_access_size = 1, 311 .max_access_size = 8, 312 }, 313 .impl = { 314 .min_access_size = 1, 315 .max_access_size = 8, 316 }, 317 }; 318 319 /* 320 * Device state interfaces 321 */ 322 323 typedef struct { 324 unsigned long *bitmap; 325 hwaddr size; 326 hwaddr pages; 327 } VFIOBitmap; 328 329 static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size) 330 { 331 vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size(); 332 vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) / 333 BITS_PER_BYTE; 334 vbmap->bitmap = g_try_malloc0(vbmap->size); 335 if (!vbmap->bitmap) { 336 return -ENOMEM; 337 } 338 339 return 0; 340 } 341 342 static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, 343 uint64_t size, ram_addr_t ram_addr); 344 345 bool vfio_mig_active(void) 346 { 347 VFIOGroup *group; 348 VFIODevice *vbasedev; 349 350 if (QLIST_EMPTY(&vfio_group_list)) { 351 return false; 352 } 353 354 QLIST_FOREACH(group, &vfio_group_list, next) { 355 QLIST_FOREACH(vbasedev, &group->device_list, next) { 356 if (vbasedev->migration_blocker) { 357 return false; 358 } 359 } 360 } 361 return true; 362 } 363 364 static Error *multiple_devices_migration_blocker; 365 static Error *giommu_migration_blocker; 366 367 static unsigned int vfio_migratable_device_num(void) 368 { 369 VFIOGroup *group; 370 VFIODevice *vbasedev; 371 unsigned int device_num = 0; 372 373 QLIST_FOREACH(group, &vfio_group_list, next) { 374 QLIST_FOREACH(vbasedev, &group->device_list, next) { 375 if (vbasedev->migration) { 376 device_num++; 377 } 378 } 379 } 380 381 return device_num; 382 } 383 384 int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) 385 { 386 int ret; 387 388 if (multiple_devices_migration_blocker || 389 vfio_migratable_device_num() <= 1) { 390 return 0; 391 } 392 393 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { 394 error_setg(errp, "Migration is currently not supported with multiple " 395 "VFIO devices"); 396 return -EINVAL; 397 } 398 399 error_setg(&multiple_devices_migration_blocker, 400 "Migration is currently not supported with multiple " 401 "VFIO devices"); 402 ret = migrate_add_blocker(multiple_devices_migration_blocker, errp); 403 if (ret < 0) { 404 error_free(multiple_devices_migration_blocker); 405 
multiple_devices_migration_blocker = NULL; 406 } 407 408 return ret; 409 } 410 411 void vfio_unblock_multiple_devices_migration(void) 412 { 413 if (!multiple_devices_migration_blocker || 414 vfio_migratable_device_num() > 1) { 415 return; 416 } 417 418 migrate_del_blocker(multiple_devices_migration_blocker); 419 error_free(multiple_devices_migration_blocker); 420 multiple_devices_migration_blocker = NULL; 421 } 422 423 static bool vfio_viommu_preset(void) 424 { 425 VFIOAddressSpace *space; 426 427 QLIST_FOREACH(space, &vfio_address_spaces, list) { 428 if (space->as != &address_space_memory) { 429 return true; 430 } 431 } 432 433 return false; 434 } 435 436 int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp) 437 { 438 int ret; 439 440 if (giommu_migration_blocker || 441 !vfio_viommu_preset()) { 442 return 0; 443 } 444 445 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { 446 error_setg(errp, 447 "Migration is currently not supported with vIOMMU enabled"); 448 return -EINVAL; 449 } 450 451 error_setg(&giommu_migration_blocker, 452 "Migration is currently not supported with vIOMMU enabled"); 453 ret = migrate_add_blocker(giommu_migration_blocker, errp); 454 if (ret < 0) { 455 error_free(giommu_migration_blocker); 456 giommu_migration_blocker = NULL; 457 } 458 459 return ret; 460 } 461 462 void vfio_migration_finalize(void) 463 { 464 if (!giommu_migration_blocker || 465 vfio_viommu_preset()) { 466 return; 467 } 468 469 migrate_del_blocker(giommu_migration_blocker); 470 error_free(giommu_migration_blocker); 471 giommu_migration_blocker = NULL; 472 } 473 474 static void vfio_set_migration_error(int err) 475 { 476 MigrationState *ms = migrate_get_current(); 477 478 if (migration_is_setup_or_active(ms->state)) { 479 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { 480 if (ms->to_dst_file) { 481 qemu_file_set_error(ms->to_dst_file, err); 482 } 483 } 484 } 485 } 486 487 static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) 488 { 489 VFIOGroup *group; 490 VFIODevice *vbasedev; 491 MigrationState *ms = migrate_get_current(); 492 493 if (ms->state != MIGRATION_STATUS_ACTIVE && 494 ms->state != MIGRATION_STATUS_DEVICE) { 495 return false; 496 } 497 498 QLIST_FOREACH(group, &container->group_list, container_next) { 499 QLIST_FOREACH(vbasedev, &group->device_list, next) { 500 VFIOMigration *migration = vbasedev->migration; 501 502 if (!migration) { 503 return false; 504 } 505 506 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && 507 (migration->device_state == VFIO_DEVICE_STATE_RUNNING || 508 migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) { 509 return false; 510 } 511 } 512 } 513 return true; 514 } 515 516 static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) 517 { 518 VFIOGroup *group; 519 VFIODevice *vbasedev; 520 521 QLIST_FOREACH(group, &container->group_list, container_next) { 522 QLIST_FOREACH(vbasedev, &group->device_list, next) { 523 if (!vbasedev->dirty_pages_supported) { 524 return false; 525 } 526 } 527 } 528 529 return true; 530 } 531 532 /* 533 * Check if all VFIO devices are running and migration is active, which is 534 * essentially equivalent to the migration being in pre-copy phase. 
535 */ 536 static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) 537 { 538 VFIOGroup *group; 539 VFIODevice *vbasedev; 540 541 if (!migration_is_active(migrate_get_current())) { 542 return false; 543 } 544 545 QLIST_FOREACH(group, &container->group_list, container_next) { 546 QLIST_FOREACH(vbasedev, &group->device_list, next) { 547 VFIOMigration *migration = vbasedev->migration; 548 549 if (!migration) { 550 return false; 551 } 552 553 if (migration->device_state == VFIO_DEVICE_STATE_RUNNING || 554 migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) { 555 continue; 556 } else { 557 return false; 558 } 559 } 560 } 561 return true; 562 } 563 564 static int vfio_dma_unmap_bitmap(VFIOContainer *container, 565 hwaddr iova, ram_addr_t size, 566 IOMMUTLBEntry *iotlb) 567 { 568 struct vfio_iommu_type1_dma_unmap *unmap; 569 struct vfio_bitmap *bitmap; 570 VFIOBitmap vbmap; 571 int ret; 572 573 ret = vfio_bitmap_alloc(&vbmap, size); 574 if (ret) { 575 return ret; 576 } 577 578 unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); 579 580 unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); 581 unmap->iova = iova; 582 unmap->size = size; 583 unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; 584 bitmap = (struct vfio_bitmap *)&unmap->data; 585 586 /* 587 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of 588 * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize 589 * to qemu_real_host_page_size. 590 */ 591 bitmap->pgsize = qemu_real_host_page_size(); 592 bitmap->size = vbmap.size; 593 bitmap->data = (__u64 *)vbmap.bitmap; 594 595 if (vbmap.size > container->max_dirty_bitmap_size) { 596 error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); 597 ret = -E2BIG; 598 goto unmap_exit; 599 } 600 601 ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); 602 if (!ret) { 603 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, 604 iotlb->translated_addr, vbmap.pages); 605 } else { 606 error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); 607 } 608 609 unmap_exit: 610 g_free(unmap); 611 g_free(vbmap.bitmap); 612 613 return ret; 614 } 615 616 /* 617 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 618 */ 619 static int vfio_dma_unmap(VFIOContainer *container, 620 hwaddr iova, ram_addr_t size, 621 IOMMUTLBEntry *iotlb) 622 { 623 struct vfio_iommu_type1_dma_unmap unmap = { 624 .argsz = sizeof(unmap), 625 .flags = 0, 626 .iova = iova, 627 .size = size, 628 }; 629 bool need_dirty_sync = false; 630 int ret; 631 632 if (iotlb && vfio_devices_all_running_and_mig_active(container)) { 633 if (!vfio_devices_all_device_dirty_tracking(container) && 634 container->dirty_pages_supported) { 635 return vfio_dma_unmap_bitmap(container, iova, size, iotlb); 636 } 637 638 need_dirty_sync = true; 639 } 640 641 while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { 642 /* 643 * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c 644 * v4.15) where an overflow in its wrap-around check prevents us from 645 * unmapping the last page of the address space. Test for the error 646 * condition and re-try the unmap excluding the last page. The 647 * expectation is that we've never mapped the last page anyway and this 648 * unmap request comes via vIOMMU support which also makes it unlikely 649 * that this page is used. This bug was introduced well after type1 v2 650 * support was introduced, so we shouldn't need to test for v1. 
A fix 651 * is queued for kernel v5.0 so this workaround can be removed once 652 * affected kernels are sufficiently deprecated. 653 */ 654 if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && 655 container->iommu_type == VFIO_TYPE1v2_IOMMU) { 656 trace_vfio_dma_unmap_overflow_workaround(); 657 unmap.size -= 1ULL << ctz64(container->pgsizes); 658 continue; 659 } 660 error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); 661 return -errno; 662 } 663 664 if (need_dirty_sync) { 665 ret = vfio_get_dirty_bitmap(container, iova, size, 666 iotlb->translated_addr); 667 if (ret) { 668 return ret; 669 } 670 } 671 672 return 0; 673 } 674 675 static int vfio_dma_map(VFIOContainer *container, hwaddr iova, 676 ram_addr_t size, void *vaddr, bool readonly) 677 { 678 struct vfio_iommu_type1_dma_map map = { 679 .argsz = sizeof(map), 680 .flags = VFIO_DMA_MAP_FLAG_READ, 681 .vaddr = (__u64)(uintptr_t)vaddr, 682 .iova = iova, 683 .size = size, 684 }; 685 686 if (!readonly) { 687 map.flags |= VFIO_DMA_MAP_FLAG_WRITE; 688 } 689 690 /* 691 * Try the mapping, if it fails with EBUSY, unmap the region and try 692 * again. This shouldn't be necessary, but we sometimes see it in 693 * the VGA ROM space. 694 */ 695 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || 696 (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && 697 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { 698 return 0; 699 } 700 701 error_report("VFIO_MAP_DMA failed: %s", strerror(errno)); 702 return -errno; 703 } 704 705 static void vfio_host_win_add(VFIOContainer *container, 706 hwaddr min_iova, hwaddr max_iova, 707 uint64_t iova_pgsizes) 708 { 709 VFIOHostDMAWindow *hostwin; 710 711 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { 712 if (ranges_overlap(hostwin->min_iova, 713 hostwin->max_iova - hostwin->min_iova + 1, 714 min_iova, 715 max_iova - min_iova + 1)) { 716 hw_error("%s: Overlapped IOMMU are not enabled", __func__); 717 } 718 } 719 720 hostwin = g_malloc0(sizeof(*hostwin)); 721 722 hostwin->min_iova = min_iova; 723 hostwin->max_iova = max_iova; 724 hostwin->iova_pgsizes = iova_pgsizes; 725 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); 726 } 727 728 static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova, 729 hwaddr max_iova) 730 { 731 VFIOHostDMAWindow *hostwin; 732 733 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { 734 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { 735 QLIST_REMOVE(hostwin, hostwin_next); 736 g_free(hostwin); 737 return 0; 738 } 739 } 740 741 return -1; 742 } 743 744 static bool vfio_listener_skipped_section(MemoryRegionSection *section) 745 { 746 return (!memory_region_is_ram(section->mr) && 747 !memory_region_is_iommu(section->mr)) || 748 memory_region_is_protected(section->mr) || 749 /* 750 * Sizing an enabled 64-bit BAR can cause spurious mappings to 751 * addresses in the upper part of the 64-bit address space. These 752 * are never accessed by the CPU and beyond the address width of 753 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. 754 */ 755 section->offset_within_address_space & (1ULL << 63); 756 } 757 758 /* Called with rcu_read_lock held. 
*/ 759 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, 760 ram_addr_t *ram_addr, bool *read_only) 761 { 762 bool ret, mr_has_discard_manager; 763 764 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, 765 &mr_has_discard_manager); 766 if (ret && mr_has_discard_manager) { 767 /* 768 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The 769 * pages will remain pinned inside vfio until unmapped, resulting in a 770 * higher memory consumption than expected. If memory would get 771 * populated again later, there would be an inconsistency between pages 772 * pinned by vfio and pages seen by QEMU. This is the case until 773 * unmapped from the IOMMU (e.g., during device reset). 774 * 775 * With malicious guests, we really only care about pinning more memory 776 * than expected. RLIMIT_MEMLOCK set for the user/process can never be 777 * exceeded and can be used to mitigate this problem. 778 */ 779 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" 780 " RAM (e.g., virtio-mem) works, however, malicious" 781 " guests can trigger pinning of more memory than" 782 " intended via an IOMMU. It's possible to mitigate " 783 " by setting/adjusting RLIMIT_MEMLOCK."); 784 } 785 return ret; 786 } 787 788 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 789 { 790 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); 791 VFIOContainer *container = giommu->container; 792 hwaddr iova = iotlb->iova + giommu->iommu_offset; 793 void *vaddr; 794 int ret; 795 796 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", 797 iova, iova + iotlb->addr_mask); 798 799 if (iotlb->target_as != &address_space_memory) { 800 error_report("Wrong target AS \"%s\", only system memory is allowed", 801 iotlb->target_as->name ? iotlb->target_as->name : "none"); 802 vfio_set_migration_error(-EINVAL); 803 return; 804 } 805 806 rcu_read_lock(); 807 808 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { 809 bool read_only; 810 811 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) { 812 goto out; 813 } 814 /* 815 * vaddr is only valid until rcu_read_unlock(). But after 816 * vfio_dma_map has set up the mapping the pages will be 817 * pinned by the kernel. This makes sure that the RAM backend 818 * of vaddr will always be there, even if the memory object is 819 * destroyed and its backing memory munmap-ed. 820 */ 821 ret = vfio_dma_map(container, iova, 822 iotlb->addr_mask + 1, vaddr, 823 read_only); 824 if (ret) { 825 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " 826 "0x%"HWADDR_PRIx", %p) = %d (%s)", 827 container, iova, 828 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); 829 } 830 } else { 831 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); 832 if (ret) { 833 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " 834 "0x%"HWADDR_PRIx") = %d (%s)", 835 container, iova, 836 iotlb->addr_mask + 1, ret, strerror(-ret)); 837 vfio_set_migration_error(ret); 838 } 839 } 840 out: 841 rcu_read_unlock(); 842 } 843 844 static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, 845 MemoryRegionSection *section) 846 { 847 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 848 listener); 849 const hwaddr size = int128_get64(section->size); 850 const hwaddr iova = section->offset_within_address_space; 851 int ret; 852 853 /* Unmap with a single call. 
*/ 854 ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); 855 if (ret) { 856 error_report("%s: vfio_dma_unmap() failed: %s", __func__, 857 strerror(-ret)); 858 } 859 } 860 861 static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, 862 MemoryRegionSection *section) 863 { 864 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 865 listener); 866 const hwaddr end = section->offset_within_region + 867 int128_get64(section->size); 868 hwaddr start, next, iova; 869 void *vaddr; 870 int ret; 871 872 /* 873 * Map in (aligned within memory region) minimum granularity, so we can 874 * unmap in minimum granularity later. 875 */ 876 for (start = section->offset_within_region; start < end; start = next) { 877 next = ROUND_UP(start + 1, vrdl->granularity); 878 next = MIN(next, end); 879 880 iova = start - section->offset_within_region + 881 section->offset_within_address_space; 882 vaddr = memory_region_get_ram_ptr(section->mr) + start; 883 884 ret = vfio_dma_map(vrdl->container, iova, next - start, 885 vaddr, section->readonly); 886 if (ret) { 887 /* Rollback */ 888 vfio_ram_discard_notify_discard(rdl, section); 889 return ret; 890 } 891 } 892 return 0; 893 } 894 895 static void vfio_register_ram_discard_listener(VFIOContainer *container, 896 MemoryRegionSection *section) 897 { 898 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 899 VFIORamDiscardListener *vrdl; 900 901 /* Ignore some corner cases not relevant in practice. */ 902 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); 903 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, 904 TARGET_PAGE_SIZE)); 905 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); 906 907 vrdl = g_new0(VFIORamDiscardListener, 1); 908 vrdl->container = container; 909 vrdl->mr = section->mr; 910 vrdl->offset_within_address_space = section->offset_within_address_space; 911 vrdl->size = int128_get64(section->size); 912 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, 913 section->mr); 914 915 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); 916 g_assert(container->pgsizes && 917 vrdl->granularity >= 1ULL << ctz64(container->pgsizes)); 918 919 ram_discard_listener_init(&vrdl->listener, 920 vfio_ram_discard_notify_populate, 921 vfio_ram_discard_notify_discard, true); 922 ram_discard_manager_register_listener(rdm, &vrdl->listener, section); 923 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); 924 925 /* 926 * Sanity-check if we have a theoretically problematic setup where we could 927 * exceed the maximum number of possible DMA mappings over time. We assume 928 * that each mapped section in the same address space as a RamDiscardManager 929 * section consumes exactly one DMA mapping, with the exception of 930 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections 931 * in the same address space as RamDiscardManager sections. 932 * 933 * We assume that each section in the address space consumes one memslot. 934 * We take the number of KVM memory slots as a best guess for the maximum 935 * number of sections in the address space we could have over time, 936 * also consuming DMA mappings. 
937 */ 938 if (container->dma_max_mappings) { 939 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; 940 941 #ifdef CONFIG_KVM 942 if (kvm_enabled()) { 943 max_memslots = kvm_get_max_memslots(); 944 } 945 #endif 946 947 QLIST_FOREACH(vrdl, &container->vrdl_list, next) { 948 hwaddr start, end; 949 950 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, 951 vrdl->granularity); 952 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, 953 vrdl->granularity); 954 vrdl_mappings += (end - start) / vrdl->granularity; 955 vrdl_count++; 956 } 957 958 if (vrdl_mappings + max_memslots - vrdl_count > 959 container->dma_max_mappings) { 960 warn_report("%s: possibly running out of DMA mappings. E.g., try" 961 " increasing the 'block-size' of virtio-mem devies." 962 " Maximum possible DMA mappings: %d, Maximum possible" 963 " memslots: %d", __func__, container->dma_max_mappings, 964 max_memslots); 965 } 966 } 967 } 968 969 static void vfio_unregister_ram_discard_listener(VFIOContainer *container, 970 MemoryRegionSection *section) 971 { 972 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 973 VFIORamDiscardListener *vrdl = NULL; 974 975 QLIST_FOREACH(vrdl, &container->vrdl_list, next) { 976 if (vrdl->mr == section->mr && 977 vrdl->offset_within_address_space == 978 section->offset_within_address_space) { 979 break; 980 } 981 } 982 983 if (!vrdl) { 984 hw_error("vfio: Trying to unregister missing RAM discard listener"); 985 } 986 987 ram_discard_manager_unregister_listener(rdm, &vrdl->listener); 988 QLIST_REMOVE(vrdl, next); 989 g_free(vrdl); 990 } 991 992 static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, 993 hwaddr iova, hwaddr end) 994 { 995 VFIOHostDMAWindow *hostwin; 996 bool hostwin_found = false; 997 998 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { 999 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { 1000 hostwin_found = true; 1001 break; 1002 } 1003 } 1004 1005 return hostwin_found ? 
hostwin : NULL; 1006 } 1007 1008 static bool vfio_known_safe_misalignment(MemoryRegionSection *section) 1009 { 1010 MemoryRegion *mr = section->mr; 1011 1012 if (!TPM_IS_CRB(mr->owner)) { 1013 return false; 1014 } 1015 1016 /* this is a known safe misaligned region, just trace for debug purpose */ 1017 trace_vfio_known_safe_misalignment(memory_region_name(mr), 1018 section->offset_within_address_space, 1019 section->offset_within_region, 1020 qemu_real_host_page_size()); 1021 return true; 1022 } 1023 1024 static bool vfio_listener_valid_section(MemoryRegionSection *section, 1025 const char *name) 1026 { 1027 if (vfio_listener_skipped_section(section)) { 1028 trace_vfio_listener_region_skip(name, 1029 section->offset_within_address_space, 1030 section->offset_within_address_space + 1031 int128_get64(int128_sub(section->size, int128_one()))); 1032 return false; 1033 } 1034 1035 if (unlikely((section->offset_within_address_space & 1036 ~qemu_real_host_page_mask()) != 1037 (section->offset_within_region & ~qemu_real_host_page_mask()))) { 1038 if (!vfio_known_safe_misalignment(section)) { 1039 error_report("%s received unaligned region %s iova=0x%"PRIx64 1040 " offset_within_region=0x%"PRIx64 1041 " qemu_real_host_page_size=0x%"PRIxPTR, 1042 __func__, memory_region_name(section->mr), 1043 section->offset_within_address_space, 1044 section->offset_within_region, 1045 qemu_real_host_page_size()); 1046 } 1047 return false; 1048 } 1049 1050 return true; 1051 } 1052 1053 static bool vfio_get_section_iova_range(VFIOContainer *container, 1054 MemoryRegionSection *section, 1055 hwaddr *out_iova, hwaddr *out_end, 1056 Int128 *out_llend) 1057 { 1058 Int128 llend; 1059 hwaddr iova; 1060 1061 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); 1062 llend = int128_make64(section->offset_within_address_space); 1063 llend = int128_add(llend, section->size); 1064 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); 1065 1066 if (int128_ge(int128_make64(iova), llend)) { 1067 return false; 1068 } 1069 1070 *out_iova = iova; 1071 *out_end = int128_get64(int128_sub(llend, int128_one())); 1072 if (out_llend) { 1073 *out_llend = llend; 1074 } 1075 return true; 1076 } 1077 1078 static void vfio_listener_region_add(MemoryListener *listener, 1079 MemoryRegionSection *section) 1080 { 1081 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 1082 hwaddr iova, end; 1083 Int128 llend, llsize; 1084 void *vaddr; 1085 int ret; 1086 VFIOHostDMAWindow *hostwin; 1087 Error *err = NULL; 1088 1089 if (!vfio_listener_valid_section(section, "region_add")) { 1090 return; 1091 } 1092 1093 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { 1094 if (memory_region_is_ram_device(section->mr)) { 1095 trace_vfio_listener_region_add_no_dma_map( 1096 memory_region_name(section->mr), 1097 section->offset_within_address_space, 1098 int128_getlo(section->size), 1099 qemu_real_host_page_size()); 1100 } 1101 return; 1102 } 1103 1104 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { 1105 hwaddr pgsize = 0; 1106 1107 /* For now intersections are not allowed, we may relax this later */ 1108 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { 1109 if (ranges_overlap(hostwin->min_iova, 1110 hostwin->max_iova - hostwin->min_iova + 1, 1111 section->offset_within_address_space, 1112 int128_get64(section->size))) { 1113 error_setg(&err, 1114 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing" 1115 "host DMA window [0x%"PRIx64",0x%"PRIx64"]", 1116 
section->offset_within_address_space, 1117 section->offset_within_address_space + 1118 int128_get64(section->size) - 1, 1119 hostwin->min_iova, hostwin->max_iova); 1120 goto fail; 1121 } 1122 } 1123 1124 ret = vfio_spapr_create_window(container, section, &pgsize); 1125 if (ret) { 1126 error_setg_errno(&err, -ret, "Failed to create SPAPR window"); 1127 goto fail; 1128 } 1129 1130 vfio_host_win_add(container, section->offset_within_address_space, 1131 section->offset_within_address_space + 1132 int128_get64(section->size) - 1, pgsize); 1133 #ifdef CONFIG_KVM 1134 if (kvm_enabled()) { 1135 VFIOGroup *group; 1136 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); 1137 struct kvm_vfio_spapr_tce param; 1138 struct kvm_device_attr attr = { 1139 .group = KVM_DEV_VFIO_GROUP, 1140 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, 1141 .addr = (uint64_t)(unsigned long)¶m, 1142 }; 1143 1144 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, 1145 ¶m.tablefd)) { 1146 QLIST_FOREACH(group, &container->group_list, container_next) { 1147 param.groupfd = group->fd; 1148 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1149 error_report("vfio: failed to setup fd %d " 1150 "for a group with fd %d: %s", 1151 param.tablefd, param.groupfd, 1152 strerror(errno)); 1153 return; 1154 } 1155 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); 1156 } 1157 } 1158 } 1159 #endif 1160 } 1161 1162 hostwin = vfio_find_hostwin(container, iova, end); 1163 if (!hostwin) { 1164 error_setg(&err, "Container %p can't map guest IOVA region" 1165 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); 1166 goto fail; 1167 } 1168 1169 memory_region_ref(section->mr); 1170 1171 if (memory_region_is_iommu(section->mr)) { 1172 VFIOGuestIOMMU *giommu; 1173 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); 1174 int iommu_idx; 1175 1176 trace_vfio_listener_region_add_iommu(iova, end); 1177 /* 1178 * FIXME: For VFIO iommu types which have KVM acceleration to 1179 * avoid bouncing all map/unmaps through qemu this way, this 1180 * would be the right place to wire that up (tell the KVM 1181 * device emulation the VFIO iommu handles to use). 
1182 */ 1183 giommu = g_malloc0(sizeof(*giommu)); 1184 giommu->iommu_mr = iommu_mr; 1185 giommu->iommu_offset = section->offset_within_address_space - 1186 section->offset_within_region; 1187 giommu->container = container; 1188 llend = int128_add(int128_make64(section->offset_within_region), 1189 section->size); 1190 llend = int128_sub(llend, int128_one()); 1191 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 1192 MEMTXATTRS_UNSPECIFIED); 1193 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, 1194 IOMMU_NOTIFIER_IOTLB_EVENTS, 1195 section->offset_within_region, 1196 int128_get64(llend), 1197 iommu_idx); 1198 1199 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, 1200 container->pgsizes, 1201 &err); 1202 if (ret) { 1203 g_free(giommu); 1204 goto fail; 1205 } 1206 1207 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, 1208 &err); 1209 if (ret) { 1210 g_free(giommu); 1211 goto fail; 1212 } 1213 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); 1214 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); 1215 1216 return; 1217 } 1218 1219 /* Here we assume that memory_region_is_ram(section->mr)==true */ 1220 1221 /* 1222 * For RAM memory regions with a RamDiscardManager, we only want to map the 1223 * actually populated parts - and update the mapping whenever we're notified 1224 * about changes. 1225 */ 1226 if (memory_region_has_ram_discard_manager(section->mr)) { 1227 vfio_register_ram_discard_listener(container, section); 1228 return; 1229 } 1230 1231 vaddr = memory_region_get_ram_ptr(section->mr) + 1232 section->offset_within_region + 1233 (iova - section->offset_within_address_space); 1234 1235 trace_vfio_listener_region_add_ram(iova, end, vaddr); 1236 1237 llsize = int128_sub(llend, int128_make64(iova)); 1238 1239 if (memory_region_is_ram_device(section->mr)) { 1240 hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; 1241 1242 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { 1243 trace_vfio_listener_region_add_no_dma_map( 1244 memory_region_name(section->mr), 1245 section->offset_within_address_space, 1246 int128_getlo(section->size), 1247 pgmask + 1); 1248 return; 1249 } 1250 } 1251 1252 ret = vfio_dma_map(container, iova, int128_get64(llsize), 1253 vaddr, section->readonly); 1254 if (ret) { 1255 error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " 1256 "0x%"HWADDR_PRIx", %p) = %d (%s)", 1257 container, iova, int128_get64(llsize), vaddr, ret, 1258 strerror(-ret)); 1259 if (memory_region_is_ram_device(section->mr)) { 1260 /* Allow unexpected mappings not to be fatal for RAM devices */ 1261 error_report_err(err); 1262 return; 1263 } 1264 goto fail; 1265 } 1266 1267 return; 1268 1269 fail: 1270 if (memory_region_is_ram_device(section->mr)) { 1271 error_report("failed to vfio_dma_map. pci p2p may not work"); 1272 return; 1273 } 1274 /* 1275 * On the initfn path, store the first error in the container so we 1276 * can gracefully fail. Runtime, there's not much we can do other 1277 * than throw a hardware error. 
1278 */ 1279 if (!container->initialized) { 1280 if (!container->error) { 1281 error_propagate_prepend(&container->error, err, 1282 "Region %s: ", 1283 memory_region_name(section->mr)); 1284 } else { 1285 error_free(err); 1286 } 1287 } else { 1288 error_report_err(err); 1289 hw_error("vfio: DMA mapping failed, unable to continue"); 1290 } 1291 } 1292 1293 static void vfio_listener_region_del(MemoryListener *listener, 1294 MemoryRegionSection *section) 1295 { 1296 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 1297 hwaddr iova, end; 1298 Int128 llend, llsize; 1299 int ret; 1300 bool try_unmap = true; 1301 1302 if (!vfio_listener_valid_section(section, "region_del")) { 1303 return; 1304 } 1305 1306 if (memory_region_is_iommu(section->mr)) { 1307 VFIOGuestIOMMU *giommu; 1308 1309 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { 1310 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 1311 giommu->n.start == section->offset_within_region) { 1312 memory_region_unregister_iommu_notifier(section->mr, 1313 &giommu->n); 1314 QLIST_REMOVE(giommu, giommu_next); 1315 g_free(giommu); 1316 break; 1317 } 1318 } 1319 1320 /* 1321 * FIXME: We assume the one big unmap below is adequate to 1322 * remove any individual page mappings in the IOMMU which 1323 * might have been copied into VFIO. This works for a page table 1324 * based IOMMU where a big unmap flattens a large range of IO-PTEs. 1325 * That may not be true for all IOMMU types. 1326 */ 1327 } 1328 1329 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { 1330 return; 1331 } 1332 1333 llsize = int128_sub(llend, int128_make64(iova)); 1334 1335 trace_vfio_listener_region_del(iova, end); 1336 1337 if (memory_region_is_ram_device(section->mr)) { 1338 hwaddr pgmask; 1339 VFIOHostDMAWindow *hostwin; 1340 1341 hostwin = vfio_find_hostwin(container, iova, end); 1342 assert(hostwin); /* or region_add() would have failed */ 1343 1344 pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; 1345 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); 1346 } else if (memory_region_has_ram_discard_manager(section->mr)) { 1347 vfio_unregister_ram_discard_listener(container, section); 1348 /* Unregistering will trigger an unmap. */ 1349 try_unmap = false; 1350 } 1351 1352 if (try_unmap) { 1353 if (int128_eq(llsize, int128_2_64())) { 1354 /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ 1355 llsize = int128_rshift(llsize, 1); 1356 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); 1357 if (ret) { 1358 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " 1359 "0x%"HWADDR_PRIx") = %d (%s)", 1360 container, iova, int128_get64(llsize), ret, 1361 strerror(-ret)); 1362 } 1363 iova += int128_get64(llsize); 1364 } 1365 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); 1366 if (ret) { 1367 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " 1368 "0x%"HWADDR_PRIx") = %d (%s)", 1369 container, iova, int128_get64(llsize), ret, 1370 strerror(-ret)); 1371 } 1372 } 1373 1374 memory_region_unref(section->mr); 1375 1376 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { 1377 vfio_spapr_remove_window(container, 1378 section->offset_within_address_space); 1379 if (vfio_host_win_del(container, 1380 section->offset_within_address_space, 1381 section->offset_within_address_space + 1382 int128_get64(section->size) - 1) < 0) { 1383 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, 1384 __func__, section->offset_within_address_space); 1385 } 1386 } 1387 } 1388 1389 static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) 1390 { 1391 int ret; 1392 struct vfio_iommu_type1_dirty_bitmap dirty = { 1393 .argsz = sizeof(dirty), 1394 }; 1395 1396 if (!container->dirty_pages_supported) { 1397 return 0; 1398 } 1399 1400 if (start) { 1401 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; 1402 } else { 1403 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; 1404 } 1405 1406 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); 1407 if (ret) { 1408 ret = -errno; 1409 error_report("Failed to set dirty tracking flag 0x%x errno: %d", 1410 dirty.flags, errno); 1411 } 1412 1413 return ret; 1414 } 1415 1416 typedef struct VFIODirtyRanges { 1417 hwaddr min32; 1418 hwaddr max32; 1419 hwaddr min64; 1420 hwaddr max64; 1421 } VFIODirtyRanges; 1422 1423 typedef struct VFIODirtyRangesListener { 1424 VFIOContainer *container; 1425 VFIODirtyRanges ranges; 1426 MemoryListener listener; 1427 } VFIODirtyRangesListener; 1428 1429 static void vfio_dirty_tracking_update(MemoryListener *listener, 1430 MemoryRegionSection *section) 1431 { 1432 VFIODirtyRangesListener *dirty = container_of(listener, 1433 VFIODirtyRangesListener, 1434 listener); 1435 VFIODirtyRanges *range = &dirty->ranges; 1436 hwaddr iova, end, *min, *max; 1437 1438 if (!vfio_listener_valid_section(section, "tracking_update") || 1439 !vfio_get_section_iova_range(dirty->container, section, 1440 &iova, &end, NULL)) { 1441 return; 1442 } 1443 1444 /* 1445 * The address space passed to the dirty tracker is reduced to two ranges: 1446 * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges. 1447 * The underlying reports of dirty will query a sub-interval of each of 1448 * these ranges. 1449 * 1450 * The purpose of the dual range handling is to handle known cases of big 1451 * holes in the address space, like the x86 AMD 1T hole. The alternative 1452 * would be an IOVATree but that has a much bigger runtime overhead and 1453 * unnecessary complexity. 1454 */ 1455 min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; 1456 max = (end <= UINT32_MAX) ? 
&range->max32 : &range->max64; 1457 1458 if (*min > iova) { 1459 *min = iova; 1460 } 1461 if (*max < end) { 1462 *max = end; 1463 } 1464 1465 trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); 1466 return; 1467 } 1468 1469 static const MemoryListener vfio_dirty_tracking_listener = { 1470 .name = "vfio-tracking", 1471 .region_add = vfio_dirty_tracking_update, 1472 }; 1473 1474 static void vfio_dirty_tracking_init(VFIOContainer *container, 1475 VFIODirtyRanges *ranges) 1476 { 1477 VFIODirtyRangesListener dirty; 1478 1479 memset(&dirty, 0, sizeof(dirty)); 1480 dirty.ranges.min32 = UINT32_MAX; 1481 dirty.ranges.min64 = UINT64_MAX; 1482 dirty.listener = vfio_dirty_tracking_listener; 1483 dirty.container = container; 1484 1485 memory_listener_register(&dirty.listener, 1486 container->space->as); 1487 1488 *ranges = dirty.ranges; 1489 1490 /* 1491 * The memory listener is synchronous, and used to calculate the range 1492 * to dirty tracking. Unregister it after we are done as we are not 1493 * interested in any follow-up updates. 1494 */ 1495 memory_listener_unregister(&dirty.listener); 1496 } 1497 1498 static void vfio_devices_dma_logging_stop(VFIOContainer *container) 1499 { 1500 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), 1501 sizeof(uint64_t))] = {}; 1502 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 1503 VFIODevice *vbasedev; 1504 VFIOGroup *group; 1505 1506 feature->argsz = sizeof(buf); 1507 feature->flags = VFIO_DEVICE_FEATURE_SET | 1508 VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; 1509 1510 QLIST_FOREACH(group, &container->group_list, container_next) { 1511 QLIST_FOREACH(vbasedev, &group->device_list, next) { 1512 if (!vbasedev->dirty_tracking) { 1513 continue; 1514 } 1515 1516 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 1517 warn_report("%s: Failed to stop DMA logging, err %d (%s)", 1518 vbasedev->name, -errno, strerror(errno)); 1519 } 1520 vbasedev->dirty_tracking = false; 1521 } 1522 } 1523 } 1524 1525 static struct vfio_device_feature * 1526 vfio_device_feature_dma_logging_start_create(VFIOContainer *container, 1527 VFIODirtyRanges *tracking) 1528 { 1529 struct vfio_device_feature *feature; 1530 size_t feature_size; 1531 struct vfio_device_feature_dma_logging_control *control; 1532 struct vfio_device_feature_dma_logging_range *ranges; 1533 1534 feature_size = sizeof(struct vfio_device_feature) + 1535 sizeof(struct vfio_device_feature_dma_logging_control); 1536 feature = g_try_malloc0(feature_size); 1537 if (!feature) { 1538 errno = ENOMEM; 1539 return NULL; 1540 } 1541 feature->argsz = feature_size; 1542 feature->flags = VFIO_DEVICE_FEATURE_SET | 1543 VFIO_DEVICE_FEATURE_DMA_LOGGING_START; 1544 1545 control = (struct vfio_device_feature_dma_logging_control *)feature->data; 1546 control->page_size = qemu_real_host_page_size(); 1547 1548 /* 1549 * DMA logging uAPI guarantees to support at least a number of ranges that 1550 * fits into a single host kernel base page. 
1551 */ 1552 control->num_ranges = !!tracking->max32 + !!tracking->max64; 1553 ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, 1554 control->num_ranges); 1555 if (!ranges) { 1556 g_free(feature); 1557 errno = ENOMEM; 1558 1559 return NULL; 1560 } 1561 1562 control->ranges = (__u64)(uintptr_t)ranges; 1563 if (tracking->max32) { 1564 ranges->iova = tracking->min32; 1565 ranges->length = (tracking->max32 - tracking->min32) + 1; 1566 ranges++; 1567 } 1568 if (tracking->max64) { 1569 ranges->iova = tracking->min64; 1570 ranges->length = (tracking->max64 - tracking->min64) + 1; 1571 } 1572 1573 trace_vfio_device_dirty_tracking_start(control->num_ranges, 1574 tracking->min32, tracking->max32, 1575 tracking->min64, tracking->max64); 1576 1577 return feature; 1578 } 1579 1580 static void vfio_device_feature_dma_logging_start_destroy( 1581 struct vfio_device_feature *feature) 1582 { 1583 struct vfio_device_feature_dma_logging_control *control = 1584 (struct vfio_device_feature_dma_logging_control *)feature->data; 1585 struct vfio_device_feature_dma_logging_range *ranges = 1586 (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; 1587 1588 g_free(ranges); 1589 g_free(feature); 1590 } 1591 1592 static int vfio_devices_dma_logging_start(VFIOContainer *container) 1593 { 1594 struct vfio_device_feature *feature; 1595 VFIODirtyRanges ranges; 1596 VFIODevice *vbasedev; 1597 VFIOGroup *group; 1598 int ret = 0; 1599 1600 vfio_dirty_tracking_init(container, &ranges); 1601 feature = vfio_device_feature_dma_logging_start_create(container, 1602 &ranges); 1603 if (!feature) { 1604 return -errno; 1605 } 1606 1607 QLIST_FOREACH(group, &container->group_list, container_next) { 1608 QLIST_FOREACH(vbasedev, &group->device_list, next) { 1609 if (vbasedev->dirty_tracking) { 1610 continue; 1611 } 1612 1613 ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); 1614 if (ret) { 1615 ret = -errno; 1616 error_report("%s: Failed to start DMA logging, err %d (%s)", 1617 vbasedev->name, ret, strerror(errno)); 1618 goto out; 1619 } 1620 vbasedev->dirty_tracking = true; 1621 } 1622 } 1623 1624 out: 1625 if (ret) { 1626 vfio_devices_dma_logging_stop(container); 1627 } 1628 1629 vfio_device_feature_dma_logging_start_destroy(feature); 1630 1631 return ret; 1632 } 1633 1634 static void vfio_listener_log_global_start(MemoryListener *listener) 1635 { 1636 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 1637 int ret; 1638 1639 if (vfio_devices_all_device_dirty_tracking(container)) { 1640 ret = vfio_devices_dma_logging_start(container); 1641 } else { 1642 ret = vfio_set_dirty_page_tracking(container, true); 1643 } 1644 1645 if (ret) { 1646 error_report("vfio: Could not start dirty page tracking, err: %d (%s)", 1647 ret, strerror(-ret)); 1648 vfio_set_migration_error(ret); 1649 } 1650 } 1651 1652 static void vfio_listener_log_global_stop(MemoryListener *listener) 1653 { 1654 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 1655 int ret = 0; 1656 1657 if (vfio_devices_all_device_dirty_tracking(container)) { 1658 vfio_devices_dma_logging_stop(container); 1659 } else { 1660 ret = vfio_set_dirty_page_tracking(container, false); 1661 } 1662 1663 if (ret) { 1664 error_report("vfio: Could not stop dirty page tracking, err: %d (%s)", 1665 ret, strerror(-ret)); 1666 vfio_set_migration_error(ret); 1667 } 1668 } 1669 1670 static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, 1671 hwaddr size, void *bitmap) 1672 { 1673 uint64_t 
buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + 1674 sizeof(struct vfio_device_feature_dma_logging_report), 1675 sizeof(__u64))] = {}; 1676 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 1677 struct vfio_device_feature_dma_logging_report *report = 1678 (struct vfio_device_feature_dma_logging_report *)feature->data; 1679 1680 report->iova = iova; 1681 report->length = size; 1682 report->page_size = qemu_real_host_page_size(); 1683 report->bitmap = (__u64)(uintptr_t)bitmap; 1684 1685 feature->argsz = sizeof(buf); 1686 feature->flags = VFIO_DEVICE_FEATURE_GET | 1687 VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; 1688 1689 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 1690 return -errno; 1691 } 1692 1693 return 0; 1694 } 1695 1696 static int vfio_devices_query_dirty_bitmap(VFIOContainer *container, 1697 VFIOBitmap *vbmap, hwaddr iova, 1698 hwaddr size) 1699 { 1700 VFIODevice *vbasedev; 1701 VFIOGroup *group; 1702 int ret; 1703 1704 QLIST_FOREACH(group, &container->group_list, container_next) { 1705 QLIST_FOREACH(vbasedev, &group->device_list, next) { 1706 ret = vfio_device_dma_logging_report(vbasedev, iova, size, 1707 vbmap->bitmap); 1708 if (ret) { 1709 error_report("%s: Failed to get DMA logging report, iova: " 1710 "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx 1711 ", err: %d (%s)", 1712 vbasedev->name, iova, size, ret, strerror(-ret)); 1713 1714 return ret; 1715 } 1716 } 1717 } 1718 1719 return 0; 1720 } 1721 1722 static int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, 1723 hwaddr iova, hwaddr size) 1724 { 1725 struct vfio_iommu_type1_dirty_bitmap *dbitmap; 1726 struct vfio_iommu_type1_dirty_bitmap_get *range; 1727 int ret; 1728 1729 dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); 1730 1731 dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); 1732 dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; 1733 range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; 1734 range->iova = iova; 1735 range->size = size; 1736 1737 /* 1738 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of 1739 * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize 1740 * to qemu_real_host_page_size. 1741 */ 1742 range->bitmap.pgsize = qemu_real_host_page_size(); 1743 range->bitmap.size = vbmap->size; 1744 range->bitmap.data = (__u64 *)vbmap->bitmap; 1745 1746 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); 1747 if (ret) { 1748 ret = -errno; 1749 error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 1750 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, 1751 (uint64_t)range->size, errno); 1752 } 1753 1754 g_free(dbitmap); 1755 1756 return ret; 1757 } 1758 1759 static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, 1760 uint64_t size, ram_addr_t ram_addr) 1761 { 1762 bool all_device_dirty_tracking = 1763 vfio_devices_all_device_dirty_tracking(container); 1764 uint64_t dirty_pages; 1765 VFIOBitmap vbmap; 1766 int ret; 1767 1768 if (!container->dirty_pages_supported && !all_device_dirty_tracking) { 1769 cpu_physical_memory_set_dirty_range(ram_addr, size, 1770 tcg_enabled() ? 
DIRTY_CLIENTS_ALL : 1771 DIRTY_CLIENTS_NOCODE); 1772 return 0; 1773 } 1774 1775 ret = vfio_bitmap_alloc(&vbmap, size); 1776 if (ret) { 1777 return ret; 1778 } 1779 1780 if (all_device_dirty_tracking) { 1781 ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); 1782 } else { 1783 ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); 1784 } 1785 1786 if (ret) { 1787 goto out; 1788 } 1789 1790 dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, 1791 vbmap.pages); 1792 1793 trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, 1794 ram_addr, dirty_pages); 1795 out: 1796 g_free(vbmap.bitmap); 1797 1798 return ret; 1799 } 1800 1801 typedef struct { 1802 IOMMUNotifier n; 1803 VFIOGuestIOMMU *giommu; 1804 } vfio_giommu_dirty_notifier; 1805 1806 static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 1807 { 1808 vfio_giommu_dirty_notifier *gdn = container_of(n, 1809 vfio_giommu_dirty_notifier, n); 1810 VFIOGuestIOMMU *giommu = gdn->giommu; 1811 VFIOContainer *container = giommu->container; 1812 hwaddr iova = iotlb->iova + giommu->iommu_offset; 1813 ram_addr_t translated_addr; 1814 int ret = -EINVAL; 1815 1816 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); 1817 1818 if (iotlb->target_as != &address_space_memory) { 1819 error_report("Wrong target AS \"%s\", only system memory is allowed", 1820 iotlb->target_as->name ? iotlb->target_as->name : "none"); 1821 goto out; 1822 } 1823 1824 rcu_read_lock(); 1825 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { 1826 ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, 1827 translated_addr); 1828 if (ret) { 1829 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " 1830 "0x%"HWADDR_PRIx") = %d (%s)", 1831 container, iova, iotlb->addr_mask + 1, ret, 1832 strerror(-ret)); 1833 } 1834 } 1835 rcu_read_unlock(); 1836 1837 out: 1838 if (ret) { 1839 vfio_set_migration_error(ret); 1840 } 1841 } 1842 1843 static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, 1844 void *opaque) 1845 { 1846 const hwaddr size = int128_get64(section->size); 1847 const hwaddr iova = section->offset_within_address_space; 1848 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + 1849 section->offset_within_region; 1850 VFIORamDiscardListener *vrdl = opaque; 1851 1852 /* 1853 * Sync the whole mapped region (spanning multiple individual mappings) 1854 * in one go. 1855 */ 1856 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); 1857 } 1858 1859 static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, 1860 MemoryRegionSection *section) 1861 { 1862 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 1863 VFIORamDiscardListener *vrdl = NULL; 1864 1865 QLIST_FOREACH(vrdl, &container->vrdl_list, next) { 1866 if (vrdl->mr == section->mr && 1867 vrdl->offset_within_address_space == 1868 section->offset_within_address_space) { 1869 break; 1870 } 1871 } 1872 1873 if (!vrdl) { 1874 hw_error("vfio: Trying to sync missing RAM discard listener"); 1875 } 1876 1877 /* 1878 * We only want/can synchronize the bitmap for actually mapped parts - 1879 * which correspond to populated parts. Replay all populated parts. 
1880 */ 1881 return ram_discard_manager_replay_populated(rdm, section, 1882 vfio_ram_discard_get_dirty_bitmap, 1883 &vrdl); 1884 } 1885 1886 static int vfio_sync_dirty_bitmap(VFIOContainer *container, 1887 MemoryRegionSection *section) 1888 { 1889 ram_addr_t ram_addr; 1890 1891 if (memory_region_is_iommu(section->mr)) { 1892 VFIOGuestIOMMU *giommu; 1893 1894 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { 1895 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 1896 giommu->n.start == section->offset_within_region) { 1897 Int128 llend; 1898 vfio_giommu_dirty_notifier gdn = { .giommu = giommu }; 1899 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr, 1900 MEMTXATTRS_UNSPECIFIED); 1901 1902 llend = int128_add(int128_make64(section->offset_within_region), 1903 section->size); 1904 llend = int128_sub(llend, int128_one()); 1905 1906 iommu_notifier_init(&gdn.n, 1907 vfio_iommu_map_dirty_notify, 1908 IOMMU_NOTIFIER_MAP, 1909 section->offset_within_region, 1910 int128_get64(llend), 1911 idx); 1912 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n); 1913 break; 1914 } 1915 } 1916 return 0; 1917 } else if (memory_region_has_ram_discard_manager(section->mr)) { 1918 return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); 1919 } 1920 1921 ram_addr = memory_region_get_ram_addr(section->mr) + 1922 section->offset_within_region; 1923 1924 return vfio_get_dirty_bitmap(container, 1925 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), 1926 int128_get64(section->size), ram_addr); 1927 } 1928 1929 static void vfio_listener_log_sync(MemoryListener *listener, 1930 MemoryRegionSection *section) 1931 { 1932 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 1933 int ret; 1934 1935 if (vfio_listener_skipped_section(section)) { 1936 return; 1937 } 1938 1939 if (vfio_devices_all_dirty_tracking(container)) { 1940 ret = vfio_sync_dirty_bitmap(container, section); 1941 if (ret) { 1942 error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, 1943 strerror(-ret)); 1944 vfio_set_migration_error(ret); 1945 } 1946 } 1947 } 1948 1949 static const MemoryListener vfio_memory_listener = { 1950 .name = "vfio", 1951 .region_add = vfio_listener_region_add, 1952 .region_del = vfio_listener_region_del, 1953 .log_global_start = vfio_listener_log_global_start, 1954 .log_global_stop = vfio_listener_log_global_stop, 1955 .log_sync = vfio_listener_log_sync, 1956 }; 1957 1958 static void vfio_listener_release(VFIOContainer *container) 1959 { 1960 memory_listener_unregister(&container->listener); 1961 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { 1962 memory_listener_unregister(&container->prereg_listener); 1963 } 1964 } 1965 1966 static struct vfio_info_cap_header * 1967 vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) 1968 { 1969 struct vfio_info_cap_header *hdr; 1970 1971 for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) { 1972 if (hdr->id == id) { 1973 return hdr; 1974 } 1975 } 1976 1977 return NULL; 1978 } 1979 1980 struct vfio_info_cap_header * 1981 vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) 1982 { 1983 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { 1984 return NULL; 1985 } 1986 1987 return vfio_get_cap((void *)info, info->cap_offset, id); 1988 } 1989 1990 static struct vfio_info_cap_header * 1991 vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) 1992 { 1993 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { 1994 return NULL; 1995 } 1996 1997 return 
    return vfio_get_cap((void *)info, info->cap_offset, id);
}

struct vfio_info_cap_header *
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
                             unsigned int *avail)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_dma_avail *cap;

    /* If the capability cannot be found, assume no DMA limiting */
    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
    if (hdr == NULL) {
        return false;
    }

    if (avail != NULL) {
        cap = (void *) hdr;
        *avail = cap->avail;
    }

    return true;
}

static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
                                          struct vfio_region_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_sparse_mmap *sparse;
    int i, j;

    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
    if (!hdr) {
        return -ENODEV;
    }

    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);

    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                         region->nr, sparse->nr_areas);

    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);

    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
        if (sparse->areas[i].size) {
            trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
                                                sparse->areas[i].offset +
                                                sparse->areas[i].size - 1);
            region->mmaps[j].offset = sparse->areas[i].offset;
            region->mmaps[j].size = sparse->areas[i].size;
            j++;
        }
    }

    region->nr_mmaps = j;
    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));

    return 0;
}

int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
{
    struct vfio_region_info *info;
    int ret;

    ret = vfio_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    region->vbasedev = vbasedev;
    region->flags = info->flags;
    region->size = info->size;
    region->fd_offset = info->offset;
    region->nr = index;

    if (region->size) {
        region->mem = g_new0(MemoryRegion, 1);
        memory_region_init_io(region->mem, obj, &vfio_region_ops,
                              region, name, region->size);

        if (!vbasedev->no_mmap &&
            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {

            ret = vfio_setup_region_sparse_mmaps(region, info);

            if (ret) {
                region->nr_mmaps = 1;
                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                region->mmaps[0].offset = 0;
                region->mmaps[0].size = region->size;
            }
        }
    }

    g_free(info);

    trace_vfio_region_setup(vbasedev->name, index, name,
                            region->flags, region->fd_offset, region->size);
    return 0;
}

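/*
 * The helpers below manage the life cycle of a VFIORegion's sparse mmaps:
 * vfio_region_mmap() maps each area and exposes it as a RAM device
 * subregion, vfio_region_unmap() tears the mappings down again,
 * vfio_region_exit() detaches the subregions, and vfio_region_finalize()
 * releases all remaining resources.
 */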
static void vfio_subregion_unmap(VFIORegion *region, int index)
{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
    region->mmaps[index].mmap = NULL;
}

int vfio_region_mmap(VFIORegion *region)
{
    int i, prot = 0;
    char *name;

    if (!region->mem) {
        return 0;
    }

    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;

    for (i = 0; i < region->nr_mmaps; i++) {
        region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
                                     MAP_SHARED, region->vbasedev->fd,
                                     region->fd_offset +
                                     region->mmaps[i].offset);
        if (region->mmaps[i].mmap == MAP_FAILED) {
            int ret = -errno;

            trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
                                         region->fd_offset +
                                         region->mmaps[i].offset,
                                         region->fd_offset +
                                         region->mmaps[i].offset +
                                         region->mmaps[i].size - 1, ret);

            region->mmaps[i].mmap = NULL;

            for (i--; i >= 0; i--) {
                vfio_subregion_unmap(region, i);
            }

            return ret;
        }

        name = g_strdup_printf("%s mmaps[%d]",
                               memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
    }

    return 0;
}

void vfio_region_unmap(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            vfio_subregion_unmap(region, i);
        }
    }
}

void vfio_region_exit(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
        }
    }

    trace_vfio_region_exit(region->vbasedev->name, region->nr);
}

void vfio_region_finalize(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
        }
    }

    object_unparent(OBJECT(region->mem));

    g_free(region->mem);
    g_free(region->mmaps);

    trace_vfio_region_finalize(region->vbasedev->name, region->nr);

    region->mem = NULL;
    region->mmaps = NULL;
    region->nr_mmaps = 0;
    region->size = 0;
    region->flags = 0;
    region->nr = 0;
}

void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
        }
    }

    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
                                        enabled);
}

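/*
 * Reset handler registered via qemu_register_reset(): a first pass lets
 * every realized VFIO device compute whether it needs a reset, a second
 * pass performs the (potentially multi-device) hot reset for those that do.
 */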
void vfio_reset_handler(void *opaque)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->dev->realized) {
                vbasedev->ops->vfio_compute_needs_reset(vbasedev);
            }
        }
    }

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->dev->realized && vbasedev->needs_reset) {
                vbasedev->ops->vfio_hot_reset_multi(vbasedev);
            }
        }
    }
}

static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            error_report("Failed to create KVM VFIO device: %m");
            return;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (vfio_kvm_device_fd < 0) {
        return;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *space;

    QLIST_FOREACH(space, &vfio_address_spaces, list) {
        if (space->as == as) {
            return space;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    space = g_malloc0(sizeof(*space));
    space->as = as;
    QLIST_INIT(&space->containers);

    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);

    return space;
}

static void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (QLIST_EMPTY(&space->containers)) {
        QLIST_REMOVE(space, list);
        g_free(space);
    }
}

/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
static int vfio_get_iommu_type(VFIOContainer *container,
                               Error **errp)
{
    int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
                          VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
    int i;

    for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
        if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
            return iommu_types[i];
        }
    }
    error_setg(errp, "No available IOMMU models");
    return -EINVAL;
}

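/*
 * Select an IOMMU backend for the container and attach the group to it;
 * the kernel requires at least one group to be set on the container before
 * VFIO_SET_IOMMU can succeed.
 */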
static int vfio_init_container(VFIOContainer *container, int group_fd,
                               Error **errp)
{
    int iommu_type, ret;

    iommu_type = vfio_get_iommu_type(container, errp);
    if (iommu_type < 0) {
        return iommu_type;
    }

    ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
    if (ret) {
        error_setg_errno(errp, errno, "Failed to set group container");
        return -errno;
    }

    while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
        if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, although the IOMMU subdriver always advertises v1
             * and v2, the running platform may not support v2 and there is
             * no way to guess it until an IOMMU group gets added to the
             * container.  So if it fails with v2, try v1 as a fallback.
             */
            iommu_type = VFIO_SPAPR_TCE_IOMMU;
            continue;
        }
        error_setg_errno(errp, errno, "Failed to set iommu for container");
        return -errno;
    }

    container->iommu_type = iommu_type;
    return 0;
}

static int vfio_get_iommu_info(VFIOContainer *container,
                               struct vfio_iommu_type1_info **info)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_info);

    *info = g_new0(struct vfio_iommu_type1_info, 1);
again:
    (*info)->argsz = argsz;

    if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);
        goto again;
    }

    return 0;
}

static struct vfio_info_cap_header *
vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    struct vfio_info_cap_header *hdr;
    void *ptr = info;

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

static void vfio_get_iommu_info_migration(VFIOContainer *container,
                                          struct vfio_iommu_type1_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_migration *cap_mig;

    hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
    if (!hdr) {
        return;
    }

    cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
                           header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects dirty bitmap pages of
     * qemu_real_host_page_size(), so only enable dirty page tracking if the
     * kernel supports that page size.
     */
    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
        container->dirty_pages_supported = true;
        container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
        container->dirty_pgsizes = cap_mig->pgsize_bitmap;
    }
}

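/*
 * Attach @group to a container in the given address space, reusing an
 * existing container that accepts the group if possible, otherwise creating
 * a new one and configuring its IOMMU type, host DMA windows and memory
 * listener.
 */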
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                                  Error **errp)
{
    VFIOContainer *container;
    int ret, fd;
    VFIOAddressSpace *space;

    space = vfio_get_address_space(as);

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned.  We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared.  This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for instance
     * if the device is an mdev device where it is known that the host vendor
     * driver will never pin pages outside of the working set of the guest
     * driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace.  If any
     * pages are already zapped from the virtual address space, such as from
     * previous discards, new pinning will cause valid mappings to be
     * re-established.  Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     * In particular, virtio-balloon is currently only prevented from
     * discarding new memory; it does not yet set
     * ram_block_discard_set_required() and therefore neither stops us here
     * nor deals with the sudden memory consumption of inflated memory.
     *
     * We do support discarding of memory coordinated via the RamDiscardManager
     * with some IOMMU types. vfio_ram_block_discard_disable() handles the
     * details once we know which type of IOMMU we are using.
     */

    QLIST_FOREACH(container, &space->containers, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            ret = vfio_ram_block_discard_disable(container, true);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "Cannot set discarding of RAM broken");
                if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
                          &container->fd)) {
                    error_report("vfio: error disconnecting group %d from"
                                 " container", group->groupid);
                }
                return ret;
            }
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            vfio_kvm_device_add_group(group);
            return 0;
        }
    }

    fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
        ret = -errno;
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        ret = -EINVAL;
        goto close_fd_exit;
    }

    container = g_malloc0(sizeof(*container));
    container->space = space;
    container->fd = fd;
    container->error = NULL;
    container->dirty_pages_supported = false;
    container->dma_max_mappings = 0;
    QLIST_INIT(&container->giommu_list);
    QLIST_INIT(&container->hostwin_list);
    QLIST_INIT(&container->vrdl_list);

    ret = vfio_init_container(container, group->fd, errp);
    if (ret) {
        goto free_container_exit;
    }

    ret = vfio_ram_block_discard_disable(container, true);
    if (ret) {
        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
        goto free_container_exit;
    }

    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
    {
        struct vfio_iommu_type1_info *info;

        ret = vfio_get_iommu_info(container, &info);
        if (ret) {
            error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
            goto enable_discards_exit;
        }

        if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
            container->pgsizes = info->iova_pgsizes;
        } else {
            container->pgsizes = qemu_real_host_page_size();
        }

        if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
            container->dma_max_mappings = 65535;
        }
        vfio_get_iommu_info_migration(container, info);
        g_free(info);

        /*
         * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
         * information to get the actual window extent rather than assume
         * a 64-bit IOVA address space.
         */
        vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);

        break;
    }
    case VFIO_SPAPR_TCE_v2_IOMMU:
    case VFIO_SPAPR_TCE_IOMMU:
    {
        struct vfio_iommu_spapr_tce_info info;
        bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;

        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when the container fd is closed, so we do not call it explicitly
         * in this file.
         */
        if (!v2) {
            ret = ioctl(fd, VFIO_IOMMU_ENABLE);
            if (ret) {
                error_setg_errno(errp, errno, "failed to enable container");
                ret = -errno;
                goto enable_discards_exit;
            }
        } else {
            container->prereg_listener = vfio_prereg_listener;

            memory_listener_register(&container->prereg_listener,
                                     &address_space_memory);
            if (container->error) {
                memory_listener_unregister(&container->prereg_listener);
                ret = -1;
                error_propagate_prepend(errp, container->error,
                    "RAM memory listener initialization failed: ");
                goto enable_discards_exit;
            }
        }

        info.argsz = sizeof(info);
        ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
        if (ret) {
            error_setg_errno(errp, errno,
                             "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
            ret = -errno;
            if (v2) {
                memory_listener_unregister(&container->prereg_listener);
            }
            goto enable_discards_exit;
        }

        if (v2) {
            container->pgsizes = info.ddw.pgsizes;
            /*
             * There is a default window in a newly created container.
             * To make region_add/del simpler, it is better to remove this
             * window now and let the iommu_listener callbacks create and
             * remove windows when needed.
             */
            ret = vfio_spapr_remove_window(container, info.dma32_window_start);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "failed to remove existing window");
                goto enable_discards_exit;
            }
        } else {
            /* The default table uses 4K pages */
            container->pgsizes = 0x1000;
            vfio_host_win_add(container, info.dma32_window_start,
                              info.dma32_window_start +
                              info.dma32_window_size - 1,
                              0x1000);
        }
    }
    }

    vfio_kvm_device_add_group(group);

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&space->containers, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    container->listener = vfio_memory_listener;

    memory_listener_register(&container->listener, container->space->as);

    if (container->error) {
        ret = -1;
        error_propagate_prepend(errp, container->error,
            "memory listener initialization failed: ");
        goto listener_release_exit;
    }

    container->initialized = true;

    return 0;
listener_release_exit:
    QLIST_REMOVE(group, container_next);
    QLIST_REMOVE(container, next);
    vfio_kvm_device_del_group(group);
    vfio_listener_release(container);

enable_discards_exit:
    vfio_ram_block_discard_disable(container, false);

free_container_exit:
    g_free(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return ret;
}

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Explicitly release the listener before unsetting the container,
     * since unsetting may destroy the backend container if this is the
     * last group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        vfio_listener_release(container);
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = container->space;
        VFIOGuestIOMMU *giommu, *tmp;
        VFIOHostDMAWindow *hostwin, *next;

        QLIST_REMOVE(container, next);

        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
            memory_region_unregister_iommu_notifier(
                    MEMORY_REGION(giommu->iommu_mr), &giommu->n);
            QLIST_REMOVE(giommu, giommu_next);
            g_free(giommu);
        }

        QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
                           next) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
        }

        trace_vfio_disconnect_container(container->fd);
        close(container->fd);
        g_free(container);

        vfio_put_address_space(space);
    }
}

VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open_old(path, O_RDWR);
    if (group->fd < 0) {
        error_setg_errno(errp, errno, "failed to open %s", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

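/*
 * Release a group once its last device has been removed: undo the RAM
 * discard disabling taken on the group's behalf, detach it from the KVM
 * VFIO device and its container, and close the group file descriptor.
 */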
void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}

struct vfio_device_info *vfio_get_device_info(int fd)
{
    struct vfio_device_info *info;
    uint32_t argsz = sizeof(*info);

    info = g_malloc0(argsz);

retry:
    info->argsz = argsz;

    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
        g_free(info);
        return NULL;
    }

    if (info->argsz > argsz) {
        argsz = info->argsz;
        info = g_realloc(info, argsz);
        goto retry;
    }

    return info;
}

int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev, Error **errp)
{
    g_autofree struct vfio_device_info *info = NULL;
    int fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return fd;
    }

    info = vfio_get_device_info(fd);
    if (!info) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return -1;
    }

    /*
     * Set discarding of RAM as not broken for this group if the driver knows
     * the device operates compatibly with discarding.  Setting must be
     * consistent per group, but since compatibility is really only possible
     * with mdev currently, we expect singleton groups.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return -1;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = info->num_irqs;
    vbasedev->num_regions = info->num_regions;
    vbasedev->flags = info->flags;

    trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);

    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);

    return 0;
}

void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

int vfio_get_region_info(VFIODevice *vbasedev, int index,
                         struct vfio_region_info **info)
{
    size_t argsz = sizeof(struct vfio_region_info);

    *info = g_malloc0(argsz);

    (*info)->index = index;
retry:
    (*info)->argsz = argsz;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);

        goto retry;
    }

    return 0;
}

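/*
 * Look up a device-specific region by its type/subtype capability.  On
 * success, *info points to the matching region info and must be freed by
 * the caller; -ENODEV is returned if no region matches.
 */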
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
                             uint32_t subtype, struct vfio_region_info **info)
{
    int i;

    for (i = 0; i < vbasedev->num_regions; i++) {
        struct vfio_info_cap_header *hdr;
        struct vfio_region_info_cap_type *cap_type;

        if (vfio_get_region_info(vbasedev, i, info)) {
            continue;
        }

        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
        if (!hdr) {
            g_free(*info);
            continue;
        }

        cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);

        trace_vfio_get_dev_region(vbasedev->name, i,
                                  cap_type->type, cap_type->subtype);

        if (cap_type->type == type && cap_type->subtype == subtype) {
            return 0;
        }

        g_free(*info);
    }

    *info = NULL;
    return -ENODEV;
}

bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
    struct vfio_region_info *info = NULL;
    bool ret = false;

    if (!vfio_get_region_info(vbasedev, region, &info)) {
        if (vfio_get_region_info_cap(info, cap_type)) {
            ret = true;
        }
        g_free(info);
    }

    return ret;
}

/*
 * Interfaces for IBM EEH (Enhanced Error Handling)
 */
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
    /*
     * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
     * implementation is broken if there are multiple groups in a
     * container.  The hardware works in units of Partitionable
     * Endpoints (== IOMMU groups) and the EEH operations naively
     * iterate across all groups in the container, without any logic
     * to make sure the groups have their state synchronized.  For
     * certain operations (ENABLE) that might be ok, until an error
     * occurs, but for others (GET_STATE) it's clearly broken.
     */

    /*
     * XXX Once fixed kernels exist, test for them here
     */

    if (QLIST_EMPTY(&container->group_list)) {
        return false;
    }

    if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
        return false;
    }

    return true;
}

static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
{
    struct vfio_eeh_pe_op pe_op = {
        .argsz = sizeof(pe_op),
        .op = op,
    };
    int ret;

    if (!vfio_eeh_container_ok(container)) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
                     "kernel requires a container with exactly one group", op);
        return -EPERM;
    }

    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
    if (ret < 0) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
        return -errno;
    }

    return ret;
}

static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);
    VFIOContainer *container = NULL;

    if (QLIST_EMPTY(&space->containers)) {
        /* No containers to act on */
        goto out;
    }

    container = QLIST_FIRST(&space->containers);

    if (QLIST_NEXT(container, next)) {
        /*
         * We don't yet have logic to synchronize EEH state across
         * multiple containers.
         */
        container = NULL;
        goto out;
    }

out:
    vfio_put_address_space(space);
    return container;
}

bool vfio_eeh_as_ok(AddressSpace *as)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    return (container != NULL) && vfio_eeh_container_ok(container);
}

int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return -ENODEV;
    }
    return vfio_eeh_container_op(container, op);
}