1 /* 2 * generic functions used by VFIO devices 3 * 4 * Copyright Red Hat, Inc. 2012 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Based on qemu-kvm device-assignment: 13 * Adapted for KVM by Qumranet. 14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) 15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) 16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) 17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) 18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) 19 */ 20 21 #include "qemu/osdep.h" 22 #include <sys/ioctl.h> 23 #ifdef CONFIG_KVM 24 #include <linux/kvm.h> 25 #endif 26 #include <linux/vfio.h> 27 28 #include "hw/vfio/vfio-common.h" 29 #include "hw/vfio/pci.h" 30 #include "exec/address-spaces.h" 31 #include "exec/memory.h" 32 #include "exec/ram_addr.h" 33 #include "hw/hw.h" 34 #include "qemu/error-report.h" 35 #include "qemu/main-loop.h" 36 #include "qemu/range.h" 37 #include "sysemu/kvm.h" 38 #include "sysemu/reset.h" 39 #include "sysemu/runstate.h" 40 #include "trace.h" 41 #include "qapi/error.h" 42 #include "migration/migration.h" 43 #include "migration/misc.h" 44 #include "migration/blocker.h" 45 #include "migration/qemu-file.h" 46 #include "sysemu/tpm.h" 47 48 VFIODeviceList vfio_device_list = 49 QLIST_HEAD_INITIALIZER(vfio_device_list); 50 static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = 51 QLIST_HEAD_INITIALIZER(vfio_address_spaces); 52 53 #ifdef CONFIG_KVM 54 /* 55 * We have a single VFIO pseudo device per KVM VM. Once created it lives 56 * for the life of the VM. Closing the file descriptor only drops our 57 * reference to it and the device's reference to kvm. Therefore once 58 * initialized, this file descriptor is only released on QEMU exit and 59 * we'll re-use it should another vfio device be attached before then. 60 */ 61 int vfio_kvm_device_fd = -1; 62 #endif 63 64 /* 65 * Device state interfaces 66 */ 67 68 bool vfio_mig_active(void) 69 { 70 VFIODevice *vbasedev; 71 72 if (QLIST_EMPTY(&vfio_device_list)) { 73 return false; 74 } 75 76 QLIST_FOREACH(vbasedev, &vfio_device_list, next) { 77 if (vbasedev->migration_blocker) { 78 return false; 79 } 80 } 81 return true; 82 } 83 84 static Error *multiple_devices_migration_blocker; 85 86 /* 87 * Multiple devices migration is allowed only if all devices support P2P 88 * migration. Single device migration is allowed regardless of P2P migration 89 * support. 90 */ 91 static bool vfio_multiple_devices_migration_is_supported(void) 92 { 93 VFIODevice *vbasedev; 94 unsigned int device_num = 0; 95 bool all_support_p2p = true; 96 97 QLIST_FOREACH(vbasedev, &vfio_device_list, next) { 98 if (vbasedev->migration) { 99 device_num++; 100 101 if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { 102 all_support_p2p = false; 103 } 104 } 105 } 106 107 return all_support_p2p || device_num <= 1; 108 } 109 110 int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) 111 { 112 int ret; 113 114 if (vfio_multiple_devices_migration_is_supported()) { 115 return 0; 116 } 117 118 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { 119 error_setg(errp, "Multiple VFIO devices migration is supported only if " 120 "all of them support P2P migration"); 121 return -EINVAL; 122 } 123 124 if (multiple_devices_migration_blocker) { 125 return 0; 126 } 127 128 error_setg(&multiple_devices_migration_blocker, 129 "Multiple VFIO devices migration is supported only if all of " 130 "them support P2P migration"); 131 ret = migrate_add_blocker(&multiple_devices_migration_blocker, errp); 132 133 return ret; 134 } 135 136 void vfio_unblock_multiple_devices_migration(void) 137 { 138 if (!multiple_devices_migration_blocker || 139 !vfio_multiple_devices_migration_is_supported()) { 140 return; 141 } 142 143 migrate_del_blocker(&multiple_devices_migration_blocker); 144 } 145 146 bool vfio_viommu_preset(VFIODevice *vbasedev) 147 { 148 return vbasedev->container->space->as != &address_space_memory; 149 } 150 151 static void vfio_set_migration_error(int err) 152 { 153 MigrationState *ms = migrate_get_current(); 154 155 if (migration_is_setup_or_active(ms->state)) { 156 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { 157 if (ms->to_dst_file) { 158 qemu_file_set_error(ms->to_dst_file, err); 159 } 160 } 161 } 162 } 163 164 bool vfio_device_state_is_running(VFIODevice *vbasedev) 165 { 166 VFIOMigration *migration = vbasedev->migration; 167 168 return migration->device_state == VFIO_DEVICE_STATE_RUNNING || 169 migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P; 170 } 171 172 bool vfio_device_state_is_precopy(VFIODevice *vbasedev) 173 { 174 VFIOMigration *migration = vbasedev->migration; 175 176 return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY || 177 migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; 178 } 179 180 static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) 181 { 182 VFIODevice *vbasedev; 183 MigrationState *ms = migrate_get_current(); 184 185 if (ms->state != MIGRATION_STATUS_ACTIVE && 186 ms->state != MIGRATION_STATUS_DEVICE) { 187 return false; 188 } 189 190 QLIST_FOREACH(vbasedev, &container->device_list, container_next) { 191 VFIOMigration *migration = vbasedev->migration; 192 193 if (!migration) { 194 return false; 195 } 196 197 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && 198 (vfio_device_state_is_running(vbasedev) || 199 vfio_device_state_is_precopy(vbasedev))) { 200 return false; 201 } 202 } 203 return true; 204 } 205 206 bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) 207 { 208 VFIODevice *vbasedev; 209 210 QLIST_FOREACH(vbasedev, &container->device_list, container_next) { 211 if (!vbasedev->dirty_pages_supported) { 212 return false; 213 } 214 } 215 216 return true; 217 } 218 219 /* 220 * Check if all VFIO devices are running and migration is active, which is 221 * essentially equivalent to the migration being in pre-copy phase. 222 */ 223 bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) 224 { 225 VFIODevice *vbasedev; 226 227 if (!migration_is_active(migrate_get_current())) { 228 return false; 229 } 230 231 QLIST_FOREACH(vbasedev, &container->device_list, container_next) { 232 VFIOMigration *migration = vbasedev->migration; 233 234 if (!migration) { 235 return false; 236 } 237 238 if (vfio_device_state_is_running(vbasedev) || 239 vfio_device_state_is_precopy(vbasedev)) { 240 continue; 241 } else { 242 return false; 243 } 244 } 245 return true; 246 } 247 248 static bool vfio_listener_skipped_section(MemoryRegionSection *section) 249 { 250 return (!memory_region_is_ram(section->mr) && 251 !memory_region_is_iommu(section->mr)) || 252 memory_region_is_protected(section->mr) || 253 /* 254 * Sizing an enabled 64-bit BAR can cause spurious mappings to 255 * addresses in the upper part of the 64-bit address space. These 256 * are never accessed by the CPU and beyond the address width of 257 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. 258 */ 259 section->offset_within_address_space & (1ULL << 63); 260 } 261 262 /* Called with rcu_read_lock held. */ 263 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, 264 ram_addr_t *ram_addr, bool *read_only) 265 { 266 bool ret, mr_has_discard_manager; 267 268 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, 269 &mr_has_discard_manager); 270 if (ret && mr_has_discard_manager) { 271 /* 272 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The 273 * pages will remain pinned inside vfio until unmapped, resulting in a 274 * higher memory consumption than expected. If memory would get 275 * populated again later, there would be an inconsistency between pages 276 * pinned by vfio and pages seen by QEMU. This is the case until 277 * unmapped from the IOMMU (e.g., during device reset). 278 * 279 * With malicious guests, we really only care about pinning more memory 280 * than expected. RLIMIT_MEMLOCK set for the user/process can never be 281 * exceeded and can be used to mitigate this problem. 282 */ 283 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" 284 " RAM (e.g., virtio-mem) works, however, malicious" 285 " guests can trigger pinning of more memory than" 286 " intended via an IOMMU. It's possible to mitigate " 287 " by setting/adjusting RLIMIT_MEMLOCK."); 288 } 289 return ret; 290 } 291 292 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 293 { 294 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); 295 VFIOContainer *container = giommu->container; 296 hwaddr iova = iotlb->iova + giommu->iommu_offset; 297 void *vaddr; 298 int ret; 299 300 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", 301 iova, iova + iotlb->addr_mask); 302 303 if (iotlb->target_as != &address_space_memory) { 304 error_report("Wrong target AS \"%s\", only system memory is allowed", 305 iotlb->target_as->name ? iotlb->target_as->name : "none"); 306 vfio_set_migration_error(-EINVAL); 307 return; 308 } 309 310 rcu_read_lock(); 311 312 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { 313 bool read_only; 314 315 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) { 316 goto out; 317 } 318 /* 319 * vaddr is only valid until rcu_read_unlock(). But after 320 * vfio_dma_map has set up the mapping the pages will be 321 * pinned by the kernel. This makes sure that the RAM backend 322 * of vaddr will always be there, even if the memory object is 323 * destroyed and its backing memory munmap-ed. 324 */ 325 ret = vfio_dma_map(container, iova, 326 iotlb->addr_mask + 1, vaddr, 327 read_only); 328 if (ret) { 329 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " 330 "0x%"HWADDR_PRIx", %p) = %d (%s)", 331 container, iova, 332 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); 333 } 334 } else { 335 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); 336 if (ret) { 337 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " 338 "0x%"HWADDR_PRIx") = %d (%s)", 339 container, iova, 340 iotlb->addr_mask + 1, ret, strerror(-ret)); 341 vfio_set_migration_error(ret); 342 } 343 } 344 out: 345 rcu_read_unlock(); 346 } 347 348 static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, 349 MemoryRegionSection *section) 350 { 351 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 352 listener); 353 const hwaddr size = int128_get64(section->size); 354 const hwaddr iova = section->offset_within_address_space; 355 int ret; 356 357 /* Unmap with a single call. */ 358 ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); 359 if (ret) { 360 error_report("%s: vfio_dma_unmap() failed: %s", __func__, 361 strerror(-ret)); 362 } 363 } 364 365 static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, 366 MemoryRegionSection *section) 367 { 368 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 369 listener); 370 const hwaddr end = section->offset_within_region + 371 int128_get64(section->size); 372 hwaddr start, next, iova; 373 void *vaddr; 374 int ret; 375 376 /* 377 * Map in (aligned within memory region) minimum granularity, so we can 378 * unmap in minimum granularity later. 379 */ 380 for (start = section->offset_within_region; start < end; start = next) { 381 next = ROUND_UP(start + 1, vrdl->granularity); 382 next = MIN(next, end); 383 384 iova = start - section->offset_within_region + 385 section->offset_within_address_space; 386 vaddr = memory_region_get_ram_ptr(section->mr) + start; 387 388 ret = vfio_dma_map(vrdl->container, iova, next - start, 389 vaddr, section->readonly); 390 if (ret) { 391 /* Rollback */ 392 vfio_ram_discard_notify_discard(rdl, section); 393 return ret; 394 } 395 } 396 return 0; 397 } 398 399 static void vfio_register_ram_discard_listener(VFIOContainer *container, 400 MemoryRegionSection *section) 401 { 402 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 403 VFIORamDiscardListener *vrdl; 404 405 /* Ignore some corner cases not relevant in practice. */ 406 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); 407 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, 408 TARGET_PAGE_SIZE)); 409 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); 410 411 vrdl = g_new0(VFIORamDiscardListener, 1); 412 vrdl->container = container; 413 vrdl->mr = section->mr; 414 vrdl->offset_within_address_space = section->offset_within_address_space; 415 vrdl->size = int128_get64(section->size); 416 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, 417 section->mr); 418 419 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); 420 g_assert(container->pgsizes && 421 vrdl->granularity >= 1ULL << ctz64(container->pgsizes)); 422 423 ram_discard_listener_init(&vrdl->listener, 424 vfio_ram_discard_notify_populate, 425 vfio_ram_discard_notify_discard, true); 426 ram_discard_manager_register_listener(rdm, &vrdl->listener, section); 427 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); 428 429 /* 430 * Sanity-check if we have a theoretically problematic setup where we could 431 * exceed the maximum number of possible DMA mappings over time. We assume 432 * that each mapped section in the same address space as a RamDiscardManager 433 * section consumes exactly one DMA mapping, with the exception of 434 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections 435 * in the same address space as RamDiscardManager sections. 436 * 437 * We assume that each section in the address space consumes one memslot. 438 * We take the number of KVM memory slots as a best guess for the maximum 439 * number of sections in the address space we could have over time, 440 * also consuming DMA mappings. 441 */ 442 if (container->dma_max_mappings) { 443 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; 444 445 #ifdef CONFIG_KVM 446 if (kvm_enabled()) { 447 max_memslots = kvm_get_max_memslots(); 448 } 449 #endif 450 451 QLIST_FOREACH(vrdl, &container->vrdl_list, next) { 452 hwaddr start, end; 453 454 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, 455 vrdl->granularity); 456 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, 457 vrdl->granularity); 458 vrdl_mappings += (end - start) / vrdl->granularity; 459 vrdl_count++; 460 } 461 462 if (vrdl_mappings + max_memslots - vrdl_count > 463 container->dma_max_mappings) { 464 warn_report("%s: possibly running out of DMA mappings. E.g., try" 465 " increasing the 'block-size' of virtio-mem devies." 466 " Maximum possible DMA mappings: %d, Maximum possible" 467 " memslots: %d", __func__, container->dma_max_mappings, 468 max_memslots); 469 } 470 } 471 } 472 473 static void vfio_unregister_ram_discard_listener(VFIOContainer *container, 474 MemoryRegionSection *section) 475 { 476 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 477 VFIORamDiscardListener *vrdl = NULL; 478 479 QLIST_FOREACH(vrdl, &container->vrdl_list, next) { 480 if (vrdl->mr == section->mr && 481 vrdl->offset_within_address_space == 482 section->offset_within_address_space) { 483 break; 484 } 485 } 486 487 if (!vrdl) { 488 hw_error("vfio: Trying to unregister missing RAM discard listener"); 489 } 490 491 ram_discard_manager_unregister_listener(rdm, &vrdl->listener); 492 QLIST_REMOVE(vrdl, next); 493 g_free(vrdl); 494 } 495 496 static bool vfio_known_safe_misalignment(MemoryRegionSection *section) 497 { 498 MemoryRegion *mr = section->mr; 499 500 if (!TPM_IS_CRB(mr->owner)) { 501 return false; 502 } 503 504 /* this is a known safe misaligned region, just trace for debug purpose */ 505 trace_vfio_known_safe_misalignment(memory_region_name(mr), 506 section->offset_within_address_space, 507 section->offset_within_region, 508 qemu_real_host_page_size()); 509 return true; 510 } 511 512 static bool vfio_listener_valid_section(MemoryRegionSection *section, 513 const char *name) 514 { 515 if (vfio_listener_skipped_section(section)) { 516 trace_vfio_listener_region_skip(name, 517 section->offset_within_address_space, 518 section->offset_within_address_space + 519 int128_get64(int128_sub(section->size, int128_one()))); 520 return false; 521 } 522 523 if (unlikely((section->offset_within_address_space & 524 ~qemu_real_host_page_mask()) != 525 (section->offset_within_region & ~qemu_real_host_page_mask()))) { 526 if (!vfio_known_safe_misalignment(section)) { 527 error_report("%s received unaligned region %s iova=0x%"PRIx64 528 " offset_within_region=0x%"PRIx64 529 " qemu_real_host_page_size=0x%"PRIxPTR, 530 __func__, memory_region_name(section->mr), 531 section->offset_within_address_space, 532 section->offset_within_region, 533 qemu_real_host_page_size()); 534 } 535 return false; 536 } 537 538 return true; 539 } 540 541 static bool vfio_get_section_iova_range(VFIOContainer *container, 542 MemoryRegionSection *section, 543 hwaddr *out_iova, hwaddr *out_end, 544 Int128 *out_llend) 545 { 546 Int128 llend; 547 hwaddr iova; 548 549 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); 550 llend = int128_make64(section->offset_within_address_space); 551 llend = int128_add(llend, section->size); 552 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); 553 554 if (int128_ge(int128_make64(iova), llend)) { 555 return false; 556 } 557 558 *out_iova = iova; 559 *out_end = int128_get64(int128_sub(llend, int128_one())); 560 if (out_llend) { 561 *out_llend = llend; 562 } 563 return true; 564 } 565 566 static void vfio_listener_region_add(MemoryListener *listener, 567 MemoryRegionSection *section) 568 { 569 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 570 hwaddr iova, end; 571 Int128 llend, llsize; 572 void *vaddr; 573 int ret; 574 Error *err = NULL; 575 576 if (!vfio_listener_valid_section(section, "region_add")) { 577 return; 578 } 579 580 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { 581 if (memory_region_is_ram_device(section->mr)) { 582 trace_vfio_listener_region_add_no_dma_map( 583 memory_region_name(section->mr), 584 section->offset_within_address_space, 585 int128_getlo(section->size), 586 qemu_real_host_page_size()); 587 } 588 return; 589 } 590 591 if (vfio_container_add_section_window(container, section, &err)) { 592 goto fail; 593 } 594 595 memory_region_ref(section->mr); 596 597 if (memory_region_is_iommu(section->mr)) { 598 VFIOGuestIOMMU *giommu; 599 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); 600 int iommu_idx; 601 602 trace_vfio_listener_region_add_iommu(iova, end); 603 /* 604 * FIXME: For VFIO iommu types which have KVM acceleration to 605 * avoid bouncing all map/unmaps through qemu this way, this 606 * would be the right place to wire that up (tell the KVM 607 * device emulation the VFIO iommu handles to use). 608 */ 609 giommu = g_malloc0(sizeof(*giommu)); 610 giommu->iommu_mr = iommu_mr; 611 giommu->iommu_offset = section->offset_within_address_space - 612 section->offset_within_region; 613 giommu->container = container; 614 llend = int128_add(int128_make64(section->offset_within_region), 615 section->size); 616 llend = int128_sub(llend, int128_one()); 617 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 618 MEMTXATTRS_UNSPECIFIED); 619 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, 620 IOMMU_NOTIFIER_IOTLB_EVENTS, 621 section->offset_within_region, 622 int128_get64(llend), 623 iommu_idx); 624 625 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, 626 container->pgsizes, 627 &err); 628 if (ret) { 629 g_free(giommu); 630 goto fail; 631 } 632 633 if (container->iova_ranges) { 634 ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, 635 container->iova_ranges, &err); 636 if (ret) { 637 g_free(giommu); 638 goto fail; 639 } 640 } 641 642 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, 643 &err); 644 if (ret) { 645 g_free(giommu); 646 goto fail; 647 } 648 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); 649 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); 650 651 return; 652 } 653 654 /* Here we assume that memory_region_is_ram(section->mr)==true */ 655 656 /* 657 * For RAM memory regions with a RamDiscardManager, we only want to map the 658 * actually populated parts - and update the mapping whenever we're notified 659 * about changes. 660 */ 661 if (memory_region_has_ram_discard_manager(section->mr)) { 662 vfio_register_ram_discard_listener(container, section); 663 return; 664 } 665 666 vaddr = memory_region_get_ram_ptr(section->mr) + 667 section->offset_within_region + 668 (iova - section->offset_within_address_space); 669 670 trace_vfio_listener_region_add_ram(iova, end, vaddr); 671 672 llsize = int128_sub(llend, int128_make64(iova)); 673 674 if (memory_region_is_ram_device(section->mr)) { 675 hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; 676 677 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { 678 trace_vfio_listener_region_add_no_dma_map( 679 memory_region_name(section->mr), 680 section->offset_within_address_space, 681 int128_getlo(section->size), 682 pgmask + 1); 683 return; 684 } 685 } 686 687 ret = vfio_dma_map(container, iova, int128_get64(llsize), 688 vaddr, section->readonly); 689 if (ret) { 690 error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " 691 "0x%"HWADDR_PRIx", %p) = %d (%s)", 692 container, iova, int128_get64(llsize), vaddr, ret, 693 strerror(-ret)); 694 if (memory_region_is_ram_device(section->mr)) { 695 /* Allow unexpected mappings not to be fatal for RAM devices */ 696 error_report_err(err); 697 return; 698 } 699 goto fail; 700 } 701 702 return; 703 704 fail: 705 if (memory_region_is_ram_device(section->mr)) { 706 error_reportf_err(err, "PCI p2p may not work: "); 707 return; 708 } 709 /* 710 * On the initfn path, store the first error in the container so we 711 * can gracefully fail. Runtime, there's not much we can do other 712 * than throw a hardware error. 713 */ 714 if (!container->initialized) { 715 if (!container->error) { 716 error_propagate_prepend(&container->error, err, 717 "Region %s: ", 718 memory_region_name(section->mr)); 719 } else { 720 error_free(err); 721 } 722 } else { 723 error_report_err(err); 724 hw_error("vfio: DMA mapping failed, unable to continue"); 725 } 726 } 727 728 static void vfio_listener_region_del(MemoryListener *listener, 729 MemoryRegionSection *section) 730 { 731 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 732 hwaddr iova, end; 733 Int128 llend, llsize; 734 int ret; 735 bool try_unmap = true; 736 737 if (!vfio_listener_valid_section(section, "region_del")) { 738 return; 739 } 740 741 if (memory_region_is_iommu(section->mr)) { 742 VFIOGuestIOMMU *giommu; 743 744 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { 745 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 746 giommu->n.start == section->offset_within_region) { 747 memory_region_unregister_iommu_notifier(section->mr, 748 &giommu->n); 749 QLIST_REMOVE(giommu, giommu_next); 750 g_free(giommu); 751 break; 752 } 753 } 754 755 /* 756 * FIXME: We assume the one big unmap below is adequate to 757 * remove any individual page mappings in the IOMMU which 758 * might have been copied into VFIO. This works for a page table 759 * based IOMMU where a big unmap flattens a large range of IO-PTEs. 760 * That may not be true for all IOMMU types. 761 */ 762 } 763 764 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { 765 return; 766 } 767 768 llsize = int128_sub(llend, int128_make64(iova)); 769 770 trace_vfio_listener_region_del(iova, end); 771 772 if (memory_region_is_ram_device(section->mr)) { 773 hwaddr pgmask; 774 775 pgmask = (1ULL << ctz64(container->pgsizes)) - 1; 776 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); 777 } else if (memory_region_has_ram_discard_manager(section->mr)) { 778 vfio_unregister_ram_discard_listener(container, section); 779 /* Unregistering will trigger an unmap. */ 780 try_unmap = false; 781 } 782 783 if (try_unmap) { 784 if (int128_eq(llsize, int128_2_64())) { 785 /* The unmap ioctl doesn't accept a full 64-bit span. */ 786 llsize = int128_rshift(llsize, 1); 787 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); 788 if (ret) { 789 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " 790 "0x%"HWADDR_PRIx") = %d (%s)", 791 container, iova, int128_get64(llsize), ret, 792 strerror(-ret)); 793 } 794 iova += int128_get64(llsize); 795 } 796 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); 797 if (ret) { 798 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " 799 "0x%"HWADDR_PRIx") = %d (%s)", 800 container, iova, int128_get64(llsize), ret, 801 strerror(-ret)); 802 } 803 } 804 805 memory_region_unref(section->mr); 806 807 vfio_container_del_section_window(container, section); 808 } 809 810 typedef struct VFIODirtyRanges { 811 hwaddr min32; 812 hwaddr max32; 813 hwaddr min64; 814 hwaddr max64; 815 hwaddr minpci64; 816 hwaddr maxpci64; 817 } VFIODirtyRanges; 818 819 typedef struct VFIODirtyRangesListener { 820 VFIOContainer *container; 821 VFIODirtyRanges ranges; 822 MemoryListener listener; 823 } VFIODirtyRangesListener; 824 825 static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, 826 VFIOContainer *container) 827 { 828 VFIOPCIDevice *pcidev; 829 VFIODevice *vbasedev; 830 Object *owner; 831 832 owner = memory_region_owner(section->mr); 833 834 QLIST_FOREACH(vbasedev, &container->device_list, container_next) { 835 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { 836 continue; 837 } 838 pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 839 if (OBJECT(pcidev) == owner) { 840 return true; 841 } 842 } 843 844 return false; 845 } 846 847 static void vfio_dirty_tracking_update(MemoryListener *listener, 848 MemoryRegionSection *section) 849 { 850 VFIODirtyRangesListener *dirty = container_of(listener, 851 VFIODirtyRangesListener, 852 listener); 853 VFIODirtyRanges *range = &dirty->ranges; 854 hwaddr iova, end, *min, *max; 855 856 if (!vfio_listener_valid_section(section, "tracking_update") || 857 !vfio_get_section_iova_range(dirty->container, section, 858 &iova, &end, NULL)) { 859 return; 860 } 861 862 /* 863 * The address space passed to the dirty tracker is reduced to three ranges: 864 * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the 865 * PCI 64-bit hole. 866 * 867 * The underlying reports of dirty will query a sub-interval of each of 868 * these ranges. 869 * 870 * The purpose of the three range handling is to handle known cases of big 871 * holes in the address space, like the x86 AMD 1T hole, and firmware (like 872 * OVMF) which may relocate the pci-hole64 to the end of the address space. 873 * The latter would otherwise generate large ranges for tracking, stressing 874 * the limits of supported hardware. The pci-hole32 will always be below 4G 875 * (overlapping or not) so it doesn't need special handling and is part of 876 * the 32-bit range. 877 * 878 * The alternative would be an IOVATree but that has a much bigger runtime 879 * overhead and unnecessary complexity. 880 */ 881 if (vfio_section_is_vfio_pci(section, dirty->container) && 882 iova >= UINT32_MAX) { 883 min = &range->minpci64; 884 max = &range->maxpci64; 885 } else { 886 min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; 887 max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; 888 } 889 if (*min > iova) { 890 *min = iova; 891 } 892 if (*max < end) { 893 *max = end; 894 } 895 896 trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); 897 return; 898 } 899 900 static const MemoryListener vfio_dirty_tracking_listener = { 901 .name = "vfio-tracking", 902 .region_add = vfio_dirty_tracking_update, 903 }; 904 905 static void vfio_dirty_tracking_init(VFIOContainer *container, 906 VFIODirtyRanges *ranges) 907 { 908 VFIODirtyRangesListener dirty; 909 910 memset(&dirty, 0, sizeof(dirty)); 911 dirty.ranges.min32 = UINT32_MAX; 912 dirty.ranges.min64 = UINT64_MAX; 913 dirty.ranges.minpci64 = UINT64_MAX; 914 dirty.listener = vfio_dirty_tracking_listener; 915 dirty.container = container; 916 917 memory_listener_register(&dirty.listener, 918 container->space->as); 919 920 *ranges = dirty.ranges; 921 922 /* 923 * The memory listener is synchronous, and used to calculate the range 924 * to dirty tracking. Unregister it after we are done as we are not 925 * interested in any follow-up updates. 926 */ 927 memory_listener_unregister(&dirty.listener); 928 } 929 930 static void vfio_devices_dma_logging_stop(VFIOContainer *container) 931 { 932 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), 933 sizeof(uint64_t))] = {}; 934 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 935 VFIODevice *vbasedev; 936 937 feature->argsz = sizeof(buf); 938 feature->flags = VFIO_DEVICE_FEATURE_SET | 939 VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; 940 941 QLIST_FOREACH(vbasedev, &container->device_list, container_next) { 942 if (!vbasedev->dirty_tracking) { 943 continue; 944 } 945 946 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 947 warn_report("%s: Failed to stop DMA logging, err %d (%s)", 948 vbasedev->name, -errno, strerror(errno)); 949 } 950 vbasedev->dirty_tracking = false; 951 } 952 } 953 954 static struct vfio_device_feature * 955 vfio_device_feature_dma_logging_start_create(VFIOContainer *container, 956 VFIODirtyRanges *tracking) 957 { 958 struct vfio_device_feature *feature; 959 size_t feature_size; 960 struct vfio_device_feature_dma_logging_control *control; 961 struct vfio_device_feature_dma_logging_range *ranges; 962 963 feature_size = sizeof(struct vfio_device_feature) + 964 sizeof(struct vfio_device_feature_dma_logging_control); 965 feature = g_try_malloc0(feature_size); 966 if (!feature) { 967 errno = ENOMEM; 968 return NULL; 969 } 970 feature->argsz = feature_size; 971 feature->flags = VFIO_DEVICE_FEATURE_SET | 972 VFIO_DEVICE_FEATURE_DMA_LOGGING_START; 973 974 control = (struct vfio_device_feature_dma_logging_control *)feature->data; 975 control->page_size = qemu_real_host_page_size(); 976 977 /* 978 * DMA logging uAPI guarantees to support at least a number of ranges that 979 * fits into a single host kernel base page. 980 */ 981 control->num_ranges = !!tracking->max32 + !!tracking->max64 + 982 !!tracking->maxpci64; 983 ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, 984 control->num_ranges); 985 if (!ranges) { 986 g_free(feature); 987 errno = ENOMEM; 988 989 return NULL; 990 } 991 992 control->ranges = (__u64)(uintptr_t)ranges; 993 if (tracking->max32) { 994 ranges->iova = tracking->min32; 995 ranges->length = (tracking->max32 - tracking->min32) + 1; 996 ranges++; 997 } 998 if (tracking->max64) { 999 ranges->iova = tracking->min64; 1000 ranges->length = (tracking->max64 - tracking->min64) + 1; 1001 ranges++; 1002 } 1003 if (tracking->maxpci64) { 1004 ranges->iova = tracking->minpci64; 1005 ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1; 1006 } 1007 1008 trace_vfio_device_dirty_tracking_start(control->num_ranges, 1009 tracking->min32, tracking->max32, 1010 tracking->min64, tracking->max64, 1011 tracking->minpci64, tracking->maxpci64); 1012 1013 return feature; 1014 } 1015 1016 static void vfio_device_feature_dma_logging_start_destroy( 1017 struct vfio_device_feature *feature) 1018 { 1019 struct vfio_device_feature_dma_logging_control *control = 1020 (struct vfio_device_feature_dma_logging_control *)feature->data; 1021 struct vfio_device_feature_dma_logging_range *ranges = 1022 (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; 1023 1024 g_free(ranges); 1025 g_free(feature); 1026 } 1027 1028 static int vfio_devices_dma_logging_start(VFIOContainer *container) 1029 { 1030 struct vfio_device_feature *feature; 1031 VFIODirtyRanges ranges; 1032 VFIODevice *vbasedev; 1033 int ret = 0; 1034 1035 vfio_dirty_tracking_init(container, &ranges); 1036 feature = vfio_device_feature_dma_logging_start_create(container, 1037 &ranges); 1038 if (!feature) { 1039 return -errno; 1040 } 1041 1042 QLIST_FOREACH(vbasedev, &container->device_list, container_next) { 1043 if (vbasedev->dirty_tracking) { 1044 continue; 1045 } 1046 1047 ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); 1048 if (ret) { 1049 ret = -errno; 1050 error_report("%s: Failed to start DMA logging, err %d (%s)", 1051 vbasedev->name, ret, strerror(errno)); 1052 goto out; 1053 } 1054 vbasedev->dirty_tracking = true; 1055 } 1056 1057 out: 1058 if (ret) { 1059 vfio_devices_dma_logging_stop(container); 1060 } 1061 1062 vfio_device_feature_dma_logging_start_destroy(feature); 1063 1064 return ret; 1065 } 1066 1067 static void vfio_listener_log_global_start(MemoryListener *listener) 1068 { 1069 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 1070 int ret; 1071 1072 if (vfio_devices_all_device_dirty_tracking(container)) { 1073 ret = vfio_devices_dma_logging_start(container); 1074 } else { 1075 ret = vfio_set_dirty_page_tracking(container, true); 1076 } 1077 1078 if (ret) { 1079 error_report("vfio: Could not start dirty page tracking, err: %d (%s)", 1080 ret, strerror(-ret)); 1081 vfio_set_migration_error(ret); 1082 } 1083 } 1084 1085 static void vfio_listener_log_global_stop(MemoryListener *listener) 1086 { 1087 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 1088 int ret = 0; 1089 1090 if (vfio_devices_all_device_dirty_tracking(container)) { 1091 vfio_devices_dma_logging_stop(container); 1092 } else { 1093 ret = vfio_set_dirty_page_tracking(container, false); 1094 } 1095 1096 if (ret) { 1097 error_report("vfio: Could not stop dirty page tracking, err: %d (%s)", 1098 ret, strerror(-ret)); 1099 vfio_set_migration_error(ret); 1100 } 1101 } 1102 1103 static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, 1104 hwaddr size, void *bitmap) 1105 { 1106 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + 1107 sizeof(struct vfio_device_feature_dma_logging_report), 1108 sizeof(__u64))] = {}; 1109 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 1110 struct vfio_device_feature_dma_logging_report *report = 1111 (struct vfio_device_feature_dma_logging_report *)feature->data; 1112 1113 report->iova = iova; 1114 report->length = size; 1115 report->page_size = qemu_real_host_page_size(); 1116 report->bitmap = (__u64)(uintptr_t)bitmap; 1117 1118 feature->argsz = sizeof(buf); 1119 feature->flags = VFIO_DEVICE_FEATURE_GET | 1120 VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; 1121 1122 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 1123 return -errno; 1124 } 1125 1126 return 0; 1127 } 1128 1129 int vfio_devices_query_dirty_bitmap(VFIOContainer *container, 1130 VFIOBitmap *vbmap, hwaddr iova, 1131 hwaddr size) 1132 { 1133 VFIODevice *vbasedev; 1134 int ret; 1135 1136 QLIST_FOREACH(vbasedev, &container->device_list, container_next) { 1137 ret = vfio_device_dma_logging_report(vbasedev, iova, size, 1138 vbmap->bitmap); 1139 if (ret) { 1140 error_report("%s: Failed to get DMA logging report, iova: " 1141 "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx 1142 ", err: %d (%s)", 1143 vbasedev->name, iova, size, ret, strerror(-ret)); 1144 1145 return ret; 1146 } 1147 } 1148 1149 return 0; 1150 } 1151 1152 int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, 1153 uint64_t size, ram_addr_t ram_addr) 1154 { 1155 bool all_device_dirty_tracking = 1156 vfio_devices_all_device_dirty_tracking(container); 1157 uint64_t dirty_pages; 1158 VFIOBitmap vbmap; 1159 int ret; 1160 1161 if (!container->dirty_pages_supported && !all_device_dirty_tracking) { 1162 cpu_physical_memory_set_dirty_range(ram_addr, size, 1163 tcg_enabled() ? DIRTY_CLIENTS_ALL : 1164 DIRTY_CLIENTS_NOCODE); 1165 return 0; 1166 } 1167 1168 ret = vfio_bitmap_alloc(&vbmap, size); 1169 if (ret) { 1170 return ret; 1171 } 1172 1173 if (all_device_dirty_tracking) { 1174 ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); 1175 } else { 1176 ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); 1177 } 1178 1179 if (ret) { 1180 goto out; 1181 } 1182 1183 dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, 1184 vbmap.pages); 1185 1186 trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, 1187 ram_addr, dirty_pages); 1188 out: 1189 g_free(vbmap.bitmap); 1190 1191 return ret; 1192 } 1193 1194 typedef struct { 1195 IOMMUNotifier n; 1196 VFIOGuestIOMMU *giommu; 1197 } vfio_giommu_dirty_notifier; 1198 1199 static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 1200 { 1201 vfio_giommu_dirty_notifier *gdn = container_of(n, 1202 vfio_giommu_dirty_notifier, n); 1203 VFIOGuestIOMMU *giommu = gdn->giommu; 1204 VFIOContainer *container = giommu->container; 1205 hwaddr iova = iotlb->iova + giommu->iommu_offset; 1206 ram_addr_t translated_addr; 1207 int ret = -EINVAL; 1208 1209 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); 1210 1211 if (iotlb->target_as != &address_space_memory) { 1212 error_report("Wrong target AS \"%s\", only system memory is allowed", 1213 iotlb->target_as->name ? iotlb->target_as->name : "none"); 1214 goto out; 1215 } 1216 1217 rcu_read_lock(); 1218 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { 1219 ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, 1220 translated_addr); 1221 if (ret) { 1222 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " 1223 "0x%"HWADDR_PRIx") = %d (%s)", 1224 container, iova, iotlb->addr_mask + 1, ret, 1225 strerror(-ret)); 1226 } 1227 } 1228 rcu_read_unlock(); 1229 1230 out: 1231 if (ret) { 1232 vfio_set_migration_error(ret); 1233 } 1234 } 1235 1236 static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, 1237 void *opaque) 1238 { 1239 const hwaddr size = int128_get64(section->size); 1240 const hwaddr iova = section->offset_within_address_space; 1241 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + 1242 section->offset_within_region; 1243 VFIORamDiscardListener *vrdl = opaque; 1244 1245 /* 1246 * Sync the whole mapped region (spanning multiple individual mappings) 1247 * in one go. 1248 */ 1249 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); 1250 } 1251 1252 static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, 1253 MemoryRegionSection *section) 1254 { 1255 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 1256 VFIORamDiscardListener *vrdl = NULL; 1257 1258 QLIST_FOREACH(vrdl, &container->vrdl_list, next) { 1259 if (vrdl->mr == section->mr && 1260 vrdl->offset_within_address_space == 1261 section->offset_within_address_space) { 1262 break; 1263 } 1264 } 1265 1266 if (!vrdl) { 1267 hw_error("vfio: Trying to sync missing RAM discard listener"); 1268 } 1269 1270 /* 1271 * We only want/can synchronize the bitmap for actually mapped parts - 1272 * which correspond to populated parts. Replay all populated parts. 1273 */ 1274 return ram_discard_manager_replay_populated(rdm, section, 1275 vfio_ram_discard_get_dirty_bitmap, 1276 &vrdl); 1277 } 1278 1279 static int vfio_sync_dirty_bitmap(VFIOContainer *container, 1280 MemoryRegionSection *section) 1281 { 1282 ram_addr_t ram_addr; 1283 1284 if (memory_region_is_iommu(section->mr)) { 1285 VFIOGuestIOMMU *giommu; 1286 1287 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { 1288 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 1289 giommu->n.start == section->offset_within_region) { 1290 Int128 llend; 1291 vfio_giommu_dirty_notifier gdn = { .giommu = giommu }; 1292 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr, 1293 MEMTXATTRS_UNSPECIFIED); 1294 1295 llend = int128_add(int128_make64(section->offset_within_region), 1296 section->size); 1297 llend = int128_sub(llend, int128_one()); 1298 1299 iommu_notifier_init(&gdn.n, 1300 vfio_iommu_map_dirty_notify, 1301 IOMMU_NOTIFIER_MAP, 1302 section->offset_within_region, 1303 int128_get64(llend), 1304 idx); 1305 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n); 1306 break; 1307 } 1308 } 1309 return 0; 1310 } else if (memory_region_has_ram_discard_manager(section->mr)) { 1311 return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); 1312 } 1313 1314 ram_addr = memory_region_get_ram_addr(section->mr) + 1315 section->offset_within_region; 1316 1317 return vfio_get_dirty_bitmap(container, 1318 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), 1319 int128_get64(section->size), ram_addr); 1320 } 1321 1322 static void vfio_listener_log_sync(MemoryListener *listener, 1323 MemoryRegionSection *section) 1324 { 1325 VFIOContainer *container = container_of(listener, VFIOContainer, listener); 1326 int ret; 1327 1328 if (vfio_listener_skipped_section(section)) { 1329 return; 1330 } 1331 1332 if (vfio_devices_all_dirty_tracking(container)) { 1333 ret = vfio_sync_dirty_bitmap(container, section); 1334 if (ret) { 1335 error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, 1336 strerror(-ret)); 1337 vfio_set_migration_error(ret); 1338 } 1339 } 1340 } 1341 1342 const MemoryListener vfio_memory_listener = { 1343 .name = "vfio", 1344 .region_add = vfio_listener_region_add, 1345 .region_del = vfio_listener_region_del, 1346 .log_global_start = vfio_listener_log_global_start, 1347 .log_global_stop = vfio_listener_log_global_stop, 1348 .log_sync = vfio_listener_log_sync, 1349 }; 1350 1351 void vfio_reset_handler(void *opaque) 1352 { 1353 VFIODevice *vbasedev; 1354 1355 QLIST_FOREACH(vbasedev, &vfio_device_list, next) { 1356 if (vbasedev->dev->realized) { 1357 vbasedev->ops->vfio_compute_needs_reset(vbasedev); 1358 } 1359 } 1360 1361 QLIST_FOREACH(vbasedev, &vfio_device_list, next) { 1362 if (vbasedev->dev->realized && vbasedev->needs_reset) { 1363 vbasedev->ops->vfio_hot_reset_multi(vbasedev); 1364 } 1365 } 1366 } 1367 1368 int vfio_kvm_device_add_fd(int fd, Error **errp) 1369 { 1370 #ifdef CONFIG_KVM 1371 struct kvm_device_attr attr = { 1372 .group = KVM_DEV_VFIO_FILE, 1373 .attr = KVM_DEV_VFIO_FILE_ADD, 1374 .addr = (uint64_t)(unsigned long)&fd, 1375 }; 1376 1377 if (!kvm_enabled()) { 1378 return 0; 1379 } 1380 1381 if (vfio_kvm_device_fd < 0) { 1382 struct kvm_create_device cd = { 1383 .type = KVM_DEV_TYPE_VFIO, 1384 }; 1385 1386 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { 1387 error_setg_errno(errp, errno, "Failed to create KVM VFIO device"); 1388 return -errno; 1389 } 1390 1391 vfio_kvm_device_fd = cd.fd; 1392 } 1393 1394 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1395 error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device", 1396 fd); 1397 return -errno; 1398 } 1399 #endif 1400 return 0; 1401 } 1402 1403 int vfio_kvm_device_del_fd(int fd, Error **errp) 1404 { 1405 #ifdef CONFIG_KVM 1406 struct kvm_device_attr attr = { 1407 .group = KVM_DEV_VFIO_FILE, 1408 .attr = KVM_DEV_VFIO_FILE_DEL, 1409 .addr = (uint64_t)(unsigned long)&fd, 1410 }; 1411 1412 if (vfio_kvm_device_fd < 0) { 1413 error_setg(errp, "KVM VFIO device isn't created yet"); 1414 return -EINVAL; 1415 } 1416 1417 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1418 error_setg_errno(errp, errno, 1419 "Failed to remove fd %d from KVM VFIO device", fd); 1420 return -errno; 1421 } 1422 #endif 1423 return 0; 1424 } 1425 1426 VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) 1427 { 1428 VFIOAddressSpace *space; 1429 1430 QLIST_FOREACH(space, &vfio_address_spaces, list) { 1431 if (space->as == as) { 1432 return space; 1433 } 1434 } 1435 1436 /* No suitable VFIOAddressSpace, create a new one */ 1437 space = g_malloc0(sizeof(*space)); 1438 space->as = as; 1439 QLIST_INIT(&space->containers); 1440 1441 if (QLIST_EMPTY(&vfio_address_spaces)) { 1442 qemu_register_reset(vfio_reset_handler, NULL); 1443 } 1444 1445 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); 1446 1447 return space; 1448 } 1449 1450 void vfio_put_address_space(VFIOAddressSpace *space) 1451 { 1452 if (QLIST_EMPTY(&space->containers)) { 1453 QLIST_REMOVE(space, list); 1454 g_free(space); 1455 } 1456 if (QLIST_EMPTY(&vfio_address_spaces)) { 1457 qemu_unregister_reset(vfio_reset_handler, NULL); 1458 } 1459 } 1460 1461 struct vfio_device_info *vfio_get_device_info(int fd) 1462 { 1463 struct vfio_device_info *info; 1464 uint32_t argsz = sizeof(*info); 1465 1466 info = g_malloc0(argsz); 1467 1468 retry: 1469 info->argsz = argsz; 1470 1471 if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { 1472 g_free(info); 1473 return NULL; 1474 } 1475 1476 if (info->argsz > argsz) { 1477 argsz = info->argsz; 1478 info = g_realloc(info, argsz); 1479 goto retry; 1480 } 1481 1482 return info; 1483 } 1484