1 /* 2 * generic functions used by VFIO devices 3 * 4 * Copyright Red Hat, Inc. 2012 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Based on qemu-kvm device-assignment: 13 * Adapted for KVM by Qumranet. 14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) 15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) 16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) 17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) 18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) 19 */ 20 21 #include "qemu/osdep.h" 22 #include <sys/ioctl.h> 23 #ifdef CONFIG_KVM 24 #include <linux/kvm.h> 25 #endif 26 #include <linux/vfio.h> 27 28 #include "hw/vfio/vfio-common.h" 29 #include "hw/vfio/pci.h" 30 #include "exec/address-spaces.h" 31 #include "exec/memory.h" 32 #include "exec/ram_addr.h" 33 #include "hw/hw.h" 34 #include "qemu/error-report.h" 35 #include "qemu/main-loop.h" 36 #include "qemu/range.h" 37 #include "sysemu/kvm.h" 38 #include "sysemu/reset.h" 39 #include "sysemu/runstate.h" 40 #include "trace.h" 41 #include "qapi/error.h" 42 #include "migration/misc.h" 43 #include "migration/blocker.h" 44 #include "migration/qemu-file.h" 45 #include "sysemu/tpm.h" 46 47 VFIODeviceList vfio_device_list = 48 QLIST_HEAD_INITIALIZER(vfio_device_list); 49 static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = 50 QLIST_HEAD_INITIALIZER(vfio_address_spaces); 51 52 #ifdef CONFIG_KVM 53 /* 54 * We have a single VFIO pseudo device per KVM VM. Once created it lives 55 * for the life of the VM. Closing the file descriptor only drops our 56 * reference to it and the device's reference to kvm. Therefore once 57 * initialized, this file descriptor is only released on QEMU exit and 58 * we'll re-use it should another vfio device be attached before then. 59 */ 60 int vfio_kvm_device_fd = -1; 61 #endif 62 63 /* 64 * Device state interfaces 65 */ 66 67 bool vfio_mig_active(void) 68 { 69 VFIODevice *vbasedev; 70 71 if (QLIST_EMPTY(&vfio_device_list)) { 72 return false; 73 } 74 75 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 76 if (vbasedev->migration_blocker) { 77 return false; 78 } 79 } 80 return true; 81 } 82 83 static Error *multiple_devices_migration_blocker; 84 85 /* 86 * Multiple devices migration is allowed only if all devices support P2P 87 * migration. Single device migration is allowed regardless of P2P migration 88 * support. 89 */ 90 static bool vfio_multiple_devices_migration_is_supported(void) 91 { 92 VFIODevice *vbasedev; 93 unsigned int device_num = 0; 94 bool all_support_p2p = true; 95 96 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 97 if (vbasedev->migration) { 98 device_num++; 99 100 if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { 101 all_support_p2p = false; 102 } 103 } 104 } 105 106 return all_support_p2p || device_num <= 1; 107 } 108 109 int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) 110 { 111 int ret; 112 113 if (vfio_multiple_devices_migration_is_supported()) { 114 return 0; 115 } 116 117 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { 118 error_setg(errp, "Multiple VFIO devices migration is supported only if " 119 "all of them support P2P migration"); 120 return -EINVAL; 121 } 122 123 if (multiple_devices_migration_blocker) { 124 return 0; 125 } 126 127 error_setg(&multiple_devices_migration_blocker, 128 "Multiple VFIO devices migration is supported only if all of " 129 "them support P2P migration"); 130 ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, errp); 131 132 return ret; 133 } 134 135 void vfio_unblock_multiple_devices_migration(void) 136 { 137 if (!multiple_devices_migration_blocker || 138 !vfio_multiple_devices_migration_is_supported()) { 139 return; 140 } 141 142 migrate_del_blocker(&multiple_devices_migration_blocker); 143 } 144 145 bool vfio_viommu_preset(VFIODevice *vbasedev) 146 { 147 return vbasedev->bcontainer->space->as != &address_space_memory; 148 } 149 150 static void vfio_set_migration_error(int err) 151 { 152 if (migration_is_setup_or_active()) { 153 migration_file_set_error(err); 154 } 155 } 156 157 bool vfio_device_state_is_running(VFIODevice *vbasedev) 158 { 159 VFIOMigration *migration = vbasedev->migration; 160 161 return migration->device_state == VFIO_DEVICE_STATE_RUNNING || 162 migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P; 163 } 164 165 bool vfio_device_state_is_precopy(VFIODevice *vbasedev) 166 { 167 VFIOMigration *migration = vbasedev->migration; 168 169 return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY || 170 migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; 171 } 172 173 static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) 174 { 175 VFIODevice *vbasedev; 176 177 if (!migration_is_active() && !migration_is_device()) { 178 return false; 179 } 180 181 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 182 VFIOMigration *migration = vbasedev->migration; 183 184 if (!migration) { 185 return false; 186 } 187 188 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && 189 (vfio_device_state_is_running(vbasedev) || 190 vfio_device_state_is_precopy(vbasedev))) { 191 return false; 192 } 193 } 194 return true; 195 } 196 197 bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) 198 { 199 VFIODevice *vbasedev; 200 201 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 202 if (!vbasedev->dirty_pages_supported) { 203 return false; 204 } 205 } 206 207 return true; 208 } 209 210 /* 211 * Check if all VFIO devices are running and migration is active, which is 212 * essentially equivalent to the migration being in pre-copy phase. 213 */ 214 bool 215 vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) 216 { 217 VFIODevice *vbasedev; 218 219 if (!migration_is_active()) { 220 return false; 221 } 222 223 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 224 VFIOMigration *migration = vbasedev->migration; 225 226 if (!migration) { 227 return false; 228 } 229 230 if (vfio_device_state_is_running(vbasedev) || 231 vfio_device_state_is_precopy(vbasedev)) { 232 continue; 233 } else { 234 return false; 235 } 236 } 237 return true; 238 } 239 240 static bool vfio_listener_skipped_section(MemoryRegionSection *section) 241 { 242 return (!memory_region_is_ram(section->mr) && 243 !memory_region_is_iommu(section->mr)) || 244 memory_region_is_protected(section->mr) || 245 /* 246 * Sizing an enabled 64-bit BAR can cause spurious mappings to 247 * addresses in the upper part of the 64-bit address space. These 248 * are never accessed by the CPU and beyond the address width of 249 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. 250 */ 251 section->offset_within_address_space & (1ULL << 63); 252 } 253 254 /* Called with rcu_read_lock held. */ 255 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, 256 ram_addr_t *ram_addr, bool *read_only) 257 { 258 bool ret, mr_has_discard_manager; 259 260 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, 261 &mr_has_discard_manager); 262 if (ret && mr_has_discard_manager) { 263 /* 264 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The 265 * pages will remain pinned inside vfio until unmapped, resulting in a 266 * higher memory consumption than expected. If memory would get 267 * populated again later, there would be an inconsistency between pages 268 * pinned by vfio and pages seen by QEMU. This is the case until 269 * unmapped from the IOMMU (e.g., during device reset). 270 * 271 * With malicious guests, we really only care about pinning more memory 272 * than expected. RLIMIT_MEMLOCK set for the user/process can never be 273 * exceeded and can be used to mitigate this problem. 274 */ 275 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" 276 " RAM (e.g., virtio-mem) works, however, malicious" 277 " guests can trigger pinning of more memory than" 278 " intended via an IOMMU. It's possible to mitigate " 279 " by setting/adjusting RLIMIT_MEMLOCK."); 280 } 281 return ret; 282 } 283 284 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 285 { 286 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); 287 VFIOContainerBase *bcontainer = giommu->bcontainer; 288 hwaddr iova = iotlb->iova + giommu->iommu_offset; 289 void *vaddr; 290 int ret; 291 292 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", 293 iova, iova + iotlb->addr_mask); 294 295 if (iotlb->target_as != &address_space_memory) { 296 error_report("Wrong target AS \"%s\", only system memory is allowed", 297 iotlb->target_as->name ? iotlb->target_as->name : "none"); 298 vfio_set_migration_error(-EINVAL); 299 return; 300 } 301 302 rcu_read_lock(); 303 304 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { 305 bool read_only; 306 307 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) { 308 goto out; 309 } 310 /* 311 * vaddr is only valid until rcu_read_unlock(). But after 312 * vfio_dma_map has set up the mapping the pages will be 313 * pinned by the kernel. This makes sure that the RAM backend 314 * of vaddr will always be there, even if the memory object is 315 * destroyed and its backing memory munmap-ed. 316 */ 317 ret = vfio_container_dma_map(bcontainer, iova, 318 iotlb->addr_mask + 1, vaddr, 319 read_only); 320 if (ret) { 321 error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " 322 "0x%"HWADDR_PRIx", %p) = %d (%s)", 323 bcontainer, iova, 324 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); 325 } 326 } else { 327 ret = vfio_container_dma_unmap(bcontainer, iova, 328 iotlb->addr_mask + 1, iotlb); 329 if (ret) { 330 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 331 "0x%"HWADDR_PRIx") = %d (%s)", 332 bcontainer, iova, 333 iotlb->addr_mask + 1, ret, strerror(-ret)); 334 vfio_set_migration_error(ret); 335 } 336 } 337 out: 338 rcu_read_unlock(); 339 } 340 341 static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, 342 MemoryRegionSection *section) 343 { 344 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 345 listener); 346 VFIOContainerBase *bcontainer = vrdl->bcontainer; 347 const hwaddr size = int128_get64(section->size); 348 const hwaddr iova = section->offset_within_address_space; 349 int ret; 350 351 /* Unmap with a single call. */ 352 ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); 353 if (ret) { 354 error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, 355 strerror(-ret)); 356 } 357 } 358 359 static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, 360 MemoryRegionSection *section) 361 { 362 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 363 listener); 364 VFIOContainerBase *bcontainer = vrdl->bcontainer; 365 const hwaddr end = section->offset_within_region + 366 int128_get64(section->size); 367 hwaddr start, next, iova; 368 void *vaddr; 369 int ret; 370 371 /* 372 * Map in (aligned within memory region) minimum granularity, so we can 373 * unmap in minimum granularity later. 374 */ 375 for (start = section->offset_within_region; start < end; start = next) { 376 next = ROUND_UP(start + 1, vrdl->granularity); 377 next = MIN(next, end); 378 379 iova = start - section->offset_within_region + 380 section->offset_within_address_space; 381 vaddr = memory_region_get_ram_ptr(section->mr) + start; 382 383 ret = vfio_container_dma_map(bcontainer, iova, next - start, 384 vaddr, section->readonly); 385 if (ret) { 386 /* Rollback */ 387 vfio_ram_discard_notify_discard(rdl, section); 388 return ret; 389 } 390 } 391 return 0; 392 } 393 394 static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, 395 MemoryRegionSection *section) 396 { 397 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 398 VFIORamDiscardListener *vrdl; 399 400 /* Ignore some corner cases not relevant in practice. */ 401 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); 402 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, 403 TARGET_PAGE_SIZE)); 404 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); 405 406 vrdl = g_new0(VFIORamDiscardListener, 1); 407 vrdl->bcontainer = bcontainer; 408 vrdl->mr = section->mr; 409 vrdl->offset_within_address_space = section->offset_within_address_space; 410 vrdl->size = int128_get64(section->size); 411 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, 412 section->mr); 413 414 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); 415 g_assert(bcontainer->pgsizes && 416 vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); 417 418 ram_discard_listener_init(&vrdl->listener, 419 vfio_ram_discard_notify_populate, 420 vfio_ram_discard_notify_discard, true); 421 ram_discard_manager_register_listener(rdm, &vrdl->listener, section); 422 QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); 423 424 /* 425 * Sanity-check if we have a theoretically problematic setup where we could 426 * exceed the maximum number of possible DMA mappings over time. We assume 427 * that each mapped section in the same address space as a RamDiscardManager 428 * section consumes exactly one DMA mapping, with the exception of 429 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections 430 * in the same address space as RamDiscardManager sections. 431 * 432 * We assume that each section in the address space consumes one memslot. 433 * We take the number of KVM memory slots as a best guess for the maximum 434 * number of sections in the address space we could have over time, 435 * also consuming DMA mappings. 436 */ 437 if (bcontainer->dma_max_mappings) { 438 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; 439 440 #ifdef CONFIG_KVM 441 if (kvm_enabled()) { 442 max_memslots = kvm_get_max_memslots(); 443 } 444 #endif 445 446 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 447 hwaddr start, end; 448 449 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, 450 vrdl->granularity); 451 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, 452 vrdl->granularity); 453 vrdl_mappings += (end - start) / vrdl->granularity; 454 vrdl_count++; 455 } 456 457 if (vrdl_mappings + max_memslots - vrdl_count > 458 bcontainer->dma_max_mappings) { 459 warn_report("%s: possibly running out of DMA mappings. E.g., try" 460 " increasing the 'block-size' of virtio-mem devies." 461 " Maximum possible DMA mappings: %d, Maximum possible" 462 " memslots: %d", __func__, bcontainer->dma_max_mappings, 463 max_memslots); 464 } 465 } 466 } 467 468 static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, 469 MemoryRegionSection *section) 470 { 471 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 472 VFIORamDiscardListener *vrdl = NULL; 473 474 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 475 if (vrdl->mr == section->mr && 476 vrdl->offset_within_address_space == 477 section->offset_within_address_space) { 478 break; 479 } 480 } 481 482 if (!vrdl) { 483 hw_error("vfio: Trying to unregister missing RAM discard listener"); 484 } 485 486 ram_discard_manager_unregister_listener(rdm, &vrdl->listener); 487 QLIST_REMOVE(vrdl, next); 488 g_free(vrdl); 489 } 490 491 static bool vfio_known_safe_misalignment(MemoryRegionSection *section) 492 { 493 MemoryRegion *mr = section->mr; 494 495 if (!TPM_IS_CRB(mr->owner)) { 496 return false; 497 } 498 499 /* this is a known safe misaligned region, just trace for debug purpose */ 500 trace_vfio_known_safe_misalignment(memory_region_name(mr), 501 section->offset_within_address_space, 502 section->offset_within_region, 503 qemu_real_host_page_size()); 504 return true; 505 } 506 507 static bool vfio_listener_valid_section(MemoryRegionSection *section, 508 const char *name) 509 { 510 if (vfio_listener_skipped_section(section)) { 511 trace_vfio_listener_region_skip(name, 512 section->offset_within_address_space, 513 section->offset_within_address_space + 514 int128_get64(int128_sub(section->size, int128_one()))); 515 return false; 516 } 517 518 if (unlikely((section->offset_within_address_space & 519 ~qemu_real_host_page_mask()) != 520 (section->offset_within_region & ~qemu_real_host_page_mask()))) { 521 if (!vfio_known_safe_misalignment(section)) { 522 error_report("%s received unaligned region %s iova=0x%"PRIx64 523 " offset_within_region=0x%"PRIx64 524 " qemu_real_host_page_size=0x%"PRIxPTR, 525 __func__, memory_region_name(section->mr), 526 section->offset_within_address_space, 527 section->offset_within_region, 528 qemu_real_host_page_size()); 529 } 530 return false; 531 } 532 533 return true; 534 } 535 536 static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, 537 MemoryRegionSection *section, 538 hwaddr *out_iova, hwaddr *out_end, 539 Int128 *out_llend) 540 { 541 Int128 llend; 542 hwaddr iova; 543 544 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); 545 llend = int128_make64(section->offset_within_address_space); 546 llend = int128_add(llend, section->size); 547 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); 548 549 if (int128_ge(int128_make64(iova), llend)) { 550 return false; 551 } 552 553 *out_iova = iova; 554 *out_end = int128_get64(int128_sub(llend, int128_one())); 555 if (out_llend) { 556 *out_llend = llend; 557 } 558 return true; 559 } 560 561 static void vfio_listener_region_add(MemoryListener *listener, 562 MemoryRegionSection *section) 563 { 564 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 565 listener); 566 hwaddr iova, end; 567 Int128 llend, llsize; 568 void *vaddr; 569 int ret; 570 Error *err = NULL; 571 572 if (!vfio_listener_valid_section(section, "region_add")) { 573 return; 574 } 575 576 if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, 577 &llend)) { 578 if (memory_region_is_ram_device(section->mr)) { 579 trace_vfio_listener_region_add_no_dma_map( 580 memory_region_name(section->mr), 581 section->offset_within_address_space, 582 int128_getlo(section->size), 583 qemu_real_host_page_size()); 584 } 585 return; 586 } 587 588 if (vfio_container_add_section_window(bcontainer, section, &err)) { 589 goto fail; 590 } 591 592 memory_region_ref(section->mr); 593 594 if (memory_region_is_iommu(section->mr)) { 595 VFIOGuestIOMMU *giommu; 596 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); 597 int iommu_idx; 598 599 trace_vfio_listener_region_add_iommu(iova, end); 600 /* 601 * FIXME: For VFIO iommu types which have KVM acceleration to 602 * avoid bouncing all map/unmaps through qemu this way, this 603 * would be the right place to wire that up (tell the KVM 604 * device emulation the VFIO iommu handles to use). 605 */ 606 giommu = g_malloc0(sizeof(*giommu)); 607 giommu->iommu_mr = iommu_mr; 608 giommu->iommu_offset = section->offset_within_address_space - 609 section->offset_within_region; 610 giommu->bcontainer = bcontainer; 611 llend = int128_add(int128_make64(section->offset_within_region), 612 section->size); 613 llend = int128_sub(llend, int128_one()); 614 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 615 MEMTXATTRS_UNSPECIFIED); 616 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, 617 IOMMU_NOTIFIER_IOTLB_EVENTS, 618 section->offset_within_region, 619 int128_get64(llend), 620 iommu_idx); 621 622 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, 623 bcontainer->pgsizes, 624 &err); 625 if (ret) { 626 g_free(giommu); 627 goto fail; 628 } 629 630 if (bcontainer->iova_ranges) { 631 ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, 632 bcontainer->iova_ranges, 633 &err); 634 if (ret) { 635 g_free(giommu); 636 goto fail; 637 } 638 } 639 640 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, 641 &err); 642 if (ret) { 643 g_free(giommu); 644 goto fail; 645 } 646 QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); 647 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); 648 649 return; 650 } 651 652 /* Here we assume that memory_region_is_ram(section->mr)==true */ 653 654 /* 655 * For RAM memory regions with a RamDiscardManager, we only want to map the 656 * actually populated parts - and update the mapping whenever we're notified 657 * about changes. 658 */ 659 if (memory_region_has_ram_discard_manager(section->mr)) { 660 vfio_register_ram_discard_listener(bcontainer, section); 661 return; 662 } 663 664 vaddr = memory_region_get_ram_ptr(section->mr) + 665 section->offset_within_region + 666 (iova - section->offset_within_address_space); 667 668 trace_vfio_listener_region_add_ram(iova, end, vaddr); 669 670 llsize = int128_sub(llend, int128_make64(iova)); 671 672 if (memory_region_is_ram_device(section->mr)) { 673 hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; 674 675 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { 676 trace_vfio_listener_region_add_no_dma_map( 677 memory_region_name(section->mr), 678 section->offset_within_address_space, 679 int128_getlo(section->size), 680 pgmask + 1); 681 return; 682 } 683 } 684 685 ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), 686 vaddr, section->readonly); 687 if (ret) { 688 error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " 689 "0x%"HWADDR_PRIx", %p) = %d (%s)", 690 bcontainer, iova, int128_get64(llsize), vaddr, ret, 691 strerror(-ret)); 692 if (memory_region_is_ram_device(section->mr)) { 693 /* Allow unexpected mappings not to be fatal for RAM devices */ 694 error_report_err(err); 695 return; 696 } 697 goto fail; 698 } 699 700 return; 701 702 fail: 703 if (memory_region_is_ram_device(section->mr)) { 704 error_reportf_err(err, "PCI p2p may not work: "); 705 return; 706 } 707 /* 708 * On the initfn path, store the first error in the container so we 709 * can gracefully fail. Runtime, there's not much we can do other 710 * than throw a hardware error. 711 */ 712 if (!bcontainer->initialized) { 713 if (!bcontainer->error) { 714 error_propagate_prepend(&bcontainer->error, err, 715 "Region %s: ", 716 memory_region_name(section->mr)); 717 } else { 718 error_free(err); 719 } 720 } else { 721 error_report_err(err); 722 hw_error("vfio: DMA mapping failed, unable to continue"); 723 } 724 } 725 726 static void vfio_listener_region_del(MemoryListener *listener, 727 MemoryRegionSection *section) 728 { 729 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 730 listener); 731 hwaddr iova, end; 732 Int128 llend, llsize; 733 int ret; 734 bool try_unmap = true; 735 736 if (!vfio_listener_valid_section(section, "region_del")) { 737 return; 738 } 739 740 if (memory_region_is_iommu(section->mr)) { 741 VFIOGuestIOMMU *giommu; 742 743 QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { 744 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 745 giommu->n.start == section->offset_within_region) { 746 memory_region_unregister_iommu_notifier(section->mr, 747 &giommu->n); 748 QLIST_REMOVE(giommu, giommu_next); 749 g_free(giommu); 750 break; 751 } 752 } 753 754 /* 755 * FIXME: We assume the one big unmap below is adequate to 756 * remove any individual page mappings in the IOMMU which 757 * might have been copied into VFIO. This works for a page table 758 * based IOMMU where a big unmap flattens a large range of IO-PTEs. 759 * That may not be true for all IOMMU types. 760 */ 761 } 762 763 if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, 764 &llend)) { 765 return; 766 } 767 768 llsize = int128_sub(llend, int128_make64(iova)); 769 770 trace_vfio_listener_region_del(iova, end); 771 772 if (memory_region_is_ram_device(section->mr)) { 773 hwaddr pgmask; 774 775 pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; 776 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); 777 } else if (memory_region_has_ram_discard_manager(section->mr)) { 778 vfio_unregister_ram_discard_listener(bcontainer, section); 779 /* Unregistering will trigger an unmap. */ 780 try_unmap = false; 781 } 782 783 if (try_unmap) { 784 if (int128_eq(llsize, int128_2_64())) { 785 /* The unmap ioctl doesn't accept a full 64-bit span. */ 786 llsize = int128_rshift(llsize, 1); 787 ret = vfio_container_dma_unmap(bcontainer, iova, 788 int128_get64(llsize), NULL); 789 if (ret) { 790 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 791 "0x%"HWADDR_PRIx") = %d (%s)", 792 bcontainer, iova, int128_get64(llsize), ret, 793 strerror(-ret)); 794 } 795 iova += int128_get64(llsize); 796 } 797 ret = vfio_container_dma_unmap(bcontainer, iova, 798 int128_get64(llsize), NULL); 799 if (ret) { 800 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 801 "0x%"HWADDR_PRIx") = %d (%s)", 802 bcontainer, iova, int128_get64(llsize), ret, 803 strerror(-ret)); 804 } 805 } 806 807 memory_region_unref(section->mr); 808 809 vfio_container_del_section_window(bcontainer, section); 810 } 811 812 typedef struct VFIODirtyRanges { 813 hwaddr min32; 814 hwaddr max32; 815 hwaddr min64; 816 hwaddr max64; 817 hwaddr minpci64; 818 hwaddr maxpci64; 819 } VFIODirtyRanges; 820 821 typedef struct VFIODirtyRangesListener { 822 VFIOContainerBase *bcontainer; 823 VFIODirtyRanges ranges; 824 MemoryListener listener; 825 } VFIODirtyRangesListener; 826 827 static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, 828 VFIOContainerBase *bcontainer) 829 { 830 VFIOPCIDevice *pcidev; 831 VFIODevice *vbasedev; 832 Object *owner; 833 834 owner = memory_region_owner(section->mr); 835 836 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 837 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { 838 continue; 839 } 840 pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 841 if (OBJECT(pcidev) == owner) { 842 return true; 843 } 844 } 845 846 return false; 847 } 848 849 static void vfio_dirty_tracking_update(MemoryListener *listener, 850 MemoryRegionSection *section) 851 { 852 VFIODirtyRangesListener *dirty = container_of(listener, 853 VFIODirtyRangesListener, 854 listener); 855 VFIODirtyRanges *range = &dirty->ranges; 856 hwaddr iova, end, *min, *max; 857 858 if (!vfio_listener_valid_section(section, "tracking_update") || 859 !vfio_get_section_iova_range(dirty->bcontainer, section, 860 &iova, &end, NULL)) { 861 return; 862 } 863 864 /* 865 * The address space passed to the dirty tracker is reduced to three ranges: 866 * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the 867 * PCI 64-bit hole. 868 * 869 * The underlying reports of dirty will query a sub-interval of each of 870 * these ranges. 871 * 872 * The purpose of the three range handling is to handle known cases of big 873 * holes in the address space, like the x86 AMD 1T hole, and firmware (like 874 * OVMF) which may relocate the pci-hole64 to the end of the address space. 875 * The latter would otherwise generate large ranges for tracking, stressing 876 * the limits of supported hardware. The pci-hole32 will always be below 4G 877 * (overlapping or not) so it doesn't need special handling and is part of 878 * the 32-bit range. 879 * 880 * The alternative would be an IOVATree but that has a much bigger runtime 881 * overhead and unnecessary complexity. 882 */ 883 if (vfio_section_is_vfio_pci(section, dirty->bcontainer) && 884 iova >= UINT32_MAX) { 885 min = &range->minpci64; 886 max = &range->maxpci64; 887 } else { 888 min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; 889 max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; 890 } 891 if (*min > iova) { 892 *min = iova; 893 } 894 if (*max < end) { 895 *max = end; 896 } 897 898 trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); 899 return; 900 } 901 902 static const MemoryListener vfio_dirty_tracking_listener = { 903 .name = "vfio-tracking", 904 .region_add = vfio_dirty_tracking_update, 905 }; 906 907 static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, 908 VFIODirtyRanges *ranges) 909 { 910 VFIODirtyRangesListener dirty; 911 912 memset(&dirty, 0, sizeof(dirty)); 913 dirty.ranges.min32 = UINT32_MAX; 914 dirty.ranges.min64 = UINT64_MAX; 915 dirty.ranges.minpci64 = UINT64_MAX; 916 dirty.listener = vfio_dirty_tracking_listener; 917 dirty.bcontainer = bcontainer; 918 919 memory_listener_register(&dirty.listener, 920 bcontainer->space->as); 921 922 *ranges = dirty.ranges; 923 924 /* 925 * The memory listener is synchronous, and used to calculate the range 926 * to dirty tracking. Unregister it after we are done as we are not 927 * interested in any follow-up updates. 928 */ 929 memory_listener_unregister(&dirty.listener); 930 } 931 932 static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) 933 { 934 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), 935 sizeof(uint64_t))] = {}; 936 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 937 VFIODevice *vbasedev; 938 939 feature->argsz = sizeof(buf); 940 feature->flags = VFIO_DEVICE_FEATURE_SET | 941 VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; 942 943 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 944 if (!vbasedev->dirty_tracking) { 945 continue; 946 } 947 948 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 949 warn_report("%s: Failed to stop DMA logging, err %d (%s)", 950 vbasedev->name, -errno, strerror(errno)); 951 } 952 vbasedev->dirty_tracking = false; 953 } 954 } 955 956 static struct vfio_device_feature * 957 vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, 958 VFIODirtyRanges *tracking) 959 { 960 struct vfio_device_feature *feature; 961 size_t feature_size; 962 struct vfio_device_feature_dma_logging_control *control; 963 struct vfio_device_feature_dma_logging_range *ranges; 964 965 feature_size = sizeof(struct vfio_device_feature) + 966 sizeof(struct vfio_device_feature_dma_logging_control); 967 feature = g_try_malloc0(feature_size); 968 if (!feature) { 969 errno = ENOMEM; 970 return NULL; 971 } 972 feature->argsz = feature_size; 973 feature->flags = VFIO_DEVICE_FEATURE_SET | 974 VFIO_DEVICE_FEATURE_DMA_LOGGING_START; 975 976 control = (struct vfio_device_feature_dma_logging_control *)feature->data; 977 control->page_size = qemu_real_host_page_size(); 978 979 /* 980 * DMA logging uAPI guarantees to support at least a number of ranges that 981 * fits into a single host kernel base page. 982 */ 983 control->num_ranges = !!tracking->max32 + !!tracking->max64 + 984 !!tracking->maxpci64; 985 ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, 986 control->num_ranges); 987 if (!ranges) { 988 g_free(feature); 989 errno = ENOMEM; 990 991 return NULL; 992 } 993 994 control->ranges = (uintptr_t)ranges; 995 if (tracking->max32) { 996 ranges->iova = tracking->min32; 997 ranges->length = (tracking->max32 - tracking->min32) + 1; 998 ranges++; 999 } 1000 if (tracking->max64) { 1001 ranges->iova = tracking->min64; 1002 ranges->length = (tracking->max64 - tracking->min64) + 1; 1003 ranges++; 1004 } 1005 if (tracking->maxpci64) { 1006 ranges->iova = tracking->minpci64; 1007 ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1; 1008 } 1009 1010 trace_vfio_device_dirty_tracking_start(control->num_ranges, 1011 tracking->min32, tracking->max32, 1012 tracking->min64, tracking->max64, 1013 tracking->minpci64, tracking->maxpci64); 1014 1015 return feature; 1016 } 1017 1018 static void vfio_device_feature_dma_logging_start_destroy( 1019 struct vfio_device_feature *feature) 1020 { 1021 struct vfio_device_feature_dma_logging_control *control = 1022 (struct vfio_device_feature_dma_logging_control *)feature->data; 1023 struct vfio_device_feature_dma_logging_range *ranges = 1024 (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; 1025 1026 g_free(ranges); 1027 g_free(feature); 1028 } 1029 1030 static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer) 1031 { 1032 struct vfio_device_feature *feature; 1033 VFIODirtyRanges ranges; 1034 VFIODevice *vbasedev; 1035 int ret = 0; 1036 1037 vfio_dirty_tracking_init(bcontainer, &ranges); 1038 feature = vfio_device_feature_dma_logging_start_create(bcontainer, 1039 &ranges); 1040 if (!feature) { 1041 return -errno; 1042 } 1043 1044 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 1045 if (vbasedev->dirty_tracking) { 1046 continue; 1047 } 1048 1049 ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); 1050 if (ret) { 1051 ret = -errno; 1052 error_report("%s: Failed to start DMA logging, err %d (%s)", 1053 vbasedev->name, ret, strerror(errno)); 1054 goto out; 1055 } 1056 vbasedev->dirty_tracking = true; 1057 } 1058 1059 out: 1060 if (ret) { 1061 vfio_devices_dma_logging_stop(bcontainer); 1062 } 1063 1064 vfio_device_feature_dma_logging_start_destroy(feature); 1065 1066 return ret; 1067 } 1068 1069 static bool vfio_listener_log_global_start(MemoryListener *listener, 1070 Error **errp) 1071 { 1072 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1073 listener); 1074 int ret; 1075 1076 if (vfio_devices_all_device_dirty_tracking(bcontainer)) { 1077 ret = vfio_devices_dma_logging_start(bcontainer); 1078 } else { 1079 ret = vfio_container_set_dirty_page_tracking(bcontainer, true); 1080 } 1081 1082 if (ret) { 1083 error_report("vfio: Could not start dirty page tracking, err: %d (%s)", 1084 ret, strerror(-ret)); 1085 vfio_set_migration_error(ret); 1086 } 1087 return !ret; 1088 } 1089 1090 static void vfio_listener_log_global_stop(MemoryListener *listener) 1091 { 1092 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1093 listener); 1094 int ret = 0; 1095 1096 if (vfio_devices_all_device_dirty_tracking(bcontainer)) { 1097 vfio_devices_dma_logging_stop(bcontainer); 1098 } else { 1099 ret = vfio_container_set_dirty_page_tracking(bcontainer, false); 1100 } 1101 1102 if (ret) { 1103 error_report("vfio: Could not stop dirty page tracking, err: %d (%s)", 1104 ret, strerror(-ret)); 1105 vfio_set_migration_error(ret); 1106 } 1107 } 1108 1109 static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, 1110 hwaddr size, void *bitmap) 1111 { 1112 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + 1113 sizeof(struct vfio_device_feature_dma_logging_report), 1114 sizeof(uint64_t))] = {}; 1115 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 1116 struct vfio_device_feature_dma_logging_report *report = 1117 (struct vfio_device_feature_dma_logging_report *)feature->data; 1118 1119 report->iova = iova; 1120 report->length = size; 1121 report->page_size = qemu_real_host_page_size(); 1122 report->bitmap = (uintptr_t)bitmap; 1123 1124 feature->argsz = sizeof(buf); 1125 feature->flags = VFIO_DEVICE_FEATURE_GET | 1126 VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; 1127 1128 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 1129 return -errno; 1130 } 1131 1132 return 0; 1133 } 1134 1135 int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, 1136 VFIOBitmap *vbmap, hwaddr iova, 1137 hwaddr size) 1138 { 1139 VFIODevice *vbasedev; 1140 int ret; 1141 1142 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 1143 ret = vfio_device_dma_logging_report(vbasedev, iova, size, 1144 vbmap->bitmap); 1145 if (ret) { 1146 error_report("%s: Failed to get DMA logging report, iova: " 1147 "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx 1148 ", err: %d (%s)", 1149 vbasedev->name, iova, size, ret, strerror(-ret)); 1150 1151 return ret; 1152 } 1153 } 1154 1155 return 0; 1156 } 1157 1158 int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, 1159 uint64_t size, ram_addr_t ram_addr) 1160 { 1161 bool all_device_dirty_tracking = 1162 vfio_devices_all_device_dirty_tracking(bcontainer); 1163 uint64_t dirty_pages; 1164 VFIOBitmap vbmap; 1165 int ret; 1166 1167 if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { 1168 cpu_physical_memory_set_dirty_range(ram_addr, size, 1169 tcg_enabled() ? DIRTY_CLIENTS_ALL : 1170 DIRTY_CLIENTS_NOCODE); 1171 return 0; 1172 } 1173 1174 ret = vfio_bitmap_alloc(&vbmap, size); 1175 if (ret) { 1176 return ret; 1177 } 1178 1179 if (all_device_dirty_tracking) { 1180 ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size); 1181 } else { 1182 ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size); 1183 } 1184 1185 if (ret) { 1186 goto out; 1187 } 1188 1189 dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, 1190 vbmap.pages); 1191 1192 trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); 1193 out: 1194 g_free(vbmap.bitmap); 1195 1196 return ret; 1197 } 1198 1199 typedef struct { 1200 IOMMUNotifier n; 1201 VFIOGuestIOMMU *giommu; 1202 } vfio_giommu_dirty_notifier; 1203 1204 static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 1205 { 1206 vfio_giommu_dirty_notifier *gdn = container_of(n, 1207 vfio_giommu_dirty_notifier, n); 1208 VFIOGuestIOMMU *giommu = gdn->giommu; 1209 VFIOContainerBase *bcontainer = giommu->bcontainer; 1210 hwaddr iova = iotlb->iova + giommu->iommu_offset; 1211 ram_addr_t translated_addr; 1212 int ret = -EINVAL; 1213 1214 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); 1215 1216 if (iotlb->target_as != &address_space_memory) { 1217 error_report("Wrong target AS \"%s\", only system memory is allowed", 1218 iotlb->target_as->name ? iotlb->target_as->name : "none"); 1219 goto out; 1220 } 1221 1222 rcu_read_lock(); 1223 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { 1224 ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, 1225 translated_addr); 1226 if (ret) { 1227 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " 1228 "0x%"HWADDR_PRIx") = %d (%s)", 1229 bcontainer, iova, iotlb->addr_mask + 1, ret, 1230 strerror(-ret)); 1231 } 1232 } 1233 rcu_read_unlock(); 1234 1235 out: 1236 if (ret) { 1237 vfio_set_migration_error(ret); 1238 } 1239 } 1240 1241 static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, 1242 void *opaque) 1243 { 1244 const hwaddr size = int128_get64(section->size); 1245 const hwaddr iova = section->offset_within_address_space; 1246 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + 1247 section->offset_within_region; 1248 VFIORamDiscardListener *vrdl = opaque; 1249 1250 /* 1251 * Sync the whole mapped region (spanning multiple individual mappings) 1252 * in one go. 1253 */ 1254 return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr); 1255 } 1256 1257 static int 1258 vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, 1259 MemoryRegionSection *section) 1260 { 1261 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 1262 VFIORamDiscardListener *vrdl = NULL; 1263 1264 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 1265 if (vrdl->mr == section->mr && 1266 vrdl->offset_within_address_space == 1267 section->offset_within_address_space) { 1268 break; 1269 } 1270 } 1271 1272 if (!vrdl) { 1273 hw_error("vfio: Trying to sync missing RAM discard listener"); 1274 } 1275 1276 /* 1277 * We only want/can synchronize the bitmap for actually mapped parts - 1278 * which correspond to populated parts. Replay all populated parts. 1279 */ 1280 return ram_discard_manager_replay_populated(rdm, section, 1281 vfio_ram_discard_get_dirty_bitmap, 1282 &vrdl); 1283 } 1284 1285 static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, 1286 MemoryRegionSection *section) 1287 { 1288 ram_addr_t ram_addr; 1289 1290 if (memory_region_is_iommu(section->mr)) { 1291 VFIOGuestIOMMU *giommu; 1292 1293 QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { 1294 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 1295 giommu->n.start == section->offset_within_region) { 1296 Int128 llend; 1297 vfio_giommu_dirty_notifier gdn = { .giommu = giommu }; 1298 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr, 1299 MEMTXATTRS_UNSPECIFIED); 1300 1301 llend = int128_add(int128_make64(section->offset_within_region), 1302 section->size); 1303 llend = int128_sub(llend, int128_one()); 1304 1305 iommu_notifier_init(&gdn.n, 1306 vfio_iommu_map_dirty_notify, 1307 IOMMU_NOTIFIER_MAP, 1308 section->offset_within_region, 1309 int128_get64(llend), 1310 idx); 1311 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n); 1312 break; 1313 } 1314 } 1315 return 0; 1316 } else if (memory_region_has_ram_discard_manager(section->mr)) { 1317 return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); 1318 } 1319 1320 ram_addr = memory_region_get_ram_addr(section->mr) + 1321 section->offset_within_region; 1322 1323 return vfio_get_dirty_bitmap(bcontainer, 1324 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), 1325 int128_get64(section->size), ram_addr); 1326 } 1327 1328 static void vfio_listener_log_sync(MemoryListener *listener, 1329 MemoryRegionSection *section) 1330 { 1331 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1332 listener); 1333 int ret; 1334 1335 if (vfio_listener_skipped_section(section)) { 1336 return; 1337 } 1338 1339 if (vfio_devices_all_dirty_tracking(bcontainer)) { 1340 ret = vfio_sync_dirty_bitmap(bcontainer, section); 1341 if (ret) { 1342 error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, 1343 strerror(-ret)); 1344 vfio_set_migration_error(ret); 1345 } 1346 } 1347 } 1348 1349 const MemoryListener vfio_memory_listener = { 1350 .name = "vfio", 1351 .region_add = vfio_listener_region_add, 1352 .region_del = vfio_listener_region_del, 1353 .log_global_start = vfio_listener_log_global_start, 1354 .log_global_stop = vfio_listener_log_global_stop, 1355 .log_sync = vfio_listener_log_sync, 1356 }; 1357 1358 void vfio_reset_handler(void *opaque) 1359 { 1360 VFIODevice *vbasedev; 1361 1362 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 1363 if (vbasedev->dev->realized) { 1364 vbasedev->ops->vfio_compute_needs_reset(vbasedev); 1365 } 1366 } 1367 1368 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 1369 if (vbasedev->dev->realized && vbasedev->needs_reset) { 1370 vbasedev->ops->vfio_hot_reset_multi(vbasedev); 1371 } 1372 } 1373 } 1374 1375 int vfio_kvm_device_add_fd(int fd, Error **errp) 1376 { 1377 #ifdef CONFIG_KVM 1378 struct kvm_device_attr attr = { 1379 .group = KVM_DEV_VFIO_FILE, 1380 .attr = KVM_DEV_VFIO_FILE_ADD, 1381 .addr = (uint64_t)(unsigned long)&fd, 1382 }; 1383 1384 if (!kvm_enabled()) { 1385 return 0; 1386 } 1387 1388 if (vfio_kvm_device_fd < 0) { 1389 struct kvm_create_device cd = { 1390 .type = KVM_DEV_TYPE_VFIO, 1391 }; 1392 1393 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { 1394 error_setg_errno(errp, errno, "Failed to create KVM VFIO device"); 1395 return -errno; 1396 } 1397 1398 vfio_kvm_device_fd = cd.fd; 1399 } 1400 1401 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1402 error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device", 1403 fd); 1404 return -errno; 1405 } 1406 #endif 1407 return 0; 1408 } 1409 1410 int vfio_kvm_device_del_fd(int fd, Error **errp) 1411 { 1412 #ifdef CONFIG_KVM 1413 struct kvm_device_attr attr = { 1414 .group = KVM_DEV_VFIO_FILE, 1415 .attr = KVM_DEV_VFIO_FILE_DEL, 1416 .addr = (uint64_t)(unsigned long)&fd, 1417 }; 1418 1419 if (vfio_kvm_device_fd < 0) { 1420 error_setg(errp, "KVM VFIO device isn't created yet"); 1421 return -EINVAL; 1422 } 1423 1424 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1425 error_setg_errno(errp, errno, 1426 "Failed to remove fd %d from KVM VFIO device", fd); 1427 return -errno; 1428 } 1429 #endif 1430 return 0; 1431 } 1432 1433 VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) 1434 { 1435 VFIOAddressSpace *space; 1436 1437 QLIST_FOREACH(space, &vfio_address_spaces, list) { 1438 if (space->as == as) { 1439 return space; 1440 } 1441 } 1442 1443 /* No suitable VFIOAddressSpace, create a new one */ 1444 space = g_malloc0(sizeof(*space)); 1445 space->as = as; 1446 QLIST_INIT(&space->containers); 1447 1448 if (QLIST_EMPTY(&vfio_address_spaces)) { 1449 qemu_register_reset(vfio_reset_handler, NULL); 1450 } 1451 1452 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); 1453 1454 return space; 1455 } 1456 1457 void vfio_put_address_space(VFIOAddressSpace *space) 1458 { 1459 if (!QLIST_EMPTY(&space->containers)) { 1460 return; 1461 } 1462 1463 QLIST_REMOVE(space, list); 1464 g_free(space); 1465 1466 if (QLIST_EMPTY(&vfio_address_spaces)) { 1467 qemu_unregister_reset(vfio_reset_handler, NULL); 1468 } 1469 } 1470 1471 struct vfio_device_info *vfio_get_device_info(int fd) 1472 { 1473 struct vfio_device_info *info; 1474 uint32_t argsz = sizeof(*info); 1475 1476 info = g_malloc0(argsz); 1477 1478 retry: 1479 info->argsz = argsz; 1480 1481 if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { 1482 g_free(info); 1483 return NULL; 1484 } 1485 1486 if (info->argsz > argsz) { 1487 argsz = info->argsz; 1488 info = g_realloc(info, argsz); 1489 goto retry; 1490 } 1491 1492 return info; 1493 } 1494 1495 int vfio_attach_device(char *name, VFIODevice *vbasedev, 1496 AddressSpace *as, Error **errp) 1497 { 1498 const VFIOIOMMUClass *ops = 1499 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); 1500 1501 if (vbasedev->iommufd) { 1502 ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); 1503 } 1504 1505 assert(ops); 1506 1507 return ops->attach_device(name, vbasedev, as, errp); 1508 } 1509 1510 void vfio_detach_device(VFIODevice *vbasedev) 1511 { 1512 if (!vbasedev->bcontainer) { 1513 return; 1514 } 1515 vbasedev->bcontainer->ops->detach_device(vbasedev); 1516 } 1517