1 /* 2 * generic functions used by VFIO devices 3 * 4 * Copyright Red Hat, Inc. 2012 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Based on qemu-kvm device-assignment: 13 * Adapted for KVM by Qumranet. 14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) 15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) 16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) 17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) 18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) 19 */ 20 21 #include "qemu/osdep.h" 22 #include <sys/ioctl.h> 23 #ifdef CONFIG_KVM 24 #include <linux/kvm.h> 25 #endif 26 #include <linux/vfio.h> 27 28 #include "hw/vfio/vfio-common.h" 29 #include "hw/vfio/pci.h" 30 #include "exec/address-spaces.h" 31 #include "exec/memory.h" 32 #include "exec/ram_addr.h" 33 #include "hw/hw.h" 34 #include "qemu/error-report.h" 35 #include "qemu/main-loop.h" 36 #include "qemu/range.h" 37 #include "sysemu/kvm.h" 38 #include "sysemu/reset.h" 39 #include "sysemu/runstate.h" 40 #include "trace.h" 41 #include "qapi/error.h" 42 #include "migration/misc.h" 43 #include "migration/blocker.h" 44 #include "migration/qemu-file.h" 45 #include "sysemu/tpm.h" 46 47 VFIODeviceList vfio_device_list = 48 QLIST_HEAD_INITIALIZER(vfio_device_list); 49 static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = 50 QLIST_HEAD_INITIALIZER(vfio_address_spaces); 51 52 #ifdef CONFIG_KVM 53 /* 54 * We have a single VFIO pseudo device per KVM VM. Once created it lives 55 * for the life of the VM. Closing the file descriptor only drops our 56 * reference to it and the device's reference to kvm. Therefore once 57 * initialized, this file descriptor is only released on QEMU exit and 58 * we'll re-use it should another vfio device be attached before then. 59 */ 60 int vfio_kvm_device_fd = -1; 61 #endif 62 63 /* 64 * Device state interfaces 65 */ 66 67 bool vfio_mig_active(void) 68 { 69 VFIODevice *vbasedev; 70 71 if (QLIST_EMPTY(&vfio_device_list)) { 72 return false; 73 } 74 75 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 76 if (vbasedev->migration_blocker) { 77 return false; 78 } 79 } 80 return true; 81 } 82 83 static Error *multiple_devices_migration_blocker; 84 85 /* 86 * Multiple devices migration is allowed only if all devices support P2P 87 * migration. Single device migration is allowed regardless of P2P migration 88 * support. 89 */ 90 static bool vfio_multiple_devices_migration_is_supported(void) 91 { 92 VFIODevice *vbasedev; 93 unsigned int device_num = 0; 94 bool all_support_p2p = true; 95 96 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 97 if (vbasedev->migration) { 98 device_num++; 99 100 if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { 101 all_support_p2p = false; 102 } 103 } 104 } 105 106 return all_support_p2p || device_num <= 1; 107 } 108 109 int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) 110 { 111 int ret; 112 113 if (vfio_multiple_devices_migration_is_supported()) { 114 return 0; 115 } 116 117 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { 118 error_setg(errp, "Multiple VFIO devices migration is supported only if " 119 "all of them support P2P migration"); 120 return -EINVAL; 121 } 122 123 if (multiple_devices_migration_blocker) { 124 return 0; 125 } 126 127 error_setg(&multiple_devices_migration_blocker, 128 "Multiple VFIO devices migration is supported only if all of " 129 "them support P2P migration"); 130 ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, errp); 131 132 return ret; 133 } 134 135 void vfio_unblock_multiple_devices_migration(void) 136 { 137 if (!multiple_devices_migration_blocker || 138 !vfio_multiple_devices_migration_is_supported()) { 139 return; 140 } 141 142 migrate_del_blocker(&multiple_devices_migration_blocker); 143 } 144 145 bool vfio_viommu_preset(VFIODevice *vbasedev) 146 { 147 return vbasedev->bcontainer->space->as != &address_space_memory; 148 } 149 150 static void vfio_set_migration_error(int ret) 151 { 152 if (migration_is_setup_or_active()) { 153 migration_file_set_error(ret, NULL); 154 } 155 } 156 157 bool vfio_device_state_is_running(VFIODevice *vbasedev) 158 { 159 VFIOMigration *migration = vbasedev->migration; 160 161 return migration->device_state == VFIO_DEVICE_STATE_RUNNING || 162 migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P; 163 } 164 165 bool vfio_device_state_is_precopy(VFIODevice *vbasedev) 166 { 167 VFIOMigration *migration = vbasedev->migration; 168 169 return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY || 170 migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; 171 } 172 173 static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) 174 { 175 VFIODevice *vbasedev; 176 177 if (!migration_is_active() && !migration_is_device()) { 178 return false; 179 } 180 181 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 182 VFIOMigration *migration = vbasedev->migration; 183 184 if (!migration) { 185 return false; 186 } 187 188 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && 189 (vfio_device_state_is_running(vbasedev) || 190 vfio_device_state_is_precopy(vbasedev))) { 191 return false; 192 } 193 } 194 return true; 195 } 196 197 bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) 198 { 199 VFIODevice *vbasedev; 200 201 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 202 if (!vbasedev->dirty_pages_supported) { 203 return false; 204 } 205 } 206 207 return true; 208 } 209 210 /* 211 * Check if all VFIO devices are running and migration is active, which is 212 * essentially equivalent to the migration being in pre-copy phase. 213 */ 214 bool 215 vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) 216 { 217 VFIODevice *vbasedev; 218 219 if (!migration_is_active()) { 220 return false; 221 } 222 223 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 224 VFIOMigration *migration = vbasedev->migration; 225 226 if (!migration) { 227 return false; 228 } 229 230 if (vfio_device_state_is_running(vbasedev) || 231 vfio_device_state_is_precopy(vbasedev)) { 232 continue; 233 } else { 234 return false; 235 } 236 } 237 return true; 238 } 239 240 static bool vfio_listener_skipped_section(MemoryRegionSection *section) 241 { 242 return (!memory_region_is_ram(section->mr) && 243 !memory_region_is_iommu(section->mr)) || 244 memory_region_is_protected(section->mr) || 245 /* 246 * Sizing an enabled 64-bit BAR can cause spurious mappings to 247 * addresses in the upper part of the 64-bit address space. These 248 * are never accessed by the CPU and beyond the address width of 249 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. 250 */ 251 section->offset_within_address_space & (1ULL << 63); 252 } 253 254 /* Called with rcu_read_lock held. */ 255 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, 256 ram_addr_t *ram_addr, bool *read_only) 257 { 258 bool ret, mr_has_discard_manager; 259 260 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, 261 &mr_has_discard_manager); 262 if (ret && mr_has_discard_manager) { 263 /* 264 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The 265 * pages will remain pinned inside vfio until unmapped, resulting in a 266 * higher memory consumption than expected. If memory would get 267 * populated again later, there would be an inconsistency between pages 268 * pinned by vfio and pages seen by QEMU. This is the case until 269 * unmapped from the IOMMU (e.g., during device reset). 270 * 271 * With malicious guests, we really only care about pinning more memory 272 * than expected. RLIMIT_MEMLOCK set for the user/process can never be 273 * exceeded and can be used to mitigate this problem. 274 */ 275 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" 276 " RAM (e.g., virtio-mem) works, however, malicious" 277 " guests can trigger pinning of more memory than" 278 " intended via an IOMMU. It's possible to mitigate " 279 " by setting/adjusting RLIMIT_MEMLOCK."); 280 } 281 return ret; 282 } 283 284 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 285 { 286 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); 287 VFIOContainerBase *bcontainer = giommu->bcontainer; 288 hwaddr iova = iotlb->iova + giommu->iommu_offset; 289 void *vaddr; 290 int ret; 291 292 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", 293 iova, iova + iotlb->addr_mask); 294 295 if (iotlb->target_as != &address_space_memory) { 296 error_report("Wrong target AS \"%s\", only system memory is allowed", 297 iotlb->target_as->name ? iotlb->target_as->name : "none"); 298 vfio_set_migration_error(-EINVAL); 299 return; 300 } 301 302 rcu_read_lock(); 303 304 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { 305 bool read_only; 306 307 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) { 308 goto out; 309 } 310 /* 311 * vaddr is only valid until rcu_read_unlock(). But after 312 * vfio_dma_map has set up the mapping the pages will be 313 * pinned by the kernel. This makes sure that the RAM backend 314 * of vaddr will always be there, even if the memory object is 315 * destroyed and its backing memory munmap-ed. 316 */ 317 ret = vfio_container_dma_map(bcontainer, iova, 318 iotlb->addr_mask + 1, vaddr, 319 read_only); 320 if (ret) { 321 error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " 322 "0x%"HWADDR_PRIx", %p) = %d (%s)", 323 bcontainer, iova, 324 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); 325 } 326 } else { 327 ret = vfio_container_dma_unmap(bcontainer, iova, 328 iotlb->addr_mask + 1, iotlb); 329 if (ret) { 330 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 331 "0x%"HWADDR_PRIx") = %d (%s)", 332 bcontainer, iova, 333 iotlb->addr_mask + 1, ret, strerror(-ret)); 334 vfio_set_migration_error(ret); 335 } 336 } 337 out: 338 rcu_read_unlock(); 339 } 340 341 static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, 342 MemoryRegionSection *section) 343 { 344 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 345 listener); 346 VFIOContainerBase *bcontainer = vrdl->bcontainer; 347 const hwaddr size = int128_get64(section->size); 348 const hwaddr iova = section->offset_within_address_space; 349 int ret; 350 351 /* Unmap with a single call. */ 352 ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); 353 if (ret) { 354 error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, 355 strerror(-ret)); 356 } 357 } 358 359 static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, 360 MemoryRegionSection *section) 361 { 362 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 363 listener); 364 VFIOContainerBase *bcontainer = vrdl->bcontainer; 365 const hwaddr end = section->offset_within_region + 366 int128_get64(section->size); 367 hwaddr start, next, iova; 368 void *vaddr; 369 int ret; 370 371 /* 372 * Map in (aligned within memory region) minimum granularity, so we can 373 * unmap in minimum granularity later. 374 */ 375 for (start = section->offset_within_region; start < end; start = next) { 376 next = ROUND_UP(start + 1, vrdl->granularity); 377 next = MIN(next, end); 378 379 iova = start - section->offset_within_region + 380 section->offset_within_address_space; 381 vaddr = memory_region_get_ram_ptr(section->mr) + start; 382 383 ret = vfio_container_dma_map(bcontainer, iova, next - start, 384 vaddr, section->readonly); 385 if (ret) { 386 /* Rollback */ 387 vfio_ram_discard_notify_discard(rdl, section); 388 return ret; 389 } 390 } 391 return 0; 392 } 393 394 static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, 395 MemoryRegionSection *section) 396 { 397 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 398 VFIORamDiscardListener *vrdl; 399 400 /* Ignore some corner cases not relevant in practice. */ 401 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); 402 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, 403 TARGET_PAGE_SIZE)); 404 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); 405 406 vrdl = g_new0(VFIORamDiscardListener, 1); 407 vrdl->bcontainer = bcontainer; 408 vrdl->mr = section->mr; 409 vrdl->offset_within_address_space = section->offset_within_address_space; 410 vrdl->size = int128_get64(section->size); 411 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, 412 section->mr); 413 414 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); 415 g_assert(bcontainer->pgsizes && 416 vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); 417 418 ram_discard_listener_init(&vrdl->listener, 419 vfio_ram_discard_notify_populate, 420 vfio_ram_discard_notify_discard, true); 421 ram_discard_manager_register_listener(rdm, &vrdl->listener, section); 422 QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); 423 424 /* 425 * Sanity-check if we have a theoretically problematic setup where we could 426 * exceed the maximum number of possible DMA mappings over time. We assume 427 * that each mapped section in the same address space as a RamDiscardManager 428 * section consumes exactly one DMA mapping, with the exception of 429 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections 430 * in the same address space as RamDiscardManager sections. 431 * 432 * We assume that each section in the address space consumes one memslot. 433 * We take the number of KVM memory slots as a best guess for the maximum 434 * number of sections in the address space we could have over time, 435 * also consuming DMA mappings. 436 */ 437 if (bcontainer->dma_max_mappings) { 438 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; 439 440 #ifdef CONFIG_KVM 441 if (kvm_enabled()) { 442 max_memslots = kvm_get_max_memslots(); 443 } 444 #endif 445 446 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 447 hwaddr start, end; 448 449 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, 450 vrdl->granularity); 451 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, 452 vrdl->granularity); 453 vrdl_mappings += (end - start) / vrdl->granularity; 454 vrdl_count++; 455 } 456 457 if (vrdl_mappings + max_memslots - vrdl_count > 458 bcontainer->dma_max_mappings) { 459 warn_report("%s: possibly running out of DMA mappings. E.g., try" 460 " increasing the 'block-size' of virtio-mem devies." 461 " Maximum possible DMA mappings: %d, Maximum possible" 462 " memslots: %d", __func__, bcontainer->dma_max_mappings, 463 max_memslots); 464 } 465 } 466 } 467 468 static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, 469 MemoryRegionSection *section) 470 { 471 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 472 VFIORamDiscardListener *vrdl = NULL; 473 474 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 475 if (vrdl->mr == section->mr && 476 vrdl->offset_within_address_space == 477 section->offset_within_address_space) { 478 break; 479 } 480 } 481 482 if (!vrdl) { 483 hw_error("vfio: Trying to unregister missing RAM discard listener"); 484 } 485 486 ram_discard_manager_unregister_listener(rdm, &vrdl->listener); 487 QLIST_REMOVE(vrdl, next); 488 g_free(vrdl); 489 } 490 491 static bool vfio_known_safe_misalignment(MemoryRegionSection *section) 492 { 493 MemoryRegion *mr = section->mr; 494 495 if (!TPM_IS_CRB(mr->owner)) { 496 return false; 497 } 498 499 /* this is a known safe misaligned region, just trace for debug purpose */ 500 trace_vfio_known_safe_misalignment(memory_region_name(mr), 501 section->offset_within_address_space, 502 section->offset_within_region, 503 qemu_real_host_page_size()); 504 return true; 505 } 506 507 static bool vfio_listener_valid_section(MemoryRegionSection *section, 508 const char *name) 509 { 510 if (vfio_listener_skipped_section(section)) { 511 trace_vfio_listener_region_skip(name, 512 section->offset_within_address_space, 513 section->offset_within_address_space + 514 int128_get64(int128_sub(section->size, int128_one()))); 515 return false; 516 } 517 518 if (unlikely((section->offset_within_address_space & 519 ~qemu_real_host_page_mask()) != 520 (section->offset_within_region & ~qemu_real_host_page_mask()))) { 521 if (!vfio_known_safe_misalignment(section)) { 522 error_report("%s received unaligned region %s iova=0x%"PRIx64 523 " offset_within_region=0x%"PRIx64 524 " qemu_real_host_page_size=0x%"PRIxPTR, 525 __func__, memory_region_name(section->mr), 526 section->offset_within_address_space, 527 section->offset_within_region, 528 qemu_real_host_page_size()); 529 } 530 return false; 531 } 532 533 return true; 534 } 535 536 static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, 537 MemoryRegionSection *section, 538 hwaddr *out_iova, hwaddr *out_end, 539 Int128 *out_llend) 540 { 541 Int128 llend; 542 hwaddr iova; 543 544 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); 545 llend = int128_make64(section->offset_within_address_space); 546 llend = int128_add(llend, section->size); 547 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); 548 549 if (int128_ge(int128_make64(iova), llend)) { 550 return false; 551 } 552 553 *out_iova = iova; 554 *out_end = int128_get64(int128_sub(llend, int128_one())); 555 if (out_llend) { 556 *out_llend = llend; 557 } 558 return true; 559 } 560 561 static void vfio_listener_region_add(MemoryListener *listener, 562 MemoryRegionSection *section) 563 { 564 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 565 listener); 566 hwaddr iova, end; 567 Int128 llend, llsize; 568 void *vaddr; 569 int ret; 570 Error *err = NULL; 571 572 if (!vfio_listener_valid_section(section, "region_add")) { 573 return; 574 } 575 576 if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, 577 &llend)) { 578 if (memory_region_is_ram_device(section->mr)) { 579 trace_vfio_listener_region_add_no_dma_map( 580 memory_region_name(section->mr), 581 section->offset_within_address_space, 582 int128_getlo(section->size), 583 qemu_real_host_page_size()); 584 } 585 return; 586 } 587 588 if (vfio_container_add_section_window(bcontainer, section, &err)) { 589 goto fail; 590 } 591 592 memory_region_ref(section->mr); 593 594 if (memory_region_is_iommu(section->mr)) { 595 VFIOGuestIOMMU *giommu; 596 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); 597 int iommu_idx; 598 599 trace_vfio_listener_region_add_iommu(iova, end); 600 /* 601 * FIXME: For VFIO iommu types which have KVM acceleration to 602 * avoid bouncing all map/unmaps through qemu this way, this 603 * would be the right place to wire that up (tell the KVM 604 * device emulation the VFIO iommu handles to use). 605 */ 606 giommu = g_malloc0(sizeof(*giommu)); 607 giommu->iommu_mr = iommu_mr; 608 giommu->iommu_offset = section->offset_within_address_space - 609 section->offset_within_region; 610 giommu->bcontainer = bcontainer; 611 llend = int128_add(int128_make64(section->offset_within_region), 612 section->size); 613 llend = int128_sub(llend, int128_one()); 614 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 615 MEMTXATTRS_UNSPECIFIED); 616 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, 617 IOMMU_NOTIFIER_IOTLB_EVENTS, 618 section->offset_within_region, 619 int128_get64(llend), 620 iommu_idx); 621 622 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, 623 bcontainer->pgsizes, 624 &err); 625 if (ret) { 626 g_free(giommu); 627 goto fail; 628 } 629 630 if (bcontainer->iova_ranges) { 631 ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, 632 bcontainer->iova_ranges, 633 &err); 634 if (ret) { 635 g_free(giommu); 636 goto fail; 637 } 638 } 639 640 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, 641 &err); 642 if (ret) { 643 g_free(giommu); 644 goto fail; 645 } 646 QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); 647 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); 648 649 return; 650 } 651 652 /* Here we assume that memory_region_is_ram(section->mr)==true */ 653 654 /* 655 * For RAM memory regions with a RamDiscardManager, we only want to map the 656 * actually populated parts - and update the mapping whenever we're notified 657 * about changes. 658 */ 659 if (memory_region_has_ram_discard_manager(section->mr)) { 660 vfio_register_ram_discard_listener(bcontainer, section); 661 return; 662 } 663 664 vaddr = memory_region_get_ram_ptr(section->mr) + 665 section->offset_within_region + 666 (iova - section->offset_within_address_space); 667 668 trace_vfio_listener_region_add_ram(iova, end, vaddr); 669 670 llsize = int128_sub(llend, int128_make64(iova)); 671 672 if (memory_region_is_ram_device(section->mr)) { 673 hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; 674 675 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { 676 trace_vfio_listener_region_add_no_dma_map( 677 memory_region_name(section->mr), 678 section->offset_within_address_space, 679 int128_getlo(section->size), 680 pgmask + 1); 681 return; 682 } 683 } 684 685 ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), 686 vaddr, section->readonly); 687 if (ret) { 688 error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " 689 "0x%"HWADDR_PRIx", %p) = %d (%s)", 690 bcontainer, iova, int128_get64(llsize), vaddr, ret, 691 strerror(-ret)); 692 if (memory_region_is_ram_device(section->mr)) { 693 /* Allow unexpected mappings not to be fatal for RAM devices */ 694 error_report_err(err); 695 return; 696 } 697 goto fail; 698 } 699 700 return; 701 702 fail: 703 if (memory_region_is_ram_device(section->mr)) { 704 error_reportf_err(err, "PCI p2p may not work: "); 705 return; 706 } 707 /* 708 * On the initfn path, store the first error in the container so we 709 * can gracefully fail. Runtime, there's not much we can do other 710 * than throw a hardware error. 711 */ 712 if (!bcontainer->initialized) { 713 if (!bcontainer->error) { 714 error_propagate_prepend(&bcontainer->error, err, 715 "Region %s: ", 716 memory_region_name(section->mr)); 717 } else { 718 error_free(err); 719 } 720 } else { 721 error_report_err(err); 722 hw_error("vfio: DMA mapping failed, unable to continue"); 723 } 724 } 725 726 static void vfio_listener_region_del(MemoryListener *listener, 727 MemoryRegionSection *section) 728 { 729 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 730 listener); 731 hwaddr iova, end; 732 Int128 llend, llsize; 733 int ret; 734 bool try_unmap = true; 735 736 if (!vfio_listener_valid_section(section, "region_del")) { 737 return; 738 } 739 740 if (memory_region_is_iommu(section->mr)) { 741 VFIOGuestIOMMU *giommu; 742 743 QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { 744 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 745 giommu->n.start == section->offset_within_region) { 746 memory_region_unregister_iommu_notifier(section->mr, 747 &giommu->n); 748 QLIST_REMOVE(giommu, giommu_next); 749 g_free(giommu); 750 break; 751 } 752 } 753 754 /* 755 * FIXME: We assume the one big unmap below is adequate to 756 * remove any individual page mappings in the IOMMU which 757 * might have been copied into VFIO. This works for a page table 758 * based IOMMU where a big unmap flattens a large range of IO-PTEs. 759 * That may not be true for all IOMMU types. 760 */ 761 } 762 763 if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, 764 &llend)) { 765 return; 766 } 767 768 llsize = int128_sub(llend, int128_make64(iova)); 769 770 trace_vfio_listener_region_del(iova, end); 771 772 if (memory_region_is_ram_device(section->mr)) { 773 hwaddr pgmask; 774 775 pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; 776 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); 777 } else if (memory_region_has_ram_discard_manager(section->mr)) { 778 vfio_unregister_ram_discard_listener(bcontainer, section); 779 /* Unregistering will trigger an unmap. */ 780 try_unmap = false; 781 } 782 783 if (try_unmap) { 784 if (int128_eq(llsize, int128_2_64())) { 785 /* The unmap ioctl doesn't accept a full 64-bit span. */ 786 llsize = int128_rshift(llsize, 1); 787 ret = vfio_container_dma_unmap(bcontainer, iova, 788 int128_get64(llsize), NULL); 789 if (ret) { 790 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 791 "0x%"HWADDR_PRIx") = %d (%s)", 792 bcontainer, iova, int128_get64(llsize), ret, 793 strerror(-ret)); 794 } 795 iova += int128_get64(llsize); 796 } 797 ret = vfio_container_dma_unmap(bcontainer, iova, 798 int128_get64(llsize), NULL); 799 if (ret) { 800 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 801 "0x%"HWADDR_PRIx") = %d (%s)", 802 bcontainer, iova, int128_get64(llsize), ret, 803 strerror(-ret)); 804 } 805 } 806 807 memory_region_unref(section->mr); 808 809 vfio_container_del_section_window(bcontainer, section); 810 } 811 812 typedef struct VFIODirtyRanges { 813 hwaddr min32; 814 hwaddr max32; 815 hwaddr min64; 816 hwaddr max64; 817 hwaddr minpci64; 818 hwaddr maxpci64; 819 } VFIODirtyRanges; 820 821 typedef struct VFIODirtyRangesListener { 822 VFIOContainerBase *bcontainer; 823 VFIODirtyRanges ranges; 824 MemoryListener listener; 825 } VFIODirtyRangesListener; 826 827 static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, 828 VFIOContainerBase *bcontainer) 829 { 830 VFIOPCIDevice *pcidev; 831 VFIODevice *vbasedev; 832 Object *owner; 833 834 owner = memory_region_owner(section->mr); 835 836 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 837 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { 838 continue; 839 } 840 pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 841 if (OBJECT(pcidev) == owner) { 842 return true; 843 } 844 } 845 846 return false; 847 } 848 849 static void vfio_dirty_tracking_update(MemoryListener *listener, 850 MemoryRegionSection *section) 851 { 852 VFIODirtyRangesListener *dirty = container_of(listener, 853 VFIODirtyRangesListener, 854 listener); 855 VFIODirtyRanges *range = &dirty->ranges; 856 hwaddr iova, end, *min, *max; 857 858 if (!vfio_listener_valid_section(section, "tracking_update") || 859 !vfio_get_section_iova_range(dirty->bcontainer, section, 860 &iova, &end, NULL)) { 861 return; 862 } 863 864 /* 865 * The address space passed to the dirty tracker is reduced to three ranges: 866 * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the 867 * PCI 64-bit hole. 868 * 869 * The underlying reports of dirty will query a sub-interval of each of 870 * these ranges. 871 * 872 * The purpose of the three range handling is to handle known cases of big 873 * holes in the address space, like the x86 AMD 1T hole, and firmware (like 874 * OVMF) which may relocate the pci-hole64 to the end of the address space. 875 * The latter would otherwise generate large ranges for tracking, stressing 876 * the limits of supported hardware. The pci-hole32 will always be below 4G 877 * (overlapping or not) so it doesn't need special handling and is part of 878 * the 32-bit range. 879 * 880 * The alternative would be an IOVATree but that has a much bigger runtime 881 * overhead and unnecessary complexity. 882 */ 883 if (vfio_section_is_vfio_pci(section, dirty->bcontainer) && 884 iova >= UINT32_MAX) { 885 min = &range->minpci64; 886 max = &range->maxpci64; 887 } else { 888 min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; 889 max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; 890 } 891 if (*min > iova) { 892 *min = iova; 893 } 894 if (*max < end) { 895 *max = end; 896 } 897 898 trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); 899 return; 900 } 901 902 static const MemoryListener vfio_dirty_tracking_listener = { 903 .name = "vfio-tracking", 904 .region_add = vfio_dirty_tracking_update, 905 }; 906 907 static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, 908 VFIODirtyRanges *ranges) 909 { 910 VFIODirtyRangesListener dirty; 911 912 memset(&dirty, 0, sizeof(dirty)); 913 dirty.ranges.min32 = UINT32_MAX; 914 dirty.ranges.min64 = UINT64_MAX; 915 dirty.ranges.minpci64 = UINT64_MAX; 916 dirty.listener = vfio_dirty_tracking_listener; 917 dirty.bcontainer = bcontainer; 918 919 memory_listener_register(&dirty.listener, 920 bcontainer->space->as); 921 922 *ranges = dirty.ranges; 923 924 /* 925 * The memory listener is synchronous, and used to calculate the range 926 * to dirty tracking. Unregister it after we are done as we are not 927 * interested in any follow-up updates. 928 */ 929 memory_listener_unregister(&dirty.listener); 930 } 931 932 static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) 933 { 934 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), 935 sizeof(uint64_t))] = {}; 936 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 937 VFIODevice *vbasedev; 938 939 feature->argsz = sizeof(buf); 940 feature->flags = VFIO_DEVICE_FEATURE_SET | 941 VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; 942 943 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 944 if (!vbasedev->dirty_tracking) { 945 continue; 946 } 947 948 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 949 warn_report("%s: Failed to stop DMA logging, err %d (%s)", 950 vbasedev->name, -errno, strerror(errno)); 951 } 952 vbasedev->dirty_tracking = false; 953 } 954 } 955 956 static struct vfio_device_feature * 957 vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, 958 VFIODirtyRanges *tracking) 959 { 960 struct vfio_device_feature *feature; 961 size_t feature_size; 962 struct vfio_device_feature_dma_logging_control *control; 963 struct vfio_device_feature_dma_logging_range *ranges; 964 965 feature_size = sizeof(struct vfio_device_feature) + 966 sizeof(struct vfio_device_feature_dma_logging_control); 967 feature = g_try_malloc0(feature_size); 968 if (!feature) { 969 errno = ENOMEM; 970 return NULL; 971 } 972 feature->argsz = feature_size; 973 feature->flags = VFIO_DEVICE_FEATURE_SET | 974 VFIO_DEVICE_FEATURE_DMA_LOGGING_START; 975 976 control = (struct vfio_device_feature_dma_logging_control *)feature->data; 977 control->page_size = qemu_real_host_page_size(); 978 979 /* 980 * DMA logging uAPI guarantees to support at least a number of ranges that 981 * fits into a single host kernel base page. 982 */ 983 control->num_ranges = !!tracking->max32 + !!tracking->max64 + 984 !!tracking->maxpci64; 985 ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, 986 control->num_ranges); 987 if (!ranges) { 988 g_free(feature); 989 errno = ENOMEM; 990 991 return NULL; 992 } 993 994 control->ranges = (uintptr_t)ranges; 995 if (tracking->max32) { 996 ranges->iova = tracking->min32; 997 ranges->length = (tracking->max32 - tracking->min32) + 1; 998 ranges++; 999 } 1000 if (tracking->max64) { 1001 ranges->iova = tracking->min64; 1002 ranges->length = (tracking->max64 - tracking->min64) + 1; 1003 ranges++; 1004 } 1005 if (tracking->maxpci64) { 1006 ranges->iova = tracking->minpci64; 1007 ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1; 1008 } 1009 1010 trace_vfio_device_dirty_tracking_start(control->num_ranges, 1011 tracking->min32, tracking->max32, 1012 tracking->min64, tracking->max64, 1013 tracking->minpci64, tracking->maxpci64); 1014 1015 return feature; 1016 } 1017 1018 static void vfio_device_feature_dma_logging_start_destroy( 1019 struct vfio_device_feature *feature) 1020 { 1021 struct vfio_device_feature_dma_logging_control *control = 1022 (struct vfio_device_feature_dma_logging_control *)feature->data; 1023 struct vfio_device_feature_dma_logging_range *ranges = 1024 (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; 1025 1026 g_free(ranges); 1027 g_free(feature); 1028 } 1029 1030 static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, 1031 Error **errp) 1032 { 1033 struct vfio_device_feature *feature; 1034 VFIODirtyRanges ranges; 1035 VFIODevice *vbasedev; 1036 int ret = 0; 1037 1038 vfio_dirty_tracking_init(bcontainer, &ranges); 1039 feature = vfio_device_feature_dma_logging_start_create(bcontainer, 1040 &ranges); 1041 if (!feature) { 1042 error_setg_errno(errp, errno, "Failed to prepare DMA logging"); 1043 return -errno; 1044 } 1045 1046 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 1047 if (vbasedev->dirty_tracking) { 1048 continue; 1049 } 1050 1051 ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); 1052 if (ret) { 1053 ret = -errno; 1054 error_setg_errno(errp, errno, "%s: Failed to start DMA logging", 1055 vbasedev->name); 1056 goto out; 1057 } 1058 vbasedev->dirty_tracking = true; 1059 } 1060 1061 out: 1062 if (ret) { 1063 vfio_devices_dma_logging_stop(bcontainer); 1064 } 1065 1066 vfio_device_feature_dma_logging_start_destroy(feature); 1067 1068 return ret; 1069 } 1070 1071 static bool vfio_listener_log_global_start(MemoryListener *listener, 1072 Error **errp) 1073 { 1074 ERRP_GUARD(); 1075 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1076 listener); 1077 int ret; 1078 1079 if (vfio_devices_all_device_dirty_tracking(bcontainer)) { 1080 ret = vfio_devices_dma_logging_start(bcontainer, errp); 1081 } else { 1082 ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp); 1083 } 1084 1085 if (ret) { 1086 error_prepend(errp, "vfio: Could not start dirty page tracking - "); 1087 } 1088 return !ret; 1089 } 1090 1091 static void vfio_listener_log_global_stop(MemoryListener *listener) 1092 { 1093 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1094 listener); 1095 Error *local_err = NULL; 1096 int ret = 0; 1097 1098 if (vfio_devices_all_device_dirty_tracking(bcontainer)) { 1099 vfio_devices_dma_logging_stop(bcontainer); 1100 } else { 1101 ret = vfio_container_set_dirty_page_tracking(bcontainer, false, 1102 &local_err); 1103 } 1104 1105 if (ret) { 1106 error_prepend(&local_err, 1107 "vfio: Could not stop dirty page tracking - "); 1108 error_report_err(local_err); 1109 vfio_set_migration_error(ret); 1110 } 1111 } 1112 1113 static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, 1114 hwaddr size, void *bitmap) 1115 { 1116 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + 1117 sizeof(struct vfio_device_feature_dma_logging_report), 1118 sizeof(uint64_t))] = {}; 1119 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 1120 struct vfio_device_feature_dma_logging_report *report = 1121 (struct vfio_device_feature_dma_logging_report *)feature->data; 1122 1123 report->iova = iova; 1124 report->length = size; 1125 report->page_size = qemu_real_host_page_size(); 1126 report->bitmap = (uintptr_t)bitmap; 1127 1128 feature->argsz = sizeof(buf); 1129 feature->flags = VFIO_DEVICE_FEATURE_GET | 1130 VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; 1131 1132 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 1133 return -errno; 1134 } 1135 1136 return 0; 1137 } 1138 1139 int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, 1140 VFIOBitmap *vbmap, hwaddr iova, 1141 hwaddr size) 1142 { 1143 VFIODevice *vbasedev; 1144 int ret; 1145 1146 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 1147 ret = vfio_device_dma_logging_report(vbasedev, iova, size, 1148 vbmap->bitmap); 1149 if (ret) { 1150 error_report("%s: Failed to get DMA logging report, iova: " 1151 "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx 1152 ", err: %d (%s)", 1153 vbasedev->name, iova, size, ret, strerror(-ret)); 1154 1155 return ret; 1156 } 1157 } 1158 1159 return 0; 1160 } 1161 1162 int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, 1163 uint64_t size, ram_addr_t ram_addr) 1164 { 1165 bool all_device_dirty_tracking = 1166 vfio_devices_all_device_dirty_tracking(bcontainer); 1167 uint64_t dirty_pages; 1168 VFIOBitmap vbmap; 1169 int ret; 1170 1171 if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { 1172 cpu_physical_memory_set_dirty_range(ram_addr, size, 1173 tcg_enabled() ? DIRTY_CLIENTS_ALL : 1174 DIRTY_CLIENTS_NOCODE); 1175 return 0; 1176 } 1177 1178 ret = vfio_bitmap_alloc(&vbmap, size); 1179 if (ret) { 1180 return ret; 1181 } 1182 1183 if (all_device_dirty_tracking) { 1184 ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size); 1185 } else { 1186 ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size); 1187 } 1188 1189 if (ret) { 1190 goto out; 1191 } 1192 1193 dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, 1194 vbmap.pages); 1195 1196 trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); 1197 out: 1198 g_free(vbmap.bitmap); 1199 1200 return ret; 1201 } 1202 1203 typedef struct { 1204 IOMMUNotifier n; 1205 VFIOGuestIOMMU *giommu; 1206 } vfio_giommu_dirty_notifier; 1207 1208 static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 1209 { 1210 vfio_giommu_dirty_notifier *gdn = container_of(n, 1211 vfio_giommu_dirty_notifier, n); 1212 VFIOGuestIOMMU *giommu = gdn->giommu; 1213 VFIOContainerBase *bcontainer = giommu->bcontainer; 1214 hwaddr iova = iotlb->iova + giommu->iommu_offset; 1215 ram_addr_t translated_addr; 1216 int ret = -EINVAL; 1217 1218 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); 1219 1220 if (iotlb->target_as != &address_space_memory) { 1221 error_report("Wrong target AS \"%s\", only system memory is allowed", 1222 iotlb->target_as->name ? iotlb->target_as->name : "none"); 1223 goto out; 1224 } 1225 1226 rcu_read_lock(); 1227 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { 1228 ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, 1229 translated_addr); 1230 if (ret) { 1231 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " 1232 "0x%"HWADDR_PRIx") = %d (%s)", 1233 bcontainer, iova, iotlb->addr_mask + 1, ret, 1234 strerror(-ret)); 1235 } 1236 } 1237 rcu_read_unlock(); 1238 1239 out: 1240 if (ret) { 1241 vfio_set_migration_error(ret); 1242 } 1243 } 1244 1245 static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, 1246 void *opaque) 1247 { 1248 const hwaddr size = int128_get64(section->size); 1249 const hwaddr iova = section->offset_within_address_space; 1250 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + 1251 section->offset_within_region; 1252 VFIORamDiscardListener *vrdl = opaque; 1253 1254 /* 1255 * Sync the whole mapped region (spanning multiple individual mappings) 1256 * in one go. 1257 */ 1258 return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr); 1259 } 1260 1261 static int 1262 vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, 1263 MemoryRegionSection *section) 1264 { 1265 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 1266 VFIORamDiscardListener *vrdl = NULL; 1267 1268 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 1269 if (vrdl->mr == section->mr && 1270 vrdl->offset_within_address_space == 1271 section->offset_within_address_space) { 1272 break; 1273 } 1274 } 1275 1276 if (!vrdl) { 1277 hw_error("vfio: Trying to sync missing RAM discard listener"); 1278 } 1279 1280 /* 1281 * We only want/can synchronize the bitmap for actually mapped parts - 1282 * which correspond to populated parts. Replay all populated parts. 1283 */ 1284 return ram_discard_manager_replay_populated(rdm, section, 1285 vfio_ram_discard_get_dirty_bitmap, 1286 &vrdl); 1287 } 1288 1289 static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, 1290 MemoryRegionSection *section) 1291 { 1292 ram_addr_t ram_addr; 1293 1294 if (memory_region_is_iommu(section->mr)) { 1295 VFIOGuestIOMMU *giommu; 1296 1297 QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { 1298 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 1299 giommu->n.start == section->offset_within_region) { 1300 Int128 llend; 1301 vfio_giommu_dirty_notifier gdn = { .giommu = giommu }; 1302 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr, 1303 MEMTXATTRS_UNSPECIFIED); 1304 1305 llend = int128_add(int128_make64(section->offset_within_region), 1306 section->size); 1307 llend = int128_sub(llend, int128_one()); 1308 1309 iommu_notifier_init(&gdn.n, 1310 vfio_iommu_map_dirty_notify, 1311 IOMMU_NOTIFIER_MAP, 1312 section->offset_within_region, 1313 int128_get64(llend), 1314 idx); 1315 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n); 1316 break; 1317 } 1318 } 1319 return 0; 1320 } else if (memory_region_has_ram_discard_manager(section->mr)) { 1321 return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); 1322 } 1323 1324 ram_addr = memory_region_get_ram_addr(section->mr) + 1325 section->offset_within_region; 1326 1327 return vfio_get_dirty_bitmap(bcontainer, 1328 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), 1329 int128_get64(section->size), ram_addr); 1330 } 1331 1332 static void vfio_listener_log_sync(MemoryListener *listener, 1333 MemoryRegionSection *section) 1334 { 1335 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1336 listener); 1337 int ret; 1338 1339 if (vfio_listener_skipped_section(section)) { 1340 return; 1341 } 1342 1343 if (vfio_devices_all_dirty_tracking(bcontainer)) { 1344 ret = vfio_sync_dirty_bitmap(bcontainer, section); 1345 if (ret) { 1346 error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, 1347 strerror(-ret)); 1348 vfio_set_migration_error(ret); 1349 } 1350 } 1351 } 1352 1353 const MemoryListener vfio_memory_listener = { 1354 .name = "vfio", 1355 .region_add = vfio_listener_region_add, 1356 .region_del = vfio_listener_region_del, 1357 .log_global_start = vfio_listener_log_global_start, 1358 .log_global_stop = vfio_listener_log_global_stop, 1359 .log_sync = vfio_listener_log_sync, 1360 }; 1361 1362 void vfio_reset_handler(void *opaque) 1363 { 1364 VFIODevice *vbasedev; 1365 1366 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 1367 if (vbasedev->dev->realized) { 1368 vbasedev->ops->vfio_compute_needs_reset(vbasedev); 1369 } 1370 } 1371 1372 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 1373 if (vbasedev->dev->realized && vbasedev->needs_reset) { 1374 vbasedev->ops->vfio_hot_reset_multi(vbasedev); 1375 } 1376 } 1377 } 1378 1379 int vfio_kvm_device_add_fd(int fd, Error **errp) 1380 { 1381 #ifdef CONFIG_KVM 1382 struct kvm_device_attr attr = { 1383 .group = KVM_DEV_VFIO_FILE, 1384 .attr = KVM_DEV_VFIO_FILE_ADD, 1385 .addr = (uint64_t)(unsigned long)&fd, 1386 }; 1387 1388 if (!kvm_enabled()) { 1389 return 0; 1390 } 1391 1392 if (vfio_kvm_device_fd < 0) { 1393 struct kvm_create_device cd = { 1394 .type = KVM_DEV_TYPE_VFIO, 1395 }; 1396 1397 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { 1398 error_setg_errno(errp, errno, "Failed to create KVM VFIO device"); 1399 return -errno; 1400 } 1401 1402 vfio_kvm_device_fd = cd.fd; 1403 } 1404 1405 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1406 error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device", 1407 fd); 1408 return -errno; 1409 } 1410 #endif 1411 return 0; 1412 } 1413 1414 int vfio_kvm_device_del_fd(int fd, Error **errp) 1415 { 1416 #ifdef CONFIG_KVM 1417 struct kvm_device_attr attr = { 1418 .group = KVM_DEV_VFIO_FILE, 1419 .attr = KVM_DEV_VFIO_FILE_DEL, 1420 .addr = (uint64_t)(unsigned long)&fd, 1421 }; 1422 1423 if (vfio_kvm_device_fd < 0) { 1424 error_setg(errp, "KVM VFIO device isn't created yet"); 1425 return -EINVAL; 1426 } 1427 1428 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1429 error_setg_errno(errp, errno, 1430 "Failed to remove fd %d from KVM VFIO device", fd); 1431 return -errno; 1432 } 1433 #endif 1434 return 0; 1435 } 1436 1437 VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) 1438 { 1439 VFIOAddressSpace *space; 1440 1441 QLIST_FOREACH(space, &vfio_address_spaces, list) { 1442 if (space->as == as) { 1443 return space; 1444 } 1445 } 1446 1447 /* No suitable VFIOAddressSpace, create a new one */ 1448 space = g_malloc0(sizeof(*space)); 1449 space->as = as; 1450 QLIST_INIT(&space->containers); 1451 1452 if (QLIST_EMPTY(&vfio_address_spaces)) { 1453 qemu_register_reset(vfio_reset_handler, NULL); 1454 } 1455 1456 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); 1457 1458 return space; 1459 } 1460 1461 void vfio_put_address_space(VFIOAddressSpace *space) 1462 { 1463 if (!QLIST_EMPTY(&space->containers)) { 1464 return; 1465 } 1466 1467 QLIST_REMOVE(space, list); 1468 g_free(space); 1469 1470 if (QLIST_EMPTY(&vfio_address_spaces)) { 1471 qemu_unregister_reset(vfio_reset_handler, NULL); 1472 } 1473 } 1474 1475 struct vfio_device_info *vfio_get_device_info(int fd) 1476 { 1477 struct vfio_device_info *info; 1478 uint32_t argsz = sizeof(*info); 1479 1480 info = g_malloc0(argsz); 1481 1482 retry: 1483 info->argsz = argsz; 1484 1485 if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { 1486 g_free(info); 1487 return NULL; 1488 } 1489 1490 if (info->argsz > argsz) { 1491 argsz = info->argsz; 1492 info = g_realloc(info, argsz); 1493 goto retry; 1494 } 1495 1496 return info; 1497 } 1498 1499 int vfio_attach_device(char *name, VFIODevice *vbasedev, 1500 AddressSpace *as, Error **errp) 1501 { 1502 const VFIOIOMMUClass *ops = 1503 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); 1504 1505 if (vbasedev->iommufd) { 1506 ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); 1507 } 1508 1509 assert(ops); 1510 1511 return ops->attach_device(name, vbasedev, as, errp); 1512 } 1513 1514 void vfio_detach_device(VFIODevice *vbasedev) 1515 { 1516 if (!vbasedev->bcontainer) { 1517 return; 1518 } 1519 vbasedev->bcontainer->ops->detach_device(vbasedev); 1520 } 1521