1 /* 2 * generic functions used by VFIO devices 3 * 4 * Copyright Red Hat, Inc. 2012 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Based on qemu-kvm device-assignment: 13 * Adapted for KVM by Qumranet. 14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) 15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) 16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) 17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) 18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) 19 */ 20 21 #include "qemu/osdep.h" 22 #include <sys/ioctl.h> 23 #ifdef CONFIG_KVM 24 #include <linux/kvm.h> 25 #endif 26 #include <linux/vfio.h> 27 28 #include "hw/vfio/vfio-common.h" 29 #include "hw/vfio/pci.h" 30 #include "exec/address-spaces.h" 31 #include "exec/memory.h" 32 #include "exec/ram_addr.h" 33 #include "hw/hw.h" 34 #include "qemu/error-report.h" 35 #include "qemu/main-loop.h" 36 #include "qemu/range.h" 37 #include "sysemu/kvm.h" 38 #include "sysemu/reset.h" 39 #include "sysemu/runstate.h" 40 #include "trace.h" 41 #include "qapi/error.h" 42 #include "migration/misc.h" 43 #include "migration/blocker.h" 44 #include "migration/qemu-file.h" 45 #include "sysemu/tpm.h" 46 47 VFIODeviceList vfio_device_list = 48 QLIST_HEAD_INITIALIZER(vfio_device_list); 49 static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = 50 QLIST_HEAD_INITIALIZER(vfio_address_spaces); 51 52 #ifdef CONFIG_KVM 53 /* 54 * We have a single VFIO pseudo device per KVM VM. Once created it lives 55 * for the life of the VM. Closing the file descriptor only drops our 56 * reference to it and the device's reference to kvm. Therefore once 57 * initialized, this file descriptor is only released on QEMU exit and 58 * we'll re-use it should another vfio device be attached before then. 59 */ 60 int vfio_kvm_device_fd = -1; 61 #endif 62 63 /* 64 * Device state interfaces 65 */ 66 67 bool vfio_mig_active(void) 68 { 69 VFIODevice *vbasedev; 70 71 if (QLIST_EMPTY(&vfio_device_list)) { 72 return false; 73 } 74 75 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 76 if (vbasedev->migration_blocker) { 77 return false; 78 } 79 } 80 return true; 81 } 82 83 static Error *multiple_devices_migration_blocker; 84 85 /* 86 * Multiple devices migration is allowed only if all devices support P2P 87 * migration. Single device migration is allowed regardless of P2P migration 88 * support. 89 */ 90 static bool vfio_multiple_devices_migration_is_supported(void) 91 { 92 VFIODevice *vbasedev; 93 unsigned int device_num = 0; 94 bool all_support_p2p = true; 95 96 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 97 if (vbasedev->migration) { 98 device_num++; 99 100 if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { 101 all_support_p2p = false; 102 } 103 } 104 } 105 106 return all_support_p2p || device_num <= 1; 107 } 108 109 int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) 110 { 111 int ret; 112 113 if (vfio_multiple_devices_migration_is_supported()) { 114 return 0; 115 } 116 117 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { 118 error_setg(errp, "Multiple VFIO devices migration is supported only if " 119 "all of them support P2P migration"); 120 return -EINVAL; 121 } 122 123 if (multiple_devices_migration_blocker) { 124 return 0; 125 } 126 127 error_setg(&multiple_devices_migration_blocker, 128 "Multiple VFIO devices migration is supported only if all of " 129 "them support P2P migration"); 130 ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, errp); 131 132 return ret; 133 } 134 135 void vfio_unblock_multiple_devices_migration(void) 136 { 137 if (!multiple_devices_migration_blocker || 138 !vfio_multiple_devices_migration_is_supported()) { 139 return; 140 } 141 142 migrate_del_blocker(&multiple_devices_migration_blocker); 143 } 144 145 bool vfio_viommu_preset(VFIODevice *vbasedev) 146 { 147 return vbasedev->bcontainer->space->as != &address_space_memory; 148 } 149 150 static void vfio_set_migration_error(int ret) 151 { 152 if (migration_is_running()) { 153 migration_file_set_error(ret, NULL); 154 } 155 } 156 157 bool vfio_device_state_is_running(VFIODevice *vbasedev) 158 { 159 VFIOMigration *migration = vbasedev->migration; 160 161 return migration->device_state == VFIO_DEVICE_STATE_RUNNING || 162 migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P; 163 } 164 165 bool vfio_device_state_is_precopy(VFIODevice *vbasedev) 166 { 167 VFIOMigration *migration = vbasedev->migration; 168 169 return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY || 170 migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; 171 } 172 173 static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) 174 { 175 VFIODevice *vbasedev; 176 177 if (!migration_is_active() && !migration_is_device()) { 178 return false; 179 } 180 181 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 182 VFIOMigration *migration = vbasedev->migration; 183 184 if (!migration) { 185 return false; 186 } 187 188 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && 189 (vfio_device_state_is_running(vbasedev) || 190 vfio_device_state_is_precopy(vbasedev))) { 191 return false; 192 } 193 } 194 return true; 195 } 196 197 bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) 198 { 199 VFIODevice *vbasedev; 200 201 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 202 if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { 203 return false; 204 } 205 if (!vbasedev->dirty_pages_supported) { 206 return false; 207 } 208 } 209 210 return true; 211 } 212 213 /* 214 * Check if all VFIO devices are running and migration is active, which is 215 * essentially equivalent to the migration being in pre-copy phase. 216 */ 217 bool 218 vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) 219 { 220 VFIODevice *vbasedev; 221 222 if (!migration_is_active()) { 223 return false; 224 } 225 226 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 227 VFIOMigration *migration = vbasedev->migration; 228 229 if (!migration) { 230 return false; 231 } 232 233 if (vfio_device_state_is_running(vbasedev) || 234 vfio_device_state_is_precopy(vbasedev)) { 235 continue; 236 } else { 237 return false; 238 } 239 } 240 return true; 241 } 242 243 static bool vfio_listener_skipped_section(MemoryRegionSection *section) 244 { 245 return (!memory_region_is_ram(section->mr) && 246 !memory_region_is_iommu(section->mr)) || 247 memory_region_is_protected(section->mr) || 248 /* 249 * Sizing an enabled 64-bit BAR can cause spurious mappings to 250 * addresses in the upper part of the 64-bit address space. These 251 * are never accessed by the CPU and beyond the address width of 252 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. 253 */ 254 section->offset_within_address_space & (1ULL << 63); 255 } 256 257 /* Called with rcu_read_lock held. */ 258 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, 259 ram_addr_t *ram_addr, bool *read_only, 260 Error **errp) 261 { 262 bool ret, mr_has_discard_manager; 263 264 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, 265 &mr_has_discard_manager, errp); 266 if (ret && mr_has_discard_manager) { 267 /* 268 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The 269 * pages will remain pinned inside vfio until unmapped, resulting in a 270 * higher memory consumption than expected. If memory would get 271 * populated again later, there would be an inconsistency between pages 272 * pinned by vfio and pages seen by QEMU. This is the case until 273 * unmapped from the IOMMU (e.g., during device reset). 274 * 275 * With malicious guests, we really only care about pinning more memory 276 * than expected. RLIMIT_MEMLOCK set for the user/process can never be 277 * exceeded and can be used to mitigate this problem. 278 */ 279 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" 280 " RAM (e.g., virtio-mem) works, however, malicious" 281 " guests can trigger pinning of more memory than" 282 " intended via an IOMMU. It's possible to mitigate " 283 " by setting/adjusting RLIMIT_MEMLOCK."); 284 } 285 return ret; 286 } 287 288 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 289 { 290 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); 291 VFIOContainerBase *bcontainer = giommu->bcontainer; 292 hwaddr iova = iotlb->iova + giommu->iommu_offset; 293 void *vaddr; 294 int ret; 295 Error *local_err = NULL; 296 297 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", 298 iova, iova + iotlb->addr_mask); 299 300 if (iotlb->target_as != &address_space_memory) { 301 error_report("Wrong target AS \"%s\", only system memory is allowed", 302 iotlb->target_as->name ? iotlb->target_as->name : "none"); 303 vfio_set_migration_error(-EINVAL); 304 return; 305 } 306 307 rcu_read_lock(); 308 309 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { 310 bool read_only; 311 312 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { 313 error_report_err(local_err); 314 goto out; 315 } 316 /* 317 * vaddr is only valid until rcu_read_unlock(). But after 318 * vfio_dma_map has set up the mapping the pages will be 319 * pinned by the kernel. This makes sure that the RAM backend 320 * of vaddr will always be there, even if the memory object is 321 * destroyed and its backing memory munmap-ed. 322 */ 323 ret = vfio_container_dma_map(bcontainer, iova, 324 iotlb->addr_mask + 1, vaddr, 325 read_only); 326 if (ret) { 327 error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " 328 "0x%"HWADDR_PRIx", %p) = %d (%s)", 329 bcontainer, iova, 330 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); 331 } 332 } else { 333 ret = vfio_container_dma_unmap(bcontainer, iova, 334 iotlb->addr_mask + 1, iotlb); 335 if (ret) { 336 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 337 "0x%"HWADDR_PRIx") = %d (%s)", 338 bcontainer, iova, 339 iotlb->addr_mask + 1, ret, strerror(-ret)); 340 vfio_set_migration_error(ret); 341 } 342 } 343 out: 344 rcu_read_unlock(); 345 } 346 347 static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, 348 MemoryRegionSection *section) 349 { 350 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 351 listener); 352 VFIOContainerBase *bcontainer = vrdl->bcontainer; 353 const hwaddr size = int128_get64(section->size); 354 const hwaddr iova = section->offset_within_address_space; 355 int ret; 356 357 /* Unmap with a single call. */ 358 ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); 359 if (ret) { 360 error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, 361 strerror(-ret)); 362 } 363 } 364 365 static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, 366 MemoryRegionSection *section) 367 { 368 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, 369 listener); 370 VFIOContainerBase *bcontainer = vrdl->bcontainer; 371 const hwaddr end = section->offset_within_region + 372 int128_get64(section->size); 373 hwaddr start, next, iova; 374 void *vaddr; 375 int ret; 376 377 /* 378 * Map in (aligned within memory region) minimum granularity, so we can 379 * unmap in minimum granularity later. 380 */ 381 for (start = section->offset_within_region; start < end; start = next) { 382 next = ROUND_UP(start + 1, vrdl->granularity); 383 next = MIN(next, end); 384 385 iova = start - section->offset_within_region + 386 section->offset_within_address_space; 387 vaddr = memory_region_get_ram_ptr(section->mr) + start; 388 389 ret = vfio_container_dma_map(bcontainer, iova, next - start, 390 vaddr, section->readonly); 391 if (ret) { 392 /* Rollback */ 393 vfio_ram_discard_notify_discard(rdl, section); 394 return ret; 395 } 396 } 397 return 0; 398 } 399 400 static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, 401 MemoryRegionSection *section) 402 { 403 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 404 VFIORamDiscardListener *vrdl; 405 406 /* Ignore some corner cases not relevant in practice. */ 407 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); 408 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, 409 TARGET_PAGE_SIZE)); 410 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); 411 412 vrdl = g_new0(VFIORamDiscardListener, 1); 413 vrdl->bcontainer = bcontainer; 414 vrdl->mr = section->mr; 415 vrdl->offset_within_address_space = section->offset_within_address_space; 416 vrdl->size = int128_get64(section->size); 417 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, 418 section->mr); 419 420 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); 421 g_assert(bcontainer->pgsizes && 422 vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); 423 424 ram_discard_listener_init(&vrdl->listener, 425 vfio_ram_discard_notify_populate, 426 vfio_ram_discard_notify_discard, true); 427 ram_discard_manager_register_listener(rdm, &vrdl->listener, section); 428 QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); 429 430 /* 431 * Sanity-check if we have a theoretically problematic setup where we could 432 * exceed the maximum number of possible DMA mappings over time. We assume 433 * that each mapped section in the same address space as a RamDiscardManager 434 * section consumes exactly one DMA mapping, with the exception of 435 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections 436 * in the same address space as RamDiscardManager sections. 437 * 438 * We assume that each section in the address space consumes one memslot. 439 * We take the number of KVM memory slots as a best guess for the maximum 440 * number of sections in the address space we could have over time, 441 * also consuming DMA mappings. 442 */ 443 if (bcontainer->dma_max_mappings) { 444 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; 445 446 #ifdef CONFIG_KVM 447 if (kvm_enabled()) { 448 max_memslots = kvm_get_max_memslots(); 449 } 450 #endif 451 452 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 453 hwaddr start, end; 454 455 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, 456 vrdl->granularity); 457 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, 458 vrdl->granularity); 459 vrdl_mappings += (end - start) / vrdl->granularity; 460 vrdl_count++; 461 } 462 463 if (vrdl_mappings + max_memslots - vrdl_count > 464 bcontainer->dma_max_mappings) { 465 warn_report("%s: possibly running out of DMA mappings. E.g., try" 466 " increasing the 'block-size' of virtio-mem devies." 467 " Maximum possible DMA mappings: %d, Maximum possible" 468 " memslots: %d", __func__, bcontainer->dma_max_mappings, 469 max_memslots); 470 } 471 } 472 } 473 474 static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, 475 MemoryRegionSection *section) 476 { 477 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 478 VFIORamDiscardListener *vrdl = NULL; 479 480 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 481 if (vrdl->mr == section->mr && 482 vrdl->offset_within_address_space == 483 section->offset_within_address_space) { 484 break; 485 } 486 } 487 488 if (!vrdl) { 489 hw_error("vfio: Trying to unregister missing RAM discard listener"); 490 } 491 492 ram_discard_manager_unregister_listener(rdm, &vrdl->listener); 493 QLIST_REMOVE(vrdl, next); 494 g_free(vrdl); 495 } 496 497 static bool vfio_known_safe_misalignment(MemoryRegionSection *section) 498 { 499 MemoryRegion *mr = section->mr; 500 501 if (!TPM_IS_CRB(mr->owner)) { 502 return false; 503 } 504 505 /* this is a known safe misaligned region, just trace for debug purpose */ 506 trace_vfio_known_safe_misalignment(memory_region_name(mr), 507 section->offset_within_address_space, 508 section->offset_within_region, 509 qemu_real_host_page_size()); 510 return true; 511 } 512 513 static bool vfio_listener_valid_section(MemoryRegionSection *section, 514 const char *name) 515 { 516 if (vfio_listener_skipped_section(section)) { 517 trace_vfio_listener_region_skip(name, 518 section->offset_within_address_space, 519 section->offset_within_address_space + 520 int128_get64(int128_sub(section->size, int128_one()))); 521 return false; 522 } 523 524 if (unlikely((section->offset_within_address_space & 525 ~qemu_real_host_page_mask()) != 526 (section->offset_within_region & ~qemu_real_host_page_mask()))) { 527 if (!vfio_known_safe_misalignment(section)) { 528 error_report("%s received unaligned region %s iova=0x%"PRIx64 529 " offset_within_region=0x%"PRIx64 530 " qemu_real_host_page_size=0x%"PRIxPTR, 531 __func__, memory_region_name(section->mr), 532 section->offset_within_address_space, 533 section->offset_within_region, 534 qemu_real_host_page_size()); 535 } 536 return false; 537 } 538 539 return true; 540 } 541 542 static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, 543 MemoryRegionSection *section, 544 hwaddr *out_iova, hwaddr *out_end, 545 Int128 *out_llend) 546 { 547 Int128 llend; 548 hwaddr iova; 549 550 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); 551 llend = int128_make64(section->offset_within_address_space); 552 llend = int128_add(llend, section->size); 553 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); 554 555 if (int128_ge(int128_make64(iova), llend)) { 556 return false; 557 } 558 559 *out_iova = iova; 560 *out_end = int128_get64(int128_sub(llend, int128_one())); 561 if (out_llend) { 562 *out_llend = llend; 563 } 564 return true; 565 } 566 567 static void vfio_listener_region_add(MemoryListener *listener, 568 MemoryRegionSection *section) 569 { 570 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 571 listener); 572 hwaddr iova, end; 573 Int128 llend, llsize; 574 void *vaddr; 575 int ret; 576 Error *err = NULL; 577 578 if (!vfio_listener_valid_section(section, "region_add")) { 579 return; 580 } 581 582 if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, 583 &llend)) { 584 if (memory_region_is_ram_device(section->mr)) { 585 trace_vfio_listener_region_add_no_dma_map( 586 memory_region_name(section->mr), 587 section->offset_within_address_space, 588 int128_getlo(section->size), 589 qemu_real_host_page_size()); 590 } 591 return; 592 } 593 594 if (!vfio_container_add_section_window(bcontainer, section, &err)) { 595 goto fail; 596 } 597 598 memory_region_ref(section->mr); 599 600 if (memory_region_is_iommu(section->mr)) { 601 VFIOGuestIOMMU *giommu; 602 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); 603 int iommu_idx; 604 605 trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); 606 /* 607 * FIXME: For VFIO iommu types which have KVM acceleration to 608 * avoid bouncing all map/unmaps through qemu this way, this 609 * would be the right place to wire that up (tell the KVM 610 * device emulation the VFIO iommu handles to use). 611 */ 612 giommu = g_malloc0(sizeof(*giommu)); 613 giommu->iommu_mr = iommu_mr; 614 giommu->iommu_offset = section->offset_within_address_space - 615 section->offset_within_region; 616 giommu->bcontainer = bcontainer; 617 llend = int128_add(int128_make64(section->offset_within_region), 618 section->size); 619 llend = int128_sub(llend, int128_one()); 620 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 621 MEMTXATTRS_UNSPECIFIED); 622 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, 623 IOMMU_NOTIFIER_IOTLB_EVENTS, 624 section->offset_within_region, 625 int128_get64(llend), 626 iommu_idx); 627 628 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, 629 &err); 630 if (ret) { 631 g_free(giommu); 632 goto fail; 633 } 634 QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); 635 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); 636 637 return; 638 } 639 640 /* Here we assume that memory_region_is_ram(section->mr)==true */ 641 642 /* 643 * For RAM memory regions with a RamDiscardManager, we only want to map the 644 * actually populated parts - and update the mapping whenever we're notified 645 * about changes. 646 */ 647 if (memory_region_has_ram_discard_manager(section->mr)) { 648 vfio_register_ram_discard_listener(bcontainer, section); 649 return; 650 } 651 652 vaddr = memory_region_get_ram_ptr(section->mr) + 653 section->offset_within_region + 654 (iova - section->offset_within_address_space); 655 656 trace_vfio_listener_region_add_ram(iova, end, vaddr); 657 658 llsize = int128_sub(llend, int128_make64(iova)); 659 660 if (memory_region_is_ram_device(section->mr)) { 661 hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; 662 663 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { 664 trace_vfio_listener_region_add_no_dma_map( 665 memory_region_name(section->mr), 666 section->offset_within_address_space, 667 int128_getlo(section->size), 668 pgmask + 1); 669 return; 670 } 671 } 672 673 ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), 674 vaddr, section->readonly); 675 if (ret) { 676 error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " 677 "0x%"HWADDR_PRIx", %p) = %d (%s)", 678 bcontainer, iova, int128_get64(llsize), vaddr, ret, 679 strerror(-ret)); 680 if (memory_region_is_ram_device(section->mr)) { 681 /* Allow unexpected mappings not to be fatal for RAM devices */ 682 error_report_err(err); 683 return; 684 } 685 goto fail; 686 } 687 688 return; 689 690 fail: 691 if (memory_region_is_ram_device(section->mr)) { 692 error_reportf_err(err, "PCI p2p may not work: "); 693 return; 694 } 695 /* 696 * On the initfn path, store the first error in the container so we 697 * can gracefully fail. Runtime, there's not much we can do other 698 * than throw a hardware error. 699 */ 700 if (!bcontainer->initialized) { 701 if (!bcontainer->error) { 702 error_propagate_prepend(&bcontainer->error, err, 703 "Region %s: ", 704 memory_region_name(section->mr)); 705 } else { 706 error_free(err); 707 } 708 } else { 709 error_report_err(err); 710 hw_error("vfio: DMA mapping failed, unable to continue"); 711 } 712 } 713 714 static void vfio_listener_region_del(MemoryListener *listener, 715 MemoryRegionSection *section) 716 { 717 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 718 listener); 719 hwaddr iova, end; 720 Int128 llend, llsize; 721 int ret; 722 bool try_unmap = true; 723 724 if (!vfio_listener_valid_section(section, "region_del")) { 725 return; 726 } 727 728 if (memory_region_is_iommu(section->mr)) { 729 VFIOGuestIOMMU *giommu; 730 731 trace_vfio_listener_region_del_iommu(section->mr->name); 732 QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { 733 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 734 giommu->n.start == section->offset_within_region) { 735 memory_region_unregister_iommu_notifier(section->mr, 736 &giommu->n); 737 QLIST_REMOVE(giommu, giommu_next); 738 g_free(giommu); 739 break; 740 } 741 } 742 743 /* 744 * FIXME: We assume the one big unmap below is adequate to 745 * remove any individual page mappings in the IOMMU which 746 * might have been copied into VFIO. This works for a page table 747 * based IOMMU where a big unmap flattens a large range of IO-PTEs. 748 * That may not be true for all IOMMU types. 749 */ 750 } 751 752 if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, 753 &llend)) { 754 return; 755 } 756 757 llsize = int128_sub(llend, int128_make64(iova)); 758 759 trace_vfio_listener_region_del(iova, end); 760 761 if (memory_region_is_ram_device(section->mr)) { 762 hwaddr pgmask; 763 764 pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; 765 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); 766 } else if (memory_region_has_ram_discard_manager(section->mr)) { 767 vfio_unregister_ram_discard_listener(bcontainer, section); 768 /* Unregistering will trigger an unmap. */ 769 try_unmap = false; 770 } 771 772 if (try_unmap) { 773 if (int128_eq(llsize, int128_2_64())) { 774 /* The unmap ioctl doesn't accept a full 64-bit span. */ 775 llsize = int128_rshift(llsize, 1); 776 ret = vfio_container_dma_unmap(bcontainer, iova, 777 int128_get64(llsize), NULL); 778 if (ret) { 779 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 780 "0x%"HWADDR_PRIx") = %d (%s)", 781 bcontainer, iova, int128_get64(llsize), ret, 782 strerror(-ret)); 783 } 784 iova += int128_get64(llsize); 785 } 786 ret = vfio_container_dma_unmap(bcontainer, iova, 787 int128_get64(llsize), NULL); 788 if (ret) { 789 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " 790 "0x%"HWADDR_PRIx") = %d (%s)", 791 bcontainer, iova, int128_get64(llsize), ret, 792 strerror(-ret)); 793 } 794 } 795 796 memory_region_unref(section->mr); 797 798 vfio_container_del_section_window(bcontainer, section); 799 } 800 801 typedef struct VFIODirtyRanges { 802 hwaddr min32; 803 hwaddr max32; 804 hwaddr min64; 805 hwaddr max64; 806 hwaddr minpci64; 807 hwaddr maxpci64; 808 } VFIODirtyRanges; 809 810 typedef struct VFIODirtyRangesListener { 811 VFIOContainerBase *bcontainer; 812 VFIODirtyRanges ranges; 813 MemoryListener listener; 814 } VFIODirtyRangesListener; 815 816 static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, 817 VFIOContainerBase *bcontainer) 818 { 819 VFIOPCIDevice *pcidev; 820 VFIODevice *vbasedev; 821 Object *owner; 822 823 owner = memory_region_owner(section->mr); 824 825 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 826 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { 827 continue; 828 } 829 pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 830 if (OBJECT(pcidev) == owner) { 831 return true; 832 } 833 } 834 835 return false; 836 } 837 838 static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range, 839 hwaddr iova, hwaddr end, 840 bool update_pci) 841 { 842 hwaddr *min, *max; 843 844 /* 845 * The address space passed to the dirty tracker is reduced to three ranges: 846 * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the 847 * PCI 64-bit hole. 848 * 849 * The underlying reports of dirty will query a sub-interval of each of 850 * these ranges. 851 * 852 * The purpose of the three range handling is to handle known cases of big 853 * holes in the address space, like the x86 AMD 1T hole, and firmware (like 854 * OVMF) which may relocate the pci-hole64 to the end of the address space. 855 * The latter would otherwise generate large ranges for tracking, stressing 856 * the limits of supported hardware. The pci-hole32 will always be below 4G 857 * (overlapping or not) so it doesn't need special handling and is part of 858 * the 32-bit range. 859 * 860 * The alternative would be an IOVATree but that has a much bigger runtime 861 * overhead and unnecessary complexity. 862 */ 863 if (update_pci && iova >= UINT32_MAX) { 864 min = &range->minpci64; 865 max = &range->maxpci64; 866 } else { 867 min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; 868 max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; 869 } 870 if (*min > iova) { 871 *min = iova; 872 } 873 if (*max < end) { 874 *max = end; 875 } 876 877 trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); 878 } 879 880 static void vfio_dirty_tracking_update(MemoryListener *listener, 881 MemoryRegionSection *section) 882 { 883 VFIODirtyRangesListener *dirty = 884 container_of(listener, VFIODirtyRangesListener, listener); 885 hwaddr iova, end; 886 887 if (!vfio_listener_valid_section(section, "tracking_update") || 888 !vfio_get_section_iova_range(dirty->bcontainer, section, 889 &iova, &end, NULL)) { 890 return; 891 } 892 893 vfio_dirty_tracking_update_range(&dirty->ranges, iova, end, 894 vfio_section_is_vfio_pci(section, dirty->bcontainer)); 895 } 896 897 static const MemoryListener vfio_dirty_tracking_listener = { 898 .name = "vfio-tracking", 899 .region_add = vfio_dirty_tracking_update, 900 }; 901 902 static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, 903 VFIODirtyRanges *ranges) 904 { 905 VFIODirtyRangesListener dirty; 906 907 memset(&dirty, 0, sizeof(dirty)); 908 dirty.ranges.min32 = UINT32_MAX; 909 dirty.ranges.min64 = UINT64_MAX; 910 dirty.ranges.minpci64 = UINT64_MAX; 911 dirty.listener = vfio_dirty_tracking_listener; 912 dirty.bcontainer = bcontainer; 913 914 memory_listener_register(&dirty.listener, 915 bcontainer->space->as); 916 917 *ranges = dirty.ranges; 918 919 /* 920 * The memory listener is synchronous, and used to calculate the range 921 * to dirty tracking. Unregister it after we are done as we are not 922 * interested in any follow-up updates. 923 */ 924 memory_listener_unregister(&dirty.listener); 925 } 926 927 static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) 928 { 929 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), 930 sizeof(uint64_t))] = {}; 931 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 932 VFIODevice *vbasedev; 933 934 feature->argsz = sizeof(buf); 935 feature->flags = VFIO_DEVICE_FEATURE_SET | 936 VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; 937 938 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 939 if (!vbasedev->dirty_tracking) { 940 continue; 941 } 942 943 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 944 warn_report("%s: Failed to stop DMA logging, err %d (%s)", 945 vbasedev->name, -errno, strerror(errno)); 946 } 947 vbasedev->dirty_tracking = false; 948 } 949 } 950 951 static struct vfio_device_feature * 952 vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, 953 VFIODirtyRanges *tracking) 954 { 955 struct vfio_device_feature *feature; 956 size_t feature_size; 957 struct vfio_device_feature_dma_logging_control *control; 958 struct vfio_device_feature_dma_logging_range *ranges; 959 960 feature_size = sizeof(struct vfio_device_feature) + 961 sizeof(struct vfio_device_feature_dma_logging_control); 962 feature = g_try_malloc0(feature_size); 963 if (!feature) { 964 errno = ENOMEM; 965 return NULL; 966 } 967 feature->argsz = feature_size; 968 feature->flags = VFIO_DEVICE_FEATURE_SET | 969 VFIO_DEVICE_FEATURE_DMA_LOGGING_START; 970 971 control = (struct vfio_device_feature_dma_logging_control *)feature->data; 972 control->page_size = qemu_real_host_page_size(); 973 974 /* 975 * DMA logging uAPI guarantees to support at least a number of ranges that 976 * fits into a single host kernel base page. 977 */ 978 control->num_ranges = !!tracking->max32 + !!tracking->max64 + 979 !!tracking->maxpci64; 980 ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, 981 control->num_ranges); 982 if (!ranges) { 983 g_free(feature); 984 errno = ENOMEM; 985 986 return NULL; 987 } 988 989 control->ranges = (uintptr_t)ranges; 990 if (tracking->max32) { 991 ranges->iova = tracking->min32; 992 ranges->length = (tracking->max32 - tracking->min32) + 1; 993 ranges++; 994 } 995 if (tracking->max64) { 996 ranges->iova = tracking->min64; 997 ranges->length = (tracking->max64 - tracking->min64) + 1; 998 ranges++; 999 } 1000 if (tracking->maxpci64) { 1001 ranges->iova = tracking->minpci64; 1002 ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1; 1003 } 1004 1005 trace_vfio_device_dirty_tracking_start(control->num_ranges, 1006 tracking->min32, tracking->max32, 1007 tracking->min64, tracking->max64, 1008 tracking->minpci64, tracking->maxpci64); 1009 1010 return feature; 1011 } 1012 1013 static void vfio_device_feature_dma_logging_start_destroy( 1014 struct vfio_device_feature *feature) 1015 { 1016 struct vfio_device_feature_dma_logging_control *control = 1017 (struct vfio_device_feature_dma_logging_control *)feature->data; 1018 struct vfio_device_feature_dma_logging_range *ranges = 1019 (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; 1020 1021 g_free(ranges); 1022 g_free(feature); 1023 } 1024 1025 static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, 1026 Error **errp) 1027 { 1028 struct vfio_device_feature *feature; 1029 VFIODirtyRanges ranges; 1030 VFIODevice *vbasedev; 1031 int ret = 0; 1032 1033 vfio_dirty_tracking_init(bcontainer, &ranges); 1034 feature = vfio_device_feature_dma_logging_start_create(bcontainer, 1035 &ranges); 1036 if (!feature) { 1037 error_setg_errno(errp, errno, "Failed to prepare DMA logging"); 1038 return false; 1039 } 1040 1041 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 1042 if (vbasedev->dirty_tracking) { 1043 continue; 1044 } 1045 1046 ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); 1047 if (ret) { 1048 ret = -errno; 1049 error_setg_errno(errp, errno, "%s: Failed to start DMA logging", 1050 vbasedev->name); 1051 goto out; 1052 } 1053 vbasedev->dirty_tracking = true; 1054 } 1055 1056 out: 1057 if (ret) { 1058 vfio_devices_dma_logging_stop(bcontainer); 1059 } 1060 1061 vfio_device_feature_dma_logging_start_destroy(feature); 1062 1063 return ret == 0; 1064 } 1065 1066 static bool vfio_listener_log_global_start(MemoryListener *listener, 1067 Error **errp) 1068 { 1069 ERRP_GUARD(); 1070 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1071 listener); 1072 bool ret; 1073 1074 if (vfio_devices_all_device_dirty_tracking(bcontainer)) { 1075 ret = vfio_devices_dma_logging_start(bcontainer, errp); 1076 } else { 1077 ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0; 1078 } 1079 1080 if (!ret) { 1081 error_prepend(errp, "vfio: Could not start dirty page tracking - "); 1082 } 1083 return ret; 1084 } 1085 1086 static void vfio_listener_log_global_stop(MemoryListener *listener) 1087 { 1088 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1089 listener); 1090 Error *local_err = NULL; 1091 int ret = 0; 1092 1093 if (vfio_devices_all_device_dirty_tracking(bcontainer)) { 1094 vfio_devices_dma_logging_stop(bcontainer); 1095 } else { 1096 ret = vfio_container_set_dirty_page_tracking(bcontainer, false, 1097 &local_err); 1098 } 1099 1100 if (ret) { 1101 error_prepend(&local_err, 1102 "vfio: Could not stop dirty page tracking - "); 1103 error_report_err(local_err); 1104 vfio_set_migration_error(ret); 1105 } 1106 } 1107 1108 static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, 1109 hwaddr size, void *bitmap) 1110 { 1111 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + 1112 sizeof(struct vfio_device_feature_dma_logging_report), 1113 sizeof(uint64_t))] = {}; 1114 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 1115 struct vfio_device_feature_dma_logging_report *report = 1116 (struct vfio_device_feature_dma_logging_report *)feature->data; 1117 1118 report->iova = iova; 1119 report->length = size; 1120 report->page_size = qemu_real_host_page_size(); 1121 report->bitmap = (uintptr_t)bitmap; 1122 1123 feature->argsz = sizeof(buf); 1124 feature->flags = VFIO_DEVICE_FEATURE_GET | 1125 VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; 1126 1127 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 1128 return -errno; 1129 } 1130 1131 return 0; 1132 } 1133 1134 int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, 1135 VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) 1136 { 1137 VFIODevice *vbasedev; 1138 int ret; 1139 1140 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { 1141 ret = vfio_device_dma_logging_report(vbasedev, iova, size, 1142 vbmap->bitmap); 1143 if (ret) { 1144 error_setg_errno(errp, -ret, 1145 "%s: Failed to get DMA logging report, iova: " 1146 "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx, 1147 vbasedev->name, iova, size); 1148 1149 return ret; 1150 } 1151 } 1152 1153 return 0; 1154 } 1155 1156 int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, 1157 uint64_t size, ram_addr_t ram_addr, Error **errp) 1158 { 1159 bool all_device_dirty_tracking = 1160 vfio_devices_all_device_dirty_tracking(bcontainer); 1161 uint64_t dirty_pages; 1162 VFIOBitmap vbmap; 1163 int ret; 1164 1165 if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { 1166 cpu_physical_memory_set_dirty_range(ram_addr, size, 1167 tcg_enabled() ? DIRTY_CLIENTS_ALL : 1168 DIRTY_CLIENTS_NOCODE); 1169 return 0; 1170 } 1171 1172 ret = vfio_bitmap_alloc(&vbmap, size); 1173 if (ret) { 1174 error_setg_errno(errp, -ret, 1175 "Failed to allocate dirty tracking bitmap"); 1176 return ret; 1177 } 1178 1179 if (all_device_dirty_tracking) { 1180 ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size, 1181 errp); 1182 } else { 1183 ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size, 1184 errp); 1185 } 1186 1187 if (ret) { 1188 goto out; 1189 } 1190 1191 dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, 1192 vbmap.pages); 1193 1194 trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); 1195 out: 1196 g_free(vbmap.bitmap); 1197 1198 return ret; 1199 } 1200 1201 typedef struct { 1202 IOMMUNotifier n; 1203 VFIOGuestIOMMU *giommu; 1204 } vfio_giommu_dirty_notifier; 1205 1206 static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 1207 { 1208 vfio_giommu_dirty_notifier *gdn = container_of(n, 1209 vfio_giommu_dirty_notifier, n); 1210 VFIOGuestIOMMU *giommu = gdn->giommu; 1211 VFIOContainerBase *bcontainer = giommu->bcontainer; 1212 hwaddr iova = iotlb->iova + giommu->iommu_offset; 1213 ram_addr_t translated_addr; 1214 Error *local_err = NULL; 1215 int ret = -EINVAL; 1216 1217 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); 1218 1219 if (iotlb->target_as != &address_space_memory) { 1220 error_report("Wrong target AS \"%s\", only system memory is allowed", 1221 iotlb->target_as->name ? iotlb->target_as->name : "none"); 1222 goto out; 1223 } 1224 1225 rcu_read_lock(); 1226 if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { 1227 error_report_err(local_err); 1228 goto out_unlock; 1229 } 1230 1231 ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, 1232 translated_addr, &local_err); 1233 if (ret) { 1234 error_prepend(&local_err, 1235 "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " 1236 "0x%"HWADDR_PRIx") failed - ", bcontainer, iova, 1237 iotlb->addr_mask + 1); 1238 error_report_err(local_err); 1239 } 1240 1241 out_unlock: 1242 rcu_read_unlock(); 1243 1244 out: 1245 if (ret) { 1246 vfio_set_migration_error(ret); 1247 } 1248 } 1249 1250 static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, 1251 void *opaque) 1252 { 1253 const hwaddr size = int128_get64(section->size); 1254 const hwaddr iova = section->offset_within_address_space; 1255 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + 1256 section->offset_within_region; 1257 VFIORamDiscardListener *vrdl = opaque; 1258 Error *local_err = NULL; 1259 int ret; 1260 1261 /* 1262 * Sync the whole mapped region (spanning multiple individual mappings) 1263 * in one go. 1264 */ 1265 ret = vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr, 1266 &local_err); 1267 if (ret) { 1268 error_report_err(local_err); 1269 } 1270 return ret; 1271 } 1272 1273 static int 1274 vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, 1275 MemoryRegionSection *section) 1276 { 1277 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); 1278 VFIORamDiscardListener *vrdl = NULL; 1279 1280 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { 1281 if (vrdl->mr == section->mr && 1282 vrdl->offset_within_address_space == 1283 section->offset_within_address_space) { 1284 break; 1285 } 1286 } 1287 1288 if (!vrdl) { 1289 hw_error("vfio: Trying to sync missing RAM discard listener"); 1290 } 1291 1292 /* 1293 * We only want/can synchronize the bitmap for actually mapped parts - 1294 * which correspond to populated parts. Replay all populated parts. 1295 */ 1296 return ram_discard_manager_replay_populated(rdm, section, 1297 vfio_ram_discard_get_dirty_bitmap, 1298 &vrdl); 1299 } 1300 1301 static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer, 1302 MemoryRegionSection *section) 1303 { 1304 VFIOGuestIOMMU *giommu; 1305 bool found = false; 1306 Int128 llend; 1307 vfio_giommu_dirty_notifier gdn; 1308 int idx; 1309 1310 QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { 1311 if (MEMORY_REGION(giommu->iommu_mr) == section->mr && 1312 giommu->n.start == section->offset_within_region) { 1313 found = true; 1314 break; 1315 } 1316 } 1317 1318 if (!found) { 1319 return 0; 1320 } 1321 1322 gdn.giommu = giommu; 1323 idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr, 1324 MEMTXATTRS_UNSPECIFIED); 1325 1326 llend = int128_add(int128_make64(section->offset_within_region), 1327 section->size); 1328 llend = int128_sub(llend, int128_one()); 1329 1330 iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP, 1331 section->offset_within_region, int128_get64(llend), 1332 idx); 1333 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n); 1334 1335 return 0; 1336 } 1337 1338 static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, 1339 MemoryRegionSection *section, Error **errp) 1340 { 1341 ram_addr_t ram_addr; 1342 1343 if (memory_region_is_iommu(section->mr)) { 1344 return vfio_sync_iommu_dirty_bitmap(bcontainer, section); 1345 } else if (memory_region_has_ram_discard_manager(section->mr)) { 1346 int ret; 1347 1348 ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); 1349 if (ret) { 1350 error_setg(errp, 1351 "Failed to sync dirty bitmap with RAM discard listener"); 1352 } 1353 return ret; 1354 } 1355 1356 ram_addr = memory_region_get_ram_addr(section->mr) + 1357 section->offset_within_region; 1358 1359 return vfio_get_dirty_bitmap(bcontainer, 1360 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), 1361 int128_get64(section->size), ram_addr, errp); 1362 } 1363 1364 static void vfio_listener_log_sync(MemoryListener *listener, 1365 MemoryRegionSection *section) 1366 { 1367 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, 1368 listener); 1369 int ret; 1370 Error *local_err = NULL; 1371 1372 if (vfio_listener_skipped_section(section)) { 1373 return; 1374 } 1375 1376 if (vfio_devices_all_dirty_tracking(bcontainer)) { 1377 ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err); 1378 if (ret) { 1379 error_report_err(local_err); 1380 vfio_set_migration_error(ret); 1381 } 1382 } 1383 } 1384 1385 const MemoryListener vfio_memory_listener = { 1386 .name = "vfio", 1387 .region_add = vfio_listener_region_add, 1388 .region_del = vfio_listener_region_del, 1389 .log_global_start = vfio_listener_log_global_start, 1390 .log_global_stop = vfio_listener_log_global_stop, 1391 .log_sync = vfio_listener_log_sync, 1392 }; 1393 1394 void vfio_reset_handler(void *opaque) 1395 { 1396 VFIODevice *vbasedev; 1397 1398 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 1399 if (vbasedev->dev->realized) { 1400 vbasedev->ops->vfio_compute_needs_reset(vbasedev); 1401 } 1402 } 1403 1404 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { 1405 if (vbasedev->dev->realized && vbasedev->needs_reset) { 1406 vbasedev->ops->vfio_hot_reset_multi(vbasedev); 1407 } 1408 } 1409 } 1410 1411 int vfio_kvm_device_add_fd(int fd, Error **errp) 1412 { 1413 #ifdef CONFIG_KVM 1414 struct kvm_device_attr attr = { 1415 .group = KVM_DEV_VFIO_FILE, 1416 .attr = KVM_DEV_VFIO_FILE_ADD, 1417 .addr = (uint64_t)(unsigned long)&fd, 1418 }; 1419 1420 if (!kvm_enabled()) { 1421 return 0; 1422 } 1423 1424 if (vfio_kvm_device_fd < 0) { 1425 struct kvm_create_device cd = { 1426 .type = KVM_DEV_TYPE_VFIO, 1427 }; 1428 1429 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { 1430 error_setg_errno(errp, errno, "Failed to create KVM VFIO device"); 1431 return -errno; 1432 } 1433 1434 vfio_kvm_device_fd = cd.fd; 1435 } 1436 1437 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1438 error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device", 1439 fd); 1440 return -errno; 1441 } 1442 #endif 1443 return 0; 1444 } 1445 1446 int vfio_kvm_device_del_fd(int fd, Error **errp) 1447 { 1448 #ifdef CONFIG_KVM 1449 struct kvm_device_attr attr = { 1450 .group = KVM_DEV_VFIO_FILE, 1451 .attr = KVM_DEV_VFIO_FILE_DEL, 1452 .addr = (uint64_t)(unsigned long)&fd, 1453 }; 1454 1455 if (vfio_kvm_device_fd < 0) { 1456 error_setg(errp, "KVM VFIO device isn't created yet"); 1457 return -EINVAL; 1458 } 1459 1460 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 1461 error_setg_errno(errp, errno, 1462 "Failed to remove fd %d from KVM VFIO device", fd); 1463 return -errno; 1464 } 1465 #endif 1466 return 0; 1467 } 1468 1469 VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) 1470 { 1471 VFIOAddressSpace *space; 1472 1473 QLIST_FOREACH(space, &vfio_address_spaces, list) { 1474 if (space->as == as) { 1475 return space; 1476 } 1477 } 1478 1479 /* No suitable VFIOAddressSpace, create a new one */ 1480 space = g_malloc0(sizeof(*space)); 1481 space->as = as; 1482 QLIST_INIT(&space->containers); 1483 1484 if (QLIST_EMPTY(&vfio_address_spaces)) { 1485 qemu_register_reset(vfio_reset_handler, NULL); 1486 } 1487 1488 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); 1489 1490 return space; 1491 } 1492 1493 void vfio_put_address_space(VFIOAddressSpace *space) 1494 { 1495 if (!QLIST_EMPTY(&space->containers)) { 1496 return; 1497 } 1498 1499 QLIST_REMOVE(space, list); 1500 g_free(space); 1501 1502 if (QLIST_EMPTY(&vfio_address_spaces)) { 1503 qemu_unregister_reset(vfio_reset_handler, NULL); 1504 } 1505 } 1506 1507 void vfio_address_space_insert(VFIOAddressSpace *space, 1508 VFIOContainerBase *bcontainer) 1509 { 1510 QLIST_INSERT_HEAD(&space->containers, bcontainer, next); 1511 bcontainer->space = space; 1512 } 1513 1514 struct vfio_device_info *vfio_get_device_info(int fd) 1515 { 1516 struct vfio_device_info *info; 1517 uint32_t argsz = sizeof(*info); 1518 1519 info = g_malloc0(argsz); 1520 1521 retry: 1522 info->argsz = argsz; 1523 1524 if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { 1525 g_free(info); 1526 return NULL; 1527 } 1528 1529 if (info->argsz > argsz) { 1530 argsz = info->argsz; 1531 info = g_realloc(info, argsz); 1532 goto retry; 1533 } 1534 1535 return info; 1536 } 1537 1538 bool vfio_attach_device(char *name, VFIODevice *vbasedev, 1539 AddressSpace *as, Error **errp) 1540 { 1541 const VFIOIOMMUClass *ops = 1542 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); 1543 HostIOMMUDevice *hiod = NULL; 1544 1545 if (vbasedev->iommufd) { 1546 ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); 1547 } 1548 1549 assert(ops); 1550 1551 1552 if (!vbasedev->mdev) { 1553 hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); 1554 vbasedev->hiod = hiod; 1555 } 1556 1557 if (!ops->attach_device(name, vbasedev, as, errp)) { 1558 object_unref(hiod); 1559 vbasedev->hiod = NULL; 1560 return false; 1561 } 1562 1563 return true; 1564 } 1565 1566 void vfio_detach_device(VFIODevice *vbasedev) 1567 { 1568 if (!vbasedev->bcontainer) { 1569 return; 1570 } 1571 object_unref(vbasedev->hiod); 1572 VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev); 1573 } 1574