/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/pci.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/runstate.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/misc.h"
#include "migration/blocker.h"
#include "migration/qemu-file.h"
#include "sysemu/tpm.h"

VFIODeviceList vfio_device_list =
    QLIST_HEAD_INITIALIZER(vfio_device_list);
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
int vfio_kvm_device_fd = -1;
#endif

/*
 * Device state interfaces
 */

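/*
 * Return true if at least one VFIO device is present and none of them
 * carries a migration blocker, i.e. migration is possible for the current
 * set of devices.
 */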
bool vfio_mig_active(void)
{
    VFIODevice *vbasedev;

    if (QLIST_EMPTY(&vfio_device_list)) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->migration_blocker) {
            return false;
        }
    }
    return true;
}

static Error *multiple_devices_migration_blocker;

/*
 * Multiple devices migration is allowed only if all devices support P2P
 * migration. Single device migration is allowed regardless of P2P migration
 * support.
 */
static bool vfio_multiple_devices_migration_is_supported(void)
{
    VFIODevice *vbasedev;
    unsigned int device_num = 0;
    bool all_support_p2p = true;

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->migration) {
            device_num++;

            if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
                all_support_p2p = false;
            }
        }
    }

    return all_support_p2p || device_num <= 1;
}

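/*
 * Install a global migration blocker when multiple VFIO devices without P2P
 * migration support are present.  If migration was explicitly enabled for
 * the device, fail with -EINVAL instead of silently blocking.
 */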
int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
{
    int ret;

    if (vfio_multiple_devices_migration_is_supported()) {
        return 0;
    }

    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_setg(errp, "Multiple VFIO devices migration is supported only if "
                         "all of them support P2P migration");
        return -EINVAL;
    }

    if (multiple_devices_migration_blocker) {
        return 0;
    }

    error_setg(&multiple_devices_migration_blocker,
               "Multiple VFIO devices migration is supported only if all of "
               "them support P2P migration");
    ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, errp);

    return ret;
}

void vfio_unblock_multiple_devices_migration(void)
{
    if (!multiple_devices_migration_blocker ||
        !vfio_multiple_devices_migration_is_supported()) {
        return;
    }

    migrate_del_blocker(&multiple_devices_migration_blocker);
}

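/*
 * A vIOMMU is in use when the device's container is attached to an address
 * space other than the global system memory address space.
 */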
bool vfio_viommu_preset(VFIODevice *vbasedev)
{
    return vbasedev->bcontainer->space->as != &address_space_memory;
}

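/* Report a fatal error to the migration core if a migration is running. */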
static void vfio_set_migration_error(int ret)
{
    if (migration_is_running()) {
        migration_file_set_error(ret, NULL);
    }
}

bool vfio_device_state_is_running(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
           migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P;
}

bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ||
           migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
}

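/*
 * Return true if dirty page tracking should be enabled for this container:
 * a migration must be in the active or device-transfer phase, and every
 * device must support migration.  A device that has pre-copy dirty page
 * tracking explicitly disabled opts the container out while it is in a
 * RUNNING or PRE_COPY state.
 */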
static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    if (!migration_is_active() && !migration_is_device()) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
            (vfio_device_state_is_running(vbasedev) ||
             vfio_device_state_is_precopy(vbasedev))) {
            return false;
        }
    }
    return true;
}

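/*
 * Return true if every device in the container supports device-side dirty
 * page tracking and does not have it explicitly disabled.
 */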
bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) {
            return false;
        }
        if (!vbasedev->dirty_pages_supported) {
            return false;
        }
    }

    return true;
}

/*
 * Check if all VFIO devices are running and migration is active, which is
 * essentially equivalent to the migration being in pre-copy phase.
 */
bool
vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    if (!migration_is_active()) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vfio_device_state_is_running(vbasedev) ||
            vfio_device_state_is_precopy(vbasedev)) {
            continue;
        } else {
            return false;
        }
    }
    return true;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space.  These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}

/* Called with rcu_read_lock held.  */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
                               ram_addr_t *ram_addr, bool *read_only,
                               Error **errp)
{
    bool ret, mr_has_discard_manager;

    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
                               &mr_has_discard_manager, errp);
    if (ret && mr_has_discard_manager) {
        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between pages
         * pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more memory
         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
         * exceeded and can be used to mitigate this problem.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate"
                         " this by setting/adjusting RLIMIT_MEMLOCK.");
    }
    return ret;
}

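/*
 * IOMMU notifier callback for vIOMMU regions: translate the IOTLB event
 * into a DMA map or unmap of the corresponding range in the VFIO container.
 */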
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainerBase *bcontainer = giommu->bcontainer;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    void *vaddr;
    int ret;
    Error *local_err = NULL;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        vfio_set_migration_error(-EINVAL);
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
            error_report_err(local_err);
            goto out;
        }
        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the memory object is
         * destroyed and its backing memory munmap-ed.
         */
        ret = vfio_container_dma_map(bcontainer, iova,
                                     iotlb->addr_mask + 1, vaddr,
                                     read_only);
        if (ret) {
            error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
        }
    } else {
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       iotlb->addr_mask + 1, iotlb);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, ret, strerror(-ret));
            vfio_set_migration_error(ret);
        }
    }
out:
    rcu_read_unlock();
}

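/*
 * RamDiscardListener callback: a range was discarded, so unmap it from the
 * container in a single call.
 */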
static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    int ret;

    /* Unmap with a single call. */
    ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL);
    if (ret) {
        error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
                     strerror(-ret));
    }
}

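/*
 * RamDiscardListener callback: a range was populated, so map it into the
 * container chunk by chunk at the listener's granularity, rolling back on
 * failure.
 */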
static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_container_dma_map(bcontainer, iova, next - start,
                                     vaddr, section->readonly);
        if (ret) {
            /* Rollback */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}

static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer,
                                               MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl;

    /* Ignore some corner cases not relevant in practice. */
    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
                             TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));

    vrdl = g_new0(VFIORamDiscardListener, 1);
    vrdl->bcontainer = bcontainer;
    vrdl->mr = section->mr;
    vrdl->offset_within_address_space = section->offset_within_address_space;
    vrdl->size = int128_get64(section->size);
    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
                                                                section->mr);

    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
    g_assert(bcontainer->pgsizes &&
             vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));

    ram_discard_listener_init(&vrdl->listener,
                              vfio_ram_discard_notify_populate,
                              vfio_ram_discard_notify_discard, true);
    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
    QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we could
     * exceed the maximum number of possible DMA mappings over time. We assume
     * that each mapped section in the same address space as a RamDiscardManager
     * section consumes exactly one DMA mapping, with the exception of
     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
     * in the same address space as RamDiscardManager sections.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
    if (bcontainer->dma_max_mappings) {
        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;

#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            max_memslots = kvm_get_max_memslots();
        }
#endif

        QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
            hwaddr start, end;

            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
                                    vrdl->granularity);
            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
                           vrdl->granularity);
            vrdl_mappings += (end - start) / vrdl->granularity;
            vrdl_count++;
        }

        if (vrdl_mappings + max_memslots - vrdl_count >
            bcontainer->dma_max_mappings) {
            warn_report("%s: possibly running out of DMA mappings. E.g., try"
                        " increasing the 'block-size' of virtio-mem devices."
                        " Maximum possible DMA mappings: %d, Maximum possible"
                        " memslots: %d", __func__, bcontainer->dma_max_mappings,
                        max_memslots);
        }
    }
}

static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}

static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!TPM_IS_CRB(mr->owner)) {
        return false;
    }

    /* this is a known safe misaligned region, just trace for debug purpose */
    trace_vfio_known_safe_misalignment(memory_region_name(mr),
                                       section->offset_within_address_space,
                                       section->offset_within_region,
                                       qemu_real_host_page_size());
    return true;
}

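/*
 * Return true if the section should be handled by the VFIO memory listener:
 * skipped section types are filtered out, and misaligned sections are
 * rejected unless they are known to be safe (e.g. the TPM CRB region).
 */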
static bool vfio_listener_valid_section(MemoryRegionSection *section,
                                        const char *name)
{
    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_skip(name,
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return false;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask()) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
        if (!vfio_known_safe_misalignment(section)) {
            error_report("%s received unaligned region %s iova=0x%"PRIx64
                         " offset_within_region=0x%"PRIx64
                         " qemu_real_host_page_size=0x%"PRIxPTR,
                         __func__, memory_region_name(section->mr),
                         section->offset_within_address_space,
                         section->offset_within_region,
                         qemu_real_host_page_size());
        }
        return false;
    }

    return true;
}

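/*
 * Compute the host-page-aligned IOVA range covered by a section.  Returns
 * false if the range is empty after alignment; *out_end is the last
 * (inclusive) IOVA of the range.
 */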
static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        hwaddr *out_iova, hwaddr *out_end,
                                        Int128 *out_llend)
{
    Int128 llend;
    hwaddr iova;

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));

    if (int128_ge(int128_make64(iova), llend)) {
        return false;
    }

    *out_iova = iova;
    *out_end = int128_get64(int128_sub(llend, int128_one()));
    if (out_llend) {
        *out_llend = llend;
    }
    return true;
}

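/*
 * MemoryListener.region_add() callback: DMA-map a new section into the
 * container.  vIOMMU regions get an IOMMU notifier instead of a static
 * mapping, and RAM regions with a RamDiscardManager are mapped via a
 * RamDiscardListener so that only populated parts are mapped.
 */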
static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    Error *err = NULL;

    if (!vfio_listener_valid_section(section, "region_add")) {
        return;
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        if (memory_region_is_ram_device(section->mr)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                qemu_real_host_page_size());
        }
        return;
    }

    if (!vfio_container_add_section_window(bcontainer, section, &err)) {
        goto fail;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu_mr = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->bcontainer = bcontainer;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    /*
     * For RAM memory regions with a RamDiscardManager, we only want to map the
     * actually populated parts - and update the mapping whenever we're notified
     * about changes.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_register_ram_discard_listener(bcontainer, section);
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;

        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
                                 vaddr, section->readonly);
    if (ret) {
        error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%s)",
                   bcontainer, iova, int128_get64(llsize), vaddr, ret,
                   strerror(-ret));
        if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
            error_report_err(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (memory_region_is_ram_device(section->mr)) {
        error_reportf_err(err, "PCI p2p may not work: ");
        return;
    }
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail.  Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    if (!bcontainer->initialized) {
        if (!bcontainer->error) {
            error_propagate_prepend(&bcontainer->error, err,
                                    "Region %s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}

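/*
 * MemoryListener.region_del() callback: tear down whatever region_add() set
 * up for this section, i.e. unregister IOMMU notifiers or RAM discard
 * listeners and DMA-unmap the range.
 */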
static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (!vfio_listener_valid_section(section, "region_del")) {
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        trace_vfio_listener_region_del_iommu(section->mr->name);
        QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;

        pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_unregister_ram_discard_listener(bcontainer, section);
        /* Unregistering will trigger an unmap. */
        try_unmap = false;
    }

    if (try_unmap) {
        if (int128_eq(llsize, int128_2_64())) {
            /* The unmap ioctl doesn't accept a full 64-bit span. */
            llsize = int128_rshift(llsize, 1);
            ret = vfio_container_dma_unmap(bcontainer, iova,
                                           int128_get64(llsize), NULL);
            if (ret) {
                error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx") = %d (%s)",
                             bcontainer, iova, int128_get64(llsize), ret,
                             strerror(-ret));
            }
            iova += int128_get64(llsize);
        }
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       int128_get64(llsize), NULL);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova, int128_get64(llsize), ret,
                         strerror(-ret));
        }
    }

    memory_region_unref(section->mr);

    vfio_container_del_section_window(bcontainer, section);
}

typedef struct VFIODirtyRanges {
    hwaddr min32;
    hwaddr max32;
    hwaddr min64;
    hwaddr max64;
    hwaddr minpci64;
    hwaddr maxpci64;
} VFIODirtyRanges;

typedef struct VFIODirtyRangesListener {
    VFIOContainerBase *bcontainer;
    VFIODirtyRanges ranges;
    MemoryListener listener;
} VFIODirtyRangesListener;

static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
                                     VFIOContainerBase *bcontainer)
{
    VFIOPCIDevice *pcidev;
    VFIODevice *vbasedev;
    Object *owner;

    owner = memory_region_owner(section->mr);

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
            continue;
        }
        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
        if (OBJECT(pcidev) == owner) {
            return true;
        }
    }

    return false;
}

static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
                                             hwaddr iova, hwaddr end,
                                             bool update_pci)
{
    hwaddr *min, *max;

    /*
     * The address space passed to the dirty tracker is reduced to three ranges:
     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
     * PCI 64-bit hole.
     *
     * The underlying reports of dirty will query a sub-interval of each of
     * these ranges.
     *
     * The purpose of the three range handling is to handle known cases of big
     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
     * OVMF) which may relocate the pci-hole64 to the end of the address space.
     * The latter would otherwise generate large ranges for tracking, stressing
     * the limits of supported hardware. The pci-hole32 will always be below 4G
     * (overlapping or not) so it doesn't need special handling and is part of
     * the 32-bit range.
     *
     * The alternative would be an IOVATree but that has a much bigger runtime
     * overhead and unnecessary complexity.
     */
    if (update_pci && iova >= UINT32_MAX) {
        min = &range->minpci64;
        max = &range->maxpci64;
    } else {
        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
    }
    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }

    trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
}

static void vfio_dirty_tracking_update(MemoryListener *listener,
                                       MemoryRegionSection *section)
{
    VFIODirtyRangesListener *dirty =
        container_of(listener, VFIODirtyRangesListener, listener);
    hwaddr iova, end;

    if (!vfio_listener_valid_section(section, "tracking_update") ||
        !vfio_get_section_iova_range(dirty->bcontainer, section,
                                     &iova, &end, NULL)) {
        return;
    }

    vfio_dirty_tracking_update_range(&dirty->ranges, iova, end,
                      vfio_section_is_vfio_pci(section, dirty->bcontainer));
}

static const MemoryListener vfio_dirty_tracking_listener = {
    .name = "vfio-tracking",
    .region_add = vfio_dirty_tracking_update,
};

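/*
 * Compute the IOVA ranges to pass to the dirty tracker by replaying the
 * container's address space through a temporary, synchronous memory
 * listener.
 */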
vfio_dirty_tracking_init(VFIOContainerBase * bcontainer,VFIODirtyRanges * ranges)902c7b313d3SEric Auger static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
90362c1b002SJoao Martins                                      VFIODirtyRanges *ranges)
90462c1b002SJoao Martins {
90562c1b002SJoao Martins     VFIODirtyRangesListener dirty;
90662c1b002SJoao Martins 
90762c1b002SJoao Martins     memset(&dirty, 0, sizeof(dirty));
90862c1b002SJoao Martins     dirty.ranges.min32 = UINT32_MAX;
90962c1b002SJoao Martins     dirty.ranges.min64 = UINT64_MAX;
910a31fe5daSJoao Martins     dirty.ranges.minpci64 = UINT64_MAX;
91162c1b002SJoao Martins     dirty.listener = vfio_dirty_tracking_listener;
912c7b313d3SEric Auger     dirty.bcontainer = bcontainer;
91362c1b002SJoao Martins 
91462c1b002SJoao Martins     memory_listener_register(&dirty.listener,
915c7b313d3SEric Auger                              bcontainer->space->as);
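    /*
     * memory_listener_register() synchronously replays ->region_add for
     * every section already mapped in the address space, so dirty.ranges
     * is fully populated by the time it returns.
     */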
91662c1b002SJoao Martins 
91762c1b002SJoao Martins     *ranges = dirty.ranges;
91862c1b002SJoao Martins 
91962c1b002SJoao Martins     /*
92062c1b002SJoao Martins      * The memory listener is synchronous and only used to calculate the
92162c1b002SJoao Martins      * ranges for dirty tracking. Unregister it once we are done, as we
92262c1b002SJoao Martins      * are not interested in any follow-up updates.
92362c1b002SJoao Martins      */
92462c1b002SJoao Martins     memory_listener_unregister(&dirty.listener);
92562c1b002SJoao Martins }
92662c1b002SJoao Martins 
927c7b313d3SEric Auger static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
9285255bbf4SJoao Martins {
9295255bbf4SJoao Martins     uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
9305255bbf4SJoao Martins                               sizeof(uint64_t))] = {};
9315255bbf4SJoao Martins     struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
9325255bbf4SJoao Martins     VFIODevice *vbasedev;
9335255bbf4SJoao Martins 
9345255bbf4SJoao Martins     feature->argsz = sizeof(buf);
9355255bbf4SJoao Martins     feature->flags = VFIO_DEVICE_FEATURE_SET |
9365255bbf4SJoao Martins                      VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
9375255bbf4SJoao Martins 
9383e6015d1SZhenzhong Duan     QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
9395255bbf4SJoao Martins         if (!vbasedev->dirty_tracking) {
9405255bbf4SJoao Martins             continue;
9415255bbf4SJoao Martins         }
9425255bbf4SJoao Martins 
9435255bbf4SJoao Martins         if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
9445255bbf4SJoao Martins             warn_report("%s: Failed to stop DMA logging, err %d (%s)",
9455255bbf4SJoao Martins                         vbasedev->name, -errno, strerror(errno));
9465255bbf4SJoao Martins         }
9475255bbf4SJoao Martins         vbasedev->dirty_tracking = false;
9485255bbf4SJoao Martins     }
9495255bbf4SJoao Martins }
9505255bbf4SJoao Martins 
9515255bbf4SJoao Martins static struct vfio_device_feature *
952c7b313d3SEric Auger vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
9535255bbf4SJoao Martins                                              VFIODirtyRanges *tracking)
9545255bbf4SJoao Martins {
9555255bbf4SJoao Martins     struct vfio_device_feature *feature;
9565255bbf4SJoao Martins     size_t feature_size;
9575255bbf4SJoao Martins     struct vfio_device_feature_dma_logging_control *control;
9585255bbf4SJoao Martins     struct vfio_device_feature_dma_logging_range *ranges;
9595255bbf4SJoao Martins 
9605255bbf4SJoao Martins     feature_size = sizeof(struct vfio_device_feature) +
9615255bbf4SJoao Martins                    sizeof(struct vfio_device_feature_dma_logging_control);
9625255bbf4SJoao Martins     feature = g_try_malloc0(feature_size);
9635255bbf4SJoao Martins     if (!feature) {
9645255bbf4SJoao Martins         errno = ENOMEM;
9655255bbf4SJoao Martins         return NULL;
9665255bbf4SJoao Martins     }
9675255bbf4SJoao Martins     feature->argsz = feature_size;
9685255bbf4SJoao Martins     feature->flags = VFIO_DEVICE_FEATURE_SET |
9695255bbf4SJoao Martins                      VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
9705255bbf4SJoao Martins 
9715255bbf4SJoao Martins     control = (struct vfio_device_feature_dma_logging_control *)feature->data;
9725255bbf4SJoao Martins     control->page_size = qemu_real_host_page_size();
9735255bbf4SJoao Martins 
9745255bbf4SJoao Martins     /*
9755255bbf4SJoao Martins      * The DMA logging uAPI guarantees support for at least as many ranges
9765255bbf4SJoao Martins      * as fit into a single host kernel base page.
9775255bbf4SJoao Martins      */
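    /*
     * For example (illustrative numbers): with 4KiB base pages and 16-byte
     * struct vfio_device_feature_dma_logging_range entries, that is at
     * least 256 ranges, while at most three are requested here.
     */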
978a31fe5daSJoao Martins     control->num_ranges = !!tracking->max32 + !!tracking->max64 +
979a31fe5daSJoao Martins         !!tracking->maxpci64;
9805255bbf4SJoao Martins     ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
9815255bbf4SJoao Martins                         control->num_ranges);
9825255bbf4SJoao Martins     if (!ranges) {
9835255bbf4SJoao Martins         g_free(feature);
9845255bbf4SJoao Martins         errno = ENOMEM;
9855255bbf4SJoao Martins 
9865255bbf4SJoao Martins         return NULL;
9875255bbf4SJoao Martins     }
9885255bbf4SJoao Martins 
989592d0bc0SPaolo Bonzini     control->ranges = (uintptr_t)ranges;
9905255bbf4SJoao Martins     if (tracking->max32) {
9915255bbf4SJoao Martins         ranges->iova = tracking->min32;
9925255bbf4SJoao Martins         ranges->length = (tracking->max32 - tracking->min32) + 1;
9935255bbf4SJoao Martins         ranges++;
9945255bbf4SJoao Martins     }
9955255bbf4SJoao Martins     if (tracking->max64) {
9965255bbf4SJoao Martins         ranges->iova = tracking->min64;
9975255bbf4SJoao Martins         ranges->length = (tracking->max64 - tracking->min64) + 1;
998a31fe5daSJoao Martins         ranges++;
999a31fe5daSJoao Martins     }
1000a31fe5daSJoao Martins     if (tracking->maxpci64) {
1001a31fe5daSJoao Martins         ranges->iova = tracking->minpci64;
1002a31fe5daSJoao Martins         ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
10035255bbf4SJoao Martins     }
10045255bbf4SJoao Martins 
10055255bbf4SJoao Martins     trace_vfio_device_dirty_tracking_start(control->num_ranges,
10065255bbf4SJoao Martins                                            tracking->min32, tracking->max32,
1007a31fe5daSJoao Martins                                            tracking->min64, tracking->max64,
1008a31fe5daSJoao Martins                                            tracking->minpci64, tracking->maxpci64);
10095255bbf4SJoao Martins 
10105255bbf4SJoao Martins     return feature;
10115255bbf4SJoao Martins }
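
/*
 * Sketch of the payload built above (schematic, not the uAPI definition):
 *
 *   struct vfio_device_feature { argsz, flags = SET | DMA_LOGGING_START }
 *       .data: struct vfio_device_feature_dma_logging_control
 *                  { page_size, num_ranges, ranges }
 *                    .ranges: user pointer to num_ranges entries of
 *                             struct vfio_device_feature_dma_logging_range
 *                                 { iova, length }
 */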
10125255bbf4SJoao Martins 
10135255bbf4SJoao Martins static void vfio_device_feature_dma_logging_start_destroy(
10145255bbf4SJoao Martins     struct vfio_device_feature *feature)
10155255bbf4SJoao Martins {
10165255bbf4SJoao Martins     struct vfio_device_feature_dma_logging_control *control =
10175255bbf4SJoao Martins         (struct vfio_device_feature_dma_logging_control *)feature->data;
10185255bbf4SJoao Martins     struct vfio_device_feature_dma_logging_range *ranges =
10195255bbf4SJoao Martins         (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
10205255bbf4SJoao Martins 
10215255bbf4SJoao Martins     g_free(ranges);
10225255bbf4SJoao Martins     g_free(feature);
10235255bbf4SJoao Martins }
10245255bbf4SJoao Martins 
1025332b9b0dSCédric Le Goater static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
10260f21358fSCédric Le Goater                                           Error **errp)
10275255bbf4SJoao Martins {
10285255bbf4SJoao Martins     struct vfio_device_feature *feature;
10295255bbf4SJoao Martins     VFIODirtyRanges ranges;
10305255bbf4SJoao Martins     VFIODevice *vbasedev;
10315255bbf4SJoao Martins     int ret = 0;
10325255bbf4SJoao Martins 
1033c7b313d3SEric Auger     vfio_dirty_tracking_init(bcontainer, &ranges);
1034c7b313d3SEric Auger     feature = vfio_device_feature_dma_logging_start_create(bcontainer,
10355255bbf4SJoao Martins                                                            &ranges);
10365255bbf4SJoao Martins     if (!feature) {
10370f21358fSCédric Le Goater         error_setg_errno(errp, errno, "Failed to prepare DMA logging");
1038332b9b0dSCédric Le Goater         return false;
10395255bbf4SJoao Martins     }
10405255bbf4SJoao Martins 
10413e6015d1SZhenzhong Duan     QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
10425255bbf4SJoao Martins         if (vbasedev->dirty_tracking) {
10435255bbf4SJoao Martins             continue;
10445255bbf4SJoao Martins         }
10455255bbf4SJoao Martins 
10465255bbf4SJoao Martins         ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
10475255bbf4SJoao Martins         if (ret) {
10485255bbf4SJoao Martins             ret = -errno;
10490f21358fSCédric Le Goater             error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
10500f21358fSCédric Le Goater                              vbasedev->name);
10515255bbf4SJoao Martins             goto out;
10525255bbf4SJoao Martins         }
10535255bbf4SJoao Martins         vbasedev->dirty_tracking = true;
10545255bbf4SJoao Martins     }
10555255bbf4SJoao Martins 
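    /*
     * Starting DMA logging is all-or-nothing: on any failure, roll back
     * and stop logging on the devices where it was already enabled.
     */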
10565255bbf4SJoao Martins out:
10575255bbf4SJoao Martins     if (ret) {
1058c7b313d3SEric Auger         vfio_devices_dma_logging_stop(bcontainer);
10595255bbf4SJoao Martins     }
10605255bbf4SJoao Martins 
10615255bbf4SJoao Martins     vfio_device_feature_dma_logging_start_destroy(feature);
10625255bbf4SJoao Martins 
1063332b9b0dSCédric Le Goater     return ret == 0;
10645255bbf4SJoao Martins }
10655255bbf4SJoao Martins 
10663688fec8SCédric Le Goater static bool vfio_listener_log_global_start(MemoryListener *listener,
10673688fec8SCédric Le Goater                                            Error **errp)
1068758b96b6SKeqian Zhu {
10690f21358fSCédric Le Goater     ERRP_GUARD();
1070c7b313d3SEric Auger     VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1071c7b313d3SEric Auger                                                  listener);
1072332b9b0dSCédric Le Goater     bool ret;
1073758b96b6SKeqian Zhu 
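    /*
     * Prefer per-device DMA logging when every device in the container
     * supports it; otherwise fall back to container-wide dirty page
     * tracking through the IOMMU backend.
     */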
1074c7b313d3SEric Auger     if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
10750f21358fSCédric Le Goater         ret = vfio_devices_dma_logging_start(bcontainer, errp);
10765255bbf4SJoao Martins     } else {
1077332b9b0dSCédric Le Goater         ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0;
10785255bbf4SJoao Martins     }
10795255bbf4SJoao Martins 
1080332b9b0dSCédric Le Goater     if (!ret) {
10810f21358fSCédric Le Goater         error_prepend(errp, "vfio: Could not start dirty page tracking - ");
1082236e0a45SAvihai Horon     }
1083332b9b0dSCédric Le Goater     return ret;
1084758b96b6SKeqian Zhu }
1085758b96b6SKeqian Zhu 
1086758b96b6SKeqian Zhu static void vfio_listener_log_global_stop(MemoryListener *listener)
1087758b96b6SKeqian Zhu {
1088c7b313d3SEric Auger     VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1089c7b313d3SEric Auger                                                  listener);
10900f21358fSCédric Le Goater     Error *local_err = NULL;
10915255bbf4SJoao Martins     int ret = 0;
1092758b96b6SKeqian Zhu 
1093c7b313d3SEric Auger     if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
1094c7b313d3SEric Auger         vfio_devices_dma_logging_stop(bcontainer);
10955255bbf4SJoao Martins     } else {
10960f21358fSCédric Le Goater         ret = vfio_container_set_dirty_page_tracking(bcontainer, false,
10970f21358fSCédric Le Goater                                                      &local_err);
10985255bbf4SJoao Martins     }
10995255bbf4SJoao Martins 
1100236e0a45SAvihai Horon     if (ret) {
11010f21358fSCédric Le Goater         error_prepend(&local_err,
11020f21358fSCédric Le Goater                       "vfio: Could not stop dirty page tracking - ");
11030f21358fSCédric Le Goater         error_report_err(local_err);
1104236e0a45SAvihai Horon         vfio_set_migration_error(ret);
1105236e0a45SAvihai Horon     }
1106758b96b6SKeqian Zhu }
1107758b96b6SKeqian Zhu 
1108b153402aSJoao Martins static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
1109b153402aSJoao Martins                                           hwaddr size, void *bitmap)
1110b153402aSJoao Martins {
1111b153402aSJoao Martins     uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
1112b153402aSJoao Martins                         sizeof(struct vfio_device_feature_dma_logging_report),
1113850051b9SPaolo Bonzini                         sizeof(uint64_t))] = {};
1114b153402aSJoao Martins     struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
1115b153402aSJoao Martins     struct vfio_device_feature_dma_logging_report *report =
1116b153402aSJoao Martins         (struct vfio_device_feature_dma_logging_report *)feature->data;
1117b153402aSJoao Martins 
1118b153402aSJoao Martins     report->iova = iova;
1119b153402aSJoao Martins     report->length = size;
1120b153402aSJoao Martins     report->page_size = qemu_real_host_page_size();
1121592d0bc0SPaolo Bonzini     report->bitmap = (uintptr_t)bitmap;
1122b153402aSJoao Martins 
1123b153402aSJoao Martins     feature->argsz = sizeof(buf);
1124b153402aSJoao Martins     feature->flags = VFIO_DEVICE_FEATURE_GET |
1125b153402aSJoao Martins                      VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
1126b153402aSJoao Martins 
1127b153402aSJoao Martins     if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
1128b153402aSJoao Martins         return -errno;
1129b153402aSJoao Martins     }
1130b153402aSJoao Martins 
1131b153402aSJoao Martins     return 0;
1132b153402aSJoao Martins }
1133b153402aSJoao Martins 
11344517c33cSZhenzhong Duan int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
11352da5f9e4SCédric Le Goater                  VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
1136b153402aSJoao Martins {
1137b153402aSJoao Martins     VFIODevice *vbasedev;
1138b153402aSJoao Martins     int ret;
1139b153402aSJoao Martins 
11403e6015d1SZhenzhong Duan     QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
1141b153402aSJoao Martins         ret = vfio_device_dma_logging_report(vbasedev, iova, size,
1142b153402aSJoao Martins                                              vbmap->bitmap);
1143b153402aSJoao Martins         if (ret) {
11442da5f9e4SCédric Le Goater             error_setg_errno(errp, -ret,
11452da5f9e4SCédric Le Goater                              "%s: Failed to get DMA logging report, iova: "
11462da5f9e4SCédric Le Goater                              "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx,
11472da5f9e4SCédric Le Goater                              vbasedev->name, iova, size);
1148b153402aSJoao Martins 
1149b153402aSJoao Martins             return ret;
1150b153402aSJoao Martins         }
1151b153402aSJoao Martins     }
1152b153402aSJoao Martins 
1153b153402aSJoao Martins     return 0;
1154b153402aSJoao Martins }
1155b153402aSJoao Martins 
11564517c33cSZhenzhong Duan int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
11572da5f9e4SCédric Le Goater                           uint64_t size, ram_addr_t ram_addr, Error **errp)
11586607109fSAvihai Horon {
1159b153402aSJoao Martins     bool all_device_dirty_tracking =
1160e1cac6b2SEric Auger         vfio_devices_all_device_dirty_tracking(bcontainer);
11616fe4f6c9SJoao Martins     uint64_t dirty_pages;
11626607109fSAvihai Horon     VFIOBitmap vbmap;
11636607109fSAvihai Horon     int ret;
11646607109fSAvihai Horon 
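    /*
     * Without any dirty tracking support, be conservative and report the
     * whole range as dirty so migration remains correct, if slower.
     */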
1165e1cac6b2SEric Auger     if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) {
11666607109fSAvihai Horon         cpu_physical_memory_set_dirty_range(ram_addr, size,
11676607109fSAvihai Horon                                             tcg_enabled() ? DIRTY_CLIENTS_ALL :
11686607109fSAvihai Horon                                             DIRTY_CLIENTS_NOCODE);
11696607109fSAvihai Horon         return 0;
11706607109fSAvihai Horon     }
11716607109fSAvihai Horon 
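    /*
     * The bitmap holds one bit per host page; e.g. (illustrative) a 1GiB
     * section with 4KiB pages needs a 32KiB bitmap.
     */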
11726607109fSAvihai Horon     ret = vfio_bitmap_alloc(&vbmap, size);
11736607109fSAvihai Horon     if (ret) {
11742da5f9e4SCédric Le Goater         error_setg_errno(errp, -ret,
11752da5f9e4SCédric Le Goater                          "Failed to allocate dirty tracking bitmap");
11766607109fSAvihai Horon         return ret;
11776607109fSAvihai Horon     }
11786607109fSAvihai Horon 
1179b153402aSJoao Martins     if (all_device_dirty_tracking) {
11802da5f9e4SCédric Le Goater         ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
11812da5f9e4SCédric Le Goater                                               errp);
1182b153402aSJoao Martins     } else {
11832da5f9e4SCédric Le Goater         ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
11842da5f9e4SCédric Le Goater                                                 errp);
1185b153402aSJoao Martins     }
1186b153402aSJoao Martins 
11876607109fSAvihai Horon     if (ret) {
11886607109fSAvihai Horon         goto out;
1189b6dd6504SKirti Wankhede     }
1190b6dd6504SKirti Wankhede 
11916fe4f6c9SJoao Martins     dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
1192725ccd7eSAvihai Horon                                                          vbmap.pages);
1193b6dd6504SKirti Wankhede 
1194e1cac6b2SEric Auger     trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages);
11956607109fSAvihai Horon out:
1196725ccd7eSAvihai Horon     g_free(vbmap.bitmap);
1197b6dd6504SKirti Wankhede 
1198b6dd6504SKirti Wankhede     return ret;
1199b6dd6504SKirti Wankhede }
1200b6dd6504SKirti Wankhede 
12019a04fe09SKirti Wankhede typedef struct {
12029a04fe09SKirti Wankhede     IOMMUNotifier n;
12039a04fe09SKirti Wankhede     VFIOGuestIOMMU *giommu;
12049a04fe09SKirti Wankhede } vfio_giommu_dirty_notifier;
12059a04fe09SKirti Wankhede 
12069a04fe09SKirti Wankhede static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
12079a04fe09SKirti Wankhede {
12089a04fe09SKirti Wankhede     vfio_giommu_dirty_notifier *gdn = container_of(n,
12099a04fe09SKirti Wankhede                                                 vfio_giommu_dirty_notifier, n);
12109a04fe09SKirti Wankhede     VFIOGuestIOMMU *giommu = gdn->giommu;
1211dddf83abSEric Auger     VFIOContainerBase *bcontainer = giommu->bcontainer;
12129a04fe09SKirti Wankhede     hwaddr iova = iotlb->iova + giommu->iommu_offset;
12139a04fe09SKirti Wankhede     ram_addr_t translated_addr;
1214ebb481c0SCédric Le Goater     Error *local_err = NULL;
1215236e0a45SAvihai Horon     int ret = -EINVAL;
12169a04fe09SKirti Wankhede 
12179a04fe09SKirti Wankhede     trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
12189a04fe09SKirti Wankhede 
12199a04fe09SKirti Wankhede     if (iotlb->target_as != &address_space_memory) {
12209a04fe09SKirti Wankhede         error_report("Wrong target AS \"%s\", only system memory is allowed",
12219a04fe09SKirti Wankhede                      iotlb->target_as->name ? iotlb->target_as->name : "none");
1222236e0a45SAvihai Horon         goto out;
12239a04fe09SKirti Wankhede     }
12249a04fe09SKirti Wankhede 
12259a04fe09SKirti Wankhede     rcu_read_lock();
1226ebb481c0SCédric Le Goater     if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
1227ebb481c0SCédric Le Goater         error_report_err(local_err);
122894d12088SCédric Le Goater         goto out_unlock;
122994d12088SCédric Le Goater     }
123094d12088SCédric Le Goater 
1231c7b313d3SEric Auger     ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
12322da5f9e4SCédric Le Goater                                 translated_addr, &local_err);
12339a04fe09SKirti Wankhede     if (ret) {
12342da5f9e4SCédric Le Goater         error_prepend(&local_err,
12352da5f9e4SCédric Le Goater                       "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
12362da5f9e4SCédric Le Goater                       "0x%"HWADDR_PRIx") failed - ", bcontainer, iova,
12372da5f9e4SCédric Le Goater                       iotlb->addr_mask + 1);
12382da5f9e4SCédric Le Goater         error_report_err(local_err);
12399a04fe09SKirti Wankhede     }
124094d12088SCédric Le Goater 
124194d12088SCédric Le Goater out_unlock:
12429a04fe09SKirti Wankhede     rcu_read_unlock();
1243236e0a45SAvihai Horon 
1244236e0a45SAvihai Horon out:
1245236e0a45SAvihai Horon     if (ret) {
1246236e0a45SAvihai Horon         vfio_set_migration_error(ret);
1247236e0a45SAvihai Horon     }
12489a04fe09SKirti Wankhede }
12499a04fe09SKirti Wankhede 
12505e3b981cSDavid Hildenbrand static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
12515e3b981cSDavid Hildenbrand                                              void *opaque)
12525e3b981cSDavid Hildenbrand {
12535e3b981cSDavid Hildenbrand     const hwaddr size = int128_get64(section->size);
12545e3b981cSDavid Hildenbrand     const hwaddr iova = section->offset_within_address_space;
12555e3b981cSDavid Hildenbrand     const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
12565e3b981cSDavid Hildenbrand                                 section->offset_within_region;
12575e3b981cSDavid Hildenbrand     VFIORamDiscardListener *vrdl = opaque;
12582da5f9e4SCédric Le Goater     Error *local_err = NULL;
12592da5f9e4SCédric Le Goater     int ret;
12605e3b981cSDavid Hildenbrand 
12615e3b981cSDavid Hildenbrand     /*
12625e3b981cSDavid Hildenbrand      * Sync the whole mapped region (spanning multiple individual mappings)
12635e3b981cSDavid Hildenbrand      * in one go.
12645e3b981cSDavid Hildenbrand      */
12652da5f9e4SCédric Le Goater     ret = vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr,
12662da5f9e4SCédric Le Goater                                 &local_err);
12672da5f9e4SCédric Le Goater     if (ret) {
12682da5f9e4SCédric Le Goater         error_report_err(local_err);
12692da5f9e4SCédric Le Goater     }
12702da5f9e4SCédric Le Goater     return ret;
12715e3b981cSDavid Hildenbrand }
12725e3b981cSDavid Hildenbrand 
1273dc74a4b0SZhenzhong Duan static int
1274dc74a4b0SZhenzhong Duan vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
12755e3b981cSDavid Hildenbrand                                             MemoryRegionSection *section)
12765e3b981cSDavid Hildenbrand {
12775e3b981cSDavid Hildenbrand     RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
12785e3b981cSDavid Hildenbrand     VFIORamDiscardListener *vrdl = NULL;
12795e3b981cSDavid Hildenbrand 
1280dc74a4b0SZhenzhong Duan     QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
12815e3b981cSDavid Hildenbrand         if (vrdl->mr == section->mr &&
12825e3b981cSDavid Hildenbrand             vrdl->offset_within_address_space ==
12835e3b981cSDavid Hildenbrand             section->offset_within_address_space) {
12845e3b981cSDavid Hildenbrand             break;
12855e3b981cSDavid Hildenbrand         }
12865e3b981cSDavid Hildenbrand     }
12875e3b981cSDavid Hildenbrand 
12885e3b981cSDavid Hildenbrand     if (!vrdl) {
12895e3b981cSDavid Hildenbrand         hw_error("vfio: Trying to sync missing RAM discard listener");
12905e3b981cSDavid Hildenbrand     }
12915e3b981cSDavid Hildenbrand 
12925e3b981cSDavid Hildenbrand     /*
12935e3b981cSDavid Hildenbrand      * We only want/can synchronize the bitmap for the actually mapped
12945e3b981cSDavid Hildenbrand      * parts, which correspond to the populated parts. Replay all of them.
12955e3b981cSDavid Hildenbrand      */
12965e3b981cSDavid Hildenbrand     return ram_discard_manager_replay_populated(rdm, section,
12975e3b981cSDavid Hildenbrand                                               vfio_ram_discard_get_dirty_bitmap,
12985e3b981cSDavid Hildenbrand                                                 vrdl);
12995e3b981cSDavid Hildenbrand }
13005e3b981cSDavid Hildenbrand 
1301723f702bSAvihai Horon static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer,
1302723f702bSAvihai Horon                                         MemoryRegionSection *section)
1303b6dd6504SKirti Wankhede {
13049a04fe09SKirti Wankhede     VFIOGuestIOMMU *giommu;
1305723f702bSAvihai Horon     bool found = false;
1306723f702bSAvihai Horon     Int128 llend;
1307723f702bSAvihai Horon     vfio_giommu_dirty_notifier gdn;
1308723f702bSAvihai Horon     int idx;
13099a04fe09SKirti Wankhede 
1310dddf83abSEric Auger     QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
131144ee6aaaSYi Liu         if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
13129a04fe09SKirti Wankhede             giommu->n.start == section->offset_within_region) {
1313723f702bSAvihai Horon             found = true;
1314723f702bSAvihai Horon             break;
1315723f702bSAvihai Horon         }
1316723f702bSAvihai Horon     }
1317723f702bSAvihai Horon 
1318723f702bSAvihai Horon     if (!found) {
1319723f702bSAvihai Horon         return 0;
1320723f702bSAvihai Horon     }
1321723f702bSAvihai Horon 
1322723f702bSAvihai Horon     gdn.giommu = giommu;
1323723f702bSAvihai Horon     idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
13249a04fe09SKirti Wankhede                                              MEMTXATTRS_UNSPECIFIED);
13259a04fe09SKirti Wankhede 
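    /*
     * Replay all mappings currently present in the vIOMMU: each mapped
     * entry fires vfio_iommu_map_dirty_notify(), which queries the dirty
     * bitmap for that IOVA range and marks the translated RAM dirty.
     */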
13269a04fe09SKirti Wankhede     llend = int128_add(int128_make64(section->offset_within_region),
13279a04fe09SKirti Wankhede                        section->size);
13289a04fe09SKirti Wankhede     llend = int128_sub(llend, int128_one());
13299a04fe09SKirti Wankhede 
1330723f702bSAvihai Horon     iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP,
1331723f702bSAvihai Horon                         section->offset_within_region, int128_get64(llend),
13329a04fe09SKirti Wankhede                         idx);
133344ee6aaaSYi Liu     memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
1334723f702bSAvihai Horon 
13359a04fe09SKirti Wankhede     return 0;
1336723f702bSAvihai Horon }
1337723f702bSAvihai Horon 
1338723f702bSAvihai Horon static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
1339723f702bSAvihai Horon                                   MemoryRegionSection *section, Error **errp)
1340723f702bSAvihai Horon {
1341723f702bSAvihai Horon     ram_addr_t ram_addr;
1342723f702bSAvihai Horon 
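    /*
     * Three cases: vIOMMU-backed sections replay their mappings through an
     * IOMMU notifier, RamDiscardManager-backed sections sync only their
     * populated parts, and plain RAM sections sync directly.
     */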
1343723f702bSAvihai Horon     if (memory_region_is_iommu(section->mr)) {
1344723f702bSAvihai Horon         return vfio_sync_iommu_dirty_bitmap(bcontainer, section);
13455e3b981cSDavid Hildenbrand     } else if (memory_region_has_ram_discard_manager(section->mr)) {
13462da5f9e4SCédric Le Goater         int ret;
13472da5f9e4SCédric Le Goater 
13482da5f9e4SCédric Le Goater         ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
13492da5f9e4SCédric Le Goater         if (ret) {
13502da5f9e4SCédric Le Goater             error_setg(errp,
13512da5f9e4SCédric Le Goater                        "Failed to sync dirty bitmap with RAM discard listener");
13522da5f9e4SCédric Le Goater         }
13532da5f9e4SCédric Le Goater         return ret;
13549a04fe09SKirti Wankhede     }
13559a04fe09SKirti Wankhede 
1356b6dd6504SKirti Wankhede     ram_addr = memory_region_get_ram_addr(section->mr) +
1357b6dd6504SKirti Wankhede                section->offset_within_region;
1358b6dd6504SKirti Wankhede 
1359c7b313d3SEric Auger     return vfio_get_dirty_bitmap(bcontainer,
13601eb7f642SKunkun Jiang                    REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
13612da5f9e4SCédric Le Goater                                  int128_get64(section->size), ram_addr, errp);
1362b6dd6504SKirti Wankhede }
1363b6dd6504SKirti Wankhede 
13644292d501SZenghui Yu static void vfio_listener_log_sync(MemoryListener *listener,
1365b6dd6504SKirti Wankhede         MemoryRegionSection *section)
1366b6dd6504SKirti Wankhede {
1367c7b313d3SEric Auger     VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1368c7b313d3SEric Auger                                                  listener);
1369236e0a45SAvihai Horon     int ret;
13702da5f9e4SCédric Le Goater     Error *local_err = NULL;
1371b6dd6504SKirti Wankhede 
1372b051a3f6SAvihai Horon     if (vfio_listener_skipped_section(section)) {
1373b6dd6504SKirti Wankhede         return;
1374b6dd6504SKirti Wankhede     }
1375b6dd6504SKirti Wankhede 
1376c7b313d3SEric Auger     if (vfio_devices_all_dirty_tracking(bcontainer)) {
13772da5f9e4SCédric Le Goater         ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
1378236e0a45SAvihai Horon         if (ret) {
13792da5f9e4SCédric Le Goater             error_report_err(local_err);
1380236e0a45SAvihai Horon             vfio_set_migration_error(ret);
1381236e0a45SAvihai Horon         }
1382b6dd6504SKirti Wankhede     }
1383b6dd6504SKirti Wankhede }
1384b6dd6504SKirti Wankhede 
13857e63b311SYi Liu const MemoryListener vfio_memory_listener = {
1386142518bdSPeter Xu     .name = "vfio",
1387e2c7d025SEric Auger     .region_add = vfio_listener_region_add,
1388e2c7d025SEric Auger     .region_del = vfio_listener_region_del,
1389758b96b6SKeqian Zhu     .log_global_start = vfio_listener_log_global_start,
1390758b96b6SKeqian Zhu     .log_global_stop = vfio_listener_log_global_stop,
13914292d501SZenghui Yu     .log_sync = vfio_listener_log_sync,
1392e2c7d025SEric Auger };
1393e2c7d025SEric Auger 
1394e2c7d025SEric Auger void vfio_reset_handler(void *opaque)
1395e2c7d025SEric Auger {
1396e2c7d025SEric Auger     VFIODevice *vbasedev;
1397e2c7d025SEric Auger 
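    /*
     * Two passes: first let every realized device compute whether it needs
     * a reset, then perform the (possibly multi-device) hot resets only
     * where needed.
     */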
13989353b6daSVolker Rümelin     QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
13997da624e2SAlex Williamson         if (vbasedev->dev->realized) {
1400e2c7d025SEric Auger             vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1401e2c7d025SEric Auger         }
1402e2c7d025SEric Auger     }
1403e2c7d025SEric Auger 
14049353b6daSVolker Rümelin     QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
14057da624e2SAlex Williamson         if (vbasedev->dev->realized && vbasedev->needs_reset) {
1406e2c7d025SEric Auger             vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1407e2c7d025SEric Auger         }
1408e2c7d025SEric Auger     }
1409e2c7d025SEric Auger }
1410e2c7d025SEric Auger 
14115621c02dSZhenzhong Duan int vfio_kvm_device_add_fd(int fd, Error **errp)
1412e2c7d025SEric Auger {
1413e2c7d025SEric Auger #ifdef CONFIG_KVM
1414e2c7d025SEric Auger     struct kvm_device_attr attr = {
14155621c02dSZhenzhong Duan         .group = KVM_DEV_VFIO_FILE,
14165621c02dSZhenzhong Duan         .attr = KVM_DEV_VFIO_FILE_ADD,
14175621c02dSZhenzhong Duan         .addr = (uint64_t)(unsigned long)&fd,
1418e2c7d025SEric Auger     };
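    /*
     * Linking VFIO file descriptors into the KVM VFIO pseudo device lets
     * KVM account for assigned devices, e.g. enabling WBINVD emulation on
     * x86 when non-coherent DMA is possible.
     */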
1419e2c7d025SEric Auger 
1420e2c7d025SEric Auger     if (!kvm_enabled()) {
14215621c02dSZhenzhong Duan         return 0;
1422e2c7d025SEric Auger     }
1423e2c7d025SEric Auger 
1424e2c7d025SEric Auger     if (vfio_kvm_device_fd < 0) {
1425e2c7d025SEric Auger         struct kvm_create_device cd = {
1426e2c7d025SEric Auger             .type = KVM_DEV_TYPE_VFIO,
1427e2c7d025SEric Auger         };
1428e2c7d025SEric Auger 
1429e2c7d025SEric Auger         if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
14305621c02dSZhenzhong Duan             error_setg_errno(errp, errno, "Failed to create KVM VFIO device");
14315621c02dSZhenzhong Duan             return -errno;
1432e2c7d025SEric Auger         }
1433e2c7d025SEric Auger 
1434e2c7d025SEric Auger         vfio_kvm_device_fd = cd.fd;
1435e2c7d025SEric Auger     }
1436e2c7d025SEric Auger 
1437e2c7d025SEric Auger     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
14385621c02dSZhenzhong Duan         error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device",
14395621c02dSZhenzhong Duan                          fd);
14405621c02dSZhenzhong Duan         return -errno;
1441e2c7d025SEric Auger     }
1442e2c7d025SEric Auger #endif
14435621c02dSZhenzhong Duan     return 0;
14445621c02dSZhenzhong Duan }
14455621c02dSZhenzhong Duan 
14465621c02dSZhenzhong Duan int vfio_kvm_device_del_fd(int fd, Error **errp)
14475621c02dSZhenzhong Duan {
14485621c02dSZhenzhong Duan #ifdef CONFIG_KVM
14495621c02dSZhenzhong Duan     struct kvm_device_attr attr = {
14505621c02dSZhenzhong Duan         .group = KVM_DEV_VFIO_FILE,
14515621c02dSZhenzhong Duan         .attr = KVM_DEV_VFIO_FILE_DEL,
14525621c02dSZhenzhong Duan         .addr = (uint64_t)(unsigned long)&fd,
14535621c02dSZhenzhong Duan     };
14545621c02dSZhenzhong Duan 
14555621c02dSZhenzhong Duan     if (vfio_kvm_device_fd < 0) {
14565621c02dSZhenzhong Duan         error_setg(errp, "KVM VFIO device isn't created yet");
14575621c02dSZhenzhong Duan         return -EINVAL;
14585621c02dSZhenzhong Duan     }
14595621c02dSZhenzhong Duan 
14605621c02dSZhenzhong Duan     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
14615621c02dSZhenzhong Duan         error_setg_errno(errp, errno,
14625621c02dSZhenzhong Duan                          "Failed to remove fd %d from KVM VFIO device", fd);
14635621c02dSZhenzhong Duan         return -errno;
14645621c02dSZhenzhong Duan     }
14655621c02dSZhenzhong Duan #endif
14665621c02dSZhenzhong Duan     return 0;
14675621c02dSZhenzhong Duan }
14685621c02dSZhenzhong Duan 
14697e63b311SYi Liu VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1470e2c7d025SEric Auger {
1471e2c7d025SEric Auger     VFIOAddressSpace *space;
1472e2c7d025SEric Auger 
1473e2c7d025SEric Auger     QLIST_FOREACH(space, &vfio_address_spaces, list) {
1474e2c7d025SEric Auger         if (space->as == as) {
1475e2c7d025SEric Auger             return space;
1476e2c7d025SEric Auger         }
1477e2c7d025SEric Auger     }
1478e2c7d025SEric Auger 
1479e2c7d025SEric Auger     /* No suitable VFIOAddressSpace, create a new one */
1480e2c7d025SEric Auger     space = g_malloc0(sizeof(*space));
1481e2c7d025SEric Auger     space->as = as;
1482e2c7d025SEric Auger     QLIST_INIT(&space->containers);
1483e2c7d025SEric Auger 
1484c8fcb90cSZhenzhong Duan     if (QLIST_EMPTY(&vfio_address_spaces)) {
1485c8fcb90cSZhenzhong Duan         qemu_register_reset(vfio_reset_handler, NULL);
1486c8fcb90cSZhenzhong Duan     }
1487c8fcb90cSZhenzhong Duan 
1488e2c7d025SEric Auger     QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1489e2c7d025SEric Auger 
1490e2c7d025SEric Auger     return space;
1491e2c7d025SEric Auger }
1492e2c7d025SEric Auger 
14937e63b311SYi Liu void vfio_put_address_space(VFIOAddressSpace *space)
1494e2c7d025SEric Auger {
14951eae5b7bSZhenzhong Duan     if (!QLIST_EMPTY(&space->containers)) {
14961eae5b7bSZhenzhong Duan         return;
14971eae5b7bSZhenzhong Duan     }
14981eae5b7bSZhenzhong Duan 
1499e2c7d025SEric Auger     QLIST_REMOVE(space, list);
1500e2c7d025SEric Auger     g_free(space);
15011eae5b7bSZhenzhong Duan 
1502c8fcb90cSZhenzhong Duan     if (QLIST_EMPTY(&vfio_address_spaces)) {
1503c8fcb90cSZhenzhong Duan         qemu_unregister_reset(vfio_reset_handler, NULL);
1504c8fcb90cSZhenzhong Duan     }
1505e2c7d025SEric Auger }
1506e2c7d025SEric Auger 
1507b7b79588SCédric Le Goater void vfio_address_space_insert(VFIOAddressSpace *space,
1508b7b79588SCédric Le Goater                                VFIOContainerBase *bcontainer)
1509b7b79588SCédric Le Goater {
1510b7b79588SCédric Le Goater     QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
151109181a8eSCédric Le Goater     bcontainer->space = space;
1512b7b79588SCédric Le Goater }
1513b7b79588SCédric Le Goater 
1514634f38f0SAlex Williamson struct vfio_device_info *vfio_get_device_info(int fd)
1515634f38f0SAlex Williamson {
1516634f38f0SAlex Williamson     struct vfio_device_info *info;
1517634f38f0SAlex Williamson     uint32_t argsz = sizeof(*info);
1518634f38f0SAlex Williamson 
1519634f38f0SAlex Williamson     info = g_malloc0(argsz);
1520634f38f0SAlex Williamson 
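    /*
     * Standard VFIO argsz handshake: the kernel writes the size it needs
     * into info->argsz; if that exceeds our allocation, grow the buffer
     * and retry so any trailing capability chain fits.
     */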
1521634f38f0SAlex Williamson retry:
1522634f38f0SAlex Williamson     info->argsz = argsz;
1523634f38f0SAlex Williamson 
1524634f38f0SAlex Williamson     if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
1525634f38f0SAlex Williamson         g_free(info);
1526634f38f0SAlex Williamson         return NULL;
1527634f38f0SAlex Williamson     }
1528634f38f0SAlex Williamson 
1529634f38f0SAlex Williamson     if (info->argsz > argsz) {
1530634f38f0SAlex Williamson         argsz = info->argsz;
1531634f38f0SAlex Williamson         info = g_realloc(info, argsz);
1532634f38f0SAlex Williamson         goto retry;
1533634f38f0SAlex Williamson     }
1534634f38f0SAlex Williamson 
1535634f38f0SAlex Williamson     return info;
1536634f38f0SAlex Williamson }
15371eb31f13SEric Auger 
1538b7754835SZhenzhong Duan bool vfio_attach_device(char *name, VFIODevice *vbasedev,
15391eb31f13SEric Auger                         AddressSpace *as, Error **errp)
15401eb31f13SEric Auger {
15419812feefSCédric Le Goater     const VFIOIOMMUClass *ops =
15429812feefSCédric Le Goater         VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
154383a4d596SJoao Martins     HostIOMMUDevice *hiod = NULL;
15441eb31f13SEric Auger 
15455ee3dc7aSYi Liu     if (vbasedev->iommufd) {
1546ce5f6d49SCédric Le Goater         ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
15475ee3dc7aSYi Liu     }
15489812feefSCédric Le Goater 
15499812feefSCédric Le Goater     assert(ops);
15509812feefSCédric Le Goater 
1551a7fd91b8SZhenzhong Duan 
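    /*
     * mdevs are mediated, software-backed devices without their own host
     * IOMMU translation, so a HostIOMMUDevice is only created for
     * physical (non-mdev) devices.
     */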
155283a4d596SJoao Martins     if (!vbasedev->mdev) {
1553a7fd91b8SZhenzhong Duan         hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
155483a4d596SJoao Martins         vbasedev->hiod = hiod;
1555a7fd91b8SZhenzhong Duan     }
1556a7fd91b8SZhenzhong Duan 
155783a4d596SJoao Martins     if (!ops->attach_device(name, vbasedev, as, errp)) {
1558a7fd91b8SZhenzhong Duan         object_unref(hiod);
155983a4d596SJoao Martins         vbasedev->hiod = NULL;
1560a7fd91b8SZhenzhong Duan         return false;
1561a7fd91b8SZhenzhong Duan     }
1562a7fd91b8SZhenzhong Duan 
1563a7fd91b8SZhenzhong Duan     return true;
15641eb31f13SEric Auger }
15651eb31f13SEric Auger 
15661eb31f13SEric Auger void vfio_detach_device(VFIODevice *vbasedev)
15671eb31f13SEric Auger {
15681eb31f13SEric Auger     if (!vbasedev->bcontainer) {
15691eb31f13SEric Auger         return;
15701eb31f13SEric Auger     }
1571a7fd91b8SZhenzhong Duan     object_unref(vbasedev->hiod);
157241d698b8SCédric Le Goater     VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
15731eb31f13SEric Auger }
1574