/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/pci.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/runstate.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/misc.h"
#include "migration/blocker.h"
#include "migration/qemu-file.h"
#include "sysemu/tpm.h"

VFIODeviceList vfio_device_list =
    QLIST_HEAD_INITIALIZER(vfio_device_list);
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM. Once created it lives
 * for the life of the VM. Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm. Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
int vfio_kvm_device_fd = -1;
#endif
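
/*
 * Hedged sketch (not necessarily the exact code path used elsewhere in this
 * file): such a pseudo device is obtained once via the KVM_CREATE_DEVICE
 * ioctl, roughly:
 *
 *     struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
 *     if (ioctl(kvm_fd, KVM_CREATE_DEVICE, &cd) == 0) {
 *         vfio_kvm_device_fd = cd.fd;
 *     }
 *
 * Groups/devices are subsequently attached or detached with
 * KVM_SET_DEVICE_ATTR on that file descriptor.
 */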

/*
 * Device state interfaces
 */

bool vfio_mig_active(void)
{
    VFIODevice *vbasedev;

    if (QLIST_EMPTY(&vfio_device_list)) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->migration_blocker) {
            return false;
        }
    }
    return true;
}

static Error *multiple_devices_migration_blocker;

/*
 * Multiple devices migration is allowed only if all devices support P2P
 * migration. Single device migration is allowed regardless of P2P migration
 * support.
 */
static bool vfio_multiple_devices_migration_is_supported(void)
{
    VFIODevice *vbasedev;
    unsigned int device_num = 0;
    bool all_support_p2p = true;

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->migration) {
            device_num++;

            if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
                all_support_p2p = false;
            }
        }
    }

    return all_support_p2p || device_num <= 1;
}

int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
{
    int ret;

    if (vfio_multiple_devices_migration_is_supported()) {
        return 0;
    }

    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_setg(errp, "Multiple VFIO devices migration is supported only if "
                         "all of them support P2P migration");
        return -EINVAL;
    }

    if (multiple_devices_migration_blocker) {
        return 0;
    }

    error_setg(&multiple_devices_migration_blocker,
               "Multiple VFIO devices migration is supported only if all of "
               "them support P2P migration");
    ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, errp);

    return ret;
}

void vfio_unblock_multiple_devices_migration(void)
{
    if (!multiple_devices_migration_blocker ||
        !vfio_multiple_devices_migration_is_supported()) {
        return;
    }

    migrate_del_blocker(&multiple_devices_migration_blocker);
}

bool vfio_viommu_preset(VFIODevice *vbasedev)
{
    return vbasedev->bcontainer->space->as != &address_space_memory;
}

static void vfio_set_migration_error(int ret)
{
    if (migration_is_running()) {
        migration_file_set_error(ret, NULL);
    }
}

bool vfio_device_state_is_running(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
           migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P;
}

bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ||
           migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
}
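
/*
 * For reference (hedged summary of the VFIO migration uAPI states in
 * <linux/vfio.h>): the two predicates above classify device_state against
 * VFIO_DEVICE_STATE_{RUNNING,RUNNING_P2P} and
 * VFIO_DEVICE_STATE_{PRE_COPY,PRE_COPY_P2P} respectively; the remaining
 * states (e.g. STOP, STOP_COPY, RESUMING) count as neither "running" nor
 * "pre-copy" here.
 */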

static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    if (!migration_is_active() && !migration_is_device()) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
            (vfio_device_state_is_running(vbasedev) ||
             vfio_device_state_is_precopy(vbasedev))) {
            return false;
        }
    }
    return true;
}

bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) {
            return false;
        }
        if (!vbasedev->dirty_pages_supported) {
            return false;
        }
    }

    return true;
}

/*
 * Check if all VFIO devices are running and migration is active, which is
 * essentially equivalent to the migration being in pre-copy phase.
 */
bool
vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    if (!migration_is_active()) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vfio_device_state_is_running(vbasedev) ||
            vfio_device_state_is_precopy(vbasedev)) {
            continue;
        } else {
            return false;
        }
    }
    return true;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space. These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}
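
/*
 * Worked example (illustrative address): sizing a 64-bit BAR means the
 * guest transiently writes all-ones to the BAR registers, so a
 * still-enabled BAR can momentarily decode at an address such as
 * 0xfffffffffe000000. Bit 63 of that address is set, so the check above
 * skips creating an IOMMU mapping for it.
 */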

/* Called with rcu_read_lock held. */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
                               ram_addr_t *ram_addr, bool *read_only,
                               Error **errp)
{
    bool ret, mr_has_discard_manager;

    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
                               &mr_has_discard_manager, errp);
    if (ret && mr_has_discard_manager) {
        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between pages
         * pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more memory
         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
         * exceeded and can be used to mitigate this problem.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate"
                         " by setting/adjusting RLIMIT_MEMLOCK.");
    }
    return ret;
}
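
/*
 * Usage note (hedged, configuration lives outside this code): typical ways
 * to set RLIMIT_MEMLOCK for the QEMU process include "ulimit -l <kbytes>"
 * in the launching shell, LimitMEMLOCK= in a systemd unit, or a libvirt
 * <memtune> hard_limit in the domain definition.
 */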

static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainerBase *bcontainer = giommu->bcontainer;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    void *vaddr;
    int ret;
    Error *local_err = NULL;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        vfio_set_migration_error(-EINVAL);
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
            error_report_err(local_err);
            goto out;
        }
        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the memory object is
         * destroyed and its backing memory munmap-ed.
         */
        ret = vfio_container_dma_map(bcontainer, iova,
                                     iotlb->addr_mask + 1, vaddr,
                                     read_only);
        if (ret) {
            error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
        }
    } else {
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       iotlb->addr_mask + 1, iotlb);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, ret, strerror(-ret));
            vfio_set_migration_error(ret);
        }
    }
out:
    rcu_read_unlock();
}

static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    int ret;

    /* Unmap with a single call. */
    ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL);
    if (ret) {
        error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
                     strerror(-ret));
    }
}

static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_container_dma_map(bcontainer, iova, next - start,
                                     vaddr, section->readonly);
        if (ret) {
            /* Rollback */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}
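
/*
 * Worked example (assumed numbers): with vrdl->granularity = 2 MiB and a
 * populated section covering [1 MiB, 5 MiB) within the region, the loop
 * above issues three maps: [1 MiB, 2 MiB), [2 MiB, 4 MiB) and
 * [4 MiB, 5 MiB). Every chunk boundary falls on a 2 MiB mark inside the
 * region, so a later discard of any single 2 MiB block can be unmapped
 * without having to split an existing mapping.
 */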

static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer,
                                               MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl;

    /* Ignore some corner cases not relevant in practice. */
    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
                             TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));

    vrdl = g_new0(VFIORamDiscardListener, 1);
    vrdl->bcontainer = bcontainer;
    vrdl->mr = section->mr;
    vrdl->offset_within_address_space = section->offset_within_address_space;
    vrdl->size = int128_get64(section->size);
    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
                                                                section->mr);

    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
    g_assert(bcontainer->pgsizes &&
             vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));

    ram_discard_listener_init(&vrdl->listener,
                              vfio_ram_discard_notify_populate,
                              vfio_ram_discard_notify_discard, true);
    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
    QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we could
     * exceed the maximum number of possible DMA mappings over time. We assume
     * that each mapped section in the same address space as a RamDiscardManager
     * section consumes exactly one DMA mapping, with the exception of
     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
     * in the same address space as RamDiscardManager sections.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
    if (bcontainer->dma_max_mappings) {
        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;

#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            max_memslots = kvm_get_max_memslots();
        }
#endif

        QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
            hwaddr start, end;

            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
                                    vrdl->granularity);
            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
                           vrdl->granularity);
            vrdl_mappings += (end - start) / vrdl->granularity;
            vrdl_count++;
        }

        if (vrdl_mappings + max_memslots - vrdl_count >
            bcontainer->dma_max_mappings) {
            warn_report("%s: possibly running out of DMA mappings. E.g., try"
                        " increasing the 'block-size' of virtio-mem devices."
                        " Maximum possible DMA mappings: %d, Maximum possible"
                        " memslots: %d", __func__, bcontainer->dma_max_mappings,
                        max_memslots);
        }
    }
}
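
/*
 * Worked example (assumed numbers): a 64 GiB virtio-mem device with a
 * 2 MiB block size contributes 64 GiB / 2 MiB = 32768 potential mappings
 * (vrdl_mappings). With max_memslots = 512 and vrdl_count = 1, the check
 * compares 32768 + 512 - 1 = 33279 against dma_max_mappings (65535 by
 * default for the legacy VFIO type1 IOMMU), which fits; a 256 GiB device
 * at the same block size (131072 mappings) would trigger the warning.
 */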

static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}

static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!TPM_IS_CRB(mr->owner)) {
        return false;
    }

    /* this is a known safe misaligned region, just trace for debug purpose */
    trace_vfio_known_safe_misalignment(memory_region_name(mr),
                                       section->offset_within_address_space,
                                       section->offset_within_region,
                                       qemu_real_host_page_size());
    return true;
}

static bool vfio_listener_valid_section(MemoryRegionSection *section,
                                        const char *name)
{
    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_skip(name,
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return false;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask()) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
        if (!vfio_known_safe_misalignment(section)) {
            error_report("%s received unaligned region %s iova=0x%"PRIx64
                         " offset_within_region=0x%"PRIx64
                         " qemu_real_host_page_size=0x%"PRIxPTR,
                         __func__, memory_region_name(section->mr),
                         section->offset_within_address_space,
                         section->offset_within_region,
                         qemu_real_host_page_size());
        }
        return false;
    }

    return true;
}

static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        hwaddr *out_iova, hwaddr *out_end,
                                        Int128 *out_llend)
{
    Int128 llend;
    hwaddr iova;

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));

    if (int128_ge(int128_make64(iova), llend)) {
        return false;
    }

    *out_iova = iova;
    *out_end = int128_get64(int128_sub(llend, int128_one()));
    if (out_llend) {
        *out_llend = llend;
    }
    return true;
}
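
/*
 * Worked example (assumed 4 KiB host pages): for a section starting at
 * 0x800 with size 0x1800, the start is rounded up to iova = 0x1000 and the
 * exclusive end 0x2000 is already aligned, yielding the page-aligned range
 * [0x1000, 0x2000). A sub-page section, e.g. start 0x800 size 0x400,
 * rounds to an empty range and the function returns false.
 */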

static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    Error *err = NULL;

    if (!vfio_listener_valid_section(section, "region_add")) {
        return;
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        if (memory_region_is_ram_device(section->mr)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                qemu_real_host_page_size());
        }
        return;
    }

    if (!vfio_container_add_section_window(bcontainer, section, &err)) {
        goto fail;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu_mr = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->bcontainer = bcontainer;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    /*
     * For RAM memory regions with a RamDiscardManager, we only want to map the
     * actually populated parts - and update the mapping whenever we're notified
     * about changes.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_register_ram_discard_listener(bcontainer, section);
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;

        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
                                 vaddr, section->readonly);
    if (ret) {
        error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%s)",
                   bcontainer, iova, int128_get64(llsize), vaddr, ret,
                   strerror(-ret));
        if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
            error_report_err(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (memory_region_is_ram_device(section->mr)) {
        error_reportf_err(err, "PCI p2p may not work: ");
        return;
    }
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    if (!bcontainer->initialized) {
        if (!bcontainer->error) {
            error_propagate_prepend(&bcontainer->error, err,
                                    "Region %s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (!vfio_listener_valid_section(section, "region_del")) {
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        trace_vfio_listener_region_del_iommu(section->mr->name);
        QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;

        pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_unregister_ram_discard_listener(bcontainer, section);
        /* Unregistering will trigger an unmap. */
        try_unmap = false;
    }

    if (try_unmap) {
        if (int128_eq(llsize, int128_2_64())) {
            /* The unmap ioctl doesn't accept a full 64-bit span. */
            llsize = int128_rshift(llsize, 1);
            ret = vfio_container_dma_unmap(bcontainer, iova,
                                           int128_get64(llsize), NULL);
            if (ret) {
                error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx") = %d (%s)",
                             bcontainer, iova, int128_get64(llsize), ret,
                             strerror(-ret));
            }
            iova += int128_get64(llsize);
        }
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       int128_get64(llsize), NULL);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova, int128_get64(llsize), ret,
                         strerror(-ret));
        }
    }

    memory_region_unref(section->mr);

    vfio_container_del_section_window(bcontainer, section);
}

typedef struct VFIODirtyRanges {
    hwaddr min32;
    hwaddr max32;
    hwaddr min64;
    hwaddr max64;
    hwaddr minpci64;
    hwaddr maxpci64;
} VFIODirtyRanges;

typedef struct VFIODirtyRangesListener {
    VFIOContainerBase *bcontainer;
    VFIODirtyRanges ranges;
    MemoryListener listener;
} VFIODirtyRangesListener;

static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
                                     VFIOContainerBase *bcontainer)
{
    VFIOPCIDevice *pcidev;
    VFIODevice *vbasedev;
    Object *owner;

    owner = memory_region_owner(section->mr);

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
            continue;
        }
        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
        if (OBJECT(pcidev) == owner) {
            return true;
        }
    }

    return false;
}

static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
                                             hwaddr iova, hwaddr end,
                                             bool update_pci)
{
    hwaddr *min, *max;

    /*
     * The address space passed to the dirty tracker is reduced to three ranges:
     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
     * PCI 64-bit hole.
     *
     * The underlying reports of dirty will query a sub-interval of each of
     * these ranges.
     *
     * The purpose of the three range handling is to handle known cases of big
     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
     * OVMF) which may relocate the pci-hole64 to the end of the address space.
     * The latter would otherwise generate large ranges for tracking, stressing
     * the limits of supported hardware. The pci-hole32 will always be below 4G
     * (overlapping or not) so it doesn't need special handling and is part of
     * the 32-bit range.
     *
     * The alternative would be an IOVATree but that has a much bigger runtime
     * overhead and unnecessary complexity.
     */
    if (update_pci && iova >= UINT32_MAX) {
        min = &range->minpci64;
        max = &range->maxpci64;
    } else {
        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
    }
    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }

    trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
}
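
/*
 * Worked example (assumed layout): guest RAM at [0, 2 GiB) lands in
 * min32/max32, RAM placed above the AMD 1T hole at [1 TiB, 1.5 TiB) lands
 * in min64/max64, and a vfio-pci BAR relocated by firmware near the top of
 * the address space lands in minpci64/maxpci64. Tracking three compact
 * ranges avoids asking the device to track one enormous span from 0 up to
 * the highest mapped address.
 */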

static void vfio_dirty_tracking_update(MemoryListener *listener,
                                       MemoryRegionSection *section)
{
    VFIODirtyRangesListener *dirty =
        container_of(listener, VFIODirtyRangesListener, listener);
    hwaddr iova, end;

    if (!vfio_listener_valid_section(section, "tracking_update") ||
        !vfio_get_section_iova_range(dirty->bcontainer, section,
                                     &iova, &end, NULL)) {
        return;
    }

    vfio_dirty_tracking_update_range(&dirty->ranges, iova, end,
                      vfio_section_is_vfio_pci(section, dirty->bcontainer));
}

static const MemoryListener vfio_dirty_tracking_listener = {
    .name = "vfio-tracking",
    .region_add = vfio_dirty_tracking_update,
};

static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
                                     VFIODirtyRanges *ranges)
{
    VFIODirtyRangesListener dirty;

    memset(&dirty, 0, sizeof(dirty));
    dirty.ranges.min32 = UINT32_MAX;
    dirty.ranges.min64 = UINT64_MAX;
    dirty.ranges.minpci64 = UINT64_MAX;
    dirty.listener = vfio_dirty_tracking_listener;
    dirty.bcontainer = bcontainer;

    memory_listener_register(&dirty.listener,
                             bcontainer->space->as);

    *ranges = dirty.ranges;

    /*
     * The memory listener is synchronous, and used to calculate the range
     * to dirty tracking. Unregister it after we are done as we are not
     * interested in any follow-up updates.
     */
    memory_listener_unregister(&dirty.listener);
}
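
/*
 * Note (hedged): this works because memory_listener_register() synchronously
 * replays the current address space contents, invoking .region_add for every
 * existing section before returning; by then dirty.ranges already covers
 * everything mapped, so the listener can be dropped immediately.
 */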

static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    VFIODevice *vbasedev;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (!vbasedev->dirty_tracking) {
            continue;
        }

        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            warn_report("%s: Failed to stop DMA logging, err %d (%s)",
                        vbasedev->name, -errno, strerror(errno));
        }
        vbasedev->dirty_tracking = false;
    }
}
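
/*
 * Note (hedged): declaring the buffer as a uint64_t array sized with
 * DIV_ROUND_UP is a common idiom to get storage for struct
 * vfio_device_feature with 8-byte alignment; the STOP feature carries no
 * payload, so only the header is passed to the VFIO_DEVICE_FEATURE ioctl.
 */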

static struct vfio_device_feature *
vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
                                             VFIODirtyRanges *tracking)
{
    struct vfio_device_feature *feature;
    size_t feature_size;
    struct vfio_device_feature_dma_logging_control *control;
    struct vfio_device_feature_dma_logging_range *ranges;

    feature_size = sizeof(struct vfio_device_feature) +
                   sizeof(struct vfio_device_feature_dma_logging_control);
    feature = g_try_malloc0(feature_size);
    if (!feature) {
        errno = ENOMEM;
        return NULL;
    }
    feature->argsz = feature_size;
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
    control->page_size = qemu_real_host_page_size();

    /*
     * DMA logging uAPI guarantees to support at least a number of ranges that
     * fits into a single host kernel base page.
     */
    control->num_ranges = !!tracking->max32 + !!tracking->max64 +
                          !!tracking->maxpci64;
    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
                        control->num_ranges);
    if (!ranges) {
        g_free(feature);
        errno = ENOMEM;

        return NULL;
    }

    control->ranges = (uintptr_t)ranges;
    if (tracking->max32) {
        ranges->iova = tracking->min32;
        ranges->length = (tracking->max32 - tracking->min32) + 1;
        ranges++;
    }
    if (tracking->max64) {
        ranges->iova = tracking->min64;
        ranges->length = (tracking->max64 - tracking->min64) + 1;
        ranges++;
    }
    if (tracking->maxpci64) {
        ranges->iova = tracking->minpci64;
        ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
    }

    trace_vfio_device_dirty_tracking_start(control->num_ranges,
                                           tracking->min32, tracking->max32,
                                           tracking->min64, tracking->max64,
                                           tracking->minpci64, tracking->maxpci64);

    return feature;
}
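
/*
 * Resulting buffer layout (hedged sketch of the uAPI structures declared in
 * <linux/vfio.h>):
 *
 *     struct vfio_device_feature { argsz, flags, data[] }
 *         data[] holds a struct vfio_device_feature_dma_logging_control
 *         { page_size, num_ranges, ranges }, where 'ranges' is a userspace
 *         pointer (carried as a u64) to num_ranges instances of
 *         struct vfio_device_feature_dma_logging_range { iova, length }.
 *
 * The range array is allocated separately and only referenced by pointer,
 * which is why the destroy helper below must free both allocations.
 */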
10125255bbf4SJoao Martins
vfio_device_feature_dma_logging_start_destroy(struct vfio_device_feature * feature)10135255bbf4SJoao Martins static void vfio_device_feature_dma_logging_start_destroy(
10145255bbf4SJoao Martins struct vfio_device_feature *feature)
10155255bbf4SJoao Martins {
10165255bbf4SJoao Martins struct vfio_device_feature_dma_logging_control *control =
10175255bbf4SJoao Martins (struct vfio_device_feature_dma_logging_control *)feature->data;
10185255bbf4SJoao Martins struct vfio_device_feature_dma_logging_range *ranges =
10195255bbf4SJoao Martins (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
10205255bbf4SJoao Martins
10215255bbf4SJoao Martins g_free(ranges);
10225255bbf4SJoao Martins g_free(feature);
10235255bbf4SJoao Martins }
10245255bbf4SJoao Martins
vfio_devices_dma_logging_start(VFIOContainerBase * bcontainer,Error ** errp)1025332b9b0dSCédric Le Goater static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
10260f21358fSCédric Le Goater Error **errp)
10275255bbf4SJoao Martins {
10285255bbf4SJoao Martins struct vfio_device_feature *feature;
10295255bbf4SJoao Martins VFIODirtyRanges ranges;
10305255bbf4SJoao Martins VFIODevice *vbasedev;
10315255bbf4SJoao Martins int ret = 0;
10325255bbf4SJoao Martins
1033c7b313d3SEric Auger vfio_dirty_tracking_init(bcontainer, &ranges);
1034c7b313d3SEric Auger feature = vfio_device_feature_dma_logging_start_create(bcontainer,
10355255bbf4SJoao Martins &ranges);
10365255bbf4SJoao Martins if (!feature) {
10370f21358fSCédric Le Goater error_setg_errno(errp, errno, "Failed to prepare DMA logging");
1038332b9b0dSCédric Le Goater return false;
10395255bbf4SJoao Martins }
10405255bbf4SJoao Martins
10413e6015d1SZhenzhong Duan QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
10425255bbf4SJoao Martins if (vbasedev->dirty_tracking) {
10435255bbf4SJoao Martins continue;
10445255bbf4SJoao Martins }
10455255bbf4SJoao Martins
10465255bbf4SJoao Martins ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
10475255bbf4SJoao Martins if (ret) {
10485255bbf4SJoao Martins ret = -errno;
10490f21358fSCédric Le Goater error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
10500f21358fSCédric Le Goater vbasedev->name);
10515255bbf4SJoao Martins goto out;
10525255bbf4SJoao Martins }
10535255bbf4SJoao Martins vbasedev->dirty_tracking = true;
10545255bbf4SJoao Martins }
10555255bbf4SJoao Martins
10565255bbf4SJoao Martins out:
10575255bbf4SJoao Martins if (ret) {
1058c7b313d3SEric Auger vfio_devices_dma_logging_stop(bcontainer);
10595255bbf4SJoao Martins }
10605255bbf4SJoao Martins
10615255bbf4SJoao Martins vfio_device_feature_dma_logging_start_destroy(feature);
10625255bbf4SJoao Martins
1063332b9b0dSCédric Le Goater return ret == 0;
10645255bbf4SJoao Martins }
10655255bbf4SJoao Martins
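/*
 * For reference, a minimal sketch of the per-device STOP operation that
 * vfio_devices_dma_logging_stop() (defined earlier in this file and used
 * for rollback above) performs -- assuming the symmetric
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP flag from <linux/vfio.h>.  STOP
 * carries no payload, so a header-sized, 8-byte-aligned buffer suffices.
 */
static void vfio_device_dma_logging_stop_sketch(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        warn_report("%s: Failed to stop DMA logging", vbasedev->name);
    }
    vbasedev->dirty_tracking = false;
}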
10663688fec8SCédric Le Goater static bool vfio_listener_log_global_start(MemoryListener *listener,
10673688fec8SCédric Le Goater Error **errp)
1068758b96b6SKeqian Zhu {
10690f21358fSCédric Le Goater ERRP_GUARD();
1070c7b313d3SEric Auger VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1071c7b313d3SEric Auger listener);
1072332b9b0dSCédric Le Goater bool ret;
1073758b96b6SKeqian Zhu
1074c7b313d3SEric Auger if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
10750f21358fSCédric Le Goater ret = vfio_devices_dma_logging_start(bcontainer, errp);
10765255bbf4SJoao Martins } else {
1077332b9b0dSCédric Le Goater ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0;
10785255bbf4SJoao Martins }
10795255bbf4SJoao Martins
1080332b9b0dSCédric Le Goater if (!ret) {
10810f21358fSCédric Le Goater error_prepend(errp, "vfio: Could not start dirty page tracking - ");
1082236e0a45SAvihai Horon }
1083332b9b0dSCédric Le Goater return ret;
1084758b96b6SKeqian Zhu }
1085758b96b6SKeqian Zhu
1086758b96b6SKeqian Zhu static void vfio_listener_log_global_stop(MemoryListener *listener)
1087758b96b6SKeqian Zhu {
1088c7b313d3SEric Auger VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1089c7b313d3SEric Auger listener);
10900f21358fSCédric Le Goater Error *local_err = NULL;
10915255bbf4SJoao Martins int ret = 0;
1092758b96b6SKeqian Zhu
1093c7b313d3SEric Auger if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
1094c7b313d3SEric Auger vfio_devices_dma_logging_stop(bcontainer);
10955255bbf4SJoao Martins } else {
10960f21358fSCédric Le Goater ret = vfio_container_set_dirty_page_tracking(bcontainer, false,
10970f21358fSCédric Le Goater &local_err);
10985255bbf4SJoao Martins }
10995255bbf4SJoao Martins
1100236e0a45SAvihai Horon if (ret) {
11010f21358fSCédric Le Goater error_prepend(&local_err,
11020f21358fSCédric Le Goater "vfio: Could not stop dirty page tracking - ");
11030f21358fSCédric Le Goater error_report_err(local_err);
1104236e0a45SAvihai Horon vfio_set_migration_error(ret);
1105236e0a45SAvihai Horon }
1106758b96b6SKeqian Zhu }
1107758b96b6SKeqian Zhu
1108b153402aSJoao Martins static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
1109b153402aSJoao Martins hwaddr size, void *bitmap)
1110b153402aSJoao Martins {
1111b153402aSJoao Martins uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
1112b153402aSJoao Martins sizeof(struct vfio_device_feature_dma_logging_report),
1113850051b9SPaolo Bonzini sizeof(uint64_t))] = {};
1114b153402aSJoao Martins struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
1115b153402aSJoao Martins struct vfio_device_feature_dma_logging_report *report =
1116b153402aSJoao Martins (struct vfio_device_feature_dma_logging_report *)feature->data;
1117b153402aSJoao Martins
1118b153402aSJoao Martins report->iova = iova;
1119b153402aSJoao Martins report->length = size;
1120b153402aSJoao Martins report->page_size = qemu_real_host_page_size();
1121592d0bc0SPaolo Bonzini report->bitmap = (uintptr_t)bitmap;
1122b153402aSJoao Martins
1123b153402aSJoao Martins feature->argsz = sizeof(buf);
1124b153402aSJoao Martins feature->flags = VFIO_DEVICE_FEATURE_GET |
1125b153402aSJoao Martins VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
1126b153402aSJoao Martins
1127b153402aSJoao Martins if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
1128b153402aSJoao Martins return -errno;
1129b153402aSJoao Martins }
1130b153402aSJoao Martins
1131b153402aSJoao Martins return 0;
1132b153402aSJoao Martins }
1133b153402aSJoao Martins
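/*
 * Hypothetical helper, for illustration only: how a bitmap filled by
 * vfio_device_dma_logging_report() above is indexed.  (The on-stack
 * uint64_t buf[] idiom above is the usual way to get an 8-byte-aligned
 * feature header plus payload without a heap allocation.)  The kernel
 * reports one bit per page, little-endian within 64-bit words, so on a
 * little-endian host bit N covers the page at iova + N * page_size.
 */
static bool vfio_dma_logging_bitmap_test_sketch(const uint64_t *bitmap,
                                                hwaddr base_iova, hwaddr iova,
                                                uint64_t page_size)
{
    uint64_t nr = (iova - base_iova) / page_size;

    return bitmap[nr / 64] & (1ULL << (nr % 64));
}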
11344517c33cSZhenzhong Duan int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
11352da5f9e4SCédric Le Goater VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
1136b153402aSJoao Martins {
1137b153402aSJoao Martins VFIODevice *vbasedev;
1138b153402aSJoao Martins int ret;
1139b153402aSJoao Martins
11403e6015d1SZhenzhong Duan QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
1141b153402aSJoao Martins ret = vfio_device_dma_logging_report(vbasedev, iova, size,
1142b153402aSJoao Martins vbmap->bitmap);
1143b153402aSJoao Martins if (ret) {
11442da5f9e4SCédric Le Goater error_setg_errno(errp, -ret,
11452da5f9e4SCédric Le Goater "%s: Failed to get DMA logging report, iova: "
11462da5f9e4SCédric Le Goater "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx,
11472da5f9e4SCédric Le Goater vbasedev->name, iova, size);
1148b153402aSJoao Martins
1149b153402aSJoao Martins return ret;
1150b153402aSJoao Martins }
1151b153402aSJoao Martins }
1152b153402aSJoao Martins
1153b153402aSJoao Martins return 0;
1154b153402aSJoao Martins }
1155b153402aSJoao Martins
11564517c33cSZhenzhong Duan int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
11572da5f9e4SCédric Le Goater uint64_t size, ram_addr_t ram_addr, Error **errp)
11586607109fSAvihai Horon {
1159b153402aSJoao Martins bool all_device_dirty_tracking =
1160e1cac6b2SEric Auger vfio_devices_all_device_dirty_tracking(bcontainer);
11616fe4f6c9SJoao Martins uint64_t dirty_pages;
11626607109fSAvihai Horon VFIOBitmap vbmap;
11636607109fSAvihai Horon int ret;
11646607109fSAvihai Horon
1165e1cac6b2SEric Auger if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) {
11666607109fSAvihai Horon cpu_physical_memory_set_dirty_range(ram_addr, size,
11676607109fSAvihai Horon tcg_enabled() ? DIRTY_CLIENTS_ALL :
11686607109fSAvihai Horon DIRTY_CLIENTS_NOCODE);
11696607109fSAvihai Horon return 0;
11706607109fSAvihai Horon }
11716607109fSAvihai Horon
11726607109fSAvihai Horon ret = vfio_bitmap_alloc(&vbmap, size);
11736607109fSAvihai Horon if (ret) {
11742da5f9e4SCédric Le Goater error_setg_errno(errp, -ret,
11752da5f9e4SCédric Le Goater "Failed to allocate dirty tracking bitmap");
11766607109fSAvihai Horon return ret;
11776607109fSAvihai Horon }
11786607109fSAvihai Horon
1179b153402aSJoao Martins if (all_device_dirty_tracking) {
11802da5f9e4SCédric Le Goater ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
11812da5f9e4SCédric Le Goater errp);
1182b153402aSJoao Martins } else {
11832da5f9e4SCédric Le Goater ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
11842da5f9e4SCédric Le Goater errp);
1185b153402aSJoao Martins }
1186b153402aSJoao Martins
11876607109fSAvihai Horon if (ret) {
11886607109fSAvihai Horon goto out;
1189b6dd6504SKirti Wankhede }
1190b6dd6504SKirti Wankhede
11916fe4f6c9SJoao Martins dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
1192725ccd7eSAvihai Horon vbmap.pages);
1193b6dd6504SKirti Wankhede
1194e1cac6b2SEric Auger trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages);
11956607109fSAvihai Horon out:
1196725ccd7eSAvihai Horon g_free(vbmap.bitmap);
1197b6dd6504SKirti Wankhede
1198b6dd6504SKirti Wankhede return ret;
1199b6dd6504SKirti Wankhede }
1200b6dd6504SKirti Wankhede
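/*
 * Sizing sketch for the bitmap allocated above -- an assumption about
 * vfio_bitmap_alloc(), which lives elsewhere in the VFIO code: one bit
 * per real host page, rounded up to a whole number of 64-bit words,
 * since cpu_physical_memory_set_dirty_lebitmap() consumes it word-wise.
 */
static inline size_t vfio_dirty_bitmap_bytes_sketch(hwaddr size)
{
    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();

    return ROUND_UP(pages, 64) / 8; /* 64 bits per word, 8 bits per byte */
}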
12019a04fe09SKirti Wankhede typedef struct {
12029a04fe09SKirti Wankhede IOMMUNotifier n;
12039a04fe09SKirti Wankhede VFIOGuestIOMMU *giommu;
12049a04fe09SKirti Wankhede } vfio_giommu_dirty_notifier;
12059a04fe09SKirti Wankhede
12069a04fe09SKirti Wankhede static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
12079a04fe09SKirti Wankhede {
12089a04fe09SKirti Wankhede vfio_giommu_dirty_notifier *gdn = container_of(n,
12099a04fe09SKirti Wankhede vfio_giommu_dirty_notifier, n);
12109a04fe09SKirti Wankhede VFIOGuestIOMMU *giommu = gdn->giommu;
1211dddf83abSEric Auger VFIOContainerBase *bcontainer = giommu->bcontainer;
12129a04fe09SKirti Wankhede hwaddr iova = iotlb->iova + giommu->iommu_offset;
12139a04fe09SKirti Wankhede ram_addr_t translated_addr;
1214ebb481c0SCédric Le Goater Error *local_err = NULL;
1215236e0a45SAvihai Horon int ret = -EINVAL;
12169a04fe09SKirti Wankhede
12179a04fe09SKirti Wankhede trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
12189a04fe09SKirti Wankhede
12199a04fe09SKirti Wankhede if (iotlb->target_as != &address_space_memory) {
12209a04fe09SKirti Wankhede error_report("Wrong target AS \"%s\", only system memory is allowed",
12219a04fe09SKirti Wankhede iotlb->target_as->name ? iotlb->target_as->name : "none");
1222236e0a45SAvihai Horon goto out;
12239a04fe09SKirti Wankhede }
12249a04fe09SKirti Wankhede
12259a04fe09SKirti Wankhede rcu_read_lock();
1226ebb481c0SCédric Le Goater if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
1227ebb481c0SCédric Le Goater error_report_err(local_err);
122894d12088SCédric Le Goater goto out_unlock;
122994d12088SCédric Le Goater }
123094d12088SCédric Le Goater
1231c7b313d3SEric Auger ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
12322da5f9e4SCédric Le Goater translated_addr, &local_err);
12339a04fe09SKirti Wankhede if (ret) {
12342da5f9e4SCédric Le Goater error_prepend(&local_err,
12352da5f9e4SCédric Le Goater "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
12362da5f9e4SCédric Le Goater "0x%"HWADDR_PRIx") failed - ", bcontainer, iova,
12372da5f9e4SCédric Le Goater iotlb->addr_mask + 1);
12382da5f9e4SCédric Le Goater error_report_err(local_err);
12399a04fe09SKirti Wankhede }
124094d12088SCédric Le Goater
124194d12088SCédric Le Goater out_unlock:
12429a04fe09SKirti Wankhede rcu_read_unlock();
1243236e0a45SAvihai Horon
1244236e0a45SAvihai Horon out:
1245236e0a45SAvihai Horon if (ret) {
1246236e0a45SAvihai Horon vfio_set_migration_error(ret);
1247236e0a45SAvihai Horon }
12489a04fe09SKirti Wankhede }
12499a04fe09SKirti Wankhede
12505e3b981cSDavid Hildenbrand static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
12515e3b981cSDavid Hildenbrand void *opaque)
12525e3b981cSDavid Hildenbrand {
12535e3b981cSDavid Hildenbrand const hwaddr size = int128_get64(section->size);
12545e3b981cSDavid Hildenbrand const hwaddr iova = section->offset_within_address_space;
12555e3b981cSDavid Hildenbrand const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
12565e3b981cSDavid Hildenbrand section->offset_within_region;
12575e3b981cSDavid Hildenbrand VFIORamDiscardListener *vrdl = opaque;
12582da5f9e4SCédric Le Goater Error *local_err = NULL;
12592da5f9e4SCédric Le Goater int ret;
12605e3b981cSDavid Hildenbrand
12615e3b981cSDavid Hildenbrand /*
12625e3b981cSDavid Hildenbrand * Sync the whole mapped region (spanning multiple individual mappings)
12635e3b981cSDavid Hildenbrand * in one go.
12645e3b981cSDavid Hildenbrand */
12652da5f9e4SCédric Le Goater ret = vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr,
12662da5f9e4SCédric Le Goater &local_err);
12672da5f9e4SCédric Le Goater if (ret) {
12682da5f9e4SCédric Le Goater error_report_err(local_err);
12692da5f9e4SCédric Le Goater }
12702da5f9e4SCédric Le Goater return ret;
12715e3b981cSDavid Hildenbrand }
12725e3b981cSDavid Hildenbrand
1273dc74a4b0SZhenzhong Duan static int
1274dc74a4b0SZhenzhong Duan vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
12755e3b981cSDavid Hildenbrand MemoryRegionSection *section)
12765e3b981cSDavid Hildenbrand {
12775e3b981cSDavid Hildenbrand RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
12785e3b981cSDavid Hildenbrand VFIORamDiscardListener *vrdl = NULL;
12795e3b981cSDavid Hildenbrand
1280dc74a4b0SZhenzhong Duan QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
12815e3b981cSDavid Hildenbrand if (vrdl->mr == section->mr &&
12825e3b981cSDavid Hildenbrand vrdl->offset_within_address_space ==
12835e3b981cSDavid Hildenbrand section->offset_within_address_space) {
12845e3b981cSDavid Hildenbrand break;
12855e3b981cSDavid Hildenbrand }
12865e3b981cSDavid Hildenbrand }
12875e3b981cSDavid Hildenbrand
12885e3b981cSDavid Hildenbrand if (!vrdl) {
12895e3b981cSDavid Hildenbrand hw_error("vfio: Trying to sync missing RAM discard listener");
12905e3b981cSDavid Hildenbrand }
12915e3b981cSDavid Hildenbrand
12925e3b981cSDavid Hildenbrand /*
12935e3b981cSDavid Hildenbrand * We can only synchronize the bitmap for parts that are actually mapped,
12945e3b981cSDavid Hildenbrand * which correspond to the populated parts. Replay all populated parts.
12955e3b981cSDavid Hildenbrand */
12965e3b981cSDavid Hildenbrand return ram_discard_manager_replay_populated(rdm, section,
12975e3b981cSDavid Hildenbrand vfio_ram_discard_get_dirty_bitmap,
12985e3b981cSDavid Hildenbrand vrdl);
12995e3b981cSDavid Hildenbrand }
13005e3b981cSDavid Hildenbrand
1301723f702bSAvihai Horon static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer,
1302723f702bSAvihai Horon MemoryRegionSection *section)
1303b6dd6504SKirti Wankhede {
13049a04fe09SKirti Wankhede VFIOGuestIOMMU *giommu;
1305723f702bSAvihai Horon bool found = false;
1306723f702bSAvihai Horon Int128 llend;
1307723f702bSAvihai Horon vfio_giommu_dirty_notifier gdn;
1308723f702bSAvihai Horon int idx;
13099a04fe09SKirti Wankhede
1310dddf83abSEric Auger QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
131144ee6aaaSYi Liu if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
13129a04fe09SKirti Wankhede giommu->n.start == section->offset_within_region) {
1313723f702bSAvihai Horon found = true;
1314723f702bSAvihai Horon break;
1315723f702bSAvihai Horon }
1316723f702bSAvihai Horon }
1317723f702bSAvihai Horon
1318723f702bSAvihai Horon if (!found) {
1319723f702bSAvihai Horon return 0;
1320723f702bSAvihai Horon }
1321723f702bSAvihai Horon
1322723f702bSAvihai Horon gdn.giommu = giommu;
1323723f702bSAvihai Horon idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
13249a04fe09SKirti Wankhede MEMTXATTRS_UNSPECIFIED);
13259a04fe09SKirti Wankhede
13269a04fe09SKirti Wankhede llend = int128_add(int128_make64(section->offset_within_region),
13279a04fe09SKirti Wankhede section->size);
13289a04fe09SKirti Wankhede llend = int128_sub(llend, int128_one());
13299a04fe09SKirti Wankhede
1330723f702bSAvihai Horon iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP,
1331723f702bSAvihai Horon section->offset_within_region, int128_get64(llend),
13329a04fe09SKirti Wankhede idx);
133344ee6aaaSYi Liu memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
1334723f702bSAvihai Horon
13359a04fe09SKirti Wankhede return 0;
1336723f702bSAvihai Horon }
1337723f702bSAvihai Horon
1338723f702bSAvihai Horon static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
1339723f702bSAvihai Horon MemoryRegionSection *section, Error **errp)
1340723f702bSAvihai Horon {
1341723f702bSAvihai Horon ram_addr_t ram_addr;
1342723f702bSAvihai Horon
1343723f702bSAvihai Horon if (memory_region_is_iommu(section->mr)) {
1344723f702bSAvihai Horon return vfio_sync_iommu_dirty_bitmap(bcontainer, section);
13455e3b981cSDavid Hildenbrand } else if (memory_region_has_ram_discard_manager(section->mr)) {
13462da5f9e4SCédric Le Goater int ret;
13472da5f9e4SCédric Le Goater
13482da5f9e4SCédric Le Goater ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
13492da5f9e4SCédric Le Goater if (ret) {
13502da5f9e4SCédric Le Goater error_setg(errp,
13512da5f9e4SCédric Le Goater "Failed to sync dirty bitmap with RAM discard listener");
13522da5f9e4SCédric Le Goater }
13532da5f9e4SCédric Le Goater return ret;
13549a04fe09SKirti Wankhede }
13559a04fe09SKirti Wankhede
1356b6dd6504SKirti Wankhede ram_addr = memory_region_get_ram_addr(section->mr) +
1357b6dd6504SKirti Wankhede section->offset_within_region;
1358b6dd6504SKirti Wankhede
1359c7b313d3SEric Auger return vfio_get_dirty_bitmap(bcontainer,
13601eb7f642SKunkun Jiang REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
13612da5f9e4SCédric Le Goater int128_get64(section->size), ram_addr, errp);
1362b6dd6504SKirti Wankhede }
1363b6dd6504SKirti Wankhede
13644292d501SZenghui Yu static void vfio_listener_log_sync(MemoryListener *listener,
1365b6dd6504SKirti Wankhede MemoryRegionSection *section)
1366b6dd6504SKirti Wankhede {
1367c7b313d3SEric Auger VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1368c7b313d3SEric Auger listener);
1369236e0a45SAvihai Horon int ret;
13702da5f9e4SCédric Le Goater Error *local_err = NULL;
1371b6dd6504SKirti Wankhede
1372b051a3f6SAvihai Horon if (vfio_listener_skipped_section(section)) {
1373b6dd6504SKirti Wankhede return;
1374b6dd6504SKirti Wankhede }
1375b6dd6504SKirti Wankhede
1376c7b313d3SEric Auger if (vfio_devices_all_dirty_tracking(bcontainer)) {
13772da5f9e4SCédric Le Goater ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
1378236e0a45SAvihai Horon if (ret) {
13792da5f9e4SCédric Le Goater error_report_err(local_err);
1380236e0a45SAvihai Horon vfio_set_migration_error(ret);
1381236e0a45SAvihai Horon }
1382b6dd6504SKirti Wankhede }
1383b6dd6504SKirti Wankhede }
1384b6dd6504SKirti Wankhede
13857e63b311SYi Liu const MemoryListener vfio_memory_listener = {
1386142518bdSPeter Xu .name = "vfio",
1387e2c7d025SEric Auger .region_add = vfio_listener_region_add,
1388e2c7d025SEric Auger .region_del = vfio_listener_region_del,
1389758b96b6SKeqian Zhu .log_global_start = vfio_listener_log_global_start,
1390758b96b6SKeqian Zhu .log_global_stop = vfio_listener_log_global_stop,
13914292d501SZenghui Yu .log_sync = vfio_listener_log_sync,
1392e2c7d025SEric Auger };
1393e2c7d025SEric Auger
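/*
 * Context sketch (assumed usage, not a copy of the setup code): the
 * listener above is embedded in each VFIOContainerBase and activated at
 * container setup roughly as below; memory_listener_register() replays
 * the current memory map through region_add before delivering updates.
 */
static void vfio_listener_register_sketch(VFIOContainerBase *bcontainer)
{
    bcontainer->listener = vfio_memory_listener;
    memory_listener_register(&bcontainer->listener, bcontainer->space->as);
}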
1394e2c7d025SEric Auger void vfio_reset_handler(void *opaque)
1395e2c7d025SEric Auger {
1396e2c7d025SEric Auger VFIODevice *vbasedev;
1397e2c7d025SEric Auger
13989353b6daSVolker Rümelin QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
13997da624e2SAlex Williamson if (vbasedev->dev->realized) {
1400e2c7d025SEric Auger vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1401e2c7d025SEric Auger }
1402e2c7d025SEric Auger }
1403e2c7d025SEric Auger
14049353b6daSVolker Rümelin QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
14057da624e2SAlex Williamson if (vbasedev->dev->realized && vbasedev->needs_reset) {
1406e2c7d025SEric Auger vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1407e2c7d025SEric Auger }
1408e2c7d025SEric Auger }
1409e2c7d025SEric Auger }
1410e2c7d025SEric Auger
14115621c02dSZhenzhong Duan int vfio_kvm_device_add_fd(int fd, Error **errp)
1412e2c7d025SEric Auger {
1413e2c7d025SEric Auger #ifdef CONFIG_KVM
1414e2c7d025SEric Auger struct kvm_device_attr attr = {
14155621c02dSZhenzhong Duan .group = KVM_DEV_VFIO_FILE,
14165621c02dSZhenzhong Duan .attr = KVM_DEV_VFIO_FILE_ADD,
14175621c02dSZhenzhong Duan .addr = (uint64_t)(unsigned long)&fd,
1418e2c7d025SEric Auger };
1419e2c7d025SEric Auger
1420e2c7d025SEric Auger if (!kvm_enabled()) {
14215621c02dSZhenzhong Duan return 0;
1422e2c7d025SEric Auger }
1423e2c7d025SEric Auger
1424e2c7d025SEric Auger if (vfio_kvm_device_fd < 0) {
1425e2c7d025SEric Auger struct kvm_create_device cd = {
1426e2c7d025SEric Auger .type = KVM_DEV_TYPE_VFIO,
1427e2c7d025SEric Auger };
1428e2c7d025SEric Auger
1429e2c7d025SEric Auger if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
14305621c02dSZhenzhong Duan error_setg_errno(errp, errno, "Failed to create KVM VFIO device");
14315621c02dSZhenzhong Duan return -errno;
1432e2c7d025SEric Auger }
1433e2c7d025SEric Auger
1434e2c7d025SEric Auger vfio_kvm_device_fd = cd.fd;
1435e2c7d025SEric Auger }
1436e2c7d025SEric Auger
1437e2c7d025SEric Auger if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
14385621c02dSZhenzhong Duan error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device",
14395621c02dSZhenzhong Duan fd);
14405621c02dSZhenzhong Duan return -errno;
1441e2c7d025SEric Auger }
1442e2c7d025SEric Auger #endif
14435621c02dSZhenzhong Duan return 0;
14445621c02dSZhenzhong Duan }
14455621c02dSZhenzhong Duan
14465621c02dSZhenzhong Duan int vfio_kvm_device_del_fd(int fd, Error **errp)
14475621c02dSZhenzhong Duan {
14485621c02dSZhenzhong Duan #ifdef CONFIG_KVM
14495621c02dSZhenzhong Duan struct kvm_device_attr attr = {
14505621c02dSZhenzhong Duan .group = KVM_DEV_VFIO_FILE,
14515621c02dSZhenzhong Duan .attr = KVM_DEV_VFIO_FILE_DEL,
14525621c02dSZhenzhong Duan .addr = (uint64_t)(unsigned long)&fd,
14535621c02dSZhenzhong Duan };
14545621c02dSZhenzhong Duan
14555621c02dSZhenzhong Duan if (vfio_kvm_device_fd < 0) {
14565621c02dSZhenzhong Duan error_setg(errp, "KVM VFIO device isn't created yet");
14575621c02dSZhenzhong Duan return -EINVAL;
14585621c02dSZhenzhong Duan }
14595621c02dSZhenzhong Duan
14605621c02dSZhenzhong Duan if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
14615621c02dSZhenzhong Duan error_setg_errno(errp, errno,
14625621c02dSZhenzhong Duan "Failed to remove fd %d from KVM VFIO device", fd);
14635621c02dSZhenzhong Duan return -errno;
14645621c02dSZhenzhong Duan }
14655621c02dSZhenzhong Duan #endif
14665621c02dSZhenzhong Duan return 0;
14675621c02dSZhenzhong Duan }
14685621c02dSZhenzhong Duan
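/*
 * Usage sketch (hypothetical caller): a backend registers its group,
 * device or cdev fd with the KVM VFIO pseudo device for as long as the
 * fd is in use, so KVM knows a VFIO device is present (e.g. for
 * coherency decisions), and deregisters it again on teardown.
 */
static bool vfio_kvm_track_fd_sketch(int fd, Error **errp)
{
    if (vfio_kvm_device_add_fd(fd, errp)) {
        return false;
    }
    /* ... fd stays registered while the device is in use ... */
    return vfio_kvm_device_del_fd(fd, errp) == 0;
}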
14697e63b311SYi Liu VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1470e2c7d025SEric Auger {
1471e2c7d025SEric Auger VFIOAddressSpace *space;
1472e2c7d025SEric Auger
1473e2c7d025SEric Auger QLIST_FOREACH(space, &vfio_address_spaces, list) {
1474e2c7d025SEric Auger if (space->as == as) {
1475e2c7d025SEric Auger return space;
1476e2c7d025SEric Auger }
1477e2c7d025SEric Auger }
1478e2c7d025SEric Auger
1479e2c7d025SEric Auger /* No suitable VFIOAddressSpace, create a new one */
1480e2c7d025SEric Auger space = g_malloc0(sizeof(*space));
1481e2c7d025SEric Auger space->as = as;
1482e2c7d025SEric Auger QLIST_INIT(&space->containers);
1483e2c7d025SEric Auger
1484c8fcb90cSZhenzhong Duan if (QLIST_EMPTY(&vfio_address_spaces)) {
1485c8fcb90cSZhenzhong Duan qemu_register_reset(vfio_reset_handler, NULL);
1486c8fcb90cSZhenzhong Duan }
1487c8fcb90cSZhenzhong Duan
1488e2c7d025SEric Auger QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1489e2c7d025SEric Auger
1490e2c7d025SEric Auger return space;
1491e2c7d025SEric Auger }
1492e2c7d025SEric Auger
14937e63b311SYi Liu void vfio_put_address_space(VFIOAddressSpace *space)
1494e2c7d025SEric Auger {
14951eae5b7bSZhenzhong Duan if (!QLIST_EMPTY(&space->containers)) {
14961eae5b7bSZhenzhong Duan return;
14971eae5b7bSZhenzhong Duan }
14981eae5b7bSZhenzhong Duan
1499e2c7d025SEric Auger QLIST_REMOVE(space, list);
1500e2c7d025SEric Auger g_free(space);
15011eae5b7bSZhenzhong Duan
1502c8fcb90cSZhenzhong Duan if (QLIST_EMPTY(&vfio_address_spaces)) {
1503c8fcb90cSZhenzhong Duan qemu_unregister_reset(vfio_reset_handler, NULL);
1504c8fcb90cSZhenzhong Duan }
1505e2c7d025SEric Auger }
1506e2c7d025SEric Auger
1507b7b79588SCédric Le Goater void vfio_address_space_insert(VFIOAddressSpace *space,
1508b7b79588SCédric Le Goater VFIOContainerBase *bcontainer)
1509b7b79588SCédric Le Goater {
1510b7b79588SCédric Le Goater QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
151109181a8eSCédric Le Goater bcontainer->space = space;
1512b7b79588SCédric Le Goater }
1513b7b79588SCédric Le Goater
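/*
 * Usage sketch (hypothetical backend setup/teardown path): look up or
 * create the VFIOAddressSpace for the device's DMA address space, link a
 * new container into it, and drop the reference again on teardown.
 * vfio_put_address_space() above only frees once no containers remain.
 */
static void vfio_address_space_usage_sketch(VFIOContainerBase *bcontainer,
                                            AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);

    vfio_address_space_insert(space, bcontainer);
    /* ... container lifetime ... */
    QLIST_REMOVE(bcontainer, next);     /* unlink before the put */
    vfio_put_address_space(space);
}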
1514634f38f0SAlex Williamson struct vfio_device_info *vfio_get_device_info(int fd)
1515634f38f0SAlex Williamson {
1516634f38f0SAlex Williamson struct vfio_device_info *info;
1517634f38f0SAlex Williamson uint32_t argsz = sizeof(*info);
1518634f38f0SAlex Williamson
1519634f38f0SAlex Williamson info = g_malloc0(argsz);
1520634f38f0SAlex Williamson
1521634f38f0SAlex Williamson retry:
1522634f38f0SAlex Williamson info->argsz = argsz;
1523634f38f0SAlex Williamson
1524634f38f0SAlex Williamson if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
1525634f38f0SAlex Williamson g_free(info);
1526634f38f0SAlex Williamson return NULL;
1527634f38f0SAlex Williamson }
1528634f38f0SAlex Williamson
1529634f38f0SAlex Williamson if (info->argsz > argsz) {
1530634f38f0SAlex Williamson argsz = info->argsz;
1531634f38f0SAlex Williamson info = g_realloc(info, argsz);
1532634f38f0SAlex Williamson goto retry;
1533634f38f0SAlex Williamson }
1534634f38f0SAlex Williamson
1535634f38f0SAlex Williamson return info;
1536634f38f0SAlex Williamson }
15371eb31f13SEric Auger
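/*
 * Usage sketch: the argsz retry loop above guarantees the returned buffer
 * is large enough for everything the kernel wanted to report.  A caller
 * might consume it like this (VFIO_DEVICE_FLAGS_PCI is from
 * <linux/vfio.h>; the helper itself is hypothetical):
 */
static bool vfio_device_is_pci_sketch(VFIODevice *vbasedev)
{
    g_autofree struct vfio_device_info *info =
        vfio_get_device_info(vbasedev->fd);

    return info && (info->flags & VFIO_DEVICE_FLAGS_PCI);
}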
1538b7754835SZhenzhong Duan bool vfio_attach_device(char *name, VFIODevice *vbasedev,
15391eb31f13SEric Auger AddressSpace *as, Error **errp)
15401eb31f13SEric Auger {
15419812feefSCédric Le Goater const VFIOIOMMUClass *ops =
15429812feefSCédric Le Goater VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
154383a4d596SJoao Martins HostIOMMUDevice *hiod = NULL;
15441eb31f13SEric Auger
15455ee3dc7aSYi Liu if (vbasedev->iommufd) {
1546ce5f6d49SCédric Le Goater ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
15475ee3dc7aSYi Liu }
15489812feefSCédric Le Goater
15499812feefSCédric Le Goater assert(ops);
15509812feefSCédric Le Goater
155283a4d596SJoao Martins if (!vbasedev->mdev) {
1553a7fd91b8SZhenzhong Duan hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
155483a4d596SJoao Martins vbasedev->hiod = hiod;
1555a7fd91b8SZhenzhong Duan }
1556a7fd91b8SZhenzhong Duan
155783a4d596SJoao Martins if (!ops->attach_device(name, vbasedev, as, errp)) {
1558a7fd91b8SZhenzhong Duan object_unref(hiod);
155983a4d596SJoao Martins vbasedev->hiod = NULL;
1560a7fd91b8SZhenzhong Duan return false;
1561a7fd91b8SZhenzhong Duan }
1562a7fd91b8SZhenzhong Duan
1563a7fd91b8SZhenzhong Duan return true;
15641eb31f13SEric Auger }
15651eb31f13SEric Auger
15661eb31f13SEric Auger void vfio_detach_device(VFIODevice *vbasedev)
15671eb31f13SEric Auger {
15681eb31f13SEric Auger if (!vbasedev->bcontainer) {
15691eb31f13SEric Auger return;
15701eb31f13SEric Auger }
1571a7fd91b8SZhenzhong Duan object_unref(vbasedev->hiod);
157241d698b8SCédric Le Goater VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
15731eb31f13SEric Auger }
1574
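/*
 * Lifecycle sketch (hypothetical frontend realize path): a device model
 * attaches during realize and detaches on unrealize.  On failure,
 * vfio_attach_device() reports through errp and leaves
 * vbasedev->bcontainer unset, which makes vfio_detach_device() above a
 * safe no-op.
 */
static bool vfio_device_realize_sketch(VFIODevice *vbasedev, Error **errp)
{
    if (!vfio_attach_device(vbasedev->name, vbasedev,
                            &address_space_memory, errp)) {
        return false;
    }
    /* ... map regions, set up interrupts and migration ... */
    return true;
}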