xref: /openbmc/qemu/util/vfio-helpers.c (revision 37b0b24e933c18269dddbf6b83f91823cacf8105)
1418026caSFam Zheng /*
2418026caSFam Zheng  * VFIO utility
3418026caSFam Zheng  *
4418026caSFam Zheng  * Copyright 2016 - 2018 Red Hat, Inc.
5418026caSFam Zheng  *
6418026caSFam Zheng  * Authors:
7418026caSFam Zheng  *   Fam Zheng <famz@redhat.com>
8418026caSFam Zheng  *
9418026caSFam Zheng  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10418026caSFam Zheng  * See the COPYING file in the top-level directory.
11418026caSFam Zheng  */
12418026caSFam Zheng 
13418026caSFam Zheng #include "qemu/osdep.h"
14418026caSFam Zheng #include <sys/ioctl.h>
15418026caSFam Zheng #include <linux/vfio.h>
16418026caSFam Zheng #include "qapi/error.h"
17418026caSFam Zheng #include "exec/ramlist.h"
18418026caSFam Zheng #include "exec/cpu-common.h"
19b430b513SDavid Hildenbrand #include "exec/memory.h"
20418026caSFam Zheng #include "trace.h"
21418026caSFam Zheng #include "qemu/error-report.h"
22418026caSFam Zheng #include "standard-headers/linux/pci_regs.h"
23418026caSFam Zheng #include "qemu/event_notifier.h"
24418026caSFam Zheng #include "qemu/vfio-helpers.h"
256e8a355dSDaniel Brodsky #include "qemu/lockable.h"
26418026caSFam Zheng #include "trace.h"
27418026caSFam Zheng 
28418026caSFam Zheng #define QEMU_VFIO_DEBUG 0
29418026caSFam Zheng 
30418026caSFam Zheng #define QEMU_VFIO_IOVA_MIN 0x10000ULL
31418026caSFam Zheng /* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
32418026caSFam Zheng  * we can use a runtime limit; alternatively it's also possible to do platform
33418026caSFam Zheng  * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
34418026caSFam Zheng  **/
35418026caSFam Zheng #define QEMU_VFIO_IOVA_MAX (1ULL << 39)
36418026caSFam Zheng 
/*
 * One record of a host virtual address range and the IOVA assigned to it.
 * Records are stored in QEMUVFIOState.mappings.
 */
typedef struct {
    /* Page aligned addr. */
    void *host;
    /* Length of the range in bytes; qemu_vfio_add_mapping() asserts that it
     * is a multiple of the host page size. */
    size_t size;
    /* IO virtual address corresponding to @host. */
    uint64_t iova;
} IOVAMapping;
43418026caSFam Zheng 
/*
 * A span of IOVA space that may be used for mappings. The legacy default
 * range is stored with @end set to QEMU_VFIO_IOVA_MAX - 1, i.e. @end is the
 * last usable address rather than one past it.
 */
struct IOVARange {
    uint64_t start;
    uint64_t end;
};
484487d420SEric Auger 
/*
 * Per-device VFIO state: the container/group/device file descriptors, the
 * cached region descriptions, the usable IOVA ranges and the table of
 * host-address -> IOVA mappings.
 */
struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    /* File descriptors: container and group come from open(), device from
     * the VFIO_GROUP_GET_DEVICE_FD ioctl (see qemu_vfio_init_pci()). */
    int container;
    int group;
    int device;
    /* Registered in qemu_vfio_open_common() to follow RAM block add/remove. */
    RAMBlockNotifier ram_notifier;
    /* Region info as returned by VFIO_DEVICE_GET_REGION_INFO. */
    struct vfio_region_info config_region_info, bar_region_info[6];
    /* Array of @nb_iova_ranges entries, filled in by qemu_vfio_init_pci(). */
    struct IOVARange *usable_iova_ranges;
    /* NOTE(review): collect_usable_iova_ranges() assigns the kernel's
     * nr_iovas into this 8-bit field; confirm the count cannot exceed 255. */
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/O's depending on these
     *   mappings are completed before calling.
     **/
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    /* Array of @nr_mappings records; qemu_vfio_find_mapping() binary-searches
     * it by host address, so it is kept ordered by @host. */
    IOVAMapping *mappings;
    int nr_mappings;
};
102418026caSFam Zheng 
103418026caSFam Zheng /**
104418026caSFam Zheng  * Find group file by PCI device address as specified @device, and return the
105418026caSFam Zheng  * path. The returned string is owned by caller and should be g_free'ed later.
106418026caSFam Zheng  */
107418026caSFam Zheng static char *sysfs_find_group_file(const char *device, Error **errp)
108418026caSFam Zheng {
109418026caSFam Zheng     char *sysfs_link;
110418026caSFam Zheng     char *sysfs_group;
111418026caSFam Zheng     char *p;
112418026caSFam Zheng     char *path = NULL;
113418026caSFam Zheng 
114418026caSFam Zheng     sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
11578d8c99eSPaolo Bonzini     sysfs_group = g_malloc0(PATH_MAX);
116418026caSFam Zheng     if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
117418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
118418026caSFam Zheng         goto out;
119418026caSFam Zheng     }
120418026caSFam Zheng     p = strrchr(sysfs_group, '/');
121418026caSFam Zheng     if (!p) {
122418026caSFam Zheng         error_setg(errp, "Failed to find iommu group number");
123418026caSFam Zheng         goto out;
124418026caSFam Zheng     }
125418026caSFam Zheng 
126418026caSFam Zheng     path = g_strdup_printf("/dev/vfio/%s", p + 1);
127418026caSFam Zheng out:
128418026caSFam Zheng     g_free(sysfs_link);
129418026caSFam Zheng     g_free(sysfs_group);
130418026caSFam Zheng     return path;
131418026caSFam Zheng }
132418026caSFam Zheng 
/* Abort unless @index selects one of the entries of s->bar_region_info[]. */
static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    /* The sign test comes first: the size comparison is against an unsigned
     * element count. */
    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}
137418026caSFam Zheng 
138418026caSFam Zheng static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
139418026caSFam Zheng {
140df058222SPhilippe Mathieu-Daudé     g_autofree char *barname = NULL;
141418026caSFam Zheng     assert_bar_index_valid(s, index);
142418026caSFam Zheng     s->bar_region_info[index] = (struct vfio_region_info) {
143418026caSFam Zheng         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
144418026caSFam Zheng         .argsz = sizeof(struct vfio_region_info),
145418026caSFam Zheng     };
146418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
147418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get BAR region info");
148418026caSFam Zheng         return -errno;
149418026caSFam Zheng     }
150df058222SPhilippe Mathieu-Daudé     barname = g_strdup_printf("bar[%d]", index);
151df058222SPhilippe Mathieu-Daudé     trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
152df058222SPhilippe Mathieu-Daudé                                 s->bar_region_info[index].size,
153df058222SPhilippe Mathieu-Daudé                                 s->bar_region_info[index].cap_offset);
154418026caSFam Zheng 
155418026caSFam Zheng     return 0;
156418026caSFam Zheng }
157418026caSFam Zheng 
158418026caSFam Zheng /**
159418026caSFam Zheng  * Map a PCI bar area.
160418026caSFam Zheng  */
161418026caSFam Zheng void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
162b02c01a5SPhilippe Mathieu-Daudé                             uint64_t offset, uint64_t size, int prot,
163418026caSFam Zheng                             Error **errp)
164418026caSFam Zheng {
165418026caSFam Zheng     void *p;
1668e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size()));
167418026caSFam Zheng     assert_bar_index_valid(s, index);
168418026caSFam Zheng     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
169b02c01a5SPhilippe Mathieu-Daudé              prot, MAP_SHARED,
170418026caSFam Zheng              s->device, s->bar_region_info[index].offset + offset);
1712817fbceSPhilippe Mathieu-Daudé     trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
1722817fbceSPhilippe Mathieu-Daudé                                 size, offset, p);
173418026caSFam Zheng     if (p == MAP_FAILED) {
174418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to map BAR region");
175418026caSFam Zheng         p = NULL;
176418026caSFam Zheng     }
177418026caSFam Zheng     return p;
178418026caSFam Zheng }
179418026caSFam Zheng 
180418026caSFam Zheng /**
181418026caSFam Zheng  * Unmap a PCI bar area.
182418026caSFam Zheng  */
183418026caSFam Zheng void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
184418026caSFam Zheng                              uint64_t offset, uint64_t size)
185418026caSFam Zheng {
186418026caSFam Zheng     if (bar) {
187418026caSFam Zheng         munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
188418026caSFam Zheng     }
189418026caSFam Zheng }
190418026caSFam Zheng 
191418026caSFam Zheng /**
192a6da793aSPhilippe Mathieu-Daudé  * Initialize device IRQ with @irq_type and register an event notifier.
193418026caSFam Zheng  */
194418026caSFam Zheng int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
195418026caSFam Zheng                            int irq_type, Error **errp)
196418026caSFam Zheng {
197418026caSFam Zheng     int r;
198418026caSFam Zheng     struct vfio_irq_set *irq_set;
199418026caSFam Zheng     size_t irq_set_size;
200418026caSFam Zheng     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
201418026caSFam Zheng 
202418026caSFam Zheng     irq_info.index = irq_type;
203418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
204418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device interrupt info");
205418026caSFam Zheng         return -errno;
206418026caSFam Zheng     }
207418026caSFam Zheng     if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
208418026caSFam Zheng         error_setg(errp, "Device interrupt doesn't support eventfd");
209418026caSFam Zheng         return -EINVAL;
210418026caSFam Zheng     }
211418026caSFam Zheng 
212418026caSFam Zheng     irq_set_size = sizeof(*irq_set) + sizeof(int);
213418026caSFam Zheng     irq_set = g_malloc0(irq_set_size);
214418026caSFam Zheng 
215418026caSFam Zheng     /* Get to a known IRQ state */
216418026caSFam Zheng     *irq_set = (struct vfio_irq_set) {
217418026caSFam Zheng         .argsz = irq_set_size,
218418026caSFam Zheng         .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
219418026caSFam Zheng         .index = irq_info.index,
220418026caSFam Zheng         .start = 0,
221418026caSFam Zheng         .count = 1,
222418026caSFam Zheng     };
223418026caSFam Zheng 
224418026caSFam Zheng     *(int *)&irq_set->data = event_notifier_get_fd(e);
225418026caSFam Zheng     r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
226418026caSFam Zheng     g_free(irq_set);
227418026caSFam Zheng     if (r) {
228418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to setup device interrupt");
229418026caSFam Zheng         return -errno;
230418026caSFam Zheng     }
231418026caSFam Zheng     return 0;
232418026caSFam Zheng }
233418026caSFam Zheng 
234418026caSFam Zheng static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
235418026caSFam Zheng                                      int size, int ofs)
236418026caSFam Zheng {
237418026caSFam Zheng     int ret;
238418026caSFam Zheng 
2393d87c2d9SPhilippe Mathieu-Daudé     trace_qemu_vfio_pci_read_config(buf, ofs, size,
2403d87c2d9SPhilippe Mathieu-Daudé                                     s->config_region_info.offset,
2413d87c2d9SPhilippe Mathieu-Daudé                                     s->config_region_info.size);
2423d87c2d9SPhilippe Mathieu-Daudé     assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
243*37b0b24eSNikita Ivanov     ret = RETRY_ON_EINTR(
244*37b0b24eSNikita Ivanov         pread(s->device, buf, size, s->config_region_info.offset + ofs)
245*37b0b24eSNikita Ivanov     );
246418026caSFam Zheng     return ret == size ? 0 : -errno;
247418026caSFam Zheng }
248418026caSFam Zheng 
249418026caSFam Zheng static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
250418026caSFam Zheng {
251418026caSFam Zheng     int ret;
252418026caSFam Zheng 
2533d87c2d9SPhilippe Mathieu-Daudé     trace_qemu_vfio_pci_write_config(buf, ofs, size,
2543d87c2d9SPhilippe Mathieu-Daudé                                      s->config_region_info.offset,
2553d87c2d9SPhilippe Mathieu-Daudé                                      s->config_region_info.size);
2563d87c2d9SPhilippe Mathieu-Daudé     assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
257*37b0b24eSNikita Ivanov     ret = RETRY_ON_EINTR(
258*37b0b24eSNikita Ivanov         pwrite(s->device, buf, size, s->config_region_info.offset + ofs)
259*37b0b24eSNikita Ivanov     );
260418026caSFam Zheng     return ret == size ? 0 : -errno;
261418026caSFam Zheng }
262418026caSFam Zheng 
2634487d420SEric Auger static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
2644487d420SEric Auger {
2654487d420SEric Auger     struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
2664487d420SEric Auger     struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
2674487d420SEric Auger     struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
2684487d420SEric Auger     int i;
2694487d420SEric Auger 
2704487d420SEric Auger     while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
2714487d420SEric Auger         if (!cap->next) {
2724487d420SEric Auger             return;
2734487d420SEric Auger         }
2743d558330SMarkus Armbruster         cap = buf + cap->next;
2754487d420SEric Auger     }
2764487d420SEric Auger 
2774487d420SEric Auger     cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
2784487d420SEric Auger 
2794487d420SEric Auger     s->nb_iova_ranges = cap_iova_range->nr_iovas;
2804487d420SEric Auger     if (s->nb_iova_ranges > 1) {
2814487d420SEric Auger         s->usable_iova_ranges =
282b21e2380SMarkus Armbruster             g_renew(struct IOVARange, s->usable_iova_ranges,
283b21e2380SMarkus Armbruster                     s->nb_iova_ranges);
2844487d420SEric Auger     }
2854487d420SEric Auger 
2864487d420SEric Auger     for (i = 0; i < s->nb_iova_ranges; i++) {
2874487d420SEric Auger         s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
2884487d420SEric Auger         s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
2894487d420SEric Auger     }
2904487d420SEric Auger }
2914487d420SEric Auger 
292418026caSFam Zheng static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
293418026caSFam Zheng                               Error **errp)
294418026caSFam Zheng {
295418026caSFam Zheng     int ret;
296418026caSFam Zheng     int i;
297418026caSFam Zheng     uint16_t pci_cmd;
298418026caSFam Zheng     struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
2994487d420SEric Auger     struct vfio_iommu_type1_info *iommu_info = NULL;
3004487d420SEric Auger     size_t iommu_info_size = sizeof(*iommu_info);
301418026caSFam Zheng     struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
302418026caSFam Zheng     char *group_file = NULL;
303418026caSFam Zheng 
3044487d420SEric Auger     s->usable_iova_ranges = NULL;
3054487d420SEric Auger 
306418026caSFam Zheng     /* Create a new container */
307418026caSFam Zheng     s->container = open("/dev/vfio/vfio", O_RDWR);
308418026caSFam Zheng 
309418026caSFam Zheng     if (s->container == -1) {
310418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
311418026caSFam Zheng         return -errno;
312418026caSFam Zheng     }
313418026caSFam Zheng     if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
314418026caSFam Zheng         error_setg(errp, "Invalid VFIO version");
315418026caSFam Zheng         ret = -EINVAL;
316418026caSFam Zheng         goto fail_container;
317418026caSFam Zheng     }
318418026caSFam Zheng 
319418026caSFam Zheng     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
320a4bc212aSPhilippe Mathieu-Daudé         error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
321418026caSFam Zheng         ret = -EINVAL;
322418026caSFam Zheng         goto fail_container;
323418026caSFam Zheng     }
324418026caSFam Zheng 
325418026caSFam Zheng     /* Open the group */
326418026caSFam Zheng     group_file = sysfs_find_group_file(device, errp);
327418026caSFam Zheng     if (!group_file) {
328418026caSFam Zheng         ret = -EINVAL;
329418026caSFam Zheng         goto fail_container;
330418026caSFam Zheng     }
331418026caSFam Zheng 
332418026caSFam Zheng     s->group = open(group_file, O_RDWR);
333418026caSFam Zheng     if (s->group == -1) {
334418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
335418026caSFam Zheng                          group_file);
336418026caSFam Zheng         g_free(group_file);
337418026caSFam Zheng         ret = -errno;
338418026caSFam Zheng         goto fail_container;
339418026caSFam Zheng     }
340418026caSFam Zheng     g_free(group_file);
341418026caSFam Zheng 
342418026caSFam Zheng     /* Test the group is viable and available */
343418026caSFam Zheng     if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
344418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get VFIO group status");
345418026caSFam Zheng         ret = -errno;
346418026caSFam Zheng         goto fail;
347418026caSFam Zheng     }
348418026caSFam Zheng 
349418026caSFam Zheng     if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
350418026caSFam Zheng         error_setg(errp, "VFIO group is not viable");
351418026caSFam Zheng         ret = -EINVAL;
352418026caSFam Zheng         goto fail;
353418026caSFam Zheng     }
354418026caSFam Zheng 
355418026caSFam Zheng     /* Add the group to the container */
356418026caSFam Zheng     if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
357418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to add group to VFIO container");
358418026caSFam Zheng         ret = -errno;
359418026caSFam Zheng         goto fail;
360418026caSFam Zheng     }
361418026caSFam Zheng 
362418026caSFam Zheng     /* Enable the IOMMU model we want */
363418026caSFam Zheng     if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
364418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
365418026caSFam Zheng         ret = -errno;
366418026caSFam Zheng         goto fail;
367418026caSFam Zheng     }
368418026caSFam Zheng 
3694487d420SEric Auger     iommu_info = g_malloc0(iommu_info_size);
3704487d420SEric Auger     iommu_info->argsz = iommu_info_size;
3714487d420SEric Auger 
372418026caSFam Zheng     /* Get additional IOMMU info */
3734487d420SEric Auger     if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
374418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get IOMMU info");
375418026caSFam Zheng         ret = -errno;
376418026caSFam Zheng         goto fail;
377418026caSFam Zheng     }
378418026caSFam Zheng 
3794487d420SEric Auger     /*
3804487d420SEric Auger      * if the kernel does not report usable IOVA regions, choose
3814487d420SEric Auger      * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
3824487d420SEric Auger      */
3834487d420SEric Auger     s->nb_iova_ranges = 1;
3844487d420SEric Auger     s->usable_iova_ranges = g_new0(struct IOVARange, 1);
3854487d420SEric Auger     s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
3864487d420SEric Auger     s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
3874487d420SEric Auger 
3884487d420SEric Auger     if (iommu_info->argsz > iommu_info_size) {
3894487d420SEric Auger         iommu_info_size = iommu_info->argsz;
3904487d420SEric Auger         iommu_info = g_realloc(iommu_info, iommu_info_size);
3914487d420SEric Auger         if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
3924487d420SEric Auger             ret = -errno;
3934487d420SEric Auger             goto fail;
3944487d420SEric Auger         }
3954487d420SEric Auger         collect_usable_iova_ranges(s, iommu_info);
3964487d420SEric Auger     }
3974487d420SEric Auger 
398418026caSFam Zheng     s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
399418026caSFam Zheng 
400418026caSFam Zheng     if (s->device < 0) {
401418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device fd");
402418026caSFam Zheng         ret = -errno;
403418026caSFam Zheng         goto fail;
404418026caSFam Zheng     }
405418026caSFam Zheng 
406418026caSFam Zheng     /* Test and setup the device */
407418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
408418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device info");
409418026caSFam Zheng         ret = -errno;
410418026caSFam Zheng         goto fail;
411418026caSFam Zheng     }
412418026caSFam Zheng 
413418026caSFam Zheng     if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
414418026caSFam Zheng         error_setg(errp, "Invalid device regions");
415418026caSFam Zheng         ret = -EINVAL;
416418026caSFam Zheng         goto fail;
417418026caSFam Zheng     }
418418026caSFam Zheng 
419418026caSFam Zheng     s->config_region_info = (struct vfio_region_info) {
420418026caSFam Zheng         .index = VFIO_PCI_CONFIG_REGION_INDEX,
421418026caSFam Zheng         .argsz = sizeof(struct vfio_region_info),
422418026caSFam Zheng     };
423418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
424418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get config region info");
425418026caSFam Zheng         ret = -errno;
426418026caSFam Zheng         goto fail;
427418026caSFam Zheng     }
428df058222SPhilippe Mathieu-Daudé     trace_qemu_vfio_region_info("config", s->config_region_info.offset,
429df058222SPhilippe Mathieu-Daudé                                 s->config_region_info.size,
430df058222SPhilippe Mathieu-Daudé                                 s->config_region_info.cap_offset);
431418026caSFam Zheng 
4329e722ebcSLi Qiang     for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
433418026caSFam Zheng         ret = qemu_vfio_pci_init_bar(s, i, errp);
434418026caSFam Zheng         if (ret) {
435418026caSFam Zheng             goto fail;
436418026caSFam Zheng         }
437418026caSFam Zheng     }
438418026caSFam Zheng 
439418026caSFam Zheng     /* Enable bus master */
440418026caSFam Zheng     ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
441418026caSFam Zheng     if (ret) {
442418026caSFam Zheng         goto fail;
443418026caSFam Zheng     }
444418026caSFam Zheng     pci_cmd |= PCI_COMMAND_MASTER;
445418026caSFam Zheng     ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
446418026caSFam Zheng     if (ret) {
447418026caSFam Zheng         goto fail;
448418026caSFam Zheng     }
4494487d420SEric Auger     g_free(iommu_info);
450418026caSFam Zheng     return 0;
451418026caSFam Zheng fail:
4524487d420SEric Auger     g_free(s->usable_iova_ranges);
4534487d420SEric Auger     s->usable_iova_ranges = NULL;
4544487d420SEric Auger     s->nb_iova_ranges = 0;
4554487d420SEric Auger     g_free(iommu_info);
456418026caSFam Zheng     close(s->group);
457418026caSFam Zheng fail_container:
458418026caSFam Zheng     close(s->container);
459418026caSFam Zheng     return ret;
460418026caSFam Zheng }
461418026caSFam Zheng 
4628f44304cSDavid Hildenbrand static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
4638f44304cSDavid Hildenbrand                                       size_t size, size_t max_size)
464418026caSFam Zheng {
465418026caSFam Zheng     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
466521b97cdSPhilippe Mathieu-Daudé     Error *local_err = NULL;
467082851a3SDavid Hildenbrand     int ret;
468082851a3SDavid Hildenbrand 
4698f44304cSDavid Hildenbrand     trace_qemu_vfio_ram_block_added(s, host, max_size);
470521b97cdSPhilippe Mathieu-Daudé     ret = qemu_vfio_dma_map(s, host, max_size, false, NULL, &local_err);
471082851a3SDavid Hildenbrand     if (ret) {
472521b97cdSPhilippe Mathieu-Daudé         error_reportf_err(local_err,
473521b97cdSPhilippe Mathieu-Daudé                           "qemu_vfio_dma_map(%p, %zu) failed: ",
474521b97cdSPhilippe Mathieu-Daudé                           host, max_size);
475082851a3SDavid Hildenbrand     }
476418026caSFam Zheng }
477418026caSFam Zheng 
4788f44304cSDavid Hildenbrand static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
4798f44304cSDavid Hildenbrand                                         size_t size, size_t max_size)
480418026caSFam Zheng {
481418026caSFam Zheng     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
482418026caSFam Zheng     if (host) {
4838f44304cSDavid Hildenbrand         trace_qemu_vfio_ram_block_removed(s, host, max_size);
484418026caSFam Zheng         qemu_vfio_dma_unmap(s, host);
485418026caSFam Zheng     }
486418026caSFam Zheng }
487418026caSFam Zheng 
488418026caSFam Zheng static void qemu_vfio_open_common(QEMUVFIOState *s)
489418026caSFam Zheng {
490549b50a3SMarkus Armbruster     qemu_mutex_init(&s->lock);
491418026caSFam Zheng     s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
492418026caSFam Zheng     s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
493418026caSFam Zheng     s->low_water_mark = QEMU_VFIO_IOVA_MIN;
494418026caSFam Zheng     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
495082851a3SDavid Hildenbrand     ram_block_notifier_add(&s->ram_notifier);
496418026caSFam Zheng }
497418026caSFam Zheng 
498418026caSFam Zheng /**
499418026caSFam Zheng  * Open a PCI device, e.g. "0000:00:01.0".
500418026caSFam Zheng  */
501418026caSFam Zheng QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
502418026caSFam Zheng {
503418026caSFam Zheng     int r;
504418026caSFam Zheng     QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
505418026caSFam Zheng 
506b430b513SDavid Hildenbrand     /*
507b430b513SDavid Hildenbrand      * VFIO may pin all memory inside mappings, resulting it in pinning
508b430b513SDavid Hildenbrand      * all memory inside RAM blocks unconditionally.
509b430b513SDavid Hildenbrand      */
510b430b513SDavid Hildenbrand     r = ram_block_discard_disable(true);
511b430b513SDavid Hildenbrand     if (r) {
512b430b513SDavid Hildenbrand         error_setg_errno(errp, -r, "Cannot set discarding of RAM broken");
513b430b513SDavid Hildenbrand         g_free(s);
514b430b513SDavid Hildenbrand         return NULL;
515b430b513SDavid Hildenbrand     }
516b430b513SDavid Hildenbrand 
517418026caSFam Zheng     r = qemu_vfio_init_pci(s, device, errp);
518418026caSFam Zheng     if (r) {
519b430b513SDavid Hildenbrand         ram_block_discard_disable(false);
520418026caSFam Zheng         g_free(s);
521418026caSFam Zheng         return NULL;
522418026caSFam Zheng     }
523418026caSFam Zheng     qemu_vfio_open_common(s);
524418026caSFam Zheng     return s;
525418026caSFam Zheng }
526418026caSFam Zheng 
527418026caSFam Zheng static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
528418026caSFam Zheng {
529f6b8104dSPhilippe Mathieu-Daudé     for (int i = 0; i < s->nr_mappings; ++i) {
530f6b8104dSPhilippe Mathieu-Daudé         trace_qemu_vfio_dump_mapping(s->mappings[i].host,
531f6b8104dSPhilippe Mathieu-Daudé                                      s->mappings[i].iova,
532f6b8104dSPhilippe Mathieu-Daudé                                      s->mappings[i].size);
533418026caSFam Zheng     }
534418026caSFam Zheng }
535418026caSFam Zheng 
536418026caSFam Zheng /**
537418026caSFam Zheng  * Find the mapping entry that contains [host, host + size) and set @index to
538418026caSFam Zheng  * the position. If no entry contains it, @index is the position _after_ which
539418026caSFam Zheng  * to insert the new mapping. IOW, it is the index of the largest element that
540418026caSFam Zheng  * is smaller than @host, or -1 if no entry is.
541418026caSFam Zheng  */
542418026caSFam Zheng static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
543418026caSFam Zheng                                            int *index)
544418026caSFam Zheng {
545418026caSFam Zheng     IOVAMapping *p = s->mappings;
546418026caSFam Zheng     IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
547418026caSFam Zheng     IOVAMapping *mid;
548418026caSFam Zheng     trace_qemu_vfio_find_mapping(s, host);
549418026caSFam Zheng     if (!p) {
550418026caSFam Zheng         *index = -1;
551418026caSFam Zheng         return NULL;
552418026caSFam Zheng     }
553418026caSFam Zheng     while (true) {
554418026caSFam Zheng         mid = p + (q - p) / 2;
555418026caSFam Zheng         if (mid == p) {
556418026caSFam Zheng             break;
557418026caSFam Zheng         }
558418026caSFam Zheng         if (mid->host > host) {
559418026caSFam Zheng             q = mid;
560418026caSFam Zheng         } else if (mid->host < host) {
561418026caSFam Zheng             p = mid;
562418026caSFam Zheng         } else {
563418026caSFam Zheng             break;
564418026caSFam Zheng         }
565418026caSFam Zheng     }
566418026caSFam Zheng     if (mid->host > host) {
567418026caSFam Zheng         mid--;
568418026caSFam Zheng     } else if (mid < &s->mappings[s->nr_mappings - 1]
569418026caSFam Zheng                && (mid + 1)->host <= host) {
570418026caSFam Zheng         mid++;
571418026caSFam Zheng     }
572418026caSFam Zheng     *index = mid - &s->mappings[0];
573418026caSFam Zheng     if (mid >= &s->mappings[0] &&
574418026caSFam Zheng         mid->host <= host && mid->host + mid->size > host) {
575418026caSFam Zheng         assert(mid < &s->mappings[s->nr_mappings]);
576418026caSFam Zheng         return mid;
577418026caSFam Zheng     }
578418026caSFam Zheng     /* At this point *index + 1 is the right position to insert the new
579418026caSFam Zheng      * mapping.*/
580418026caSFam Zheng     return NULL;
581418026caSFam Zheng }
582418026caSFam Zheng 
583418026caSFam Zheng /**
584a6da793aSPhilippe Mathieu-Daudé  * Allocate IOVA and create a new mapping record and insert it in @s.
585418026caSFam Zheng  */
586418026caSFam Zheng static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
587418026caSFam Zheng                                           void *host, size_t size,
588418026caSFam Zheng                                           int index, uint64_t iova)
589418026caSFam Zheng {
590418026caSFam Zheng     int shift;
591418026caSFam Zheng     IOVAMapping m = {.host = host, .size = size, .iova = iova};
592418026caSFam Zheng     IOVAMapping *insert;
593418026caSFam Zheng 
5948e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
5958e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size()));
5968e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size()));
597418026caSFam Zheng     trace_qemu_vfio_new_mapping(s, host, size, index, iova);
598418026caSFam Zheng 
599418026caSFam Zheng     assert(index >= 0);
600418026caSFam Zheng     s->nr_mappings++;
601d29eb678SOlaf Hering     s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
602418026caSFam Zheng     insert = &s->mappings[index];
603418026caSFam Zheng     shift = s->nr_mappings - index - 1;
604418026caSFam Zheng     if (shift) {
605418026caSFam Zheng         memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
606418026caSFam Zheng     }
607418026caSFam Zheng     *insert = m;
608418026caSFam Zheng     return insert;
609418026caSFam Zheng }
610418026caSFam Zheng 
611418026caSFam Zheng /* Do the DMA mapping with VFIO. */
612418026caSFam Zheng static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
613f38b376dSPhilippe Mathieu-Daudé                                 uint64_t iova, Error **errp)
614418026caSFam Zheng {
615418026caSFam Zheng     struct vfio_iommu_type1_dma_map dma_map = {
616418026caSFam Zheng         .argsz = sizeof(dma_map),
617418026caSFam Zheng         .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
618418026caSFam Zheng         .iova = iova,
619418026caSFam Zheng         .vaddr = (uintptr_t)host,
620418026caSFam Zheng         .size = size,
621418026caSFam Zheng     };
6224c946b22SPhilippe Mathieu-Daudé     trace_qemu_vfio_do_mapping(s, host, iova, size);
623418026caSFam Zheng 
624418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
625f38b376dSPhilippe Mathieu-Daudé         error_setg_errno(errp, errno, "VFIO_MAP_DMA failed");
626418026caSFam Zheng         return -errno;
627418026caSFam Zheng     }
628418026caSFam Zheng     return 0;
629418026caSFam Zheng }
630418026caSFam Zheng 
631418026caSFam Zheng /**
632418026caSFam Zheng  * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
633418026caSFam Zheng  */
634418026caSFam Zheng static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
635418026caSFam Zheng                                    Error **errp)
636418026caSFam Zheng {
637418026caSFam Zheng     int index;
638418026caSFam Zheng     struct vfio_iommu_type1_dma_unmap unmap = {
639418026caSFam Zheng         .argsz = sizeof(unmap),
640418026caSFam Zheng         .flags = 0,
641418026caSFam Zheng         .iova = mapping->iova,
642418026caSFam Zheng         .size = mapping->size,
643418026caSFam Zheng     };
644418026caSFam Zheng 
645418026caSFam Zheng     index = mapping - s->mappings;
646418026caSFam Zheng     assert(mapping->size > 0);
6478e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size()));
648418026caSFam Zheng     assert(index >= 0 && index < s->nr_mappings);
649418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
650b09d51c9SMichal Privoznik         error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
651418026caSFam Zheng     }
652418026caSFam Zheng     memmove(mapping, &s->mappings[index + 1],
653418026caSFam Zheng             sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
654418026caSFam Zheng     s->nr_mappings--;
655d29eb678SOlaf Hering     s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
656418026caSFam Zheng }
657418026caSFam Zheng 
658418026caSFam Zheng /* Check if the mapping list is (ascending) ordered. */
659418026caSFam Zheng static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
660418026caSFam Zheng {
661418026caSFam Zheng     int i;
662418026caSFam Zheng     if (QEMU_VFIO_DEBUG) {
663418026caSFam Zheng         for (i = 0; i < s->nr_mappings - 1; ++i) {
664418026caSFam Zheng             if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
665cb49dfceSPhilippe Mathieu-Daudé                 error_report("item %d not sorted!", i);
666418026caSFam Zheng                 qemu_vfio_dump_mappings(s);
667418026caSFam Zheng                 return false;
668418026caSFam Zheng             }
669418026caSFam Zheng             if (!(s->mappings[i].host + s->mappings[i].size <=
670418026caSFam Zheng                   s->mappings[i + 1].host)) {
671cb49dfceSPhilippe Mathieu-Daudé                 error_report("item %d overlap with next!", i);
672418026caSFam Zheng                 qemu_vfio_dump_mappings(s);
673418026caSFam Zheng                 return false;
674418026caSFam Zheng             }
675418026caSFam Zheng         }
676418026caSFam Zheng     }
677418026caSFam Zheng     return true;
678418026caSFam Zheng }
679418026caSFam Zheng 
680453095e9SPhilippe Mathieu-Daudé static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
681453095e9SPhilippe Mathieu-Daudé                                       uint64_t *iova, Error **errp)
6829ab57411SEric Auger {
6839ab57411SEric Auger     int i;
6849ab57411SEric Auger 
6859ab57411SEric Auger     for (i = 0; i < s->nb_iova_ranges; i++) {
6869ab57411SEric Auger         if (s->usable_iova_ranges[i].end < s->low_water_mark) {
6879ab57411SEric Auger             continue;
6889ab57411SEric Auger         }
6899ab57411SEric Auger         s->low_water_mark =
6909ab57411SEric Auger             MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
6919ab57411SEric Auger 
6929ab57411SEric Auger         if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
6939ab57411SEric Auger             s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
6949ab57411SEric Auger             *iova = s->low_water_mark;
6959ab57411SEric Auger             s->low_water_mark += size;
696453095e9SPhilippe Mathieu-Daudé             return true;
6979ab57411SEric Auger         }
6989ab57411SEric Auger     }
699453095e9SPhilippe Mathieu-Daudé     error_setg(errp, "fixed iova range not found");
700453095e9SPhilippe Mathieu-Daudé 
701453095e9SPhilippe Mathieu-Daudé     return false;
7029ab57411SEric Auger }
7039ab57411SEric Auger 
704453095e9SPhilippe Mathieu-Daudé static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
705453095e9SPhilippe Mathieu-Daudé                                      uint64_t *iova, Error **errp)
7069ab57411SEric Auger {
7079ab57411SEric Auger     int i;
7089ab57411SEric Auger 
7099ab57411SEric Auger     for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
7109ab57411SEric Auger         if (s->usable_iova_ranges[i].start > s->high_water_mark) {
7119ab57411SEric Auger             continue;
7129ab57411SEric Auger         }
7139ab57411SEric Auger         s->high_water_mark =
7149ab57411SEric Auger             MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
7159ab57411SEric Auger 
7169ab57411SEric Auger         if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
7179ab57411SEric Auger             s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
7189ab57411SEric Auger             *iova = s->high_water_mark - size;
7199ab57411SEric Auger             s->high_water_mark = *iova;
720453095e9SPhilippe Mathieu-Daudé             return true;
7219ab57411SEric Auger         }
7229ab57411SEric Auger     }
723453095e9SPhilippe Mathieu-Daudé     error_setg(errp, "temporary iova range not found");
724453095e9SPhilippe Mathieu-Daudé 
725453095e9SPhilippe Mathieu-Daudé     return false;
7269ab57411SEric Auger }
7279ab57411SEric Auger 
72871e3038cSPhilippe Mathieu-Daudé /**
72971e3038cSPhilippe Mathieu-Daudé  * qemu_vfio_water_mark_reached:
73071e3038cSPhilippe Mathieu-Daudé  *
73171e3038cSPhilippe Mathieu-Daudé  * Returns %true if high watermark has been reached, %false otherwise.
73271e3038cSPhilippe Mathieu-Daudé  */
73371e3038cSPhilippe Mathieu-Daudé static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size,
73471e3038cSPhilippe Mathieu-Daudé                                          Error **errp)
73571e3038cSPhilippe Mathieu-Daudé {
73671e3038cSPhilippe Mathieu-Daudé     if (s->high_water_mark - s->low_water_mark + 1 < size) {
73771e3038cSPhilippe Mathieu-Daudé         error_setg(errp, "iova exhausted (water mark reached)");
73871e3038cSPhilippe Mathieu-Daudé         return true;
73971e3038cSPhilippe Mathieu-Daudé     }
74071e3038cSPhilippe Mathieu-Daudé     return false;
74171e3038cSPhilippe Mathieu-Daudé }
74271e3038cSPhilippe Mathieu-Daudé 
743418026caSFam Zheng /* Map [host, host + size) area into a contiguous IOVA address space, and store
744418026caSFam Zheng  * the result in @iova if not NULL. The caller need to make sure the area is
745418026caSFam Zheng  * aligned to page size, and mustn't overlap with existing mapping areas (split
746418026caSFam Zheng  * mapping status within this area is not allowed).
747418026caSFam Zheng  */
748418026caSFam Zheng int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
749521b97cdSPhilippe Mathieu-Daudé                       bool temporary, uint64_t *iova, Error **errp)
750418026caSFam Zheng {
751418026caSFam Zheng     int index;
752418026caSFam Zheng     IOVAMapping *mapping;
753418026caSFam Zheng     uint64_t iova0;
754418026caSFam Zheng 
7558e3b0cbbSMarc-André Lureau     assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size()));
7568e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
757418026caSFam Zheng     trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
758a990858bSPhilippe Mathieu-Daudé     QEMU_LOCK_GUARD(&s->lock);
759418026caSFam Zheng     mapping = qemu_vfio_find_mapping(s, host, &index);
760418026caSFam Zheng     if (mapping) {
761418026caSFam Zheng         iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
762418026caSFam Zheng     } else {
7635a4f1626SPhilippe Mathieu-Daudé         int ret;
7645a4f1626SPhilippe Mathieu-Daudé 
76571e3038cSPhilippe Mathieu-Daudé         if (qemu_vfio_water_mark_reached(s, size, errp)) {
7665a4f1626SPhilippe Mathieu-Daudé             return -ENOMEM;
767418026caSFam Zheng         }
768418026caSFam Zheng         if (!temporary) {
769453095e9SPhilippe Mathieu-Daudé             if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) {
7705a4f1626SPhilippe Mathieu-Daudé                 return -ENOMEM;
7719ab57411SEric Auger             }
7729ab57411SEric Auger 
773418026caSFam Zheng             mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
774418026caSFam Zheng             assert(qemu_vfio_verify_mappings(s));
775f38b376dSPhilippe Mathieu-Daudé             ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
7765a4f1626SPhilippe Mathieu-Daudé             if (ret < 0) {
777418026caSFam Zheng                 qemu_vfio_undo_mapping(s, mapping, NULL);
7785a4f1626SPhilippe Mathieu-Daudé                 return ret;
779418026caSFam Zheng             }
780418026caSFam Zheng             qemu_vfio_dump_mappings(s);
781418026caSFam Zheng         } else {
782453095e9SPhilippe Mathieu-Daudé             if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) {
7835a4f1626SPhilippe Mathieu-Daudé                 return -ENOMEM;
7849ab57411SEric Auger             }
785f38b376dSPhilippe Mathieu-Daudé             ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
7865a4f1626SPhilippe Mathieu-Daudé             if (ret < 0) {
7875a4f1626SPhilippe Mathieu-Daudé                 return ret;
788418026caSFam Zheng             }
789418026caSFam Zheng         }
790418026caSFam Zheng     }
7914c946b22SPhilippe Mathieu-Daudé     trace_qemu_vfio_dma_mapped(s, host, iova0, size);
792418026caSFam Zheng     if (iova) {
793418026caSFam Zheng         *iova = iova0;
794418026caSFam Zheng     }
7955a4f1626SPhilippe Mathieu-Daudé     return 0;
796418026caSFam Zheng }
797418026caSFam Zheng 
798418026caSFam Zheng /* Reset the high watermark and free all "temporary" mappings. */
799418026caSFam Zheng int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
800418026caSFam Zheng {
801418026caSFam Zheng     struct vfio_iommu_type1_dma_unmap unmap = {
802418026caSFam Zheng         .argsz = sizeof(unmap),
803418026caSFam Zheng         .flags = 0,
804418026caSFam Zheng         .iova = s->high_water_mark,
805418026caSFam Zheng         .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
806418026caSFam Zheng     };
807418026caSFam Zheng     trace_qemu_vfio_dma_reset_temporary(s);
8086e8a355dSDaniel Brodsky     QEMU_LOCK_GUARD(&s->lock);
809418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
810b09d51c9SMichal Privoznik         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
811418026caSFam Zheng         return -errno;
812418026caSFam Zheng     }
813418026caSFam Zheng     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
814418026caSFam Zheng     return 0;
815418026caSFam Zheng }
816418026caSFam Zheng 
817418026caSFam Zheng /* Unmapping the whole area that was previously mapped with
818418026caSFam Zheng  * qemu_vfio_dma_map(). */
819418026caSFam Zheng void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
820418026caSFam Zheng {
821418026caSFam Zheng     int index = 0;
822418026caSFam Zheng     IOVAMapping *m;
823418026caSFam Zheng 
824418026caSFam Zheng     if (!host) {
825418026caSFam Zheng         return;
826418026caSFam Zheng     }
827418026caSFam Zheng 
828418026caSFam Zheng     trace_qemu_vfio_dma_unmap(s, host);
829a990858bSPhilippe Mathieu-Daudé     QEMU_LOCK_GUARD(&s->lock);
830418026caSFam Zheng     m = qemu_vfio_find_mapping(s, host, &index);
831418026caSFam Zheng     if (!m) {
832a990858bSPhilippe Mathieu-Daudé         return;
833418026caSFam Zheng     }
834418026caSFam Zheng     qemu_vfio_undo_mapping(s, m, NULL);
835418026caSFam Zheng }
836418026caSFam Zheng 
837418026caSFam Zheng static void qemu_vfio_reset(QEMUVFIOState *s)
838418026caSFam Zheng {
839418026caSFam Zheng     ioctl(s->device, VFIO_DEVICE_RESET);
840418026caSFam Zheng }
841418026caSFam Zheng 
842418026caSFam Zheng /* Close and free the VFIO resources. */
843418026caSFam Zheng void qemu_vfio_close(QEMUVFIOState *s)
844418026caSFam Zheng {
845418026caSFam Zheng     int i;
846418026caSFam Zheng 
847418026caSFam Zheng     if (!s) {
848418026caSFam Zheng         return;
849418026caSFam Zheng     }
8501f0fea38SStefan Hajnoczi 
8511f0fea38SStefan Hajnoczi     ram_block_notifier_remove(&s->ram_notifier);
8521f0fea38SStefan Hajnoczi 
853418026caSFam Zheng     for (i = 0; i < s->nr_mappings; ++i) {
854418026caSFam Zheng         qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
855418026caSFam Zheng     }
8561f0fea38SStefan Hajnoczi 
8574487d420SEric Auger     g_free(s->usable_iova_ranges);
8584487d420SEric Auger     s->nb_iova_ranges = 0;
859418026caSFam Zheng     qemu_vfio_reset(s);
860418026caSFam Zheng     close(s->device);
861418026caSFam Zheng     close(s->group);
862418026caSFam Zheng     close(s->container);
863b430b513SDavid Hildenbrand     ram_block_discard_disable(false);
864418026caSFam Zheng }
865