xref: /openbmc/qemu/util/vfio-helpers.c (revision dbdea0db)
1418026caSFam Zheng /*
2418026caSFam Zheng  * VFIO utility
3418026caSFam Zheng  *
4418026caSFam Zheng  * Copyright 2016 - 2018 Red Hat, Inc.
5418026caSFam Zheng  *
6418026caSFam Zheng  * Authors:
7418026caSFam Zheng  *   Fam Zheng <famz@redhat.com>
8418026caSFam Zheng  *
9418026caSFam Zheng  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10418026caSFam Zheng  * See the COPYING file in the top-level directory.
11418026caSFam Zheng  */
12418026caSFam Zheng 
13418026caSFam Zheng #include "qemu/osdep.h"
14418026caSFam Zheng #include <sys/ioctl.h>
15418026caSFam Zheng #include <linux/vfio.h>
16418026caSFam Zheng #include "qapi/error.h"
17418026caSFam Zheng #include "exec/ramlist.h"
18418026caSFam Zheng #include "exec/cpu-common.h"
19b430b513SDavid Hildenbrand #include "exec/memory.h"
20418026caSFam Zheng #include "trace.h"
21418026caSFam Zheng #include "qemu/error-report.h"
22418026caSFam Zheng #include "standard-headers/linux/pci_regs.h"
23418026caSFam Zheng #include "qemu/event_notifier.h"
24418026caSFam Zheng #include "qemu/vfio-helpers.h"
256e8a355dSDaniel Brodsky #include "qemu/lockable.h"
26418026caSFam Zheng #include "trace.h"
27418026caSFam Zheng 
/* Debug flag for this file; defined as 0 here. */
#define QEMU_VFIO_DEBUG 0

/* Lowest IOVA handed out; addresses below it are reserved as invalid. */
#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/*
 * XXX: This upper bound should become a runtime limit once VFIO exposes the
 * IOVA bit width in the IOMMU capability interface; platform specific
 * detection by reading sysfs entries would be an alternative. Until then,
 * 39 bits is a safe bet.
 */
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
36418026caSFam Zheng 
/* One host memory range together with the IOVA it is mapped at. */
typedef struct {
    /* Start of the range in host virtual memory; page aligned. */
    void *host;
    /* Length of the range, in bytes. */
    size_t size;
    /* I/O virtual address recorded for @host. */
    uint64_t iova;
} IOVAMapping;
43418026caSFam Zheng 
/* A contiguous span of usable IOVAs; both bounds are inclusive. */
struct IOVARange {
    /* First usable address of the span. */
    uint64_t start;
    /* Last usable address of the span. */
    uint64_t end;
};
484487d420SEric Auger 
49418026caSFam Zheng struct QEMUVFIOState {
50418026caSFam Zheng     QemuMutex lock;
51418026caSFam Zheng 
52418026caSFam Zheng     /* These fields are protected by BQL */
53418026caSFam Zheng     int container;
54418026caSFam Zheng     int group;
55418026caSFam Zheng     int device;
56418026caSFam Zheng     RAMBlockNotifier ram_notifier;
57418026caSFam Zheng     struct vfio_region_info config_region_info, bar_region_info[6];
584487d420SEric Auger     struct IOVARange *usable_iova_ranges;
594487d420SEric Auger     uint8_t nb_iova_ranges;
60418026caSFam Zheng 
61418026caSFam Zheng     /* These fields are protected by @lock */
62418026caSFam Zheng     /* VFIO's IO virtual address space is managed by splitting into a few
63418026caSFam Zheng      * sections:
64418026caSFam Zheng      *
65418026caSFam Zheng      * ---------------       <= 0
66418026caSFam Zheng      * |xxxxxxxxxxxxx|
67418026caSFam Zheng      * |-------------|       <= QEMU_VFIO_IOVA_MIN
68418026caSFam Zheng      * |             |
69418026caSFam Zheng      * |    Fixed    |
70418026caSFam Zheng      * |             |
71418026caSFam Zheng      * |-------------|       <= low_water_mark
72418026caSFam Zheng      * |             |
73418026caSFam Zheng      * |    Free     |
74418026caSFam Zheng      * |             |
75418026caSFam Zheng      * |-------------|       <= high_water_mark
76418026caSFam Zheng      * |             |
77418026caSFam Zheng      * |    Temp     |
78418026caSFam Zheng      * |             |
79418026caSFam Zheng      * |-------------|       <= QEMU_VFIO_IOVA_MAX
80418026caSFam Zheng      * |xxxxxxxxxxxxx|
81418026caSFam Zheng      * |xxxxxxxxxxxxx|
82418026caSFam Zheng      * ---------------
83418026caSFam Zheng      *
84418026caSFam Zheng      * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
85418026caSFam Zheng      *
86418026caSFam Zheng      * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
87418026caSFam Zheng      *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
88418026caSFam Zheng      *   reclaimed - low_water_mark never shrinks;
89418026caSFam Zheng      *
90418026caSFam Zheng      * - IOVAs in range [low_water_mark, high_water_mark) are free;
91418026caSFam Zheng      *
92418026caSFam Zheng      * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
93418026caSFam Zheng      *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
94418026caSFam Zheng      *   is recycled. The caller should make sure I/O's depending on these
95418026caSFam Zheng      *   mappings are completed before calling.
96418026caSFam Zheng      **/
97418026caSFam Zheng     uint64_t low_water_mark;
98418026caSFam Zheng     uint64_t high_water_mark;
99418026caSFam Zheng     IOVAMapping *mappings;
100418026caSFam Zheng     int nr_mappings;
101418026caSFam Zheng };
102418026caSFam Zheng 
103418026caSFam Zheng /**
104418026caSFam Zheng  * Find group file by PCI device address as specified @device, and return the
105418026caSFam Zheng  * path. The returned string is owned by caller and should be g_free'ed later.
106418026caSFam Zheng  */
sysfs_find_group_file(const char * device,Error ** errp)107418026caSFam Zheng static char *sysfs_find_group_file(const char *device, Error **errp)
108418026caSFam Zheng {
109*dbdea0dbSAkihiko Odaki     g_autoptr(GError) gerr = NULL;
110418026caSFam Zheng     char *sysfs_link;
111418026caSFam Zheng     char *sysfs_group;
112418026caSFam Zheng     char *p;
113418026caSFam Zheng     char *path = NULL;
114418026caSFam Zheng 
115418026caSFam Zheng     sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
116*dbdea0dbSAkihiko Odaki     sysfs_group = g_file_read_link(sysfs_link, &gerr);
117*dbdea0dbSAkihiko Odaki     if (gerr) {
118*dbdea0dbSAkihiko Odaki         error_setg(errp, "Failed to find iommu group sysfs path: %s",
119*dbdea0dbSAkihiko Odaki                    gerr->message);
120418026caSFam Zheng         goto out;
121418026caSFam Zheng     }
122418026caSFam Zheng     p = strrchr(sysfs_group, '/');
123418026caSFam Zheng     if (!p) {
124418026caSFam Zheng         error_setg(errp, "Failed to find iommu group number");
125418026caSFam Zheng         goto out;
126418026caSFam Zheng     }
127418026caSFam Zheng 
128418026caSFam Zheng     path = g_strdup_printf("/dev/vfio/%s", p + 1);
129418026caSFam Zheng out:
130418026caSFam Zheng     g_free(sysfs_link);
131418026caSFam Zheng     g_free(sysfs_group);
132418026caSFam Zheng     return path;
133418026caSFam Zheng }
134418026caSFam Zheng 
/* Abort unless @index names one of the entries of s->bar_region_info. */
static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    assert(0 <= index && index < ARRAY_SIZE(s->bar_region_info));
}
139418026caSFam Zheng 
qemu_vfio_pci_init_bar(QEMUVFIOState * s,int index,Error ** errp)140418026caSFam Zheng static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
141418026caSFam Zheng {
142df058222SPhilippe Mathieu-Daudé     g_autofree char *barname = NULL;
143418026caSFam Zheng     assert_bar_index_valid(s, index);
144418026caSFam Zheng     s->bar_region_info[index] = (struct vfio_region_info) {
145418026caSFam Zheng         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
146418026caSFam Zheng         .argsz = sizeof(struct vfio_region_info),
147418026caSFam Zheng     };
148418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
149418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get BAR region info");
150418026caSFam Zheng         return -errno;
151418026caSFam Zheng     }
152df058222SPhilippe Mathieu-Daudé     barname = g_strdup_printf("bar[%d]", index);
153df058222SPhilippe Mathieu-Daudé     trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
154df058222SPhilippe Mathieu-Daudé                                 s->bar_region_info[index].size,
155df058222SPhilippe Mathieu-Daudé                                 s->bar_region_info[index].cap_offset);
156418026caSFam Zheng 
157418026caSFam Zheng     return 0;
158418026caSFam Zheng }
159418026caSFam Zheng 
160418026caSFam Zheng /**
161418026caSFam Zheng  * Map a PCI bar area.
162418026caSFam Zheng  */
qemu_vfio_pci_map_bar(QEMUVFIOState * s,int index,uint64_t offset,uint64_t size,int prot,Error ** errp)163418026caSFam Zheng void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
164b02c01a5SPhilippe Mathieu-Daudé                             uint64_t offset, uint64_t size, int prot,
165418026caSFam Zheng                             Error **errp)
166418026caSFam Zheng {
167418026caSFam Zheng     void *p;
1688e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size()));
169418026caSFam Zheng     assert_bar_index_valid(s, index);
170418026caSFam Zheng     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
171b02c01a5SPhilippe Mathieu-Daudé              prot, MAP_SHARED,
172418026caSFam Zheng              s->device, s->bar_region_info[index].offset + offset);
1732817fbceSPhilippe Mathieu-Daudé     trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
1742817fbceSPhilippe Mathieu-Daudé                                 size, offset, p);
175418026caSFam Zheng     if (p == MAP_FAILED) {
176418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to map BAR region");
177418026caSFam Zheng         p = NULL;
178418026caSFam Zheng     }
179418026caSFam Zheng     return p;
180418026caSFam Zheng }
181418026caSFam Zheng 
182418026caSFam Zheng /**
183418026caSFam Zheng  * Unmap a PCI bar area.
184418026caSFam Zheng  */
qemu_vfio_pci_unmap_bar(QEMUVFIOState * s,int index,void * bar,uint64_t offset,uint64_t size)185418026caSFam Zheng void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
186418026caSFam Zheng                              uint64_t offset, uint64_t size)
187418026caSFam Zheng {
188418026caSFam Zheng     if (bar) {
189418026caSFam Zheng         munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
190418026caSFam Zheng     }
191418026caSFam Zheng }
192418026caSFam Zheng 
193418026caSFam Zheng /**
194a6da793aSPhilippe Mathieu-Daudé  * Initialize device IRQ with @irq_type and register an event notifier.
195418026caSFam Zheng  */
qemu_vfio_pci_init_irq(QEMUVFIOState * s,EventNotifier * e,int irq_type,Error ** errp)196418026caSFam Zheng int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
197418026caSFam Zheng                            int irq_type, Error **errp)
198418026caSFam Zheng {
199418026caSFam Zheng     int r;
200418026caSFam Zheng     struct vfio_irq_set *irq_set;
201418026caSFam Zheng     size_t irq_set_size;
202418026caSFam Zheng     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
203418026caSFam Zheng 
204418026caSFam Zheng     irq_info.index = irq_type;
205418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
206418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device interrupt info");
207418026caSFam Zheng         return -errno;
208418026caSFam Zheng     }
209418026caSFam Zheng     if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
210418026caSFam Zheng         error_setg(errp, "Device interrupt doesn't support eventfd");
211418026caSFam Zheng         return -EINVAL;
212418026caSFam Zheng     }
213418026caSFam Zheng 
214418026caSFam Zheng     irq_set_size = sizeof(*irq_set) + sizeof(int);
215418026caSFam Zheng     irq_set = g_malloc0(irq_set_size);
216418026caSFam Zheng 
217418026caSFam Zheng     /* Get to a known IRQ state */
218418026caSFam Zheng     *irq_set = (struct vfio_irq_set) {
219418026caSFam Zheng         .argsz = irq_set_size,
220418026caSFam Zheng         .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
221418026caSFam Zheng         .index = irq_info.index,
222418026caSFam Zheng         .start = 0,
223418026caSFam Zheng         .count = 1,
224418026caSFam Zheng     };
225418026caSFam Zheng 
226418026caSFam Zheng     *(int *)&irq_set->data = event_notifier_get_fd(e);
227418026caSFam Zheng     r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
228418026caSFam Zheng     g_free(irq_set);
229418026caSFam Zheng     if (r) {
230418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to setup device interrupt");
231418026caSFam Zheng         return -errno;
232418026caSFam Zheng     }
233418026caSFam Zheng     return 0;
234418026caSFam Zheng }
235418026caSFam Zheng 
/**
 * Read @size bytes of PCI configuration space, starting at offset @ofs, into
 * @buf.
 *
 * The resulting file offset must be aligned to @size. The read is retried
 * when it is interrupted by a signal.
 *
 * Returns: 0 if exactly @size bytes were read, a negative errno value if
 * pread() failed, or -EIO on a short read.
 */
static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
                                     int size, int ofs)
{
    ssize_t ret;

    trace_qemu_vfio_pci_read_config(buf, ofs, size,
                                    s->config_region_info.offset,
                                    s->config_region_info.size);
    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
    ret = RETRY_ON_EINTR(
        pread(s->device, buf, size, s->config_region_info.offset + ofs)
    );
    if (ret < 0) {
        return -errno;
    }
    if (ret != size) {
        /*
         * A short read does not set errno, so returning -errno here could
         * yield a stale code or even 0 (success).
         */
        return -EIO;
    }
    return 0;
}
250418026caSFam Zheng 
/**
 * Write @size bytes from @buf to PCI configuration space, starting at offset
 * @ofs.
 *
 * The resulting file offset must be aligned to @size. The write is retried
 * when it is interrupted by a signal.
 *
 * Returns: 0 if exactly @size bytes were written, a negative errno value if
 * pwrite() failed, or -EIO on a short write.
 */
static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
    ssize_t ret;

    trace_qemu_vfio_pci_write_config(buf, ofs, size,
                                     s->config_region_info.offset,
                                     s->config_region_info.size);
    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
    ret = RETRY_ON_EINTR(
        pwrite(s->device, buf, size, s->config_region_info.offset + ofs)
    );
    if (ret < 0) {
        return -errno;
    }
    if (ret != size) {
        /*
         * A short write does not set errno, so returning -errno here could
         * yield a stale code or even 0 (success).
         */
        return -EIO;
    }
    return 0;
}
264418026caSFam Zheng 
/**
 * Copy the usable IOVA ranges reported by the kernel into @s.
 *
 * @buf holds the result of a VFIO_IOMMU_GET_INFO ioctl, i.e. a
 * struct vfio_iommu_type1_info followed by its capability chain. When the
 * chain is absent or has no IOVA range capability, @s is left untouched.
 * Otherwise s->nb_iova_ranges and s->usable_iova_ranges are overwritten;
 * the array is grown as needed and must already hold at least one element.
 */
static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
{
    struct vfio_iommu_type1_info *info = buf;
    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    struct vfio_info_cap_header *cap;
    int i;

    /*
     * cap_offset is only meaningful when the kernel flags a capability
     * chain; without this check the info header itself would be parsed as a
     * capability.
     */
    if (!(info->flags & VFIO_IOMMU_INFO_CAPS) || !info->cap_offset) {
        return;
    }

    /* Offsets in the chain are relative to the start of @buf. */
    cap = (void *)((char *)buf + info->cap_offset);
    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
        if (!cap->next) {
            return;
        }
        cap = (void *)((char *)buf + cap->next);
    }

    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;

    /*
     * nb_iova_ranges is only 8 bits wide while nr_iovas is 32 bits wide:
     * clamp instead of silently truncating (256 ranges would otherwise be
     * recorded as 0).
     */
    s->nb_iova_ranges = MIN(cap_iova_range->nr_iovas, UINT8_MAX);
    if (s->nb_iova_ranges > 1) {
        s->usable_iova_ranges =
            g_renew(struct IOVARange, s->usable_iova_ranges,
                    s->nb_iova_ranges);
    }

    for (i = 0; i < s->nb_iova_ranges; i++) {
        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    }
}
2934487d420SEric Auger 
qemu_vfio_init_pci(QEMUVFIOState * s,const char * device,Error ** errp)294418026caSFam Zheng static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
295418026caSFam Zheng                               Error **errp)
296418026caSFam Zheng {
297418026caSFam Zheng     int ret;
298418026caSFam Zheng     int i;
299418026caSFam Zheng     uint16_t pci_cmd;
300418026caSFam Zheng     struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
3014487d420SEric Auger     struct vfio_iommu_type1_info *iommu_info = NULL;
3024487d420SEric Auger     size_t iommu_info_size = sizeof(*iommu_info);
303418026caSFam Zheng     struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
304418026caSFam Zheng     char *group_file = NULL;
305418026caSFam Zheng 
3064487d420SEric Auger     s->usable_iova_ranges = NULL;
3074487d420SEric Auger 
308418026caSFam Zheng     /* Create a new container */
309418026caSFam Zheng     s->container = open("/dev/vfio/vfio", O_RDWR);
310418026caSFam Zheng 
311418026caSFam Zheng     if (s->container == -1) {
312418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
313418026caSFam Zheng         return -errno;
314418026caSFam Zheng     }
315418026caSFam Zheng     if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
316418026caSFam Zheng         error_setg(errp, "Invalid VFIO version");
317418026caSFam Zheng         ret = -EINVAL;
318418026caSFam Zheng         goto fail_container;
319418026caSFam Zheng     }
320418026caSFam Zheng 
321418026caSFam Zheng     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
322a4bc212aSPhilippe Mathieu-Daudé         error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
323418026caSFam Zheng         ret = -EINVAL;
324418026caSFam Zheng         goto fail_container;
325418026caSFam Zheng     }
326418026caSFam Zheng 
327418026caSFam Zheng     /* Open the group */
328418026caSFam Zheng     group_file = sysfs_find_group_file(device, errp);
329418026caSFam Zheng     if (!group_file) {
330418026caSFam Zheng         ret = -EINVAL;
331418026caSFam Zheng         goto fail_container;
332418026caSFam Zheng     }
333418026caSFam Zheng 
334418026caSFam Zheng     s->group = open(group_file, O_RDWR);
335418026caSFam Zheng     if (s->group == -1) {
336418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
337418026caSFam Zheng                          group_file);
338418026caSFam Zheng         g_free(group_file);
339418026caSFam Zheng         ret = -errno;
340418026caSFam Zheng         goto fail_container;
341418026caSFam Zheng     }
342418026caSFam Zheng     g_free(group_file);
343418026caSFam Zheng 
344418026caSFam Zheng     /* Test the group is viable and available */
345418026caSFam Zheng     if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
346418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get VFIO group status");
347418026caSFam Zheng         ret = -errno;
348418026caSFam Zheng         goto fail;
349418026caSFam Zheng     }
350418026caSFam Zheng 
351418026caSFam Zheng     if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
352418026caSFam Zheng         error_setg(errp, "VFIO group is not viable");
353418026caSFam Zheng         ret = -EINVAL;
354418026caSFam Zheng         goto fail;
355418026caSFam Zheng     }
356418026caSFam Zheng 
357418026caSFam Zheng     /* Add the group to the container */
358418026caSFam Zheng     if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
359418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to add group to VFIO container");
360418026caSFam Zheng         ret = -errno;
361418026caSFam Zheng         goto fail;
362418026caSFam Zheng     }
363418026caSFam Zheng 
364418026caSFam Zheng     /* Enable the IOMMU model we want */
365418026caSFam Zheng     if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
366418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
367418026caSFam Zheng         ret = -errno;
368418026caSFam Zheng         goto fail;
369418026caSFam Zheng     }
370418026caSFam Zheng 
3714487d420SEric Auger     iommu_info = g_malloc0(iommu_info_size);
3724487d420SEric Auger     iommu_info->argsz = iommu_info_size;
3734487d420SEric Auger 
374418026caSFam Zheng     /* Get additional IOMMU info */
3754487d420SEric Auger     if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
376418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get IOMMU info");
377418026caSFam Zheng         ret = -errno;
378418026caSFam Zheng         goto fail;
379418026caSFam Zheng     }
380418026caSFam Zheng 
3814487d420SEric Auger     /*
3824487d420SEric Auger      * if the kernel does not report usable IOVA regions, choose
3834487d420SEric Auger      * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
3844487d420SEric Auger      */
3854487d420SEric Auger     s->nb_iova_ranges = 1;
3864487d420SEric Auger     s->usable_iova_ranges = g_new0(struct IOVARange, 1);
3874487d420SEric Auger     s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
3884487d420SEric Auger     s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
3894487d420SEric Auger 
3904487d420SEric Auger     if (iommu_info->argsz > iommu_info_size) {
3914487d420SEric Auger         iommu_info_size = iommu_info->argsz;
3924487d420SEric Auger         iommu_info = g_realloc(iommu_info, iommu_info_size);
3934487d420SEric Auger         if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
3944487d420SEric Auger             ret = -errno;
3954487d420SEric Auger             goto fail;
3964487d420SEric Auger         }
3974487d420SEric Auger         collect_usable_iova_ranges(s, iommu_info);
3984487d420SEric Auger     }
3994487d420SEric Auger 
400418026caSFam Zheng     s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
401418026caSFam Zheng 
402418026caSFam Zheng     if (s->device < 0) {
403418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device fd");
404418026caSFam Zheng         ret = -errno;
405418026caSFam Zheng         goto fail;
406418026caSFam Zheng     }
407418026caSFam Zheng 
408418026caSFam Zheng     /* Test and setup the device */
409418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
410418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device info");
411418026caSFam Zheng         ret = -errno;
412418026caSFam Zheng         goto fail;
413418026caSFam Zheng     }
414418026caSFam Zheng 
415418026caSFam Zheng     if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
416418026caSFam Zheng         error_setg(errp, "Invalid device regions");
417418026caSFam Zheng         ret = -EINVAL;
418418026caSFam Zheng         goto fail;
419418026caSFam Zheng     }
420418026caSFam Zheng 
421418026caSFam Zheng     s->config_region_info = (struct vfio_region_info) {
422418026caSFam Zheng         .index = VFIO_PCI_CONFIG_REGION_INDEX,
423418026caSFam Zheng         .argsz = sizeof(struct vfio_region_info),
424418026caSFam Zheng     };
425418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
426418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get config region info");
427418026caSFam Zheng         ret = -errno;
428418026caSFam Zheng         goto fail;
429418026caSFam Zheng     }
430df058222SPhilippe Mathieu-Daudé     trace_qemu_vfio_region_info("config", s->config_region_info.offset,
431df058222SPhilippe Mathieu-Daudé                                 s->config_region_info.size,
432df058222SPhilippe Mathieu-Daudé                                 s->config_region_info.cap_offset);
433418026caSFam Zheng 
4349e722ebcSLi Qiang     for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
435418026caSFam Zheng         ret = qemu_vfio_pci_init_bar(s, i, errp);
436418026caSFam Zheng         if (ret) {
437418026caSFam Zheng             goto fail;
438418026caSFam Zheng         }
439418026caSFam Zheng     }
440418026caSFam Zheng 
441418026caSFam Zheng     /* Enable bus master */
442418026caSFam Zheng     ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
443418026caSFam Zheng     if (ret) {
444418026caSFam Zheng         goto fail;
445418026caSFam Zheng     }
446418026caSFam Zheng     pci_cmd |= PCI_COMMAND_MASTER;
447418026caSFam Zheng     ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
448418026caSFam Zheng     if (ret) {
449418026caSFam Zheng         goto fail;
450418026caSFam Zheng     }
4514487d420SEric Auger     g_free(iommu_info);
452418026caSFam Zheng     return 0;
453418026caSFam Zheng fail:
4544487d420SEric Auger     g_free(s->usable_iova_ranges);
4554487d420SEric Auger     s->usable_iova_ranges = NULL;
4564487d420SEric Auger     s->nb_iova_ranges = 0;
4574487d420SEric Auger     g_free(iommu_info);
458418026caSFam Zheng     close(s->group);
459418026caSFam Zheng fail_container:
460418026caSFam Zheng     close(s->container);
461418026caSFam Zheng     return ret;
462418026caSFam Zheng }
463418026caSFam Zheng 
/*
 * RAM block notifier callback: map the added block for DMA.
 *
 * The mapping covers @max_size bytes starting at @host; @size is not used.
 * A mapping failure is reported but not propagated.
 */
static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
                                      size_t size, size_t max_size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    Error *err = NULL;

    trace_qemu_vfio_ram_block_added(s, host, max_size);
    if (qemu_vfio_dma_map(s, host, max_size, false, NULL, &err) != 0) {
        error_reportf_err(err,
                          "qemu_vfio_dma_map(%p, %zu) failed: ",
                          host, max_size);
    }
}
479418026caSFam Zheng 
/*
 * RAM block notifier callback: drop the DMA mapping of the removed block.
 *
 * Nothing is done when @host is NULL; @size is not used.
 */
static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
                                        size_t size, size_t max_size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);

    if (!host) {
        return;
    }
    trace_qemu_vfio_ram_block_removed(s, host, max_size);
    qemu_vfio_dma_unmap(s, host);
}
489418026caSFam Zheng 
/*
 * Set up the parts of @s that do not depend on the device: the lock, the
 * initial IOVA water marks and the RAM block notifier. The notifier is
 * registered last, once everything else is in place.
 */
static void qemu_vfio_open_common(QEMUVFIOState *s)
{
    qemu_mutex_init(&s->lock);

    /* Initially the whole [IOVA_MIN, IOVA_MAX) window is free. */
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    s->low_water_mark = QEMU_VFIO_IOVA_MIN;

    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    ram_block_notifier_add(&s->ram_notifier);
}
499418026caSFam Zheng 
500418026caSFam Zheng /**
501418026caSFam Zheng  * Open a PCI device, e.g. "0000:00:01.0".
502418026caSFam Zheng  */
qemu_vfio_open_pci(const char * device,Error ** errp)503418026caSFam Zheng QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
504418026caSFam Zheng {
505418026caSFam Zheng     int r;
506418026caSFam Zheng     QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
507418026caSFam Zheng 
508b430b513SDavid Hildenbrand     /*
509b430b513SDavid Hildenbrand      * VFIO may pin all memory inside mappings, resulting it in pinning
510b430b513SDavid Hildenbrand      * all memory inside RAM blocks unconditionally.
511b430b513SDavid Hildenbrand      */
512b430b513SDavid Hildenbrand     r = ram_block_discard_disable(true);
513b430b513SDavid Hildenbrand     if (r) {
514b430b513SDavid Hildenbrand         error_setg_errno(errp, -r, "Cannot set discarding of RAM broken");
515b430b513SDavid Hildenbrand         g_free(s);
516b430b513SDavid Hildenbrand         return NULL;
517b430b513SDavid Hildenbrand     }
518b430b513SDavid Hildenbrand 
519418026caSFam Zheng     r = qemu_vfio_init_pci(s, device, errp);
520418026caSFam Zheng     if (r) {
521b430b513SDavid Hildenbrand         ram_block_discard_disable(false);
522418026caSFam Zheng         g_free(s);
523418026caSFam Zheng         return NULL;
524418026caSFam Zheng     }
525418026caSFam Zheng     qemu_vfio_open_common(s);
526418026caSFam Zheng     return s;
527418026caSFam Zheng }
528418026caSFam Zheng 
/* Emit one trace event per mapping record of @s, in array order. */
static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
    int idx;

    for (idx = 0; idx < s->nr_mappings; idx++) {
        const IOVAMapping *m = &s->mappings[idx];

        trace_qemu_vfio_dump_mapping(m->host, m->iova, m->size);
    }
}
537418026caSFam Zheng 
538418026caSFam Zheng /**
539418026caSFam Zheng  * Find the mapping entry that contains [host, host + size) and set @index to
540418026caSFam Zheng  * the position. If no entry contains it, @index is the position _after_ which
541418026caSFam Zheng  * to insert the new mapping. IOW, it is the index of the largest element that
542418026caSFam Zheng  * is smaller than @host, or -1 if no entry is.
543418026caSFam Zheng  */
qemu_vfio_find_mapping(QEMUVFIOState * s,void * host,int * index)544418026caSFam Zheng static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
545418026caSFam Zheng                                            int *index)
546418026caSFam Zheng {
547418026caSFam Zheng     IOVAMapping *p = s->mappings;
548418026caSFam Zheng     IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
549418026caSFam Zheng     IOVAMapping *mid;
550418026caSFam Zheng     trace_qemu_vfio_find_mapping(s, host);
551418026caSFam Zheng     if (!p) {
552418026caSFam Zheng         *index = -1;
553418026caSFam Zheng         return NULL;
554418026caSFam Zheng     }
555418026caSFam Zheng     while (true) {
556418026caSFam Zheng         mid = p + (q - p) / 2;
557418026caSFam Zheng         if (mid == p) {
558418026caSFam Zheng             break;
559418026caSFam Zheng         }
560418026caSFam Zheng         if (mid->host > host) {
561418026caSFam Zheng             q = mid;
562418026caSFam Zheng         } else if (mid->host < host) {
563418026caSFam Zheng             p = mid;
564418026caSFam Zheng         } else {
565418026caSFam Zheng             break;
566418026caSFam Zheng         }
567418026caSFam Zheng     }
568418026caSFam Zheng     if (mid->host > host) {
569418026caSFam Zheng         mid--;
570418026caSFam Zheng     } else if (mid < &s->mappings[s->nr_mappings - 1]
571418026caSFam Zheng                && (mid + 1)->host <= host) {
572418026caSFam Zheng         mid++;
573418026caSFam Zheng     }
574418026caSFam Zheng     *index = mid - &s->mappings[0];
575418026caSFam Zheng     if (mid >= &s->mappings[0] &&
576418026caSFam Zheng         mid->host <= host && mid->host + mid->size > host) {
577418026caSFam Zheng         assert(mid < &s->mappings[s->nr_mappings]);
578418026caSFam Zheng         return mid;
579418026caSFam Zheng     }
580418026caSFam Zheng     /* At this point *index + 1 is the right position to insert the new
581418026caSFam Zheng      * mapping.*/
582418026caSFam Zheng     return NULL;
583418026caSFam Zheng }
584418026caSFam Zheng 
585418026caSFam Zheng /**
586a6da793aSPhilippe Mathieu-Daudé  * Allocate IOVA and create a new mapping record and insert it in @s.
587418026caSFam Zheng  */
qemu_vfio_add_mapping(QEMUVFIOState * s,void * host,size_t size,int index,uint64_t iova)588418026caSFam Zheng static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
589418026caSFam Zheng                                           void *host, size_t size,
590418026caSFam Zheng                                           int index, uint64_t iova)
591418026caSFam Zheng {
592418026caSFam Zheng     int shift;
593418026caSFam Zheng     IOVAMapping m = {.host = host, .size = size, .iova = iova};
594418026caSFam Zheng     IOVAMapping *insert;
595418026caSFam Zheng 
5968e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
5978e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size()));
5988e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size()));
599418026caSFam Zheng     trace_qemu_vfio_new_mapping(s, host, size, index, iova);
600418026caSFam Zheng 
601418026caSFam Zheng     assert(index >= 0);
602418026caSFam Zheng     s->nr_mappings++;
603d29eb678SOlaf Hering     s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
604418026caSFam Zheng     insert = &s->mappings[index];
605418026caSFam Zheng     shift = s->nr_mappings - index - 1;
606418026caSFam Zheng     if (shift) {
607418026caSFam Zheng         memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
608418026caSFam Zheng     }
609418026caSFam Zheng     *insert = m;
610418026caSFam Zheng     return insert;
611418026caSFam Zheng }
612418026caSFam Zheng 
613418026caSFam Zheng /* Do the DMA mapping with VFIO. */
qemu_vfio_do_mapping(QEMUVFIOState * s,void * host,size_t size,uint64_t iova,Error ** errp)614418026caSFam Zheng static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
615f38b376dSPhilippe Mathieu-Daudé                                 uint64_t iova, Error **errp)
616418026caSFam Zheng {
617418026caSFam Zheng     struct vfio_iommu_type1_dma_map dma_map = {
618418026caSFam Zheng         .argsz = sizeof(dma_map),
619418026caSFam Zheng         .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
620418026caSFam Zheng         .iova = iova,
621418026caSFam Zheng         .vaddr = (uintptr_t)host,
622418026caSFam Zheng         .size = size,
623418026caSFam Zheng     };
6244c946b22SPhilippe Mathieu-Daudé     trace_qemu_vfio_do_mapping(s, host, iova, size);
625418026caSFam Zheng 
626418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
627f38b376dSPhilippe Mathieu-Daudé         error_setg_errno(errp, errno, "VFIO_MAP_DMA failed");
628418026caSFam Zheng         return -errno;
629418026caSFam Zheng     }
630418026caSFam Zheng     return 0;
631418026caSFam Zheng }
632418026caSFam Zheng 
633418026caSFam Zheng /**
634418026caSFam Zheng  * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
635418026caSFam Zheng  */
qemu_vfio_undo_mapping(QEMUVFIOState * s,IOVAMapping * mapping,Error ** errp)636418026caSFam Zheng static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
637418026caSFam Zheng                                    Error **errp)
638418026caSFam Zheng {
639418026caSFam Zheng     int index;
640418026caSFam Zheng     struct vfio_iommu_type1_dma_unmap unmap = {
641418026caSFam Zheng         .argsz = sizeof(unmap),
642418026caSFam Zheng         .flags = 0,
643418026caSFam Zheng         .iova = mapping->iova,
644418026caSFam Zheng         .size = mapping->size,
645418026caSFam Zheng     };
646418026caSFam Zheng 
647418026caSFam Zheng     index = mapping - s->mappings;
648418026caSFam Zheng     assert(mapping->size > 0);
6498e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size()));
650418026caSFam Zheng     assert(index >= 0 && index < s->nr_mappings);
651418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
652b09d51c9SMichal Privoznik         error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
653418026caSFam Zheng     }
654418026caSFam Zheng     memmove(mapping, &s->mappings[index + 1],
655418026caSFam Zheng             sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
656418026caSFam Zheng     s->nr_mappings--;
657d29eb678SOlaf Hering     s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
658418026caSFam Zheng }
659418026caSFam Zheng 
660418026caSFam Zheng /* Check if the mapping list is (ascending) ordered. */
qemu_vfio_verify_mappings(QEMUVFIOState * s)661418026caSFam Zheng static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
662418026caSFam Zheng {
663418026caSFam Zheng     int i;
664418026caSFam Zheng     if (QEMU_VFIO_DEBUG) {
665418026caSFam Zheng         for (i = 0; i < s->nr_mappings - 1; ++i) {
666418026caSFam Zheng             if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
667cb49dfceSPhilippe Mathieu-Daudé                 error_report("item %d not sorted!", i);
668418026caSFam Zheng                 qemu_vfio_dump_mappings(s);
669418026caSFam Zheng                 return false;
670418026caSFam Zheng             }
671418026caSFam Zheng             if (!(s->mappings[i].host + s->mappings[i].size <=
672418026caSFam Zheng                   s->mappings[i + 1].host)) {
673cb49dfceSPhilippe Mathieu-Daudé                 error_report("item %d overlap with next!", i);
674418026caSFam Zheng                 qemu_vfio_dump_mappings(s);
675418026caSFam Zheng                 return false;
676418026caSFam Zheng             }
677418026caSFam Zheng         }
678418026caSFam Zheng     }
679418026caSFam Zheng     return true;
680418026caSFam Zheng }
681418026caSFam Zheng 
qemu_vfio_find_fixed_iova(QEMUVFIOState * s,size_t size,uint64_t * iova,Error ** errp)682453095e9SPhilippe Mathieu-Daudé static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
683453095e9SPhilippe Mathieu-Daudé                                       uint64_t *iova, Error **errp)
6849ab57411SEric Auger {
6859ab57411SEric Auger     int i;
6869ab57411SEric Auger 
6879ab57411SEric Auger     for (i = 0; i < s->nb_iova_ranges; i++) {
6889ab57411SEric Auger         if (s->usable_iova_ranges[i].end < s->low_water_mark) {
6899ab57411SEric Auger             continue;
6909ab57411SEric Auger         }
6919ab57411SEric Auger         s->low_water_mark =
6929ab57411SEric Auger             MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
6939ab57411SEric Auger 
6949ab57411SEric Auger         if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
6959ab57411SEric Auger             s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
6969ab57411SEric Auger             *iova = s->low_water_mark;
6979ab57411SEric Auger             s->low_water_mark += size;
698453095e9SPhilippe Mathieu-Daudé             return true;
6999ab57411SEric Auger         }
7009ab57411SEric Auger     }
701453095e9SPhilippe Mathieu-Daudé     error_setg(errp, "fixed iova range not found");
702453095e9SPhilippe Mathieu-Daudé 
703453095e9SPhilippe Mathieu-Daudé     return false;
7049ab57411SEric Auger }
7059ab57411SEric Auger 
qemu_vfio_find_temp_iova(QEMUVFIOState * s,size_t size,uint64_t * iova,Error ** errp)706453095e9SPhilippe Mathieu-Daudé static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
707453095e9SPhilippe Mathieu-Daudé                                      uint64_t *iova, Error **errp)
7089ab57411SEric Auger {
7099ab57411SEric Auger     int i;
7109ab57411SEric Auger 
7119ab57411SEric Auger     for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
7129ab57411SEric Auger         if (s->usable_iova_ranges[i].start > s->high_water_mark) {
7139ab57411SEric Auger             continue;
7149ab57411SEric Auger         }
7159ab57411SEric Auger         s->high_water_mark =
7169ab57411SEric Auger             MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
7179ab57411SEric Auger 
7189ab57411SEric Auger         if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
7199ab57411SEric Auger             s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
7209ab57411SEric Auger             *iova = s->high_water_mark - size;
7219ab57411SEric Auger             s->high_water_mark = *iova;
722453095e9SPhilippe Mathieu-Daudé             return true;
7239ab57411SEric Auger         }
7249ab57411SEric Auger     }
725453095e9SPhilippe Mathieu-Daudé     error_setg(errp, "temporary iova range not found");
726453095e9SPhilippe Mathieu-Daudé 
727453095e9SPhilippe Mathieu-Daudé     return false;
7289ab57411SEric Auger }
7299ab57411SEric Auger 
73071e3038cSPhilippe Mathieu-Daudé /**
73171e3038cSPhilippe Mathieu-Daudé  * qemu_vfio_water_mark_reached:
73271e3038cSPhilippe Mathieu-Daudé  *
73371e3038cSPhilippe Mathieu-Daudé  * Returns %true if high watermark has been reached, %false otherwise.
73471e3038cSPhilippe Mathieu-Daudé  */
qemu_vfio_water_mark_reached(QEMUVFIOState * s,size_t size,Error ** errp)73571e3038cSPhilippe Mathieu-Daudé static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size,
73671e3038cSPhilippe Mathieu-Daudé                                          Error **errp)
73771e3038cSPhilippe Mathieu-Daudé {
73871e3038cSPhilippe Mathieu-Daudé     if (s->high_water_mark - s->low_water_mark + 1 < size) {
73971e3038cSPhilippe Mathieu-Daudé         error_setg(errp, "iova exhausted (water mark reached)");
74071e3038cSPhilippe Mathieu-Daudé         return true;
74171e3038cSPhilippe Mathieu-Daudé     }
74271e3038cSPhilippe Mathieu-Daudé     return false;
74371e3038cSPhilippe Mathieu-Daudé }
74471e3038cSPhilippe Mathieu-Daudé 
745418026caSFam Zheng /* Map [host, host + size) area into a contiguous IOVA address space, and store
746418026caSFam Zheng  * the result in @iova if not NULL. The caller need to make sure the area is
747418026caSFam Zheng  * aligned to page size, and mustn't overlap with existing mapping areas (split
748418026caSFam Zheng  * mapping status within this area is not allowed).
749418026caSFam Zheng  */
qemu_vfio_dma_map(QEMUVFIOState * s,void * host,size_t size,bool temporary,uint64_t * iova,Error ** errp)750418026caSFam Zheng int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
751521b97cdSPhilippe Mathieu-Daudé                       bool temporary, uint64_t *iova, Error **errp)
752418026caSFam Zheng {
753418026caSFam Zheng     int index;
754418026caSFam Zheng     IOVAMapping *mapping;
755418026caSFam Zheng     uint64_t iova0;
756418026caSFam Zheng 
7578e3b0cbbSMarc-André Lureau     assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size()));
7588e3b0cbbSMarc-André Lureau     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
759418026caSFam Zheng     trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
760a990858bSPhilippe Mathieu-Daudé     QEMU_LOCK_GUARD(&s->lock);
761418026caSFam Zheng     mapping = qemu_vfio_find_mapping(s, host, &index);
762418026caSFam Zheng     if (mapping) {
763418026caSFam Zheng         iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
764418026caSFam Zheng     } else {
7655a4f1626SPhilippe Mathieu-Daudé         int ret;
7665a4f1626SPhilippe Mathieu-Daudé 
76771e3038cSPhilippe Mathieu-Daudé         if (qemu_vfio_water_mark_reached(s, size, errp)) {
7685a4f1626SPhilippe Mathieu-Daudé             return -ENOMEM;
769418026caSFam Zheng         }
770418026caSFam Zheng         if (!temporary) {
771453095e9SPhilippe Mathieu-Daudé             if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) {
7725a4f1626SPhilippe Mathieu-Daudé                 return -ENOMEM;
7739ab57411SEric Auger             }
7749ab57411SEric Auger 
775418026caSFam Zheng             mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
776418026caSFam Zheng             assert(qemu_vfio_verify_mappings(s));
777f38b376dSPhilippe Mathieu-Daudé             ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
7785a4f1626SPhilippe Mathieu-Daudé             if (ret < 0) {
779418026caSFam Zheng                 qemu_vfio_undo_mapping(s, mapping, NULL);
7805a4f1626SPhilippe Mathieu-Daudé                 return ret;
781418026caSFam Zheng             }
782418026caSFam Zheng             qemu_vfio_dump_mappings(s);
783418026caSFam Zheng         } else {
784453095e9SPhilippe Mathieu-Daudé             if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) {
7855a4f1626SPhilippe Mathieu-Daudé                 return -ENOMEM;
7869ab57411SEric Auger             }
787f38b376dSPhilippe Mathieu-Daudé             ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
7885a4f1626SPhilippe Mathieu-Daudé             if (ret < 0) {
7895a4f1626SPhilippe Mathieu-Daudé                 return ret;
790418026caSFam Zheng             }
791418026caSFam Zheng         }
792418026caSFam Zheng     }
7934c946b22SPhilippe Mathieu-Daudé     trace_qemu_vfio_dma_mapped(s, host, iova0, size);
794418026caSFam Zheng     if (iova) {
795418026caSFam Zheng         *iova = iova0;
796418026caSFam Zheng     }
7975a4f1626SPhilippe Mathieu-Daudé     return 0;
798418026caSFam Zheng }
799418026caSFam Zheng 
800418026caSFam Zheng /* Reset the high watermark and free all "temporary" mappings. */
qemu_vfio_dma_reset_temporary(QEMUVFIOState * s)801418026caSFam Zheng int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
802418026caSFam Zheng {
803418026caSFam Zheng     struct vfio_iommu_type1_dma_unmap unmap = {
804418026caSFam Zheng         .argsz = sizeof(unmap),
805418026caSFam Zheng         .flags = 0,
806418026caSFam Zheng         .iova = s->high_water_mark,
807418026caSFam Zheng         .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
808418026caSFam Zheng     };
809418026caSFam Zheng     trace_qemu_vfio_dma_reset_temporary(s);
8106e8a355dSDaniel Brodsky     QEMU_LOCK_GUARD(&s->lock);
811418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
812b09d51c9SMichal Privoznik         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
813418026caSFam Zheng         return -errno;
814418026caSFam Zheng     }
815418026caSFam Zheng     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
816418026caSFam Zheng     return 0;
817418026caSFam Zheng }
818418026caSFam Zheng 
819418026caSFam Zheng /* Unmapping the whole area that was previously mapped with
820418026caSFam Zheng  * qemu_vfio_dma_map(). */
qemu_vfio_dma_unmap(QEMUVFIOState * s,void * host)821418026caSFam Zheng void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
822418026caSFam Zheng {
823418026caSFam Zheng     int index = 0;
824418026caSFam Zheng     IOVAMapping *m;
825418026caSFam Zheng 
826418026caSFam Zheng     if (!host) {
827418026caSFam Zheng         return;
828418026caSFam Zheng     }
829418026caSFam Zheng 
830418026caSFam Zheng     trace_qemu_vfio_dma_unmap(s, host);
831a990858bSPhilippe Mathieu-Daudé     QEMU_LOCK_GUARD(&s->lock);
832418026caSFam Zheng     m = qemu_vfio_find_mapping(s, host, &index);
833418026caSFam Zheng     if (!m) {
834a990858bSPhilippe Mathieu-Daudé         return;
835418026caSFam Zheng     }
836418026caSFam Zheng     qemu_vfio_undo_mapping(s, m, NULL);
837418026caSFam Zheng }
838418026caSFam Zheng 
/* Issue VFIO_DEVICE_RESET on the device fd of @s; the result is ignored. */
static void qemu_vfio_reset(QEMUVFIOState *s)
{
    int device_fd = s->device;

    (void)ioctl(device_fd, VFIO_DEVICE_RESET);
}
843418026caSFam Zheng 
844418026caSFam Zheng /* Close and free the VFIO resources. */
qemu_vfio_close(QEMUVFIOState * s)845418026caSFam Zheng void qemu_vfio_close(QEMUVFIOState *s)
846418026caSFam Zheng {
847418026caSFam Zheng     int i;
848418026caSFam Zheng 
849418026caSFam Zheng     if (!s) {
850418026caSFam Zheng         return;
851418026caSFam Zheng     }
8521f0fea38SStefan Hajnoczi 
8531f0fea38SStefan Hajnoczi     ram_block_notifier_remove(&s->ram_notifier);
8541f0fea38SStefan Hajnoczi 
855418026caSFam Zheng     for (i = 0; i < s->nr_mappings; ++i) {
856418026caSFam Zheng         qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
857418026caSFam Zheng     }
8581f0fea38SStefan Hajnoczi 
8594487d420SEric Auger     g_free(s->usable_iova_ranges);
8604487d420SEric Auger     s->nb_iova_ranges = 0;
861418026caSFam Zheng     qemu_vfio_reset(s);
862418026caSFam Zheng     close(s->device);
863418026caSFam Zheng     close(s->group);
864418026caSFam Zheng     close(s->container);
865b430b513SDavid Hildenbrand     ram_block_discard_disable(false);
866418026caSFam Zheng }
867