xref: /openbmc/qemu/util/vfio-helpers.c (revision 6e8a355de6c4d32e9df336cdafb009cd78262836)
1418026caSFam Zheng /*
2418026caSFam Zheng  * VFIO utility
3418026caSFam Zheng  *
4418026caSFam Zheng  * Copyright 2016 - 2018 Red Hat, Inc.
5418026caSFam Zheng  *
6418026caSFam Zheng  * Authors:
7418026caSFam Zheng  *   Fam Zheng <famz@redhat.com>
8418026caSFam Zheng  *
9418026caSFam Zheng  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10418026caSFam Zheng  * See the COPYING file in the top-level directory.
11418026caSFam Zheng  */
12418026caSFam Zheng 
13418026caSFam Zheng #include "qemu/osdep.h"
14418026caSFam Zheng #include <sys/ioctl.h>
15418026caSFam Zheng #include <linux/vfio.h>
16418026caSFam Zheng #include "qapi/error.h"
17418026caSFam Zheng #include "exec/ramlist.h"
18418026caSFam Zheng #include "exec/cpu-common.h"
19418026caSFam Zheng #include "trace.h"
20418026caSFam Zheng #include "qemu/error-report.h"
21418026caSFam Zheng #include "standard-headers/linux/pci_regs.h"
22418026caSFam Zheng #include "qemu/event_notifier.h"
23418026caSFam Zheng #include "qemu/vfio-helpers.h"
24*6e8a355dSDaniel Brodsky #include "qemu/lockable.h"
25418026caSFam Zheng #include "trace.h"
26418026caSFam Zheng 
/* Set to 1 to enable the printf/fprintf based mapping dumps and checks. */
#define QEMU_VFIO_DEBUG 0

/* Lowest IOVA handed out; everything below is treated as invalid. */
#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/*
 * XXX: Once VFIO exposes the IOVA bit width through the IOMMU capability
 * interface, a runtime limit can be used instead; platform specific detection
 * via sysfs entries is another option. Until then, 39 bits is a safe bet.
 */
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
35418026caSFam Zheng 
/* Record of one host memory range and the IOVA it has been assigned. */
typedef struct {
    void *host;     /* Page aligned start address of the host range. */
    size_t size;    /* Length of the range. */
    uint64_t iova;  /* IO virtual address the range is mapped at. */
} IOVAMapping;
42418026caSFam Zheng 
43418026caSFam Zheng struct QEMUVFIOState {
44418026caSFam Zheng     QemuMutex lock;
45418026caSFam Zheng 
46418026caSFam Zheng     /* These fields are protected by BQL */
47418026caSFam Zheng     int container;
48418026caSFam Zheng     int group;
49418026caSFam Zheng     int device;
50418026caSFam Zheng     RAMBlockNotifier ram_notifier;
51418026caSFam Zheng     struct vfio_region_info config_region_info, bar_region_info[6];
52418026caSFam Zheng 
53418026caSFam Zheng     /* These fields are protected by @lock */
54418026caSFam Zheng     /* VFIO's IO virtual address space is managed by splitting into a few
55418026caSFam Zheng      * sections:
56418026caSFam Zheng      *
57418026caSFam Zheng      * ---------------       <= 0
58418026caSFam Zheng      * |xxxxxxxxxxxxx|
59418026caSFam Zheng      * |-------------|       <= QEMU_VFIO_IOVA_MIN
60418026caSFam Zheng      * |             |
61418026caSFam Zheng      * |    Fixed    |
62418026caSFam Zheng      * |             |
63418026caSFam Zheng      * |-------------|       <= low_water_mark
64418026caSFam Zheng      * |             |
65418026caSFam Zheng      * |    Free     |
66418026caSFam Zheng      * |             |
67418026caSFam Zheng      * |-------------|       <= high_water_mark
68418026caSFam Zheng      * |             |
69418026caSFam Zheng      * |    Temp     |
70418026caSFam Zheng      * |             |
71418026caSFam Zheng      * |-------------|       <= QEMU_VFIO_IOVA_MAX
72418026caSFam Zheng      * |xxxxxxxxxxxxx|
73418026caSFam Zheng      * |xxxxxxxxxxxxx|
74418026caSFam Zheng      * ---------------
75418026caSFam Zheng      *
76418026caSFam Zheng      * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
77418026caSFam Zheng      *
78418026caSFam Zheng      * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
79418026caSFam Zheng      *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
80418026caSFam Zheng      *   reclaimed - low_water_mark never shrinks;
81418026caSFam Zheng      *
82418026caSFam Zheng      * - IOVAs in range [low_water_mark, high_water_mark) are free;
83418026caSFam Zheng      *
84418026caSFam Zheng      * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
85418026caSFam Zheng      *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
86418026caSFam Zheng      *   is recycled. The caller should make sure I/O's depending on these
87418026caSFam Zheng      *   mappings are completed before calling.
88418026caSFam Zheng      **/
89418026caSFam Zheng     uint64_t low_water_mark;
90418026caSFam Zheng     uint64_t high_water_mark;
91418026caSFam Zheng     IOVAMapping *mappings;
92418026caSFam Zheng     int nr_mappings;
93418026caSFam Zheng };
94418026caSFam Zheng 
95418026caSFam Zheng /**
96418026caSFam Zheng  * Find group file by PCI device address as specified @device, and return the
97418026caSFam Zheng  * path. The returned string is owned by caller and should be g_free'ed later.
98418026caSFam Zheng  */
99418026caSFam Zheng static char *sysfs_find_group_file(const char *device, Error **errp)
100418026caSFam Zheng {
101418026caSFam Zheng     char *sysfs_link;
102418026caSFam Zheng     char *sysfs_group;
103418026caSFam Zheng     char *p;
104418026caSFam Zheng     char *path = NULL;
105418026caSFam Zheng 
106418026caSFam Zheng     sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
10778d8c99eSPaolo Bonzini     sysfs_group = g_malloc0(PATH_MAX);
108418026caSFam Zheng     if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
109418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
110418026caSFam Zheng         goto out;
111418026caSFam Zheng     }
112418026caSFam Zheng     p = strrchr(sysfs_group, '/');
113418026caSFam Zheng     if (!p) {
114418026caSFam Zheng         error_setg(errp, "Failed to find iommu group number");
115418026caSFam Zheng         goto out;
116418026caSFam Zheng     }
117418026caSFam Zheng 
118418026caSFam Zheng     path = g_strdup_printf("/dev/vfio/%s", p + 1);
119418026caSFam Zheng out:
120418026caSFam Zheng     g_free(sysfs_link);
121418026caSFam Zheng     g_free(sysfs_group);
122418026caSFam Zheng     return path;
123418026caSFam Zheng }
124418026caSFam Zheng 
125418026caSFam Zheng static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
126418026caSFam Zheng {
127418026caSFam Zheng     assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
128418026caSFam Zheng }
129418026caSFam Zheng 
130418026caSFam Zheng static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
131418026caSFam Zheng {
132418026caSFam Zheng     assert_bar_index_valid(s, index);
133418026caSFam Zheng     s->bar_region_info[index] = (struct vfio_region_info) {
134418026caSFam Zheng         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
135418026caSFam Zheng         .argsz = sizeof(struct vfio_region_info),
136418026caSFam Zheng     };
137418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
138418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get BAR region info");
139418026caSFam Zheng         return -errno;
140418026caSFam Zheng     }
141418026caSFam Zheng 
142418026caSFam Zheng     return 0;
143418026caSFam Zheng }
144418026caSFam Zheng 
145418026caSFam Zheng /**
146418026caSFam Zheng  * Map a PCI bar area.
147418026caSFam Zheng  */
148418026caSFam Zheng void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
149418026caSFam Zheng                             uint64_t offset, uint64_t size,
150418026caSFam Zheng                             Error **errp)
151418026caSFam Zheng {
152418026caSFam Zheng     void *p;
153418026caSFam Zheng     assert_bar_index_valid(s, index);
154418026caSFam Zheng     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
155418026caSFam Zheng              PROT_READ | PROT_WRITE, MAP_SHARED,
156418026caSFam Zheng              s->device, s->bar_region_info[index].offset + offset);
157418026caSFam Zheng     if (p == MAP_FAILED) {
158418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to map BAR region");
159418026caSFam Zheng         p = NULL;
160418026caSFam Zheng     }
161418026caSFam Zheng     return p;
162418026caSFam Zheng }
163418026caSFam Zheng 
164418026caSFam Zheng /**
165418026caSFam Zheng  * Unmap a PCI bar area.
166418026caSFam Zheng  */
167418026caSFam Zheng void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
168418026caSFam Zheng                              uint64_t offset, uint64_t size)
169418026caSFam Zheng {
170418026caSFam Zheng     if (bar) {
171418026caSFam Zheng         munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
172418026caSFam Zheng     }
173418026caSFam Zheng }
174418026caSFam Zheng 
175418026caSFam Zheng /**
176418026caSFam Zheng  * Initialize device IRQ with @irq_type and and register an event notifier.
177418026caSFam Zheng  */
178418026caSFam Zheng int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
179418026caSFam Zheng                            int irq_type, Error **errp)
180418026caSFam Zheng {
181418026caSFam Zheng     int r;
182418026caSFam Zheng     struct vfio_irq_set *irq_set;
183418026caSFam Zheng     size_t irq_set_size;
184418026caSFam Zheng     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
185418026caSFam Zheng 
186418026caSFam Zheng     irq_info.index = irq_type;
187418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
188418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device interrupt info");
189418026caSFam Zheng         return -errno;
190418026caSFam Zheng     }
191418026caSFam Zheng     if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
192418026caSFam Zheng         error_setg(errp, "Device interrupt doesn't support eventfd");
193418026caSFam Zheng         return -EINVAL;
194418026caSFam Zheng     }
195418026caSFam Zheng 
196418026caSFam Zheng     irq_set_size = sizeof(*irq_set) + sizeof(int);
197418026caSFam Zheng     irq_set = g_malloc0(irq_set_size);
198418026caSFam Zheng 
199418026caSFam Zheng     /* Get to a known IRQ state */
200418026caSFam Zheng     *irq_set = (struct vfio_irq_set) {
201418026caSFam Zheng         .argsz = irq_set_size,
202418026caSFam Zheng         .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
203418026caSFam Zheng         .index = irq_info.index,
204418026caSFam Zheng         .start = 0,
205418026caSFam Zheng         .count = 1,
206418026caSFam Zheng     };
207418026caSFam Zheng 
208418026caSFam Zheng     *(int *)&irq_set->data = event_notifier_get_fd(e);
209418026caSFam Zheng     r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
210418026caSFam Zheng     g_free(irq_set);
211418026caSFam Zheng     if (r) {
212418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to setup device interrupt");
213418026caSFam Zheng         return -errno;
214418026caSFam Zheng     }
215418026caSFam Zheng     return 0;
216418026caSFam Zheng }
217418026caSFam Zheng 
218418026caSFam Zheng static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
219418026caSFam Zheng                                      int size, int ofs)
220418026caSFam Zheng {
221418026caSFam Zheng     int ret;
222418026caSFam Zheng 
223418026caSFam Zheng     do {
224418026caSFam Zheng         ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
225418026caSFam Zheng     } while (ret == -1 && errno == EINTR);
226418026caSFam Zheng     return ret == size ? 0 : -errno;
227418026caSFam Zheng }
228418026caSFam Zheng 
229418026caSFam Zheng static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
230418026caSFam Zheng {
231418026caSFam Zheng     int ret;
232418026caSFam Zheng 
233418026caSFam Zheng     do {
234418026caSFam Zheng         ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
235418026caSFam Zheng     } while (ret == -1 && errno == EINTR);
236418026caSFam Zheng     return ret == size ? 0 : -errno;
237418026caSFam Zheng }
238418026caSFam Zheng 
239418026caSFam Zheng static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
240418026caSFam Zheng                               Error **errp)
241418026caSFam Zheng {
242418026caSFam Zheng     int ret;
243418026caSFam Zheng     int i;
244418026caSFam Zheng     uint16_t pci_cmd;
245418026caSFam Zheng     struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
246418026caSFam Zheng     struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
247418026caSFam Zheng     struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
248418026caSFam Zheng     char *group_file = NULL;
249418026caSFam Zheng 
250418026caSFam Zheng     /* Create a new container */
251418026caSFam Zheng     s->container = open("/dev/vfio/vfio", O_RDWR);
252418026caSFam Zheng 
253418026caSFam Zheng     if (s->container == -1) {
254418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
255418026caSFam Zheng         return -errno;
256418026caSFam Zheng     }
257418026caSFam Zheng     if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
258418026caSFam Zheng         error_setg(errp, "Invalid VFIO version");
259418026caSFam Zheng         ret = -EINVAL;
260418026caSFam Zheng         goto fail_container;
261418026caSFam Zheng     }
262418026caSFam Zheng 
263418026caSFam Zheng     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
264418026caSFam Zheng         error_setg_errno(errp, errno, "VFIO IOMMU check failed");
265418026caSFam Zheng         ret = -EINVAL;
266418026caSFam Zheng         goto fail_container;
267418026caSFam Zheng     }
268418026caSFam Zheng 
269418026caSFam Zheng     /* Open the group */
270418026caSFam Zheng     group_file = sysfs_find_group_file(device, errp);
271418026caSFam Zheng     if (!group_file) {
272418026caSFam Zheng         ret = -EINVAL;
273418026caSFam Zheng         goto fail_container;
274418026caSFam Zheng     }
275418026caSFam Zheng 
276418026caSFam Zheng     s->group = open(group_file, O_RDWR);
277418026caSFam Zheng     if (s->group == -1) {
278418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
279418026caSFam Zheng                          group_file);
280418026caSFam Zheng         g_free(group_file);
281418026caSFam Zheng         ret = -errno;
282418026caSFam Zheng         goto fail_container;
283418026caSFam Zheng     }
284418026caSFam Zheng     g_free(group_file);
285418026caSFam Zheng 
286418026caSFam Zheng     /* Test the group is viable and available */
287418026caSFam Zheng     if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
288418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get VFIO group status");
289418026caSFam Zheng         ret = -errno;
290418026caSFam Zheng         goto fail;
291418026caSFam Zheng     }
292418026caSFam Zheng 
293418026caSFam Zheng     if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
294418026caSFam Zheng         error_setg(errp, "VFIO group is not viable");
295418026caSFam Zheng         ret = -EINVAL;
296418026caSFam Zheng         goto fail;
297418026caSFam Zheng     }
298418026caSFam Zheng 
299418026caSFam Zheng     /* Add the group to the container */
300418026caSFam Zheng     if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
301418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to add group to VFIO container");
302418026caSFam Zheng         ret = -errno;
303418026caSFam Zheng         goto fail;
304418026caSFam Zheng     }
305418026caSFam Zheng 
306418026caSFam Zheng     /* Enable the IOMMU model we want */
307418026caSFam Zheng     if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
308418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
309418026caSFam Zheng         ret = -errno;
310418026caSFam Zheng         goto fail;
311418026caSFam Zheng     }
312418026caSFam Zheng 
313418026caSFam Zheng     /* Get additional IOMMU info */
314418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
315418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get IOMMU info");
316418026caSFam Zheng         ret = -errno;
317418026caSFam Zheng         goto fail;
318418026caSFam Zheng     }
319418026caSFam Zheng 
320418026caSFam Zheng     s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
321418026caSFam Zheng 
322418026caSFam Zheng     if (s->device < 0) {
323418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device fd");
324418026caSFam Zheng         ret = -errno;
325418026caSFam Zheng         goto fail;
326418026caSFam Zheng     }
327418026caSFam Zheng 
328418026caSFam Zheng     /* Test and setup the device */
329418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
330418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device info");
331418026caSFam Zheng         ret = -errno;
332418026caSFam Zheng         goto fail;
333418026caSFam Zheng     }
334418026caSFam Zheng 
335418026caSFam Zheng     if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
336418026caSFam Zheng         error_setg(errp, "Invalid device regions");
337418026caSFam Zheng         ret = -EINVAL;
338418026caSFam Zheng         goto fail;
339418026caSFam Zheng     }
340418026caSFam Zheng 
341418026caSFam Zheng     s->config_region_info = (struct vfio_region_info) {
342418026caSFam Zheng         .index = VFIO_PCI_CONFIG_REGION_INDEX,
343418026caSFam Zheng         .argsz = sizeof(struct vfio_region_info),
344418026caSFam Zheng     };
345418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
346418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get config region info");
347418026caSFam Zheng         ret = -errno;
348418026caSFam Zheng         goto fail;
349418026caSFam Zheng     }
350418026caSFam Zheng 
3519e722ebcSLi Qiang     for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
352418026caSFam Zheng         ret = qemu_vfio_pci_init_bar(s, i, errp);
353418026caSFam Zheng         if (ret) {
354418026caSFam Zheng             goto fail;
355418026caSFam Zheng         }
356418026caSFam Zheng     }
357418026caSFam Zheng 
358418026caSFam Zheng     /* Enable bus master */
359418026caSFam Zheng     ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
360418026caSFam Zheng     if (ret) {
361418026caSFam Zheng         goto fail;
362418026caSFam Zheng     }
363418026caSFam Zheng     pci_cmd |= PCI_COMMAND_MASTER;
364418026caSFam Zheng     ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
365418026caSFam Zheng     if (ret) {
366418026caSFam Zheng         goto fail;
367418026caSFam Zheng     }
368418026caSFam Zheng     return 0;
369418026caSFam Zheng fail:
370418026caSFam Zheng     close(s->group);
371418026caSFam Zheng fail_container:
372418026caSFam Zheng     close(s->container);
373418026caSFam Zheng     return ret;
374418026caSFam Zheng }
375418026caSFam Zheng 
376418026caSFam Zheng static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
377418026caSFam Zheng                                       void *host, size_t size)
378418026caSFam Zheng {
379418026caSFam Zheng     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
380418026caSFam Zheng     trace_qemu_vfio_ram_block_added(s, host, size);
381418026caSFam Zheng     qemu_vfio_dma_map(s, host, size, false, NULL);
382418026caSFam Zheng }
383418026caSFam Zheng 
384418026caSFam Zheng static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
385418026caSFam Zheng                                         void *host, size_t size)
386418026caSFam Zheng {
387418026caSFam Zheng     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
388418026caSFam Zheng     if (host) {
389418026caSFam Zheng         trace_qemu_vfio_ram_block_removed(s, host, size);
390418026caSFam Zheng         qemu_vfio_dma_unmap(s, host);
391418026caSFam Zheng     }
392418026caSFam Zheng }
393418026caSFam Zheng 
394754cb9c0SYury Kotov static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque)
395418026caSFam Zheng {
396754cb9c0SYury Kotov     void *host_addr = qemu_ram_get_host_addr(rb);
397754cb9c0SYury Kotov     ram_addr_t length = qemu_ram_get_used_length(rb);
398418026caSFam Zheng     int ret;
399418026caSFam Zheng     QEMUVFIOState *s = opaque;
400418026caSFam Zheng 
401418026caSFam Zheng     if (!host_addr) {
402418026caSFam Zheng         return 0;
403418026caSFam Zheng     }
404418026caSFam Zheng     ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
405418026caSFam Zheng     if (ret) {
406418026caSFam Zheng         fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n",
407418026caSFam Zheng                 host_addr, (uint64_t)length);
408418026caSFam Zheng     }
409418026caSFam Zheng     return 0;
410418026caSFam Zheng }
411418026caSFam Zheng 
412418026caSFam Zheng static void qemu_vfio_open_common(QEMUVFIOState *s)
413418026caSFam Zheng {
414549b50a3SMarkus Armbruster     qemu_mutex_init(&s->lock);
415418026caSFam Zheng     s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
416418026caSFam Zheng     s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
417418026caSFam Zheng     ram_block_notifier_add(&s->ram_notifier);
418418026caSFam Zheng     s->low_water_mark = QEMU_VFIO_IOVA_MIN;
419418026caSFam Zheng     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
420418026caSFam Zheng     qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
421418026caSFam Zheng }
422418026caSFam Zheng 
423418026caSFam Zheng /**
424418026caSFam Zheng  * Open a PCI device, e.g. "0000:00:01.0".
425418026caSFam Zheng  */
426418026caSFam Zheng QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
427418026caSFam Zheng {
428418026caSFam Zheng     int r;
429418026caSFam Zheng     QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
430418026caSFam Zheng 
431418026caSFam Zheng     r = qemu_vfio_init_pci(s, device, errp);
432418026caSFam Zheng     if (r) {
433418026caSFam Zheng         g_free(s);
434418026caSFam Zheng         return NULL;
435418026caSFam Zheng     }
436418026caSFam Zheng     qemu_vfio_open_common(s);
437418026caSFam Zheng     return s;
438418026caSFam Zheng }
439418026caSFam Zheng 
440418026caSFam Zheng static void qemu_vfio_dump_mapping(IOVAMapping *m)
441418026caSFam Zheng {
442418026caSFam Zheng     if (QEMU_VFIO_DEBUG) {
443418026caSFam Zheng         printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
444418026caSFam Zheng                (uint64_t)m->size, (uint64_t)m->iova);
445418026caSFam Zheng     }
446418026caSFam Zheng }
447418026caSFam Zheng 
448418026caSFam Zheng static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
449418026caSFam Zheng {
450418026caSFam Zheng     int i;
451418026caSFam Zheng 
452418026caSFam Zheng     if (QEMU_VFIO_DEBUG) {
453418026caSFam Zheng         printf("vfio mappings\n");
454418026caSFam Zheng         for (i = 0; i < s->nr_mappings; ++i) {
455418026caSFam Zheng             qemu_vfio_dump_mapping(&s->mappings[i]);
456418026caSFam Zheng         }
457418026caSFam Zheng     }
458418026caSFam Zheng }
459418026caSFam Zheng 
460418026caSFam Zheng /**
461418026caSFam Zheng  * Find the mapping entry that contains [host, host + size) and set @index to
462418026caSFam Zheng  * the position. If no entry contains it, @index is the position _after_ which
463418026caSFam Zheng  * to insert the new mapping. IOW, it is the index of the largest element that
464418026caSFam Zheng  * is smaller than @host, or -1 if no entry is.
465418026caSFam Zheng  */
466418026caSFam Zheng static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
467418026caSFam Zheng                                            int *index)
468418026caSFam Zheng {
469418026caSFam Zheng     IOVAMapping *p = s->mappings;
470418026caSFam Zheng     IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
471418026caSFam Zheng     IOVAMapping *mid;
472418026caSFam Zheng     trace_qemu_vfio_find_mapping(s, host);
473418026caSFam Zheng     if (!p) {
474418026caSFam Zheng         *index = -1;
475418026caSFam Zheng         return NULL;
476418026caSFam Zheng     }
477418026caSFam Zheng     while (true) {
478418026caSFam Zheng         mid = p + (q - p) / 2;
479418026caSFam Zheng         if (mid == p) {
480418026caSFam Zheng             break;
481418026caSFam Zheng         }
482418026caSFam Zheng         if (mid->host > host) {
483418026caSFam Zheng             q = mid;
484418026caSFam Zheng         } else if (mid->host < host) {
485418026caSFam Zheng             p = mid;
486418026caSFam Zheng         } else {
487418026caSFam Zheng             break;
488418026caSFam Zheng         }
489418026caSFam Zheng     }
490418026caSFam Zheng     if (mid->host > host) {
491418026caSFam Zheng         mid--;
492418026caSFam Zheng     } else if (mid < &s->mappings[s->nr_mappings - 1]
493418026caSFam Zheng                && (mid + 1)->host <= host) {
494418026caSFam Zheng         mid++;
495418026caSFam Zheng     }
496418026caSFam Zheng     *index = mid - &s->mappings[0];
497418026caSFam Zheng     if (mid >= &s->mappings[0] &&
498418026caSFam Zheng         mid->host <= host && mid->host + mid->size > host) {
499418026caSFam Zheng         assert(mid < &s->mappings[s->nr_mappings]);
500418026caSFam Zheng         return mid;
501418026caSFam Zheng     }
502418026caSFam Zheng     /* At this point *index + 1 is the right position to insert the new
503418026caSFam Zheng      * mapping.*/
504418026caSFam Zheng     return NULL;
505418026caSFam Zheng }
506418026caSFam Zheng 
507418026caSFam Zheng /**
508418026caSFam Zheng  * Allocate IOVA and and create a new mapping record and insert it in @s.
509418026caSFam Zheng  */
510418026caSFam Zheng static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
511418026caSFam Zheng                                           void *host, size_t size,
512418026caSFam Zheng                                           int index, uint64_t iova)
513418026caSFam Zheng {
514418026caSFam Zheng     int shift;
515418026caSFam Zheng     IOVAMapping m = {.host = host, .size = size, .iova = iova};
516418026caSFam Zheng     IOVAMapping *insert;
517418026caSFam Zheng 
518038adc2fSWei Yang     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
519038adc2fSWei Yang     assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
520038adc2fSWei Yang     assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
521418026caSFam Zheng     trace_qemu_vfio_new_mapping(s, host, size, index, iova);
522418026caSFam Zheng 
523418026caSFam Zheng     assert(index >= 0);
524418026caSFam Zheng     s->nr_mappings++;
525d29eb678SOlaf Hering     s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
526418026caSFam Zheng     insert = &s->mappings[index];
527418026caSFam Zheng     shift = s->nr_mappings - index - 1;
528418026caSFam Zheng     if (shift) {
529418026caSFam Zheng         memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
530418026caSFam Zheng     }
531418026caSFam Zheng     *insert = m;
532418026caSFam Zheng     return insert;
533418026caSFam Zheng }
534418026caSFam Zheng 
535418026caSFam Zheng /* Do the DMA mapping with VFIO. */
536418026caSFam Zheng static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
537418026caSFam Zheng                                 uint64_t iova)
538418026caSFam Zheng {
539418026caSFam Zheng     struct vfio_iommu_type1_dma_map dma_map = {
540418026caSFam Zheng         .argsz = sizeof(dma_map),
541418026caSFam Zheng         .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
542418026caSFam Zheng         .iova = iova,
543418026caSFam Zheng         .vaddr = (uintptr_t)host,
544418026caSFam Zheng         .size = size,
545418026caSFam Zheng     };
546418026caSFam Zheng     trace_qemu_vfio_do_mapping(s, host, size, iova);
547418026caSFam Zheng 
548418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
549b09d51c9SMichal Privoznik         error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
550418026caSFam Zheng         return -errno;
551418026caSFam Zheng     }
552418026caSFam Zheng     return 0;
553418026caSFam Zheng }
554418026caSFam Zheng 
555418026caSFam Zheng /**
556418026caSFam Zheng  * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
557418026caSFam Zheng  */
558418026caSFam Zheng static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
559418026caSFam Zheng                                    Error **errp)
560418026caSFam Zheng {
561418026caSFam Zheng     int index;
562418026caSFam Zheng     struct vfio_iommu_type1_dma_unmap unmap = {
563418026caSFam Zheng         .argsz = sizeof(unmap),
564418026caSFam Zheng         .flags = 0,
565418026caSFam Zheng         .iova = mapping->iova,
566418026caSFam Zheng         .size = mapping->size,
567418026caSFam Zheng     };
568418026caSFam Zheng 
569418026caSFam Zheng     index = mapping - s->mappings;
570418026caSFam Zheng     assert(mapping->size > 0);
571038adc2fSWei Yang     assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
572418026caSFam Zheng     assert(index >= 0 && index < s->nr_mappings);
573418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
574b09d51c9SMichal Privoznik         error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
575418026caSFam Zheng     }
576418026caSFam Zheng     memmove(mapping, &s->mappings[index + 1],
577418026caSFam Zheng             sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
578418026caSFam Zheng     s->nr_mappings--;
579d29eb678SOlaf Hering     s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
580418026caSFam Zheng }
581418026caSFam Zheng 
582418026caSFam Zheng /* Check if the mapping list is (ascending) ordered. */
583418026caSFam Zheng static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
584418026caSFam Zheng {
585418026caSFam Zheng     int i;
586418026caSFam Zheng     if (QEMU_VFIO_DEBUG) {
587418026caSFam Zheng         for (i = 0; i < s->nr_mappings - 1; ++i) {
588418026caSFam Zheng             if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
589418026caSFam Zheng                 fprintf(stderr, "item %d not sorted!\n", i);
590418026caSFam Zheng                 qemu_vfio_dump_mappings(s);
591418026caSFam Zheng                 return false;
592418026caSFam Zheng             }
593418026caSFam Zheng             if (!(s->mappings[i].host + s->mappings[i].size <=
594418026caSFam Zheng                   s->mappings[i + 1].host)) {
595418026caSFam Zheng                 fprintf(stderr, "item %d overlap with next!\n", i);
596418026caSFam Zheng                 qemu_vfio_dump_mappings(s);
597418026caSFam Zheng                 return false;
598418026caSFam Zheng             }
599418026caSFam Zheng         }
600418026caSFam Zheng     }
601418026caSFam Zheng     return true;
602418026caSFam Zheng }
603418026caSFam Zheng 
/* Map [host, host + size) area into a contiguous IOVA address space, and store
 * the result in @iova if not NULL. The caller needs to make sure the area is
 * aligned to page size, and mustn't overlap with existing mapping areas (split
 * mapping status within this area is not allowed).
 */
609418026caSFam Zheng int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
610418026caSFam Zheng                       bool temporary, uint64_t *iova)
611418026caSFam Zheng {
612418026caSFam Zheng     int ret = 0;
613418026caSFam Zheng     int index;
614418026caSFam Zheng     IOVAMapping *mapping;
615418026caSFam Zheng     uint64_t iova0;
616418026caSFam Zheng 
617038adc2fSWei Yang     assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
618038adc2fSWei Yang     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
619418026caSFam Zheng     trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
620418026caSFam Zheng     qemu_mutex_lock(&s->lock);
621418026caSFam Zheng     mapping = qemu_vfio_find_mapping(s, host, &index);
622418026caSFam Zheng     if (mapping) {
623418026caSFam Zheng         iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
624418026caSFam Zheng     } else {
625418026caSFam Zheng         if (s->high_water_mark - s->low_water_mark + 1 < size) {
626418026caSFam Zheng             ret = -ENOMEM;
627418026caSFam Zheng             goto out;
628418026caSFam Zheng         }
629418026caSFam Zheng         if (!temporary) {
630418026caSFam Zheng             iova0 = s->low_water_mark;
631418026caSFam Zheng             mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
632418026caSFam Zheng             if (!mapping) {
633418026caSFam Zheng                 ret = -ENOMEM;
634418026caSFam Zheng                 goto out;
635418026caSFam Zheng             }
636418026caSFam Zheng             assert(qemu_vfio_verify_mappings(s));
637418026caSFam Zheng             ret = qemu_vfio_do_mapping(s, host, size, iova0);
638418026caSFam Zheng             if (ret) {
639418026caSFam Zheng                 qemu_vfio_undo_mapping(s, mapping, NULL);
640418026caSFam Zheng                 goto out;
641418026caSFam Zheng             }
642418026caSFam Zheng             s->low_water_mark += size;
643418026caSFam Zheng             qemu_vfio_dump_mappings(s);
644418026caSFam Zheng         } else {
645418026caSFam Zheng             iova0 = s->high_water_mark - size;
646418026caSFam Zheng             ret = qemu_vfio_do_mapping(s, host, size, iova0);
647418026caSFam Zheng             if (ret) {
648418026caSFam Zheng                 goto out;
649418026caSFam Zheng             }
650418026caSFam Zheng             s->high_water_mark -= size;
651418026caSFam Zheng         }
652418026caSFam Zheng     }
653418026caSFam Zheng     if (iova) {
654418026caSFam Zheng         *iova = iova0;
655418026caSFam Zheng     }
656418026caSFam Zheng out:
657418026caSFam Zheng     qemu_mutex_unlock(&s->lock);
658418026caSFam Zheng     return ret;
659418026caSFam Zheng }
660418026caSFam Zheng 
661418026caSFam Zheng /* Reset the high watermark and free all "temporary" mappings. */
662418026caSFam Zheng int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
663418026caSFam Zheng {
664418026caSFam Zheng     struct vfio_iommu_type1_dma_unmap unmap = {
665418026caSFam Zheng         .argsz = sizeof(unmap),
666418026caSFam Zheng         .flags = 0,
667418026caSFam Zheng         .iova = s->high_water_mark,
668418026caSFam Zheng         .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
669418026caSFam Zheng     };
670418026caSFam Zheng     trace_qemu_vfio_dma_reset_temporary(s);
671*6e8a355dSDaniel Brodsky     QEMU_LOCK_GUARD(&s->lock);
672418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
673b09d51c9SMichal Privoznik         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
674418026caSFam Zheng         return -errno;
675418026caSFam Zheng     }
676418026caSFam Zheng     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
677418026caSFam Zheng     return 0;
678418026caSFam Zheng }
679418026caSFam Zheng 
680418026caSFam Zheng /* Unmapping the whole area that was previously mapped with
681418026caSFam Zheng  * qemu_vfio_dma_map(). */
682418026caSFam Zheng void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
683418026caSFam Zheng {
684418026caSFam Zheng     int index = 0;
685418026caSFam Zheng     IOVAMapping *m;
686418026caSFam Zheng 
687418026caSFam Zheng     if (!host) {
688418026caSFam Zheng         return;
689418026caSFam Zheng     }
690418026caSFam Zheng 
691418026caSFam Zheng     trace_qemu_vfio_dma_unmap(s, host);
692418026caSFam Zheng     qemu_mutex_lock(&s->lock);
693418026caSFam Zheng     m = qemu_vfio_find_mapping(s, host, &index);
694418026caSFam Zheng     if (!m) {
695418026caSFam Zheng         goto out;
696418026caSFam Zheng     }
697418026caSFam Zheng     qemu_vfio_undo_mapping(s, m, NULL);
698418026caSFam Zheng out:
699418026caSFam Zheng     qemu_mutex_unlock(&s->lock);
700418026caSFam Zheng }
701418026caSFam Zheng 
702418026caSFam Zheng static void qemu_vfio_reset(QEMUVFIOState *s)
703418026caSFam Zheng {
704418026caSFam Zheng     ioctl(s->device, VFIO_DEVICE_RESET);
705418026caSFam Zheng }
706418026caSFam Zheng 
707418026caSFam Zheng /* Close and free the VFIO resources. */
708418026caSFam Zheng void qemu_vfio_close(QEMUVFIOState *s)
709418026caSFam Zheng {
710418026caSFam Zheng     int i;
711418026caSFam Zheng 
712418026caSFam Zheng     if (!s) {
713418026caSFam Zheng         return;
714418026caSFam Zheng     }
715418026caSFam Zheng     for (i = 0; i < s->nr_mappings; ++i) {
716418026caSFam Zheng         qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
717418026caSFam Zheng     }
718418026caSFam Zheng     ram_block_notifier_remove(&s->ram_notifier);
719418026caSFam Zheng     qemu_vfio_reset(s);
720418026caSFam Zheng     close(s->device);
721418026caSFam Zheng     close(s->group);
722418026caSFam Zheng     close(s->container);
723418026caSFam Zheng }
724