xref: /openbmc/qemu/util/vfio-helpers.c (revision 418026ca43bc2626db092d7558258f9594366f28)
1*418026caSFam Zheng /*
2*418026caSFam Zheng  * VFIO utility
3*418026caSFam Zheng  *
4*418026caSFam Zheng  * Copyright 2016 - 2018 Red Hat, Inc.
5*418026caSFam Zheng  *
6*418026caSFam Zheng  * Authors:
7*418026caSFam Zheng  *   Fam Zheng <famz@redhat.com>
8*418026caSFam Zheng  *
9*418026caSFam Zheng  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10*418026caSFam Zheng  * See the COPYING file in the top-level directory.
11*418026caSFam Zheng  */
12*418026caSFam Zheng 
13*418026caSFam Zheng #include "qemu/osdep.h"
14*418026caSFam Zheng #include <sys/ioctl.h>
15*418026caSFam Zheng #include <linux/vfio.h>
16*418026caSFam Zheng #include "qapi/error.h"
17*418026caSFam Zheng #include "exec/ramlist.h"
18*418026caSFam Zheng #include "exec/cpu-common.h"
19*418026caSFam Zheng #include "trace.h"
20*418026caSFam Zheng #include "qemu/queue.h"
21*418026caSFam Zheng #include "qemu/error-report.h"
22*418026caSFam Zheng #include "standard-headers/linux/pci_regs.h"
23*418026caSFam Zheng #include "qemu/event_notifier.h"
24*418026caSFam Zheng #include "qemu/vfio-helpers.h"
25*418026caSFam Zheng #include "trace.h"
26*418026caSFam Zheng 
27*418026caSFam Zheng #define QEMU_VFIO_DEBUG 0
28*418026caSFam Zheng 
29*418026caSFam Zheng #define QEMU_VFIO_IOVA_MIN 0x10000ULL
30*418026caSFam Zheng /* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
31*418026caSFam Zheng  * we can use a runtime limit; alternatively it's also possible to do platform
32*418026caSFam Zheng  * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
33*418026caSFam Zheng  **/
34*418026caSFam Zheng #define QEMU_VFIO_IOVA_MAX (1ULL << 39)
35*418026caSFam Zheng 
/* One contiguous HVA -> IOVA translation tracked in QEMUVFIOState.mappings. */
typedef struct {
    /* Page aligned addr. */
    void *host;       /* start of the host-virtual range */
    size_t size;      /* length in bytes (page aligned, see qemu_vfio_add_mapping asserts) */
    uint64_t iova;    /* IO virtual address the range is mapped at */
} IOVAMapping;
42*418026caSFam Zheng 
/* Per-device VFIO state: container/group/device fds plus the IOVA allocator. */
struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;    /* fd of /dev/vfio/vfio */
    int group;        /* fd of /dev/vfio/<group> */
    int device;       /* device fd from VFIO_GROUP_GET_DEVICE_FD */
    RAMBlockNotifier ram_notifier;
    struct vfio_region_info config_region_info, bar_region_info[6];

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/O's depending on these
     *   mappings are completed before calling.
     **/
    uint64_t low_water_mark;   /* top of the "Fixed" area, grows upward */
    uint64_t high_water_mark;  /* bottom of the "Temp" area */
    IOVAMapping *mappings;     /* fixed mappings, sorted ascending by host addr */
    int nr_mappings;           /* number of entries in @mappings */
};
94*418026caSFam Zheng 
95*418026caSFam Zheng /**
96*418026caSFam Zheng  * Find group file by PCI device address as specified @device, and return the
97*418026caSFam Zheng  * path. The returned string is owned by caller and should be g_free'ed later.
98*418026caSFam Zheng  */
99*418026caSFam Zheng static char *sysfs_find_group_file(const char *device, Error **errp)
100*418026caSFam Zheng {
101*418026caSFam Zheng     char *sysfs_link;
102*418026caSFam Zheng     char *sysfs_group;
103*418026caSFam Zheng     char *p;
104*418026caSFam Zheng     char *path = NULL;
105*418026caSFam Zheng 
106*418026caSFam Zheng     sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
107*418026caSFam Zheng     sysfs_group = g_malloc(PATH_MAX);
108*418026caSFam Zheng     if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
109*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
110*418026caSFam Zheng         goto out;
111*418026caSFam Zheng     }
112*418026caSFam Zheng     p = strrchr(sysfs_group, '/');
113*418026caSFam Zheng     if (!p) {
114*418026caSFam Zheng         error_setg(errp, "Failed to find iommu group number");
115*418026caSFam Zheng         goto out;
116*418026caSFam Zheng     }
117*418026caSFam Zheng 
118*418026caSFam Zheng     path = g_strdup_printf("/dev/vfio/%s", p + 1);
119*418026caSFam Zheng out:
120*418026caSFam Zheng     g_free(sysfs_link);
121*418026caSFam Zheng     g_free(sysfs_group);
122*418026caSFam Zheng     return path;
123*418026caSFam Zheng }
124*418026caSFam Zheng 
125*418026caSFam Zheng static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
126*418026caSFam Zheng {
127*418026caSFam Zheng     assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
128*418026caSFam Zheng }
129*418026caSFam Zheng 
130*418026caSFam Zheng static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
131*418026caSFam Zheng {
132*418026caSFam Zheng     assert_bar_index_valid(s, index);
133*418026caSFam Zheng     s->bar_region_info[index] = (struct vfio_region_info) {
134*418026caSFam Zheng         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
135*418026caSFam Zheng         .argsz = sizeof(struct vfio_region_info),
136*418026caSFam Zheng     };
137*418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
138*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get BAR region info");
139*418026caSFam Zheng         return -errno;
140*418026caSFam Zheng     }
141*418026caSFam Zheng 
142*418026caSFam Zheng     return 0;
143*418026caSFam Zheng }
144*418026caSFam Zheng 
145*418026caSFam Zheng /**
146*418026caSFam Zheng  * Map a PCI bar area.
147*418026caSFam Zheng  */
148*418026caSFam Zheng void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
149*418026caSFam Zheng                             uint64_t offset, uint64_t size,
150*418026caSFam Zheng                             Error **errp)
151*418026caSFam Zheng {
152*418026caSFam Zheng     void *p;
153*418026caSFam Zheng     assert_bar_index_valid(s, index);
154*418026caSFam Zheng     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
155*418026caSFam Zheng              PROT_READ | PROT_WRITE, MAP_SHARED,
156*418026caSFam Zheng              s->device, s->bar_region_info[index].offset + offset);
157*418026caSFam Zheng     if (p == MAP_FAILED) {
158*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to map BAR region");
159*418026caSFam Zheng         p = NULL;
160*418026caSFam Zheng     }
161*418026caSFam Zheng     return p;
162*418026caSFam Zheng }
163*418026caSFam Zheng 
164*418026caSFam Zheng /**
165*418026caSFam Zheng  * Unmap a PCI bar area.
166*418026caSFam Zheng  */
167*418026caSFam Zheng void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
168*418026caSFam Zheng                              uint64_t offset, uint64_t size)
169*418026caSFam Zheng {
170*418026caSFam Zheng     if (bar) {
171*418026caSFam Zheng         munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
172*418026caSFam Zheng     }
173*418026caSFam Zheng }
174*418026caSFam Zheng 
/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 *
 * @irq_type is used as the VFIO IRQ index queried via
 * VFIO_DEVICE_GET_IRQ_INFO; the eventfd of @e is installed as the trigger.
 * Returns 0 on success, a negative errno value on failure (with @errp set).
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    /* We can only deliver the interrupt through an eventfd. */
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    /* vfio_irq_set has a flexible data[] tail; reserve room for one fd. */
    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

    /* The eventfd to signal is carried in the data[] payload. */
    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}
217*418026caSFam Zheng 
218*418026caSFam Zheng static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
219*418026caSFam Zheng                                      int size, int ofs)
220*418026caSFam Zheng {
221*418026caSFam Zheng     int ret;
222*418026caSFam Zheng 
223*418026caSFam Zheng     do {
224*418026caSFam Zheng         ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
225*418026caSFam Zheng     } while (ret == -1 && errno == EINTR);
226*418026caSFam Zheng     return ret == size ? 0 : -errno;
227*418026caSFam Zheng }
228*418026caSFam Zheng 
229*418026caSFam Zheng static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
230*418026caSFam Zheng {
231*418026caSFam Zheng     int ret;
232*418026caSFam Zheng 
233*418026caSFam Zheng     do {
234*418026caSFam Zheng         ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
235*418026caSFam Zheng     } while (ret == -1 && errno == EINTR);
236*418026caSFam Zheng     return ret == size ? 0 : -errno;
237*418026caSFam Zheng }
238*418026caSFam Zheng 
239*418026caSFam Zheng static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
240*418026caSFam Zheng                               Error **errp)
241*418026caSFam Zheng {
242*418026caSFam Zheng     int ret;
243*418026caSFam Zheng     int i;
244*418026caSFam Zheng     uint16_t pci_cmd;
245*418026caSFam Zheng     struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
246*418026caSFam Zheng     struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
247*418026caSFam Zheng     struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
248*418026caSFam Zheng     char *group_file = NULL;
249*418026caSFam Zheng 
250*418026caSFam Zheng     /* Create a new container */
251*418026caSFam Zheng     s->container = open("/dev/vfio/vfio", O_RDWR);
252*418026caSFam Zheng 
253*418026caSFam Zheng     if (s->container == -1) {
254*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
255*418026caSFam Zheng         return -errno;
256*418026caSFam Zheng     }
257*418026caSFam Zheng     if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
258*418026caSFam Zheng         error_setg(errp, "Invalid VFIO version");
259*418026caSFam Zheng         ret = -EINVAL;
260*418026caSFam Zheng         goto fail_container;
261*418026caSFam Zheng     }
262*418026caSFam Zheng 
263*418026caSFam Zheng     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
264*418026caSFam Zheng         error_setg_errno(errp, errno, "VFIO IOMMU check failed");
265*418026caSFam Zheng         ret = -EINVAL;
266*418026caSFam Zheng         goto fail_container;
267*418026caSFam Zheng     }
268*418026caSFam Zheng 
269*418026caSFam Zheng     /* Open the group */
270*418026caSFam Zheng     group_file = sysfs_find_group_file(device, errp);
271*418026caSFam Zheng     if (!group_file) {
272*418026caSFam Zheng         ret = -EINVAL;
273*418026caSFam Zheng         goto fail_container;
274*418026caSFam Zheng     }
275*418026caSFam Zheng 
276*418026caSFam Zheng     s->group = open(group_file, O_RDWR);
277*418026caSFam Zheng     if (s->group == -1) {
278*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
279*418026caSFam Zheng                          group_file);
280*418026caSFam Zheng         g_free(group_file);
281*418026caSFam Zheng         ret = -errno;
282*418026caSFam Zheng         goto fail_container;
283*418026caSFam Zheng     }
284*418026caSFam Zheng     g_free(group_file);
285*418026caSFam Zheng 
286*418026caSFam Zheng     /* Test the group is viable and available */
287*418026caSFam Zheng     if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
288*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get VFIO group status");
289*418026caSFam Zheng         ret = -errno;
290*418026caSFam Zheng         goto fail;
291*418026caSFam Zheng     }
292*418026caSFam Zheng 
293*418026caSFam Zheng     if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
294*418026caSFam Zheng         error_setg(errp, "VFIO group is not viable");
295*418026caSFam Zheng         ret = -EINVAL;
296*418026caSFam Zheng         goto fail;
297*418026caSFam Zheng     }
298*418026caSFam Zheng 
299*418026caSFam Zheng     /* Add the group to the container */
300*418026caSFam Zheng     if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
301*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to add group to VFIO container");
302*418026caSFam Zheng         ret = -errno;
303*418026caSFam Zheng         goto fail;
304*418026caSFam Zheng     }
305*418026caSFam Zheng 
306*418026caSFam Zheng     /* Enable the IOMMU model we want */
307*418026caSFam Zheng     if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
308*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
309*418026caSFam Zheng         ret = -errno;
310*418026caSFam Zheng         goto fail;
311*418026caSFam Zheng     }
312*418026caSFam Zheng 
313*418026caSFam Zheng     /* Get additional IOMMU info */
314*418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
315*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get IOMMU info");
316*418026caSFam Zheng         ret = -errno;
317*418026caSFam Zheng         goto fail;
318*418026caSFam Zheng     }
319*418026caSFam Zheng 
320*418026caSFam Zheng     s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
321*418026caSFam Zheng 
322*418026caSFam Zheng     if (s->device < 0) {
323*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device fd");
324*418026caSFam Zheng         ret = -errno;
325*418026caSFam Zheng         goto fail;
326*418026caSFam Zheng     }
327*418026caSFam Zheng 
328*418026caSFam Zheng     /* Test and setup the device */
329*418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
330*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get device info");
331*418026caSFam Zheng         ret = -errno;
332*418026caSFam Zheng         goto fail;
333*418026caSFam Zheng     }
334*418026caSFam Zheng 
335*418026caSFam Zheng     if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
336*418026caSFam Zheng         error_setg(errp, "Invalid device regions");
337*418026caSFam Zheng         ret = -EINVAL;
338*418026caSFam Zheng         goto fail;
339*418026caSFam Zheng     }
340*418026caSFam Zheng 
341*418026caSFam Zheng     s->config_region_info = (struct vfio_region_info) {
342*418026caSFam Zheng         .index = VFIO_PCI_CONFIG_REGION_INDEX,
343*418026caSFam Zheng         .argsz = sizeof(struct vfio_region_info),
344*418026caSFam Zheng     };
345*418026caSFam Zheng     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
346*418026caSFam Zheng         error_setg_errno(errp, errno, "Failed to get config region info");
347*418026caSFam Zheng         ret = -errno;
348*418026caSFam Zheng         goto fail;
349*418026caSFam Zheng     }
350*418026caSFam Zheng 
351*418026caSFam Zheng     for (i = 0; i < 6; i++) {
352*418026caSFam Zheng         ret = qemu_vfio_pci_init_bar(s, i, errp);
353*418026caSFam Zheng         if (ret) {
354*418026caSFam Zheng             goto fail;
355*418026caSFam Zheng         }
356*418026caSFam Zheng     }
357*418026caSFam Zheng 
358*418026caSFam Zheng     /* Enable bus master */
359*418026caSFam Zheng     ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
360*418026caSFam Zheng     if (ret) {
361*418026caSFam Zheng         goto fail;
362*418026caSFam Zheng     }
363*418026caSFam Zheng     pci_cmd |= PCI_COMMAND_MASTER;
364*418026caSFam Zheng     ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
365*418026caSFam Zheng     if (ret) {
366*418026caSFam Zheng         goto fail;
367*418026caSFam Zheng     }
368*418026caSFam Zheng     return 0;
369*418026caSFam Zheng fail:
370*418026caSFam Zheng     close(s->group);
371*418026caSFam Zheng fail_container:
372*418026caSFam Zheng     close(s->container);
373*418026caSFam Zheng     return ret;
374*418026caSFam Zheng }
375*418026caSFam Zheng 
376*418026caSFam Zheng static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
377*418026caSFam Zheng                                       void *host, size_t size)
378*418026caSFam Zheng {
379*418026caSFam Zheng     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
380*418026caSFam Zheng     trace_qemu_vfio_ram_block_added(s, host, size);
381*418026caSFam Zheng     qemu_vfio_dma_map(s, host, size, false, NULL);
382*418026caSFam Zheng }
383*418026caSFam Zheng 
384*418026caSFam Zheng static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
385*418026caSFam Zheng                                         void *host, size_t size)
386*418026caSFam Zheng {
387*418026caSFam Zheng     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
388*418026caSFam Zheng     if (host) {
389*418026caSFam Zheng         trace_qemu_vfio_ram_block_removed(s, host, size);
390*418026caSFam Zheng         qemu_vfio_dma_unmap(s, host);
391*418026caSFam Zheng     }
392*418026caSFam Zheng }
393*418026caSFam Zheng 
394*418026caSFam Zheng static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr,
395*418026caSFam Zheng                                    ram_addr_t offset, ram_addr_t length,
396*418026caSFam Zheng                                    void *opaque)
397*418026caSFam Zheng {
398*418026caSFam Zheng     int ret;
399*418026caSFam Zheng     QEMUVFIOState *s = opaque;
400*418026caSFam Zheng 
401*418026caSFam Zheng     if (!host_addr) {
402*418026caSFam Zheng         return 0;
403*418026caSFam Zheng     }
404*418026caSFam Zheng     ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
405*418026caSFam Zheng     if (ret) {
406*418026caSFam Zheng         fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n",
407*418026caSFam Zheng                 host_addr, (uint64_t)length);
408*418026caSFam Zheng     }
409*418026caSFam Zheng     return 0;
410*418026caSFam Zheng }
411*418026caSFam Zheng 
412*418026caSFam Zheng static void qemu_vfio_open_common(QEMUVFIOState *s)
413*418026caSFam Zheng {
414*418026caSFam Zheng     s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
415*418026caSFam Zheng     s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
416*418026caSFam Zheng     ram_block_notifier_add(&s->ram_notifier);
417*418026caSFam Zheng     s->low_water_mark = QEMU_VFIO_IOVA_MIN;
418*418026caSFam Zheng     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
419*418026caSFam Zheng     qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
420*418026caSFam Zheng     qemu_mutex_init(&s->lock);
421*418026caSFam Zheng }
422*418026caSFam Zheng 
423*418026caSFam Zheng /**
424*418026caSFam Zheng  * Open a PCI device, e.g. "0000:00:01.0".
425*418026caSFam Zheng  */
426*418026caSFam Zheng QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
427*418026caSFam Zheng {
428*418026caSFam Zheng     int r;
429*418026caSFam Zheng     QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
430*418026caSFam Zheng 
431*418026caSFam Zheng     r = qemu_vfio_init_pci(s, device, errp);
432*418026caSFam Zheng     if (r) {
433*418026caSFam Zheng         g_free(s);
434*418026caSFam Zheng         return NULL;
435*418026caSFam Zheng     }
436*418026caSFam Zheng     qemu_vfio_open_common(s);
437*418026caSFam Zheng     return s;
438*418026caSFam Zheng }
439*418026caSFam Zheng 
440*418026caSFam Zheng static void qemu_vfio_dump_mapping(IOVAMapping *m)
441*418026caSFam Zheng {
442*418026caSFam Zheng     if (QEMU_VFIO_DEBUG) {
443*418026caSFam Zheng         printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
444*418026caSFam Zheng                (uint64_t)m->size, (uint64_t)m->iova);
445*418026caSFam Zheng     }
446*418026caSFam Zheng }
447*418026caSFam Zheng 
448*418026caSFam Zheng static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
449*418026caSFam Zheng {
450*418026caSFam Zheng     int i;
451*418026caSFam Zheng 
452*418026caSFam Zheng     if (QEMU_VFIO_DEBUG) {
453*418026caSFam Zheng         printf("vfio mappings\n");
454*418026caSFam Zheng         for (i = 0; i < s->nr_mappings; ++i) {
455*418026caSFam Zheng             qemu_vfio_dump_mapping(&s->mappings[i]);
456*418026caSFam Zheng         }
457*418026caSFam Zheng     }
458*418026caSFam Zheng }
459*418026caSFam Zheng 
/**
 * Find the mapping entry that contains [host, host + size) and set @index to
 * the position. If no entry contains it, @index is the position _after_ which
 * to insert the new mapping. IOW, it is the index of the largest element that
 * is smaller than @host, or -1 if no entry is.
 *
 * Caller must hold s->lock; the list in s->mappings is kept sorted by host
 * address, which the binary search below relies on.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;
    trace_qemu_vfio_find_mapping(s, host);
    /* Empty list: nothing found, insert position is "before the start". */
    if (!p) {
        *index = -1;
        return NULL;
    }
    /* Binary search over [p, q]; terminates when the range collapses or an
     * exact host-address match is hit. */
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
    /* Nudge mid to the largest entry whose host is <= @host. */
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    /* Hit only if @host actually falls inside mid's [host, host+size). */
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /* At this point *index + 1 is the right position to insert the new
     * mapping.*/
    return NULL;
}
506*418026caSFam Zheng 
507*418026caSFam Zheng /**
508*418026caSFam Zheng  * Allocate IOVA and and create a new mapping record and insert it in @s.
509*418026caSFam Zheng  */
510*418026caSFam Zheng static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
511*418026caSFam Zheng                                           void *host, size_t size,
512*418026caSFam Zheng                                           int index, uint64_t iova)
513*418026caSFam Zheng {
514*418026caSFam Zheng     int shift;
515*418026caSFam Zheng     IOVAMapping m = {.host = host, .size = size, .iova = iova};
516*418026caSFam Zheng     IOVAMapping *insert;
517*418026caSFam Zheng 
518*418026caSFam Zheng     assert(QEMU_IS_ALIGNED(size, getpagesize()));
519*418026caSFam Zheng     assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize()));
520*418026caSFam Zheng     assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize()));
521*418026caSFam Zheng     trace_qemu_vfio_new_mapping(s, host, size, index, iova);
522*418026caSFam Zheng 
523*418026caSFam Zheng     assert(index >= 0);
524*418026caSFam Zheng     s->nr_mappings++;
525*418026caSFam Zheng     s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]),
526*418026caSFam Zheng                               s->nr_mappings);
527*418026caSFam Zheng     insert = &s->mappings[index];
528*418026caSFam Zheng     shift = s->nr_mappings - index - 1;
529*418026caSFam Zheng     if (shift) {
530*418026caSFam Zheng         memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
531*418026caSFam Zheng     }
532*418026caSFam Zheng     *insert = m;
533*418026caSFam Zheng     return insert;
534*418026caSFam Zheng }
535*418026caSFam Zheng 
536*418026caSFam Zheng /* Do the DMA mapping with VFIO. */
537*418026caSFam Zheng static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
538*418026caSFam Zheng                                 uint64_t iova)
539*418026caSFam Zheng {
540*418026caSFam Zheng     struct vfio_iommu_type1_dma_map dma_map = {
541*418026caSFam Zheng         .argsz = sizeof(dma_map),
542*418026caSFam Zheng         .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
543*418026caSFam Zheng         .iova = iova,
544*418026caSFam Zheng         .vaddr = (uintptr_t)host,
545*418026caSFam Zheng         .size = size,
546*418026caSFam Zheng     };
547*418026caSFam Zheng     trace_qemu_vfio_do_mapping(s, host, size, iova);
548*418026caSFam Zheng 
549*418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
550*418026caSFam Zheng         error_report("VFIO_MAP_DMA: %d", -errno);
551*418026caSFam Zheng         return -errno;
552*418026caSFam Zheng     }
553*418026caSFam Zheng     return 0;
554*418026caSFam Zheng }
555*418026caSFam Zheng 
556*418026caSFam Zheng /**
557*418026caSFam Zheng  * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
558*418026caSFam Zheng  */
559*418026caSFam Zheng static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
560*418026caSFam Zheng                                    Error **errp)
561*418026caSFam Zheng {
562*418026caSFam Zheng     int index;
563*418026caSFam Zheng     struct vfio_iommu_type1_dma_unmap unmap = {
564*418026caSFam Zheng         .argsz = sizeof(unmap),
565*418026caSFam Zheng         .flags = 0,
566*418026caSFam Zheng         .iova = mapping->iova,
567*418026caSFam Zheng         .size = mapping->size,
568*418026caSFam Zheng     };
569*418026caSFam Zheng 
570*418026caSFam Zheng     index = mapping - s->mappings;
571*418026caSFam Zheng     assert(mapping->size > 0);
572*418026caSFam Zheng     assert(QEMU_IS_ALIGNED(mapping->size, getpagesize()));
573*418026caSFam Zheng     assert(index >= 0 && index < s->nr_mappings);
574*418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
575*418026caSFam Zheng         error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
576*418026caSFam Zheng     }
577*418026caSFam Zheng     memmove(mapping, &s->mappings[index + 1],
578*418026caSFam Zheng             sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
579*418026caSFam Zheng     s->nr_mappings--;
580*418026caSFam Zheng     s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]),
581*418026caSFam Zheng                               s->nr_mappings);
582*418026caSFam Zheng }
583*418026caSFam Zheng 
584*418026caSFam Zheng /* Check if the mapping list is (ascending) ordered. */
585*418026caSFam Zheng static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
586*418026caSFam Zheng {
587*418026caSFam Zheng     int i;
588*418026caSFam Zheng     if (QEMU_VFIO_DEBUG) {
589*418026caSFam Zheng         for (i = 0; i < s->nr_mappings - 1; ++i) {
590*418026caSFam Zheng             if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
591*418026caSFam Zheng                 fprintf(stderr, "item %d not sorted!\n", i);
592*418026caSFam Zheng                 qemu_vfio_dump_mappings(s);
593*418026caSFam Zheng                 return false;
594*418026caSFam Zheng             }
595*418026caSFam Zheng             if (!(s->mappings[i].host + s->mappings[i].size <=
596*418026caSFam Zheng                   s->mappings[i + 1].host)) {
597*418026caSFam Zheng                 fprintf(stderr, "item %d overlap with next!\n", i);
598*418026caSFam Zheng                 qemu_vfio_dump_mappings(s);
599*418026caSFam Zheng                 return false;
600*418026caSFam Zheng             }
601*418026caSFam Zheng         }
602*418026caSFam Zheng     }
603*418026caSFam Zheng     return true;
604*418026caSFam Zheng }
605*418026caSFam Zheng 
606*418026caSFam Zheng /* Map [host, host + size) area into a contiguous IOVA address space, and store
607*418026caSFam Zheng  * the result in @iova if not NULL. The caller need to make sure the area is
608*418026caSFam Zheng  * aligned to page size, and mustn't overlap with existing mapping areas (split
609*418026caSFam Zheng  * mapping status within this area is not allowed).
610*418026caSFam Zheng  */
611*418026caSFam Zheng int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
612*418026caSFam Zheng                       bool temporary, uint64_t *iova)
613*418026caSFam Zheng {
614*418026caSFam Zheng     int ret = 0;
615*418026caSFam Zheng     int index;
616*418026caSFam Zheng     IOVAMapping *mapping;
617*418026caSFam Zheng     uint64_t iova0;
618*418026caSFam Zheng 
619*418026caSFam Zheng     assert(QEMU_PTR_IS_ALIGNED(host, getpagesize()));
620*418026caSFam Zheng     assert(QEMU_IS_ALIGNED(size, getpagesize()));
621*418026caSFam Zheng     trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
622*418026caSFam Zheng     qemu_mutex_lock(&s->lock);
623*418026caSFam Zheng     mapping = qemu_vfio_find_mapping(s, host, &index);
624*418026caSFam Zheng     if (mapping) {
625*418026caSFam Zheng         iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
626*418026caSFam Zheng     } else {
627*418026caSFam Zheng         if (s->high_water_mark - s->low_water_mark + 1 < size) {
628*418026caSFam Zheng             ret = -ENOMEM;
629*418026caSFam Zheng             goto out;
630*418026caSFam Zheng         }
631*418026caSFam Zheng         if (!temporary) {
632*418026caSFam Zheng             iova0 = s->low_water_mark;
633*418026caSFam Zheng             mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
634*418026caSFam Zheng             if (!mapping) {
635*418026caSFam Zheng                 ret = -ENOMEM;
636*418026caSFam Zheng                 goto out;
637*418026caSFam Zheng             }
638*418026caSFam Zheng             assert(qemu_vfio_verify_mappings(s));
639*418026caSFam Zheng             ret = qemu_vfio_do_mapping(s, host, size, iova0);
640*418026caSFam Zheng             if (ret) {
641*418026caSFam Zheng                 qemu_vfio_undo_mapping(s, mapping, NULL);
642*418026caSFam Zheng                 goto out;
643*418026caSFam Zheng             }
644*418026caSFam Zheng             s->low_water_mark += size;
645*418026caSFam Zheng             qemu_vfio_dump_mappings(s);
646*418026caSFam Zheng         } else {
647*418026caSFam Zheng             iova0 = s->high_water_mark - size;
648*418026caSFam Zheng             ret = qemu_vfio_do_mapping(s, host, size, iova0);
649*418026caSFam Zheng             if (ret) {
650*418026caSFam Zheng                 goto out;
651*418026caSFam Zheng             }
652*418026caSFam Zheng             s->high_water_mark -= size;
653*418026caSFam Zheng         }
654*418026caSFam Zheng     }
655*418026caSFam Zheng     if (iova) {
656*418026caSFam Zheng         *iova = iova0;
657*418026caSFam Zheng     }
658*418026caSFam Zheng out:
659*418026caSFam Zheng     qemu_mutex_unlock(&s->lock);
660*418026caSFam Zheng     return ret;
661*418026caSFam Zheng }
662*418026caSFam Zheng 
663*418026caSFam Zheng /* Reset the high watermark and free all "temporary" mappings. */
664*418026caSFam Zheng int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
665*418026caSFam Zheng {
666*418026caSFam Zheng     struct vfio_iommu_type1_dma_unmap unmap = {
667*418026caSFam Zheng         .argsz = sizeof(unmap),
668*418026caSFam Zheng         .flags = 0,
669*418026caSFam Zheng         .iova = s->high_water_mark,
670*418026caSFam Zheng         .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
671*418026caSFam Zheng     };
672*418026caSFam Zheng     trace_qemu_vfio_dma_reset_temporary(s);
673*418026caSFam Zheng     qemu_mutex_lock(&s->lock);
674*418026caSFam Zheng     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
675*418026caSFam Zheng         error_report("VFIO_UNMAP_DMA: %d", -errno);
676*418026caSFam Zheng         qemu_mutex_unlock(&s->lock);
677*418026caSFam Zheng         return -errno;
678*418026caSFam Zheng     }
679*418026caSFam Zheng     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
680*418026caSFam Zheng     qemu_mutex_unlock(&s->lock);
681*418026caSFam Zheng     return 0;
682*418026caSFam Zheng }
683*418026caSFam Zheng 
684*418026caSFam Zheng /* Unmapping the whole area that was previously mapped with
685*418026caSFam Zheng  * qemu_vfio_dma_map(). */
686*418026caSFam Zheng void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
687*418026caSFam Zheng {
688*418026caSFam Zheng     int index = 0;
689*418026caSFam Zheng     IOVAMapping *m;
690*418026caSFam Zheng 
691*418026caSFam Zheng     if (!host) {
692*418026caSFam Zheng         return;
693*418026caSFam Zheng     }
694*418026caSFam Zheng 
695*418026caSFam Zheng     trace_qemu_vfio_dma_unmap(s, host);
696*418026caSFam Zheng     qemu_mutex_lock(&s->lock);
697*418026caSFam Zheng     m = qemu_vfio_find_mapping(s, host, &index);
698*418026caSFam Zheng     if (!m) {
699*418026caSFam Zheng         goto out;
700*418026caSFam Zheng     }
701*418026caSFam Zheng     qemu_vfio_undo_mapping(s, m, NULL);
702*418026caSFam Zheng out:
703*418026caSFam Zheng     qemu_mutex_unlock(&s->lock);
704*418026caSFam Zheng }
705*418026caSFam Zheng 
706*418026caSFam Zheng static void qemu_vfio_reset(QEMUVFIOState *s)
707*418026caSFam Zheng {
708*418026caSFam Zheng     ioctl(s->device, VFIO_DEVICE_RESET);
709*418026caSFam Zheng }
710*418026caSFam Zheng 
711*418026caSFam Zheng /* Close and free the VFIO resources. */
712*418026caSFam Zheng void qemu_vfio_close(QEMUVFIOState *s)
713*418026caSFam Zheng {
714*418026caSFam Zheng     int i;
715*418026caSFam Zheng 
716*418026caSFam Zheng     if (!s) {
717*418026caSFam Zheng         return;
718*418026caSFam Zheng     }
719*418026caSFam Zheng     for (i = 0; i < s->nr_mappings; ++i) {
720*418026caSFam Zheng         qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
721*418026caSFam Zheng     }
722*418026caSFam Zheng     ram_block_notifier_remove(&s->ram_notifier);
723*418026caSFam Zheng     qemu_vfio_reset(s);
724*418026caSFam Zheng     close(s->device);
725*418026caSFam Zheng     close(s->group);
726*418026caSFam Zheng     close(s->container);
727*418026caSFam Zheng }
728