xref: /openbmc/qemu/util/vfio-helpers.c (revision 55567891)
1 /*
2  * VFIO utility
3  *
4  * Copyright 2016 - 2018 Red Hat, Inc.
5  *
6  * Authors:
7  *   Fam Zheng <famz@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include <sys/ioctl.h>
15 #include <linux/vfio.h>
16 #include "qapi/error.h"
17 #include "exec/ramlist.h"
18 #include "exec/cpu-common.h"
19 #include "trace.h"
20 #include "qemu/error-report.h"
21 #include "standard-headers/linux/pci_regs.h"
22 #include "qemu/event_notifier.h"
23 #include "qemu/vfio-helpers.h"
24 #include "qemu/lockable.h"
25 #include "trace.h"
26 
/* Compile-time switch: when non-zero, the mapping dump helpers print and the
 * ordering check in this file actually inspects the mapping list. */
#define QEMU_VFIO_DEBUG 0

/* Lowest IOVA handed out by this file; also the initial low water mark. */
#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 **/
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
35 
/* One host-address-to-IOVA mapping record, stored in QEMUVFIOState.mappings. */
typedef struct {
    /* Page aligned addr. */
    void *host;
    /* Length of the mapped area (asserted to be host page aligned). */
    size_t size;
    /* I/O virtual address that @host is mapped at. */
    uint64_t iova;
} IOVAMapping;
42 
/* An IOVA range usable for DMA mappings; @start and @end are both inclusive
 * (the legacy default range ends at QEMU_VFIO_IOVA_MAX - 1). */
struct IOVARange {
    uint64_t start;
    uint64_t end;
};
47 
/* Per-device VFIO state: the open file descriptors, cached region info, the
 * usable IOVA ranges and the bookkeeping of host-to-IOVA mappings. */
struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    /* File descriptors of the VFIO container, group and device. */
    int container;
    int group;
    int device;
    /* Notifier used to map/unmap RAM blocks as they are added/removed. */
    RAMBlockNotifier ram_notifier;
    /* Region info of the PCI config space and of each of the six BARs. */
    struct vfio_region_info config_region_info, bar_region_info[6];
    /* Array of @nb_iova_ranges usable IOVA ranges. */
    struct IOVARange *usable_iova_ranges;
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/O's depending on these
     *   mappings are completed before calling.
     **/
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    /* Array of @nr_mappings fixed mapping records, kept sorted by ascending
     * host address. */
    IOVAMapping *mappings;
    int nr_mappings;
};
101 
102 /**
103  * Find group file by PCI device address as specified @device, and return the
104  * path. The returned string is owned by caller and should be g_free'ed later.
105  */
106 static char *sysfs_find_group_file(const char *device, Error **errp)
107 {
108     char *sysfs_link;
109     char *sysfs_group;
110     char *p;
111     char *path = NULL;
112 
113     sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
114     sysfs_group = g_malloc0(PATH_MAX);
115     if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
116         error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
117         goto out;
118     }
119     p = strrchr(sysfs_group, '/');
120     if (!p) {
121         error_setg(errp, "Failed to find iommu group number");
122         goto out;
123     }
124 
125     path = g_strdup_printf("/dev/vfio/%s", p + 1);
126 out:
127     g_free(sysfs_link);
128     g_free(sysfs_group);
129     return path;
130 }
131 
132 static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
133 {
134     assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
135 }
136 
137 static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
138 {
139     assert_bar_index_valid(s, index);
140     s->bar_region_info[index] = (struct vfio_region_info) {
141         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
142         .argsz = sizeof(struct vfio_region_info),
143     };
144     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
145         error_setg_errno(errp, errno, "Failed to get BAR region info");
146         return -errno;
147     }
148 
149     return 0;
150 }
151 
152 /**
153  * Map a PCI bar area.
154  */
155 void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
156                             uint64_t offset, uint64_t size, int prot,
157                             Error **errp)
158 {
159     void *p;
160     assert_bar_index_valid(s, index);
161     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
162              prot, MAP_SHARED,
163              s->device, s->bar_region_info[index].offset + offset);
164     if (p == MAP_FAILED) {
165         error_setg_errno(errp, errno, "Failed to map BAR region");
166         p = NULL;
167     }
168     return p;
169 }
170 
171 /**
172  * Unmap a PCI bar area.
173  */
174 void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
175                              uint64_t offset, uint64_t size)
176 {
177     if (bar) {
178         munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
179     }
180 }
181 
182 /**
183  * Initialize device IRQ with @irq_type and register an event notifier.
184  */
185 int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
186                            int irq_type, Error **errp)
187 {
188     int r;
189     struct vfio_irq_set *irq_set;
190     size_t irq_set_size;
191     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
192 
193     irq_info.index = irq_type;
194     if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
195         error_setg_errno(errp, errno, "Failed to get device interrupt info");
196         return -errno;
197     }
198     if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
199         error_setg(errp, "Device interrupt doesn't support eventfd");
200         return -EINVAL;
201     }
202 
203     irq_set_size = sizeof(*irq_set) + sizeof(int);
204     irq_set = g_malloc0(irq_set_size);
205 
206     /* Get to a known IRQ state */
207     *irq_set = (struct vfio_irq_set) {
208         .argsz = irq_set_size,
209         .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
210         .index = irq_info.index,
211         .start = 0,
212         .count = 1,
213     };
214 
215     *(int *)&irq_set->data = event_notifier_get_fd(e);
216     r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
217     g_free(irq_set);
218     if (r) {
219         error_setg_errno(errp, errno, "Failed to setup device interrupt");
220         return -errno;
221     }
222     return 0;
223 }
224 
225 static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
226                                      int size, int ofs)
227 {
228     int ret;
229 
230     do {
231         ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
232     } while (ret == -1 && errno == EINTR);
233     return ret == size ? 0 : -errno;
234 }
235 
236 static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
237 {
238     int ret;
239 
240     do {
241         ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
242     } while (ret == -1 && errno == EINTR);
243     return ret == size ? 0 : -errno;
244 }
245 
246 static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
247 {
248     struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
249     struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
250     struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
251     int i;
252 
253     while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
254         if (!cap->next) {
255             return;
256         }
257         cap = (struct vfio_info_cap_header *)(buf + cap->next);
258     }
259 
260     cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
261 
262     s->nb_iova_ranges = cap_iova_range->nr_iovas;
263     if (s->nb_iova_ranges > 1) {
264         s->usable_iova_ranges =
265             g_realloc(s->usable_iova_ranges,
266                       s->nb_iova_ranges * sizeof(struct IOVARange));
267     }
268 
269     for (i = 0; i < s->nb_iova_ranges; i++) {
270         s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
271         s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
272     }
273 }
274 
275 static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
276                               Error **errp)
277 {
278     int ret;
279     int i;
280     uint16_t pci_cmd;
281     struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
282     struct vfio_iommu_type1_info *iommu_info = NULL;
283     size_t iommu_info_size = sizeof(*iommu_info);
284     struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
285     char *group_file = NULL;
286 
287     s->usable_iova_ranges = NULL;
288 
289     /* Create a new container */
290     s->container = open("/dev/vfio/vfio", O_RDWR);
291 
292     if (s->container == -1) {
293         error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
294         return -errno;
295     }
296     if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
297         error_setg(errp, "Invalid VFIO version");
298         ret = -EINVAL;
299         goto fail_container;
300     }
301 
302     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
303         error_setg_errno(errp, errno, "VFIO IOMMU check failed");
304         ret = -EINVAL;
305         goto fail_container;
306     }
307 
308     /* Open the group */
309     group_file = sysfs_find_group_file(device, errp);
310     if (!group_file) {
311         ret = -EINVAL;
312         goto fail_container;
313     }
314 
315     s->group = open(group_file, O_RDWR);
316     if (s->group == -1) {
317         error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
318                          group_file);
319         g_free(group_file);
320         ret = -errno;
321         goto fail_container;
322     }
323     g_free(group_file);
324 
325     /* Test the group is viable and available */
326     if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
327         error_setg_errno(errp, errno, "Failed to get VFIO group status");
328         ret = -errno;
329         goto fail;
330     }
331 
332     if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
333         error_setg(errp, "VFIO group is not viable");
334         ret = -EINVAL;
335         goto fail;
336     }
337 
338     /* Add the group to the container */
339     if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
340         error_setg_errno(errp, errno, "Failed to add group to VFIO container");
341         ret = -errno;
342         goto fail;
343     }
344 
345     /* Enable the IOMMU model we want */
346     if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
347         error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
348         ret = -errno;
349         goto fail;
350     }
351 
352     iommu_info = g_malloc0(iommu_info_size);
353     iommu_info->argsz = iommu_info_size;
354 
355     /* Get additional IOMMU info */
356     if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
357         error_setg_errno(errp, errno, "Failed to get IOMMU info");
358         ret = -errno;
359         goto fail;
360     }
361 
362     /*
363      * if the kernel does not report usable IOVA regions, choose
364      * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
365      */
366     s->nb_iova_ranges = 1;
367     s->usable_iova_ranges = g_new0(struct IOVARange, 1);
368     s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
369     s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
370 
371     if (iommu_info->argsz > iommu_info_size) {
372         iommu_info_size = iommu_info->argsz;
373         iommu_info = g_realloc(iommu_info, iommu_info_size);
374         if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
375             ret = -errno;
376             goto fail;
377         }
378         collect_usable_iova_ranges(s, iommu_info);
379     }
380 
381     s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
382 
383     if (s->device < 0) {
384         error_setg_errno(errp, errno, "Failed to get device fd");
385         ret = -errno;
386         goto fail;
387     }
388 
389     /* Test and setup the device */
390     if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
391         error_setg_errno(errp, errno, "Failed to get device info");
392         ret = -errno;
393         goto fail;
394     }
395 
396     if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
397         error_setg(errp, "Invalid device regions");
398         ret = -EINVAL;
399         goto fail;
400     }
401 
402     s->config_region_info = (struct vfio_region_info) {
403         .index = VFIO_PCI_CONFIG_REGION_INDEX,
404         .argsz = sizeof(struct vfio_region_info),
405     };
406     if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
407         error_setg_errno(errp, errno, "Failed to get config region info");
408         ret = -errno;
409         goto fail;
410     }
411 
412     for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
413         ret = qemu_vfio_pci_init_bar(s, i, errp);
414         if (ret) {
415             goto fail;
416         }
417     }
418 
419     /* Enable bus master */
420     ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
421     if (ret) {
422         goto fail;
423     }
424     pci_cmd |= PCI_COMMAND_MASTER;
425     ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
426     if (ret) {
427         goto fail;
428     }
429     g_free(iommu_info);
430     return 0;
431 fail:
432     g_free(s->usable_iova_ranges);
433     s->usable_iova_ranges = NULL;
434     s->nb_iova_ranges = 0;
435     g_free(iommu_info);
436     close(s->group);
437 fail_container:
438     close(s->container);
439     return ret;
440 }
441 
442 static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
443                                       void *host, size_t size)
444 {
445     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
446     trace_qemu_vfio_ram_block_added(s, host, size);
447     qemu_vfio_dma_map(s, host, size, false, NULL);
448 }
449 
450 static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
451                                         void *host, size_t size)
452 {
453     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
454     if (host) {
455         trace_qemu_vfio_ram_block_removed(s, host, size);
456         qemu_vfio_dma_unmap(s, host);
457     }
458 }
459 
460 static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque)
461 {
462     void *host_addr = qemu_ram_get_host_addr(rb);
463     ram_addr_t length = qemu_ram_get_used_length(rb);
464     int ret;
465     QEMUVFIOState *s = opaque;
466 
467     if (!host_addr) {
468         return 0;
469     }
470     ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
471     if (ret) {
472         fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n",
473                 host_addr, (uint64_t)length);
474     }
475     return 0;
476 }
477 
478 static void qemu_vfio_open_common(QEMUVFIOState *s)
479 {
480     qemu_mutex_init(&s->lock);
481     s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
482     s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
483     ram_block_notifier_add(&s->ram_notifier);
484     s->low_water_mark = QEMU_VFIO_IOVA_MIN;
485     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
486     qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
487 }
488 
489 /**
490  * Open a PCI device, e.g. "0000:00:01.0".
491  */
492 QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
493 {
494     int r;
495     QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
496 
497     r = qemu_vfio_init_pci(s, device, errp);
498     if (r) {
499         g_free(s);
500         return NULL;
501     }
502     qemu_vfio_open_common(s);
503     return s;
504 }
505 
506 static void qemu_vfio_dump_mapping(IOVAMapping *m)
507 {
508     if (QEMU_VFIO_DEBUG) {
509         printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
510                (uint64_t)m->size, (uint64_t)m->iova);
511     }
512 }
513 
514 static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
515 {
516     int i;
517 
518     if (QEMU_VFIO_DEBUG) {
519         printf("vfio mappings\n");
520         for (i = 0; i < s->nr_mappings; ++i) {
521             qemu_vfio_dump_mapping(&s->mappings[i]);
522         }
523     }
524 }
525 
526 /**
527  * Find the mapping entry that contains [host, host + size) and set @index to
528  * the position. If no entry contains it, @index is the position _after_ which
529  * to insert the new mapping. IOW, it is the index of the largest element that
530  * is smaller than @host, or -1 if no entry is.
531  */
532 static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
533                                            int *index)
534 {
535     IOVAMapping *p = s->mappings;
536     IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
537     IOVAMapping *mid;
538     trace_qemu_vfio_find_mapping(s, host);
539     if (!p) {
540         *index = -1;
541         return NULL;
542     }
543     while (true) {
544         mid = p + (q - p) / 2;
545         if (mid == p) {
546             break;
547         }
548         if (mid->host > host) {
549             q = mid;
550         } else if (mid->host < host) {
551             p = mid;
552         } else {
553             break;
554         }
555     }
556     if (mid->host > host) {
557         mid--;
558     } else if (mid < &s->mappings[s->nr_mappings - 1]
559                && (mid + 1)->host <= host) {
560         mid++;
561     }
562     *index = mid - &s->mappings[0];
563     if (mid >= &s->mappings[0] &&
564         mid->host <= host && mid->host + mid->size > host) {
565         assert(mid < &s->mappings[s->nr_mappings]);
566         return mid;
567     }
568     /* At this point *index + 1 is the right position to insert the new
569      * mapping.*/
570     return NULL;
571 }
572 
573 /**
574  * Allocate IOVA and create a new mapping record and insert it in @s.
575  */
576 static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
577                                           void *host, size_t size,
578                                           int index, uint64_t iova)
579 {
580     int shift;
581     IOVAMapping m = {.host = host, .size = size, .iova = iova};
582     IOVAMapping *insert;
583 
584     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
585     assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
586     assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
587     trace_qemu_vfio_new_mapping(s, host, size, index, iova);
588 
589     assert(index >= 0);
590     s->nr_mappings++;
591     s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
592     insert = &s->mappings[index];
593     shift = s->nr_mappings - index - 1;
594     if (shift) {
595         memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
596     }
597     *insert = m;
598     return insert;
599 }
600 
601 /* Do the DMA mapping with VFIO. */
602 static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
603                                 uint64_t iova)
604 {
605     struct vfio_iommu_type1_dma_map dma_map = {
606         .argsz = sizeof(dma_map),
607         .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
608         .iova = iova,
609         .vaddr = (uintptr_t)host,
610         .size = size,
611     };
612     trace_qemu_vfio_do_mapping(s, host, size, iova);
613 
614     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
615         error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
616         return -errno;
617     }
618     return 0;
619 }
620 
621 /**
622  * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
623  */
624 static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
625                                    Error **errp)
626 {
627     int index;
628     struct vfio_iommu_type1_dma_unmap unmap = {
629         .argsz = sizeof(unmap),
630         .flags = 0,
631         .iova = mapping->iova,
632         .size = mapping->size,
633     };
634 
635     index = mapping - s->mappings;
636     assert(mapping->size > 0);
637     assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
638     assert(index >= 0 && index < s->nr_mappings);
639     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
640         error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
641     }
642     memmove(mapping, &s->mappings[index + 1],
643             sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
644     s->nr_mappings--;
645     s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
646 }
647 
648 /* Check if the mapping list is (ascending) ordered. */
649 static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
650 {
651     int i;
652     if (QEMU_VFIO_DEBUG) {
653         for (i = 0; i < s->nr_mappings - 1; ++i) {
654             if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
655                 fprintf(stderr, "item %d not sorted!\n", i);
656                 qemu_vfio_dump_mappings(s);
657                 return false;
658             }
659             if (!(s->mappings[i].host + s->mappings[i].size <=
660                   s->mappings[i + 1].host)) {
661                 fprintf(stderr, "item %d overlap with next!\n", i);
662                 qemu_vfio_dump_mappings(s);
663                 return false;
664             }
665         }
666     }
667     return true;
668 }
669 
670 static int
671 qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
672 {
673     int i;
674 
675     for (i = 0; i < s->nb_iova_ranges; i++) {
676         if (s->usable_iova_ranges[i].end < s->low_water_mark) {
677             continue;
678         }
679         s->low_water_mark =
680             MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
681 
682         if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
683             s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
684             *iova = s->low_water_mark;
685             s->low_water_mark += size;
686             return 0;
687         }
688     }
689     return -ENOMEM;
690 }
691 
692 static int
693 qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
694 {
695     int i;
696 
697     for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
698         if (s->usable_iova_ranges[i].start > s->high_water_mark) {
699             continue;
700         }
701         s->high_water_mark =
702             MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
703 
704         if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
705             s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
706             *iova = s->high_water_mark - size;
707             s->high_water_mark = *iova;
708             return 0;
709         }
710     }
711     return -ENOMEM;
712 }
713 
714 /* Map [host, host + size) area into a contiguous IOVA address space, and store
715  * the result in @iova if not NULL. The caller need to make sure the area is
716  * aligned to page size, and mustn't overlap with existing mapping areas (split
717  * mapping status within this area is not allowed).
718  */
719 int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
720                       bool temporary, uint64_t *iova)
721 {
722     int ret = 0;
723     int index;
724     IOVAMapping *mapping;
725     uint64_t iova0;
726 
727     assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
728     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
729     trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
730     qemu_mutex_lock(&s->lock);
731     mapping = qemu_vfio_find_mapping(s, host, &index);
732     if (mapping) {
733         iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
734     } else {
735         if (s->high_water_mark - s->low_water_mark + 1 < size) {
736             ret = -ENOMEM;
737             goto out;
738         }
739         if (!temporary) {
740             if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
741                 ret = -ENOMEM;
742                 goto out;
743             }
744 
745             mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
746             if (!mapping) {
747                 ret = -ENOMEM;
748                 goto out;
749             }
750             assert(qemu_vfio_verify_mappings(s));
751             ret = qemu_vfio_do_mapping(s, host, size, iova0);
752             if (ret) {
753                 qemu_vfio_undo_mapping(s, mapping, NULL);
754                 goto out;
755             }
756             qemu_vfio_dump_mappings(s);
757         } else {
758             if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
759                 ret = -ENOMEM;
760                 goto out;
761             }
762             ret = qemu_vfio_do_mapping(s, host, size, iova0);
763             if (ret) {
764                 goto out;
765             }
766         }
767     }
768     if (iova) {
769         *iova = iova0;
770     }
771 out:
772     qemu_mutex_unlock(&s->lock);
773     return ret;
774 }
775 
776 /* Reset the high watermark and free all "temporary" mappings. */
777 int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
778 {
779     struct vfio_iommu_type1_dma_unmap unmap = {
780         .argsz = sizeof(unmap),
781         .flags = 0,
782         .iova = s->high_water_mark,
783         .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
784     };
785     trace_qemu_vfio_dma_reset_temporary(s);
786     QEMU_LOCK_GUARD(&s->lock);
787     if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
788         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
789         return -errno;
790     }
791     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
792     return 0;
793 }
794 
795 /* Unmapping the whole area that was previously mapped with
796  * qemu_vfio_dma_map(). */
797 void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
798 {
799     int index = 0;
800     IOVAMapping *m;
801 
802     if (!host) {
803         return;
804     }
805 
806     trace_qemu_vfio_dma_unmap(s, host);
807     qemu_mutex_lock(&s->lock);
808     m = qemu_vfio_find_mapping(s, host, &index);
809     if (!m) {
810         goto out;
811     }
812     qemu_vfio_undo_mapping(s, m, NULL);
813 out:
814     qemu_mutex_unlock(&s->lock);
815 }
816 
817 static void qemu_vfio_reset(QEMUVFIOState *s)
818 {
819     ioctl(s->device, VFIO_DEVICE_RESET);
820 }
821 
822 /* Close and free the VFIO resources. */
823 void qemu_vfio_close(QEMUVFIOState *s)
824 {
825     int i;
826 
827     if (!s) {
828         return;
829     }
830     for (i = 0; i < s->nr_mappings; ++i) {
831         qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
832     }
833     ram_block_notifier_remove(&s->ram_notifier);
834     g_free(s->usable_iova_ranges);
835     s->nb_iova_ranges = 0;
836     qemu_vfio_reset(s);
837     close(s->device);
838     close(s->group);
839     close(s->container);
840 }
841