xref: /openbmc/qemu/hw/vfio/container.c (revision 87417811)
1 /*
2  * generic functions used by VFIO devices
3  *
4  * Copyright Red Hat, Inc. 2012
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Based on qemu-kvm device-assignment:
13  *  Adapted for KVM by Qumranet.
14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19  */
20 
21 #include "qemu/osdep.h"
22 #include <sys/ioctl.h>
23 #ifdef CONFIG_KVM
24 #include <linux/kvm.h>
25 #endif
26 #include <linux/vfio.h>
27 
28 #include "hw/vfio/vfio-common.h"
29 #include "hw/vfio/vfio.h"
30 #include "exec/address-spaces.h"
31 #include "exec/memory.h"
32 #include "exec/ram_addr.h"
33 #include "hw/hw.h"
34 #include "qemu/error-report.h"
35 #include "qemu/range.h"
36 #include "sysemu/kvm.h"
37 #include "sysemu/reset.h"
38 #include "trace.h"
39 #include "qapi/error.h"
40 #include "migration/migration.h"
41 
42 VFIOGroupList vfio_group_list =
43     QLIST_HEAD_INITIALIZER(vfio_group_list);
44 
45 static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
46 {
47     switch (container->iommu_type) {
48     case VFIO_TYPE1v2_IOMMU:
49     case VFIO_TYPE1_IOMMU:
50         /*
51          * We support coordinated discarding of RAM via the RamDiscardManager.
52          */
53         return ram_block_uncoordinated_discard_disable(state);
54     default:
55         /*
56          * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
57          * RamDiscardManager, however, it is completely untested.
58          *
59          * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
60          * completely the opposite of managing mapping/pinning dynamically as
61          * required by RamDiscardManager. We would have to special-case sections
62          * with a RamDiscardManager.
63          */
64         return ram_block_discard_disable(state);
65     }
66 }
67 
68 static int vfio_dma_unmap_bitmap(VFIOContainer *container,
69                                  hwaddr iova, ram_addr_t size,
70                                  IOMMUTLBEntry *iotlb)
71 {
72     struct vfio_iommu_type1_dma_unmap *unmap;
73     struct vfio_bitmap *bitmap;
74     VFIOBitmap vbmap;
75     int ret;
76 
77     ret = vfio_bitmap_alloc(&vbmap, size);
78     if (ret) {
79         return ret;
80     }
81 
82     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
83 
84     unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
85     unmap->iova = iova;
86     unmap->size = size;
87     unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
88     bitmap = (struct vfio_bitmap *)&unmap->data;
89 
90     /*
91      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
92      * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
93      * to qemu_real_host_page_size.
94      */
95     bitmap->pgsize = qemu_real_host_page_size();
96     bitmap->size = vbmap.size;
97     bitmap->data = (__u64 *)vbmap.bitmap;
98 
99     if (vbmap.size > container->max_dirty_bitmap_size) {
100         error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
101         ret = -E2BIG;
102         goto unmap_exit;
103     }
104 
105     ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
106     if (!ret) {
107         cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
108                 iotlb->translated_addr, vbmap.pages);
109     } else {
110         error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
111     }
112 
113 unmap_exit:
114     g_free(unmap);
115     g_free(vbmap.bitmap);
116 
117     return ret;
118 }
119 
120 /*
121  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
122  */
123 int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
124                    ram_addr_t size, IOMMUTLBEntry *iotlb)
125 {
126     struct vfio_iommu_type1_dma_unmap unmap = {
127         .argsz = sizeof(unmap),
128         .flags = 0,
129         .iova = iova,
130         .size = size,
131     };
132     bool need_dirty_sync = false;
133     int ret;
134 
135     if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
136         if (!vfio_devices_all_device_dirty_tracking(container) &&
137             container->dirty_pages_supported) {
138             return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
139         }
140 
141         need_dirty_sync = true;
142     }
143 
144     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
145         /*
146          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
147          * v4.15) where an overflow in its wrap-around check prevents us from
148          * unmapping the last page of the address space.  Test for the error
149          * condition and re-try the unmap excluding the last page.  The
150          * expectation is that we've never mapped the last page anyway and this
151          * unmap request comes via vIOMMU support which also makes it unlikely
152          * that this page is used.  This bug was introduced well after type1 v2
153          * support was introduced, so we shouldn't need to test for v1.  A fix
154          * is queued for kernel v5.0 so this workaround can be removed once
155          * affected kernels are sufficiently deprecated.
156          */
157         if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
158             container->iommu_type == VFIO_TYPE1v2_IOMMU) {
159             trace_vfio_dma_unmap_overflow_workaround();
160             unmap.size -= 1ULL << ctz64(container->pgsizes);
161             continue;
162         }
163         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
164         return -errno;
165     }
166 
167     if (need_dirty_sync) {
168         ret = vfio_get_dirty_bitmap(container, iova, size,
169                                     iotlb->translated_addr);
170         if (ret) {
171             return ret;
172         }
173     }
174 
175     return 0;
176 }
177 
178 int vfio_dma_map(VFIOContainer *container, hwaddr iova,
179                  ram_addr_t size, void *vaddr, bool readonly)
180 {
181     struct vfio_iommu_type1_dma_map map = {
182         .argsz = sizeof(map),
183         .flags = VFIO_DMA_MAP_FLAG_READ,
184         .vaddr = (__u64)(uintptr_t)vaddr,
185         .iova = iova,
186         .size = size,
187     };
188 
189     if (!readonly) {
190         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
191     }
192 
193     /*
194      * Try the mapping, if it fails with EBUSY, unmap the region and try
195      * again.  This shouldn't be necessary, but we sometimes see it in
196      * the VGA ROM space.
197      */
198     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
199         (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
200          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
201         return 0;
202     }
203 
204     error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
205     return -errno;
206 }
207 
208 int vfio_container_add_section_window(VFIOContainer *container,
209                                       MemoryRegionSection *section,
210                                       Error **errp)
211 {
212     VFIOHostDMAWindow *hostwin;
213     hwaddr pgsize = 0;
214     int ret;
215 
216     if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
217         return 0;
218     }
219 
220     /* For now intersections are not allowed, we may relax this later */
221     QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
222         if (ranges_overlap(hostwin->min_iova,
223                            hostwin->max_iova - hostwin->min_iova + 1,
224                            section->offset_within_address_space,
225                            int128_get64(section->size))) {
226             error_setg(errp,
227                 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
228                 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
229                 section->offset_within_address_space,
230                 section->offset_within_address_space +
231                     int128_get64(section->size) - 1,
232                 hostwin->min_iova, hostwin->max_iova);
233             return -EINVAL;
234         }
235     }
236 
237     ret = vfio_spapr_create_window(container, section, &pgsize);
238     if (ret) {
239         error_setg_errno(errp, -ret, "Failed to create SPAPR window");
240         return ret;
241     }
242 
243     vfio_host_win_add(container, section->offset_within_address_space,
244                       section->offset_within_address_space +
245                       int128_get64(section->size) - 1, pgsize);
246 #ifdef CONFIG_KVM
247     if (kvm_enabled()) {
248         VFIOGroup *group;
249         IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
250         struct kvm_vfio_spapr_tce param;
251         struct kvm_device_attr attr = {
252             .group = KVM_DEV_VFIO_GROUP,
253             .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
254             .addr = (uint64_t)(unsigned long)&param,
255         };
256 
257         if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
258                                           &param.tablefd)) {
259             QLIST_FOREACH(group, &container->group_list, container_next) {
260                 param.groupfd = group->fd;
261                 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
262                     error_setg_errno(errp, errno,
263                                      "vfio: failed GROUP_SET_SPAPR_TCE for "
264                                      "KVM VFIO device %d and group fd %d",
265                                      param.tablefd, param.groupfd);
266                     return -errno;
267                 }
268                 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
269             }
270         }
271     }
272 #endif
273     return 0;
274 }
275 
276 void vfio_container_del_section_window(VFIOContainer *container,
277                                        MemoryRegionSection *section)
278 {
279     if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
280         return;
281     }
282 
283     vfio_spapr_remove_window(container,
284                              section->offset_within_address_space);
285     if (vfio_host_win_del(container,
286                           section->offset_within_address_space,
287                           section->offset_within_address_space +
288                           int128_get64(section->size) - 1) < 0) {
289         hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
290                  __func__, section->offset_within_address_space);
291     }
292 }
293 
294 int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
295 {
296     int ret;
297     struct vfio_iommu_type1_dirty_bitmap dirty = {
298         .argsz = sizeof(dirty),
299     };
300 
301     if (!container->dirty_pages_supported) {
302         return 0;
303     }
304 
305     if (start) {
306         dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
307     } else {
308         dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
309     }
310 
311     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
312     if (ret) {
313         ret = -errno;
314         error_report("Failed to set dirty tracking flag 0x%x errno: %d",
315                      dirty.flags, errno);
316     }
317 
318     return ret;
319 }
320 
321 int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
322                             hwaddr iova, hwaddr size)
323 {
324     struct vfio_iommu_type1_dirty_bitmap *dbitmap;
325     struct vfio_iommu_type1_dirty_bitmap_get *range;
326     int ret;
327 
328     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
329 
330     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
331     dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
332     range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
333     range->iova = iova;
334     range->size = size;
335 
336     /*
337      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
338      * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
339      * to qemu_real_host_page_size.
340      */
341     range->bitmap.pgsize = qemu_real_host_page_size();
342     range->bitmap.size = vbmap->size;
343     range->bitmap.data = (__u64 *)vbmap->bitmap;
344 
345     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
346     if (ret) {
347         ret = -errno;
348         error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
349                 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
350                 (uint64_t)range->size, errno);
351     }
352 
353     g_free(dbitmap);
354 
355     return ret;
356 }
357 
358 static void vfio_listener_release(VFIOContainer *container)
359 {
360     memory_listener_unregister(&container->listener);
361     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
362         memory_listener_unregister(&container->prereg_listener);
363     }
364 }
365 
366 static struct vfio_info_cap_header *
367 vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
368 {
369     if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
370         return NULL;
371     }
372 
373     return vfio_get_cap((void *)info, info->cap_offset, id);
374 }
375 
376 bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
377                              unsigned int *avail)
378 {
379     struct vfio_info_cap_header *hdr;
380     struct vfio_iommu_type1_info_dma_avail *cap;
381 
382     /* If the capability cannot be found, assume no DMA limiting */
383     hdr = vfio_get_iommu_type1_info_cap(info,
384                                         VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
385     if (hdr == NULL) {
386         return false;
387     }
388 
389     if (avail != NULL) {
390         cap = (void *) hdr;
391         *avail = cap->avail;
392     }
393 
394     return true;
395 }
396 
397 static void vfio_kvm_device_add_group(VFIOGroup *group)
398 {
399     Error *err = NULL;
400 
401     if (vfio_kvm_device_add_fd(group->fd, &err)) {
402         error_reportf_err(err, "group ID %d: ", group->groupid);
403     }
404 }
405 
406 static void vfio_kvm_device_del_group(VFIOGroup *group)
407 {
408     Error *err = NULL;
409 
410     if (vfio_kvm_device_del_fd(group->fd, &err)) {
411         error_reportf_err(err, "group ID %d: ", group->groupid);
412     }
413 }
414 
415 /*
416  * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
417  */
418 static int vfio_get_iommu_type(VFIOContainer *container,
419                                Error **errp)
420 {
421     int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
422                           VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
423     int i;
424 
425     for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
426         if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
427             return iommu_types[i];
428         }
429     }
430     error_setg(errp, "No available IOMMU models");
431     return -EINVAL;
432 }
433 
434 static int vfio_init_container(VFIOContainer *container, int group_fd,
435                                Error **errp)
436 {
437     int iommu_type, ret;
438 
439     iommu_type = vfio_get_iommu_type(container, errp);
440     if (iommu_type < 0) {
441         return iommu_type;
442     }
443 
444     ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
445     if (ret) {
446         error_setg_errno(errp, errno, "Failed to set group container");
447         return -errno;
448     }
449 
450     while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
451         if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
452             /*
453              * On sPAPR, despite the IOMMU subdriver always advertises v1 and
454              * v2, the running platform may not support v2 and there is no
455              * way to guess it until an IOMMU group gets added to the container.
456              * So in case it fails with v2, try v1 as a fallback.
457              */
458             iommu_type = VFIO_SPAPR_TCE_IOMMU;
459             continue;
460         }
461         error_setg_errno(errp, errno, "Failed to set iommu for container");
462         return -errno;
463     }
464 
465     container->iommu_type = iommu_type;
466     return 0;
467 }
468 
469 static int vfio_get_iommu_info(VFIOContainer *container,
470                                struct vfio_iommu_type1_info **info)
471 {
472 
473     size_t argsz = sizeof(struct vfio_iommu_type1_info);
474 
475     *info = g_new0(struct vfio_iommu_type1_info, 1);
476 again:
477     (*info)->argsz = argsz;
478 
479     if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
480         g_free(*info);
481         *info = NULL;
482         return -errno;
483     }
484 
485     if (((*info)->argsz > argsz)) {
486         argsz = (*info)->argsz;
487         *info = g_realloc(*info, argsz);
488         goto again;
489     }
490 
491     return 0;
492 }
493 
494 static struct vfio_info_cap_header *
495 vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
496 {
497     struct vfio_info_cap_header *hdr;
498     void *ptr = info;
499 
500     if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
501         return NULL;
502     }
503 
504     for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
505         if (hdr->id == id) {
506             return hdr;
507         }
508     }
509 
510     return NULL;
511 }
512 
513 static void vfio_get_iommu_info_migration(VFIOContainer *container,
514                                           struct vfio_iommu_type1_info *info)
515 {
516     struct vfio_info_cap_header *hdr;
517     struct vfio_iommu_type1_info_cap_migration *cap_mig;
518 
519     hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
520     if (!hdr) {
521         return;
522     }
523 
524     cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
525                             header);
526 
527     /*
528      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
529      * qemu_real_host_page_size to mark those dirty.
530      */
531     if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
532         container->dirty_pages_supported = true;
533         container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
534         container->dirty_pgsizes = cap_mig->pgsize_bitmap;
535     }
536 }
537 
538 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
539                                   Error **errp)
540 {
541     VFIOContainer *container;
542     int ret, fd;
543     VFIOAddressSpace *space;
544 
545     space = vfio_get_address_space(as);
546 
547     /*
548      * VFIO is currently incompatible with discarding of RAM insofar as the
549      * madvise to purge (zap) the page from QEMU's address space does not
550      * interact with the memory API and therefore leaves stale virtual to
551      * physical mappings in the IOMMU if the page was previously pinned.  We
552      * therefore set discarding broken for each group added to a container,
553      * whether the container is used individually or shared.  This provides
554      * us with options to allow devices within a group to opt-in and allow
555      * discarding, so long as it is done consistently for a group (for instance
556      * if the device is an mdev device where it is known that the host vendor
557      * driver will never pin pages outside of the working set of the guest
558      * driver, which would thus not be discarding candidates).
559      *
560      * The first opportunity to induce pinning occurs here where we attempt to
561      * attach the group to existing containers within the AddressSpace.  If any
562      * pages are already zapped from the virtual address space, such as from
563      * previous discards, new pinning will cause valid mappings to be
564      * re-established.  Likewise, when the overall MemoryListener for a new
565      * container is registered, a replay of mappings within the AddressSpace
566      * will occur, re-establishing any previously zapped pages as well.
567      *
568      * Especially virtio-balloon is currently only prevented from discarding
569      * new memory, it will not yet set ram_block_discard_set_required() and
570      * therefore, neither stops us here or deals with the sudden memory
571      * consumption of inflated memory.
572      *
573      * We do support discarding of memory coordinated via the RamDiscardManager
574      * with some IOMMU types. vfio_ram_block_discard_disable() handles the
575      * details once we know which type of IOMMU we are using.
576      */
577 
578     QLIST_FOREACH(container, &space->containers, next) {
579         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
580             ret = vfio_ram_block_discard_disable(container, true);
581             if (ret) {
582                 error_setg_errno(errp, -ret,
583                                  "Cannot set discarding of RAM broken");
584                 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
585                           &container->fd)) {
586                     error_report("vfio: error disconnecting group %d from"
587                                  " container", group->groupid);
588                 }
589                 return ret;
590             }
591             group->container = container;
592             QLIST_INSERT_HEAD(&container->group_list, group, container_next);
593             vfio_kvm_device_add_group(group);
594             return 0;
595         }
596     }
597 
598     fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
599     if (fd < 0) {
600         error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
601         ret = -errno;
602         goto put_space_exit;
603     }
604 
605     ret = ioctl(fd, VFIO_GET_API_VERSION);
606     if (ret != VFIO_API_VERSION) {
607         error_setg(errp, "supported vfio version: %d, "
608                    "reported version: %d", VFIO_API_VERSION, ret);
609         ret = -EINVAL;
610         goto close_fd_exit;
611     }
612 
613     container = g_malloc0(sizeof(*container));
614     container->space = space;
615     container->fd = fd;
616     container->error = NULL;
617     container->dirty_pages_supported = false;
618     container->dma_max_mappings = 0;
619     QLIST_INIT(&container->giommu_list);
620     QLIST_INIT(&container->hostwin_list);
621     QLIST_INIT(&container->vrdl_list);
622 
623     ret = vfio_init_container(container, group->fd, errp);
624     if (ret) {
625         goto free_container_exit;
626     }
627 
628     ret = vfio_ram_block_discard_disable(container, true);
629     if (ret) {
630         error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
631         goto free_container_exit;
632     }
633 
634     switch (container->iommu_type) {
635     case VFIO_TYPE1v2_IOMMU:
636     case VFIO_TYPE1_IOMMU:
637     {
638         struct vfio_iommu_type1_info *info;
639 
640         ret = vfio_get_iommu_info(container, &info);
641         if (ret) {
642             error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
643             goto enable_discards_exit;
644         }
645 
646         if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
647             container->pgsizes = info->iova_pgsizes;
648         } else {
649             container->pgsizes = qemu_real_host_page_size();
650         }
651 
652         if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
653             container->dma_max_mappings = 65535;
654         }
655         vfio_get_iommu_info_migration(container, info);
656         g_free(info);
657 
658         /*
659          * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
660          * information to get the actual window extent rather than assume
661          * a 64-bit IOVA address space.
662          */
663         vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
664 
665         break;
666     }
667     case VFIO_SPAPR_TCE_v2_IOMMU:
668     case VFIO_SPAPR_TCE_IOMMU:
669     {
670         struct vfio_iommu_spapr_tce_info info;
671         bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
672 
673         /*
674          * The host kernel code implementing VFIO_IOMMU_DISABLE is called
675          * when container fd is closed so we do not call it explicitly
676          * in this file.
677          */
678         if (!v2) {
679             ret = ioctl(fd, VFIO_IOMMU_ENABLE);
680             if (ret) {
681                 error_setg_errno(errp, errno, "failed to enable container");
682                 ret = -errno;
683                 goto enable_discards_exit;
684             }
685         } else {
686             container->prereg_listener = vfio_prereg_listener;
687 
688             memory_listener_register(&container->prereg_listener,
689                                      &address_space_memory);
690             if (container->error) {
691                 memory_listener_unregister(&container->prereg_listener);
692                 ret = -1;
693                 error_propagate_prepend(errp, container->error,
694                     "RAM memory listener initialization failed: ");
695                 goto enable_discards_exit;
696             }
697         }
698 
699         info.argsz = sizeof(info);
700         ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
701         if (ret) {
702             error_setg_errno(errp, errno,
703                              "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
704             ret = -errno;
705             if (v2) {
706                 memory_listener_unregister(&container->prereg_listener);
707             }
708             goto enable_discards_exit;
709         }
710 
711         if (v2) {
712             container->pgsizes = info.ddw.pgsizes;
713             /*
714              * There is a default window in just created container.
715              * To make region_add/del simpler, we better remove this
716              * window now and let those iommu_listener callbacks
717              * create/remove them when needed.
718              */
719             ret = vfio_spapr_remove_window(container, info.dma32_window_start);
720             if (ret) {
721                 error_setg_errno(errp, -ret,
722                                  "failed to remove existing window");
723                 goto enable_discards_exit;
724             }
725         } else {
726             /* The default table uses 4K pages */
727             container->pgsizes = 0x1000;
728             vfio_host_win_add(container, info.dma32_window_start,
729                               info.dma32_window_start +
730                               info.dma32_window_size - 1,
731                               0x1000);
732         }
733     }
734     }
735 
736     vfio_kvm_device_add_group(group);
737 
738     QLIST_INIT(&container->group_list);
739     QLIST_INSERT_HEAD(&space->containers, container, next);
740 
741     group->container = container;
742     QLIST_INSERT_HEAD(&container->group_list, group, container_next);
743 
744     container->listener = vfio_memory_listener;
745 
746     memory_listener_register(&container->listener, container->space->as);
747 
748     if (container->error) {
749         ret = -1;
750         error_propagate_prepend(errp, container->error,
751             "memory listener initialization failed: ");
752         goto listener_release_exit;
753     }
754 
755     container->initialized = true;
756 
757     return 0;
758 listener_release_exit:
759     QLIST_REMOVE(group, container_next);
760     QLIST_REMOVE(container, next);
761     vfio_kvm_device_del_group(group);
762     vfio_listener_release(container);
763 
764 enable_discards_exit:
765     vfio_ram_block_discard_disable(container, false);
766 
767 free_container_exit:
768     g_free(container);
769 
770 close_fd_exit:
771     close(fd);
772 
773 put_space_exit:
774     vfio_put_address_space(space);
775 
776     return ret;
777 }
778 
779 static void vfio_disconnect_container(VFIOGroup *group)
780 {
781     VFIOContainer *container = group->container;
782 
783     QLIST_REMOVE(group, container_next);
784     group->container = NULL;
785 
786     /*
787      * Explicitly release the listener first before unset container,
788      * since unset may destroy the backend container if it's the last
789      * group.
790      */
791     if (QLIST_EMPTY(&container->group_list)) {
792         vfio_listener_release(container);
793     }
794 
795     if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
796         error_report("vfio: error disconnecting group %d from container",
797                      group->groupid);
798     }
799 
800     if (QLIST_EMPTY(&container->group_list)) {
801         VFIOAddressSpace *space = container->space;
802         VFIOGuestIOMMU *giommu, *tmp;
803         VFIOHostDMAWindow *hostwin, *next;
804 
805         QLIST_REMOVE(container, next);
806 
807         QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
808             memory_region_unregister_iommu_notifier(
809                     MEMORY_REGION(giommu->iommu_mr), &giommu->n);
810             QLIST_REMOVE(giommu, giommu_next);
811             g_free(giommu);
812         }
813 
814         QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
815                            next) {
816             QLIST_REMOVE(hostwin, hostwin_next);
817             g_free(hostwin);
818         }
819 
820         trace_vfio_disconnect_container(container->fd);
821         close(container->fd);
822         g_free(container);
823 
824         vfio_put_address_space(space);
825     }
826 }
827 
828 static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
829 {
830     VFIOGroup *group;
831     char path[32];
832     struct vfio_group_status status = { .argsz = sizeof(status) };
833 
834     QLIST_FOREACH(group, &vfio_group_list, next) {
835         if (group->groupid == groupid) {
836             /* Found it.  Now is it already in the right context? */
837             if (group->container->space->as == as) {
838                 return group;
839             } else {
840                 error_setg(errp, "group %d used in multiple address spaces",
841                            group->groupid);
842                 return NULL;
843             }
844         }
845     }
846 
847     group = g_malloc0(sizeof(*group));
848 
849     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
850     group->fd = qemu_open_old(path, O_RDWR);
851     if (group->fd < 0) {
852         error_setg_errno(errp, errno, "failed to open %s", path);
853         goto free_group_exit;
854     }
855 
856     if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
857         error_setg_errno(errp, errno, "failed to get group %d status", groupid);
858         goto close_fd_exit;
859     }
860 
861     if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
862         error_setg(errp, "group %d is not viable", groupid);
863         error_append_hint(errp,
864                           "Please ensure all devices within the iommu_group "
865                           "are bound to their vfio bus driver.\n");
866         goto close_fd_exit;
867     }
868 
869     group->groupid = groupid;
870     QLIST_INIT(&group->device_list);
871 
872     if (vfio_connect_container(group, as, errp)) {
873         error_prepend(errp, "failed to setup container for group %d: ",
874                       groupid);
875         goto close_fd_exit;
876     }
877 
878     QLIST_INSERT_HEAD(&vfio_group_list, group, next);
879 
880     return group;
881 
882 close_fd_exit:
883     close(group->fd);
884 
885 free_group_exit:
886     g_free(group);
887 
888     return NULL;
889 }
890 
891 static void vfio_put_group(VFIOGroup *group)
892 {
893     if (!group || !QLIST_EMPTY(&group->device_list)) {
894         return;
895     }
896 
897     if (!group->ram_block_discard_allowed) {
898         vfio_ram_block_discard_disable(group->container, false);
899     }
900     vfio_kvm_device_del_group(group);
901     vfio_disconnect_container(group);
902     QLIST_REMOVE(group, next);
903     trace_vfio_put_group(group->fd);
904     close(group->fd);
905     g_free(group);
906 }
907 
908 static int vfio_get_device(VFIOGroup *group, const char *name,
909                            VFIODevice *vbasedev, Error **errp)
910 {
911     g_autofree struct vfio_device_info *info = NULL;
912     int fd;
913 
914     fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
915     if (fd < 0) {
916         error_setg_errno(errp, errno, "error getting device from group %d",
917                          group->groupid);
918         error_append_hint(errp,
919                       "Verify all devices in group %d are bound to vfio-<bus> "
920                       "or pci-stub and not already in use\n", group->groupid);
921         return fd;
922     }
923 
924     info = vfio_get_device_info(fd);
925     if (!info) {
926         error_setg_errno(errp, errno, "error getting device info");
927         close(fd);
928         return -1;
929     }
930 
931     /*
932      * Set discarding of RAM as not broken for this group if the driver knows
933      * the device operates compatibly with discarding.  Setting must be
934      * consistent per group, but since compatibility is really only possible
935      * with mdev currently, we expect singleton groups.
936      */
937     if (vbasedev->ram_block_discard_allowed !=
938         group->ram_block_discard_allowed) {
939         if (!QLIST_EMPTY(&group->device_list)) {
940             error_setg(errp, "Inconsistent setting of support for discarding "
941                        "RAM (e.g., balloon) within group");
942             close(fd);
943             return -1;
944         }
945 
946         if (!group->ram_block_discard_allowed) {
947             group->ram_block_discard_allowed = true;
948             vfio_ram_block_discard_disable(group->container, false);
949         }
950     }
951 
952     vbasedev->fd = fd;
953     vbasedev->group = group;
954     QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
955 
956     vbasedev->num_irqs = info->num_irqs;
957     vbasedev->num_regions = info->num_regions;
958     vbasedev->flags = info->flags;
959 
960     trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);
961 
962     vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
963 
964     return 0;
965 }
966 
967 static void vfio_put_base_device(VFIODevice *vbasedev)
968 {
969     if (!vbasedev->group) {
970         return;
971     }
972     QLIST_REMOVE(vbasedev, next);
973     vbasedev->group = NULL;
974     trace_vfio_put_base_device(vbasedev->fd);
975     close(vbasedev->fd);
976 }
977 
978 /*
979  * Interfaces for IBM EEH (Enhanced Error Handling)
980  */
981 static bool vfio_eeh_container_ok(VFIOContainer *container)
982 {
983     /*
984      * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
985      * implementation is broken if there are multiple groups in a
986      * container.  The hardware works in units of Partitionable
987      * Endpoints (== IOMMU groups) and the EEH operations naively
988      * iterate across all groups in the container, without any logic
989      * to make sure the groups have their state synchronized.  For
990      * certain operations (ENABLE) that might be ok, until an error
991      * occurs, but for others (GET_STATE) it's clearly broken.
992      */
993 
994     /*
995      * XXX Once fixed kernels exist, test for them here
996      */
997 
998     if (QLIST_EMPTY(&container->group_list)) {
999         return false;
1000     }
1001 
1002     if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
1003         return false;
1004     }
1005 
1006     return true;
1007 }
1008 
1009 static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
1010 {
1011     struct vfio_eeh_pe_op pe_op = {
1012         .argsz = sizeof(pe_op),
1013         .op = op,
1014     };
1015     int ret;
1016 
1017     if (!vfio_eeh_container_ok(container)) {
1018         error_report("vfio/eeh: EEH_PE_OP 0x%x: "
1019                      "kernel requires a container with exactly one group", op);
1020         return -EPERM;
1021     }
1022 
1023     ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
1024     if (ret < 0) {
1025         error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
1026         return -errno;
1027     }
1028 
1029     return ret;
1030 }
1031 
1032 static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
1033 {
1034     VFIOAddressSpace *space = vfio_get_address_space(as);
1035     VFIOContainer *container = NULL;
1036 
1037     if (QLIST_EMPTY(&space->containers)) {
1038         /* No containers to act on */
1039         goto out;
1040     }
1041 
1042     container = QLIST_FIRST(&space->containers);
1043 
1044     if (QLIST_NEXT(container, next)) {
1045         /*
1046          * We don't yet have logic to synchronize EEH state across
1047          * multiple containers
1048          */
1049         container = NULL;
1050         goto out;
1051     }
1052 
1053 out:
1054     vfio_put_address_space(space);
1055     return container;
1056 }
1057 
1058 bool vfio_eeh_as_ok(AddressSpace *as)
1059 {
1060     VFIOContainer *container = vfio_eeh_as_container(as);
1061 
1062     return (container != NULL) && vfio_eeh_container_ok(container);
1063 }
1064 
1065 int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
1066 {
1067     VFIOContainer *container = vfio_eeh_as_container(as);
1068 
1069     if (!container) {
1070         return -ENODEV;
1071     }
1072     return vfio_eeh_container_op(container, op);
1073 }
1074 
1075 static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
1076 {
1077     char *tmp, group_path[PATH_MAX], *group_name;
1078     int ret, groupid;
1079     ssize_t len;
1080 
1081     tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
1082     len = readlink(tmp, group_path, sizeof(group_path));
1083     g_free(tmp);
1084 
1085     if (len <= 0 || len >= sizeof(group_path)) {
1086         ret = len < 0 ? -errno : -ENAMETOOLONG;
1087         error_setg_errno(errp, -ret, "no iommu_group found");
1088         return ret;
1089     }
1090 
1091     group_path[len] = 0;
1092 
1093     group_name = basename(group_path);
1094     if (sscanf(group_name, "%d", &groupid) != 1) {
1095         error_setg_errno(errp, errno, "failed to read %s", group_path);
1096         return -errno;
1097     }
1098     return groupid;
1099 }
1100 
1101 /*
1102  * vfio_attach_device: attach a device to a security context
1103  * @name and @vbasedev->name are likely to be different depending
1104  * on the type of the device, hence the need for passing @name
1105  */
1106 int vfio_attach_device(char *name, VFIODevice *vbasedev,
1107                        AddressSpace *as, Error **errp)
1108 {
1109     int groupid = vfio_device_groupid(vbasedev, errp);
1110     VFIODevice *vbasedev_iter;
1111     VFIOGroup *group;
1112     VFIOContainer *container;
1113     int ret;
1114 
1115     if (groupid < 0) {
1116         return groupid;
1117     }
1118 
1119     trace_vfio_attach_device(vbasedev->name, groupid);
1120 
1121     group = vfio_get_group(groupid, as, errp);
1122     if (!group) {
1123         return -ENOENT;
1124     }
1125 
1126     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
1127         if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
1128             error_setg(errp, "device is already attached");
1129             vfio_put_group(group);
1130             return -EBUSY;
1131         }
1132     }
1133     ret = vfio_get_device(group, name, vbasedev, errp);
1134     if (ret) {
1135         vfio_put_group(group);
1136         return ret;
1137     }
1138 
1139     container = group->container;
1140     vbasedev->container = container;
1141     QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next);
1142     QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
1143 
1144     return ret;
1145 }
1146 
1147 void vfio_detach_device(VFIODevice *vbasedev)
1148 {
1149     VFIOGroup *group = vbasedev->group;
1150 
1151     if (!vbasedev->container) {
1152         return;
1153     }
1154 
1155     QLIST_REMOVE(vbasedev, global_next);
1156     QLIST_REMOVE(vbasedev, container_next);
1157     vbasedev->container = NULL;
1158     trace_vfio_detach_device(vbasedev->name, group->groupid);
1159     vfio_put_base_device(vbasedev);
1160     vfio_put_group(group);
1161 }
1162