/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/migration.h"

VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);

static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
{
    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
        return ram_block_uncoordinated_discard_disable(state);
    default:
        /*
         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
         * RamDiscardManager, however, it is completely untested.
         *
         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
         * completely the opposite of managing mapping/pinning dynamically as
         * required by RamDiscardManager. We would have to special-case sections
         * with a RamDiscardManager.
         */
        return ram_block_discard_disable(state);
    }
}

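/*
 * Unmap a DMA range and retrieve the dirty bitmap for it in the same
 * VFIO_IOMMU_UNMAP_DMA call, then mark the corresponding guest pages
 * dirty at iotlb->translated_addr.
 */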
static int vfio_dma_unmap_bitmap(VFIOContainer *container,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    VFIOBitmap vbmap;
    int ret;

    ret = vfio_bitmap_alloc(&vbmap, size);
    if (ret) {
        return ret;
    }

    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->iova = iova;
    unmap->size = size;
    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects pages to be tracked
     * at qemu_real_host_page_size granularity, so set bitmap->pgsize to
     * qemu_real_host_page_size().
     */
    bitmap->pgsize = qemu_real_host_page_size();
    bitmap->size = vbmap.size;
    bitmap->data = (__u64 *)vbmap.bitmap;

    if (vbmap.size > container->max_dirty_bitmap_size) {
        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
        ret = -E2BIG;
        goto unmap_exit;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    if (!ret) {
        cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
                iotlb->translated_addr, vbmap.pages);
    } else {
        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
    }

unmap_exit:
    g_free(unmap);
    g_free(vbmap.bitmap);

    return ret;
}

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
                   ram_addr_t size, IOMMUTLBEntry *iotlb)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };
    bool need_dirty_sync = false;
    int ret;

    if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
        if (!vfio_devices_all_device_dirty_tracking(container) &&
            container->dirty_pages_supported) {
            return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
        }

        need_dirty_sync = true;
    }

    while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
         * v4.15) where an overflow in its wrap-around check prevents us from
         * unmapping the last page of the address space.  Test for the error
         * condition and re-try the unmap excluding the last page.  The
         * expectation is that we've never mapped the last page anyway and this
         * unmap request comes via vIOMMU support which also makes it unlikely
         * that this page is used.  This bug was introduced well after type1 v2
         * support was introduced, so we shouldn't need to test for v1.  A fix
         * is queued for kernel v5.0 so this workaround can be removed once
         * affected kernels are sufficiently deprecated.
         */
        if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
            container->iommu_type == VFIO_TYPE1v2_IOMMU) {
            trace_vfio_dma_unmap_overflow_workaround();
            unmap.size -= 1ULL << ctz64(container->pgsizes);
            continue;
        }
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }

    if (need_dirty_sync) {
        ret = vfio_get_dirty_bitmap(container, iova, size,
                                    iotlb->translated_addr);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

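/*
 * Map [iova, iova + size) to the host virtual address @vaddr with read and,
 * unless @readonly, write permission.  Returns 0 on success or -errno.
 */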
int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                 ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping; if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
    return -errno;
}

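/*
 * Create a host DMA window covering @section for sPAPR TCE v2 containers
 * (a no-op for all other IOMMU types) and, when KVM is in use, wire the
 * resulting TCE table fd up to the KVM VFIO device for each group in the
 * container.
 */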
int vfio_container_add_section_window(VFIOContainer *container,
                                      MemoryRegionSection *section,
                                      Error **errp)
{
    VFIOHostDMAWindow *hostwin;
    hwaddr pgsize = 0;
    int ret;

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return 0;
    }

    /* For now intersections are not allowed, we may relax this later */
    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           section->offset_within_address_space,
                           int128_get64(section->size))) {
            error_setg(errp,
                "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing "
                "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                section->offset_within_address_space,
                section->offset_within_address_space +
                    int128_get64(section->size) - 1,
                hostwin->min_iova, hostwin->max_iova);
            return -EINVAL;
        }
    }

    ret = vfio_spapr_create_window(container, section, &pgsize);
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to create SPAPR window");
        return ret;
    }

    vfio_host_win_add(container, section->offset_within_address_space,
                      section->offset_within_address_space +
                      int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
    if (kvm_enabled()) {
        VFIOGroup *group;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        struct kvm_vfio_spapr_tce param;
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
            .addr = (uint64_t)(unsigned long)&param,
        };

        if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                          &param.tablefd)) {
            QLIST_FOREACH(group, &container->group_list, container_next) {
                param.groupfd = group->fd;
                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                    error_setg_errno(errp, errno,
                                     "vfio: failed GROUP_SET_SPAPR_TCE for "
                                     "KVM VFIO device %d and group fd %d",
                                     param.tablefd, param.groupfd);
                    return -errno;
                }
                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
            }
        }
    }
#endif
    return 0;
}

void vfio_container_del_section_window(VFIOContainer *container,
                                       MemoryRegionSection *section)
{
    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    vfio_spapr_remove_window(container,
                             section->offset_within_address_space);
    if (vfio_host_win_del(container,
                          section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1) < 0) {
        hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                 __func__, section->offset_within_address_space);
    }
}

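/*
 * Start or stop dirty page tracking in the IOMMU backend via
 * VFIO_IOMMU_DIRTY_PAGES.  A no-op when the container does not support
 * dirty page tracking.
 */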
int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
{
    int ret;
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
    };

    if (!container->dirty_pages_supported) {
        return 0;
    }

    if (start) {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
    } else {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
    if (ret) {
        ret = -errno;
        error_report("Failed to set dirty tracking flag 0x%x errno: %d",
                     dirty.flags, errno);
    }

    return ret;
}

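/*
 * Fill @vbmap with the dirty bitmap for [iova, iova + size) as reported by
 * the IOMMU backend through VFIO_IOMMU_DIRTY_PAGES with the GET_BITMAP flag.
 */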
int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
                            hwaddr iova, hwaddr size)
{
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    int ret;

    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects pages to be tracked
     * at qemu_real_host_page_size granularity, so set the bitmap's pgsize to
     * qemu_real_host_page_size().
     */
    range->bitmap.pgsize = qemu_real_host_page_size();
    range->bitmap.size = vbmap->size;
    range->bitmap.data = (__u64 *)vbmap->bitmap;

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
    if (ret) {
        ret = -errno;
        error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
                " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
                (uint64_t)range->size, errno);
    }

    g_free(dbitmap);

    return ret;
}

static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->listener);
    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&container->prereg_listener);
    }
}

static struct vfio_info_cap_header *
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
                             unsigned int *avail)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_dma_avail *cap;

    /* If the capability cannot be found, assume no DMA limiting */
    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
    if (!hdr) {
        return false;
    }

    if (avail != NULL) {
        cap = (void *) hdr;
        *avail = cap->avail;
    }

    return true;
}

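/*
 * Record the valid IOVA ranges advertised by the IOMMU info capability
 * chain in container->iova_ranges.  Returns false if the capability is
 * absent.
 */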
static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
                                     VFIOContainer *container)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_iova_range *cap;

    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
    if (!hdr) {
        return false;
    }

    cap = (void *)hdr;

    for (int i = 0; i < cap->nr_iovas; i++) {
        Range *range = g_new(Range, 1);

        range_set_bounds(range, cap->iova_ranges[i].start,
                         cap->iova_ranges[i].end);
        container->iova_ranges =
            range_list_insert(container->iova_ranges, range);
    }

    return true;
}

static void vfio_kvm_device_add_group(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_add_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_del_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
static int vfio_get_iommu_type(VFIOContainer *container,
                               Error **errp)
{
    int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
                          VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
    int i;

    for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
        if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
            return iommu_types[i];
        }
    }
    error_setg(errp, "No available IOMMU models");
    return -EINVAL;
}

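/*
 * Attach @group_fd to the container fd and select an IOMMU backend for it,
 * falling back from sPAPR TCE v2 to v1 when the platform does not support
 * v2.
 */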
static int vfio_init_container(VFIOContainer *container, int group_fd,
                               Error **errp)
{
    int iommu_type, ret;

    iommu_type = vfio_get_iommu_type(container, errp);
    if (iommu_type < 0) {
        return iommu_type;
    }

    ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
    if (ret) {
        error_setg_errno(errp, errno, "Failed to set group container");
        return -errno;
    }

    while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
        if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, although the IOMMU subdriver always advertises v1
             * and v2, the running platform may not support v2, and there is
             * no way to know until an IOMMU group is added to the container.
             * So if setting v2 fails, fall back to v1.
             */
            iommu_type = VFIO_SPAPR_TCE_IOMMU;
            continue;
        }
        error_setg_errno(errp, errno, "Failed to set iommu for container");
        return -errno;
    }

    container->iommu_type = iommu_type;
    return 0;
}

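/*
 * Query VFIO_IOMMU_GET_INFO, growing the buffer until the kernel's reported
 * argsz fits.  The caller frees *info.
 */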
static int vfio_get_iommu_info(VFIOContainer *container,
                               struct vfio_iommu_type1_info **info)
{

    size_t argsz = sizeof(struct vfio_iommu_type1_info);

    *info = g_new0(struct vfio_iommu_type1_info, 1);
again:
    (*info)->argsz = argsz;

    if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if (((*info)->argsz > argsz)) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);
        goto again;
    }

    return 0;
}

static struct vfio_info_cap_header *
vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    struct vfio_info_cap_header *hdr;
    void *ptr = info;

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

static void vfio_get_iommu_info_migration(VFIOContainer *container,
                                          struct vfio_iommu_type1_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_migration *cap_mig;

    hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
    if (!hdr) {
        return;
    }

    cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
                            header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() can only track dirty pages
     * at qemu_real_host_page_size granularity.
     */
    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
        container->dirty_pages_supported = true;
        container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
        container->dirty_pgsizes = cap_mig->pgsize_bitmap;
    }
}

static void vfio_free_container(VFIOContainer *container)
{
    g_list_free_full(container->iova_ranges, g_free);
    g_free(container);
}

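/*
 * Attach @group to a container in the VFIOAddressSpace for @as: reuse an
 * existing container if the kernel accepts the group for it, otherwise open
 * /dev/vfio/vfio, set up a new container and register its memory listeners.
 */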
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                                  Error **errp)
{
    VFIOContainer *container;
    int ret, fd;
    VFIOAddressSpace *space;

    space = vfio_get_address_space(as);

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned.  We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared.  This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for instance
     * if the device is an mdev device where it is known that the host vendor
     * driver will never pin pages outside of the working set of the guest
     * driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace.  If any
     * pages are already zapped from the virtual address space, such as from
     * previous discards, new pinning will cause valid mappings to be
     * re-established.  Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     *
     * In particular, virtio-balloon is currently only prevented from
     * discarding new memory; it does not yet set
     * ram_block_discard_set_required() and therefore neither stops us here
     * nor deals with the sudden memory consumption of inflated memory.
     * We do support discarding of memory coordinated via the RamDiscardManager
     * with some IOMMU types. vfio_ram_block_discard_disable() handles the
     * details once we know which type of IOMMU we are using.
     */

    QLIST_FOREACH(container, &space->containers, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            ret = vfio_ram_block_discard_disable(container, true);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "Cannot set discarding of RAM broken");
                if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
                          &container->fd)) {
                    error_report("vfio: error disconnecting group %d from"
                                 " container", group->groupid);
                }
                return ret;
            }
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            vfio_kvm_device_add_group(group);
            return 0;
        }
    }

    fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
        ret = -errno;
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        ret = -EINVAL;
        goto close_fd_exit;
    }

    container = g_malloc0(sizeof(*container));
    container->space = space;
    container->fd = fd;
    container->error = NULL;
    container->dirty_pages_supported = false;
    container->dma_max_mappings = 0;
    container->iova_ranges = NULL;
    QLIST_INIT(&container->giommu_list);
    QLIST_INIT(&container->hostwin_list);
    QLIST_INIT(&container->vrdl_list);

    ret = vfio_init_container(container, group->fd, errp);
    if (ret) {
        goto free_container_exit;
    }

    ret = vfio_ram_block_discard_disable(container, true);
    if (ret) {
        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
        goto free_container_exit;
    }

    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
    {
        struct vfio_iommu_type1_info *info;

        ret = vfio_get_iommu_info(container, &info);
        if (ret) {
            error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
            goto enable_discards_exit;
        }

        if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
            container->pgsizes = info->iova_pgsizes;
        } else {
            container->pgsizes = qemu_real_host_page_size();
        }

        if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
            container->dma_max_mappings = 65535;
        }

        vfio_get_info_iova_range(info, container);

        vfio_get_iommu_info_migration(container, info);
        g_free(info);

        /*
         * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
         * information to get the actual window extent rather than assume
         * a 64-bit IOVA address space.
         */
        vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);

        break;
    }
    case VFIO_SPAPR_TCE_v2_IOMMU:
    case VFIO_SPAPR_TCE_IOMMU:
    {
        struct vfio_iommu_spapr_tce_info info;
        bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;

        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when the container fd is closed, so we do not call it explicitly
         * in this file.
         */
        if (!v2) {
            ret = ioctl(fd, VFIO_IOMMU_ENABLE);
            if (ret) {
                error_setg_errno(errp, errno, "failed to enable container");
                ret = -errno;
                goto enable_discards_exit;
            }
        } else {
            container->prereg_listener = vfio_prereg_listener;

            memory_listener_register(&container->prereg_listener,
                                     &address_space_memory);
            if (container->error) {
                memory_listener_unregister(&container->prereg_listener);
                ret = -1;
                error_propagate_prepend(errp, container->error,
                    "RAM memory listener initialization failed: ");
                goto enable_discards_exit;
            }
        }

        info.argsz = sizeof(info);
        ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
        if (ret) {
            error_setg_errno(errp, errno,
                             "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
            ret = -errno;
            if (v2) {
                memory_listener_unregister(&container->prereg_listener);
            }
            goto enable_discards_exit;
        }

        if (v2) {
            container->pgsizes = info.ddw.pgsizes;
            /*
             * A newly created container comes with a default DMA window.
             * To keep region_add/del simple, remove it now and let the
             * iommu_listener callbacks create/remove windows as needed.
             */
            ret = vfio_spapr_remove_window(container, info.dma32_window_start);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "failed to remove existing window");
                goto enable_discards_exit;
            }
        } else {
            /* The default table uses 4K pages */
            container->pgsizes = 0x1000;
            vfio_host_win_add(container, info.dma32_window_start,
                              info.dma32_window_start +
                              info.dma32_window_size - 1,
                              0x1000);
        }
    }
    }

    vfio_kvm_device_add_group(group);

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&space->containers, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    container->listener = vfio_memory_listener;

    memory_listener_register(&container->listener, container->space->as);

    if (container->error) {
        ret = -1;
        error_propagate_prepend(errp, container->error,
            "memory listener initialization failed: ");
        goto listener_release_exit;
    }

    container->initialized = true;

    return 0;
listener_release_exit:
    QLIST_REMOVE(group, container_next);
    QLIST_REMOVE(container, next);
    vfio_kvm_device_del_group(group);
    vfio_listener_release(container);

enable_discards_exit:
    vfio_ram_block_discard_disable(container, false);

free_container_exit:
    vfio_free_container(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return ret;
}

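/*
 * Detach @group from its container.  When the last group leaves, unregister
 * the listeners, free the per-container state and release the address space.
 */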
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Explicitly release the listener before unsetting the container,
     * since the unset may destroy the backend container if this is the
     * last group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        vfio_listener_release(container);
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = container->space;
        VFIOGuestIOMMU *giommu, *tmp;
        VFIOHostDMAWindow *hostwin, *next;

        QLIST_REMOVE(container, next);

        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
            memory_region_unregister_iommu_notifier(
                    MEMORY_REGION(giommu->iommu_mr), &giommu->n);
            QLIST_REMOVE(giommu, giommu_next);
            g_free(giommu);
        }

        QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
                           next) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
        }

        trace_vfio_disconnect_container(container->fd);
        close(container->fd);
        vfio_free_container(container);

        vfio_put_address_space(space);
    }
}

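/*
 * Return the VFIOGroup for @groupid, opening /dev/vfio/<groupid> and
 * connecting it to a container in @as if it is not already in use.  Fails if
 * the group is not viable or is already used in a different address space.
 */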
static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open_old(path, O_RDWR);
    if (group->fd < 0) {
        error_setg_errno(errp, errno, "failed to open %s", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

static void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);
}

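/*
 * Obtain a device fd for @name from @group, fill in the VFIODevice fields
 * from the device info and enforce a consistent RAM discard policy for all
 * devices within the group.
 */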
static int vfio_get_device(VFIOGroup *group, const char *name,
                           VFIODevice *vbasedev, Error **errp)
{
    g_autofree struct vfio_device_info *info = NULL;
    int fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return fd;
    }

    info = vfio_get_device_info(fd);
    if (!info) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return -1;
    }

    /*
     * Set discarding of RAM as not broken for this group if the driver knows
     * the device operates compatibly with discarding.  Setting must be
     * consistent per group, but since compatibility is really only possible
     * with mdev currently, we expect singleton groups.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return -1;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = info->num_irqs;
    vbasedev->num_regions = info->num_regions;
    vbasedev->flags = info->flags;

    trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);

    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);

    return 0;
}

static void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

/*
 * Interfaces for IBM EEH (Enhanced Error Handling)
 */
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
    /*
     * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
     * implementation is broken if there are multiple groups in a
     * container.  The hardware works in units of Partitionable
     * Endpoints (== IOMMU groups) and the EEH operations naively
     * iterate across all groups in the container, without any logic
     * to make sure the groups have their state synchronized.  For
     * certain operations (ENABLE) that might be ok, until an error
     * occurs, but for others (GET_STATE) it's clearly broken.
     */

    /*
     * XXX Once fixed kernels exist, test for them here
     */

    if (QLIST_EMPTY(&container->group_list)) {
        return false;
    }

    if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
        return false;
    }

    return true;
}

static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
{
    struct vfio_eeh_pe_op pe_op = {
        .argsz = sizeof(pe_op),
        .op = op,
    };
    int ret;

    if (!vfio_eeh_container_ok(container)) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
                     "kernel requires a container with exactly one group", op);
        return -EPERM;
    }

    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
    if (ret < 0) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
        return -errno;
    }

    return ret;
}

static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);
    VFIOContainer *container = NULL;

    if (QLIST_EMPTY(&space->containers)) {
        /* No containers to act on */
        goto out;
    }

    container = QLIST_FIRST(&space->containers);

    if (QLIST_NEXT(container, next)) {
        /*
         * We don't yet have logic to synchronize EEH state across
         * multiple containers
         */
        container = NULL;
        goto out;
    }

out:
    vfio_put_address_space(space);
    return container;
}

bool vfio_eeh_as_ok(AddressSpace *as)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    return (container != NULL) && vfio_eeh_container_ok(container);
}

int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return -ENODEV;
    }
    return vfio_eeh_container_op(container, op);
}

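/*
 * Resolve the device's sysfs iommu_group symlink and return the numeric
 * IOMMU group id, or a negative error code on failure.
 */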
static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
{
    char *tmp, group_path[PATH_MAX], *group_name;
    int ret, groupid;
    ssize_t len;

    tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
    len = readlink(tmp, group_path, sizeof(group_path));
    g_free(tmp);

    if (len <= 0 || len >= sizeof(group_path)) {
        ret = len < 0 ? -errno : -ENAMETOOLONG;
        error_setg_errno(errp, -ret, "no iommu_group found");
        return ret;
    }

    group_path[len] = 0;

    group_name = basename(group_path);
    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_setg_errno(errp, errno, "failed to read %s", group_path);
        return -errno;
    }
    return groupid;
}

/*
 * vfio_attach_device: attach a device to a security context
 * @name and @vbasedev->name are likely to be different depending
 * on the type of the device, hence the need for passing @name
 */
int vfio_attach_device(char *name, VFIODevice *vbasedev,
                       AddressSpace *as, Error **errp)
{
    int groupid = vfio_device_groupid(vbasedev, errp);
    VFIODevice *vbasedev_iter;
    VFIOGroup *group;
    VFIOContainer *container;
    int ret;

    if (groupid < 0) {
        return groupid;
    }

    trace_vfio_attach_device(vbasedev->name, groupid);

    group = vfio_get_group(groupid, as, errp);
    if (!group) {
        return -ENOENT;
    }

    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
            error_setg(errp, "device is already attached");
            vfio_put_group(group);
            return -EBUSY;
        }
    }
    ret = vfio_get_device(group, name, vbasedev, errp);
    if (ret) {
        vfio_put_group(group);
        return ret;
    }

    container = group->container;
    vbasedev->container = container;
    QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next);
    QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);

    return ret;
}

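/*
 * vfio_detach_device: undo vfio_attach_device, releasing the device fd and
 * dropping the group reference once no devices remain in it.
 */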
void vfio_detach_device(VFIODevice *vbasedev)
{
    VFIOGroup *group = vbasedev->group;

    if (!vbasedev->container) {
        return;
    }

    QLIST_REMOVE(vbasedev, global_next);
    QLIST_REMOVE(vbasedev, container_next);
    vbasedev->container = NULL;
    trace_vfio_detach_device(vbasedev->name, group->groupid);
    vfio_put_base_device(vbasedev);
    vfio_put_group(group);
}
1198