xref: /openbmc/qemu/hw/vfio/common.c (revision 36ebc7db)
1 /*
2  * generic functions used by VFIO devices
3  *
4  * Copyright Red Hat, Inc. 2012
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Based on qemu-kvm device-assignment:
13  *  Adapted for KVM by Qumranet.
14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19  */
20 
21 #include "qemu/osdep.h"
22 #include <sys/ioctl.h>
23 #ifdef CONFIG_KVM
24 #include <linux/kvm.h>
25 #endif
26 #include <linux/vfio.h>
27 
28 #include "hw/vfio/vfio-common.h"
29 #include "hw/vfio/vfio.h"
30 #include "exec/address-spaces.h"
31 #include "exec/memory.h"
32 #include "exec/ram_addr.h"
33 #include "hw/hw.h"
34 #include "qemu/error-report.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/range.h"
37 #include "sysemu/kvm.h"
38 #include "sysemu/reset.h"
39 #include "sysemu/runstate.h"
40 #include "trace.h"
41 #include "qapi/error.h"
42 #include "migration/migration.h"
43 #include "migration/misc.h"
44 #include "migration/blocker.h"
45 #include "sysemu/tpm.h"
46 
47 VFIOGroupList vfio_group_list =
48     QLIST_HEAD_INITIALIZER(vfio_group_list);
49 static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
50     QLIST_HEAD_INITIALIZER(vfio_address_spaces);
51 
52 #ifdef CONFIG_KVM
53 /*
54  * We have a single VFIO pseudo device per KVM VM.  Once created it lives
55  * for the life of the VM.  Closing the file descriptor only drops our
56  * reference to it and the device's reference to kvm.  Therefore once
57  * initialized, this file descriptor is only released on QEMU exit and
58  * we'll re-use it should another vfio device be attached before then.
59  */
60 static int vfio_kvm_device_fd = -1;
61 #endif
62 
63 /*
64  * Common VFIO interrupt disable
65  */
66 void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
67 {
68     struct vfio_irq_set irq_set = {
69         .argsz = sizeof(irq_set),
70         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
71         .index = index,
72         .start = 0,
73         .count = 0,
74     };
75 
76     ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
77 }
78 
79 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
80 {
81     struct vfio_irq_set irq_set = {
82         .argsz = sizeof(irq_set),
83         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
84         .index = index,
85         .start = 0,
86         .count = 1,
87     };
88 
89     ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
90 }
91 
92 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
93 {
94     struct vfio_irq_set irq_set = {
95         .argsz = sizeof(irq_set),
96         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
97         .index = index,
98         .start = 0,
99         .count = 1,
100     };
101 
102     ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
103 }
104 
105 static inline const char *action_to_str(int action)
106 {
107     switch (action) {
108     case VFIO_IRQ_SET_ACTION_MASK:
109         return "MASK";
110     case VFIO_IRQ_SET_ACTION_UNMASK:
111         return "UNMASK";
112     case VFIO_IRQ_SET_ACTION_TRIGGER:
113         return "TRIGGER";
114     default:
115         return "UNKNOWN ACTION";
116     }
117 }
118 
119 static const char *index_to_str(VFIODevice *vbasedev, int index)
120 {
121     if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
122         return NULL;
123     }
124 
125     switch (index) {
126     case VFIO_PCI_INTX_IRQ_INDEX:
127         return "INTX";
128     case VFIO_PCI_MSI_IRQ_INDEX:
129         return "MSI";
130     case VFIO_PCI_MSIX_IRQ_INDEX:
131         return "MSIX";
132     case VFIO_PCI_ERR_IRQ_INDEX:
133         return "ERR";
134     case VFIO_PCI_REQ_IRQ_INDEX:
135         return "REQ";
136     default:
137         return NULL;
138     }
139 }
140 
141 static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
142 {
143     switch (container->iommu_type) {
144     case VFIO_TYPE1v2_IOMMU:
145     case VFIO_TYPE1_IOMMU:
146         /*
147          * We support coordinated discarding of RAM via the RamDiscardManager.
148          */
149         return ram_block_uncoordinated_discard_disable(state);
150     default:
151         /*
152          * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
153          * RamDiscardManager; however, it is completely untested.
154          *
155          * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
156          * the exact opposite of the dynamic mapping/pinning that
157          * RamDiscardManager requires, so we would have to special-case
158          * sections with a RamDiscardManager.
159          */
160         return ram_block_discard_disable(state);
161     }
162 }
163 
164 int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
165                            int action, int fd, Error **errp)
166 {
167     struct vfio_irq_set *irq_set;
168     int argsz, ret = 0;
169     const char *name;
170     int32_t *pfd;
171 
172     argsz = sizeof(*irq_set) + sizeof(*pfd);
173 
174     irq_set = g_malloc0(argsz);
175     irq_set->argsz = argsz;
176     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
177     irq_set->index = index;
178     irq_set->start = subindex;
179     irq_set->count = 1;
180     pfd = (int32_t *)&irq_set->data;
181     *pfd = fd;
182 
183     if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
184         ret = -errno;
185     }
186     g_free(irq_set);
187 
188     if (!ret) {
189         return 0;
190     }
191 
192     error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
193 
194     name = index_to_str(vbasedev, index);
195     if (name) {
196         error_prepend(errp, "%s-%d: ", name, subindex);
197     } else {
198         error_prepend(errp, "index %d-%d: ", index, subindex);
199     }
200     error_prepend(errp,
201                   "Failed to %s %s eventfd signaling for interrupt ",
202                   fd < 0 ? "tear down" : "set up", action_to_str(action));
203     return ret;
204 }
205 
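/*
 * A minimal usage sketch (the helper name is hypothetical and the function is
 * marked unused because it is only illustrative): wiring an already-created
 * eventfd to a PCI device's INTx pin through vfio_set_irq_signaling().
 * Passing fd = -1 tears the signaling down again.
 */
static G_GNUC_UNUSED int vfio_example_enable_intx(VFIODevice *vbasedev,
                                                  int eventfd, Error **errp)
{
    return vfio_set_irq_signaling(vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
                                  VFIO_IRQ_SET_ACTION_TRIGGER, eventfd, errp);
}
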
206 /*
207  * IO Port/MMIO - beware of endianness: VFIO region data is always little endian
208  */
209 void vfio_region_write(void *opaque, hwaddr addr,
210                        uint64_t data, unsigned size)
211 {
212     VFIORegion *region = opaque;
213     VFIODevice *vbasedev = region->vbasedev;
214     union {
215         uint8_t byte;
216         uint16_t word;
217         uint32_t dword;
218         uint64_t qword;
219     } buf;
220 
221     switch (size) {
222     case 1:
223         buf.byte = data;
224         break;
225     case 2:
226         buf.word = cpu_to_le16(data);
227         break;
228     case 4:
229         buf.dword = cpu_to_le32(data);
230         break;
231     case 8:
232         buf.qword = cpu_to_le64(data);
233         break;
234     default:
235         hw_error("vfio: unsupported write size, %u bytes", size);
236         break;
237     }
238 
239     if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
240         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
241                      ",%d) failed: %m",
242                      __func__, vbasedev->name, region->nr,
243                      addr, data, size);
244     }
245 
246     trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
247 
248     /*
249      * A read or write to a BAR always signals an INTx EOI.  This will
250      * do nothing if not pending (including not in INTx mode).  We assume
251      * that a BAR access is in response to an interrupt and that BAR
252      * accesses will service the interrupt.  Unfortunately, we don't know
253      * which access will service the interrupt, so we're potentially
254      * getting quite a few host interrupts per guest interrupt.
255      */
256     vbasedev->ops->vfio_eoi(vbasedev);
257 }
258 
259 uint64_t vfio_region_read(void *opaque,
260                           hwaddr addr, unsigned size)
261 {
262     VFIORegion *region = opaque;
263     VFIODevice *vbasedev = region->vbasedev;
264     union {
265         uint8_t byte;
266         uint16_t word;
267         uint32_t dword;
268         uint64_t qword;
269     } buf;
270     uint64_t data = 0;
271 
272     if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
273         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
274                      __func__, vbasedev->name, region->nr,
275                      addr, size);
276         return (uint64_t)-1;
277     }
278     switch (size) {
279     case 1:
280         data = buf.byte;
281         break;
282     case 2:
283         data = le16_to_cpu(buf.word);
284         break;
285     case 4:
286         data = le32_to_cpu(buf.dword);
287         break;
288     case 8:
289         data = le64_to_cpu(buf.qword);
290         break;
291     default:
292         hw_error("vfio: unsupported read size, %u bytes", size);
293         break;
294     }
295 
296     trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
297 
298     /* Same as write above */
299     vbasedev->ops->vfio_eoi(vbasedev);
300 
301     return data;
302 }
303 
304 const MemoryRegionOps vfio_region_ops = {
305     .read = vfio_region_read,
306     .write = vfio_region_write,
307     .endianness = DEVICE_LITTLE_ENDIAN,
308     .valid = {
309         .min_access_size = 1,
310         .max_access_size = 8,
311     },
312     .impl = {
313         .min_access_size = 1,
314         .max_access_size = 8,
315     },
316 };
317 
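/*
 * Illustrative sketch (hypothetical helper, marked unused): reading a 32-bit
 * register through the accessors above.  vfio_region_read() converts from the
 * region's little-endian layout to host byte order, so the caller sees the
 * same value on little- and big-endian hosts.
 */
static G_GNUC_UNUSED uint32_t vfio_example_read_reg32(VFIORegion *region,
                                                      hwaddr offset)
{
    return (uint32_t)vfio_region_read(region, offset, 4);
}
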
318 /*
319  * Device state interfaces
320  */
321 
322 bool vfio_mig_active(void)
323 {
324     VFIOGroup *group;
325     VFIODevice *vbasedev;
326 
327     if (QLIST_EMPTY(&vfio_group_list)) {
328         return false;
329     }
330 
331     QLIST_FOREACH(group, &vfio_group_list, next) {
332         QLIST_FOREACH(vbasedev, &group->device_list, next) {
333             if (vbasedev->migration_blocker) {
334                 return false;
335             }
336         }
337     }
338     return true;
339 }
340 
341 static Error *multiple_devices_migration_blocker;
342 
343 static unsigned int vfio_migratable_device_num(void)
344 {
345     VFIOGroup *group;
346     VFIODevice *vbasedev;
347     unsigned int device_num = 0;
348 
349     QLIST_FOREACH(group, &vfio_group_list, next) {
350         QLIST_FOREACH(vbasedev, &group->device_list, next) {
351             if (vbasedev->migration) {
352                 device_num++;
353             }
354         }
355     }
356 
357     return device_num;
358 }
359 
360 int vfio_block_multiple_devices_migration(Error **errp)
361 {
362     int ret;
363 
364     if (multiple_devices_migration_blocker ||
365         vfio_migratable_device_num() <= 1) {
366         return 0;
367     }
368 
369     error_setg(&multiple_devices_migration_blocker,
370                "Migration is currently not supported with multiple "
371                "VFIO devices");
372     ret = migrate_add_blocker(multiple_devices_migration_blocker, errp);
373     if (ret < 0) {
374         error_free(multiple_devices_migration_blocker);
375         multiple_devices_migration_blocker = NULL;
376     }
377 
378     return ret;
379 }
380 
381 void vfio_unblock_multiple_devices_migration(void)
382 {
383     if (!multiple_devices_migration_blocker ||
384         vfio_migratable_device_num() > 1) {
385         return;
386     }
387 
388     migrate_del_blocker(multiple_devices_migration_blocker);
389     error_free(multiple_devices_migration_blocker);
390     multiple_devices_migration_blocker = NULL;
391 }
392 
393 static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
394 {
395     VFIOGroup *group;
396     VFIODevice *vbasedev;
397     MigrationState *ms = migrate_get_current();
398 
399     if (!migration_is_setup_or_active(ms->state)) {
400         return false;
401     }
402 
403     QLIST_FOREACH(group, &container->group_list, container_next) {
404         QLIST_FOREACH(vbasedev, &group->device_list, next) {
405             VFIOMigration *migration = vbasedev->migration;
406 
407             if (!migration) {
408                 return false;
409             }
410 
411             if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
412                 migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
413                 return false;
414             }
415         }
416     }
417     return true;
418 }
419 
420 /*
421  * Check if all VFIO devices are running and migration is active, which is
422  * essentially equivalent to the migration being in pre-copy phase.
423  */
424 static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
425 {
426     VFIOGroup *group;
427     VFIODevice *vbasedev;
428 
429     if (!migration_is_active(migrate_get_current())) {
430         return false;
431     }
432 
433     QLIST_FOREACH(group, &container->group_list, container_next) {
434         QLIST_FOREACH(vbasedev, &group->device_list, next) {
435             VFIOMigration *migration = vbasedev->migration;
436 
437             if (!migration) {
438                 return false;
439             }
440 
441             if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
442                 continue;
443             } else {
444                 return false;
445             }
446         }
447     }
448     return true;
449 }
450 
451 static int vfio_dma_unmap_bitmap(VFIOContainer *container,
452                                  hwaddr iova, ram_addr_t size,
453                                  IOMMUTLBEntry *iotlb)
454 {
455     struct vfio_iommu_type1_dma_unmap *unmap;
456     struct vfio_bitmap *bitmap;
457     uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
458     int ret;
459 
460     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
461 
462     unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
463     unmap->iova = iova;
464     unmap->size = size;
465     unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
466     bitmap = (struct vfio_bitmap *)&unmap->data;
467 
468     /*
469      * cpu_physical_memory_set_dirty_lebitmap() expects the bitmap to track
470      * dirty pages in units of qemu_real_host_page_size(), so report
471      * bitmap_pgsize as qemu_real_host_page_size().
472      */
473 
474     bitmap->pgsize = qemu_real_host_page_size();
475     bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
476                    BITS_PER_BYTE;
477 
478     if (bitmap->size > container->max_dirty_bitmap_size) {
479         error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
480                      (uint64_t)bitmap->size);
481         ret = -E2BIG;
482         goto unmap_exit;
483     }
484 
485     bitmap->data = g_try_malloc0(bitmap->size);
486     if (!bitmap->data) {
487         ret = -ENOMEM;
488         goto unmap_exit;
489     }
490 
491     ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
492     if (!ret) {
493         cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
494                 iotlb->translated_addr, pages);
495     } else {
496         error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP: %m");
497     }
498 
499     g_free(bitmap->data);
500 unmap_exit:
501     g_free(unmap);
502     return ret;
503 }
504 
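/*
 * Worked sketch of the bitmap sizing above (hypothetical helper, marked
 * unused): one bit per qemu_real_host_page_size() page, with the byte count
 * rounded up to a multiple of sizeof(__u64).  For a 1 GiB range on a
 * 4 KiB-page host that is 262144 pages and a 32768-byte bitmap.
 */
static G_GNUC_UNUSED uint64_t vfio_example_dirty_bitmap_bytes(uint64_t size)
{
    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();

    return ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
}
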
505 /*
506  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
507  */
508 static int vfio_dma_unmap(VFIOContainer *container,
509                           hwaddr iova, ram_addr_t size,
510                           IOMMUTLBEntry *iotlb)
511 {
512     struct vfio_iommu_type1_dma_unmap unmap = {
513         .argsz = sizeof(unmap),
514         .flags = 0,
515         .iova = iova,
516         .size = size,
517     };
518 
519     if (iotlb && container->dirty_pages_supported &&
520         vfio_devices_all_running_and_mig_active(container)) {
521         return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
522     }
523 
524     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
525         /*
526          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
527          * v4.15) where an overflow in its wrap-around check prevents us from
528          * unmapping the last page of the address space.  Test for the error
529          * condition and re-try the unmap excluding the last page.  The
530          * expectation is that we've never mapped the last page anyway and this
531          * unmap request comes via vIOMMU support which also makes it unlikely
532          * that this page is used.  This bug was introduced well after type1 v2
533          * support was introduced, so we shouldn't need to test for v1.  A fix
534          * is queued for kernel v5.0 so this workaround can be removed once
535          * affected kernels are sufficiently deprecated.
536          */
537         if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
538             container->iommu_type == VFIO_TYPE1v2_IOMMU) {
539             trace_vfio_dma_unmap_overflow_workaround();
540             unmap.size -= 1ULL << ctz64(container->pgsizes);
541             continue;
542         }
543         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
544         return -errno;
545     }
546 
547     if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
548         cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
549                                             tcg_enabled() ? DIRTY_CLIENTS_ALL :
550                                             DIRTY_CLIENTS_NOCODE);
551     }
552 
553     return 0;
554 }
555 
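/*
 * Sketch of the wrap-around condition tested by the workaround above
 * (hypothetical helper, marked unused): an unmap that ends exactly at the top
 * of the 64-bit IOVA space, e.g. iova = 0xfffffffffffff000 with size = 0x1000,
 * makes iova + size wrap to 0 and trips the affected kernels' range check.
 */
static G_GNUC_UNUSED bool vfio_example_unmap_wraps(uint64_t iova, uint64_t size)
{
    return size && (iova + size) == 0;
}
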
556 static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
557                         ram_addr_t size, void *vaddr, bool readonly)
558 {
559     struct vfio_iommu_type1_dma_map map = {
560         .argsz = sizeof(map),
561         .flags = VFIO_DMA_MAP_FLAG_READ,
562         .vaddr = (__u64)(uintptr_t)vaddr,
563         .iova = iova,
564         .size = size,
565     };
566 
567     if (!readonly) {
568         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
569     }
570 
571     /*
572      * Try the mapping; if it fails with EBUSY, unmap the region and try
573      * again.  This shouldn't be necessary, but we sometimes see it in
574      * the VGA ROM space.
575      */
576     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
577         (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
578          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
579         return 0;
580     }
581 
582     error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
583     return -errno;
584 }
585 
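/*
 * A minimal usage sketch (hypothetical caller, marked unused): mapping a
 * page-aligned host buffer at a fixed IOVA for device DMA.  The listener
 * callbacks below derive iova, size and vaddr from MemoryRegionSections
 * instead of taking them directly like this.
 */
static G_GNUC_UNUSED int vfio_example_map_buffer(VFIOContainer *container,
                                                 hwaddr iova, void *buf,
                                                 ram_addr_t len, bool readonly)
{
    return vfio_dma_map(container, iova, len, buf, readonly);
}
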
586 static void vfio_host_win_add(VFIOContainer *container,
587                               hwaddr min_iova, hwaddr max_iova,
588                               uint64_t iova_pgsizes)
589 {
590     VFIOHostDMAWindow *hostwin;
591 
592     QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
593         if (ranges_overlap(hostwin->min_iova,
594                            hostwin->max_iova - hostwin->min_iova + 1,
595                            min_iova,
596                            max_iova - min_iova + 1)) {
597             hw_error("%s: Overlapping DMA windows are not supported", __func__);
598         }
599     }
600 
601     hostwin = g_malloc0(sizeof(*hostwin));
602 
603     hostwin->min_iova = min_iova;
604     hostwin->max_iova = max_iova;
605     hostwin->iova_pgsizes = iova_pgsizes;
606     QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
607 }
608 
609 static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
610                              hwaddr max_iova)
611 {
612     VFIOHostDMAWindow *hostwin;
613 
614     QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
615         if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
616             QLIST_REMOVE(hostwin, hostwin_next);
617             g_free(hostwin);
618             return 0;
619         }
620     }
621 
622     return -1;
623 }
624 
625 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
626 {
627     return (!memory_region_is_ram(section->mr) &&
628             !memory_region_is_iommu(section->mr)) ||
629            memory_region_is_protected(section->mr) ||
630            /*
631             * Sizing an enabled 64-bit BAR can cause spurious mappings to
632             * addresses in the upper part of the 64-bit address space.  These
633             * are never accessed by the CPU and beyond the address width of
634             * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
635             */
636            section->offset_within_address_space & (1ULL << 63);
637 }
638 
639 /* Called with rcu_read_lock held.  */
640 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
641                                ram_addr_t *ram_addr, bool *read_only)
642 {
643     bool ret, mr_has_discard_manager;
644 
645     ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
646                                &mr_has_discard_manager);
647     if (ret && mr_has_discard_manager) {
648         /*
649          * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
650          * pages will remain pinned inside vfio until unmapped, resulting in a
651          * higher memory consumption than expected. If memory would get
652          * populated again later, there would be an inconsistency between pages
653          * pinned by vfio and pages seen by QEMU. This is the case until
654          * unmapped from the IOMMU (e.g., during device reset).
655          *
656          * With malicious guests, we really only care about pinning more memory
657          * than expected. RLIMIT_MEMLOCK set for the user/process can never be
658          * exceeded and can be used to mitigate this problem.
659          */
660         warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
661                          " RAM (e.g., virtio-mem) works; however, malicious"
662                          " guests can trigger pinning of more memory than"
663                          " intended via an IOMMU. It's possible to mitigate this"
664                          " by setting/adjusting RLIMIT_MEMLOCK.");
665     }
666     return ret;
667 }
668 
669 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
670 {
671     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
672     VFIOContainer *container = giommu->container;
673     hwaddr iova = iotlb->iova + giommu->iommu_offset;
674     void *vaddr;
675     int ret;
676 
677     trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
678                                 iova, iova + iotlb->addr_mask);
679 
680     if (iotlb->target_as != &address_space_memory) {
681         error_report("Wrong target AS \"%s\", only system memory is allowed",
682                      iotlb->target_as->name ? iotlb->target_as->name : "none");
683         return;
684     }
685 
686     rcu_read_lock();
687 
688     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
689         bool read_only;
690 
691         if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
692             goto out;
693         }
694         /*
695          * vaddr is only valid until rcu_read_unlock(). But after
696          * vfio_dma_map has set up the mapping the pages will be
697          * pinned by the kernel. This makes sure that the RAM backend
698          * of vaddr will always be there, even if the memory object is
699          * destroyed and its backing memory munmap-ed.
700          */
701         ret = vfio_dma_map(container, iova,
702                            iotlb->addr_mask + 1, vaddr,
703                            read_only);
704         if (ret) {
705             error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
706                          "0x%"HWADDR_PRIx", %p) = %d (%m)",
707                          container, iova,
708                          iotlb->addr_mask + 1, vaddr, ret);
709         }
710     } else {
711         ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
712         if (ret) {
713             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
714                          "0x%"HWADDR_PRIx") = %d (%m)",
715                          container, iova,
716                          iotlb->addr_mask + 1, ret);
717         }
718     }
719 out:
720     rcu_read_unlock();
721 }
722 
723 static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
724                                             MemoryRegionSection *section)
725 {
726     VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
727                                                 listener);
728     const hwaddr size = int128_get64(section->size);
729     const hwaddr iova = section->offset_within_address_space;
730     int ret;
731 
732     /* Unmap with a single call. */
733     ret = vfio_dma_unmap(vrdl->container, iova, size, NULL);
734     if (ret) {
735         error_report("%s: vfio_dma_unmap() failed: %s", __func__,
736                      strerror(-ret));
737     }
738 }
739 
740 static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
741                                             MemoryRegionSection *section)
742 {
743     VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
744                                                 listener);
745     const hwaddr end = section->offset_within_region +
746                        int128_get64(section->size);
747     hwaddr start, next, iova;
748     void *vaddr;
749     int ret;
750 
751     /*
752      * Map in (aligned within memory region) minimum granularity, so we can
753      * unmap in minimum granularity later.
754      */
755     for (start = section->offset_within_region; start < end; start = next) {
756         next = ROUND_UP(start + 1, vrdl->granularity);
757         next = MIN(next, end);
758 
759         iova = start - section->offset_within_region +
760                section->offset_within_address_space;
761         vaddr = memory_region_get_ram_ptr(section->mr) + start;
762 
763         ret = vfio_dma_map(vrdl->container, iova, next - start,
764                            vaddr, section->readonly);
765         if (ret) {
766             /* Rollback */
767             vfio_ram_discard_notify_discard(rdl, section);
768             return ret;
769         }
770     }
771     return 0;
772 }
773 
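/*
 * Sketch of the chunking arithmetic above (hypothetical helper, marked
 * unused): populate maps in granularity-sized pieces so that a later discard
 * of any single granule can be unmapped exactly.  E.g. with a 2 MiB
 * granularity, a 10 MiB populated range becomes five 2 MiB DMA mappings
 * rather than one 10 MiB mapping.
 */
static G_GNUC_UNUSED uint64_t vfio_example_num_granules(hwaddr start,
                                                        hwaddr end,
                                                        uint64_t granularity)
{
    /* Number of granularity-sized mappings the populate loop would create. */
    return (ROUND_UP(end, granularity) -
            QEMU_ALIGN_DOWN(start, granularity)) / granularity;
}
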
774 static void vfio_register_ram_discard_listener(VFIOContainer *container,
775                                                MemoryRegionSection *section)
776 {
777     RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
778     VFIORamDiscardListener *vrdl;
779 
780     /* Ignore some corner cases not relevant in practice. */
781     g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
782     g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
783                              TARGET_PAGE_SIZE));
784     g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
785 
786     vrdl = g_new0(VFIORamDiscardListener, 1);
787     vrdl->container = container;
788     vrdl->mr = section->mr;
789     vrdl->offset_within_address_space = section->offset_within_address_space;
790     vrdl->size = int128_get64(section->size);
791     vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
792                                                                 section->mr);
793 
794     g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
795     g_assert(container->pgsizes &&
796              vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
797 
798     ram_discard_listener_init(&vrdl->listener,
799                               vfio_ram_discard_notify_populate,
800                               vfio_ram_discard_notify_discard, true);
801     ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
802     QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
803 
804     /*
805      * Sanity-check if we have a theoretically problematic setup where we could
806      * exceed the maximum number of possible DMA mappings over time. We assume
807      * that each mapped section in the same address space as a RamDiscardManager
808      * section consumes exactly one DMA mapping, with the exception of
809      * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
810      * in the same address space as RamDiscardManager sections.
811      *
812      * We assume that each section in the address space consumes one memslot.
813      * We take the number of KVM memory slots as a best guess for the maximum
814      * number of sections in the address space we could have over time,
815      * also consuming DMA mappings.
816      */
817     if (container->dma_max_mappings) {
818         unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
819 
820 #ifdef CONFIG_KVM
821         if (kvm_enabled()) {
822             max_memslots = kvm_get_max_memslots();
823         }
824 #endif
825 
826         QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
827             hwaddr start, end;
828 
829             start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
830                                     vrdl->granularity);
831             end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
832                            vrdl->granularity);
833             vrdl_mappings += (end - start) / vrdl->granularity;
834             vrdl_count++;
835         }
836 
837         if (vrdl_mappings + max_memslots - vrdl_count >
838             container->dma_max_mappings) {
839             warn_report("%s: possibly running out of DMA mappings. E.g., try"
840                         " increasing the 'block-size' of virtio-mem devices."
841                         " Maximum possible DMA mappings: %d, Maximum possible"
842                         " memslots: %d", __func__, container->dma_max_mappings,
843                         max_memslots);
844         }
845     }
846 }
847 
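/*
 * Worked example for the sanity check above (numbers are illustrative only):
 * a single 8 GiB virtio-mem region with a 2 MiB block size can consume up to
 * 8 GiB / 2 MiB = 4096 DMA mappings by itself.  With max_memslots = 512 and
 * vrdl_count = 1, the container must allow at least 4096 + 512 - 1 = 4607
 * mappings, otherwise the warning above is printed.
 */
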
848 static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
849                                                  MemoryRegionSection *section)
850 {
851     RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
852     VFIORamDiscardListener *vrdl = NULL;
853 
854     QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
855         if (vrdl->mr == section->mr &&
856             vrdl->offset_within_address_space ==
857             section->offset_within_address_space) {
858             break;
859         }
860     }
861 
862     if (!vrdl) {
863         hw_error("vfio: Trying to unregister missing RAM discard listener");
864     }
865 
866     ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
867     QLIST_REMOVE(vrdl, next);
868     g_free(vrdl);
869 }
870 
871 static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
872 {
873     MemoryRegion *mr = section->mr;
874 
875     if (!TPM_IS_CRB(mr->owner)) {
876         return false;
877     }
878 
879     /* This is a known safe misaligned region; just trace it for debug purposes. */
880     trace_vfio_known_safe_misalignment(memory_region_name(mr),
881                                        section->offset_within_address_space,
882                                        section->offset_within_region,
883                                        qemu_real_host_page_size());
884     return true;
885 }
886 
887 static void vfio_listener_region_add(MemoryListener *listener,
888                                      MemoryRegionSection *section)
889 {
890     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
891     hwaddr iova, end;
892     Int128 llend, llsize;
893     void *vaddr;
894     int ret;
895     VFIOHostDMAWindow *hostwin;
896     bool hostwin_found;
897     Error *err = NULL;
898 
899     if (vfio_listener_skipped_section(section)) {
900         trace_vfio_listener_region_add_skip(
901                 section->offset_within_address_space,
902                 section->offset_within_address_space +
903                 int128_get64(int128_sub(section->size, int128_one())));
904         return;
905     }
906 
907     if (unlikely((section->offset_within_address_space &
908                   ~qemu_real_host_page_mask()) !=
909                  (section->offset_within_region & ~qemu_real_host_page_mask()))) {
910         if (!vfio_known_safe_misalignment(section)) {
911             error_report("%s received unaligned region %s iova=0x%"PRIx64
912                          " offset_within_region=0x%"PRIx64
913                          " qemu_real_host_page_size=0x%"PRIxPTR,
914                          __func__, memory_region_name(section->mr),
915                          section->offset_within_address_space,
916                          section->offset_within_region,
917                          qemu_real_host_page_size());
918         }
919         return;
920     }
921 
922     iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
923     llend = int128_make64(section->offset_within_address_space);
924     llend = int128_add(llend, section->size);
925     llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
926 
927     if (int128_ge(int128_make64(iova), llend)) {
928         if (memory_region_is_ram_device(section->mr)) {
929             trace_vfio_listener_region_add_no_dma_map(
930                 memory_region_name(section->mr),
931                 section->offset_within_address_space,
932                 int128_getlo(section->size),
933                 qemu_real_host_page_size());
934         }
935         return;
936     }
937     end = int128_get64(int128_sub(llend, int128_one()));
938 
939     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
940         hwaddr pgsize = 0;
941 
942         /* For now intersections are not allowed; we may relax this later */
943         QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
944             if (ranges_overlap(hostwin->min_iova,
945                                hostwin->max_iova - hostwin->min_iova + 1,
946                                section->offset_within_address_space,
947                                int128_get64(section->size))) {
948                 error_setg(&err,
949                     "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
950                     "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
951                     section->offset_within_address_space,
952                     section->offset_within_address_space +
953                         int128_get64(section->size) - 1,
954                     hostwin->min_iova, hostwin->max_iova);
955                 goto fail;
956             }
957         }
958 
959         ret = vfio_spapr_create_window(container, section, &pgsize);
960         if (ret) {
961             error_setg_errno(&err, -ret, "Failed to create SPAPR window");
962             goto fail;
963         }
964 
965         vfio_host_win_add(container, section->offset_within_address_space,
966                           section->offset_within_address_space +
967                           int128_get64(section->size) - 1, pgsize);
968 #ifdef CONFIG_KVM
969         if (kvm_enabled()) {
970             VFIOGroup *group;
971             IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
972             struct kvm_vfio_spapr_tce param;
973             struct kvm_device_attr attr = {
974                 .group = KVM_DEV_VFIO_GROUP,
975                 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
976                 .addr = (uint64_t)(unsigned long)&param,
977             };
978 
979             if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
980                                               &param.tablefd)) {
981                 QLIST_FOREACH(group, &container->group_list, container_next) {
982                     param.groupfd = group->fd;
983                     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
984                         error_report("vfio: failed to setup fd %d "
985                                      "for a group with fd %d: %s",
986                                      param.tablefd, param.groupfd,
987                                      strerror(errno));
988                         return;
989                     }
990                     trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
991                 }
992             }
993         }
994 #endif
995     }
996 
997     hostwin_found = false;
998     QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
999         if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
1000             hostwin_found = true;
1001             break;
1002         }
1003     }
1004 
1005     if (!hostwin_found) {
1006         error_setg(&err, "Container %p can't map guest IOVA region"
1007                    " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
1008         goto fail;
1009     }
1010 
1011     memory_region_ref(section->mr);
1012 
1013     if (memory_region_is_iommu(section->mr)) {
1014         VFIOGuestIOMMU *giommu;
1015         IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
1016         int iommu_idx;
1017 
1018         trace_vfio_listener_region_add_iommu(iova, end);
1019         /*
1020          * FIXME: For VFIO iommu types which have KVM acceleration to
1021          * avoid bouncing all map/unmaps through qemu this way, this
1022          * would be the right place to wire that up (tell the KVM
1023          * device emulation the VFIO iommu handles to use).
1024          */
1025         giommu = g_malloc0(sizeof(*giommu));
1026         giommu->iommu_mr = iommu_mr;
1027         giommu->iommu_offset = section->offset_within_address_space -
1028                                section->offset_within_region;
1029         giommu->container = container;
1030         llend = int128_add(int128_make64(section->offset_within_region),
1031                            section->size);
1032         llend = int128_sub(llend, int128_one());
1033         iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
1034                                                        MEMTXATTRS_UNSPECIFIED);
1035         iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
1036                             IOMMU_NOTIFIER_IOTLB_EVENTS,
1037                             section->offset_within_region,
1038                             int128_get64(llend),
1039                             iommu_idx);
1040 
1041         ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
1042                                                      container->pgsizes,
1043                                                      &err);
1044         if (ret) {
1045             g_free(giommu);
1046             goto fail;
1047         }
1048 
1049         ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
1050                                                     &err);
1051         if (ret) {
1052             g_free(giommu);
1053             goto fail;
1054         }
1055         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
1056         memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
1057 
1058         return;
1059     }
1060 
1061     /* Here we assume that memory_region_is_ram(section->mr)==true */
1062 
1063     /*
1064      * For RAM memory regions with a RamDiscardManager, we only want to map the
1065      * actually populated parts - and update the mapping whenever we're notified
1066      * about changes.
1067      */
1068     if (memory_region_has_ram_discard_manager(section->mr)) {
1069         vfio_register_ram_discard_listener(container, section);
1070         return;
1071     }
1072 
1073     vaddr = memory_region_get_ram_ptr(section->mr) +
1074             section->offset_within_region +
1075             (iova - section->offset_within_address_space);
1076 
1077     trace_vfio_listener_region_add_ram(iova, end, vaddr);
1078 
1079     llsize = int128_sub(llend, int128_make64(iova));
1080 
1081     if (memory_region_is_ram_device(section->mr)) {
1082         hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1083 
1084         if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
1085             trace_vfio_listener_region_add_no_dma_map(
1086                 memory_region_name(section->mr),
1087                 section->offset_within_address_space,
1088                 int128_getlo(section->size),
1089                 pgmask + 1);
1090             return;
1091         }
1092     }
1093 
1094     ret = vfio_dma_map(container, iova, int128_get64(llsize),
1095                        vaddr, section->readonly);
1096     if (ret) {
1097         error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
1098                    "0x%"HWADDR_PRIx", %p) = %d (%m)",
1099                    container, iova, int128_get64(llsize), vaddr, ret);
1100         if (memory_region_is_ram_device(section->mr)) {
1101             /* Allow unexpected mappings not to be fatal for RAM devices */
1102             error_report_err(err);
1103             return;
1104         }
1105         goto fail;
1106     }
1107 
1108     return;
1109 
1110 fail:
1111     if (memory_region_is_ram_device(section->mr)) {
1112         error_report("vfio_dma_map() failed; PCI peer-to-peer may not work");
1113         return;
1114     }
1115     /*
1116      * On the initfn path, store the first error in the container so we
1117      * can gracefully fail.  At runtime, there's not much we can do other
1118      * than throw a hardware error.
1119      */
1120     if (!container->initialized) {
1121         if (!container->error) {
1122             error_propagate_prepend(&container->error, err,
1123                                     "Region %s: ",
1124                                     memory_region_name(section->mr));
1125         } else {
1126             error_free(err);
1127         }
1128     } else {
1129         error_report_err(err);
1130         hw_error("vfio: DMA mapping failed, unable to continue");
1131     }
1132 }
1133 
1134 static void vfio_listener_region_del(MemoryListener *listener,
1135                                      MemoryRegionSection *section)
1136 {
1137     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1138     hwaddr iova, end;
1139     Int128 llend, llsize;
1140     int ret;
1141     bool try_unmap = true;
1142 
1143     if (vfio_listener_skipped_section(section)) {
1144         trace_vfio_listener_region_del_skip(
1145                 section->offset_within_address_space,
1146                 section->offset_within_address_space +
1147                 int128_get64(int128_sub(section->size, int128_one())));
1148         return;
1149     }
1150 
1151     if (unlikely((section->offset_within_address_space &
1152                   ~qemu_real_host_page_mask()) !=
1153                  (section->offset_within_region & ~qemu_real_host_page_mask()))) {
1154         if (!vfio_known_safe_misalignment(section)) {
1155             error_report("%s received unaligned region %s iova=0x%"PRIx64
1156                          " offset_within_region=0x%"PRIx64
1157                          " qemu_real_host_page_size=0x%"PRIxPTR,
1158                          __func__, memory_region_name(section->mr),
1159                          section->offset_within_address_space,
1160                          section->offset_within_region,
1161                          qemu_real_host_page_size());
1162         }
1163         return;
1164     }
1165 
1166     if (memory_region_is_iommu(section->mr)) {
1167         VFIOGuestIOMMU *giommu;
1168 
1169         QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1170             if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1171                 giommu->n.start == section->offset_within_region) {
1172                 memory_region_unregister_iommu_notifier(section->mr,
1173                                                         &giommu->n);
1174                 QLIST_REMOVE(giommu, giommu_next);
1175                 g_free(giommu);
1176                 break;
1177             }
1178         }
1179 
1180         /*
1181          * FIXME: We assume the one big unmap below is adequate to
1182          * remove any individual page mappings in the IOMMU which
1183          * might have been copied into VFIO. This works for a page table
1184          * based IOMMU where a big unmap flattens a large range of IO-PTEs.
1185          * That may not be true for all IOMMU types.
1186          */
1187     }
1188 
1189     iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
1190     llend = int128_make64(section->offset_within_address_space);
1191     llend = int128_add(llend, section->size);
1192     llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
1193 
1194     if (int128_ge(int128_make64(iova), llend)) {
1195         return;
1196     }
1197     end = int128_get64(int128_sub(llend, int128_one()));
1198 
1199     llsize = int128_sub(llend, int128_make64(iova));
1200 
1201     trace_vfio_listener_region_del(iova, end);
1202 
1203     if (memory_region_is_ram_device(section->mr)) {
1204         hwaddr pgmask;
1205         VFIOHostDMAWindow *hostwin;
1206         bool hostwin_found = false;
1207 
1208         QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
1209             if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
1210                 hostwin_found = true;
1211                 break;
1212             }
1213         }
1214         assert(hostwin_found); /* or region_add() would have failed */
1215 
1216         pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1217         try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
1218     } else if (memory_region_has_ram_discard_manager(section->mr)) {
1219         vfio_unregister_ram_discard_listener(container, section);
1220         /* Unregistering will trigger an unmap. */
1221         try_unmap = false;
1222     }
1223 
1224     if (try_unmap) {
1225         if (int128_eq(llsize, int128_2_64())) {
1226             /* The unmap ioctl doesn't accept a full 64-bit span. */
1227             llsize = int128_rshift(llsize, 1);
1228             ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1229             if (ret) {
1230                 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1231                              "0x%"HWADDR_PRIx") = %d (%m)",
1232                              container, iova, int128_get64(llsize), ret);
1233             }
1234             iova += int128_get64(llsize);
1235         }
1236         ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1237         if (ret) {
1238             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1239                          "0x%"HWADDR_PRIx") = %d (%m)",
1240                          container, iova, int128_get64(llsize), ret);
1241         }
1242     }
1243 
1244     memory_region_unref(section->mr);
1245 
1246     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1247         vfio_spapr_remove_window(container,
1248                                  section->offset_within_address_space);
1249         if (vfio_host_win_del(container,
1250                               section->offset_within_address_space,
1251                               section->offset_within_address_space +
1252                               int128_get64(section->size) - 1) < 0) {
1253             hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
1254                      __func__, section->offset_within_address_space);
1255         }
1256     }
1257 }
1258 
1259 static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
1260 {
1261     int ret;
1262     struct vfio_iommu_type1_dirty_bitmap dirty = {
1263         .argsz = sizeof(dirty),
1264     };
1265 
1266     if (!container->dirty_pages_supported) {
1267         return;
1268     }
1269 
1270     if (start) {
1271         dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
1272     } else {
1273         dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
1274     }
1275 
1276     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
1277     if (ret) {
1278         error_report("Failed to set dirty tracking flag 0x%x errno: %d",
1279                      dirty.flags, errno);
1280     }
1281 }
1282 
1283 static void vfio_listener_log_global_start(MemoryListener *listener)
1284 {
1285     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1286 
1287     vfio_set_dirty_page_tracking(container, true);
1288 }
1289 
1290 static void vfio_listener_log_global_stop(MemoryListener *listener)
1291 {
1292     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1293 
1294     vfio_set_dirty_page_tracking(container, false);
1295 }
1296 
1297 static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
1298                                  uint64_t size, ram_addr_t ram_addr)
1299 {
1300     struct vfio_iommu_type1_dirty_bitmap *dbitmap;
1301     struct vfio_iommu_type1_dirty_bitmap_get *range;
1302     uint64_t pages;
1303     int ret;
1304 
1305     if (!container->dirty_pages_supported) {
1306         cpu_physical_memory_set_dirty_range(ram_addr, size,
1307                                             tcg_enabled() ? DIRTY_CLIENTS_ALL :
1308                                             DIRTY_CLIENTS_NOCODE);
1309         return 0;
1310     }
1311 
1312     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
1313 
1314     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
1315     dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
1316     range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
1317     range->iova = iova;
1318     range->size = size;
1319 
1320     /*
1321      * cpu_physical_memory_set_dirty_lebitmap() expects the bitmap to track
1322      * dirty pages in units of qemu_real_host_page_size(), so report the
1323      * bitmap's pgsize as qemu_real_host_page_size().
1324      */
1325     range->bitmap.pgsize = qemu_real_host_page_size();
1326 
1327     pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size();
1328     range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
1329                                          BITS_PER_BYTE;
1330     range->bitmap.data = g_try_malloc0(range->bitmap.size);
1331     if (!range->bitmap.data) {
1332         ret = -ENOMEM;
1333         goto err_out;
1334     }
1335 
1336     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
1337     if (ret) {
1338         error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
1339                 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
1340                 (uint64_t)range->size, errno);
1341         goto err_out;
1342     }
1343 
1344     cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
1345                                             ram_addr, pages);
1346 
1347     trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
1348                                 range->bitmap.size, ram_addr);
1349 err_out:
1350     g_free(range->bitmap.data);
1351     g_free(dbitmap);
1352 
1353     return ret;
1354 }
1355 
1356 typedef struct {
1357     IOMMUNotifier n;
1358     VFIOGuestIOMMU *giommu;
1359 } vfio_giommu_dirty_notifier;
1360 
1361 static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
1362 {
1363     vfio_giommu_dirty_notifier *gdn = container_of(n,
1364                                                 vfio_giommu_dirty_notifier, n);
1365     VFIOGuestIOMMU *giommu = gdn->giommu;
1366     VFIOContainer *container = giommu->container;
1367     hwaddr iova = iotlb->iova + giommu->iommu_offset;
1368     ram_addr_t translated_addr;
1369 
1370     trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
1371 
1372     if (iotlb->target_as != &address_space_memory) {
1373         error_report("Wrong target AS \"%s\", only system memory is allowed",
1374                      iotlb->target_as->name ? iotlb->target_as->name : "none");
1375         return;
1376     }
1377 
1378     rcu_read_lock();
1379     if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
1380         int ret;
1381 
1382         ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
1383                                     translated_addr);
1384         if (ret) {
1385             error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
1386                          "0x%"HWADDR_PRIx") = %d (%m)",
1387                          container, iova,
1388                          iotlb->addr_mask + 1, ret);
1389         }
1390     }
1391     rcu_read_unlock();
1392 }
1393 
1394 static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
1395                                              void *opaque)
1396 {
1397     const hwaddr size = int128_get64(section->size);
1398     const hwaddr iova = section->offset_within_address_space;
1399     const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1400                                 section->offset_within_region;
1401     VFIORamDiscardListener *vrdl = opaque;
1402 
1403     /*
1404      * Sync the whole mapped region (spanning multiple individual mappings)
1405      * in one go.
1406      */
1407     return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
1408 }
1409 
1410 static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
1411                                                    MemoryRegionSection *section)
1412 {
1413     RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1414     VFIORamDiscardListener *vrdl = NULL;
1415 
1416     QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
1417         if (vrdl->mr == section->mr &&
1418             vrdl->offset_within_address_space ==
1419             section->offset_within_address_space) {
1420             break;
1421         }
1422     }
1423 
1424     if (!vrdl) {
1425         hw_error("vfio: Trying to sync missing RAM discard listener");
1426     }
1427 
1428     /*
1429      * We only want/can synchronize the bitmap for actually mapped parts -
1430      * which correspond to populated parts. Replay all populated parts.
1431      */
1432     return ram_discard_manager_replay_populated(rdm, section,
1433                                                 vfio_ram_discard_get_dirty_bitmap,
1434                                                 vrdl);
1435 }
1436 
1437 static int vfio_sync_dirty_bitmap(VFIOContainer *container,
1438                                   MemoryRegionSection *section)
1439 {
1440     ram_addr_t ram_addr;
1441 
1442     if (memory_region_is_iommu(section->mr)) {
1443         VFIOGuestIOMMU *giommu;
1444 
1445         QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1446             if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1447                 giommu->n.start == section->offset_within_region) {
1448                 Int128 llend;
1449                 vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
1450                 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
1451                                                        MEMTXATTRS_UNSPECIFIED);
1452 
1453                 llend = int128_add(int128_make64(section->offset_within_region),
1454                                    section->size);
1455                 llend = int128_sub(llend, int128_one());
1456 
1457                 iommu_notifier_init(&gdn.n,
1458                                     vfio_iommu_map_dirty_notify,
1459                                     IOMMU_NOTIFIER_MAP,
1460                                     section->offset_within_region,
1461                                     int128_get64(llend),
1462                                     idx);
1463                 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
1464                 break;
1465             }
1466         }
1467         return 0;
1468     } else if (memory_region_has_ram_discard_manager(section->mr)) {
1469         return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
1470     }
1471 
1472     ram_addr = memory_region_get_ram_addr(section->mr) +
1473                section->offset_within_region;
1474 
1475     return vfio_get_dirty_bitmap(container,
1476                    REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
1477                    int128_get64(section->size), ram_addr);
1478 }
1479 
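     /*
      * log_sync callback: sync the section's dirty bitmap, but only when all
      * devices in the container support dirty page tracking.
      */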
1480 static void vfio_listener_log_sync(MemoryListener *listener,
1481         MemoryRegionSection *section)
1482 {
1483     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1484 
1485     if (vfio_listener_skipped_section(section)) {
1486         return;
1487     }
1488 
1489     if (vfio_devices_all_dirty_tracking(container)) {
1490         vfio_sync_dirty_bitmap(container, section);
1491     }
1492 }
1493 
1494 static const MemoryListener vfio_memory_listener = {
1495     .name = "vfio",
1496     .region_add = vfio_listener_region_add,
1497     .region_del = vfio_listener_region_del,
1498     .log_global_start = vfio_listener_log_global_start,
1499     .log_global_stop = vfio_listener_log_global_stop,
1500     .log_sync = vfio_listener_log_sync,
1501 };
1502 
1503 static void vfio_listener_release(VFIOContainer *container)
1504 {
1505     memory_listener_unregister(&container->listener);
1506     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1507         memory_listener_unregister(&container->prereg_listener);
1508     }
1509 }
1510 
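     /*
      * Walk the vfio_info_cap_header chain starting at cap_offset and return
      * the capability matching @id, or NULL if it is not present.
      */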
1511 static struct vfio_info_cap_header *
1512 vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
1513 {
1514     struct vfio_info_cap_header *hdr;
1515 
1516     for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1517         if (hdr->id == id) {
1518             return hdr;
1519         }
1520     }
1521 
1522     return NULL;
1523 }
1524 
1525 struct vfio_info_cap_header *
1526 vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
1527 {
1528     if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
1529         return NULL;
1530     }
1531 
1532     return vfio_get_cap((void *)info, info->cap_offset, id);
1533 }
1534 
1535 static struct vfio_info_cap_header *
1536 vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1537 {
1538     if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1539         return NULL;
1540     }
1541 
1542     return vfio_get_cap((void *)info, info->cap_offset, id);
1543 }
1544 
1545 struct vfio_info_cap_header *
1546 vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
1547 {
1548     if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
1549         return NULL;
1550     }
1551 
1552     return vfio_get_cap((void *)info, info->cap_offset, id);
1553 }
1554 
1555 bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
1556                              unsigned int *avail)
1557 {
1558     struct vfio_info_cap_header *hdr;
1559     struct vfio_iommu_type1_info_dma_avail *cap;
1560 
1561     /* If the capability cannot be found, assume no DMA limiting */
1562     hdr = vfio_get_iommu_type1_info_cap(info,
1563                                         VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
1564     if (hdr == NULL) {
1565         return false;
1566     }
1567 
1568     if (avail != NULL) {
1569         cap = (void *) hdr;
1570         *avail = cap->avail;
1571     }
1572 
1573     return true;
1574 }
1575 
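     /*
      * Build region->mmaps from the region's sparse mmap capability, keeping
      * only areas with a non-zero size.  Returns -ENODEV if the capability is
      * absent, in which case the caller falls back to a single mapping.
      */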
1576 static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
1577                                           struct vfio_region_info *info)
1578 {
1579     struct vfio_info_cap_header *hdr;
1580     struct vfio_region_info_cap_sparse_mmap *sparse;
1581     int i, j;
1582 
1583     hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
1584     if (!hdr) {
1585         return -ENODEV;
1586     }
1587 
1588     sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
1589 
1590     trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
1591                                          region->nr, sparse->nr_areas);
1592 
1593     region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
1594 
1595     for (i = 0, j = 0; i < sparse->nr_areas; i++) {
1596         if (sparse->areas[i].size) {
1597             trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
1598                                             sparse->areas[i].offset +
1599                                             sparse->areas[i].size - 1);
1600             region->mmaps[j].offset = sparse->areas[i].offset;
1601             region->mmaps[j].size = sparse->areas[i].size;
1602             j++;
1603         }
1604     }
1605 
1606     region->nr_mmaps = j;
1607     region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
1608 
1609     return 0;
1610 }
1611 
1612 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
1613                       int index, const char *name)
1614 {
1615     struct vfio_region_info *info;
1616     int ret;
1617 
1618     ret = vfio_get_region_info(vbasedev, index, &info);
1619     if (ret) {
1620         return ret;
1621     }
1622 
1623     region->vbasedev = vbasedev;
1624     region->flags = info->flags;
1625     region->size = info->size;
1626     region->fd_offset = info->offset;
1627     region->nr = index;
1628 
1629     if (region->size) {
1630         region->mem = g_new0(MemoryRegion, 1);
1631         memory_region_init_io(region->mem, obj, &vfio_region_ops,
1632                               region, name, region->size);
1633 
1634         if (!vbasedev->no_mmap &&
1635             region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
1636 
1637             ret = vfio_setup_region_sparse_mmaps(region, info);
1638 
1639             if (ret) {
1640                 region->nr_mmaps = 1;
1641                 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
1642                 region->mmaps[0].offset = 0;
1643                 region->mmaps[0].size = region->size;
1644             }
1645         }
1646     }
1647 
1648     g_free(info);
1649 
1650     trace_vfio_region_setup(vbasedev->name, index, name,
1651                             region->flags, region->fd_offset, region->size);
1652     return 0;
1653 }
1654 
1655 static void vfio_subregion_unmap(VFIORegion *region, int index)
1656 {
1657     trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
1658                             region->mmaps[index].offset,
1659                             region->mmaps[index].offset +
1660                             region->mmaps[index].size - 1);
1661     memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
1662     munmap(region->mmaps[index].mmap, region->mmaps[index].size);
1663     object_unparent(OBJECT(&region->mmaps[index].mem));
1664     region->mmaps[index].mmap = NULL;
1665 }
1666 
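     /*
      * mmap() each mmap-able area of the region and expose it as a RAM device
      * subregion of region->mem.  On failure, unmap everything mapped so far
      * and return -errno.
      */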
1667 int vfio_region_mmap(VFIORegion *region)
1668 {
1669     int i, prot = 0;
1670     char *name;
1671 
1672     if (!region->mem) {
1673         return 0;
1674     }
1675 
1676     prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
1677     prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
1678 
1679     for (i = 0; i < region->nr_mmaps; i++) {
1680         region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
1681                                      MAP_SHARED, region->vbasedev->fd,
1682                                      region->fd_offset +
1683                                      region->mmaps[i].offset);
1684         if (region->mmaps[i].mmap == MAP_FAILED) {
1685             int ret = -errno;
1686 
1687             trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
1688                                          region->fd_offset +
1689                                          region->mmaps[i].offset,
1690                                          region->fd_offset +
1691                                          region->mmaps[i].offset +
1692                                          region->mmaps[i].size - 1, ret);
1693 
1694             region->mmaps[i].mmap = NULL;
1695 
1696             for (i--; i >= 0; i--) {
1697                 vfio_subregion_unmap(region, i);
1698             }
1699 
1700             return ret;
1701         }
1702 
1703         name = g_strdup_printf("%s mmaps[%d]",
1704                                memory_region_name(region->mem), i);
1705         memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
1706                                           memory_region_owner(region->mem),
1707                                           name, region->mmaps[i].size,
1708                                           region->mmaps[i].mmap);
1709         g_free(name);
1710         memory_region_add_subregion(region->mem, region->mmaps[i].offset,
1711                                     &region->mmaps[i].mem);
1712 
1713         trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
1714                                region->mmaps[i].offset,
1715                                region->mmaps[i].offset +
1716                                region->mmaps[i].size - 1);
1717     }
1718 
1719     return 0;
1720 }
1721 
1722 void vfio_region_unmap(VFIORegion *region)
1723 {
1724     int i;
1725 
1726     if (!region->mem) {
1727         return;
1728     }
1729 
1730     for (i = 0; i < region->nr_mmaps; i++) {
1731         if (region->mmaps[i].mmap) {
1732             vfio_subregion_unmap(region, i);
1733         }
1734     }
1735 }
1736 
1737 void vfio_region_exit(VFIORegion *region)
1738 {
1739     int i;
1740 
1741     if (!region->mem) {
1742         return;
1743     }
1744 
1745     for (i = 0; i < region->nr_mmaps; i++) {
1746         if (region->mmaps[i].mmap) {
1747             memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
1748         }
1749     }
1750 
1751     trace_vfio_region_exit(region->vbasedev->name, region->nr);
1752 }
1753 
1754 void vfio_region_finalize(VFIORegion *region)
1755 {
1756     int i;
1757 
1758     if (!region->mem) {
1759         return;
1760     }
1761 
1762     for (i = 0; i < region->nr_mmaps; i++) {
1763         if (region->mmaps[i].mmap) {
1764             munmap(region->mmaps[i].mmap, region->mmaps[i].size);
1765             object_unparent(OBJECT(&region->mmaps[i].mem));
1766         }
1767     }
1768 
1769     object_unparent(OBJECT(region->mem));
1770 
1771     g_free(region->mem);
1772     g_free(region->mmaps);
1773 
1774     trace_vfio_region_finalize(region->vbasedev->name, region->nr);
1775 
1776     region->mem = NULL;
1777     region->mmaps = NULL;
1778     region->nr_mmaps = 0;
1779     region->size = 0;
1780     region->flags = 0;
1781     region->nr = 0;
1782 }
1783 
1784 void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
1785 {
1786     int i;
1787 
1788     if (!region->mem) {
1789         return;
1790     }
1791 
1792     for (i = 0; i < region->nr_mmaps; i++) {
1793         if (region->mmaps[i].mmap) {
1794             memory_region_set_enabled(&region->mmaps[i].mem, enabled);
1795         }
1796     }
1797 
1798     trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
1799                                         enabled);
1800 }
1801 
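     /*
      * System reset handler: let every realized device compute whether it
      * needs a reset, then issue hot resets for those that do.
      */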
1802 void vfio_reset_handler(void *opaque)
1803 {
1804     VFIOGroup *group;
1805     VFIODevice *vbasedev;
1806 
1807     QLIST_FOREACH(group, &vfio_group_list, next) {
1808         QLIST_FOREACH(vbasedev, &group->device_list, next) {
1809             if (vbasedev->dev->realized) {
1810                 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1811             }
1812         }
1813     }
1814 
1815     QLIST_FOREACH(group, &vfio_group_list, next) {
1816         QLIST_FOREACH(vbasedev, &group->device_list, next) {
1817             if (vbasedev->dev->realized && vbasedev->needs_reset) {
1818                 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1819             }
1820         }
1821     }
1822 }
1823 
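     /*
      * Register the group with the KVM VFIO pseudo device, creating that
      * device on first use.  No-op when KVM is not enabled or not built in.
      */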
1824 static void vfio_kvm_device_add_group(VFIOGroup *group)
1825 {
1826 #ifdef CONFIG_KVM
1827     struct kvm_device_attr attr = {
1828         .group = KVM_DEV_VFIO_GROUP,
1829         .attr = KVM_DEV_VFIO_GROUP_ADD,
1830         .addr = (uint64_t)(unsigned long)&group->fd,
1831     };
1832 
1833     if (!kvm_enabled()) {
1834         return;
1835     }
1836 
1837     if (vfio_kvm_device_fd < 0) {
1838         struct kvm_create_device cd = {
1839             .type = KVM_DEV_TYPE_VFIO,
1840         };
1841 
1842         if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
1843             error_report("Failed to create KVM VFIO device: %m");
1844             return;
1845         }
1846 
1847         vfio_kvm_device_fd = cd.fd;
1848     }
1849 
1850     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1851         error_report("Failed to add group %d to KVM VFIO device: %m",
1852                      group->groupid);
1853     }
1854 #endif
1855 }
1856 
1857 static void vfio_kvm_device_del_group(VFIOGroup *group)
1858 {
1859 #ifdef CONFIG_KVM
1860     struct kvm_device_attr attr = {
1861         .group = KVM_DEV_VFIO_GROUP,
1862         .attr = KVM_DEV_VFIO_GROUP_DEL,
1863         .addr = (uint64_t)(unsigned long)&group->fd,
1864     };
1865 
1866     if (vfio_kvm_device_fd < 0) {
1867         return;
1868     }
1869 
1870     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1871         error_report("Failed to remove group %d from KVM VFIO device: %m",
1872                      group->groupid);
1873     }
1874 #endif
1875 }
1876 
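     /*
      * Return the VFIOAddressSpace wrapping @as, creating and registering a
      * new one if none exists yet.
      */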
1877 static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1878 {
1879     VFIOAddressSpace *space;
1880 
1881     QLIST_FOREACH(space, &vfio_address_spaces, list) {
1882         if (space->as == as) {
1883             return space;
1884         }
1885     }
1886 
1887     /* No suitable VFIOAddressSpace, create a new one */
1888     space = g_malloc0(sizeof(*space));
1889     space->as = as;
1890     QLIST_INIT(&space->containers);
1891 
1892     QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1893 
1894     return space;
1895 }
1896 
1897 static void vfio_put_address_space(VFIOAddressSpace *space)
1898 {
1899     if (QLIST_EMPTY(&space->containers)) {
1900         QLIST_REMOVE(space, list);
1901         g_free(space);
1902     }
1903 }
1904 
1905 /*
1906  * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
1907  */
1908 static int vfio_get_iommu_type(VFIOContainer *container,
1909                                Error **errp)
1910 {
1911     int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
1912                           VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
1913     int i;
1914 
1915     for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
1916         if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
1917             return iommu_types[i];
1918         }
1919     }
1920     error_setg(errp, "No available IOMMU models");
1921     return -EINVAL;
1922 }
1923 
1924 static int vfio_init_container(VFIOContainer *container, int group_fd,
1925                                Error **errp)
1926 {
1927     int iommu_type, ret;
1928 
1929     iommu_type = vfio_get_iommu_type(container, errp);
1930     if (iommu_type < 0) {
1931         return iommu_type;
1932     }
1933 
1934     ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
1935     if (ret) {
1936         error_setg_errno(errp, errno, "Failed to set group container");
1937         return -errno;
1938     }
1939 
1940     while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
1941         if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1942             /*
1943              * On sPAPR, although the IOMMU subdriver always advertises v1
1944              * and v2, the running platform may not support v2, and there is
1945              * no way to tell until an IOMMU group gets added to the
1946              * container.  So if setting v2 fails, fall back to v1.
1947              */
1948             iommu_type = VFIO_SPAPR_TCE_IOMMU;
1949             continue;
1950         }
1951         error_setg_errno(errp, errno, "Failed to set iommu for container");
1952         return -errno;
1953     }
1954 
1955     container->iommu_type = iommu_type;
1956     return 0;
1957 }
1958 
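     /*
      * Query VFIO_IOMMU_GET_INFO, growing the buffer until it satisfies the
      * argsz reported by the kernel.  The caller owns the returned buffer.
      */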
1959 static int vfio_get_iommu_info(VFIOContainer *container,
1960                                struct vfio_iommu_type1_info **info)
1961 {
1962 
1963     size_t argsz = sizeof(struct vfio_iommu_type1_info);
1964 
1965     *info = g_new0(struct vfio_iommu_type1_info, 1);
1966 again:
1967     (*info)->argsz = argsz;
1968 
1969     if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
1970         g_free(*info);
1971         *info = NULL;
1972         return -errno;
1973     }
1974 
1975     if ((*info)->argsz > argsz) {
1976         argsz = (*info)->argsz;
1977         *info = g_realloc(*info, argsz);
1978         goto again;
1979     }
1980 
1981     return 0;
1982 }
1983 
1984 static struct vfio_info_cap_header *
1985 vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1986 {
1987     struct vfio_info_cap_header *hdr;
1988     void *ptr = info;
1989 
1990     if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1991         return NULL;
1992     }
1993 
1994     for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1995         if (hdr->id == id) {
1996             return hdr;
1997         }
1998     }
1999 
2000     return NULL;
2001 }
2002 
2003 static void vfio_get_iommu_info_migration(VFIOContainer *container,
2004                                          struct vfio_iommu_type1_info *info)
2005 {
2006     struct vfio_info_cap_header *hdr;
2007     struct vfio_iommu_type1_info_cap_migration *cap_mig;
2008 
2009     hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
2010     if (!hdr) {
2011         return;
2012     }
2013 
2014     cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
2015                             header);
2016 
2017     /*
2018      * cpu_physical_memory_set_dirty_lebitmap() expects dirty bitmaps at a
2019      * qemu_real_host_page_size() granularity, so require that page size.
2020      */
2021     if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
2022         container->dirty_pages_supported = true;
2023         container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
2024         container->dirty_pgsizes = cap_mig->pgsize_bitmap;
2025     }
2026 }
2027 
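     /*
      * Attach @group to an existing container in this address space when
      * possible; otherwise open /dev/vfio/vfio, pick an IOMMU type, configure
      * the new container and register its memory listener.
      */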
2028 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
2029                                   Error **errp)
2030 {
2031     VFIOContainer *container;
2032     int ret, fd;
2033     VFIOAddressSpace *space;
2034 
2035     space = vfio_get_address_space(as);
2036 
2037     /*
2038      * VFIO is currently incompatible with discarding of RAM insofar as the
2039      * madvise to purge (zap) the page from QEMU's address space does not
2040      * interact with the memory API and therefore leaves stale virtual to
2041      * physical mappings in the IOMMU if the page was previously pinned.  We
2042      * therefore set discarding broken for each group added to a container,
2043      * whether the container is used individually or shared.  This provides
2044      * us with options to allow devices within a group to opt in and allow
2045      * discarding, so long as it is done consistently for a group (for instance
2046      * if the device is an mdev device where it is known that the host vendor
2047      * driver will never pin pages outside of the working set of the guest
2048      * driver, which would thus not be discarding candidates).
2049      *
2050      * The first opportunity to induce pinning occurs here where we attempt to
2051      * attach the group to existing containers within the AddressSpace.  If any
2052      * pages are already zapped from the virtual address space, such as from
2053      * previous discards, new pinning will cause valid mappings to be
2054      * re-established.  Likewise, when the overall MemoryListener for a new
2055      * container is registered, a replay of mappings within the AddressSpace
2056      * will occur, re-establishing any previously zapped pages as well.
2057      *
2058      * In particular, virtio-balloon is currently only prevented from
2059      * discarding new memory; it does not yet set
2060      * ram_block_discard_set_required() and therefore neither stops us here
2061      * nor deals with the sudden memory consumption of inflated memory.
2062      *
2063      * We do support discarding of memory coordinated via the RamDiscardManager
2064      * with some IOMMU types. vfio_ram_block_discard_disable() handles the
2065      * details once we know which type of IOMMU we are using.
2066      */
2067 
2068     QLIST_FOREACH(container, &space->containers, next) {
2069         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
2070             ret = vfio_ram_block_discard_disable(container, true);
2071             if (ret) {
2072                 error_setg_errno(errp, -ret,
2073                                  "Cannot set discarding of RAM broken");
2074                 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2075                           &container->fd)) {
2076                     error_report("vfio: error disconnecting group %d from"
2077                                  " container", group->groupid);
2078                 }
2079                 return ret;
2080             }
2081             group->container = container;
2082             QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2083             vfio_kvm_device_add_group(group);
2084             return 0;
2085         }
2086     }
2087 
2088     fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
2089     if (fd < 0) {
2090         error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
2091         ret = -errno;
2092         goto put_space_exit;
2093     }
2094 
2095     ret = ioctl(fd, VFIO_GET_API_VERSION);
2096     if (ret != VFIO_API_VERSION) {
2097         error_setg(errp, "supported vfio version: %d, "
2098                    "reported version: %d", VFIO_API_VERSION, ret);
2099         ret = -EINVAL;
2100         goto close_fd_exit;
2101     }
2102 
2103     container = g_malloc0(sizeof(*container));
2104     container->space = space;
2105     container->fd = fd;
2106     container->error = NULL;
2107     container->dirty_pages_supported = false;
2108     container->dma_max_mappings = 0;
2109     QLIST_INIT(&container->giommu_list);
2110     QLIST_INIT(&container->hostwin_list);
2111     QLIST_INIT(&container->vrdl_list);
2112 
2113     ret = vfio_init_container(container, group->fd, errp);
2114     if (ret) {
2115         goto free_container_exit;
2116     }
2117 
2118     ret = vfio_ram_block_discard_disable(container, true);
2119     if (ret) {
2120         error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2121         goto free_container_exit;
2122     }
2123 
2124     switch (container->iommu_type) {
2125     case VFIO_TYPE1v2_IOMMU:
2126     case VFIO_TYPE1_IOMMU:
2127     {
2128         struct vfio_iommu_type1_info *info;
2129 
2130         ret = vfio_get_iommu_info(container, &info);
2131         if (ret) {
2132             error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
2133             goto enable_discards_exit;
2134         }
2135 
2136         if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
2137             container->pgsizes = info->iova_pgsizes;
2138         } else {
2139             container->pgsizes = qemu_real_host_page_size();
2140         }
2141 
2142         if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
2143             container->dma_max_mappings = 65535;
2144         }
2145         vfio_get_iommu_info_migration(container, info);
2146         g_free(info);
2147 
2148         /*
2149          * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
2150          * information to get the actual window extent rather than assume
2151          * a 64-bit IOVA address space.
2152          */
2153         vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
2154 
2155         break;
2156     }
2157     case VFIO_SPAPR_TCE_v2_IOMMU:
2158     case VFIO_SPAPR_TCE_IOMMU:
2159     {
2160         struct vfio_iommu_spapr_tce_info info;
2161         bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
2162 
2163         /*
2164          * The host kernel code implementing VFIO_IOMMU_DISABLE is called
2165          * when the container fd is closed, so we do not call it explicitly
2166          * in this file.
2167          */
2168         if (!v2) {
2169             ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2170             if (ret) {
2171                 error_setg_errno(errp, errno, "failed to enable container");
2172                 ret = -errno;
2173                 goto enable_discards_exit;
2174             }
2175         } else {
2176             container->prereg_listener = vfio_prereg_listener;
2177 
2178             memory_listener_register(&container->prereg_listener,
2179                                      &address_space_memory);
2180             if (container->error) {
2181                 memory_listener_unregister(&container->prereg_listener);
2182                 ret = -1;
2183                 error_propagate_prepend(errp, container->error,
2184                     "RAM memory listener initialization failed: ");
2185                 goto enable_discards_exit;
2186             }
2187         }
2188 
2189         info.argsz = sizeof(info);
2190         ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2191         if (ret) {
2192             error_setg_errno(errp, errno,
2193                              "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
2194             ret = -errno;
2195             if (v2) {
2196                 memory_listener_unregister(&container->prereg_listener);
2197             }
2198             goto enable_discards_exit;
2199         }
2200 
2201         if (v2) {
2202             container->pgsizes = info.ddw.pgsizes;
2203             /*
2204              * A freshly created container comes with a default DMA window.
2205              * To keep region_add/del simple, remove this window now and let
2206              * the iommu_listener callbacks create/remove windows as they
2207              * are needed.
2208              */
2209             ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2210             if (ret) {
2211                 error_setg_errno(errp, -ret,
2212                                  "failed to remove existing window");
2213                 goto enable_discards_exit;
2214             }
2215         } else {
2216             /* The default table uses 4K pages */
2217             container->pgsizes = 0x1000;
2218             vfio_host_win_add(container, info.dma32_window_start,
2219                               info.dma32_window_start +
2220                               info.dma32_window_size - 1,
2221                               0x1000);
2222         }
2223     }
2224     }
2225 
2226     vfio_kvm_device_add_group(group);
2227 
2228     QLIST_INIT(&container->group_list);
2229     QLIST_INSERT_HEAD(&space->containers, container, next);
2230 
2231     group->container = container;
2232     QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2233 
2234     container->listener = vfio_memory_listener;
2235 
2236     memory_listener_register(&container->listener, container->space->as);
2237 
2238     if (container->error) {
2239         ret = -1;
2240         error_propagate_prepend(errp, container->error,
2241             "memory listener initialization failed: ");
2242         goto listener_release_exit;
2243     }
2244 
2245     container->initialized = true;
2246 
2247     return 0;
2248 listener_release_exit:
2249     QLIST_REMOVE(group, container_next);
2250     QLIST_REMOVE(container, next);
2251     vfio_kvm_device_del_group(group);
2252     vfio_listener_release(container);
2253 
2254 enable_discards_exit:
2255     vfio_ram_block_discard_disable(container, false);
2256 
2257 free_container_exit:
2258     g_free(container);
2259 
2260 close_fd_exit:
2261     close(fd);
2262 
2263 put_space_exit:
2264     vfio_put_address_space(space);
2265 
2266     return ret;
2267 }
2268 
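     /*
      * Detach @group from its container and tear the container down once the
      * last group has been removed.
      */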
2269 static void vfio_disconnect_container(VFIOGroup *group)
2270 {
2271     VFIOContainer *container = group->container;
2272 
2273     QLIST_REMOVE(group, container_next);
2274     group->container = NULL;
2275 
2276     /*
2277      * Explicitly release the listener before unsetting the container,
2278      * since unsetting may destroy the backend container if this is the
2279      * last group.
2280      */
2281     if (QLIST_EMPTY(&container->group_list)) {
2282         vfio_listener_release(container);
2283     }
2284 
2285     if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
2286         error_report("vfio: error disconnecting group %d from container",
2287                      group->groupid);
2288     }
2289 
2290     if (QLIST_EMPTY(&container->group_list)) {
2291         VFIOAddressSpace *space = container->space;
2292         VFIOGuestIOMMU *giommu, *tmp;
2293         VFIOHostDMAWindow *hostwin, *next;
2294 
2295         QLIST_REMOVE(container, next);
2296 
2297         QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
2298             memory_region_unregister_iommu_notifier(
2299                     MEMORY_REGION(giommu->iommu_mr), &giommu->n);
2300             QLIST_REMOVE(giommu, giommu_next);
2301             g_free(giommu);
2302         }
2303 
2304         QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
2305                            next) {
2306             QLIST_REMOVE(hostwin, hostwin_next);
2307             g_free(hostwin);
2308         }
2309 
2310         trace_vfio_disconnect_container(container->fd);
2311         close(container->fd);
2312         g_free(container);
2313 
2314         vfio_put_address_space(space);
2315     }
2316 }
2317 
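     /*
      * Return the VFIOGroup for @groupid in @as, opening /dev/vfio/<groupid>
      * and connecting it to a container on first use.
      */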
2318 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
2319 {
2320     VFIOGroup *group;
2321     char path[32];
2322     struct vfio_group_status status = { .argsz = sizeof(status) };
2323 
2324     QLIST_FOREACH(group, &vfio_group_list, next) {
2325         if (group->groupid == groupid) {
2326             /* Found it.  Now is it already in the right context? */
2327             if (group->container->space->as == as) {
2328                 return group;
2329             } else {
2330                 error_setg(errp, "group %d used in multiple address spaces",
2331                            group->groupid);
2332                 return NULL;
2333             }
2334         }
2335     }
2336 
2337     group = g_malloc0(sizeof(*group));
2338 
2339     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
2340     group->fd = qemu_open_old(path, O_RDWR);
2341     if (group->fd < 0) {
2342         error_setg_errno(errp, errno, "failed to open %s", path);
2343         goto free_group_exit;
2344     }
2345 
2346     if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
2347         error_setg_errno(errp, errno, "failed to get group %d status", groupid);
2348         goto close_fd_exit;
2349     }
2350 
2351     if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
2352         error_setg(errp, "group %d is not viable", groupid);
2353         error_append_hint(errp,
2354                           "Please ensure all devices within the iommu_group "
2355                           "are bound to their vfio bus driver.\n");
2356         goto close_fd_exit;
2357     }
2358 
2359     group->groupid = groupid;
2360     QLIST_INIT(&group->device_list);
2361 
2362     if (vfio_connect_container(group, as, errp)) {
2363         error_prepend(errp, "failed to setup container for group %d: ",
2364                       groupid);
2365         goto close_fd_exit;
2366     }
2367 
2368     if (QLIST_EMPTY(&vfio_group_list)) {
2369         qemu_register_reset(vfio_reset_handler, NULL);
2370     }
2371 
2372     QLIST_INSERT_HEAD(&vfio_group_list, group, next);
2373 
2374     return group;
2375 
2376 close_fd_exit:
2377     close(group->fd);
2378 
2379 free_group_exit:
2380     g_free(group);
2381 
2382     return NULL;
2383 }
2384 
2385 void vfio_put_group(VFIOGroup *group)
2386 {
2387     if (!group || !QLIST_EMPTY(&group->device_list)) {
2388         return;
2389     }
2390 
2391     if (!group->ram_block_discard_allowed) {
2392         vfio_ram_block_discard_disable(group->container, false);
2393     }
2394     vfio_kvm_device_del_group(group);
2395     vfio_disconnect_container(group);
2396     QLIST_REMOVE(group, next);
2397     trace_vfio_put_group(group->fd);
2398     close(group->fd);
2399     g_free(group);
2400 
2401     if (QLIST_EMPTY(&vfio_group_list)) {
2402         qemu_unregister_reset(vfio_reset_handler, NULL);
2403     }
2404 }
2405 
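     /*
      * Fetch a device fd from the group and populate the basic VFIODevice
      * state (regions, IRQs, flags), enforcing a consistent per-group RAM
      * discard setting.
      */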
2406 int vfio_get_device(VFIOGroup *group, const char *name,
2407                     VFIODevice *vbasedev, Error **errp)
2408 {
2409     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
2410     int ret, fd;
2411 
2412     fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
2413     if (fd < 0) {
2414         error_setg_errno(errp, errno, "error getting device from group %d",
2415                          group->groupid);
2416         error_append_hint(errp,
2417                       "Verify all devices in group %d are bound to vfio-<bus> "
2418                       "or pci-stub and not already in use\n", group->groupid);
2419         return fd;
2420     }
2421 
2422     ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
2423     if (ret) {
2424         error_setg_errno(errp, errno, "error getting device info");
2425         close(fd);
2426         return ret;
2427     }
2428 
2429     /*
2430      * Set discarding of RAM as not broken for this group if the driver knows
2431      * the device operates compatibly with discarding.  Setting must be
2432      * consistent per group, but since compatibility is really only possible
2433      * with mdev currently, we expect singleton groups.
2434      */
2435     if (vbasedev->ram_block_discard_allowed !=
2436         group->ram_block_discard_allowed) {
2437         if (!QLIST_EMPTY(&group->device_list)) {
2438             error_setg(errp, "Inconsistent setting of support for discarding "
2439                        "RAM (e.g., balloon) within group");
2440             close(fd);
2441             return -1;
2442         }
2443 
2444         if (!group->ram_block_discard_allowed) {
2445             group->ram_block_discard_allowed = true;
2446             vfio_ram_block_discard_disable(group->container, false);
2447         }
2448     }
2449 
2450     vbasedev->fd = fd;
2451     vbasedev->group = group;
2452     QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
2453 
2454     vbasedev->num_irqs = dev_info.num_irqs;
2455     vbasedev->num_regions = dev_info.num_regions;
2456     vbasedev->flags = dev_info.flags;
2457 
2458     trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
2459                           dev_info.num_irqs);
2460 
2461     vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
2462     return 0;
2463 }
2464 
2465 void vfio_put_base_device(VFIODevice *vbasedev)
2466 {
2467     if (!vbasedev->group) {
2468         return;
2469     }
2470     QLIST_REMOVE(vbasedev, next);
2471     vbasedev->group = NULL;
2472     trace_vfio_put_base_device(vbasedev->fd);
2473     close(vbasedev->fd);
2474 }
2475 
2476 int vfio_get_region_info(VFIODevice *vbasedev, int index,
2477                          struct vfio_region_info **info)
2478 {
2479     size_t argsz = sizeof(struct vfio_region_info);
2480 
2481     *info = g_malloc0(argsz);
2482 
2483     (*info)->index = index;
2484 retry:
2485     (*info)->argsz = argsz;
2486 
2487     if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
2488         g_free(*info);
2489         *info = NULL;
2490         return -errno;
2491     }
2492 
2493     if ((*info)->argsz > argsz) {
2494         argsz = (*info)->argsz;
2495         *info = g_realloc(*info, argsz);
2496 
2497         goto retry;
2498     }
2499 
2500     return 0;
2501 }
2502 
2503 int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
2504                              uint32_t subtype, struct vfio_region_info **info)
2505 {
2506     int i;
2507 
2508     for (i = 0; i < vbasedev->num_regions; i++) {
2509         struct vfio_info_cap_header *hdr;
2510         struct vfio_region_info_cap_type *cap_type;
2511 
2512         if (vfio_get_region_info(vbasedev, i, info)) {
2513             continue;
2514         }
2515 
2516         hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
2517         if (!hdr) {
2518             g_free(*info);
2519             continue;
2520         }
2521 
2522         cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
2523 
2524         trace_vfio_get_dev_region(vbasedev->name, i,
2525                                   cap_type->type, cap_type->subtype);
2526 
2527         if (cap_type->type == type && cap_type->subtype == subtype) {
2528             return 0;
2529         }
2530 
2531         g_free(*info);
2532     }
2533 
2534     *info = NULL;
2535     return -ENODEV;
2536 }
2537 
2538 bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
2539 {
2540     struct vfio_region_info *info = NULL;
2541     bool ret = false;
2542 
2543     if (!vfio_get_region_info(vbasedev, region, &info)) {
2544         if (vfio_get_region_info_cap(info, cap_type)) {
2545             ret = true;
2546         }
2547         g_free(info);
2548     }
2549 
2550     return ret;
2551 }
2552 
2553 /*
2554  * Interfaces for IBM EEH (Enhanced Error Handling)
2555  */
2556 static bool vfio_eeh_container_ok(VFIOContainer *container)
2557 {
2558     /*
2559      * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
2560      * implementation is broken if there are multiple groups in a
2561      * container.  The hardware works in units of Partitionable
2562      * Endpoints (== IOMMU groups) and the EEH operations naively
2563      * iterate across all groups in the container, without any logic
2564      * to make sure the groups have their state synchronized.  For
2565      * certain operations (ENABLE) that might be ok, until an error
2566      * occurs, but for others (GET_STATE) it's clearly broken.
2567      */
2568 
2569     /*
2570      * XXX Once fixed kernels exist, test for them here
2571      */
2572 
2573     if (QLIST_EMPTY(&container->group_list)) {
2574         return false;
2575     }
2576 
2577     if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
2578         return false;
2579     }
2580 
2581     return true;
2582 }
2583 
2584 static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
2585 {
2586     struct vfio_eeh_pe_op pe_op = {
2587         .argsz = sizeof(pe_op),
2588         .op = op,
2589     };
2590     int ret;
2591 
2592     if (!vfio_eeh_container_ok(container)) {
2593         error_report("vfio/eeh: EEH_PE_OP 0x%x: "
2594                      "kernel requires a container with exactly one group", op);
2595         return -EPERM;
2596     }
2597 
2598     ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
2599     if (ret < 0) {
2600         error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
2601         return -errno;
2602     }
2603 
2604     return ret;
2605 }
2606 
2607 static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
2608 {
2609     VFIOAddressSpace *space = vfio_get_address_space(as);
2610     VFIOContainer *container = NULL;
2611 
2612     if (QLIST_EMPTY(&space->containers)) {
2613         /* No containers to act on */
2614         goto out;
2615     }
2616 
2617     container = QLIST_FIRST(&space->containers);
2618 
2619     if (QLIST_NEXT(container, next)) {
2620         /* We don't yet have logic to synchronize EEH state across
2621          * multiple containers */
2622         container = NULL;
2623         goto out;
2624     }
2625 
2626 out:
2627     vfio_put_address_space(space);
2628     return container;
2629 }
2630 
2631 bool vfio_eeh_as_ok(AddressSpace *as)
2632 {
2633     VFIOContainer *container = vfio_eeh_as_container(as);
2634 
2635     return (container != NULL) && vfio_eeh_container_ok(container);
2636 }
2637 
2638 int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
2639 {
2640     VFIOContainer *container = vfio_eeh_as_container(as);
2641 
2642     if (!container) {
2643         return -ENODEV;
2644     }
2645     return vfio_eeh_container_op(container, op);
2646 }
2647