xref: /openbmc/qemu/hw/vfio/pci.c (revision 2bfcd27e00a49da2efa5d703121b94cd9cd4948b)
1 /*
2  * vfio based device assignment support
3  *
4  * Copyright Red Hat, Inc. 2012
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Based on qemu-kvm device-assignment:
13  *  Adapted for KVM by Qumranet.
14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19  */
20 
21 #include "qemu/osdep.h"
22 #include CONFIG_DEVICES /* CONFIG_IOMMUFD */
23 #include <linux/vfio.h>
24 #include <sys/ioctl.h>
25 
26 #include "hw/hw.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/pci/pci_bridge.h"
30 #include "hw/qdev-properties.h"
31 #include "hw/qdev-properties-system.h"
32 #include "hw/vfio/vfio-cpr.h"
33 #include "migration/vmstate.h"
34 #include "migration/cpr.h"
35 #include "qobject/qdict.h"
36 #include "qemu/error-report.h"
37 #include "qemu/main-loop.h"
38 #include "qemu/module.h"
39 #include "qemu/range.h"
40 #include "qemu/units.h"
41 #include "system/kvm.h"
42 #include "system/runstate.h"
43 #include "pci.h"
44 #include "trace.h"
45 #include "qapi/error.h"
46 #include "migration/blocker.h"
47 #include "migration/qemu-file.h"
48 #include "system/iommufd.h"
49 #include "vfio-migration-internal.h"
50 #include "vfio-helpers.h"
51 
/* Type name string for the "vfio-pci-nohotplug" device variant. */
#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"

/*
 * KVM irqchip route-change state shared by the MSI/MSI-X virq setup paths
 * in this file (begun with kvm_irqchip_begin_route_changes() and finished
 * with kvm_irqchip_commit_route_changes()).
 *
 * Protected by BQL
 */
static KVMRouteChange vfio_route_change;

/* Forward declarations for helpers that are used before their definitions. */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
60 
61 /* Create new or reuse existing eventfd */
62 static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
63                                const char *name, int nr, Error **errp)
64 {
65     int fd, ret;
66 
67     fd = vfio_cpr_load_vector_fd(vdev, name, nr);
68     if (fd >= 0) {
69         event_notifier_init_fd(e, fd);
70         return true;
71     }
72 
73     ret = event_notifier_init(e, 0);
74     if (ret) {
75         error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name);
76         return false;
77     }
78 
79     fd = event_notifier_get_fd(e);
80     vfio_cpr_save_vector_fd(vdev, name, nr, fd);
81     return true;
82 }
83 
/*
 * Counterpart of vfio_notifier_init(): drop the fd record kept for
 * (@name, @nr) via vfio_cpr_delete_vector_fd(), then clean up the
 * notifier @e itself.
 */
static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e,
                                  const char *name, int nr)
{
    vfio_cpr_delete_vector_fd(vdev, name, nr);
    event_notifier_cleanup(e);
}
90 
91 /*
92  * Disabling BAR mmaping can be slow, but toggling it around INTx can
93  * also be a huge overhead.  We try to get the best of both worlds by
94  * waiting until an interrupt to disable mmaps (subsequent transitions
95  * to the same state are effectively no overhead).  If the interrupt has
96  * been serviced and the time gap is long enough, we re-enable mmaps for
97  * performance.  This works well for things like graphics cards, which
98  * may not use their interrupt at all and are penalized to an unusable
99  * level by read/write BAR traps.  Other devices, like NICs, have more
100  * regular interrupts and see much better latency by staying in non-mmap
101  * mode.  We therefore set the default mmap_timeout such that a ping
102  * is just enough to keep the mmap disabled.  Users can experiment with
103  * other options with the x-intx-mmap-timeout-ms parameter (a value of
104  * zero disables the timer).
105  */
106 static void vfio_intx_mmap_enable(void *opaque)
107 {
108     VFIOPCIDevice *vdev = opaque;
109 
110     if (vdev->intx.pending) {
111         timer_mod(vdev->intx.mmap_timer,
112                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
113         return;
114     }
115 
116     vfio_mmap_set_enabled(vdev, true);
117 }
118 
119 static void vfio_intx_interrupt(void *opaque)
120 {
121     VFIOPCIDevice *vdev = opaque;
122 
123     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
124         return;
125     }
126 
127     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
128 
129     vdev->intx.pending = true;
130     pci_irq_assert(&vdev->pdev);
131     vfio_mmap_set_enabled(vdev, false);
132     if (vdev->intx.mmap_timeout) {
133         timer_mod(vdev->intx.mmap_timer,
134                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
135     }
136 }
137 
138 void vfio_pci_intx_eoi(VFIODevice *vbasedev)
139 {
140     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
141 
142     if (!vdev->intx.pending) {
143         return;
144     }
145 
146     trace_vfio_pci_intx_eoi(vbasedev->name);
147 
148     vdev->intx.pending = false;
149     pci_irq_deassert(&vdev->pdev);
150     vfio_device_irq_unmask(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
151 }
152 
/*
 * Try to switch INTx delivery for @vdev to the KVM irqfd/resamplefd path.
 *
 * Returns true when acceleration was set up, and also when it is skipped
 * (no_kvm_intx set, irqfds or resamplefds not enabled, INTx route not
 * enabled, or built without CONFIG_KVM).  Returns false when setup fails;
 * in that case the QEMU fd handler is reinstalled and the interrupt is
 * unmasked again before returning.
 */
static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
{
#ifdef CONFIG_KVM
    int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);

    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return true;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
    vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
        goto fail;
    }

    /* Hand the interrupt eventfd and the unmask eventfd to KVM */
    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
                                           &vdev->intx.interrupt,
                                           &vdev->intx.unmask,
                                           vdev->intx.route.irq)) {
        error_setg_errno(errp, errno, "failed to setup resample irqfd");
        goto fail_irqfd;
    }

    /* Register the unmask eventfd with the device's INTx index */
    if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
                                       VFIO_IRQ_SET_ACTION_UNMASK,
                                       event_notifier_get_fd(&vdev->intx.unmask),
                                       errp)) {
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);

    return true;

    /* Unwind in reverse order of setup */
fail_vfio:
    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
                                          vdev->intx.route.irq);
fail_irqfd:
    vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
fail:
    /* Fall back to delivery through the QEMU fd handler */
    qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
    vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    return false;
#else
    return true;
#endif
}
212 
213 static bool vfio_cpr_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
214 {
215 #ifdef CONFIG_KVM
216     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
217         vdev->intx.route.mode != PCI_INTX_ENABLED ||
218         !kvm_resamplefds_enabled()) {
219         return true;
220     }
221 
222     if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
223         return false;
224     }
225 
226     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
227                                            &vdev->intx.interrupt,
228                                            &vdev->intx.unmask,
229                                            vdev->intx.route.irq)) {
230         error_setg_errno(errp, errno, "failed to setup resample irqfd");
231         vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
232         return false;
233     }
234 
235     vdev->intx.kvm_accel = true;
236     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
237     return true;
238 #else
239     return true;
240 #endif
241 }
242 
/*
 * Undo vfio_intx_enable_kvm(): take INTx off the KVM irqfd path and hand
 * delivery back to the QEMU fd handler (vfio_intx_interrupt).
 *
 * No-op when KVM acceleration is not active (intx.kvm_accel is false) or
 * when built without CONFIG_KVM.
 */
static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
                                              vdev->intx.route.irq)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
                        vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
}
279 
280 static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
281 {
282     Error *err = NULL;
283 
284     trace_vfio_intx_update(vdev->vbasedev.name,
285                            vdev->intx.route.irq, route->irq);
286 
287     vfio_intx_disable_kvm(vdev);
288 
289     vdev->intx.route = *route;
290 
291     if (route->mode != PCI_INTX_ENABLED) {
292         return;
293     }
294 
295     if (!vfio_intx_enable_kvm(vdev, &err)) {
296         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
297     }
298 
299     /* Re-enable the interrupt in cased we missed an EOI */
300     vfio_pci_intx_eoi(&vdev->vbasedev);
301 }
302 
303 static void vfio_intx_routing_notifier(PCIDevice *pdev)
304 {
305     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
306     PCIINTxRoute route;
307 
308     if (vdev->interrupt != VFIO_INT_INTx) {
309         return;
310     }
311 
312     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
313 
314     if (pci_intx_route_changed(&vdev->intx.route, &route)) {
315         vfio_intx_update(vdev, &route);
316     }
317 }
318 
/*
 * Notifier callback registered as the device's irqchip_change_notifier.
 * Re-runs vfio_intx_update() with the currently stored route, which tears
 * down and re-attempts KVM INTx acceleration.  @data is unused.
 */
static void vfio_irqchip_change(Notifier *notify, void *data)
{
    VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
                                       irqchip_change_notifier);

    vfio_intx_update(vdev, &vdev->intx.route);
}
326 
/*
 * Put @vdev into INTx mode.
 *
 * Returns true on success, including the case where the device reports no
 * interrupt pin (nothing is set up then).  Returns false with @errp set
 * when the interrupt notifier cannot be created or the trigger eventfd
 * cannot be registered with the device.  Failure to enable KVM
 * acceleration is only reported as a warning and does not fail the call.
 */
static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    Error *err = NULL;
    int32_t fd;


    /* No interrupt pin: nothing to enable */
    if (!pin) {
        return true;
    }

    /*
     * Do not alter interrupt state during vfio_realize and cpr load.
     * The incoming state is cleared thereafter.
     */
    if (!cpr_is_incoming()) {
        vfio_disable_interrupts(vdev);
    }

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    pci_config_set_interrupt_pin(vdev->pdev.config, pin);

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    /* Create (or reuse) the interrupt eventfd and service it from QEMU */
    if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0,
                            errp)) {
        return false;
    }
    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);


    /*
     * CPR incoming: only re-attach KVM acceleration and skip registering
     * the trigger eventfd with the device.
     */
    if (cpr_is_incoming()) {
        if (!vfio_cpr_intx_enable_kvm(vdev, &err)) {
            warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
        }
        goto skip_signaling;
    }

    /* Register the eventfd as the INTx trigger; undo the setup on failure */
    if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
                                VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
        qemu_set_fd_handler(fd, NULL, NULL, vdev);
        vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
        return false;
    }

    /* Best effort: a KVM acceleration failure is only a warning */
    if (!vfio_intx_enable_kvm(vdev, &err)) {
        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

skip_signaling:
    vdev->interrupt = VFIO_INT_INTx;

    trace_vfio_intx_enable(vdev->vbasedev.name);
    return true;
}
392 
393 static void vfio_intx_disable(VFIOPCIDevice *vdev)
394 {
395     int fd;
396 
397     timer_del(vdev->intx.mmap_timer);
398     vfio_intx_disable_kvm(vdev);
399     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
400     vdev->intx.pending = false;
401     pci_irq_deassert(&vdev->pdev);
402     vfio_mmap_set_enabled(vdev, true);
403 
404     fd = event_notifier_get_fd(&vdev->intx.interrupt);
405     qemu_set_fd_handler(fd, NULL, NULL, vdev);
406     vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
407 
408     vdev->interrupt = VFIO_INT_NONE;
409 
410     trace_vfio_intx_disable(vdev->vbasedev.name);
411 }
412 
/*
 * Non-static entry point that forwards to vfio_intx_enable().
 * Returns that function's result; @errp is passed through unchanged.
 */
bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp)
{
    return vfio_intx_enable(vdev, errp);
}
417 
418 /*
419  * MSI/X
420  */
421 static void vfio_msi_interrupt(void *opaque)
422 {
423     VFIOMSIVector *vector = opaque;
424     VFIOPCIDevice *vdev = vector->vdev;
425     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
426     void (*notify)(PCIDevice *dev, unsigned vector);
427     MSIMessage msg;
428     int nr = vector - vdev->msi_vectors;
429 
430     if (!event_notifier_test_and_clear(&vector->interrupt)) {
431         return;
432     }
433 
434     if (vdev->interrupt == VFIO_INT_MSIX) {
435         get_msg = msix_get_message;
436         notify = msix_notify;
437 
438         /* A masked vector firing needs to use the PBA, enable it */
439         if (msix_is_masked(&vdev->pdev, nr)) {
440             set_bit(nr, vdev->msix->pending);
441             memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
442             trace_vfio_msix_pba_enable(vdev->vbasedev.name);
443         }
444     } else if (vdev->interrupt == VFIO_INT_MSI) {
445         get_msg = msi_get_message;
446         notify = msi_notify;
447     } else {
448         abort();
449     }
450 
451     msg = get_msg(&vdev->pdev, nr);
452     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
453     notify(&vdev->pdev, nr);
454 }
455 
456 void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr)
457 {
458     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
459     int fd = event_notifier_get_fd(&vector->interrupt);
460 
461     qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
462 }
463 
464 /*
465  * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid
466  * fd to kernel.
467  */
468 static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
469 {
470     g_autofree struct vfio_irq_set *irq_set = NULL;
471     int argsz;
472     int32_t *fd;
473 
474     argsz = sizeof(*irq_set) + sizeof(*fd);
475 
476     irq_set = g_malloc0(argsz);
477     irq_set->argsz = argsz;
478     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
479                      VFIO_IRQ_SET_ACTION_TRIGGER;
480     irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
481     irq_set->start = 0;
482     irq_set->count = 1;
483     fd = (int32_t *)&irq_set->data;
484     *fd = -1;
485 
486     return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
487 }
488 
489 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
490 {
491     struct vfio_irq_set *irq_set;
492     int ret = 0, i, argsz;
493     int32_t *fds;
494 
495     /*
496      * If dynamic MSI-X allocation is supported, the vectors to be allocated
497      * and enabled can be scattered. Before kernel enabling MSI-X, setting
498      * nr_vectors causes all these vectors to be allocated on host.
499      *
500      * To keep allocation as needed, use vector 0 with an invalid fd to get
501      * MSI-X enabled first, then set vectors with a potentially sparse set of
502      * eventfds to enable interrupts only when enabled in guest.
503      */
504     if (msix && !vdev->msix->noresize) {
505         ret = vfio_enable_msix_no_vec(vdev);
506 
507         if (ret) {
508             return ret;
509         }
510     }
511 
512     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
513 
514     irq_set = g_malloc0(argsz);
515     irq_set->argsz = argsz;
516     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
517     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
518     irq_set->start = 0;
519     irq_set->count = vdev->nr_vectors;
520     fds = (int32_t *)&irq_set->data;
521 
522     for (i = 0; i < vdev->nr_vectors; i++) {
523         int fd = -1;
524 
525         /*
526          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
527          * bits, therefore we always use the KVM signaling path when setup.
528          * MSI-X mask and pending bits are emulated, so we want to use the
529          * KVM signaling path only when configured and unmasked.
530          */
531         if (vdev->msi_vectors[i].use) {
532             if (vdev->msi_vectors[i].virq < 0 ||
533                 (msix && msix_is_masked(&vdev->pdev, i))) {
534                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
535             } else {
536                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
537             }
538         }
539 
540         fds[i] = fd;
541     }
542 
543     ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
544 
545     g_free(irq_set);
546 
547     return ret;
548 }
549 
550 void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
551                                int vector_n, bool msix)
552 {
553     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
554         return;
555     }
556 
557     vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change,
558                                              vector_n, &vdev->pdev);
559 }
560 
561 static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr)
562 {
563     const char *name = "kvm_interrupt";
564 
565     if (vector->virq < 0) {
566         return;
567     }
568 
569     if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr,
570                             NULL)) {
571         goto fail_notifier;
572     }
573 
574     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
575                                            NULL, vector->virq) < 0) {
576         goto fail_kvm;
577     }
578 
579     return;
580 
581 fail_kvm:
582     vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr);
583 fail_notifier:
584     kvm_irqchip_release_virq(kvm_state, vector->virq);
585     vector->virq = -1;
586 }
587 
588 static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
589                                      int nr)
590 {
591     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
592                                           vector->virq);
593     kvm_irqchip_release_virq(kvm_state, vector->virq);
594     vector->virq = -1;
595     vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr);
596 }
597 
/*
 * Update the KVM MSI route behind @vector->virq with message @msg for
 * @pdev, then commit the routing change.
 */
static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
                                     PCIDevice *pdev)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
    kvm_irqchip_commit_routes(kvm_state);
}
604 
605 static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector,
606                                unsigned int nr)
607 {
608     Error *err = NULL;
609     int32_t fd;
610 
611     if (vector->virq >= 0) {
612         fd = event_notifier_get_fd(&vector->kvm_interrupt);
613     } else {
614         fd = event_notifier_get_fd(&vector->interrupt);
615     }
616 
617     if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
618                                        VFIO_IRQ_SET_ACTION_TRIGGER,
619                                        fd, &err)) {
620         error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
621     }
622 }
623 
624 void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr)
625 {
626     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
627     PCIDevice *pdev = &vdev->pdev;
628     Error *local_err = NULL;
629 
630     vector->vdev = vdev;
631     vector->virq = -1;
632     if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr,
633                             &local_err)) {
634         error_report_err(local_err);
635     }
636     vector->use = true;
637     if (vdev->interrupt == VFIO_INT_MSIX) {
638         msix_vector_use(pdev, nr);
639     }
640 }
641 
/*
 * Bring MSI-X vector @nr of @pdev into use.
 *
 * @msg:     MSI message for the vector.  When NULL, an existing KVM route
 *           for the vector is removed and no new route is added.
 * @handler: fd handler installed on the vector's interrupt eventfd.
 *
 * The vector slot is initialized on first use, the KVM route is
 * added/updated/removed as needed, nr_vectors is grown when @nr lies
 * beyond it, and (unless KVM routing is being deferred) the device's
 * signaling is updated.  Finally the vector's pending bit is cleared and
 * PBA emulation is disabled if nothing remains pending.
 *
 * Always returns 0; failures along the way are only reported.
 */
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
    VFIOMSIVector *vector;
    int ret;
    /* True when @nr is beyond the currently tracked vector count */
    bool resizing = !!(vdev->nr_vectors < nr + 1);

    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);

    vector = &vdev->msi_vectors[nr];

    if (!vector->use) {
        vfio_pci_vector_init(vdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vdev, vector, nr);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg, pdev);
        }
    } else {
        if (msg) {
            if (vdev->defer_kvm_irq_routing) {
                /* Batched: commit and irqfd connect happen at batch commit */
                vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
            } else {
                vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
                vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
                kvm_irqchip_commit_route_changes(&vfio_route_change);
                vfio_connect_kvm_msi_virq(vector, nr);
            }
        }
    }

    /*
     * When dynamic allocation is not supported, we don't want to have the
     * host allocate all possible MSI vectors for a device if they're not
     * in use, so we shutdown and incrementally increase them as needed.
     * nr_vectors represents the total number of vectors allocated.
     *
     * When dynamic allocation is supported, let the host only allocate
     * and enable a vector when it is in use in guest. nr_vectors represents
     * the upper bound of vectors being enabled (but not all of the ranges
     * is allocated or enabled).
     */
    if (resizing) {
        vdev->nr_vectors = nr + 1;
    }

    if (!vdev->defer_kvm_irq_routing) {
        if (vdev->msix->noresize && resizing) {
            /* Cannot grow in place: disable and re-enable all vectors */
            vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
            ret = vfio_enable_vectors(vdev, true);
            if (ret) {
                error_report("vfio: failed to enable vectors, %s",
                             strerror(-ret));
            }
        } else {
            set_irq_signalling(&vdev->vbasedev, vector, nr);
        }
    }

    /* Disable PBA emulation when nothing more is pending. */
    clear_bit(nr, vdev->msix->pending);
    if (find_first_bit(vdev->msix->pending,
                       vdev->nr_vectors) == vdev->nr_vectors) {
        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
    }

    return 0;
}
722 
723 static int vfio_msix_vector_use(PCIDevice *pdev,
724                                 unsigned int nr, MSIMessage msg)
725 {
726     /*
727      * Ignore the callback from msix_set_vector_notifiers during resume.
728      * The necessary subset of these actions is called from
729      * vfio_cpr_claim_vectors during post load.
730      */
731     if (cpr_is_incoming()) {
732         return 0;
733     }
734 
735     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
736 }
737 
738 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
739 {
740     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
741     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
742 
743     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
744 
745     /*
746      * There are still old guests that mask and unmask vectors on every
747      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
748      * the KVM setup in place, simply switch VFIO to use the non-bypass
749      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
750      * core will mask the interrupt and set pending bits, allowing it to
751      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
752      */
753     if (vector->virq >= 0) {
754         int32_t fd = event_notifier_get_fd(&vector->interrupt);
755         Error *err = NULL;
756 
757         if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
758                                     nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
759                                     &err)) {
760             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
761         }
762     }
763 }
764 
/*
 * Register this file's MSI-X use/release callbacks for @vdev's PCI device
 * (no poll notifier).  The return value of msix_set_vector_notifiers() is
 * not checked here.
 */
void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev)
{
    msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                              vfio_msix_vector_release, NULL);
}
770 
/*
 * Start a batch of KVM MSI route additions for @vdev: set
 * defer_kvm_irq_routing and begin a route-change transaction in
 * vfio_route_change.  Must not be called while a batch is already open
 * (asserted).  Paired with vfio_pci_commit_kvm_msi_virq_batch().
 */
void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
    assert(!vdev->defer_kvm_irq_routing);
    vdev->defer_kvm_irq_routing = true;
    vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
}
777 
778 void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
779 {
780     int i;
781 
782     assert(vdev->defer_kvm_irq_routing);
783     vdev->defer_kvm_irq_routing = false;
784 
785     kvm_irqchip_commit_route_changes(&vfio_route_change);
786 
787     for (i = 0; i < vdev->nr_vectors; i++) {
788         vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i);
789     }
790 }
791 
/*
 * Put @vdev into MSI-X mode.
 *
 * Any current interrupt mode is disabled first, the vector array is
 * allocated (zeroed) with one slot per MSI-X table entry, and the vector
 * notifiers are installed inside a KVM routing batch.  Afterwards either
 * the in-use vectors are enabled on the device or, when none is in use,
 * MSI-X is enabled with no vectors.  Failures are reported only.
 */
static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    int ret;

    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Setting vector notifiers triggers synchronous vector-use
     * callbacks for each active vector.  Deferring to commit the KVM
     * routes once rather than per vector provides a substantial
     * performance improvement.
     */
    vfio_pci_prepare_kvm_msi_virq_batch(vdev);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    vfio_pci_commit_kvm_msi_virq_batch(vdev);

    if (vdev->nr_vectors) {
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %s",
                         strerror(-ret));
        }
    } else {
        /*
         * Some communication channels between VF & PF or PF & fw rely on the
         * physical state of the device and expect that enabling MSI-X from the
         * guest enables the same on the host.  When our guest is Linux, the
         * guest driver call to pci_enable_msix() sets the enabling bit in the
         * MSI-X capability, but leaves the vector table masked.  We therefore
         * can't rely on a vector_use callback (from request_irq() in the guest)
         * to switch the physical device into MSI-X mode because that may come a
         * long time after pci_enable_msix().  This code sets vector 0 with an
         * invalid fd to make the physical device MSI-X enabled, but with no
         * vectors enabled, just like the guest view.
         */
        ret = vfio_enable_msix_no_vec(vdev);
        if (ret) {
            error_report("vfio: failed to enable MSI-X, %s",
                         strerror(-ret));
        }
    }

    trace_vfio_msix_enable(vdev->vbasedev.name);
}
845 
/*
 * Put @vdev into MSI mode.
 *
 * Any current interrupt mode is disabled first.  All allocated MSI vectors
 * are then initialized inside a KVM routing batch and enabled on the
 * device.  If vfio_enable_vectors() returns a positive value, everything
 * is torn down and the setup is retried with that many vectors.  On a
 * negative return the device is left with interrupts disabled and the
 * failure is reported.
 */
static void vfio_msi_enable(VFIOPCIDevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    /*
     * Setting vector notifiers needs to enable route for each vector.
     * Deferring to commit the KVM routes once rather than per vector
     * provides a substantial performance improvement.
     */
    vfio_pci_prepare_kvm_msi_virq_batch(vdev);

    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        Error *local_err = NULL;

        vector->vdev = vdev;
        vector->virq = -1;
        vector->use = true;

        /* A notifier failure is reported; setup continues regardless */
        if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i,
                                &local_err)) {
            error_report_err(local_err);
        }

        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            vfio_msi_interrupt, NULL, vector);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vfio_pci_add_kvm_msi_virq(vdev, vector, i, false);
    }

    vfio_pci_commit_kvm_msi_virq_batch(vdev);

    /* Set interrupt type prior to possible interrupts */
    vdev->interrupt = VFIO_INT_MSI;

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %s",
                         strerror(-ret));
        } else {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        /* Frees the vector array and resets nr_vectors and interrupt mode */
        vfio_msi_disable_common(vdev);

        if (ret > 0) {
            vdev->nr_vectors = ret;
            goto retry;
        }

        /*
         * Failing to setup MSI doesn't really fall within any specification.
         * Let's try leaving interrupts disabled and hope the guest figures
         * out to fall back to INTx for this device.
         */
        error_report("vfio: Error: Failed to enable MSI");

        return;
    }

    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
}
920 
921 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
922 {
923     int i;
924 
925     for (i = 0; i < vdev->nr_vectors; i++) {
926         VFIOMSIVector *vector = &vdev->msi_vectors[i];
927         if (vdev->msi_vectors[i].use) {
928             if (vector->virq >= 0) {
929                 vfio_remove_kvm_msi_virq(vdev, vector, i);
930             }
931             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
932                                 NULL, NULL, NULL);
933             vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i);
934         }
935     }
936 
937     g_free(vdev->msi_vectors);
938     vdev->msi_vectors = NULL;
939     vdev->nr_vectors = 0;
940     vdev->interrupt = VFIO_INT_NONE;
941 }
942 
943 static void vfio_msix_disable(VFIOPCIDevice *vdev)
944 {
945     Error *err = NULL;
946     int i;
947 
948     msix_unset_vector_notifiers(&vdev->pdev);
949 
950     /*
951      * MSI-X will only release vectors if MSI-X is still enabled on the
952      * device, check through the rest and release it ourselves if necessary.
953      */
954     for (i = 0; i < vdev->nr_vectors; i++) {
955         if (vdev->msi_vectors[i].use) {
956             vfio_msix_vector_release(&vdev->pdev, i);
957             msix_vector_unuse(&vdev->pdev, i);
958         }
959     }
960 
961     /*
962      * Always clear MSI-X IRQ index. A PF device could have enabled
963      * MSI-X with no vectors. See vfio_msix_enable().
964      */
965     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
966 
967     vfio_msi_disable_common(vdev);
968     if (!vfio_intx_enable(vdev, &err)) {
969         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
970     }
971 
972     memset(vdev->msix->pending, 0,
973            BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
974 
975     trace_vfio_msix_disable(vdev->vbasedev.name);
976 }
977 
978 static void vfio_msi_disable(VFIOPCIDevice *vdev)
979 {
980     Error *err = NULL;
981 
982     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
983     vfio_msi_disable_common(vdev);
984     vfio_intx_enable(vdev, &err);
985     if (err) {
986         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
987     }
988 
989     trace_vfio_msi_disable(vdev->vbasedev.name);
990 }
991 
992 static void vfio_update_msi(VFIOPCIDevice *vdev)
993 {
994     int i;
995 
996     for (i = 0; i < vdev->nr_vectors; i++) {
997         VFIOMSIVector *vector = &vdev->msi_vectors[i];
998         MSIMessage msg;
999 
1000         if (!vector->use || vector->virq < 0) {
1001             continue;
1002         }
1003 
1004         msg = msi_get_message(&vdev->pdev, i);
1005         vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
1006     }
1007 }
1008 
1009 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
1010 {
1011     VFIODevice *vbasedev = &vdev->vbasedev;
1012     struct vfio_region_info *reg_info = NULL;
1013     uint64_t size;
1014     off_t off = 0;
1015     ssize_t bytes;
1016     int ret;
1017 
1018     ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX,
1019                                       &reg_info);
1020 
1021     if (ret != 0) {
1022         error_report("vfio: Error getting ROM info: %s", strerror(-ret));
1023         return;
1024     }
1025 
1026     trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size,
1027                             (unsigned long)reg_info->offset,
1028                             (unsigned long)reg_info->flags);
1029 
1030     vdev->rom_size = size = reg_info->size;
1031     vdev->rom_offset = reg_info->offset;
1032 
1033     if (!vdev->rom_size) {
1034         vdev->rom_read_failed = true;
1035         error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name);
1036         error_printf("Device option ROM contents are probably invalid "
1037                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
1038                     "or load from file with romfile=\n");
1039         return;
1040     }
1041 
1042     vdev->rom = g_malloc(size);
1043     memset(vdev->rom, 0xff, size);
1044 
1045     while (size) {
1046         bytes = vbasedev->io_ops->region_read(vbasedev,
1047                                               VFIO_PCI_ROM_REGION_INDEX,
1048                                               off, size, vdev->rom + off);
1049 
1050         if (bytes == 0) {
1051             break;
1052         } else if (bytes > 0) {
1053             off += bytes;
1054             size -= bytes;
1055         } else {
1056             if (bytes == -EINTR || bytes == -EAGAIN) {
1057                 continue;
1058             }
1059             error_report("vfio: Error reading device ROM: %s",
1060                          strreaderror(bytes));
1061 
1062             break;
1063         }
1064     }
1065 
1066     /*
1067      * Test the ROM signature against our device, if the vendor is correct
1068      * but the device ID doesn't match, store the correct device ID and
1069      * recompute the checksum.  Intel IGD devices need this and are known
1070      * to have bogus checksums so we can't simply adjust the checksum.
1071      */
1072     if (pci_get_word(vdev->rom) == 0xaa55 &&
1073         pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
1074         !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
1075         uint16_t vid, did;
1076 
1077         vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
1078         did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);
1079 
1080         if (vid == vdev->vendor_id && did != vdev->device_id) {
1081             int i;
1082             uint8_t csum, *data = vdev->rom;
1083 
1084             pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
1085                          vdev->device_id);
1086             data[6] = 0;
1087 
1088             for (csum = 0, i = 0; i < vdev->rom_size; i++) {
1089                 csum += data[i];
1090             }
1091 
1092             data[6] = -csum;
1093         }
1094     }
1095 }
1096 
1097 /* "Raw" read of underlying config space. */
1098 static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
1099                                       uint32_t size, void *data)
1100 {
1101     return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev,
1102                                               VFIO_PCI_CONFIG_REGION_INDEX,
1103                                               offset, size, data);
1104 }
1105 
1106 /* "Raw" write of underlying config space. */
1107 static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
1108                                        uint32_t size, void *data)
1109 {
1110     return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
1111                                                VFIO_PCI_CONFIG_REGION_INDEX,
1112                                                offset, size, data, false);
1113 }
1114 
1115 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
1116 {
1117     VFIOPCIDevice *vdev = opaque;
1118     union {
1119         uint8_t byte;
1120         uint16_t word;
1121         uint32_t dword;
1122         uint64_t qword;
1123     } val;
1124     uint64_t data = 0;
1125 
1126     /* Load the ROM lazily when the guest tries to read it */
1127     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
1128         vfio_pci_load_rom(vdev);
1129     }
1130 
1131     memcpy(&val, vdev->rom + addr,
1132            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
1133 
1134     switch (size) {
1135     case 1:
1136         data = val.byte;
1137         break;
1138     case 2:
1139         data = le16_to_cpu(val.word);
1140         break;
1141     case 4:
1142         data = le32_to_cpu(val.dword);
1143         break;
1144     default:
1145         hw_error("vfio: unsupported read size, %d bytes\n", size);
1146         break;
1147     }
1148 
1149     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
1150 
1151     return data;
1152 }
1153 
/*
 * MemoryRegionOps write handler for the ROM BAR.  The body is empty, so
 * every write to the region is discarded.
 */
static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}
1158 
1159 static const MemoryRegionOps vfio_rom_ops = {
1160     .read = vfio_rom_read,
1161     .write = vfio_rom_write,
1162     .endianness = DEVICE_LITTLE_ENDIAN,
1163 };
1164 
1165 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
1166 {
1167     VFIODevice *vbasedev = &vdev->vbasedev;
1168     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
1169     char *name;
1170 
1171     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
1172         /* Since pci handles romfile, just print a message and return */
1173         if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) {
1174             warn_report("Device at %s is known to cause system instability"
1175                         " issues during option rom execution",
1176                         vdev->vbasedev.name);
1177             error_printf("Proceeding anyway since user specified romfile\n");
1178         }
1179         return;
1180     }
1181 
1182     /*
1183      * Use the same size ROM BAR as the physical device.  The contents
1184      * will get filled in later when the guest tries to read it.
1185      */
1186     if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 ||
1187         vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
1188         vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
1189         vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) {
1190 
1191         error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
1192         return;
1193     }
1194 
1195     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
1196 
1197     if (!size) {
1198         return;
1199     }
1200 
1201     if (vfio_opt_rom_in_denylist(vdev)) {
1202         if (vdev->pdev.rom_bar > 0) {
1203             warn_report("Device at %s is known to cause system instability"
1204                         " issues during option rom execution",
1205                         vdev->vbasedev.name);
1206             error_printf("Proceeding anyway since user specified"
1207                          " positive value for rombar\n");
1208         } else {
1209             warn_report("Rom loading for device at %s has been disabled"
1210                         " due to system instability issues",
1211                         vdev->vbasedev.name);
1212             error_printf("Specify rombar=1 or romfile to force\n");
1213             return;
1214         }
1215     }
1216 
1217     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
1218 
1219     name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);
1220 
1221     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
1222                           &vfio_rom_ops, vdev, name, size);
1223     g_free(name);
1224 
1225     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
1226                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
1227 
1228     vdev->rom_read_failed = false;
1229 }
1230 
1231 void vfio_vga_write(void *opaque, hwaddr addr,
1232                            uint64_t data, unsigned size)
1233 {
1234     VFIOVGARegion *region = opaque;
1235     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1236     union {
1237         uint8_t byte;
1238         uint16_t word;
1239         uint32_t dword;
1240         uint64_t qword;
1241     } buf;
1242     off_t offset = vga->fd_offset + region->offset + addr;
1243 
1244     switch (size) {
1245     case 1:
1246         buf.byte = data;
1247         break;
1248     case 2:
1249         buf.word = cpu_to_le16(data);
1250         break;
1251     case 4:
1252         buf.dword = cpu_to_le32(data);
1253         break;
1254     default:
1255         hw_error("vfio: unsupported write size, %d bytes", size);
1256         break;
1257     }
1258 
1259     if (pwrite(vga->fd, &buf, size, offset) != size) {
1260         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1261                      __func__, region->offset + addr, data, size);
1262     }
1263 
1264     trace_vfio_vga_write(region->offset + addr, data, size);
1265 }
1266 
1267 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1268 {
1269     VFIOVGARegion *region = opaque;
1270     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1271     union {
1272         uint8_t byte;
1273         uint16_t word;
1274         uint32_t dword;
1275         uint64_t qword;
1276     } buf;
1277     uint64_t data = 0;
1278     off_t offset = vga->fd_offset + region->offset + addr;
1279 
1280     if (pread(vga->fd, &buf, size, offset) != size) {
1281         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1282                      __func__, region->offset + addr, size);
1283         return (uint64_t)-1;
1284     }
1285 
1286     switch (size) {
1287     case 1:
1288         data = buf.byte;
1289         break;
1290     case 2:
1291         data = le16_to_cpu(buf.word);
1292         break;
1293     case 4:
1294         data = le32_to_cpu(buf.dword);
1295         break;
1296     default:
1297         hw_error("vfio: unsupported read size, %d bytes", size);
1298         break;
1299     }
1300 
1301     trace_vfio_vga_read(region->offset + addr, size, data);
1302 
1303     return data;
1304 }
1305 
1306 static const MemoryRegionOps vfio_vga_ops = {
1307     .read = vfio_vga_read,
1308     .write = vfio_vga_write,
1309     .endianness = DEVICE_LITTLE_ENDIAN,
1310 };
1311 
1312 /*
1313  * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page
1314  * size if the BAR is in an exclusive page in host so that we could map
1315  * this BAR to guest. But this sub-page BAR may not occupy an exclusive
1316  * page in guest. So we should set the priority of the expanded memory
1317  * region to zero in case of overlap with BARs which share the same page
1318  * with the sub-page BAR in guest. Besides, we should also recover the
1319  * size of this sub-page BAR when its base address is changed in guest
1320  * and not page aligned any more.
1321  */
1322 static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
1323 {
1324     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1325     VFIORegion *region = &vdev->bars[bar].region;
1326     MemoryRegion *mmap_mr, *region_mr, *base_mr;
1327     PCIIORegion *r;
1328     pcibus_t bar_addr;
1329     uint64_t size = region->size;
1330 
1331     /* Make sure that the whole region is allowed to be mmapped */
1332     if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
1333         region->mmaps[0].size != region->size) {
1334         return;
1335     }
1336 
1337     r = &pdev->io_regions[bar];
1338     bar_addr = r->addr;
1339     base_mr = vdev->bars[bar].mr;
1340     region_mr = region->mem;
1341     mmap_mr = &region->mmaps[0].mem;
1342 
1343     /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
1344     if (bar_addr != PCI_BAR_UNMAPPED &&
1345         !(bar_addr & ~qemu_real_host_page_mask())) {
1346         size = qemu_real_host_page_size();
1347     }
1348 
1349     memory_region_transaction_begin();
1350 
1351     if (vdev->bars[bar].size < size) {
1352         memory_region_set_size(base_mr, size);
1353     }
1354     memory_region_set_size(region_mr, size);
1355     memory_region_set_size(mmap_mr, size);
1356     if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
1357         memory_region_del_subregion(r->address_space, base_mr);
1358         memory_region_add_subregion_overlap(r->address_space,
1359                                             bar_addr, base_mr, 0);
1360     }
1361 
1362     memory_region_transaction_commit();
1363 }
1364 
1365 /*
1366  * PCI config space
1367  */
1368 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1369 {
1370     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1371     VFIODevice *vbasedev = &vdev->vbasedev;
1372     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1373 
1374     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1375     emu_bits = le32_to_cpu(emu_bits);
1376 
1377     if (emu_bits) {
1378         emu_val = pci_default_read_config(pdev, addr, len);
1379     }
1380 
1381     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1382         ssize_t ret;
1383 
1384         ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val);
1385         if (ret != len) {
1386             error_report("%s(%s, 0x%x, 0x%x) failed: %s",
1387                          __func__, vbasedev->name, addr, len,
1388                          strreaderror(ret));
1389             return -1;
1390         }
1391         phys_val = le32_to_cpu(phys_val);
1392     }
1393 
1394     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1395 
1396     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1397 
1398     return val;
1399 }
1400 
1401 void vfio_pci_write_config(PCIDevice *pdev,
1402                            uint32_t addr, uint32_t val, int len)
1403 {
1404     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1405     VFIODevice *vbasedev = &vdev->vbasedev;
1406     uint32_t val_le = cpu_to_le32(val);
1407     int ret;
1408 
1409     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1410 
1411     /* Write everything to VFIO, let it filter out what we can't write */
1412     ret = vfio_pci_config_space_write(vdev, addr, len, &val_le);
1413     if (ret != len) {
1414         error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
1415                      __func__, vbasedev->name, addr, val, len,
1416                     strwriteerror(ret));
1417     }
1418 
1419     /* MSI/MSI-X Enabling/Disabling */
1420     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1421         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1422         int is_enabled, was_enabled = msi_enabled(pdev);
1423 
1424         pci_default_write_config(pdev, addr, val, len);
1425 
1426         is_enabled = msi_enabled(pdev);
1427 
1428         if (!was_enabled) {
1429             if (is_enabled) {
1430                 vfio_msi_enable(vdev);
1431             }
1432         } else {
1433             if (!is_enabled) {
1434                 vfio_msi_disable(vdev);
1435             } else {
1436                 vfio_update_msi(vdev);
1437             }
1438         }
1439     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1440         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1441         int is_enabled, was_enabled = msix_enabled(pdev);
1442 
1443         pci_default_write_config(pdev, addr, val, len);
1444 
1445         is_enabled = msix_enabled(pdev);
1446 
1447         if (!was_enabled && is_enabled) {
1448             vfio_msix_enable(vdev);
1449         } else if (was_enabled && !is_enabled) {
1450             vfio_msix_disable(vdev);
1451         }
1452     } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
1453         range_covers_byte(addr, len, PCI_COMMAND)) {
1454         pcibus_t old_addr[PCI_NUM_REGIONS - 1];
1455         int bar;
1456 
1457         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1458             old_addr[bar] = pdev->io_regions[bar].addr;
1459         }
1460 
1461         pci_default_write_config(pdev, addr, val, len);
1462 
1463         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1464             if (old_addr[bar] != pdev->io_regions[bar].addr &&
1465                 vdev->bars[bar].region.size > 0 &&
1466                 vdev->bars[bar].region.size < qemu_real_host_page_size()) {
1467                 vfio_sub_page_bar_update_mapping(pdev, bar);
1468             }
1469         }
1470     } else {
1471         /* Write everything to QEMU to keep emulated bits correct */
1472         pci_default_write_config(pdev, addr, val, len);
1473     }
1474 }
1475 
1476 /*
1477  * Interrupt setup
1478  */
1479 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1480 {
1481     /*
1482      * More complicated than it looks.  Disabling MSI/X transitions the
1483      * device to INTx mode (if supported).  Therefore we need to first
1484      * disable MSI/X and then cleanup by disabling INTx.
1485      */
1486     if (vdev->interrupt == VFIO_INT_MSIX) {
1487         vfio_msix_disable(vdev);
1488     } else if (vdev->interrupt == VFIO_INT_MSI) {
1489         vfio_msi_disable(vdev);
1490     }
1491 
1492     if (vdev->interrupt == VFIO_INT_INTx) {
1493         vfio_intx_disable(vdev);
1494     }
1495 }
1496 
1497 static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1498 {
1499     uint16_t ctrl;
1500     bool msi_64bit, msi_maskbit;
1501     int ret, entries;
1502     Error *err = NULL;
1503 
1504     ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS,
1505                                      sizeof(ctrl), &ctrl);
1506     if (ret != sizeof(ctrl)) {
1507         error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s",
1508                    strreaderror(ret));
1509         return false;
1510     }
1511     ctrl = le16_to_cpu(ctrl);
1512 
1513     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1514     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1515     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1516 
1517     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1518 
1519     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
1520     if (ret < 0) {
1521         if (ret == -ENOTSUP) {
1522             return true;
1523         }
1524         error_propagate_prepend(errp, err, "msi_init failed: ");
1525         return false;
1526     }
1527     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
1528 
1529     return true;
1530 }
1531 
1532 static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
1533 {
1534     off_t start, end;
1535     VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
1536 
1537     /*
1538      * If the host driver allows mapping of a MSIX data, we are going to
1539      * do map the entire BAR and emulate MSIX table on top of that.
1540      */
1541     if (vfio_device_has_region_cap(&vdev->vbasedev, region->nr,
1542                                    VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
1543         return;
1544     }
1545 
1546     /*
1547      * We expect to find a single mmap covering the whole BAR, anything else
1548      * means it's either unsupported or already setup.
1549      */
1550     if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
1551         region->size != region->mmaps[0].size) {
1552         return;
1553     }
1554 
1555     /* MSI-X table start and end aligned to host page size */
1556     start = vdev->msix->table_offset & qemu_real_host_page_mask();
1557     end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
1558                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
1559 
1560     /*
1561      * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
1562      * NB - Host page size is necessarily a power of two and so is the PCI
1563      * BAR (not counting EA yet), therefore if we have host page aligned
1564      * @start and @end, then any remainder of the BAR before or after those
1565      * must be at least host page sized and therefore mmap'able.
1566      */
1567     if (!start) {
1568         if (end >= region->size) {
1569             region->nr_mmaps = 0;
1570             g_free(region->mmaps);
1571             region->mmaps = NULL;
1572             trace_vfio_msix_fixup(vdev->vbasedev.name,
1573                                   vdev->msix->table_bar, 0, 0);
1574         } else {
1575             region->mmaps[0].offset = end;
1576             region->mmaps[0].size = region->size - end;
1577             trace_vfio_msix_fixup(vdev->vbasedev.name,
1578                               vdev->msix->table_bar, region->mmaps[0].offset,
1579                               region->mmaps[0].offset + region->mmaps[0].size);
1580         }
1581 
1582     /* Maybe it's aligned at the end of the BAR */
1583     } else if (end >= region->size) {
1584         region->mmaps[0].size = start;
1585         trace_vfio_msix_fixup(vdev->vbasedev.name,
1586                               vdev->msix->table_bar, region->mmaps[0].offset,
1587                               region->mmaps[0].offset + region->mmaps[0].size);
1588 
1589     /* Otherwise it must split the BAR */
1590     } else {
1591         region->nr_mmaps = 2;
1592         region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
1593 
1594         memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));
1595 
1596         region->mmaps[0].size = start;
1597         trace_vfio_msix_fixup(vdev->vbasedev.name,
1598                               vdev->msix->table_bar, region->mmaps[0].offset,
1599                               region->mmaps[0].offset + region->mmaps[0].size);
1600 
1601         region->mmaps[1].offset = end;
1602         region->mmaps[1].size = region->size - end;
1603         trace_vfio_msix_fixup(vdev->vbasedev.name,
1604                               vdev->msix->table_bar, region->mmaps[1].offset,
1605                               region->mmaps[1].offset + region->mmaps[1].size);
1606     }
1607 }
1608 
1609 static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
1610 {
1611     int target_bar = -1;
1612     size_t msix_sz;
1613 
1614     if (!vdev->msix || vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1615         return true;
1616     }
1617 
1618     /* The actual minimum size of MSI-X structures */
1619     msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
1620               (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
1621     /* Round up to host pages, we don't want to share a page */
1622     msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
1623     /* PCI BARs must be a power of 2 */
1624     msix_sz = pow2ceil(msix_sz);
1625 
1626     if (vdev->msix_relo == OFF_AUTO_PCIBAR_AUTO) {
1627         /*
1628          * TODO: Lookup table for known devices.
1629          *
1630          * Logically we might use an algorithm here to select the BAR adding
1631          * the least additional MMIO space, but we cannot programmatically
1632          * predict the driver dependency on BAR ordering or sizing, therefore
1633          * 'auto' becomes a lookup for combinations reported to work.
1634          */
1635         if (target_bar < 0) {
1636             error_setg(errp, "No automatic MSI-X relocation available for "
1637                        "device %04x:%04x", vdev->vendor_id, vdev->device_id);
1638             return false;
1639         }
1640     } else {
1641         target_bar = (int)(vdev->msix_relo - OFF_AUTO_PCIBAR_BAR0);
1642     }
1643 
1644     /* I/O port BARs cannot host MSI-X structures */
1645     if (vdev->bars[target_bar].ioport) {
1646         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1647                    "I/O port BAR", target_bar);
1648         return false;
1649     }
1650 
1651     /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
1652     if (!vdev->bars[target_bar].size &&
1653          target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
1654         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1655                    "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
1656         return false;
1657     }
1658 
1659     /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
1660     if (vdev->bars[target_bar].size > 1 * GiB &&
1661         !vdev->bars[target_bar].mem64) {
1662         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1663                    "no space to extend 32-bit BAR", target_bar);
1664         return false;
1665     }
1666 
1667     /*
1668      * If adding a new BAR, test if we can make it 64bit.  We make it
1669      * prefetchable since QEMU MSI-X emulation has no read side effects
1670      * and doing so makes mapping more flexible.
1671      */
1672     if (!vdev->bars[target_bar].size) {
1673         if (target_bar < (PCI_ROM_SLOT - 1) &&
1674             !vdev->bars[target_bar + 1].size) {
1675             vdev->bars[target_bar].mem64 = true;
1676             vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
1677         }
1678         vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
1679         vdev->bars[target_bar].size = msix_sz;
1680         vdev->msix->table_offset = 0;
1681     } else {
1682         vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
1683                                           msix_sz * 2);
1684         /*
1685          * Due to above size calc, MSI-X always starts halfway into the BAR,
1686          * which will always be a separate host page.
1687          */
1688         vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
1689     }
1690 
1691     vdev->msix->table_bar = target_bar;
1692     vdev->msix->pba_bar = target_bar;
1693     /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
1694     vdev->msix->pba_offset = vdev->msix->table_offset +
1695                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
1696 
1697     trace_vfio_msix_relo(vdev->vbasedev.name,
1698                          vdev->msix->table_bar, vdev->msix->table_offset);
1699     return true;
1700 }
1701 
1702 /*
1703  * We don't have any control over how pci_add_capability() inserts
1704  * capabilities into the chain.  In order to setup MSI-X we need a
1705  * MemoryRegion for the BAR.  In order to setup the BAR and not
1706  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1707  * need to first look for where the MSI-X table lives.  So we
1708  * unfortunately split MSI-X setup across two functions.
1709  */
1710 static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
1711 {
1712     uint8_t pos;
1713     uint16_t ctrl;
1714     uint32_t table, pba;
1715     struct vfio_irq_info irq_info;
1716     VFIOMSIXInfo *msix;
1717     int ret;
1718 
1719     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
1720     if (!pos) {
1721         return true;
1722     }
1723 
1724     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS,
1725                                      sizeof(ctrl), &ctrl);
1726     if (ret != sizeof(ctrl)) {
1727         error_setg(errp, "failed to read PCI MSIX FLAGS: %s",
1728                    strreaderror(ret));
1729         return false;
1730     }
1731 
1732     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE,
1733                                      sizeof(table), &table);
1734     if (ret != sizeof(table)) {
1735         error_setg(errp, "failed to read PCI MSIX TABLE: %s",
1736                    strreaderror(ret));
1737         return false;
1738     }
1739 
1740     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA,
1741                                      sizeof(pba), &pba);
1742     if (ret != sizeof(pba)) {
1743         error_setg(errp, "failed to read PCI MSIX PBA: %s", strreaderror(ret));
1744         return false;
1745     }
1746 
1747     ctrl = le16_to_cpu(ctrl);
1748     table = le32_to_cpu(table);
1749     pba = le32_to_cpu(pba);
1750 
1751     msix = g_malloc0(sizeof(*msix));
1752     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
1753     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
1754     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
1755     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
1756     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
1757 
1758     ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
1759                                    &irq_info);
1760     if (ret < 0) {
1761         error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
1762         g_free(msix);
1763         return false;
1764     }
1765 
1766     msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
1767 
1768     /*
1769      * Test the size of the pba_offset variable and catch if it extends outside
1770      * of the specified BAR. If it is the case, we need to apply a hardware
1771      * specific quirk if the device is known or we have a broken configuration.
1772      */
1773     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
1774         /*
1775          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
1776          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
1777          * the VF PBA offset while the BAR itself is only 8k. The correct value
1778          * is 0x1000, so we hard code that here.
1779          */
1780         if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
1781             (vdev->device_id & 0xff00) == 0x5800) {
1782             msix->pba_offset = 0x1000;
1783         /*
1784          * BAIDU KUNLUN Virtual Function devices for KUNLUN AI processor
1785          * return an incorrect value of 0x460000 for the VF PBA offset while
1786          * the BAR itself is only 0x10000.  The correct value is 0xb400.
1787          */
1788         } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU,
1789                                PCI_DEVICE_ID_KUNLUN_VF)) {
1790             msix->pba_offset = 0xb400;
1791         } else if (vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1792             error_setg(errp, "hardware reports invalid configuration, "
1793                        "MSIX PBA outside of specified BAR");
1794             g_free(msix);
1795             return false;
1796         }
1797     }
1798 
1799     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
1800                                 msix->table_offset, msix->entries,
1801                                 msix->noresize);
1802     vdev->msix = msix;
1803 
1804     vfio_pci_fixup_msix_region(vdev);
1805 
1806     return vfio_pci_relocate_msix(vdev, errp);
1807 }
1808 
1809 static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1810 {
1811     int ret;
1812     Error *err = NULL;
1813 
1814     vdev->msix->pending = g_new0(unsigned long,
1815                                  BITS_TO_LONGS(vdev->msix->entries));
1816     ret = msix_init(&vdev->pdev, vdev->msix->entries,
1817                     vdev->bars[vdev->msix->table_bar].mr,
1818                     vdev->msix->table_bar, vdev->msix->table_offset,
1819                     vdev->bars[vdev->msix->pba_bar].mr,
1820                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
1821                     &err);
1822     if (ret < 0) {
1823         if (ret == -ENOTSUP) {
1824             warn_report_err(err);
1825             return true;
1826         }
1827 
1828         error_propagate(errp, err);
1829         return false;
1830     }
1831 
1832     /*
1833      * The PCI spec suggests that devices provide additional alignment for
1834      * MSI-X structures and avoid overlapping non-MSI-X related registers.
1835      * For an assigned device, this hopefully means that emulation of MSI-X
1836      * structures does not affect the performance of the device.  If devices
1837      * fail to provide that alignment, a significant performance penalty may
1838      * result, for instance Mellanox MT27500 VFs:
1839      * http://www.spinics.net/lists/kvm/msg125881.html
1840      *
1841      * The PBA is simply not that important for such a serious regression and
1842      * most drivers do not appear to look at it.  The solution for this is to
1843      * disable the PBA MemoryRegion unless it's being used.  We disable it
1844      * here and only enable it if a masked vector fires through QEMU.  As the
1845      * vector-use notifier is called, which occurs on unmask, we test whether
1846      * PBA emulation is needed and again disable if not.
1847      */
1848     memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
1849 
1850     /*
1851      * The emulated machine may provide a paravirt interface for MSIX setup
1852      * so it is not strictly necessary to emulate MSIX here. This becomes
1853      * helpful when frequently accessed MMIO registers are located in
1854      * subpages adjacent to the MSIX table but the MSIX data containing page
1855      * cannot be mapped because of a host page size bigger than the MSIX table
1856      * alignment.
1857      */
1858     if (object_property_get_bool(OBJECT(qdev_get_machine()),
1859                                  "vfio-no-msix-emulation", NULL)) {
1860         memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
1861     }
1862 
1863     return true;
1864 }
1865 
1866 void vfio_pci_teardown_msi(VFIOPCIDevice *vdev)
1867 {
1868     msi_uninit(&vdev->pdev);
1869 
1870     if (vdev->msix) {
1871         msix_uninit(&vdev->pdev,
1872                     vdev->bars[vdev->msix->table_bar].mr,
1873                     vdev->bars[vdev->msix->pba_bar].mr);
1874         g_free(vdev->msix->pending);
1875     }
1876 }
1877 
1878 /*
1879  * Resource setup
1880  */
1881 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
1882 {
1883     int i;
1884 
1885     for (i = 0; i < PCI_ROM_SLOT; i++) {
1886         vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
1887     }
1888 }
1889 
1890 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
1891 {
1892     VFIOBAR *bar = &vdev->bars[nr];
1893 
1894     uint32_t pci_bar;
1895     int ret;
1896 
1897     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
1898     if (!bar->region.size) {
1899         return;
1900     }
1901 
1902     /* Determine what type of BAR this is for registration */
1903     ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr),
1904                                      sizeof(pci_bar), &pci_bar);
1905     if (ret != sizeof(pci_bar)) {
1906         error_report("vfio: Failed to read BAR %d: %s", nr, strreaderror(ret));
1907         return;
1908     }
1909 
1910     pci_bar = le32_to_cpu(pci_bar);
1911     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
1912     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
1913     bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
1914                                          ~PCI_BASE_ADDRESS_MEM_MASK);
1915     bar->size = bar->region.size;
1916 
1917     /* IO regions are sync, memory can be async */
1918     bar->region.post_wr = (bar->ioport == 0);
1919 }
1920 
1921 static void vfio_bars_prepare(VFIOPCIDevice *vdev)
1922 {
1923     int i;
1924 
1925     for (i = 0; i < PCI_ROM_SLOT; i++) {
1926         vfio_bar_prepare(vdev, i);
1927     }
1928 }
1929 
1930 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
1931 {
1932     VFIOBAR *bar = &vdev->bars[nr];
1933     char *name;
1934 
1935     if (!bar->size) {
1936         return;
1937     }
1938 
1939     bar->mr = g_new0(MemoryRegion, 1);
1940     name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
1941     memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
1942     g_free(name);
1943 
1944     if (bar->region.size) {
1945         memory_region_add_subregion(bar->mr, 0, bar->region.mem);
1946 
1947         if (vfio_region_mmap(&bar->region)) {
1948             error_report("Failed to mmap %s BAR %d. Performance may be slow",
1949                          vdev->vbasedev.name, nr);
1950         }
1951     }
1952 
1953     pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
1954 }
1955 
1956 static void vfio_bars_register(VFIOPCIDevice *vdev)
1957 {
1958     int i;
1959 
1960     for (i = 0; i < PCI_ROM_SLOT; i++) {
1961         vfio_bar_register(vdev, i);
1962     }
1963 }
1964 
1965 void vfio_pci_bars_exit(VFIOPCIDevice *vdev)
1966 {
1967     int i;
1968 
1969     for (i = 0; i < PCI_ROM_SLOT; i++) {
1970         VFIOBAR *bar = &vdev->bars[i];
1971 
1972         vfio_bar_quirk_exit(vdev, i);
1973         vfio_region_exit(&bar->region);
1974         if (bar->region.size) {
1975             memory_region_del_subregion(bar->mr, bar->region.mem);
1976         }
1977     }
1978 
1979     if (vdev->vga) {
1980         pci_unregister_vga(&vdev->pdev);
1981         vfio_vga_quirk_exit(vdev);
1982     }
1983 }
1984 
1985 static void vfio_bars_finalize(VFIOPCIDevice *vdev)
1986 {
1987     int i;
1988 
1989     for (i = 0; i < PCI_ROM_SLOT; i++) {
1990         VFIOBAR *bar = &vdev->bars[i];
1991 
1992         vfio_bar_quirk_finalize(vdev, i);
1993         vfio_region_finalize(&bar->region);
1994         if (bar->mr) {
1995             assert(bar->size);
1996             object_unparent(OBJECT(bar->mr));
1997             g_free(bar->mr);
1998             bar->mr = NULL;
1999         }
2000     }
2001 
2002     if (vdev->vga) {
2003         vfio_vga_quirk_finalize(vdev);
2004         for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
2005             object_unparent(OBJECT(&vdev->vga->region[i].mem));
2006         }
2007         g_free(vdev->vga);
2008     }
2009 }
2010 
2011 /*
2012  * General setup
2013  */
2014 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2015 {
2016     uint8_t tmp;
2017     uint16_t next = PCI_CONFIG_SPACE_SIZE;
2018 
2019     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2020          tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
2021         if (tmp > pos && tmp < next) {
2022             next = tmp;
2023         }
2024     }
2025 
2026     return next - pos;
2027 }
2028 
2029 
2030 static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
2031 {
2032     uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
2033 
2034     for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
2035         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
2036         if (tmp > pos && tmp < next) {
2037             next = tmp;
2038         }
2039     }
2040 
2041     return next - pos;
2042 }
2043 
/*
 * Clear the @mask bits of the 16-bit value stored at @buf and OR in @val.
 * Note that @val is not itself restricted to @mask.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t merged = (pci_get_word(buf) & ~mask) | val;

    pci_set_word(buf, merged);
}
2048 
2049 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
2050                                    uint16_t val, uint16_t mask)
2051 {
2052     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2053     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2054     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2055 }
2056 
/*
 * Clear the @mask bits of the 32-bit value stored at @buf and OR in @val.
 * Note that @val is not itself restricted to @mask.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t merged = (pci_get_long(buf) & ~mask) | val;

    pci_set_long(buf, merged);
}
2061 
2062 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
2063                                    uint32_t val, uint32_t mask)
2064 {
2065     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2066     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2067     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2068 }
2069 
2070 static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev)
2071 {
2072     struct vfio_device_info_cap_pci_atomic_comp *cap;
2073     g_autofree struct vfio_device_info *info = NULL;
2074     PCIBus *bus = pci_get_bus(&vdev->pdev);
2075     PCIDevice *parent = bus->parent_dev;
2076     struct vfio_info_cap_header *hdr;
2077     uint32_t mask = 0;
2078     uint8_t *pos;
2079 
2080     /*
2081      * PCIe Atomic Ops completer support is only added automatically for single
2082      * function devices downstream of a root port supporting DEVCAP2.  Support
2083      * is added during realize and, if added, removed during device exit.  The
2084      * single function requirement avoids conflicting requirements should a
2085      * slot be composed of multiple devices with differing capabilities.
2086      */
2087     if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap ||
2088         pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT ||
2089         pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2 ||
2090         vdev->pdev.devfn ||
2091         vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
2092         return;
2093     }
2094 
2095     pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
2096 
2097     /* Abort if there'a already an Atomic Ops configuration on the root port */
2098     if (pci_get_long(pos) & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2099                              PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
2100                              PCI_EXP_DEVCAP2_ATOMIC_COMP128)) {
2101         return;
2102     }
2103 
2104     info = vfio_get_device_info(vdev->vbasedev.fd);
2105     if (!info) {
2106         return;
2107     }
2108 
2109     hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP);
2110     if (!hdr) {
2111         return;
2112     }
2113 
2114     cap = (void *)hdr;
2115     if (cap->flags & VFIO_PCI_ATOMIC_COMP32) {
2116         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP32;
2117     }
2118     if (cap->flags & VFIO_PCI_ATOMIC_COMP64) {
2119         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP64;
2120     }
2121     if (cap->flags & VFIO_PCI_ATOMIC_COMP128) {
2122         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP128;
2123     }
2124 
2125     if (!mask) {
2126         return;
2127     }
2128 
2129     pci_long_test_and_set_mask(pos, mask);
2130     vdev->clear_parent_atomics_on_exit = true;
2131 }
2132 
2133 static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev)
2134 {
2135     if (vdev->clear_parent_atomics_on_exit) {
2136         PCIDevice *parent = pci_get_bus(&vdev->pdev)->parent_dev;
2137         uint8_t *pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
2138 
2139         pci_long_test_and_clear_mask(pos, PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2140                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
2141                                           PCI_EXP_DEVCAP2_ATOMIC_COMP128);
2142     }
2143 }
2144 
2145 static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
2146                                 Error **errp)
2147 {
2148     uint16_t flags;
2149     uint8_t type;
2150 
2151     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
2152     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
2153 
2154     if (type != PCI_EXP_TYPE_ENDPOINT &&
2155         type != PCI_EXP_TYPE_LEG_END &&
2156         type != PCI_EXP_TYPE_RC_END) {
2157 
2158         error_setg(errp, "assignment of PCIe type 0x%x "
2159                    "devices is not currently supported", type);
2160         return false;
2161     }
2162 
2163     if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
2164         PCIBus *bus = pci_get_bus(&vdev->pdev);
2165         PCIDevice *bridge;
2166 
2167         /*
2168          * Traditionally PCI device assignment exposes the PCIe capability
2169          * as-is on non-express buses.  The reason being that some drivers
2170          * simply assume that it's there, for example tg3.  However when
2171          * we're running on a native PCIe machine type, like Q35, we need
2172          * to hide the PCIe capability.  The reason for this is twofold;
2173          * first Windows guests get a Code 10 error when the PCIe capability
2174          * is exposed in this configuration.  Therefore express devices won't
2175          * work at all unless they're attached to express buses in the VM.
2176          * Second, a native PCIe machine introduces the possibility of fine
2177          * granularity IOMMUs supporting both translation and isolation.
2178          * Guest code to discover the IOMMU visibility of a device, such as
2179          * IOMMU grouping code on Linux, is very aware of device types and
2180          * valid transitions between bus types.  An express device on a non-
2181          * express bus is not a valid combination on bare metal systems.
2182          *
2183          * Drivers that require a PCIe capability to make the device
2184          * functional are simply going to need to have their devices placed
2185          * on a PCIe bus in the VM.
2186          */
2187         while (!pci_bus_is_root(bus)) {
2188             bridge = pci_bridge_get_device(bus);
2189             bus = pci_get_bus(bridge);
2190         }
2191 
2192         if (pci_bus_is_express(bus)) {
2193             return true;
2194         }
2195 
2196     } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
2197         /*
2198          * On a Root Complex bus Endpoints become Root Complex Integrated
2199          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
2200          */
2201         if (type == PCI_EXP_TYPE_ENDPOINT) {
2202             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2203                                    PCI_EXP_TYPE_RC_END << 4,
2204                                    PCI_EXP_FLAGS_TYPE);
2205 
2206             /* Link Capabilities, Status, and Control goes away */
2207             if (size > PCI_EXP_LNKCTL) {
2208                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
2209                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2210                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
2211 
2212 #ifndef PCI_EXP_LNKCAP2
2213 #define PCI_EXP_LNKCAP2 44
2214 #endif
2215 #ifndef PCI_EXP_LNKSTA2
2216 #define PCI_EXP_LNKSTA2 50
2217 #endif
2218                 /* Link 2 Capabilities, Status, and Control goes away */
2219                 if (size > PCI_EXP_LNKCAP2) {
2220                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
2221                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
2222                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
2223                 }
2224             }
2225 
2226         } else if (type == PCI_EXP_TYPE_LEG_END) {
2227             /*
2228              * Legacy endpoints don't belong on the root complex.  Windows
2229              * seems to be happier with devices if we skip the capability.
2230              */
2231             return true;
2232         }
2233 
2234     } else {
2235         /*
2236          * Convert Root Complex Integrated Endpoints to regular endpoints.
2237          * These devices don't support LNK/LNK2 capabilities, so make them up.
2238          */
2239         if (type == PCI_EXP_TYPE_RC_END) {
2240             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2241                                    PCI_EXP_TYPE_ENDPOINT << 4,
2242                                    PCI_EXP_FLAGS_TYPE);
2243             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
2244                            QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
2245                            QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
2246             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2247         }
2248 
2249         vfio_pci_enable_rp_atomics(vdev);
2250     }
2251 
2252     /*
2253      * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
2254      * (Niantic errate #35) causing Windows to error with a Code 10 for the
2255      * device on Q35.  Fixup any such devices to report version 1.  If we
2256      * were to remove the capability entirely the guest would lose extended
2257      * config space.
2258      */
2259     if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
2260         vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2261                                1, PCI_EXP_FLAGS_VERS);
2262     }
2263 
2264     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
2265                              errp);
2266     if (pos < 0) {
2267         return false;
2268     }
2269 
2270     vdev->pdev.exp.exp_cap = pos;
2271 
2272     return true;
2273 }
2274 
2275 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
2276 {
2277     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2278 
2279     if (cap & PCI_EXP_DEVCAP_FLR) {
2280         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
2281         vdev->has_flr = true;
2282     }
2283 }
2284 
2285 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
2286 {
2287     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2288 
2289     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2290         trace_vfio_check_pm_reset(vdev->vbasedev.name);
2291         vdev->has_pm_reset = true;
2292     }
2293 }
2294 
2295 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
2296 {
2297     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2298 
2299     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2300         trace_vfio_check_af_flr(vdev->vbasedev.name);
2301         vdev->has_flr = true;
2302     }
2303 }
2304 
2305 static bool vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos,
2306                                          uint8_t size, Error **errp)
2307 {
2308     PCIDevice *pdev = &vdev->pdev;
2309 
2310     pos = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, size, errp);
2311     if (pos < 0) {
2312         return false;
2313     }
2314 
2315     /*
2316      * Exempt config space check for Vendor Specific Information during
2317      * restore/load.
2318      * Config space check is still enforced for 3 byte VSC header.
2319      */
2320     if (vdev->skip_vsc_check && size > 3) {
2321         memset(pdev->cmask + pos + 3, 0, size - 3);
2322     }
2323 
2324     return true;
2325 }
2326 
2327 static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
2328 {
2329     ERRP_GUARD();
2330     PCIDevice *pdev = &vdev->pdev;
2331     uint8_t cap_id, next, size;
2332     bool ret;
2333 
2334     cap_id = pdev->config[pos];
2335     next = pdev->config[pos + PCI_CAP_LIST_NEXT];
2336 
2337     /*
2338      * If it becomes important to configure capabilities to their actual
2339      * size, use this as the default when it's something we don't recognize.
2340      * Since QEMU doesn't actually handle many of the config accesses,
2341      * exact size doesn't seem worthwhile.
2342      */
2343     size = vfio_std_cap_max_size(pdev, pos);
2344 
2345     /*
2346      * pci_add_capability always inserts the new capability at the head
2347      * of the chain.  Therefore to end up with a chain that matches the
2348      * physical device, we insert from the end by making this recursive.
2349      * This is also why we pre-calculate size above as cached config space
2350      * will be changed as we unwind the stack.
2351      */
2352     if (next) {
2353         if (!vfio_add_std_cap(vdev, next, errp)) {
2354             return false;
2355         }
2356     } else {
2357         /* Begin the rebuild, use QEMU emulated list bits */
2358         pdev->config[PCI_CAPABILITY_LIST] = 0;
2359         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2360         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2361 
2362         if (!vfio_add_virt_caps(vdev, errp)) {
2363             return false;
2364         }
2365     }
2366 
2367     /* Scale down size, esp in case virt caps were added above */
2368     size = MIN(size, vfio_std_cap_max_size(pdev, pos));
2369 
2370     /* Use emulated next pointer to allow dropping caps */
2371     pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);
2372 
2373     switch (cap_id) {
2374     case PCI_CAP_ID_MSI:
2375         ret = vfio_msi_setup(vdev, pos, errp);
2376         break;
2377     case PCI_CAP_ID_EXP:
2378         vfio_check_pcie_flr(vdev, pos);
2379         ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
2380         break;
2381     case PCI_CAP_ID_MSIX:
2382         ret = vfio_msix_setup(vdev, pos, errp);
2383         break;
2384     case PCI_CAP_ID_PM:
2385         vfio_check_pm_reset(vdev, pos);
2386         ret = pci_pm_init(pdev, pos, errp) >= 0;
2387         /*
2388          * PCI-core config space emulation needs write access to the power
2389          * state enabled for tracking BAR mapping relative to PM state.
2390          */
2391         pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
2392         break;
2393     case PCI_CAP_ID_AF:
2394         vfio_check_af_flr(vdev, pos);
2395         ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
2396         break;
2397     case PCI_CAP_ID_VNDR:
2398         ret = vfio_add_vendor_specific_cap(vdev, pos, size, errp);
2399         break;
2400     default:
2401         ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
2402         break;
2403     }
2404 
2405     if (!ret) {
2406         error_prepend(errp,
2407                       "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
2408                       cap_id, size, pos);
2409     }
2410 
2411     return ret;
2412 }
2413 
2414 static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
2415 {
2416     uint32_t ctrl;
2417     int i, nbar;
2418 
2419     ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL);
2420     nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT;
2421 
2422     for (i = 0; i < nbar; i++) {
2423         uint32_t cap;
2424         int size;
2425 
2426         ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8));
2427         size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT;
2428 
2429         /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */
2430         cap = size <= 27 ? 1U << (size + 4) : 0;
2431 
2432         /*
2433          * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one
2434          * size in the range 1MB to 512GB.  We intend to mask all sizes except
2435          * the one currently enabled in the size field, therefore if it's
2436          * outside the range, hide the whole capability as this virtualization
2437          * trick won't work.  If >512GB resizable BARs start to appear, we
2438          * might need an opt-in or reservation scheme in the kernel.
2439          */
2440         if (!(cap & PCI_REBAR_CAP_SIZES)) {
2441             return -EINVAL;
2442         }
2443 
2444         /* Hide all sizes reported in the ctrl reg per above requirement. */
2445         ctrl &= (PCI_REBAR_CTRL_BAR_SIZE |
2446                  PCI_REBAR_CTRL_NBAR_MASK |
2447                  PCI_REBAR_CTRL_BAR_IDX);
2448 
2449         /*
2450          * The BAR size field is RW, however we've mangled the capability
2451          * register such that we only report a single size, ie. the current
2452          * BAR size.  A write of an unsupported value is undefined, therefore
2453          * the register field is essentially RO.
2454          */
2455         vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0);
2456         vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0);
2457     }
2458 
2459     return 0;
2460 }
2461 
2462 static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
2463 {
2464     PCIDevice *pdev = &vdev->pdev;
2465     uint32_t header;
2466     uint16_t cap_id, next, size;
2467     uint8_t cap_ver;
2468     uint8_t *config;
2469 
2470     /* Only add extended caps if we have them and the guest can see them */
2471     if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
2472         !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
2473         return;
2474     }
2475 
2476     /*
2477      * pcie_add_capability always inserts the new capability at the tail
2478      * of the chain.  Therefore to end up with a chain that matches the
2479      * physical device, we cache the config space to avoid overwriting
2480      * the original config space when we parse the extended capabilities.
2481      */
2482     config = g_memdup(pdev->config, vdev->config_size);
2483 
2484     /*
2485      * Extended capabilities are chained with each pointing to the next, so we
2486      * can drop anything other than the head of the chain simply by modifying
2487      * the previous next pointer.  Seed the head of the chain here such that
2488      * we can simply skip any capabilities we want to drop below, regardless
2489      * of their position in the chain.  If this stub capability still exists
2490      * after we add the capabilities we want to expose, update the capability
2491      * ID to zero.  Note that we cannot seed with the capability header being
2492      * zero as this conflicts with definition of an absent capability chain
2493      * and prevents capabilities beyond the head of the list from being added.
2494      * By replacing the dummy capability ID with zero after walking the device
2495      * chain, we also transparently mark extended capabilities as absent if
2496      * no capabilities were added.  Note that the PCIe spec defines an absence
2497      * of extended capabilities to be determined by a value of zero for the
2498      * capability ID, version, AND next pointer.  A non-zero next pointer
2499      * should be sufficient to indicate additional capabilities are present,
2500      * which will occur if we call pcie_add_capability() below.  The entire
2501      * first dword is emulated to support this.
2502      *
2503      * NB. The kernel side does similar masking, so be prepared that our
2504      * view of the device may also contain a capability ID zero in the head
2505      * of the chain.  Skip it for the same reason that we cannot seed the
2506      * chain with a zero capability.
2507      */
2508     pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
2509                  PCI_EXT_CAP(0xFFFF, 0, 0));
2510     pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
2511     pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
2512 
2513     for (next = PCI_CONFIG_SPACE_SIZE; next;
2514          next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
2515         header = pci_get_long(config + next);
2516         cap_id = PCI_EXT_CAP_ID(header);
2517         cap_ver = PCI_EXT_CAP_VER(header);
2518 
2519         /*
2520          * If it becomes important to configure extended capabilities to their
2521          * actual size, use this as the default when it's something we don't
2522          * recognize. Since QEMU doesn't actually handle many of the config
2523          * accesses, exact size doesn't seem worthwhile.
2524          */
2525         size = vfio_ext_cap_max_size(config, next);
2526 
2527         /* Use emulated next pointer to allow dropping extended caps */
2528         pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
2529                                    PCI_EXT_CAP_NEXT_MASK);
2530 
2531         switch (cap_id) {
2532         case 0: /* kernel masked capability */
2533         case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
2534         case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
2535             trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
2536             break;
2537         case PCI_EXT_CAP_ID_REBAR:
2538             if (!vfio_setup_rebar_ecap(vdev, next)) {
2539                 pcie_add_capability(pdev, cap_id, cap_ver, next, size);
2540             }
2541             break;
2542         default:
2543             pcie_add_capability(pdev, cap_id, cap_ver, next, size);
2544         }
2545 
2546     }
2547 
2548     /* Cleanup chain head ID if necessary */
2549     if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
2550         pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
2551     }
2552 
2553     g_free(config);
2554 }
2555 
2556 bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
2557 {
2558     PCIDevice *pdev = &vdev->pdev;
2559 
2560     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2561         !pdev->config[PCI_CAPABILITY_LIST]) {
2562         return true; /* Nothing to add */
2563     }
2564 
2565     if (!vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp)) {
2566         return false;
2567     }
2568 
2569     vfio_add_ext_cap(vdev);
2570     return true;
2571 }
2572 
2573 void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2574 {
2575     PCIDevice *pdev = &vdev->pdev;
2576     uint16_t cmd;
2577 
2578     vfio_disable_interrupts(vdev);
2579 
2580     /*
2581      * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2582      * Also put INTx Disable in known state.
2583      */
2584     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2585     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2586              PCI_COMMAND_INTX_DISABLE);
2587     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2588 
2589     /* Make sure the device is in D0 */
2590     if (pdev->pm_cap) {
2591         uint16_t pmcsr;
2592         uint8_t state;
2593 
2594         pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
2595         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2596         if (state) {
2597             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2598             vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2599             /* vfio handles the necessary delay here */
2600             pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
2601             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2602             if (state) {
2603                 error_report("vfio: Unable to power on device, stuck in D%d",
2604                              state);
2605             }
2606         }
2607     }
2608 }
2609 
2610 void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2611 {
2612     VFIODevice *vbasedev = &vdev->vbasedev;
2613     Error *err = NULL;
2614     int ret, nr;
2615 
2616     if (!vfio_intx_enable(vdev, &err)) {
2617         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2618     }
2619 
2620     for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
2621         off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
2622         uint32_t val = 0;
2623         uint32_t len = sizeof(val);
2624 
2625         ret = vfio_pci_config_space_write(vdev, addr, len, &val);
2626         if (ret != len) {
2627             error_report("%s(%s) reset bar %d failed: %s", __func__,
2628                          vbasedev->name, nr, strwriteerror(ret));
2629         }
2630     }
2631 
2632     vfio_quirk_reset(vdev);
2633 }
2634 
2635 bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
2636 {
2637     char tmp[13];
2638 
2639     sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
2640             addr->bus, addr->slot, addr->function);
2641 
2642     return (strcmp(tmp, name) == 0);
2643 }
2644 
2645 int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
2646                                     struct vfio_pci_hot_reset_info **info_p)
2647 {
2648     struct vfio_pci_hot_reset_info *info;
2649     int ret, count;
2650 
2651     assert(info_p && !*info_p);
2652 
2653     info = g_malloc0(sizeof(*info));
2654     info->argsz = sizeof(*info);
2655 
2656     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2657     if (ret && errno != ENOSPC) {
2658         ret = -errno;
2659         g_free(info);
2660         if (!vdev->has_pm_reset) {
2661             error_report("vfio: Cannot reset device %s, "
2662                          "no available reset mechanism.", vdev->vbasedev.name);
2663         }
2664         return ret;
2665     }
2666 
2667     count = info->count;
2668     info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0])));
2669     info->argsz = sizeof(*info) + (count * sizeof(info->devices[0]));
2670 
2671     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2672     if (ret) {
2673         ret = -errno;
2674         g_free(info);
2675         error_report("vfio: hot reset info failed: %m");
2676         return ret;
2677     }
2678 
2679     *info_p = info;
2680     return 0;
2681 }
2682 
2683 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2684 {
2685     VFIODevice *vbasedev = &vdev->vbasedev;
2686     const VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer);
2687 
2688     return vioc->pci_hot_reset(vbasedev, single);
2689 }
2690 
2691 /*
2692  * We want to differentiate hot reset of multiple in-use devices vs hot reset
2693  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2694  * of doing hot resets when there is only a single device per bus.  The in-use
2695  * here refers to how many VFIODevices are affected.  A hot reset that affects
2696  * multiple devices, but only a single in-use device, means that we can call
2697  * it from our bus ->reset() callback since the extent is effectively a single
2698  * device.  This allows us to make use of it in the hotplug path.  When there
2699  * are multiple in-use devices, we can only trigger the hot reset during a
2700  * system reset and thus from our reset handler.  We separate _one vs _multi
2701  * here so that we don't overlap and do a double reset on the system reset
2702  * path where both our reset handler and ->reset() callback are used.  Calling
2703  * _one() will only do a hot reset for the one in-use devices case, calling
2704  * _multi() will do nothing if a _one() would have been sufficient.
2705  */
2706 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2707 {
2708     return vfio_pci_hot_reset(vdev, true);
2709 }
2710 
2711 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2712 {
2713     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2714     return vfio_pci_hot_reset(vdev, false);
2715 }
2716 
2717 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2718 {
2719     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2720     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2721         vbasedev->needs_reset = true;
2722     }
2723 }
2724 
2725 static Object *vfio_pci_get_object(VFIODevice *vbasedev)
2726 {
2727     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2728 
2729     return OBJECT(vdev);
2730 }
2731 
2732 static bool vfio_msix_present(void *opaque, int version_id)
2733 {
2734     PCIDevice *pdev = opaque;
2735 
2736     return msix_present(pdev);
2737 }
2738 
2739 static bool vfio_display_migration_needed(void *opaque)
2740 {
2741     VFIOPCIDevice *vdev = opaque;
2742 
2743     /*
2744      * We need to migrate the VFIODisplay object if ramfb *migration* was
2745      * explicitly requested (in which case we enforced both ramfb=on and
2746      * display=on), or ramfb migration was left at the default "auto"
2747      * setting, and *ramfb* was explicitly requested (in which case we
2748      * enforced display=on).
2749      */
2750     return vdev->ramfb_migrate == ON_OFF_AUTO_ON ||
2751         (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb);
2752 }
2753 
2754 static const VMStateDescription vmstate_vfio_display = {
2755     .name = "VFIOPCIDevice/VFIODisplay",
2756     .version_id = 1,
2757     .minimum_version_id = 1,
2758     .needed = vfio_display_migration_needed,
2759     .fields = (const VMStateField[]){
2760         VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate,
2761                                VFIODisplay),
2762         VMSTATE_END_OF_LIST()
2763     }
2764 };
2765 
2766 static const VMStateDescription vmstate_vfio_pci_config = {
2767     .name = "VFIOPCIDevice",
2768     .version_id = 1,
2769     .minimum_version_id = 1,
2770     .fields = (const VMStateField[]) {
2771         VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
2772         VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
2773         VMSTATE_END_OF_LIST()
2774     },
2775     .subsections = (const VMStateDescription * const []) {
2776         &vmstate_vfio_display,
2777         NULL
2778     }
2779 };
2780 
2781 static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp)
2782 {
2783     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2784 
2785     return vmstate_save_state_with_err(f, &vmstate_vfio_pci_config, vdev, NULL,
2786                                        errp);
2787 }
2788 
2789 static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
2790 {
2791     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2792     PCIDevice *pdev = &vdev->pdev;
2793     pcibus_t old_addr[PCI_NUM_REGIONS - 1];
2794     int bar, ret;
2795 
2796     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2797         old_addr[bar] = pdev->io_regions[bar].addr;
2798     }
2799 
2800     ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
2801     if (ret) {
2802         return ret;
2803     }
2804 
2805     vfio_pci_write_config(pdev, PCI_COMMAND,
2806                           pci_get_word(pdev->config + PCI_COMMAND), 2);
2807 
2808     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2809         /*
2810          * The address may not be changed in some scenarios
2811          * (e.g. the VF driver isn't loaded in VM).
2812          */
2813         if (old_addr[bar] != pdev->io_regions[bar].addr &&
2814             vdev->bars[bar].region.size > 0 &&
2815             vdev->bars[bar].region.size < qemu_real_host_page_size()) {
2816             vfio_sub_page_bar_update_mapping(pdev, bar);
2817         }
2818     }
2819 
2820     if (msi_enabled(pdev)) {
2821         vfio_msi_enable(vdev);
2822     } else if (msix_enabled(pdev)) {
2823         vfio_msix_enable(vdev);
2824     }
2825 
2826     return ret;
2827 }
2828 
2829 static VFIODeviceOps vfio_pci_ops = {
2830     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
2831     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
2832     .vfio_eoi = vfio_pci_intx_eoi,
2833     .vfio_get_object = vfio_pci_get_object,
2834     .vfio_save_config = vfio_pci_save_config,
2835     .vfio_load_config = vfio_pci_load_config,
2836 };
2837 
2838 bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
2839 {
2840     VFIODevice *vbasedev = &vdev->vbasedev;
2841     struct vfio_region_info *reg_info = NULL;
2842     int ret;
2843 
2844     ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
2845     if (ret) {
2846         error_setg_errno(errp, -ret,
2847                          "failed getting region info for VGA region index %d",
2848                          VFIO_PCI_VGA_REGION_INDEX);
2849         return false;
2850     }
2851 
2852     if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
2853         !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2854         reg_info->size < 0xbffff + 1) {
2855         error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
2856                    (unsigned long)reg_info->flags,
2857                    (unsigned long)reg_info->size);
2858         return false;
2859     }
2860 
2861     vdev->vga = g_new0(VFIOVGA, 1);
2862 
2863     vdev->vga->fd_offset = reg_info->offset;
2864     vdev->vga->fd = vdev->vbasedev.fd;
2865 
2866     vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2867     vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2868     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
2869 
2870     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2871                           OBJECT(vdev), &vfio_vga_ops,
2872                           &vdev->vga->region[QEMU_PCI_VGA_MEM],
2873                           "vfio-vga-mmio@0xa0000",
2874                           QEMU_PCI_VGA_MEM_SIZE);
2875 
2876     vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2877     vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2878     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
2879 
2880     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2881                           OBJECT(vdev), &vfio_vga_ops,
2882                           &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
2883                           "vfio-vga-io@0x3b0",
2884                           QEMU_PCI_VGA_IO_LO_SIZE);
2885 
2886     vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2887     vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2888     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
2889 
2890     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
2891                           OBJECT(vdev), &vfio_vga_ops,
2892                           &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
2893                           "vfio-vga-io@0x3c0",
2894                           QEMU_PCI_VGA_IO_HI_SIZE);
2895 
2896     return true;
2897 }
2898 
2899 bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
2900 {
2901     VFIODevice *vbasedev = &vdev->vbasedev;
2902     struct vfio_region_info *reg_info = NULL;
2903     struct vfio_irq_info irq_info;
2904     int i, ret = -1;
2905 
2906     /* Sanity check device */
2907     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2908         error_setg(errp, "this isn't a PCI device");
2909         return false;
2910     }
2911 
2912     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2913         error_setg(errp, "unexpected number of io regions %u",
2914                    vbasedev->num_regions);
2915         return false;
2916     }
2917 
2918     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2919         error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
2920         return false;
2921     }
2922 
2923     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2924         char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
2925 
2926         ret = vfio_region_setup(OBJECT(vdev), vbasedev,
2927                                 &vdev->bars[i].region, i, name);
2928         g_free(name);
2929 
2930         if (ret) {
2931             error_setg_errno(errp, -ret, "failed to get region %d info", i);
2932             return false;
2933         }
2934 
2935         QLIST_INIT(&vdev->bars[i].quirks);
2936     }
2937 
2938     ret = vfio_device_get_region_info(vbasedev,
2939                                       VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
2940     if (ret) {
2941         error_setg_errno(errp, -ret, "failed to get config info");
2942         return false;
2943     }
2944 
2945     trace_vfio_pci_populate_device_config(vdev->vbasedev.name,
2946                                       (unsigned long)reg_info->size,
2947                                       (unsigned long)reg_info->offset,
2948                                       (unsigned long)reg_info->flags);
2949 
2950     vdev->config_size = reg_info->size;
2951     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2952         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2953     }
2954     vdev->config_offset = reg_info->offset;
2955 
2956     if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
2957         if (!vfio_populate_vga(vdev, errp)) {
2958             error_append_hint(errp, "device does not support "
2959                               "requested feature x-vga\n");
2960             return false;
2961         }
2962     }
2963 
2964     ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
2965     if (ret) {
2966         /* This can fail for an old kernel or legacy PCI dev */
2967         trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret));
2968     } else if (irq_info.count == 1) {
2969         vdev->pci_aer = true;
2970     } else {
2971         warn_report(VFIO_MSG_PREFIX
2972                     "Could not enable error recovery for the device",
2973                     vbasedev->name);
2974     }
2975 
2976     return true;
2977 }
2978 
/*
 * Tear down the per-device state held in @vdev: finalize the display and
 * the BARs, free emulated_config_bits and rom, detach the VFIO device,
 * then free the device name and the msix allocation.
 */
void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
    vfio_display_finalize(vdev);
    vfio_bars_finalize(vdev);
    g_free(vdev->emulated_config_bits);
    g_free(vdev->rom);
    /*
     * XXX Leaking igd_opregion is not an oversight, we can't remove the
     * fw_cfg entry therefore leaking this allocation seems like the safest
     * option.
     *
     * g_free(vdev->igd_opregion);
     */

    vfio_device_detach(&vdev->vbasedev);

    /* The name and msix allocation are released only after the detach */
    vfio_device_free_name(&vdev->vbasedev);
    g_free(vdev->msix);
}
2998 
2999 static void vfio_err_notifier_handler(void *opaque)
3000 {
3001     VFIOPCIDevice *vdev = opaque;
3002 
3003     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
3004         return;
3005     }
3006 
3007     /*
3008      * TBD. Retrieve the error details and decide what action
3009      * needs to be taken. One of the actions could be to pass
3010      * the error to the guest and have the guest driver recover
3011      * from the error. This requires that PCIe capabilities be
3012      * exposed to the guest. For now, we just terminate the
3013      * guest to contain the error.
3014      */
3015 
3016     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
3017 
3018     vm_stop(RUN_STATE_INTERNAL_ERROR);
3019 }
3020 
3021 /*
3022  * Registers error notifier for devices supporting error recovery.
3023  * If we encounter a failure in this function, we report an error
3024  * and continue after disabling error recovery support for the
3025  * device.
3026  */
3027 void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev)
3028 {
3029     Error *err = NULL;
3030     int32_t fd;
3031 
3032     if (!vdev->pci_aer) {
3033         return;
3034     }
3035 
3036     if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0,
3037                             &err)) {
3038         error_report_err(err);
3039         vdev->pci_aer = false;
3040         return;
3041     }
3042 
3043     fd = event_notifier_get_fd(&vdev->err_notifier);
3044     qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
3045 
3046     /* Do not alter irq_signaling during vfio_realize for cpr */
3047     if (cpr_is_incoming()) {
3048         return;
3049     }
3050 
3051     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
3052                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
3053         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3054         qemu_set_fd_handler(fd, NULL, NULL, vdev);
3055         vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
3056         vdev->pci_aer = false;
3057     }
3058 }
3059 
3060 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
3061 {
3062     Error *err = NULL;
3063 
3064     if (!vdev->pci_aer) {
3065         return;
3066     }
3067 
3068     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
3069                                        VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
3070         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3071     }
3072     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
3073                         NULL, NULL, vdev);
3074     vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
3075 }
3076 
3077 static void vfio_req_notifier_handler(void *opaque)
3078 {
3079     VFIOPCIDevice *vdev = opaque;
3080     Error *err = NULL;
3081 
3082     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
3083         return;
3084     }
3085 
3086     qdev_unplug(DEVICE(vdev), &err);
3087     if (err) {
3088         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3089     }
3090 }
3091 
3092 void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev)
3093 {
3094     struct vfio_irq_info irq_info;
3095     Error *err = NULL;
3096     int32_t fd;
3097     int ret;
3098 
3099     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
3100         return;
3101     }
3102 
3103     ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX,
3104                                    &irq_info);
3105     if (ret < 0 || irq_info.count < 1) {
3106         return;
3107     }
3108 
3109     if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0,
3110                             &err)) {
3111         error_report_err(err);
3112         return;
3113     }
3114 
3115     fd = event_notifier_get_fd(&vdev->req_notifier);
3116     qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
3117 
3118     /* Do not alter irq_signaling during vfio_realize for cpr */
3119     if (cpr_is_incoming()) {
3120         vdev->req_enabled = true;
3121         return;
3122     }
3123 
3124     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
3125                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
3126         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3127         qemu_set_fd_handler(fd, NULL, NULL, vdev);
3128         vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
3129     } else {
3130         vdev->req_enabled = true;
3131     }
3132 }
3133 
3134 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
3135 {
3136     Error *err = NULL;
3137 
3138     if (!vdev->req_enabled) {
3139         return;
3140     }
3141 
3142     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
3143                                        VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
3144         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3145     }
3146     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
3147                         NULL, NULL, vdev);
3148     vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
3149 
3150     vdev->req_enabled = false;
3151 }
3152 
/*
 * Read the device's config space into pdev->config and prepare the parts
 * of it that QEMU emulates.
 *
 * In order: copy config space from the device, allocate the
 * emulated_config_bits mask, mark the ROM and BAR registers as emulated,
 * apply any vendor/device/subsystem/class-code overrides (or record the
 * device's own values when no override is given), take control of the
 * multifunction bit, clear the host BAR and ROM addresses, size the ROM,
 * prepare the BARs, do early MSI-X setup, register the BARs, and register
 * the VGA regions when vdev->vga is set and the device is VGA.
 *
 * Returns true on success.  Returns false, with @errp set here, when the
 * config space read comes up short or an override value is out of range;
 * also returns false when vfio_msix_early_setup() fails.
 */
bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIODevice *vbasedev = &vdev->vbasedev;
    uint32_t config_space_size;
    int ret;

    /* Never read more than either QEMU's or the device's config space */
    config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);

    /* Get a copy of config space */
    ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
                                     vdev->pdev.config);
    if (ret < (int)config_space_size) {
        /* A short read without an error code is reported as EFAULT */
        ret = ret < 0 ? -ret : EFAULT;
        error_setg_errno(errp, ret, "failed to read device config space");
        return false;
    }

    /* vfio emulates a lot for us, but some bits need extra love */
    vdev->emulated_config_bits = g_malloc0(vdev->config_size);

    /* QEMU can choose to expose the ROM or not */
    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
    /* QEMU can also add or extend BARs */
    memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);

    /*
     * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
     * device ID is managed by the vendor and need only be a 16-bit value.
     * Allow any 16-bit value for subsystem so they can be hidden or changed.
     */
    if (vdev->vendor_id != PCI_ANY_ID) {
        if (vdev->vendor_id >= 0xffff) {
            error_setg(errp, "invalid PCI vendor ID provided");
            return false;
        }
        vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
        trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
    } else {
        vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
    }

    if (vdev->device_id != PCI_ANY_ID) {
        if (vdev->device_id > 0xffff) {
            error_setg(errp, "invalid PCI device ID provided");
            return false;
        }
        vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
        trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
    } else {
        vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
    }

    /* Subsystem IDs are only overridden; no value is recorded otherwise */
    if (vdev->sub_vendor_id != PCI_ANY_ID) {
        if (vdev->sub_vendor_id > 0xffff) {
            error_setg(errp, "invalid PCI subsystem vendor ID provided");
            return false;
        }
        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
                               vdev->sub_vendor_id, ~0);
        trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
                                              vdev->sub_vendor_id);
    }

    if (vdev->sub_device_id != PCI_ANY_ID) {
        if (vdev->sub_device_id > 0xffff) {
            error_setg(errp, "invalid PCI subsystem device ID provided");
            return false;
        }
        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
        trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
                                              vdev->sub_device_id);
    }

    /*
     * Class code is a 24-bit value at config space 0x09. Allow overriding it
     * with any 24-bit value.
     */
    if (vdev->class_code != PCI_ANY_ID) {
        if (vdev->class_code > 0xffffff) {
            error_setg(errp, "invalid PCI class code provided");
            return false;
        }
        /* Higher 24 bits of PCI_CLASS_REVISION are class code */
        vfio_add_emulated_long(vdev, PCI_CLASS_REVISION,
                               vdev->class_code << 8, ~0xff);
        trace_vfio_pci_emulated_class_code(vbasedev->name, vdev->class_code);
    } else {
        vdev->class_code = pci_get_long(pdev->config + PCI_CLASS_REVISION) >> 8;
    }

    /* QEMU can change multi-function devices to single function, or reverse */
    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
                                              PCI_HEADER_TYPE_MULTI_FUNCTION;

    /* Restore or clear multifunction, this is always controlled by QEMU */
    if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
        vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
    } else {
        vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
    }

    /*
     * Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);

    vfio_pci_size_rom(vdev);

    vfio_bars_prepare(vdev);

    /* MSI-X early setup runs between preparing and registering the BARs */
    if (!vfio_msix_early_setup(vdev, errp)) {
        return false;
    }

    vfio_bars_register(vdev);

    if (vdev->vga && vfio_is_vga(vdev)) {
        pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
                         &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
                         &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
    }

    return true;
}
3281 
3282 bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
3283 {
3284     PCIDevice *pdev = &vdev->pdev;
3285 
3286     /* QEMU emulates all of MSI & MSIX */
3287     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3288         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3289                MSIX_CAP_LENGTH);
3290     }
3291 
3292     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3293         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3294                vdev->msi_cap_size);
3295     }
3296 
3297     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3298         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3299                                              vfio_intx_mmap_enable, vdev);
3300         pci_device_set_intx_routing_notifier(&vdev->pdev,
3301                                              vfio_intx_routing_notifier);
3302         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
3303         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
3304 
3305         /*
3306          * During CPR, do not call vfio_intx_enable at this time.  Instead,
3307          * call it from vfio_pci_post_load after the intx routing data has
3308          * been loaded from vmstate.
3309          */
3310         if (!cpr_is_incoming() && !vfio_intx_enable(vdev, errp)) {
3311             timer_free(vdev->intx.mmap_timer);
3312             pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3313             kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3314             return false;
3315         }
3316     }
3317     return true;
3318 }
3319 
3320 static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
3321 {
3322     ERRP_GUARD();
3323     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
3324     VFIODevice *vbasedev = &vdev->vbasedev;
3325     int i;
3326     char uuid[UUID_STR_LEN];
3327     g_autofree char *name = NULL;
3328 
3329     if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
3330         if (!(~vdev->host.domain || ~vdev->host.bus ||
3331               ~vdev->host.slot || ~vdev->host.function)) {
3332             error_setg(errp, "No provided host device");
3333             error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
3334 #ifdef CONFIG_IOMMUFD
3335                               "or -device vfio-pci,fd=DEVICE_FD "
3336 #endif
3337                               "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
3338             return;
3339         }
3340         vbasedev->sysfsdev =
3341             g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
3342                             vdev->host.domain, vdev->host.bus,
3343                             vdev->host.slot, vdev->host.function);
3344     }
3345 
3346     if (!vfio_device_get_name(vbasedev, errp)) {
3347         return;
3348     }
3349 
3350     /*
3351      * Mediated devices *might* operate compatibly with discarding of RAM, but
3352      * we cannot know for certain, it depends on whether the mdev vendor driver
3353      * stays in sync with the active working set of the guest driver.  Prevent
3354      * the x-balloon-allowed option unless this is minimally an mdev device.
3355      */
3356     vbasedev->mdev = vfio_device_is_mdev(vbasedev);
3357 
3358     trace_vfio_mdev(vbasedev->name, vbasedev->mdev);
3359 
3360     if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) {
3361         error_setg(errp, "x-balloon-allowed only potentially compatible "
3362                    "with mdev devices");
3363         goto error;
3364     }
3365 
3366     if (!qemu_uuid_is_null(&vdev->vf_token)) {
3367         qemu_uuid_unparse(&vdev->vf_token, uuid);
3368         name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
3369     } else {
3370         name = g_strdup(vbasedev->name);
3371     }
3372 
3373     if (!vfio_device_attach(name, vbasedev,
3374                             pci_device_iommu_address_space(pdev), errp)) {
3375         goto error;
3376     }
3377 
3378     if (!vfio_pci_populate_device(vdev, errp)) {
3379         goto error;
3380     }
3381 
3382     if (!vfio_pci_config_setup(vdev, errp)) {
3383         goto error;
3384     }
3385 
3386     if (!vbasedev->mdev &&
3387         !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
3388         error_prepend(errp, "Failed to set vIOMMU: ");
3389         goto out_teardown;
3390     }
3391 
3392     if (!vfio_pci_add_capabilities(vdev, errp)) {
3393         goto out_unset_idev;
3394     }
3395 
3396     if (!vfio_config_quirk_setup(vdev, errp)) {
3397         goto out_unset_idev;
3398     }
3399 
3400     if (vdev->vga) {
3401         vfio_vga_quirk_setup(vdev);
3402     }
3403 
3404     for (i = 0; i < PCI_ROM_SLOT; i++) {
3405         vfio_bar_quirk_setup(vdev, i);
3406     }
3407 
3408     if (!vfio_pci_interrupt_setup(vdev, errp)) {
3409         goto out_unset_idev;
3410     }
3411 
3412     if (vdev->display != ON_OFF_AUTO_OFF) {
3413         if (!vfio_display_probe(vdev, errp)) {
3414             goto out_deregister;
3415         }
3416     }
3417     if (vdev->enable_ramfb && vdev->dpy == NULL) {
3418         error_setg(errp, "ramfb=on requires display=on");
3419         goto out_deregister;
3420     }
3421     if (vdev->display_xres || vdev->display_yres) {
3422         if (vdev->dpy == NULL) {
3423             error_setg(errp, "xres and yres properties require display=on");
3424             goto out_deregister;
3425         }
3426         if (vdev->dpy->edid_regs == NULL) {
3427             error_setg(errp, "xres and yres properties need edid support");
3428             goto out_deregister;
3429         }
3430     }
3431 
3432     if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) {
3433         warn_report("x-ramfb-migrate=on but ramfb=off. "
3434                     "Forcing x-ramfb-migrate to off.");
3435         vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
3436     }
3437     if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
3438         if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) {
3439             vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
3440         } else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) {
3441             error_setg(errp, "x-ramfb-migrate requires enable-migration");
3442             goto out_deregister;
3443         }
3444     }
3445 
3446     if (!pdev->failover_pair_id) {
3447         if (!vfio_migration_realize(vbasedev, errp)) {
3448             goto out_deregister;
3449         }
3450     }
3451 
3452     vfio_pci_register_err_notifier(vdev);
3453     vfio_pci_register_req_notifier(vdev);
3454     vfio_setup_resetfn_quirk(vdev);
3455 
3456     return;
3457 
3458 out_deregister:
3459     if (vdev->interrupt == VFIO_INT_INTx) {
3460         vfio_intx_disable(vdev);
3461     }
3462     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3463     if (vdev->irqchip_change_notifier.notify) {
3464         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3465     }
3466     if (vdev->intx.mmap_timer) {
3467         timer_free(vdev->intx.mmap_timer);
3468     }
3469 out_unset_idev:
3470     if (!vbasedev->mdev) {
3471         pci_device_unset_iommu_device(pdev);
3472     }
3473 out_teardown:
3474     vfio_pci_teardown_msi(vdev);
3475     vfio_pci_bars_exit(vdev);
3476 error:
3477     error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
3478 }
3479 
3480 static void vfio_instance_finalize(Object *obj)
3481 {
3482     VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
3483 
3484     vfio_pci_put_device(vdev);
3485 }
3486 
3487 static void vfio_exitfn(PCIDevice *pdev)
3488 {
3489     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
3490     VFIODevice *vbasedev = &vdev->vbasedev;
3491 
3492     vfio_unregister_req_notifier(vdev);
3493     vfio_unregister_err_notifier(vdev);
3494     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3495     if (vdev->irqchip_change_notifier.notify) {
3496         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3497     }
3498     vfio_disable_interrupts(vdev);
3499     if (vdev->intx.mmap_timer) {
3500         timer_free(vdev->intx.mmap_timer);
3501     }
3502     vfio_pci_teardown_msi(vdev);
3503     vfio_pci_disable_rp_atomics(vdev);
3504     vfio_pci_bars_exit(vdev);
3505     vfio_migration_exit(vbasedev);
3506     if (!vbasedev->mdev) {
3507         pci_device_unset_iommu_device(pdev);
3508     }
3509 }
3510 
3511 static void vfio_pci_reset(DeviceState *dev)
3512 {
3513     VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
3514 
3515     /* Do not reset the device during qemu_system_reset prior to cpr load */
3516     if (cpr_is_incoming()) {
3517         return;
3518     }
3519 
3520     trace_vfio_pci_reset(vdev->vbasedev.name);
3521 
3522     vfio_pci_pre_reset(vdev);
3523 
3524     if (vdev->display != ON_OFF_AUTO_OFF) {
3525         vfio_display_reset(vdev);
3526     }
3527 
3528     if (vdev->resetfn && !vdev->resetfn(vdev)) {
3529         goto post_reset;
3530     }
3531 
3532     if (vdev->vbasedev.reset_works &&
3533         (vdev->has_flr || !vdev->has_pm_reset) &&
3534         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3535         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3536         goto post_reset;
3537     }
3538 
3539     /* See if we can do our own bus reset */
3540     if (!vfio_pci_hot_reset_one(vdev)) {
3541         goto post_reset;
3542     }
3543 
3544     /* If nothing else works and the device supports PM reset, use it */
3545     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3546         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3547         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3548         goto post_reset;
3549     }
3550 
3551 post_reset:
3552     vfio_pci_post_reset(vdev);
3553 }
3554 
3555 static void vfio_instance_init(Object *obj)
3556 {
3557     PCIDevice *pci_dev = PCI_DEVICE(obj);
3558     VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
3559     VFIODevice *vbasedev = &vdev->vbasedev;
3560 
3561     device_add_bootindex_property(obj, &vdev->bootindex,
3562                                   "bootindex", NULL,
3563                                   &pci_dev->qdev);
3564     vdev->host.domain = ~0U;
3565     vdev->host.bus = ~0U;
3566     vdev->host.slot = ~0U;
3567     vdev->host.function = ~0U;
3568 
3569     vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
3570                      DEVICE(vdev), false);
3571 
3572     vdev->nv_gpudirect_clique = 0xFF;
3573 
3574     /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
3575      * line, therefore, no need to wait to realize like other devices */
3576     pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
3577 
3578     /*
3579      * A device that is resuming for cpr is already configured, so do not
3580      * reset it during qemu_system_reset prior to cpr load, else interrupts
3581      * may be lost.
3582      */
3583     pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR;
3584 }
3585 
3586 static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data)
3587 {
3588     DeviceClass *dc = DEVICE_CLASS(klass);
3589     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3590 
3591     dc->desc = "VFIO PCI base device";
3592     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3593     pdc->exit = vfio_exitfn;
3594     pdc->config_read = vfio_pci_read_config;
3595     pdc->config_write = vfio_pci_write_config;
3596 }
3597 
3598 static const TypeInfo vfio_pci_base_dev_info = {
3599     .name = TYPE_VFIO_PCI_BASE,
3600     .parent = TYPE_PCI_DEVICE,
3601     .instance_size = sizeof(VFIOPCIDevice),
3602     .abstract = true,
3603     .class_init = vfio_pci_base_dev_class_init,
3604     .interfaces = (const InterfaceInfo[]) {
3605         { INTERFACE_PCIE_DEVICE },
3606         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
3607         { }
3608     },
3609 };
3610 
3611 static PropertyInfo vfio_pci_migration_multifd_transfer_prop;
3612 
3613 static const Property vfio_pci_dev_properties[] = {
3614     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
3615     DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
3616     DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
3617     DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
3618                             vbasedev.pre_copy_dirty_page_tracking,
3619                             ON_OFF_AUTO_ON),
3620     DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
3621                             vbasedev.device_dirty_page_tracking,
3622                             ON_OFF_AUTO_ON),
3623     DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
3624                             display, ON_OFF_AUTO_OFF),
3625     DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
3626     DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
3627     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3628                        intx.mmap_timeout, 1100),
3629     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3630                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
3631     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
3632                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
3633     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
3634                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, true),
3635     DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
3636                     VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
3637     DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
3638                             igd_legacy_mode, ON_OFF_AUTO_AUTO),
3639     DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
3640                             vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
3641     DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
3642                 vbasedev.migration_multifd_transfer,
3643                 vfio_pci_migration_multifd_transfer_prop, OnOffAuto,
3644                 .set_default = true, .defval.i = ON_OFF_AUTO_AUTO),
3645     DEFINE_PROP_ON_OFF_AUTO("x-migration-load-config-after-iter", VFIOPCIDevice,
3646                             vbasedev.migration_load_config_after_iter,
3647                             ON_OFF_AUTO_AUTO),
3648     DEFINE_PROP_SIZE("x-migration-max-queued-buffers-size", VFIOPCIDevice,
3649                      vbasedev.migration_max_queued_buffers_size, UINT64_MAX),
3650     DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
3651                      vbasedev.migration_events, false),
3652     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3653     DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
3654                      vbasedev.ram_block_discard_allowed, false),
3655     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
3656     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
3657     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3658     DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
3659                      no_geforce_quirks, false),
3660     DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
3661                      false),
3662     DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
3663                      false),
3664     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
3665     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
3666     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
3667                        sub_vendor_id, PCI_ANY_ID),
3668     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
3669                        sub_device_id, PCI_ANY_ID),
3670     DEFINE_PROP_UINT32("x-pci-class-code", VFIOPCIDevice,
3671                        class_code, PCI_ANY_ID),
3672     DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
3673     DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
3674                                    nv_gpudirect_clique,
3675                                    qdev_prop_nv_gpudirect_clique, uint8_t),
3676     DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
3677                                 OFF_AUTO_PCIBAR_OFF),
3678 #ifdef CONFIG_IOMMUFD
3679     DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
3680                      TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
3681 #endif
3682     DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
3683 };
3684 
#ifdef CONFIG_IOMMUFD
/*
 * Setter of the "fd" string property: forwards the string and the
 * error pointer to vfio_device_set_fd() for this device.
 */
static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
{
    VFIODevice *vbasedev = &VFIO_PCI_BASE(obj)->vbasedev;

    vfio_device_set_fd(vbasedev, str, errp);
}
#endif
3692 
3693 static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data)
3694 {
3695     DeviceClass *dc = DEVICE_CLASS(klass);
3696     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3697 
3698     device_class_set_legacy_reset(dc, vfio_pci_reset);
3699     device_class_set_props(dc, vfio_pci_dev_properties);
3700 #ifdef CONFIG_IOMMUFD
3701     object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
3702 #endif
3703     dc->vmsd = &vfio_cpr_pci_vmstate;
3704     dc->desc = "VFIO-based PCI device assignment";
3705     pdc->realize = vfio_pci_realize;
3706 
3707     object_class_property_set_description(klass, /* 1.3 */
3708                                           "host",
3709                                           "Host PCI address [domain:]<bus:slot.function> of assigned device");
3710     object_class_property_set_description(klass, /* 1.3 */
3711                                           "x-intx-mmap-timeout-ms",
3712                                           "When EOI is not provided by KVM/QEMU, wait time "
3713                                           "(milliseconds) to re-enable device direct access "
3714                                           "after INTx (DEBUG)");
3715     object_class_property_set_description(klass, /* 1.5 */
3716                                           "x-vga",
3717                                           "Expose VGA address spaces for device");
3718     object_class_property_set_description(klass, /* 2.3 */
3719                                           "x-req",
3720                                           "Disable device request notification support (DEBUG)");
3721     object_class_property_set_description(klass, /* 2.4 and 2.5 */
3722                                           "x-no-mmap",
3723                                           "Disable MMAP for device. Allows to trace MMIO "
3724                                           "accesses (DEBUG)");
3725     object_class_property_set_description(klass, /* 2.5 */
3726                                           "x-no-kvm-intx",
3727                                           "Disable direct VFIO->KVM INTx injection. Allows to "
3728                                           "trace INTx interrupts (DEBUG)");
3729     object_class_property_set_description(klass, /* 2.5 */
3730                                           "x-no-kvm-msi",
3731                                           "Disable direct VFIO->KVM MSI injection. Allows to "
3732                                           "trace MSI interrupts (DEBUG)");
3733     object_class_property_set_description(klass, /* 2.5 */
3734                                           "x-no-kvm-msix",
3735                                           "Disable direct VFIO->KVM MSIx injection. Allows to "
3736                                           "trace MSIx interrupts (DEBUG)");
3737     object_class_property_set_description(klass, /* 2.5 */
3738                                           "x-pci-vendor-id",
3739                                           "Override PCI Vendor ID with provided value (DEBUG)");
3740     object_class_property_set_description(klass, /* 2.5 */
3741                                           "x-pci-device-id",
3742                                           "Override PCI device ID with provided value (DEBUG)");
3743     object_class_property_set_description(klass, /* 2.5 */
3744                                           "x-pci-sub-vendor-id",
3745                                           "Override PCI Subsystem Vendor ID with provided value "
3746                                           "(DEBUG)");
3747     object_class_property_set_description(klass, /* 2.5 */
3748                                           "x-pci-sub-device-id",
3749                                           "Override PCI Subsystem Device ID with provided value "
3750                                           "(DEBUG)");
3751     object_class_property_set_description(klass, /* 2.6 */
3752                                           "sysfsdev",
3753                                           "Host sysfs path of assigned device");
3754     object_class_property_set_description(klass, /* 2.7 */
3755                                           "x-igd-opregion",
3756                                           "Expose host IGD OpRegion to guest");
3757     object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */
3758                                           "x-igd-gms",
3759                                           "Override IGD data stolen memory size (32MiB units)");
3760     object_class_property_set_description(klass, /* 2.11 */
3761                                           "x-nv-gpudirect-clique",
3762                                           "Add NVIDIA GPUDirect capability indicating P2P DMA "
3763                                           "clique for device [0-15]");
3764     object_class_property_set_description(klass, /* 2.12 */
3765                                           "x-no-geforce-quirks",
3766                                           "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). "
3767                                           "Improves performance");
3768     object_class_property_set_description(klass, /* 2.12 */
3769                                           "display",
3770                                           "Enable display support for device, ex. vGPU");
3771     object_class_property_set_description(klass, /* 2.12 */
3772                                           "x-msix-relocation",
3773                                           "Specify MSI-X MMIO relocation to the end of specified "
3774                                           "existing BAR or new BAR to avoid virtualization overhead "
3775                                           "due to adjacent device registers");
3776     object_class_property_set_description(klass, /* 3.0 */
3777                                           "x-no-kvm-ioeventfd",
3778                                           "Disable registration of ioeventfds with KVM (DEBUG)");
3779     object_class_property_set_description(klass, /* 3.0 */
3780                                           "x-no-vfio-ioeventfd",
3781                                           "Disable linking of KVM ioeventfds to VFIO ioeventfds "
3782                                           "(DEBUG)");
3783     object_class_property_set_description(klass, /* 3.1 */
3784                                           "x-balloon-allowed",
3785                                           "Override allowing ballooning with device (DEBUG, DANGER)");
3786     object_class_property_set_description(klass, /* 3.2 */
3787                                           "xres",
3788                                           "Set X display resolution the vGPU should use");
3789     object_class_property_set_description(klass, /* 3.2 */
3790                                           "yres",
3791                                           "Set Y display resolution the vGPU should use");
3792     object_class_property_set_description(klass, /* 5.2 */
3793                                           "x-pre-copy-dirty-page-tracking",
3794                                           "Disable dirty pages tracking during iterative phase "
3795                                           "(DEBUG)");
3796     object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */
3797                                           "enable-migration",
3798                                           "Enale device migration. Also requires a host VFIO PCI "
3799                                           "variant or mdev driver with migration support enabled");
3800     object_class_property_set_description(klass, /* 8.1 */
3801                                           "vf-token",
3802                                           "Specify UUID VF token. Required for VF when PF is owned "
3803                                           "by another VFIO driver");
3804 #ifdef CONFIG_IOMMUFD
3805     object_class_property_set_description(klass, /* 9.0 */
3806                                           "iommufd",
3807                                           "Set host IOMMUFD backend device");
3808 #endif
3809     object_class_property_set_description(klass, /* 9.1 */
3810                                           "x-device-dirty-page-tracking",
3811                                           "Disable device dirty page tracking and use "
3812                                           "container-based dirty page tracking");
3813     object_class_property_set_description(klass, /* 9.1 */
3814                                           "migration-events",
3815                                           "Emit VFIO migration QAPI event when a VFIO device "
3816                                           "changes its migration state. For management applications");
3817     object_class_property_set_description(klass, /* 9.1 */
3818                                           "skip-vsc-check",
3819                                           "Skip config space check for Vendor Specific Capability. "
3820                                           "Setting to false will enforce strict checking of VSC content "
3821                                           "(DEBUG)");
3822     object_class_property_set_description(klass, /* 10.0 */
3823                                           "x-migration-multifd-transfer",
3824                                           "Transfer this device state via "
3825                                           "multifd channels when live migrating it");
3826     object_class_property_set_description(klass, /* 10.1 */
3827                                           "x-migration-load-config-after-iter",
3828                                           "Start the config load only after "
3829                                           "all iterables were loaded (during "
3830                                           "non-iterables loading phase) when "
3831                                           "doing live migration of device state "
3832                                           "via multifd channels");
3833     object_class_property_set_description(klass, /* 10.1 */
3834                                           "x-migration-max-queued-buffers-size",
3835                                           "Maximum size of in-flight VFIO "
3836                                           "device state buffers queued at the "
3837                                           "destination when doing live "
3838                                           "migration of device state via "
3839                                           "multifd channels");
3840 }
3841 
3842 static const TypeInfo vfio_pci_dev_info = {
3843     .name = TYPE_VFIO_PCI,
3844     .parent = TYPE_VFIO_PCI_BASE,
3845     .class_init = vfio_pci_dev_class_init,
3846     .instance_init = vfio_instance_init,
3847     .instance_finalize = vfio_instance_finalize,
3848 };
3849 
3850 static const Property vfio_pci_dev_nohotplug_properties[] = {
3851     DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
3852     DEFINE_PROP_BOOL("use-legacy-x86-rom", VFIOPCIDevice,
3853                      use_legacy_x86_rom, false),
3854     DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate,
3855                             ON_OFF_AUTO_AUTO),
3856 };
3857 
3858 static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass,
3859                                               const void *data)
3860 {
3861     DeviceClass *dc = DEVICE_CLASS(klass);
3862 
3863     device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
3864     dc->hotpluggable = false;
3865 
3866     object_class_property_set_description(klass, /* 3.1 */
3867                                           "ramfb",
3868                                           "Enable ramfb to provide pre-boot graphics for devices "
3869                                           "enabling display option");
3870     object_class_property_set_description(klass, /* 8.2 */
3871                                           "x-ramfb-migrate",
3872                                           "Override default migration support for ramfb support "
3873                                           "(DEBUG)");
3874 }
3875 
3876 static const TypeInfo vfio_pci_nohotplug_dev_info = {
3877     .name = TYPE_VFIO_PCI_NOHOTPLUG,
3878     .parent = TYPE_VFIO_PCI,
3879     .instance_size = sizeof(VFIOPCIDevice),
3880     .class_init = vfio_pci_nohotplug_dev_class_init,
3881 };
3882 
3883 static void register_vfio_pci_dev_type(void)
3884 {
3885     /*
3886      * Ordinary ON_OFF_AUTO property isn't runtime-mutable, but source VM can
3887      * run for a long time before being migrated so it is desirable to have a
3888      * fallback mechanism to the old way of transferring VFIO device state if
3889      * it turns to be necessary.
3890      * The following makes this type of property have the same mutability level
3891      * as ordinary migration parameters.
3892      */
3893     vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto;
3894     vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true;
3895 
3896     type_register_static(&vfio_pci_base_dev_info);
3897     type_register_static(&vfio_pci_dev_info);
3898     type_register_static(&vfio_pci_nohotplug_dev_info);
3899 }
3900 
3901 type_init(register_vfio_pci_dev_type)
3902