xref: /openbmc/qemu/hw/vfio/pci.c (revision 6071d13c6a37493a6b26e1609b09a98aa058038a)
1 /*
2  * vfio based device assignment support
3  *
4  * Copyright Red Hat, Inc. 2012
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Based on qemu-kvm device-assignment:
13  *  Adapted for KVM by Qumranet.
14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19  */
20 
21 #include "qemu/osdep.h"
22 #include CONFIG_DEVICES /* CONFIG_IOMMUFD */
23 #include <linux/vfio.h>
24 #include <sys/ioctl.h>
25 
26 #include "hw/hw.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/pci/pci_bridge.h"
30 #include "hw/qdev-properties.h"
31 #include "hw/qdev-properties-system.h"
32 #include "hw/vfio/vfio-cpr.h"
33 #include "migration/vmstate.h"
34 #include "migration/cpr.h"
35 #include "qobject/qdict.h"
36 #include "qemu/error-report.h"
37 #include "qemu/main-loop.h"
38 #include "qemu/module.h"
39 #include "qemu/range.h"
40 #include "qemu/units.h"
41 #include "system/kvm.h"
42 #include "system/runstate.h"
43 #include "pci.h"
44 #include "trace.h"
45 #include "qapi/error.h"
46 #include "migration/blocker.h"
47 #include "migration/qemu-file.h"
48 #include "system/iommufd.h"
49 #include "vfio-migration-internal.h"
50 #include "vfio-helpers.h"
51 
52 /*
 * Protected by BQL.
 *
 * Pending KVM route change shared by the MSI/MSI-X code in this file: it is
 * assigned from kvm_irqchip_begin_route_changes(), handed to
 * kvm_irqchip_add_msi_route() and completed with
 * kvm_irqchip_commit_route_changes().
 */
53 static KVMRouteChange vfio_route_change;
54 
/* Forward declarations of static helpers that are used before they are defined. */
55 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
56 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
57 static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
58 
59 /* Create new or reuse existing eventfd */
60 static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
61                                const char *name, int nr, Error **errp)
62 {
63     int fd, ret;
64 
65     fd = vfio_cpr_load_vector_fd(vdev, name, nr);
66     if (fd >= 0) {
67         event_notifier_init_fd(e, fd);
68         return true;
69     }
70 
71     ret = event_notifier_init(e, 0);
72     if (ret) {
73         error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name);
74         return false;
75     }
76 
77     fd = event_notifier_get_fd(e);
78     vfio_cpr_save_vector_fd(vdev, name, nr, fd);
79     return true;
80 }
81 
82 static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e,
83                                   const char *name, int nr)
84 {
85     vfio_cpr_delete_vector_fd(vdev, name, nr);
86     event_notifier_cleanup(e);
87 }
88 
89 /*
90  * Disabling BAR mmaping can be slow, but toggling it around INTx can
91  * also be a huge overhead.  We try to get the best of both worlds by
92  * waiting until an interrupt to disable mmaps (subsequent transitions
93  * to the same state are effectively no overhead).  If the interrupt has
94  * been serviced and the time gap is long enough, we re-enable mmaps for
95  * performance.  This works well for things like graphics cards, which
96  * may not use their interrupt at all and are penalized to an unusable
97  * level by read/write BAR traps.  Other devices, like NICs, have more
98  * regular interrupts and see much better latency by staying in non-mmap
99  * mode.  We therefore set the default mmap_timeout such that a ping
100  * is just enough to keep the mmap disabled.  Users can experiment with
101  * other options with the x-intx-mmap-timeout-ms parameter (a value of
102  * zero disables the timer).
103  */
104 static void vfio_intx_mmap_enable(void *opaque)
105 {
106     VFIOPCIDevice *vdev = opaque;
107 
108     if (vdev->intx.pending) {
109         timer_mod(vdev->intx.mmap_timer,
110                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
111         return;
112     }
113 
114     vfio_mmap_set_enabled(vdev, true);
115 }
116 
117 static void vfio_intx_interrupt(void *opaque)
118 {
119     VFIOPCIDevice *vdev = opaque;
120 
121     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
122         return;
123     }
124 
125     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
126 
127     vdev->intx.pending = true;
128     pci_irq_assert(&vdev->pdev);
129     vfio_mmap_set_enabled(vdev, false);
130     if (vdev->intx.mmap_timeout) {
131         timer_mod(vdev->intx.mmap_timer,
132                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
133     }
134 }
135 
136 void vfio_pci_intx_eoi(VFIODevice *vbasedev)
137 {
138     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
139 
140     if (!vdev->intx.pending) {
141         return;
142     }
143 
144     trace_vfio_pci_intx_eoi(vbasedev->name);
145 
146     vdev->intx.pending = false;
147     pci_irq_deassert(&vdev->pdev);
148     vfio_device_irq_unmask(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
149 }
150 
151 static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
152 {
153 #ifdef CONFIG_KVM
154     int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
155 
156     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
157         vdev->intx.route.mode != PCI_INTX_ENABLED ||
158         !kvm_resamplefds_enabled()) {
159         return true;
160     }
161 
162     /* Get to a known interrupt state */
163     qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
164     vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
165     vdev->intx.pending = false;
166     pci_irq_deassert(&vdev->pdev);
167 
168     /* Get an eventfd for resample/unmask */
169     if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
170         goto fail;
171     }
172 
173     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
174                                            &vdev->intx.interrupt,
175                                            &vdev->intx.unmask,
176                                            vdev->intx.route.irq)) {
177         error_setg_errno(errp, errno, "failed to setup resample irqfd");
178         goto fail_irqfd;
179     }
180 
181     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
182                                        VFIO_IRQ_SET_ACTION_UNMASK,
183                                        event_notifier_get_fd(&vdev->intx.unmask),
184                                        errp)) {
185         goto fail_vfio;
186     }
187 
188     /* Let'em rip */
189     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
190 
191     vdev->intx.kvm_accel = true;
192 
193     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
194 
195     return true;
196 
197 fail_vfio:
198     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
199                                           vdev->intx.route.irq);
200 fail_irqfd:
201     vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
202 fail:
203     qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
204     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
205     return false;
206 #else
207     return true;
208 #endif
209 }
210 
211 static bool vfio_cpr_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
212 {
213 #ifdef CONFIG_KVM
214     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
215         vdev->intx.route.mode != PCI_INTX_ENABLED ||
216         !kvm_resamplefds_enabled()) {
217         return true;
218     }
219 
220     if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
221         return false;
222     }
223 
224     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
225                                            &vdev->intx.interrupt,
226                                            &vdev->intx.unmask,
227                                            vdev->intx.route.irq)) {
228         error_setg_errno(errp, errno, "failed to setup resample irqfd");
229         vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
230         return false;
231     }
232 
233     vdev->intx.kvm_accel = true;
234     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
235     return true;
236 #else
237     return true;
238 #endif
239 }
240 
241 static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
242 {
243 #ifdef CONFIG_KVM
244     if (!vdev->intx.kvm_accel) {
245         return;
246     }
247 
248     /*
249      * Get to a known state, hardware masked, QEMU ready to accept new
250      * interrupts, QEMU IRQ de-asserted.
251      */
252     vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
253     vdev->intx.pending = false;
254     pci_irq_deassert(&vdev->pdev);
255 
256     /* Tell KVM to stop listening for an INTx irqfd */
257     if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
258                                               vdev->intx.route.irq)) {
259         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
260     }
261 
262     /* We only need to close the eventfd for VFIO to cleanup the kernel side */
263     vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
264 
265     /* QEMU starts listening for interrupt events. */
266     qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
267                         vfio_intx_interrupt, NULL, vdev);
268 
269     vdev->intx.kvm_accel = false;
270 
271     /* If we've missed an event, let it re-fire through QEMU */
272     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
273 
274     trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
275 #endif
276 }
277 
278 static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
279 {
280     Error *err = NULL;
281 
282     trace_vfio_intx_update(vdev->vbasedev.name,
283                            vdev->intx.route.irq, route->irq);
284 
285     vfio_intx_disable_kvm(vdev);
286 
287     vdev->intx.route = *route;
288 
289     if (route->mode != PCI_INTX_ENABLED) {
290         return;
291     }
292 
293     if (!vfio_intx_enable_kvm(vdev, &err)) {
294         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
295     }
296 
297     /* Re-enable the interrupt in cased we missed an EOI */
298     vfio_pci_intx_eoi(&vdev->vbasedev);
299 }
300 
301 static void vfio_intx_routing_notifier(PCIDevice *pdev)
302 {
303     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
304     PCIINTxRoute route;
305 
306     if (vdev->interrupt != VFIO_INT_INTx) {
307         return;
308     }
309 
310     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
311 
312     if (pci_intx_route_changed(&vdev->intx.route, &route)) {
313         vfio_intx_update(vdev, &route);
314     }
315 }
316 
317 static void vfio_irqchip_change(Notifier *notify, void *data)
318 {
319     VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
320                                        irqchip_change_notifier);
321 
322     vfio_intx_update(vdev, &vdev->intx.route);
323 }
324 
325 static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
326 {
327     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
328     Error *err = NULL;
329     int32_t fd;
330 
331 
332     if (!pin) {
333         return true;
334     }
335 
336     /*
337      * Do not alter interrupt state during vfio_realize and cpr load.
338      * The incoming state is cleared thereafter.
339      */
340     if (!cpr_is_incoming()) {
341         vfio_disable_interrupts(vdev);
342     }
343 
344     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
345     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
346 
347 #ifdef CONFIG_KVM
348     /*
349      * Only conditional to avoid generating error messages on platforms
350      * where we won't actually use the result anyway.
351      */
352     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
353         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
354                                                         vdev->intx.pin);
355     }
356 #endif
357 
358     if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0,
359                             errp)) {
360         return false;
361     }
362     fd = event_notifier_get_fd(&vdev->intx.interrupt);
363     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
364 
365 
366     if (cpr_is_incoming()) {
367         if (!vfio_cpr_intx_enable_kvm(vdev, &err)) {
368             warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
369         }
370         goto skip_signaling;
371     }
372 
373     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
374                                 VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
375         qemu_set_fd_handler(fd, NULL, NULL, vdev);
376         vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
377         return false;
378     }
379 
380     if (!vfio_intx_enable_kvm(vdev, &err)) {
381         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
382     }
383 
384 skip_signaling:
385     vdev->interrupt = VFIO_INT_INTx;
386 
387     trace_vfio_intx_enable(vdev->vbasedev.name);
388     return true;
389 }
390 
391 static void vfio_intx_disable(VFIOPCIDevice *vdev)
392 {
393     int fd;
394 
395     timer_del(vdev->intx.mmap_timer);
396     vfio_intx_disable_kvm(vdev);
397     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
398     vdev->intx.pending = false;
399     pci_irq_deassert(&vdev->pdev);
400     vfio_mmap_set_enabled(vdev, true);
401 
402     fd = event_notifier_get_fd(&vdev->intx.interrupt);
403     qemu_set_fd_handler(fd, NULL, NULL, vdev);
404     vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
405 
406     vdev->interrupt = VFIO_INT_NONE;
407 
408     trace_vfio_intx_disable(vdev->vbasedev.name);
409 }
410 
411 bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp)
412 {
413     return vfio_intx_enable(vdev, errp);
414 }
415 
416 /*
417  * MSI/X
418  */
419 static void vfio_msi_interrupt(void *opaque)
420 {
421     VFIOMSIVector *vector = opaque;
422     VFIOPCIDevice *vdev = vector->vdev;
423     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
424     void (*notify)(PCIDevice *dev, unsigned vector);
425     MSIMessage msg;
426     int nr = vector - vdev->msi_vectors;
427 
428     if (!event_notifier_test_and_clear(&vector->interrupt)) {
429         return;
430     }
431 
432     if (vdev->interrupt == VFIO_INT_MSIX) {
433         get_msg = msix_get_message;
434         notify = msix_notify;
435 
436         /* A masked vector firing needs to use the PBA, enable it */
437         if (msix_is_masked(&vdev->pdev, nr)) {
438             set_bit(nr, vdev->msix->pending);
439             memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
440             trace_vfio_msix_pba_enable(vdev->vbasedev.name);
441         }
442     } else if (vdev->interrupt == VFIO_INT_MSI) {
443         get_msg = msi_get_message;
444         notify = msi_notify;
445     } else {
446         abort();
447     }
448 
449     msg = get_msg(&vdev->pdev, nr);
450     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
451     notify(&vdev->pdev, nr);
452 }
453 
454 void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr)
455 {
456     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
457     int fd = event_notifier_get_fd(&vector->interrupt);
458 
459     qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
460 }
461 
462 /*
463  * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid
464  * fd to kernel.
465  */
466 static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
467 {
468     g_autofree struct vfio_irq_set *irq_set = NULL;
469     int argsz;
470     int32_t *fd;
471 
472     argsz = sizeof(*irq_set) + sizeof(*fd);
473 
474     irq_set = g_malloc0(argsz);
475     irq_set->argsz = argsz;
476     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
477                      VFIO_IRQ_SET_ACTION_TRIGGER;
478     irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
479     irq_set->start = 0;
480     irq_set->count = 1;
481     fd = (int32_t *)&irq_set->data;
482     *fd = -1;
483 
484     return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
485 }
486 
487 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
488 {
489     struct vfio_irq_set *irq_set;
490     int ret = 0, i, argsz;
491     int32_t *fds;
492 
493     /*
494      * If dynamic MSI-X allocation is supported, the vectors to be allocated
495      * and enabled can be scattered. Before kernel enabling MSI-X, setting
496      * nr_vectors causes all these vectors to be allocated on host.
497      *
498      * To keep allocation as needed, use vector 0 with an invalid fd to get
499      * MSI-X enabled first, then set vectors with a potentially sparse set of
500      * eventfds to enable interrupts only when enabled in guest.
501      */
502     if (msix && !vdev->msix->noresize) {
503         ret = vfio_enable_msix_no_vec(vdev);
504 
505         if (ret) {
506             return ret;
507         }
508     }
509 
510     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
511 
512     irq_set = g_malloc0(argsz);
513     irq_set->argsz = argsz;
514     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
515     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
516     irq_set->start = 0;
517     irq_set->count = vdev->nr_vectors;
518     fds = (int32_t *)&irq_set->data;
519 
520     for (i = 0; i < vdev->nr_vectors; i++) {
521         int fd = -1;
522 
523         /*
524          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
525          * bits, therefore we always use the KVM signaling path when setup.
526          * MSI-X mask and pending bits are emulated, so we want to use the
527          * KVM signaling path only when configured and unmasked.
528          */
529         if (vdev->msi_vectors[i].use) {
530             if (vdev->msi_vectors[i].virq < 0 ||
531                 (msix && msix_is_masked(&vdev->pdev, i))) {
532                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
533             } else {
534                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
535             }
536         }
537 
538         fds[i] = fd;
539     }
540 
541     ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
542 
543     g_free(irq_set);
544 
545     return ret;
546 }
547 
548 void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
549                                int vector_n, bool msix)
550 {
551     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
552         return;
553     }
554 
555     vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change,
556                                              vector_n, &vdev->pdev);
557 }
558 
559 static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr)
560 {
561     const char *name = "kvm_interrupt";
562 
563     if (vector->virq < 0) {
564         return;
565     }
566 
567     if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr,
568                             NULL)) {
569         goto fail_notifier;
570     }
571 
572     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
573                                            NULL, vector->virq) < 0) {
574         goto fail_kvm;
575     }
576 
577     return;
578 
579 fail_kvm:
580     vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr);
581 fail_notifier:
582     kvm_irqchip_release_virq(kvm_state, vector->virq);
583     vector->virq = -1;
584 }
585 
586 static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
587                                      int nr)
588 {
589     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
590                                           vector->virq);
591     kvm_irqchip_release_virq(kvm_state, vector->virq);
592     vector->virq = -1;
593     vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr);
594 }
595 
596 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
597                                      PCIDevice *pdev)
598 {
599     kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
600     kvm_irqchip_commit_routes(kvm_state);
601 }
602 
603 static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector,
604                                unsigned int nr)
605 {
606     Error *err = NULL;
607     int32_t fd;
608 
609     if (vector->virq >= 0) {
610         fd = event_notifier_get_fd(&vector->kvm_interrupt);
611     } else {
612         fd = event_notifier_get_fd(&vector->interrupt);
613     }
614 
615     if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
616                                        VFIO_IRQ_SET_ACTION_TRIGGER,
617                                        fd, &err)) {
618         error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
619     }
620 }
621 
622 void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr)
623 {
624     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
625     PCIDevice *pdev = &vdev->pdev;
626     Error *local_err = NULL;
627 
628     vector->vdev = vdev;
629     vector->virq = -1;
630     if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr,
631                             &local_err)) {
632         error_report_err(local_err);
633     }
634     vector->use = true;
635     if (vdev->interrupt == VFIO_INT_MSIX) {
636         msix_vector_use(pdev, nr);
637     }
638 }
639 
640 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
641                                    MSIMessage *msg, IOHandler *handler)
642 {
643     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
644     VFIOMSIVector *vector;
645     int ret;
646     bool resizing = !!(vdev->nr_vectors < nr + 1);
647 
648     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
649 
650     vector = &vdev->msi_vectors[nr];
651 
652     if (!vector->use) {
653         vfio_pci_vector_init(vdev, nr);
654     }
655 
656     qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
657                         handler, NULL, vector);
658 
659     /*
660      * Attempt to enable route through KVM irqchip,
661      * default to userspace handling if unavailable.
662      */
663     if (vector->virq >= 0) {
664         if (!msg) {
665             vfio_remove_kvm_msi_virq(vdev, vector, nr);
666         } else {
667             vfio_update_kvm_msi_virq(vector, *msg, pdev);
668         }
669     } else {
670         if (msg) {
671             if (vdev->defer_kvm_irq_routing) {
672                 vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
673             } else {
674                 vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
675                 vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
676                 kvm_irqchip_commit_route_changes(&vfio_route_change);
677                 vfio_connect_kvm_msi_virq(vector, nr);
678             }
679         }
680     }
681 
682     /*
683      * When dynamic allocation is not supported, we don't want to have the
684      * host allocate all possible MSI vectors for a device if they're not
685      * in use, so we shutdown and incrementally increase them as needed.
686      * nr_vectors represents the total number of vectors allocated.
687      *
688      * When dynamic allocation is supported, let the host only allocate
689      * and enable a vector when it is in use in guest. nr_vectors represents
690      * the upper bound of vectors being enabled (but not all of the ranges
691      * is allocated or enabled).
692      */
693     if (resizing) {
694         vdev->nr_vectors = nr + 1;
695     }
696 
697     if (!vdev->defer_kvm_irq_routing) {
698         if (vdev->msix->noresize && resizing) {
699             vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
700             ret = vfio_enable_vectors(vdev, true);
701             if (ret) {
702                 error_report("vfio: failed to enable vectors, %s",
703                              strerror(-ret));
704             }
705         } else {
706             set_irq_signalling(&vdev->vbasedev, vector, nr);
707         }
708     }
709 
710     /* Disable PBA emulation when nothing more is pending. */
711     clear_bit(nr, vdev->msix->pending);
712     if (find_first_bit(vdev->msix->pending,
713                        vdev->nr_vectors) == vdev->nr_vectors) {
714         memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
715         trace_vfio_msix_pba_disable(vdev->vbasedev.name);
716     }
717 
718     return 0;
719 }
720 
721 static int vfio_msix_vector_use(PCIDevice *pdev,
722                                 unsigned int nr, MSIMessage msg)
723 {
724     /*
725      * Ignore the callback from msix_set_vector_notifiers during resume.
726      * The necessary subset of these actions is called from
727      * vfio_cpr_claim_vectors during post load.
728      */
729     if (cpr_is_incoming()) {
730         return 0;
731     }
732 
733     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
734 }
735 
736 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
737 {
738     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
739     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
740 
741     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
742 
743     /*
744      * There are still old guests that mask and unmask vectors on every
745      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
746      * the KVM setup in place, simply switch VFIO to use the non-bypass
747      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
748      * core will mask the interrupt and set pending bits, allowing it to
749      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
750      */
751     if (vector->virq >= 0) {
752         int32_t fd = event_notifier_get_fd(&vector->interrupt);
753         Error *err = NULL;
754 
755         if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
756                                     nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
757                                     &err)) {
758             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
759         }
760     }
761 }
762 
763 void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev)
764 {
765     msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
766                               vfio_msix_vector_release, NULL);
767 }
768 
769 void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
770 {
771     assert(!vdev->defer_kvm_irq_routing);
772     vdev->defer_kvm_irq_routing = true;
773     vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
774 }
775 
776 void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
777 {
778     int i;
779 
780     assert(vdev->defer_kvm_irq_routing);
781     vdev->defer_kvm_irq_routing = false;
782 
783     kvm_irqchip_commit_route_changes(&vfio_route_change);
784 
785     for (i = 0; i < vdev->nr_vectors; i++) {
786         vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i);
787     }
788 }
789 
790 static void vfio_msix_enable(VFIOPCIDevice *vdev)
791 {
792     int ret;
793 
794     vfio_disable_interrupts(vdev);
795 
796     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
797 
798     vdev->interrupt = VFIO_INT_MSIX;
799 
800     /*
801      * Setting vector notifiers triggers synchronous vector-use
802      * callbacks for each active vector.  Deferring to commit the KVM
803      * routes once rather than per vector provides a substantial
804      * performance improvement.
805      */
806     vfio_pci_prepare_kvm_msi_virq_batch(vdev);
807 
808     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
809                                   vfio_msix_vector_release, NULL)) {
810         error_report("vfio: msix_set_vector_notifiers failed");
811     }
812 
813     vfio_pci_commit_kvm_msi_virq_batch(vdev);
814 
815     if (vdev->nr_vectors) {
816         ret = vfio_enable_vectors(vdev, true);
817         if (ret) {
818             error_report("vfio: failed to enable vectors, %s",
819                          strerror(-ret));
820         }
821     } else {
822         /*
823          * Some communication channels between VF & PF or PF & fw rely on the
824          * physical state of the device and expect that enabling MSI-X from the
825          * guest enables the same on the host.  When our guest is Linux, the
826          * guest driver call to pci_enable_msix() sets the enabling bit in the
827          * MSI-X capability, but leaves the vector table masked.  We therefore
828          * can't rely on a vector_use callback (from request_irq() in the guest)
829          * to switch the physical device into MSI-X mode because that may come a
830          * long time after pci_enable_msix().  This code sets vector 0 with an
831          * invalid fd to make the physical device MSI-X enabled, but with no
832          * vectors enabled, just like the guest view.
833          */
834         ret = vfio_enable_msix_no_vec(vdev);
835         if (ret) {
836             error_report("vfio: failed to enable MSI-X, %s",
837                          strerror(-ret));
838         }
839     }
840 
841     trace_vfio_msix_enable(vdev->vbasedev.name);
842 }
843 
844 static void vfio_msi_enable(VFIOPCIDevice *vdev)
845 {
846     int ret, i;
847 
848     vfio_disable_interrupts(vdev);
849 
850     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
851 retry:
852     /*
853      * Setting vector notifiers needs to enable route for each vector.
854      * Deferring to commit the KVM routes once rather than per vector
855      * provides a substantial performance improvement.
856      */
857     vfio_pci_prepare_kvm_msi_virq_batch(vdev);
858 
859     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
860 
861     for (i = 0; i < vdev->nr_vectors; i++) {
862         VFIOMSIVector *vector = &vdev->msi_vectors[i];
863         Error *local_err = NULL;
864 
865         vector->vdev = vdev;
866         vector->virq = -1;
867         vector->use = true;
868 
869         if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i,
870                                 &local_err)) {
871             error_report_err(local_err);
872         }
873 
874         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
875                             vfio_msi_interrupt, NULL, vector);
876 
877         /*
878          * Attempt to enable route through KVM irqchip,
879          * default to userspace handling if unavailable.
880          */
881         vfio_pci_add_kvm_msi_virq(vdev, vector, i, false);
882     }
883 
884     vfio_pci_commit_kvm_msi_virq_batch(vdev);
885 
886     /* Set interrupt type prior to possible interrupts */
887     vdev->interrupt = VFIO_INT_MSI;
888 
889     ret = vfio_enable_vectors(vdev, false);
890     if (ret) {
891         if (ret < 0) {
892             error_report("vfio: Error: Failed to setup MSI fds: %s",
893                          strerror(-ret));
894         } else {
895             error_report("vfio: Error: Failed to enable %d "
896                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
897         }
898 
899         vfio_msi_disable_common(vdev);
900 
901         if (ret > 0) {
902             vdev->nr_vectors = ret;
903             goto retry;
904         }
905 
906         /*
907          * Failing to setup MSI doesn't really fall within any specification.
908          * Let's try leaving interrupts disabled and hope the guest figures
909          * out to fall back to INTx for this device.
910          */
911         error_report("vfio: Error: Failed to enable MSI");
912 
913         return;
914     }
915 
916     trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
917 }
918 
919 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
920 {
921     int i;
922 
923     for (i = 0; i < vdev->nr_vectors; i++) {
924         VFIOMSIVector *vector = &vdev->msi_vectors[i];
925         if (vdev->msi_vectors[i].use) {
926             if (vector->virq >= 0) {
927                 vfio_remove_kvm_msi_virq(vdev, vector, i);
928             }
929             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
930                                 NULL, NULL, NULL);
931             vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i);
932         }
933     }
934 
935     g_free(vdev->msi_vectors);
936     vdev->msi_vectors = NULL;
937     vdev->nr_vectors = 0;
938     vdev->interrupt = VFIO_INT_NONE;
939 }
940 
941 static void vfio_msix_disable(VFIOPCIDevice *vdev)
942 {
943     Error *err = NULL;
944     int i;
945 
946     msix_unset_vector_notifiers(&vdev->pdev);
947 
948     /*
949      * MSI-X will only release vectors if MSI-X is still enabled on the
950      * device, check through the rest and release it ourselves if necessary.
951      */
952     for (i = 0; i < vdev->nr_vectors; i++) {
953         if (vdev->msi_vectors[i].use) {
954             vfio_msix_vector_release(&vdev->pdev, i);
955             msix_vector_unuse(&vdev->pdev, i);
956         }
957     }
958 
959     /*
960      * Always clear MSI-X IRQ index. A PF device could have enabled
961      * MSI-X with no vectors. See vfio_msix_enable().
962      */
963     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
964 
965     vfio_msi_disable_common(vdev);
966     if (!vfio_intx_enable(vdev, &err)) {
967         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
968     }
969 
970     memset(vdev->msix->pending, 0,
971            BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
972 
973     trace_vfio_msix_disable(vdev->vbasedev.name);
974 }
975 
976 static void vfio_msi_disable(VFIOPCIDevice *vdev)
977 {
978     Error *err = NULL;
979 
980     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
981     vfio_msi_disable_common(vdev);
982     vfio_intx_enable(vdev, &err);
983     if (err) {
984         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
985     }
986 
987     trace_vfio_msi_disable(vdev->vbasedev.name);
988 }
989 
990 static void vfio_update_msi(VFIOPCIDevice *vdev)
991 {
992     int i;
993 
994     for (i = 0; i < vdev->nr_vectors; i++) {
995         VFIOMSIVector *vector = &vdev->msi_vectors[i];
996         MSIMessage msg;
997 
998         if (!vector->use || vector->virq < 0) {
999             continue;
1000         }
1001 
1002         msg = msi_get_message(&vdev->pdev, i);
1003         vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
1004     }
1005 }
1006 
1007 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
1008 {
1009     VFIODevice *vbasedev = &vdev->vbasedev;
1010     struct vfio_region_info *reg_info = NULL;
1011     uint64_t size;
1012     off_t off = 0;
1013     ssize_t bytes;
1014     int ret;
1015 
1016     ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX,
1017                                       &reg_info);
1018 
1019     if (ret != 0) {
1020         error_report("vfio: Error getting ROM info: %s", strerror(-ret));
1021         return;
1022     }
1023 
1024     trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size,
1025                             (unsigned long)reg_info->offset,
1026                             (unsigned long)reg_info->flags);
1027 
1028     vdev->rom_size = size = reg_info->size;
1029     vdev->rom_offset = reg_info->offset;
1030 
1031     if (!vdev->rom_size) {
1032         vdev->rom_read_failed = true;
1033         error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name);
1034         error_printf("Device option ROM contents are probably invalid "
1035                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
1036                     "or load from file with romfile=\n");
1037         return;
1038     }
1039 
1040     vdev->rom = g_malloc(size);
1041     memset(vdev->rom, 0xff, size);
1042 
1043     while (size) {
1044         bytes = vbasedev->io_ops->region_read(vbasedev,
1045                                               VFIO_PCI_ROM_REGION_INDEX,
1046                                               off, size, vdev->rom + off);
1047 
1048         if (bytes == 0) {
1049             break;
1050         } else if (bytes > 0) {
1051             off += bytes;
1052             size -= bytes;
1053         } else {
1054             if (bytes == -EINTR || bytes == -EAGAIN) {
1055                 continue;
1056             }
1057             error_report("vfio: Error reading device ROM: %s",
1058                          strreaderror(bytes));
1059 
1060             break;
1061         }
1062     }
1063 
1064     /*
1065      * Test the ROM signature against our device, if the vendor is correct
1066      * but the device ID doesn't match, store the correct device ID and
1067      * recompute the checksum.  Intel IGD devices need this and are known
1068      * to have bogus checksums so we can't simply adjust the checksum.
1069      */
1070     if (pci_get_word(vdev->rom) == 0xaa55 &&
1071         pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
1072         !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
1073         uint16_t vid, did;
1074 
1075         vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
1076         did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);
1077 
1078         if (vid == vdev->vendor_id && did != vdev->device_id) {
1079             int i;
1080             uint8_t csum, *data = vdev->rom;
1081 
1082             pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
1083                          vdev->device_id);
1084             data[6] = 0;
1085 
1086             for (csum = 0, i = 0; i < vdev->rom_size; i++) {
1087                 csum += data[i];
1088             }
1089 
1090             data[6] = -csum;
1091         }
1092     }
1093 }
1094 
1095 /* "Raw" read of underlying config space. */
1096 static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
1097                                       uint32_t size, void *data)
1098 {
1099     return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev,
1100                                               VFIO_PCI_CONFIG_REGION_INDEX,
1101                                               offset, size, data);
1102 }
1103 
1104 /* "Raw" write of underlying config space. */
1105 static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
1106                                        uint32_t size, void *data)
1107 {
1108     return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
1109                                                VFIO_PCI_CONFIG_REGION_INDEX,
1110                                                offset, size, data, false);
1111 }
1112 
1113 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
1114 {
1115     VFIOPCIDevice *vdev = opaque;
1116     union {
1117         uint8_t byte;
1118         uint16_t word;
1119         uint32_t dword;
1120         uint64_t qword;
1121     } val;
1122     uint64_t data = 0;
1123 
1124     /* Load the ROM lazily when the guest tries to read it */
1125     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
1126         vfio_pci_load_rom(vdev);
1127     }
1128 
1129     memcpy(&val, vdev->rom + addr,
1130            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
1131 
1132     switch (size) {
1133     case 1:
1134         data = val.byte;
1135         break;
1136     case 2:
1137         data = le16_to_cpu(val.word);
1138         break;
1139     case 4:
1140         data = le32_to_cpu(val.dword);
1141         break;
1142     default:
1143         hw_error("vfio: unsupported read size, %d bytes\n", size);
1144         break;
1145     }
1146 
1147     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
1148 
1149     return data;
1150 }
1151 
1152 static void vfio_rom_write(void *opaque, hwaddr addr,
1153                            uint64_t data, unsigned size)
1154 {
1155 }
1156 
1157 static const MemoryRegionOps vfio_rom_ops = {
1158     .read = vfio_rom_read,
1159     .write = vfio_rom_write,
1160     .endianness = DEVICE_LITTLE_ENDIAN,
1161 };
1162 
1163 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
1164 {
1165     VFIODevice *vbasedev = &vdev->vbasedev;
1166     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
1167     char *name;
1168 
1169     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
1170         /* Since pci handles romfile, just print a message and return */
1171         if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) {
1172             warn_report("Device at %s is known to cause system instability"
1173                         " issues during option rom execution",
1174                         vdev->vbasedev.name);
1175             error_printf("Proceeding anyway since user specified romfile\n");
1176         }
1177         return;
1178     }
1179 
1180     /*
1181      * Use the same size ROM BAR as the physical device.  The contents
1182      * will get filled in later when the guest tries to read it.
1183      */
1184     if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 ||
1185         vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
1186         vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
1187         vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) {
1188 
1189         error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
1190         return;
1191     }
1192 
1193     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
1194 
1195     if (!size) {
1196         return;
1197     }
1198 
1199     if (vfio_opt_rom_in_denylist(vdev)) {
1200         if (vdev->pdev.rom_bar > 0) {
1201             warn_report("Device at %s is known to cause system instability"
1202                         " issues during option rom execution",
1203                         vdev->vbasedev.name);
1204             error_printf("Proceeding anyway since user specified"
1205                          " positive value for rombar\n");
1206         } else {
1207             warn_report("Rom loading for device at %s has been disabled"
1208                         " due to system instability issues",
1209                         vdev->vbasedev.name);
1210             error_printf("Specify rombar=1 or romfile to force\n");
1211             return;
1212         }
1213     }
1214 
1215     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
1216 
1217     name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);
1218 
1219     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
1220                           &vfio_rom_ops, vdev, name, size);
1221     g_free(name);
1222 
1223     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
1224                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
1225 
1226     vdev->rom_read_failed = false;
1227 }
1228 
1229 void vfio_vga_write(void *opaque, hwaddr addr,
1230                            uint64_t data, unsigned size)
1231 {
1232     VFIOVGARegion *region = opaque;
1233     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1234     union {
1235         uint8_t byte;
1236         uint16_t word;
1237         uint32_t dword;
1238         uint64_t qword;
1239     } buf;
1240     off_t offset = vga->fd_offset + region->offset + addr;
1241 
1242     switch (size) {
1243     case 1:
1244         buf.byte = data;
1245         break;
1246     case 2:
1247         buf.word = cpu_to_le16(data);
1248         break;
1249     case 4:
1250         buf.dword = cpu_to_le32(data);
1251         break;
1252     default:
1253         hw_error("vfio: unsupported write size, %d bytes", size);
1254         break;
1255     }
1256 
1257     if (pwrite(vga->fd, &buf, size, offset) != size) {
1258         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1259                      __func__, region->offset + addr, data, size);
1260     }
1261 
1262     trace_vfio_vga_write(region->offset + addr, data, size);
1263 }
1264 
1265 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1266 {
1267     VFIOVGARegion *region = opaque;
1268     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1269     union {
1270         uint8_t byte;
1271         uint16_t word;
1272         uint32_t dword;
1273         uint64_t qword;
1274     } buf;
1275     uint64_t data = 0;
1276     off_t offset = vga->fd_offset + region->offset + addr;
1277 
1278     if (pread(vga->fd, &buf, size, offset) != size) {
1279         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1280                      __func__, region->offset + addr, size);
1281         return (uint64_t)-1;
1282     }
1283 
1284     switch (size) {
1285     case 1:
1286         data = buf.byte;
1287         break;
1288     case 2:
1289         data = le16_to_cpu(buf.word);
1290         break;
1291     case 4:
1292         data = le32_to_cpu(buf.dword);
1293         break;
1294     default:
1295         hw_error("vfio: unsupported read size, %d bytes", size);
1296         break;
1297     }
1298 
1299     trace_vfio_vga_read(region->offset + addr, size, data);
1300 
1301     return data;
1302 }
1303 
1304 static const MemoryRegionOps vfio_vga_ops = {
1305     .read = vfio_vga_read,
1306     .write = vfio_vga_write,
1307     .endianness = DEVICE_LITTLE_ENDIAN,
1308 };
1309 
1310 /*
1311  * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page
1312  * size if the BAR is in an exclusive page in host so that we could map
1313  * this BAR to guest. But this sub-page BAR may not occupy an exclusive
1314  * page in guest. So we should set the priority of the expanded memory
1315  * region to zero in case of overlap with BARs which share the same page
1316  * with the sub-page BAR in guest. Besides, we should also recover the
1317  * size of this sub-page BAR when its base address is changed in guest
1318  * and not page aligned any more.
1319  */
1320 static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
1321 {
1322     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1323     VFIORegion *region = &vdev->bars[bar].region;
1324     MemoryRegion *mmap_mr, *region_mr, *base_mr;
1325     PCIIORegion *r;
1326     pcibus_t bar_addr;
1327     uint64_t size = region->size;
1328 
1329     /* Make sure that the whole region is allowed to be mmapped */
1330     if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
1331         region->mmaps[0].size != region->size) {
1332         return;
1333     }
1334 
1335     r = &pdev->io_regions[bar];
1336     bar_addr = r->addr;
1337     base_mr = vdev->bars[bar].mr;
1338     region_mr = region->mem;
1339     mmap_mr = &region->mmaps[0].mem;
1340 
1341     /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
1342     if (bar_addr != PCI_BAR_UNMAPPED &&
1343         !(bar_addr & ~qemu_real_host_page_mask())) {
1344         size = qemu_real_host_page_size();
1345     }
1346 
1347     memory_region_transaction_begin();
1348 
1349     if (vdev->bars[bar].size < size) {
1350         memory_region_set_size(base_mr, size);
1351     }
1352     memory_region_set_size(region_mr, size);
1353     memory_region_set_size(mmap_mr, size);
1354     if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
1355         memory_region_del_subregion(r->address_space, base_mr);
1356         memory_region_add_subregion_overlap(r->address_space,
1357                                             bar_addr, base_mr, 0);
1358     }
1359 
1360     memory_region_transaction_commit();
1361 }
1362 
1363 /*
1364  * PCI config space
1365  */
1366 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1367 {
1368     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1369     VFIODevice *vbasedev = &vdev->vbasedev;
1370     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1371 
1372     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1373     emu_bits = le32_to_cpu(emu_bits);
1374 
1375     if (emu_bits) {
1376         emu_val = pci_default_read_config(pdev, addr, len);
1377     }
1378 
1379     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1380         ssize_t ret;
1381 
1382         ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val);
1383         if (ret != len) {
1384             error_report("%s(%s, 0x%x, 0x%x) failed: %s",
1385                          __func__, vbasedev->name, addr, len,
1386                          strreaderror(ret));
1387             return -1;
1388         }
1389         phys_val = le32_to_cpu(phys_val);
1390     }
1391 
1392     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1393 
1394     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1395 
1396     return val;
1397 }
1398 
1399 void vfio_pci_write_config(PCIDevice *pdev,
1400                            uint32_t addr, uint32_t val, int len)
1401 {
1402     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1403     VFIODevice *vbasedev = &vdev->vbasedev;
1404     uint32_t val_le = cpu_to_le32(val);
1405     int ret;
1406 
1407     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1408 
1409     /* Write everything to VFIO, let it filter out what we can't write */
1410     ret = vfio_pci_config_space_write(vdev, addr, len, &val_le);
1411     if (ret != len) {
1412         error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
1413                      __func__, vbasedev->name, addr, val, len,
1414                     strwriteerror(ret));
1415     }
1416 
1417     /* MSI/MSI-X Enabling/Disabling */
1418     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1419         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1420         int is_enabled, was_enabled = msi_enabled(pdev);
1421 
1422         pci_default_write_config(pdev, addr, val, len);
1423 
1424         is_enabled = msi_enabled(pdev);
1425 
1426         if (!was_enabled) {
1427             if (is_enabled) {
1428                 vfio_msi_enable(vdev);
1429             }
1430         } else {
1431             if (!is_enabled) {
1432                 vfio_msi_disable(vdev);
1433             } else {
1434                 vfio_update_msi(vdev);
1435             }
1436         }
1437     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1438         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1439         int is_enabled, was_enabled = msix_enabled(pdev);
1440 
1441         pci_default_write_config(pdev, addr, val, len);
1442 
1443         is_enabled = msix_enabled(pdev);
1444 
1445         if (!was_enabled && is_enabled) {
1446             vfio_msix_enable(vdev);
1447         } else if (was_enabled && !is_enabled) {
1448             vfio_msix_disable(vdev);
1449         }
1450     } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
1451         range_covers_byte(addr, len, PCI_COMMAND)) {
1452         pcibus_t old_addr[PCI_NUM_REGIONS - 1];
1453         int bar;
1454 
1455         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1456             old_addr[bar] = pdev->io_regions[bar].addr;
1457         }
1458 
1459         pci_default_write_config(pdev, addr, val, len);
1460 
1461         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1462             if (old_addr[bar] != pdev->io_regions[bar].addr &&
1463                 vdev->bars[bar].region.size > 0 &&
1464                 vdev->bars[bar].region.size < qemu_real_host_page_size()) {
1465                 vfio_sub_page_bar_update_mapping(pdev, bar);
1466             }
1467         }
1468     } else {
1469         /* Write everything to QEMU to keep emulated bits correct */
1470         pci_default_write_config(pdev, addr, val, len);
1471     }
1472 }
1473 
1474 /*
1475  * Interrupt setup
1476  */
1477 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1478 {
1479     /*
1480      * More complicated than it looks.  Disabling MSI/X transitions the
1481      * device to INTx mode (if supported).  Therefore we need to first
1482      * disable MSI/X and then cleanup by disabling INTx.
1483      */
1484     if (vdev->interrupt == VFIO_INT_MSIX) {
1485         vfio_msix_disable(vdev);
1486     } else if (vdev->interrupt == VFIO_INT_MSI) {
1487         vfio_msi_disable(vdev);
1488     }
1489 
1490     if (vdev->interrupt == VFIO_INT_INTx) {
1491         vfio_intx_disable(vdev);
1492     }
1493 }
1494 
1495 static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1496 {
1497     uint16_t ctrl;
1498     bool msi_64bit, msi_maskbit;
1499     int ret, entries;
1500     Error *err = NULL;
1501 
1502     ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS,
1503                                      sizeof(ctrl), &ctrl);
1504     if (ret != sizeof(ctrl)) {
1505         error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s",
1506                    strreaderror(ret));
1507         return false;
1508     }
1509     ctrl = le16_to_cpu(ctrl);
1510 
1511     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1512     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1513     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1514 
1515     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1516 
1517     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
1518     if (ret < 0) {
1519         if (ret == -ENOTSUP) {
1520             return true;
1521         }
1522         error_propagate_prepend(errp, err, "msi_init failed: ");
1523         return false;
1524     }
1525     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
1526 
1527     return true;
1528 }
1529 
1530 static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
1531 {
1532     off_t start, end;
1533     VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
1534 
1535     /*
1536      * If the host driver allows mapping of a MSIX data, we are going to
1537      * do map the entire BAR and emulate MSIX table on top of that.
1538      */
1539     if (vfio_device_has_region_cap(&vdev->vbasedev, region->nr,
1540                                    VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
1541         return;
1542     }
1543 
1544     /*
1545      * We expect to find a single mmap covering the whole BAR, anything else
1546      * means it's either unsupported or already setup.
1547      */
1548     if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
1549         region->size != region->mmaps[0].size) {
1550         return;
1551     }
1552 
1553     /* MSI-X table start and end aligned to host page size */
1554     start = vdev->msix->table_offset & qemu_real_host_page_mask();
1555     end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
1556                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
1557 
1558     /*
1559      * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
1560      * NB - Host page size is necessarily a power of two and so is the PCI
1561      * BAR (not counting EA yet), therefore if we have host page aligned
1562      * @start and @end, then any remainder of the BAR before or after those
1563      * must be at least host page sized and therefore mmap'able.
1564      */
1565     if (!start) {
1566         if (end >= region->size) {
1567             region->nr_mmaps = 0;
1568             g_free(region->mmaps);
1569             region->mmaps = NULL;
1570             trace_vfio_msix_fixup(vdev->vbasedev.name,
1571                                   vdev->msix->table_bar, 0, 0);
1572         } else {
1573             region->mmaps[0].offset = end;
1574             region->mmaps[0].size = region->size - end;
1575             trace_vfio_msix_fixup(vdev->vbasedev.name,
1576                               vdev->msix->table_bar, region->mmaps[0].offset,
1577                               region->mmaps[0].offset + region->mmaps[0].size);
1578         }
1579 
1580     /* Maybe it's aligned at the end of the BAR */
1581     } else if (end >= region->size) {
1582         region->mmaps[0].size = start;
1583         trace_vfio_msix_fixup(vdev->vbasedev.name,
1584                               vdev->msix->table_bar, region->mmaps[0].offset,
1585                               region->mmaps[0].offset + region->mmaps[0].size);
1586 
1587     /* Otherwise it must split the BAR */
1588     } else {
1589         region->nr_mmaps = 2;
1590         region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
1591 
1592         memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));
1593 
1594         region->mmaps[0].size = start;
1595         trace_vfio_msix_fixup(vdev->vbasedev.name,
1596                               vdev->msix->table_bar, region->mmaps[0].offset,
1597                               region->mmaps[0].offset + region->mmaps[0].size);
1598 
1599         region->mmaps[1].offset = end;
1600         region->mmaps[1].size = region->size - end;
1601         trace_vfio_msix_fixup(vdev->vbasedev.name,
1602                               vdev->msix->table_bar, region->mmaps[1].offset,
1603                               region->mmaps[1].offset + region->mmaps[1].size);
1604     }
1605 }
1606 
1607 static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
1608 {
1609     int target_bar = -1;
1610     size_t msix_sz;
1611 
1612     if (!vdev->msix || vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1613         return true;
1614     }
1615 
1616     /* The actual minimum size of MSI-X structures */
1617     msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
1618               (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
1619     /* Round up to host pages, we don't want to share a page */
1620     msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
1621     /* PCI BARs must be a power of 2 */
1622     msix_sz = pow2ceil(msix_sz);
1623 
1624     if (vdev->msix_relo == OFF_AUTO_PCIBAR_AUTO) {
1625         /*
1626          * TODO: Lookup table for known devices.
1627          *
1628          * Logically we might use an algorithm here to select the BAR adding
1629          * the least additional MMIO space, but we cannot programmatically
1630          * predict the driver dependency on BAR ordering or sizing, therefore
1631          * 'auto' becomes a lookup for combinations reported to work.
1632          */
1633         if (target_bar < 0) {
1634             error_setg(errp, "No automatic MSI-X relocation available for "
1635                        "device %04x:%04x", vdev->vendor_id, vdev->device_id);
1636             return false;
1637         }
1638     } else {
1639         target_bar = (int)(vdev->msix_relo - OFF_AUTO_PCIBAR_BAR0);
1640     }
1641 
1642     /* I/O port BARs cannot host MSI-X structures */
1643     if (vdev->bars[target_bar].ioport) {
1644         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1645                    "I/O port BAR", target_bar);
1646         return false;
1647     }
1648 
1649     /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
1650     if (!vdev->bars[target_bar].size &&
1651          target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
1652         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1653                    "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
1654         return false;
1655     }
1656 
1657     /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
1658     if (vdev->bars[target_bar].size > 1 * GiB &&
1659         !vdev->bars[target_bar].mem64) {
1660         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1661                    "no space to extend 32-bit BAR", target_bar);
1662         return false;
1663     }
1664 
1665     /*
1666      * If adding a new BAR, test if we can make it 64bit.  We make it
1667      * prefetchable since QEMU MSI-X emulation has no read side effects
1668      * and doing so makes mapping more flexible.
1669      */
1670     if (!vdev->bars[target_bar].size) {
1671         if (target_bar < (PCI_ROM_SLOT - 1) &&
1672             !vdev->bars[target_bar + 1].size) {
1673             vdev->bars[target_bar].mem64 = true;
1674             vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
1675         }
1676         vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
1677         vdev->bars[target_bar].size = msix_sz;
1678         vdev->msix->table_offset = 0;
1679     } else {
1680         vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
1681                                           msix_sz * 2);
1682         /*
1683          * Due to above size calc, MSI-X always starts halfway into the BAR,
1684          * which will always be a separate host page.
1685          */
1686         vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
1687     }
1688 
1689     vdev->msix->table_bar = target_bar;
1690     vdev->msix->pba_bar = target_bar;
1691     /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
1692     vdev->msix->pba_offset = vdev->msix->table_offset +
1693                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
1694 
1695     trace_vfio_msix_relo(vdev->vbasedev.name,
1696                          vdev->msix->table_bar, vdev->msix->table_offset);
1697     return true;
1698 }
1699 
1700 /*
1701  * We don't have any control over how pci_add_capability() inserts
1702  * capabilities into the chain.  In order to setup MSI-X we need a
1703  * MemoryRegion for the BAR.  In order to setup the BAR and not
1704  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1705  * need to first look for where the MSI-X table lives.  So we
1706  * unfortunately split MSI-X setup across two functions.
1707  */
1708 static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
1709 {
1710     uint8_t pos;
1711     uint16_t ctrl;
1712     uint32_t table, pba;
1713     struct vfio_irq_info irq_info;
1714     VFIOMSIXInfo *msix;
1715     int ret;
1716 
1717     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
1718     if (!pos) {
1719         return true;
1720     }
1721 
1722     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS,
1723                                      sizeof(ctrl), &ctrl);
1724     if (ret != sizeof(ctrl)) {
1725         error_setg(errp, "failed to read PCI MSIX FLAGS: %s",
1726                    strreaderror(ret));
1727         return false;
1728     }
1729 
1730     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE,
1731                                      sizeof(table), &table);
1732     if (ret != sizeof(table)) {
1733         error_setg(errp, "failed to read PCI MSIX TABLE: %s",
1734                    strreaderror(ret));
1735         return false;
1736     }
1737 
1738     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA,
1739                                      sizeof(pba), &pba);
1740     if (ret != sizeof(pba)) {
1741         error_setg(errp, "failed to read PCI MSIX PBA: %s", strreaderror(ret));
1742         return false;
1743     }
1744 
1745     ctrl = le16_to_cpu(ctrl);
1746     table = le32_to_cpu(table);
1747     pba = le32_to_cpu(pba);
1748 
1749     msix = g_malloc0(sizeof(*msix));
1750     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
1751     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
1752     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
1753     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
1754     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
1755 
1756     ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
1757                                    &irq_info);
1758     if (ret < 0) {
1759         error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
1760         g_free(msix);
1761         return false;
1762     }
1763 
1764     msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
1765 
1766     /*
1767      * Test the size of the pba_offset variable and catch if it extends outside
1768      * of the specified BAR. If it is the case, we need to apply a hardware
1769      * specific quirk if the device is known or we have a broken configuration.
1770      */
1771     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
1772         /*
1773          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
1774          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
1775          * the VF PBA offset while the BAR itself is only 8k. The correct value
1776          * is 0x1000, so we hard code that here.
1777          */
1778         if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
1779             (vdev->device_id & 0xff00) == 0x5800) {
1780             msix->pba_offset = 0x1000;
1781         /*
1782          * BAIDU KUNLUN Virtual Function devices for KUNLUN AI processor
1783          * return an incorrect value of 0x460000 for the VF PBA offset while
1784          * the BAR itself is only 0x10000.  The correct value is 0xb400.
1785          */
1786         } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU,
1787                                PCI_DEVICE_ID_KUNLUN_VF)) {
1788             msix->pba_offset = 0xb400;
1789         } else if (vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1790             error_setg(errp, "hardware reports invalid configuration, "
1791                        "MSIX PBA outside of specified BAR");
1792             g_free(msix);
1793             return false;
1794         }
1795     }
1796 
1797     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
1798                                 msix->table_offset, msix->entries,
1799                                 msix->noresize);
1800     vdev->msix = msix;
1801 
1802     vfio_pci_fixup_msix_region(vdev);
1803 
1804     return vfio_pci_relocate_msix(vdev, errp);
1805 }
1806 
1807 static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1808 {
1809     int ret;
1810     Error *err = NULL;
1811 
1812     vdev->msix->pending = g_new0(unsigned long,
1813                                  BITS_TO_LONGS(vdev->msix->entries));
1814     ret = msix_init(&vdev->pdev, vdev->msix->entries,
1815                     vdev->bars[vdev->msix->table_bar].mr,
1816                     vdev->msix->table_bar, vdev->msix->table_offset,
1817                     vdev->bars[vdev->msix->pba_bar].mr,
1818                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
1819                     &err);
1820     if (ret < 0) {
1821         if (ret == -ENOTSUP) {
1822             warn_report_err(err);
1823             return true;
1824         }
1825 
1826         error_propagate(errp, err);
1827         return false;
1828     }
1829 
1830     /*
1831      * The PCI spec suggests that devices provide additional alignment for
1832      * MSI-X structures and avoid overlapping non-MSI-X related registers.
1833      * For an assigned device, this hopefully means that emulation of MSI-X
1834      * structures does not affect the performance of the device.  If devices
1835      * fail to provide that alignment, a significant performance penalty may
1836      * result, for instance Mellanox MT27500 VFs:
1837      * http://www.spinics.net/lists/kvm/msg125881.html
1838      *
1839      * The PBA is simply not that important for such a serious regression and
1840      * most drivers do not appear to look at it.  The solution for this is to
1841      * disable the PBA MemoryRegion unless it's being used.  We disable it
1842      * here and only enable it if a masked vector fires through QEMU.  As the
1843      * vector-use notifier is called, which occurs on unmask, we test whether
1844      * PBA emulation is needed and again disable if not.
1845      */
1846     memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
1847 
1848     /*
1849      * The emulated machine may provide a paravirt interface for MSIX setup
1850      * so it is not strictly necessary to emulate MSIX here. This becomes
1851      * helpful when frequently accessed MMIO registers are located in
1852      * subpages adjacent to the MSIX table but the MSIX data containing page
1853      * cannot be mapped because of a host page size bigger than the MSIX table
1854      * alignment.
1855      */
1856     if (object_property_get_bool(OBJECT(qdev_get_machine()),
1857                                  "vfio-no-msix-emulation", NULL)) {
1858         memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
1859     }
1860 
1861     return true;
1862 }
1863 
1864 void vfio_pci_teardown_msi(VFIOPCIDevice *vdev)
1865 {
1866     msi_uninit(&vdev->pdev);
1867 
1868     if (vdev->msix) {
1869         msix_uninit(&vdev->pdev,
1870                     vdev->bars[vdev->msix->table_bar].mr,
1871                     vdev->bars[vdev->msix->pba_bar].mr);
1872         g_free(vdev->msix->pending);
1873     }
1874 }
1875 
1876 /*
1877  * Resource setup
1878  */
1879 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
1880 {
1881     int i;
1882 
1883     for (i = 0; i < PCI_ROM_SLOT; i++) {
1884         vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
1885     }
1886 }
1887 
1888 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
1889 {
1890     VFIOBAR *bar = &vdev->bars[nr];
1891 
1892     uint32_t pci_bar;
1893     int ret;
1894 
1895     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
1896     if (!bar->region.size) {
1897         return;
1898     }
1899 
1900     /* Determine what type of BAR this is for registration */
1901     ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr),
1902                                      sizeof(pci_bar), &pci_bar);
1903     if (ret != sizeof(pci_bar)) {
1904         error_report("vfio: Failed to read BAR %d: %s", nr, strreaderror(ret));
1905         return;
1906     }
1907 
1908     pci_bar = le32_to_cpu(pci_bar);
1909     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
1910     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
1911     bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
1912                                          ~PCI_BASE_ADDRESS_MEM_MASK);
1913     bar->size = bar->region.size;
1914 
1915     /* IO regions are sync, memory can be async */
1916     bar->region.post_wr = (bar->ioport == 0);
1917 }
1918 
1919 static void vfio_bars_prepare(VFIOPCIDevice *vdev)
1920 {
1921     int i;
1922 
1923     for (i = 0; i < PCI_ROM_SLOT; i++) {
1924         vfio_bar_prepare(vdev, i);
1925     }
1926 }
1927 
1928 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
1929 {
1930     VFIOBAR *bar = &vdev->bars[nr];
1931     char *name;
1932 
1933     if (!bar->size) {
1934         return;
1935     }
1936 
1937     bar->mr = g_new0(MemoryRegion, 1);
1938     name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
1939     memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
1940     g_free(name);
1941 
1942     if (bar->region.size) {
1943         memory_region_add_subregion(bar->mr, 0, bar->region.mem);
1944 
1945         if (vfio_region_mmap(&bar->region)) {
1946             error_report("Failed to mmap %s BAR %d. Performance may be slow",
1947                          vdev->vbasedev.name, nr);
1948         }
1949     }
1950 
1951     pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
1952 }
1953 
1954 static void vfio_bars_register(VFIOPCIDevice *vdev)
1955 {
1956     int i;
1957 
1958     for (i = 0; i < PCI_ROM_SLOT; i++) {
1959         vfio_bar_register(vdev, i);
1960     }
1961 }
1962 
1963 void vfio_pci_bars_exit(VFIOPCIDevice *vdev)
1964 {
1965     int i;
1966 
1967     for (i = 0; i < PCI_ROM_SLOT; i++) {
1968         VFIOBAR *bar = &vdev->bars[i];
1969 
1970         vfio_bar_quirk_exit(vdev, i);
1971         vfio_region_exit(&bar->region);
1972         if (bar->region.size) {
1973             memory_region_del_subregion(bar->mr, bar->region.mem);
1974         }
1975     }
1976 
1977     if (vdev->vga) {
1978         pci_unregister_vga(&vdev->pdev);
1979         vfio_vga_quirk_exit(vdev);
1980     }
1981 }
1982 
1983 static void vfio_bars_finalize(VFIOPCIDevice *vdev)
1984 {
1985     int i;
1986 
1987     for (i = 0; i < PCI_ROM_SLOT; i++) {
1988         VFIOBAR *bar = &vdev->bars[i];
1989 
1990         vfio_bar_quirk_finalize(vdev, i);
1991         vfio_region_finalize(&bar->region);
1992         if (bar->mr) {
1993             assert(bar->size);
1994             object_unparent(OBJECT(bar->mr));
1995             g_free(bar->mr);
1996             bar->mr = NULL;
1997         }
1998     }
1999 
2000     if (vdev->vga) {
2001         vfio_vga_quirk_finalize(vdev);
2002         for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
2003             object_unparent(OBJECT(&vdev->vga->region[i].mem));
2004         }
2005         g_free(vdev->vga);
2006     }
2007 }
2008 
2009 /*
2010  * General setup
2011  */
2012 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2013 {
2014     uint8_t tmp;
2015     uint16_t next = PCI_CONFIG_SPACE_SIZE;
2016 
2017     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2018          tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
2019         if (tmp > pos && tmp < next) {
2020             next = tmp;
2021         }
2022     }
2023 
2024     return next - pos;
2025 }
2026 
2027 
2028 static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
2029 {
2030     uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
2031 
2032     for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
2033         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
2034         if (tmp > pos && tmp < next) {
2035             next = tmp;
2036         }
2037     }
2038 
2039     return next - pos;
2040 }
2041 
/*
 * Update the 16-bit value at @buf: clear the bits selected by @mask, then
 * OR in @val.  Note that @val is not itself restricted to @mask.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t kept = pci_get_word(buf) & ~mask;

    pci_set_word(buf, kept | val);
}
2046 
2047 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
2048                                    uint16_t val, uint16_t mask)
2049 {
2050     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2051     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2052     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2053 }
2054 
/*
 * Update the 32-bit value at @buf: clear the bits selected by @mask, then
 * OR in @val.  Note that @val is not itself restricted to @mask.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t kept = pci_get_long(buf) & ~mask;

    pci_set_long(buf, kept | val);
}
2059 
2060 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
2061                                    uint32_t val, uint32_t mask)
2062 {
2063     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2064     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2065     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2066 }
2067 
2068 static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev)
2069 {
2070     struct vfio_device_info_cap_pci_atomic_comp *cap;
2071     g_autofree struct vfio_device_info *info = NULL;
2072     PCIBus *bus = pci_get_bus(&vdev->pdev);
2073     PCIDevice *parent = bus->parent_dev;
2074     struct vfio_info_cap_header *hdr;
2075     uint32_t mask = 0;
2076     uint8_t *pos;
2077 
2078     /*
2079      * PCIe Atomic Ops completer support is only added automatically for single
2080      * function devices downstream of a root port supporting DEVCAP2.  Support
2081      * is added during realize and, if added, removed during device exit.  The
2082      * single function requirement avoids conflicting requirements should a
2083      * slot be composed of multiple devices with differing capabilities.
2084      */
2085     if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap ||
2086         pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT ||
2087         pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2 ||
2088         vdev->pdev.devfn ||
2089         vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
2090         return;
2091     }
2092 
2093     pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
2094 
2095     /* Abort if there'a already an Atomic Ops configuration on the root port */
2096     if (pci_get_long(pos) & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2097                              PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
2098                              PCI_EXP_DEVCAP2_ATOMIC_COMP128)) {
2099         return;
2100     }
2101 
2102     info = vfio_get_device_info(vdev->vbasedev.fd);
2103     if (!info) {
2104         return;
2105     }
2106 
2107     hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP);
2108     if (!hdr) {
2109         return;
2110     }
2111 
2112     cap = (void *)hdr;
2113     if (cap->flags & VFIO_PCI_ATOMIC_COMP32) {
2114         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP32;
2115     }
2116     if (cap->flags & VFIO_PCI_ATOMIC_COMP64) {
2117         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP64;
2118     }
2119     if (cap->flags & VFIO_PCI_ATOMIC_COMP128) {
2120         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP128;
2121     }
2122 
2123     if (!mask) {
2124         return;
2125     }
2126 
2127     pci_long_test_and_set_mask(pos, mask);
2128     vdev->clear_parent_atomics_on_exit = true;
2129 }
2130 
2131 static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev)
2132 {
2133     if (vdev->clear_parent_atomics_on_exit) {
2134         PCIDevice *parent = pci_get_bus(&vdev->pdev)->parent_dev;
2135         uint8_t *pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
2136 
2137         pci_long_test_and_clear_mask(pos, PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2138                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
2139                                           PCI_EXP_DEVCAP2_ATOMIC_COMP128);
2140     }
2141 }
2142 
/*
 * Add the PCIe capability found at @pos (of length @size) to the emulated
 * device, adjusting it to fit the bus the device is attached to in the VM.
 *
 * Returns false with @errp set when the device type is not an endpoint,
 * legacy endpoint or root complex integrated endpoint, or when
 * pci_add_capability() fails.  Returns true otherwise; note that in two
 * cases (non-express bus below an express root bus, legacy endpoint on an
 * express root bus) true is returned without the capability being added.
 */
static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
                                Error **errp)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {

        error_setg(errp, "assignment of PCIe type 0x%x "
                   "devices is not currently supported", type);
        return false;
    }

    if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
        PCIBus *bus = pci_get_bus(&vdev->pdev);
        PCIDevice *bridge;

        /*
         * Traditionally PCI device assignment exposes the PCIe capability
         * as-is on non-express buses.  The reason being that some drivers
         * simply assume that it's there, for example tg3.  However when
         * we're running on a native PCIe machine type, like Q35, we need
         * to hide the PCIe capability.  The reason for this is twofold;
         * first Windows guests get a Code 10 error when the PCIe capability
         * is exposed in this configuration.  Therefore express devices won't
         * work at all unless they're attached to express buses in the VM.
         * Second, a native PCIe machine introduces the possibility of fine
         * granularity IOMMUs supporting both translation and isolation.
         * Guest code to discover the IOMMU visibility of a device, such as
         * IOMMU grouping code on Linux, is very aware of device types and
         * valid transitions between bus types.  An express device on a non-
         * express bus is not a valid combination on bare metal systems.
         *
         * Drivers that require a PCIe capability to make the device
         * functional are simply going to need to have their devices placed
         * on a PCIe bus in the VM.
         */
        /* Walk up through the bridges until the root bus is reached */
        while (!pci_bus_is_root(bus)) {
            bridge = pci_bridge_get_device(bus);
            bus = pci_get_bus(bridge);
        }

        /* Express root bus: hide the capability by not adding it */
        if (pci_bus_is_express(bus)) {
            return true;
        }

    } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control goes away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

/* Fallback offsets for headers that do not define these registers */
#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control goes away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return true;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                           QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
                           QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }

        /* Express bus that is not the root bus: try root port atomics */
        vfio_pci_enable_rp_atomics(vdev);
    }

    /*
     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
     * (Niantic erratum #35) causing Windows to error with a Code 10 for the
     * device on Q35.  Fixup any such devices to report version 1.  If we
     * were to remove the capability entirely the guest would lose extended
     * config space.
     */
    if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
        vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                               1, PCI_EXP_FLAGS_VERS);
    }

    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
                             errp);
    if (pos < 0) {
        return false;
    }

    /* Remember where the PCIe capability lives for the PCI core */
    vdev->pdev.exp.exp_cap = pos;

    return true;
}
2272 
2273 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
2274 {
2275     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2276 
2277     if (cap & PCI_EXP_DEVCAP_FLR) {
2278         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
2279         vdev->has_flr = true;
2280     }
2281 }
2282 
2283 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
2284 {
2285     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2286 
2287     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2288         trace_vfio_check_pm_reset(vdev->vbasedev.name);
2289         vdev->has_pm_reset = true;
2290     }
2291 }
2292 
2293 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
2294 {
2295     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2296 
2297     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2298         trace_vfio_check_af_flr(vdev->vbasedev.name);
2299         vdev->has_flr = true;
2300     }
2301 }
2302 
2303 static bool vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos,
2304                                          uint8_t size, Error **errp)
2305 {
2306     PCIDevice *pdev = &vdev->pdev;
2307 
2308     pos = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, size, errp);
2309     if (pos < 0) {
2310         return false;
2311     }
2312 
2313     /*
2314      * Exempt config space check for Vendor Specific Information during
2315      * restore/load.
2316      * Config space check is still enforced for 3 byte VSC header.
2317      */
2318     if (vdev->skip_vsc_check && size > 3) {
2319         memset(pdev->cmask + pos + 3, 0, size - 3);
2320     }
2321 
2322     return true;
2323 }
2324 
/*
 * Add the standard capability at @pos, after first recursing into the
 * capability it points to, so that the chain is rebuilt from its tail.
 *
 * At the tail of the chain the emulated capability list is reset and
 * vfio_add_virt_caps() is given the chance to add capabilities.  Returns
 * false with @errp set on failure; when adding the capability at @pos
 * itself fails, the error is prefixed with its ID, size and position.
 */
static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
{
    ERRP_GUARD();
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    bool ret;

    /* Sample ID and next pointer before config space is rewritten below */
    cap_id = pdev->config[pos];
    next = pdev->config[pos + PCI_CAP_LIST_NEXT];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't recognize.
     * Since QEMU doesn't actually handle many of the config accesses,
     * exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above as cached config space
     * will be changed as we unwind the stack.
     */
    if (next) {
        if (!vfio_add_std_cap(vdev, next, errp)) {
            return false;
        }
    } else {
        /* Begin the rebuild, use QEMU emulated list bits */
        pdev->config[PCI_CAPABILITY_LIST] = 0;
        vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
        vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;

        if (!vfio_add_virt_caps(vdev, errp)) {
            return false;
        }
    }

    /* Scale down size, esp in case virt caps were added above */
    size = MIN(size, vfio_std_cap_max_size(pdev, pos));

    /* Use emulated next pointer to allow dropping caps */
    pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);

    /* Dispatch on the capability ID; unknown IDs are added unchanged */
    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_msi_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_EXP:
        vfio_check_pcie_flr(vdev, pos);
        ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_msix_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_PM:
        vfio_check_pm_reset(vdev, pos);
        ret = pci_pm_init(pdev, pos, errp) >= 0;
        /*
         * PCI-core config space emulation needs write access to the power
         * state enabled for tracking BAR mapping relative to PM state.
         */
        pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
        break;
    case PCI_CAP_ID_AF:
        vfio_check_af_flr(vdev, pos);
        ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
        break;
    case PCI_CAP_ID_VNDR:
        ret = vfio_add_vendor_specific_cap(vdev, pos, size, errp);
        break;
    default:
        ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
        break;
    }

    if (!ret) {
        error_prepend(errp,
                      "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
                      cap_id, size, pos);
    }

    return ret;
}
2411 
2412 static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
2413 {
2414     uint32_t ctrl;
2415     int i, nbar;
2416 
2417     ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL);
2418     nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT;
2419 
2420     for (i = 0; i < nbar; i++) {
2421         uint32_t cap;
2422         int size;
2423 
2424         ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8));
2425         size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT;
2426 
2427         /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */
2428         cap = size <= 27 ? 1U << (size + 4) : 0;
2429 
2430         /*
2431          * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one
2432          * size in the range 1MB to 512GB.  We intend to mask all sizes except
2433          * the one currently enabled in the size field, therefore if it's
2434          * outside the range, hide the whole capability as this virtualization
2435          * trick won't work.  If >512GB resizable BARs start to appear, we
2436          * might need an opt-in or reservation scheme in the kernel.
2437          */
2438         if (!(cap & PCI_REBAR_CAP_SIZES)) {
2439             return -EINVAL;
2440         }
2441 
2442         /* Hide all sizes reported in the ctrl reg per above requirement. */
2443         ctrl &= (PCI_REBAR_CTRL_BAR_SIZE |
2444                  PCI_REBAR_CTRL_NBAR_MASK |
2445                  PCI_REBAR_CTRL_BAR_IDX);
2446 
2447         /*
2448          * The BAR size field is RW, however we've mangled the capability
2449          * register such that we only report a single size, ie. the current
2450          * BAR size.  A write of an unsupported value is undefined, therefore
2451          * the register field is essentially RO.
2452          */
2453         vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0);
2454         vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0);
2455     }
2456 
2457     return 0;
2458 }
2459 
/*
 * Rebuild the extended capability chain of the emulated device from a
 * snapshot of its config space, dropping the capabilities that must not be
 * exposed to the guest.  Does nothing unless both the device and its bus
 * are express and the first extended capability dword is non-zero.
 */
static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint32_t header;
    uint16_t cap_id, next, size;
    uint8_t cap_ver;
    uint8_t *config;

    /* Only add extended caps if we have them and the guest can see them */
    if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
        return;
    }

    /*
     * pcie_add_capability always inserts the new capability at the tail
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we cache the config space to avoid overwriting
     * the original config space when we parse the extended capabilities.
     */
    config = g_memdup(pdev->config, vdev->config_size);

    /*
     * Extended capabilities are chained with each pointing to the next, so we
     * can drop anything other than the head of the chain simply by modifying
     * the previous next pointer.  Seed the head of the chain here such that
     * we can simply skip any capabilities we want to drop below, regardless
     * of their position in the chain.  If this stub capability still exists
     * after we add the capabilities we want to expose, update the capability
     * ID to zero.  Note that we cannot seed with the capability header being
     * zero as this conflicts with definition of an absent capability chain
     * and prevents capabilities beyond the head of the list from being added.
     * By replacing the dummy capability ID with zero after walking the device
     * chain, we also transparently mark extended capabilities as absent if
     * no capabilities were added.  Note that the PCIe spec defines an absence
     * of extended capabilities to be determined by a value of zero for the
     * capability ID, version, AND next pointer.  A non-zero next pointer
     * should be sufficient to indicate additional capabilities are present,
     * which will occur if we call pcie_add_capability() below.  The entire
     * first dword is emulated to support this.
     *
     * NB. The kernel side does similar masking, so be prepared that our
     * view of the device may also contain a capability ID zero in the head
     * of the chain.  Skip it for the same reason that we cannot seed the
     * chain with a zero capability.
     */
    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
                 PCI_EXT_CAP(0xFFFF, 0, 0));
    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);

    /* Walk the chain in the snapshot, not in the live config space */
    for (next = PCI_CONFIG_SPACE_SIZE; next;
         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
        header = pci_get_long(config + next);
        cap_id = PCI_EXT_CAP_ID(header);
        cap_ver = PCI_EXT_CAP_VER(header);

        /*
         * If it becomes important to configure extended capabilities to their
         * actual size, use this as the default when it's something we don't
         * recognize. Since QEMU doesn't actually handle many of the config
         * accesses, exact size doesn't seem worthwhile.
         */
        size = vfio_ext_cap_max_size(config, next);

        /* Use emulated next pointer to allow dropping extended caps */
        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
                                   PCI_EXT_CAP_NEXT_MASK);

        switch (cap_id) {
        case 0: /* kernel masked capability */
        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
        case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
            break;
        case PCI_EXT_CAP_ID_REBAR:
            /* Only expose REBAR when it could be virtualized (returns 0) */
            if (!vfio_setup_rebar_ecap(vdev, next)) {
                pcie_add_capability(pdev, cap_id, cap_ver, next, size);
            }
            break;
        default:
            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
        }

    }

    /* Cleanup chain head ID if necessary */
    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
    }

    g_free(config);
}
2553 
2554 bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
2555 {
2556     PCIDevice *pdev = &vdev->pdev;
2557 
2558     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2559         !pdev->config[PCI_CAPABILITY_LIST]) {
2560         return true; /* Nothing to add */
2561     }
2562 
2563     if (!vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp)) {
2564         return false;
2565     }
2566 
2567     vfio_add_ext_cap(vdev);
2568     return true;
2569 }
2570 
2571 void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2572 {
2573     PCIDevice *pdev = &vdev->pdev;
2574     uint16_t cmd;
2575 
2576     vfio_disable_interrupts(vdev);
2577 
2578     /*
2579      * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2580      * Also put INTx Disable in known state.
2581      */
2582     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2583     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2584              PCI_COMMAND_INTX_DISABLE);
2585     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2586 
2587     /* Make sure the device is in D0 */
2588     if (pdev->pm_cap) {
2589         uint16_t pmcsr;
2590         uint8_t state;
2591 
2592         pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
2593         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2594         if (state) {
2595             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2596             vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2597             /* vfio handles the necessary delay here */
2598             pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
2599             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2600             if (state) {
2601                 error_report("vfio: Unable to power on device, stuck in D%d",
2602                              state);
2603             }
2604         }
2605     }
2606 }
2607 
2608 void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2609 {
2610     VFIODevice *vbasedev = &vdev->vbasedev;
2611     Error *err = NULL;
2612     int ret, nr;
2613 
2614     if (!vfio_intx_enable(vdev, &err)) {
2615         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2616     }
2617 
2618     for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
2619         off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
2620         uint32_t val = 0;
2621         uint32_t len = sizeof(val);
2622 
2623         ret = vfio_pci_config_space_write(vdev, addr, len, &val);
2624         if (ret != len) {
2625             error_report("%s(%s) reset bar %d failed: %s", __func__,
2626                          vbasedev->name, nr, strwriteerror(ret));
2627         }
2628     }
2629 
2630     vfio_quirk_reset(vdev);
2631 }
2632 
2633 bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
2634 {
2635     char tmp[13];
2636 
2637     sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
2638             addr->bus, addr->slot, addr->function);
2639 
2640     return (strcmp(tmp, name) == 0);
2641 }
2642 
2643 int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
2644                                     struct vfio_pci_hot_reset_info **info_p)
2645 {
2646     struct vfio_pci_hot_reset_info *info;
2647     int ret, count;
2648 
2649     assert(info_p && !*info_p);
2650 
2651     info = g_malloc0(sizeof(*info));
2652     info->argsz = sizeof(*info);
2653 
2654     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2655     if (ret && errno != ENOSPC) {
2656         ret = -errno;
2657         g_free(info);
2658         if (!vdev->has_pm_reset) {
2659             error_report("vfio: Cannot reset device %s, "
2660                          "no available reset mechanism.", vdev->vbasedev.name);
2661         }
2662         return ret;
2663     }
2664 
2665     count = info->count;
2666     info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0])));
2667     info->argsz = sizeof(*info) + (count * sizeof(info->devices[0]));
2668 
2669     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2670     if (ret) {
2671         ret = -errno;
2672         g_free(info);
2673         error_report("vfio: hot reset info failed: %m");
2674         return ret;
2675     }
2676 
2677     *info_p = info;
2678     return 0;
2679 }
2680 
2681 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2682 {
2683     VFIODevice *vbasedev = &vdev->vbasedev;
2684     const VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer);
2685 
2686     return vioc->pci_hot_reset(vbasedev, single);
2687 }
2688 
2689 /*
2690  * We want to differentiate hot reset of multiple in-use devices vs hot reset
2691  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2692  * of doing hot resets when there is only a single device per bus.  The in-use
2693  * here refers to how many VFIODevices are affected.  A hot reset that affects
2694  * multiple devices, but only a single in-use device, means that we can call
2695  * it from our bus ->reset() callback since the extent is effectively a single
2696  * device.  This allows us to make use of it in the hotplug path.  When there
2697  * are multiple in-use devices, we can only trigger the hot reset during a
2698  * system reset and thus from our reset handler.  We separate _one vs _multi
2699  * here so that we don't overlap and do a double reset on the system reset
2700  * path where both our reset handler and ->reset() callback are used.  Calling
2701  * _one() will only do a hot reset for the one in-use devices case, calling
2702  * _multi() will do nothing if a _one() would have been sufficient.
2703  */
2704 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2705 {
2706     return vfio_pci_hot_reset(vdev, true);
2707 }
2708 
2709 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2710 {
2711     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2712     return vfio_pci_hot_reset(vdev, false);
2713 }
2714 
2715 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2716 {
2717     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2718     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2719         vbasedev->needs_reset = true;
2720     }
2721 }
2722 
2723 static Object *vfio_pci_get_object(VFIODevice *vbasedev)
2724 {
2725     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2726 
2727     return OBJECT(vdev);
2728 }
2729 
/*
 * Field test used by VMSTATE_MSIX_TEST: @opaque is treated as the PCIDevice
 * and the result of msix_present() is returned.  @version_id is not used.
 */
static bool vfio_msix_present(void *opaque, int version_id)
{
    return msix_present(opaque);
}
2736 
2737 static bool vfio_display_migration_needed(void *opaque)
2738 {
2739     VFIOPCIDevice *vdev = opaque;
2740 
2741     /*
2742      * We need to migrate the VFIODisplay object if ramfb *migration* was
2743      * explicitly requested (in which case we enforced both ramfb=on and
2744      * display=on), or ramfb migration was left at the default "auto"
2745      * setting, and *ramfb* was explicitly requested (in which case we
2746      * enforced display=on).
2747      */
2748     return vdev->ramfb_migrate == ON_OFF_AUTO_ON ||
2749         (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb);
2750 }
2751 
2752 static const VMStateDescription vmstate_vfio_display = {
2753     .name = "VFIOPCIDevice/VFIODisplay",
2754     .version_id = 1,
2755     .minimum_version_id = 1,
2756     .needed = vfio_display_migration_needed,
2757     .fields = (const VMStateField[]){
2758         VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate,
2759                                VFIODisplay),
2760         VMSTATE_END_OF_LIST()
2761     }
2762 };
2763 
2764 static const VMStateDescription vmstate_vfio_pci_config = {
2765     .name = "VFIOPCIDevice",
2766     .version_id = 1,
2767     .minimum_version_id = 1,
2768     .fields = (const VMStateField[]) {
2769         VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
2770         VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
2771         VMSTATE_END_OF_LIST()
2772     },
2773     .subsections = (const VMStateDescription * const []) {
2774         &vmstate_vfio_display,
2775         NULL
2776     }
2777 };
2778 
2779 static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp)
2780 {
2781     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2782 
2783     return vmstate_save_state_with_err(f, &vmstate_vfio_pci_config, vdev, NULL,
2784                                        errp);
2785 }
2786 
2787 static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
2788 {
2789     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2790     PCIDevice *pdev = &vdev->pdev;
2791     pcibus_t old_addr[PCI_NUM_REGIONS - 1];
2792     int bar, ret;
2793 
2794     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2795         old_addr[bar] = pdev->io_regions[bar].addr;
2796     }
2797 
2798     ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
2799     if (ret) {
2800         return ret;
2801     }
2802 
2803     vfio_pci_write_config(pdev, PCI_COMMAND,
2804                           pci_get_word(pdev->config + PCI_COMMAND), 2);
2805 
2806     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2807         /*
2808          * The address may not be changed in some scenarios
2809          * (e.g. the VF driver isn't loaded in VM).
2810          */
2811         if (old_addr[bar] != pdev->io_regions[bar].addr &&
2812             vdev->bars[bar].region.size > 0 &&
2813             vdev->bars[bar].region.size < qemu_real_host_page_size()) {
2814             vfio_sub_page_bar_update_mapping(pdev, bar);
2815         }
2816     }
2817 
2818     if (msi_enabled(pdev)) {
2819         vfio_msi_enable(vdev);
2820     } else if (msix_enabled(pdev)) {
2821         vfio_msix_enable(vdev);
2822     }
2823 
2824     return ret;
2825 }
2826 
2827 void vfio_sub_page_bar_update_mappings(VFIOPCIDevice *vdev)
2828 {
2829     PCIDevice *pdev = &vdev->pdev;
2830     int page_size = qemu_real_host_page_size();
2831     int bar;
2832 
2833     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2834         PCIIORegion *r = &pdev->io_regions[bar];
2835         if (r->addr != PCI_BAR_UNMAPPED && r->size > 0 && r->size < page_size) {
2836             vfio_sub_page_bar_update_mapping(pdev, bar);
2837         }
2838     }
2839 }
2840 
2841 static VFIODeviceOps vfio_pci_ops = {
2842     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
2843     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
2844     .vfio_eoi = vfio_pci_intx_eoi,
2845     .vfio_get_object = vfio_pci_get_object,
2846     .vfio_save_config = vfio_pci_save_config,
2847     .vfio_load_config = vfio_pci_load_config,
2848 };
2849 
2850 bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
2851 {
2852     VFIODevice *vbasedev = &vdev->vbasedev;
2853     struct vfio_region_info *reg_info = NULL;
2854     int ret;
2855 
2856     ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
2857     if (ret) {
2858         error_setg_errno(errp, -ret,
2859                          "failed getting region info for VGA region index %d",
2860                          VFIO_PCI_VGA_REGION_INDEX);
2861         return false;
2862     }
2863 
2864     if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
2865         !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2866         reg_info->size < 0xbffff + 1) {
2867         error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
2868                    (unsigned long)reg_info->flags,
2869                    (unsigned long)reg_info->size);
2870         return false;
2871     }
2872 
2873     vdev->vga = g_new0(VFIOVGA, 1);
2874 
2875     vdev->vga->fd_offset = reg_info->offset;
2876     vdev->vga->fd = vdev->vbasedev.fd;
2877 
2878     vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2879     vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2880     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
2881 
2882     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2883                           OBJECT(vdev), &vfio_vga_ops,
2884                           &vdev->vga->region[QEMU_PCI_VGA_MEM],
2885                           "vfio-vga-mmio@0xa0000",
2886                           QEMU_PCI_VGA_MEM_SIZE);
2887 
2888     vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2889     vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2890     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
2891 
2892     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2893                           OBJECT(vdev), &vfio_vga_ops,
2894                           &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
2895                           "vfio-vga-io@0x3b0",
2896                           QEMU_PCI_VGA_IO_LO_SIZE);
2897 
2898     vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2899     vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2900     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
2901 
2902     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
2903                           OBJECT(vdev), &vfio_vga_ops,
2904                           &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
2905                           "vfio-vga-io@0x3c0",
2906                           QEMU_PCI_VGA_IO_HI_SIZE);
2907 
2908     return true;
2909 }
2910 
2911 bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
2912 {
2913     VFIODevice *vbasedev = &vdev->vbasedev;
2914     struct vfio_region_info *reg_info = NULL;
2915     struct vfio_irq_info irq_info;
2916     int i, ret = -1;
2917 
2918     /* Sanity check device */
2919     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2920         error_setg(errp, "this isn't a PCI device");
2921         return false;
2922     }
2923 
2924     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2925         error_setg(errp, "unexpected number of io regions %u",
2926                    vbasedev->num_regions);
2927         return false;
2928     }
2929 
2930     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2931         error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
2932         return false;
2933     }
2934 
2935     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2936         char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
2937 
2938         ret = vfio_region_setup(OBJECT(vdev), vbasedev,
2939                                 &vdev->bars[i].region, i, name);
2940         g_free(name);
2941 
2942         if (ret) {
2943             error_setg_errno(errp, -ret, "failed to get region %d info", i);
2944             return false;
2945         }
2946 
2947         QLIST_INIT(&vdev->bars[i].quirks);
2948     }
2949 
2950     ret = vfio_device_get_region_info(vbasedev,
2951                                       VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
2952     if (ret) {
2953         error_setg_errno(errp, -ret, "failed to get config info");
2954         return false;
2955     }
2956 
2957     trace_vfio_pci_populate_device_config(vdev->vbasedev.name,
2958                                       (unsigned long)reg_info->size,
2959                                       (unsigned long)reg_info->offset,
2960                                       (unsigned long)reg_info->flags);
2961 
2962     vdev->config_size = reg_info->size;
2963     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2964         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2965     }
2966     vdev->config_offset = reg_info->offset;
2967 
2968     if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
2969         if (!vfio_populate_vga(vdev, errp)) {
2970             error_append_hint(errp, "device does not support "
2971                               "requested feature x-vga\n");
2972             return false;
2973         }
2974     }
2975 
2976     ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
2977     if (ret) {
2978         /* This can fail for an old kernel or legacy PCI dev */
2979         trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret));
2980     } else if (irq_info.count == 1) {
2981         vdev->pci_aer = true;
2982     } else {
2983         warn_report(VFIO_MSG_PREFIX
2984                     "Could not enable error recovery for the device",
2985                     vbasedev->name);
2986     }
2987 
2988     return true;
2989 }
2990 
2991 void vfio_pci_put_device(VFIOPCIDevice *vdev)
2992 {
2993     vfio_display_finalize(vdev);
2994     vfio_bars_finalize(vdev);
2995     g_free(vdev->emulated_config_bits);
2996     g_free(vdev->rom);
2997     /*
2998      * XXX Leaking igd_opregion is not an oversight, we can't remove the
2999      * fw_cfg entry therefore leaking this allocation seems like the safest
3000      * option.
3001      *
3002      * g_free(vdev->igd_opregion);
3003      */
3004 
3005     vfio_device_detach(&vdev->vbasedev);
3006 
3007     vfio_device_free_name(&vdev->vbasedev);
3008     g_free(vdev->msix);
3009 }
3010 
3011 static void vfio_err_notifier_handler(void *opaque)
3012 {
3013     VFIOPCIDevice *vdev = opaque;
3014 
3015     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
3016         return;
3017     }
3018 
3019     /*
3020      * TBD. Retrieve the error details and decide what action
3021      * needs to be taken. One of the actions could be to pass
3022      * the error to the guest and have the guest driver recover
3023      * from the error. This requires that PCIe capabilities be
3024      * exposed to the guest. For now, we just terminate the
3025      * guest to contain the error.
3026      */
3027 
3028     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
3029 
3030     vm_stop(RUN_STATE_INTERNAL_ERROR);
3031 }
3032 
3033 /*
3034  * Registers error notifier for devices supporting error recovery.
3035  * If we encounter a failure in this function, we report an error
3036  * and continue after disabling error recovery support for the
3037  * device.
3038  */
3039 void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev)
3040 {
3041     Error *err = NULL;
3042     int32_t fd;
3043 
3044     if (!vdev->pci_aer) {
3045         return;
3046     }
3047 
3048     if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0,
3049                             &err)) {
3050         error_report_err(err);
3051         vdev->pci_aer = false;
3052         return;
3053     }
3054 
3055     fd = event_notifier_get_fd(&vdev->err_notifier);
3056     qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
3057 
3058     /* Do not alter irq_signaling during vfio_realize for cpr */
3059     if (cpr_is_incoming()) {
3060         return;
3061     }
3062 
3063     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
3064                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
3065         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3066         qemu_set_fd_handler(fd, NULL, NULL, vdev);
3067         vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
3068         vdev->pci_aer = false;
3069     }
3070 }
3071 
3072 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
3073 {
3074     Error *err = NULL;
3075 
3076     if (!vdev->pci_aer) {
3077         return;
3078     }
3079 
3080     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
3081                                        VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
3082         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3083     }
3084     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
3085                         NULL, NULL, vdev);
3086     vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
3087 }
3088 
3089 static void vfio_req_notifier_handler(void *opaque)
3090 {
3091     VFIOPCIDevice *vdev = opaque;
3092     Error *err = NULL;
3093 
3094     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
3095         return;
3096     }
3097 
3098     qdev_unplug(DEVICE(vdev), &err);
3099     if (err) {
3100         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3101     }
3102 }
3103 
3104 void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev)
3105 {
3106     struct vfio_irq_info irq_info;
3107     Error *err = NULL;
3108     int32_t fd;
3109     int ret;
3110 
3111     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
3112         return;
3113     }
3114 
3115     ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX,
3116                                    &irq_info);
3117     if (ret < 0 || irq_info.count < 1) {
3118         return;
3119     }
3120 
3121     if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0,
3122                             &err)) {
3123         error_report_err(err);
3124         return;
3125     }
3126 
3127     fd = event_notifier_get_fd(&vdev->req_notifier);
3128     qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
3129 
3130     /* Do not alter irq_signaling during vfio_realize for cpr */
3131     if (cpr_is_incoming()) {
3132         vdev->req_enabled = true;
3133         return;
3134     }
3135 
3136     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
3137                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
3138         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3139         qemu_set_fd_handler(fd, NULL, NULL, vdev);
3140         vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
3141     } else {
3142         vdev->req_enabled = true;
3143     }
3144 }
3145 
3146 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
3147 {
3148     Error *err = NULL;
3149 
3150     if (!vdev->req_enabled) {
3151         return;
3152     }
3153 
3154     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
3155                                        VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
3156         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3157     }
3158     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
3159                         NULL, NULL, vdev);
3160     vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
3161 
3162     vdev->req_enabled = false;
3163 }
3164 
3165 void vfio_pci_config_register_vga(VFIOPCIDevice *vdev)
3166 {
3167     assert(vdev->vga != NULL);
3168 
3169     pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
3170                      &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
3171                      &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
3172 }
3173 
3174 bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
3175 {
3176     PCIDevice *pdev = &vdev->pdev;
3177     VFIODevice *vbasedev = &vdev->vbasedev;
3178     uint32_t config_space_size;
3179     int ret;
3180 
3181     config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);
3182 
3183     /* Get a copy of config space */
3184     ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
3185                                      vdev->pdev.config);
3186     if (ret < (int)config_space_size) {
3187         ret = ret < 0 ? -ret : EFAULT;
3188         error_setg_errno(errp, ret, "failed to read device config space");
3189         return false;
3190     }
3191 
3192     /* vfio emulates a lot for us, but some bits need extra love */
3193     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3194 
3195     /* QEMU can choose to expose the ROM or not */
3196     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3197     /* QEMU can also add or extend BARs */
3198     memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
3199 
3200     /*
3201      * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
3202      * device ID is managed by the vendor and need only be a 16-bit value.
3203      * Allow any 16-bit value for subsystem so they can be hidden or changed.
3204      */
3205     if (vdev->vendor_id != PCI_ANY_ID) {
3206         if (vdev->vendor_id >= 0xffff) {
3207             error_setg(errp, "invalid PCI vendor ID provided");
3208             return false;
3209         }
3210         vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
3211         trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
3212     } else {
3213         vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
3214     }
3215 
3216     if (vdev->device_id != PCI_ANY_ID) {
3217         if (vdev->device_id > 0xffff) {
3218             error_setg(errp, "invalid PCI device ID provided");
3219             return false;
3220         }
3221         vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
3222         trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
3223     } else {
3224         vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
3225     }
3226 
3227     if (vdev->sub_vendor_id != PCI_ANY_ID) {
3228         if (vdev->sub_vendor_id > 0xffff) {
3229             error_setg(errp, "invalid PCI subsystem vendor ID provided");
3230             return false;
3231         }
3232         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
3233                                vdev->sub_vendor_id, ~0);
3234         trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
3235                                               vdev->sub_vendor_id);
3236     }
3237 
3238     if (vdev->sub_device_id != PCI_ANY_ID) {
3239         if (vdev->sub_device_id > 0xffff) {
3240             error_setg(errp, "invalid PCI subsystem device ID provided");
3241             return false;
3242         }
3243         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
3244         trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
3245                                               vdev->sub_device_id);
3246     }
3247 
3248     /*
3249      * Class code is a 24-bit value at config space 0x09. Allow overriding it
3250      * with any 24-bit value.
3251      */
3252     if (vdev->class_code != PCI_ANY_ID) {
3253         if (vdev->class_code > 0xffffff) {
3254             error_setg(errp, "invalid PCI class code provided");
3255             return false;
3256         }
3257         /* Higher 24 bits of PCI_CLASS_REVISION are class code */
3258         vfio_add_emulated_long(vdev, PCI_CLASS_REVISION,
3259                                vdev->class_code << 8, ~0xff);
3260         trace_vfio_pci_emulated_class_code(vbasedev->name, vdev->class_code);
3261     } else {
3262         vdev->class_code = pci_get_long(pdev->config + PCI_CLASS_REVISION) >> 8;
3263     }
3264 
3265     /* QEMU can change multi-function devices to single function, or reverse */
3266     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3267                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
3268 
3269     /* Restore or clear multifunction, this is always controlled by QEMU */
3270     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3271         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3272     } else {
3273         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3274     }
3275 
3276     /*
3277      * Clear host resource mapping info.  If we choose not to register a
3278      * BAR, such as might be the case with the option ROM, we can get
3279      * confusing, unwritable, residual addresses from the host here.
3280      */
3281     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3282     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3283 
3284     vfio_pci_size_rom(vdev);
3285 
3286     vfio_bars_prepare(vdev);
3287 
3288     if (!vfio_msix_early_setup(vdev, errp)) {
3289         return false;
3290     }
3291 
3292     vfio_bars_register(vdev);
3293 
3294     if (vdev->vga && vfio_is_vga(vdev)) {
3295         vfio_pci_config_register_vga(vdev);
3296     }
3297 
3298     return true;
3299 }
3300 
3301 bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
3302 {
3303     PCIDevice *pdev = &vdev->pdev;
3304 
3305     /* QEMU emulates all of MSI & MSIX */
3306     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3307         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3308                MSIX_CAP_LENGTH);
3309     }
3310 
3311     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3312         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3313                vdev->msi_cap_size);
3314     }
3315 
3316     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3317         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3318                                              vfio_intx_mmap_enable, vdev);
3319         pci_device_set_intx_routing_notifier(&vdev->pdev,
3320                                              vfio_intx_routing_notifier);
3321         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
3322         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
3323 
3324         /*
3325          * During CPR, do not call vfio_intx_enable at this time.  Instead,
3326          * call it from vfio_pci_post_load after the intx routing data has
3327          * been loaded from vmstate.
3328          */
3329         if (!cpr_is_incoming() && !vfio_intx_enable(vdev, errp)) {
3330             timer_free(vdev->intx.mmap_timer);
3331             pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3332             kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3333             return false;
3334         }
3335     }
3336     return true;
3337 }
3338 
3339 static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
3340 {
3341     ERRP_GUARD();
3342     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
3343     VFIODevice *vbasedev = &vdev->vbasedev;
3344     int i;
3345     char uuid[UUID_STR_LEN];
3346     g_autofree char *name = NULL;
3347 
3348     if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
3349         if (!(~vdev->host.domain || ~vdev->host.bus ||
3350               ~vdev->host.slot || ~vdev->host.function)) {
3351             error_setg(errp, "No provided host device");
3352             error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
3353 #ifdef CONFIG_IOMMUFD
3354                               "or -device vfio-pci,fd=DEVICE_FD "
3355 #endif
3356                               "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
3357             return;
3358         }
3359         vbasedev->sysfsdev =
3360             g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
3361                             vdev->host.domain, vdev->host.bus,
3362                             vdev->host.slot, vdev->host.function);
3363     }
3364 
3365     if (!vfio_device_get_name(vbasedev, errp)) {
3366         return;
3367     }
3368 
3369     /*
3370      * Mediated devices *might* operate compatibly with discarding of RAM, but
3371      * we cannot know for certain, it depends on whether the mdev vendor driver
3372      * stays in sync with the active working set of the guest driver.  Prevent
3373      * the x-balloon-allowed option unless this is minimally an mdev device.
3374      */
3375     vbasedev->mdev = vfio_device_is_mdev(vbasedev);
3376 
3377     trace_vfio_mdev(vbasedev->name, vbasedev->mdev);
3378 
3379     if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) {
3380         error_setg(errp, "x-balloon-allowed only potentially compatible "
3381                    "with mdev devices");
3382         goto error;
3383     }
3384 
3385     if (!qemu_uuid_is_null(&vdev->vf_token)) {
3386         qemu_uuid_unparse(&vdev->vf_token, uuid);
3387         name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
3388     } else {
3389         name = g_strdup(vbasedev->name);
3390     }
3391 
3392     if (!vfio_device_attach(name, vbasedev,
3393                             pci_device_iommu_address_space(pdev), errp)) {
3394         goto error;
3395     }
3396 
3397     if (!vfio_pci_populate_device(vdev, errp)) {
3398         goto error;
3399     }
3400 
3401     if (!vfio_pci_config_setup(vdev, errp)) {
3402         goto error;
3403     }
3404 
3405     if (!vbasedev->mdev &&
3406         !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
3407         error_prepend(errp, "Failed to set vIOMMU: ");
3408         goto out_teardown;
3409     }
3410 
3411     if (!vfio_pci_add_capabilities(vdev, errp)) {
3412         goto out_unset_idev;
3413     }
3414 
3415     if (!vfio_config_quirk_setup(vdev, errp)) {
3416         goto out_unset_idev;
3417     }
3418 
3419     if (vdev->vga) {
3420         vfio_vga_quirk_setup(vdev);
3421     }
3422 
3423     for (i = 0; i < PCI_ROM_SLOT; i++) {
3424         vfio_bar_quirk_setup(vdev, i);
3425     }
3426 
3427     if (!vfio_pci_interrupt_setup(vdev, errp)) {
3428         goto out_unset_idev;
3429     }
3430 
3431     if (vdev->display != ON_OFF_AUTO_OFF) {
3432         if (!vfio_display_probe(vdev, errp)) {
3433             goto out_deregister;
3434         }
3435     }
3436     if (vdev->enable_ramfb && vdev->dpy == NULL) {
3437         error_setg(errp, "ramfb=on requires display=on");
3438         goto out_deregister;
3439     }
3440     if (vdev->display_xres || vdev->display_yres) {
3441         if (vdev->dpy == NULL) {
3442             error_setg(errp, "xres and yres properties require display=on");
3443             goto out_deregister;
3444         }
3445         if (vdev->dpy->edid_regs == NULL) {
3446             error_setg(errp, "xres and yres properties need edid support");
3447             goto out_deregister;
3448         }
3449     }
3450 
3451     if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) {
3452         warn_report("x-ramfb-migrate=on but ramfb=off. "
3453                     "Forcing x-ramfb-migrate to off.");
3454         vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
3455     }
3456     if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
3457         if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) {
3458             vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
3459         } else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) {
3460             error_setg(errp, "x-ramfb-migrate requires enable-migration");
3461             goto out_deregister;
3462         }
3463     }
3464 
3465     if (!pdev->failover_pair_id) {
3466         if (!vfio_migration_realize(vbasedev, errp)) {
3467             goto out_deregister;
3468         }
3469     }
3470 
3471     vfio_pci_register_err_notifier(vdev);
3472     vfio_pci_register_req_notifier(vdev);
3473     vfio_setup_resetfn_quirk(vdev);
3474 
3475     return;
3476 
3477 out_deregister:
3478     if (vdev->interrupt == VFIO_INT_INTx) {
3479         vfio_intx_disable(vdev);
3480     }
3481     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3482     if (vdev->irqchip_change_notifier.notify) {
3483         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3484     }
3485     if (vdev->intx.mmap_timer) {
3486         timer_free(vdev->intx.mmap_timer);
3487     }
3488 out_unset_idev:
3489     if (!vbasedev->mdev) {
3490         pci_device_unset_iommu_device(pdev);
3491     }
3492 out_teardown:
3493     vfio_pci_teardown_msi(vdev);
3494     vfio_pci_bars_exit(vdev);
3495 error:
3496     error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
3497 }
3498 
3499 static void vfio_instance_finalize(Object *obj)
3500 {
3501     VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
3502 
3503     vfio_pci_put_device(vdev);
3504 }
3505 
3506 static void vfio_exitfn(PCIDevice *pdev)
3507 {
3508     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
3509     VFIODevice *vbasedev = &vdev->vbasedev;
3510 
3511     vfio_unregister_req_notifier(vdev);
3512     vfio_unregister_err_notifier(vdev);
3513     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3514     if (vdev->irqchip_change_notifier.notify) {
3515         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3516     }
3517     vfio_disable_interrupts(vdev);
3518     if (vdev->intx.mmap_timer) {
3519         timer_free(vdev->intx.mmap_timer);
3520     }
3521     vfio_pci_teardown_msi(vdev);
3522     vfio_pci_disable_rp_atomics(vdev);
3523     vfio_pci_bars_exit(vdev);
3524     vfio_migration_exit(vbasedev);
3525     if (!vbasedev->mdev) {
3526         pci_device_unset_iommu_device(pdev);
3527     }
3528 }
3529 
3530 static void vfio_pci_reset(DeviceState *dev)
3531 {
3532     VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
3533 
3534     /* Do not reset the device during qemu_system_reset prior to cpr load */
3535     if (cpr_is_incoming()) {
3536         return;
3537     }
3538 
3539     trace_vfio_pci_reset(vdev->vbasedev.name);
3540 
3541     vfio_pci_pre_reset(vdev);
3542 
3543     if (vdev->display != ON_OFF_AUTO_OFF) {
3544         vfio_display_reset(vdev);
3545     }
3546 
3547     if (vdev->resetfn && !vdev->resetfn(vdev)) {
3548         goto post_reset;
3549     }
3550 
3551     if (vdev->vbasedev.reset_works &&
3552         (vdev->has_flr || !vdev->has_pm_reset) &&
3553         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3554         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3555         goto post_reset;
3556     }
3557 
3558     /* See if we can do our own bus reset */
3559     if (!vfio_pci_hot_reset_one(vdev)) {
3560         goto post_reset;
3561     }
3562 
3563     /* If nothing else works and the device supports PM reset, use it */
3564     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3565         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3566         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3567         goto post_reset;
3568     }
3569 
3570 post_reset:
3571     vfio_pci_post_reset(vdev);
3572 }
3573 
3574 static void vfio_instance_init(Object *obj)
3575 {
3576     PCIDevice *pci_dev = PCI_DEVICE(obj);
3577     VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
3578     VFIODevice *vbasedev = &vdev->vbasedev;
3579 
3580     device_add_bootindex_property(obj, &vdev->bootindex,
3581                                   "bootindex", NULL,
3582                                   &pci_dev->qdev);
3583     vdev->host.domain = ~0U;
3584     vdev->host.bus = ~0U;
3585     vdev->host.slot = ~0U;
3586     vdev->host.function = ~0U;
3587 
3588     vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
3589                      DEVICE(vdev), false);
3590 
3591     vdev->nv_gpudirect_clique = 0xFF;
3592 
3593     /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
3594      * line, therefore, no need to wait to realize like other devices */
3595     pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
3596 
3597     /*
3598      * A device that is resuming for cpr is already configured, so do not
3599      * reset it during qemu_system_reset prior to cpr load, else interrupts
3600      * may be lost.
3601      */
3602     pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR;
3603 }
3604 
3605 static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data)
3606 {
3607     DeviceClass *dc = DEVICE_CLASS(klass);
3608     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3609 
3610     dc->desc = "VFIO PCI base device";
3611     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3612     pdc->exit = vfio_exitfn;
3613     pdc->config_read = vfio_pci_read_config;
3614     pdc->config_write = vfio_pci_write_config;
3615 }
3616 
3617 static const TypeInfo vfio_pci_base_dev_info = {
3618     .name = TYPE_VFIO_PCI_BASE,
3619     .parent = TYPE_PCI_DEVICE,
3620     .instance_size = sizeof(VFIOPCIDevice),
3621     .abstract = true,
3622     .class_init = vfio_pci_base_dev_class_init,
3623     .interfaces = (const InterfaceInfo[]) {
3624         { INTERFACE_PCIE_DEVICE },
3625         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
3626         { }
3627     },
3628 };
3629 
/*
 * PropertyInfo used by "x-migration-multifd-transfer".  It is filled in by
 * register_vfio_pci_dev_type() as a copy of qdev_prop_on_off_auto with
 * realized_set_allowed set to true.
 */
3630 static PropertyInfo vfio_pci_migration_multifd_transfer_prop;
3631 
/*
 * qdev properties of the vfio-pci device type, with their default values.
 * Properties prefixed with "x-" follow the usual experimental naming.
 */
3632 static const Property vfio_pci_dev_properties[] = {
3633     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
3634     DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
3635     DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
3636     DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
3637                             vbasedev.pre_copy_dirty_page_tracking,
3638                             ON_OFF_AUTO_ON),
3639     DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
3640                             vbasedev.device_dirty_page_tracking,
3641                             ON_OFF_AUTO_ON),
3642     DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
3643                             display, ON_OFF_AUTO_OFF),
3644     DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
3645     DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
3646     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3647                        intx.mmap_timeout, 1100),
    /* Feature flags: each one is a single bit in VFIOPCIDevice.features. */
3648     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3649                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
3650     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
3651                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
3652     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
3653                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, true),
3654     DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
3655                     VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
3656     DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
3657                             igd_legacy_mode, ON_OFF_AUTO_AUTO),
3658     DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
3659                             vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
    /* Uses the custom PropertyInfo declared above instead of the stock one. */
3660     DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
3661                 vbasedev.migration_multifd_transfer,
3662                 vfio_pci_migration_multifd_transfer_prop, OnOffAuto,
3663                 .set_default = true, .defval.i = ON_OFF_AUTO_AUTO),
3664     DEFINE_PROP_ON_OFF_AUTO("x-migration-load-config-after-iter", VFIOPCIDevice,
3665                             vbasedev.migration_load_config_after_iter,
3666                             ON_OFF_AUTO_AUTO),
3667     DEFINE_PROP_SIZE("x-migration-max-queued-buffers-size", VFIOPCIDevice,
3668                      vbasedev.migration_max_queued_buffers_size, UINT64_MAX),
3669     DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
3670                      vbasedev.migration_events, false),
3671     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3672     DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
3673                      vbasedev.ram_block_discard_allowed, false),
3674     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
3675     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
3676     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3677     DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
3678                      no_geforce_quirks, false),
3679     DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
3680                      false),
3681     DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
3682                      false),
    /* PCI ID overrides; all of them default to PCI_ANY_ID. */
3683     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
3684     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
3685     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
3686                        sub_vendor_id, PCI_ANY_ID),
3687     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
3688                        sub_device_id, PCI_ANY_ID),
3689     DEFINE_PROP_UINT32("x-pci-class-code", VFIOPCIDevice,
3690                        class_code, PCI_ANY_ID),
3691     DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
3692     DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
3693                                    nv_gpudirect_clique,
3694                                    qdev_prop_nv_gpudirect_clique, uint8_t),
3695     DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
3696                                 OFF_AUTO_PCIBAR_OFF),
    /* The iommufd backend link only exists in CONFIG_IOMMUFD builds. */
3697 #ifdef CONFIG_IOMMUFD
3698     DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
3699                      TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
3700 #endif
3701     DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
3702 };
3703 
#ifdef CONFIG_IOMMUFD
/*
 * Setter for the "fd" string property: passes the value to
 * vfio_device_set_fd() for this device's VFIODevice.
 */
static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
{
    VFIODevice *vbasedev = &VFIO_PCI_BASE(obj)->vbasedev;

    vfio_device_set_fd(vbasedev, str, errp);
}
#endif
3711 
3712 static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data)
3713 {
3714     DeviceClass *dc = DEVICE_CLASS(klass);
3715     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3716 
3717     device_class_set_legacy_reset(dc, vfio_pci_reset);
3718     device_class_set_props(dc, vfio_pci_dev_properties);
3719 #ifdef CONFIG_IOMMUFD
3720     object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
3721 #endif
3722     dc->vmsd = &vfio_cpr_pci_vmstate;
3723     dc->desc = "VFIO-based PCI device assignment";
3724     pdc->realize = vfio_pci_realize;
3725 
3726     object_class_property_set_description(klass, /* 1.3 */
3727                                           "host",
3728                                           "Host PCI address [domain:]<bus:slot.function> of assigned device");
3729     object_class_property_set_description(klass, /* 1.3 */
3730                                           "x-intx-mmap-timeout-ms",
3731                                           "When EOI is not provided by KVM/QEMU, wait time "
3732                                           "(milliseconds) to re-enable device direct access "
3733                                           "after INTx (DEBUG)");
3734     object_class_property_set_description(klass, /* 1.5 */
3735                                           "x-vga",
3736                                           "Expose VGA address spaces for device");
3737     object_class_property_set_description(klass, /* 2.3 */
3738                                           "x-req",
3739                                           "Disable device request notification support (DEBUG)");
3740     object_class_property_set_description(klass, /* 2.4 and 2.5 */
3741                                           "x-no-mmap",
3742                                           "Disable MMAP for device. Allows to trace MMIO "
3743                                           "accesses (DEBUG)");
3744     object_class_property_set_description(klass, /* 2.5 */
3745                                           "x-no-kvm-intx",
3746                                           "Disable direct VFIO->KVM INTx injection. Allows to "
3747                                           "trace INTx interrupts (DEBUG)");
3748     object_class_property_set_description(klass, /* 2.5 */
3749                                           "x-no-kvm-msi",
3750                                           "Disable direct VFIO->KVM MSI injection. Allows to "
3751                                           "trace MSI interrupts (DEBUG)");
3752     object_class_property_set_description(klass, /* 2.5 */
3753                                           "x-no-kvm-msix",
3754                                           "Disable direct VFIO->KVM MSIx injection. Allows to "
3755                                           "trace MSIx interrupts (DEBUG)");
3756     object_class_property_set_description(klass, /* 2.5 */
3757                                           "x-pci-vendor-id",
3758                                           "Override PCI Vendor ID with provided value (DEBUG)");
3759     object_class_property_set_description(klass, /* 2.5 */
3760                                           "x-pci-device-id",
3761                                           "Override PCI device ID with provided value (DEBUG)");
3762     object_class_property_set_description(klass, /* 2.5 */
3763                                           "x-pci-sub-vendor-id",
3764                                           "Override PCI Subsystem Vendor ID with provided value "
3765                                           "(DEBUG)");
3766     object_class_property_set_description(klass, /* 2.5 */
3767                                           "x-pci-sub-device-id",
3768                                           "Override PCI Subsystem Device ID with provided value "
3769                                           "(DEBUG)");
3770     object_class_property_set_description(klass, /* 2.6 */
3771                                           "sysfsdev",
3772                                           "Host sysfs path of assigned device");
3773     object_class_property_set_description(klass, /* 2.7 */
3774                                           "x-igd-opregion",
3775                                           "Expose host IGD OpRegion to guest");
3776     object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */
3777                                           "x-igd-gms",
3778                                           "Override IGD data stolen memory size (32MiB units)");
3779     object_class_property_set_description(klass, /* 2.11 */
3780                                           "x-nv-gpudirect-clique",
3781                                           "Add NVIDIA GPUDirect capability indicating P2P DMA "
3782                                           "clique for device [0-15]");
3783     object_class_property_set_description(klass, /* 2.12 */
3784                                           "x-no-geforce-quirks",
3785                                           "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). "
3786                                           "Improves performance");
3787     object_class_property_set_description(klass, /* 2.12 */
3788                                           "display",
3789                                           "Enable display support for device, ex. vGPU");
3790     object_class_property_set_description(klass, /* 2.12 */
3791                                           "x-msix-relocation",
3792                                           "Specify MSI-X MMIO relocation to the end of specified "
3793                                           "existing BAR or new BAR to avoid virtualization overhead "
3794                                           "due to adjacent device registers");
3795     object_class_property_set_description(klass, /* 3.0 */
3796                                           "x-no-kvm-ioeventfd",
3797                                           "Disable registration of ioeventfds with KVM (DEBUG)");
3798     object_class_property_set_description(klass, /* 3.0 */
3799                                           "x-no-vfio-ioeventfd",
3800                                           "Disable linking of KVM ioeventfds to VFIO ioeventfds "
3801                                           "(DEBUG)");
3802     object_class_property_set_description(klass, /* 3.1 */
3803                                           "x-balloon-allowed",
3804                                           "Override allowing ballooning with device (DEBUG, DANGER)");
3805     object_class_property_set_description(klass, /* 3.2 */
3806                                           "xres",
3807                                           "Set X display resolution the vGPU should use");
3808     object_class_property_set_description(klass, /* 3.2 */
3809                                           "yres",
3810                                           "Set Y display resolution the vGPU should use");
3811     object_class_property_set_description(klass, /* 5.2 */
3812                                           "x-pre-copy-dirty-page-tracking",
3813                                           "Disable dirty pages tracking during iterative phase "
3814                                           "(DEBUG)");
3815     object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */
3816                                           "enable-migration",
3817                                           "Enale device migration. Also requires a host VFIO PCI "
3818                                           "variant or mdev driver with migration support enabled");
3819     object_class_property_set_description(klass, /* 8.1 */
3820                                           "vf-token",
3821                                           "Specify UUID VF token. Required for VF when PF is owned "
3822                                           "by another VFIO driver");
3823 #ifdef CONFIG_IOMMUFD
3824     object_class_property_set_description(klass, /* 9.0 */
3825                                           "iommufd",
3826                                           "Set host IOMMUFD backend device");
3827 #endif
3828     object_class_property_set_description(klass, /* 9.1 */
3829                                           "x-device-dirty-page-tracking",
3830                                           "Disable device dirty page tracking and use "
3831                                           "container-based dirty page tracking");
3832     object_class_property_set_description(klass, /* 9.1 */
3833                                           "migration-events",
3834                                           "Emit VFIO migration QAPI event when a VFIO device "
3835                                           "changes its migration state. For management applications");
3836     object_class_property_set_description(klass, /* 9.1 */
3837                                           "skip-vsc-check",
3838                                           "Skip config space check for Vendor Specific Capability. "
3839                                           "Setting to false will enforce strict checking of VSC content "
3840                                           "(DEBUG)");
3841     object_class_property_set_description(klass, /* 10.0 */
3842                                           "x-migration-multifd-transfer",
3843                                           "Transfer this device state via "
3844                                           "multifd channels when live migrating it");
3845     object_class_property_set_description(klass, /* 10.1 */
3846                                           "x-migration-load-config-after-iter",
3847                                           "Start the config load only after "
3848                                           "all iterables were loaded (during "
3849                                           "non-iterables loading phase) when "
3850                                           "doing live migration of device state "
3851                                           "via multifd channels");
3852     object_class_property_set_description(klass, /* 10.1 */
3853                                           "x-migration-max-queued-buffers-size",
3854                                           "Maximum size of in-flight VFIO "
3855                                           "device state buffers queued at the "
3856                                           "destination when doing live "
3857                                           "migration of device state via "
3858                                           "multifd channels");
3859 }
3860 
3861 static const TypeInfo vfio_pci_dev_info = {
3862     .name = TYPE_VFIO_PCI,
3863     .parent = TYPE_VFIO_PCI_BASE,
3864     .class_init = vfio_pci_dev_class_init,
3865     .instance_init = vfio_instance_init,
3866     .instance_finalize = vfio_instance_finalize,
3867 };
3868 
/*
 * Additional qdev properties installed by
 * vfio_pci_nohotplug_dev_class_init() on top of the inherited vfio-pci ones.
 */
3869 static const Property vfio_pci_dev_nohotplug_properties[] = {
3870     DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
3871     DEFINE_PROP_BOOL("use-legacy-x86-rom", VFIOPCIDevice,
3872                      use_legacy_x86_rom, false),
3873     DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate,
3874                             ON_OFF_AUTO_AUTO),
3875 };
3876 
3877 static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass,
3878                                               const void *data)
3879 {
3880     DeviceClass *dc = DEVICE_CLASS(klass);
3881 
3882     device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
3883     dc->hotpluggable = false;
3884 
3885     object_class_property_set_description(klass, /* 3.1 */
3886                                           "ramfb",
3887                                           "Enable ramfb to provide pre-boot graphics for devices "
3888                                           "enabling display option");
3889     object_class_property_set_description(klass, /* 8.2 */
3890                                           "x-ramfb-migrate",
3891                                           "Override default migration support for ramfb support "
3892                                           "(DEBUG)");
3893 }
3894 
3895 static const TypeInfo vfio_pci_nohotplug_dev_info = {
3896     .name = TYPE_VFIO_PCI_NOHOTPLUG,
3897     .parent = TYPE_VFIO_PCI,
3898     .instance_size = sizeof(VFIOPCIDevice),
3899     .class_init = vfio_pci_nohotplug_dev_class_init,
3900 };
3901 
3902 static void register_vfio_pci_dev_type(void)
3903 {
3904     /*
3905      * Ordinary ON_OFF_AUTO property isn't runtime-mutable, but source VM can
3906      * run for a long time before being migrated so it is desirable to have a
3907      * fallback mechanism to the old way of transferring VFIO device state if
3908      * it turns to be necessary.
3909      * The following makes this type of property have the same mutability level
3910      * as ordinary migration parameters.
3911      */
3912     vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto;
3913     vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true;
3914 
3915     type_register_static(&vfio_pci_base_dev_info);
3916     type_register_static(&vfio_pci_dev_info);
3917     type_register_static(&vfio_pci_nohotplug_dev_info);
3918 }
3919 
3920 type_init(register_vfio_pci_dev_type)
3921