xref: /openbmc/qemu/hw/vfio/pci.c (revision 7048e70f391df76d009eecca25f8027858f9f304)
1 /*
2  * vfio based device assignment support
3  *
4  * Copyright Red Hat, Inc. 2012
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Based on qemu-kvm device-assignment:
13  *  Adapted for KVM by Qumranet.
14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19  */
20 
21 #include "qemu/osdep.h"
22 #include CONFIG_DEVICES /* CONFIG_IOMMUFD */
23 #include <linux/vfio.h>
24 #include <sys/ioctl.h>
25 
26 #include "hw/hw.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/pci/pci_bridge.h"
30 #include "hw/qdev-properties.h"
31 #include "hw/qdev-properties-system.h"
32 #include "migration/vmstate.h"
33 #include "migration/cpr.h"
34 #include "qobject/qdict.h"
35 #include "qemu/error-report.h"
36 #include "qemu/main-loop.h"
37 #include "qemu/module.h"
38 #include "qemu/range.h"
39 #include "qemu/units.h"
40 #include "system/kvm.h"
41 #include "system/runstate.h"
42 #include "pci.h"
43 #include "trace.h"
44 #include "qapi/error.h"
45 #include "migration/blocker.h"
46 #include "migration/qemu-file.h"
47 #include "system/iommufd.h"
48 #include "vfio-migration-internal.h"
49 #include "vfio-helpers.h"
50 
/*
 * Type-name string.  NOTE(review): presumably the QOM type name of a
 * vfio-pci variant without hotplug support; its registration is outside
 * this part of the file -- confirm there.
 */
51 #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
52 
/*
 * File-wide KVM route-change handle used by the MSI/MSI-X code below: it is
 * (re)initialised from kvm_irqchip_begin_route_changes(), passed to
 * kvm_irqchip_add_msi_route() for each vector that gets a route, and
 * finished with kvm_irqchip_commit_route_changes().
 */
53 /* Protected by BQL */
54 static KVMRouteChange vfio_route_change;
55 
56 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
57 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
58 static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
59 
60 static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
61                                const char *name, int nr, Error **errp)
62 {
63     int ret = event_notifier_init(e, 0);
64 
65     if (ret) {
66         error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name);
67     }
68     return !ret;
69 }
70 
71 static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e,
72                                   const char *name, int nr)
73 {
74     event_notifier_cleanup(e);
75 }
76 
77 /*
78  * Disabling BAR mmaping can be slow, but toggling it around INTx can
79  * also be a huge overhead.  We try to get the best of both worlds by
80  * waiting until an interrupt to disable mmaps (subsequent transitions
81  * to the same state are effectively no overhead).  If the interrupt has
82  * been serviced and the time gap is long enough, we re-enable mmaps for
83  * performance.  This works well for things like graphics cards, which
84  * may not use their interrupt at all and are penalized to an unusable
85  * level by read/write BAR traps.  Other devices, like NICs, have more
86  * regular interrupts and see much better latency by staying in non-mmap
87  * mode.  We therefore set the default mmap_timeout such that a ping
88  * is just enough to keep the mmap disabled.  Users can experiment with
89  * other options with the x-intx-mmap-timeout-ms parameter (a value of
90  * zero disables the timer).
91  */
92 static void vfio_intx_mmap_enable(void *opaque)
93 {
94     VFIOPCIDevice *vdev = opaque;
95 
96     if (vdev->intx.pending) {
97         timer_mod(vdev->intx.mmap_timer,
98                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
99         return;
100     }
101 
102     vfio_mmap_set_enabled(vdev, true);
103 }
104 
105 static void vfio_intx_interrupt(void *opaque)
106 {
107     VFIOPCIDevice *vdev = opaque;
108 
109     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
110         return;
111     }
112 
113     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
114 
115     vdev->intx.pending = true;
116     pci_irq_assert(&vdev->pdev);
117     vfio_mmap_set_enabled(vdev, false);
118     if (vdev->intx.mmap_timeout) {
119         timer_mod(vdev->intx.mmap_timer,
120                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
121     }
122 }
123 
124 void vfio_pci_intx_eoi(VFIODevice *vbasedev)
125 {
126     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
127 
128     if (!vdev->intx.pending) {
129         return;
130     }
131 
132     trace_vfio_pci_intx_eoi(vbasedev->name);
133 
134     vdev->intx.pending = false;
135     pci_irq_deassert(&vdev->pdev);
136     vfio_device_irq_unmask(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
137 }
138 
139 static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
140 {
141 #ifdef CONFIG_KVM
142     int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
143 
144     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
145         vdev->intx.route.mode != PCI_INTX_ENABLED ||
146         !kvm_resamplefds_enabled()) {
147         return true;
148     }
149 
150     /* Get to a known interrupt state */
151     qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
152     vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
153     vdev->intx.pending = false;
154     pci_irq_deassert(&vdev->pdev);
155 
156     /* Get an eventfd for resample/unmask */
157     if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
158         goto fail;
159     }
160 
161     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
162                                            &vdev->intx.interrupt,
163                                            &vdev->intx.unmask,
164                                            vdev->intx.route.irq)) {
165         error_setg_errno(errp, errno, "failed to setup resample irqfd");
166         goto fail_irqfd;
167     }
168 
169     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
170                                        VFIO_IRQ_SET_ACTION_UNMASK,
171                                        event_notifier_get_fd(&vdev->intx.unmask),
172                                        errp)) {
173         goto fail_vfio;
174     }
175 
176     /* Let'em rip */
177     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
178 
179     vdev->intx.kvm_accel = true;
180 
181     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
182 
183     return true;
184 
185 fail_vfio:
186     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
187                                           vdev->intx.route.irq);
188 fail_irqfd:
189     vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
190 fail:
191     qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
192     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
193     return false;
194 #else
195     return true;
196 #endif
197 }
198 
199 static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
200 {
201 #ifdef CONFIG_KVM
202     if (!vdev->intx.kvm_accel) {
203         return;
204     }
205 
206     /*
207      * Get to a known state, hardware masked, QEMU ready to accept new
208      * interrupts, QEMU IRQ de-asserted.
209      */
210     vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
211     vdev->intx.pending = false;
212     pci_irq_deassert(&vdev->pdev);
213 
214     /* Tell KVM to stop listening for an INTx irqfd */
215     if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
216                                               vdev->intx.route.irq)) {
217         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
218     }
219 
220     /* We only need to close the eventfd for VFIO to cleanup the kernel side */
221     vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
222 
223     /* QEMU starts listening for interrupt events. */
224     qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
225                         vfio_intx_interrupt, NULL, vdev);
226 
227     vdev->intx.kvm_accel = false;
228 
229     /* If we've missed an event, let it re-fire through QEMU */
230     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
231 
232     trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
233 #endif
234 }
235 
236 static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
237 {
238     Error *err = NULL;
239 
240     trace_vfio_intx_update(vdev->vbasedev.name,
241                            vdev->intx.route.irq, route->irq);
242 
243     vfio_intx_disable_kvm(vdev);
244 
245     vdev->intx.route = *route;
246 
247     if (route->mode != PCI_INTX_ENABLED) {
248         return;
249     }
250 
251     if (!vfio_intx_enable_kvm(vdev, &err)) {
252         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
253     }
254 
255     /* Re-enable the interrupt in cased we missed an EOI */
256     vfio_pci_intx_eoi(&vdev->vbasedev);
257 }
258 
259 static void vfio_intx_routing_notifier(PCIDevice *pdev)
260 {
261     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
262     PCIINTxRoute route;
263 
264     if (vdev->interrupt != VFIO_INT_INTx) {
265         return;
266     }
267 
268     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
269 
270     if (pci_intx_route_changed(&vdev->intx.route, &route)) {
271         vfio_intx_update(vdev, &route);
272     }
273 }
274 
275 static void vfio_irqchip_change(Notifier *notify, void *data)
276 {
277     VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
278                                        irqchip_change_notifier);
279 
280     vfio_intx_update(vdev, &vdev->intx.route);
281 }
282 
283 static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
284 {
285     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
286     Error *err = NULL;
287     int32_t fd;
288 
289 
290     if (!pin) {
291         return true;
292     }
293 
294     vfio_disable_interrupts(vdev);
295 
296     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
297     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
298 
299 #ifdef CONFIG_KVM
300     /*
301      * Only conditional to avoid generating error messages on platforms
302      * where we won't actually use the result anyway.
303      */
304     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
305         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
306                                                         vdev->intx.pin);
307     }
308 #endif
309 
310     if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0,
311                             errp)) {
312         return false;
313     }
314     fd = event_notifier_get_fd(&vdev->intx.interrupt);
315     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
316 
317     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
318                                 VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
319         qemu_set_fd_handler(fd, NULL, NULL, vdev);
320         vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
321         return false;
322     }
323 
324     if (!vfio_intx_enable_kvm(vdev, &err)) {
325         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
326     }
327 
328     vdev->interrupt = VFIO_INT_INTx;
329 
330     trace_vfio_intx_enable(vdev->vbasedev.name);
331     return true;
332 }
333 
334 static void vfio_intx_disable(VFIOPCIDevice *vdev)
335 {
336     int fd;
337 
338     timer_del(vdev->intx.mmap_timer);
339     vfio_intx_disable_kvm(vdev);
340     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
341     vdev->intx.pending = false;
342     pci_irq_deassert(&vdev->pdev);
343     vfio_mmap_set_enabled(vdev, true);
344 
345     fd = event_notifier_get_fd(&vdev->intx.interrupt);
346     qemu_set_fd_handler(fd, NULL, NULL, vdev);
347     vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
348 
349     vdev->interrupt = VFIO_INT_NONE;
350 
351     trace_vfio_intx_disable(vdev->vbasedev.name);
352 }
353 
354 bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp)
355 {
356     return vfio_intx_enable(vdev, errp);
357 }
358 
359 /*
360  * MSI/X
361  */
362 static void vfio_msi_interrupt(void *opaque)
363 {
364     VFIOMSIVector *vector = opaque;
365     VFIOPCIDevice *vdev = vector->vdev;
366     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
367     void (*notify)(PCIDevice *dev, unsigned vector);
368     MSIMessage msg;
369     int nr = vector - vdev->msi_vectors;
370 
371     if (!event_notifier_test_and_clear(&vector->interrupt)) {
372         return;
373     }
374 
375     if (vdev->interrupt == VFIO_INT_MSIX) {
376         get_msg = msix_get_message;
377         notify = msix_notify;
378 
379         /* A masked vector firing needs to use the PBA, enable it */
380         if (msix_is_masked(&vdev->pdev, nr)) {
381             set_bit(nr, vdev->msix->pending);
382             memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
383             trace_vfio_msix_pba_enable(vdev->vbasedev.name);
384         }
385     } else if (vdev->interrupt == VFIO_INT_MSI) {
386         get_msg = msi_get_message;
387         notify = msi_notify;
388     } else {
389         abort();
390     }
391 
392     msg = get_msg(&vdev->pdev, nr);
393     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
394     notify(&vdev->pdev, nr);
395 }
396 
397 /*
398  * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid
399  * fd to kernel.
400  */
401 static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
402 {
403     g_autofree struct vfio_irq_set *irq_set = NULL;
404     int argsz;
405     int32_t *fd;
406 
407     argsz = sizeof(*irq_set) + sizeof(*fd);
408 
409     irq_set = g_malloc0(argsz);
410     irq_set->argsz = argsz;
411     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
412                      VFIO_IRQ_SET_ACTION_TRIGGER;
413     irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
414     irq_set->start = 0;
415     irq_set->count = 1;
416     fd = (int32_t *)&irq_set->data;
417     *fd = -1;
418 
419     return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
420 }
421 
422 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
423 {
424     struct vfio_irq_set *irq_set;
425     int ret = 0, i, argsz;
426     int32_t *fds;
427 
428     /*
429      * If dynamic MSI-X allocation is supported, the vectors to be allocated
430      * and enabled can be scattered. Before kernel enabling MSI-X, setting
431      * nr_vectors causes all these vectors to be allocated on host.
432      *
433      * To keep allocation as needed, use vector 0 with an invalid fd to get
434      * MSI-X enabled first, then set vectors with a potentially sparse set of
435      * eventfds to enable interrupts only when enabled in guest.
436      */
437     if (msix && !vdev->msix->noresize) {
438         ret = vfio_enable_msix_no_vec(vdev);
439 
440         if (ret) {
441             return ret;
442         }
443     }
444 
445     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
446 
447     irq_set = g_malloc0(argsz);
448     irq_set->argsz = argsz;
449     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
450     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
451     irq_set->start = 0;
452     irq_set->count = vdev->nr_vectors;
453     fds = (int32_t *)&irq_set->data;
454 
455     for (i = 0; i < vdev->nr_vectors; i++) {
456         int fd = -1;
457 
458         /*
459          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
460          * bits, therefore we always use the KVM signaling path when setup.
461          * MSI-X mask and pending bits are emulated, so we want to use the
462          * KVM signaling path only when configured and unmasked.
463          */
464         if (vdev->msi_vectors[i].use) {
465             if (vdev->msi_vectors[i].virq < 0 ||
466                 (msix && msix_is_masked(&vdev->pdev, i))) {
467                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
468             } else {
469                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
470             }
471         }
472 
473         fds[i] = fd;
474     }
475 
476     ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
477 
478     g_free(irq_set);
479 
480     return ret;
481 }
482 
483 void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
484                                int vector_n, bool msix)
485 {
486     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
487         return;
488     }
489 
490     vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change,
491                                              vector_n, &vdev->pdev);
492 }
493 
494 static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr)
495 {
496     const char *name = "kvm_interrupt";
497 
498     if (vector->virq < 0) {
499         return;
500     }
501 
502     if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr,
503                             NULL)) {
504         goto fail_notifier;
505     }
506 
507     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
508                                            NULL, vector->virq) < 0) {
509         goto fail_kvm;
510     }
511 
512     return;
513 
514 fail_kvm:
515     vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr);
516 fail_notifier:
517     kvm_irqchip_release_virq(kvm_state, vector->virq);
518     vector->virq = -1;
519 }
520 
521 static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
522                                      int nr)
523 {
524     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
525                                           vector->virq);
526     kvm_irqchip_release_virq(kvm_state, vector->virq);
527     vector->virq = -1;
528     vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr);
529 }
530 
531 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
532                                      PCIDevice *pdev)
533 {
534     kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
535     kvm_irqchip_commit_routes(kvm_state);
536 }
537 
538 static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector,
539                                unsigned int nr)
540 {
541     Error *err = NULL;
542     int32_t fd;
543 
544     if (vector->virq >= 0) {
545         fd = event_notifier_get_fd(&vector->kvm_interrupt);
546     } else {
547         fd = event_notifier_get_fd(&vector->interrupt);
548     }
549 
550     if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
551                                        VFIO_IRQ_SET_ACTION_TRIGGER,
552                                        fd, &err)) {
553         error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
554     }
555 }
556 
557 void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr)
558 {
559     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
560     PCIDevice *pdev = &vdev->pdev;
561     Error *local_err = NULL;
562 
563     vector->vdev = vdev;
564     vector->virq = -1;
565     if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr,
566                             &local_err)) {
567         error_report_err(local_err);
568     }
569     vector->use = true;
570     if (vdev->interrupt == VFIO_INT_MSIX) {
571         msix_vector_use(pdev, nr);
572     }
573 }
574 
575 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
576                                    MSIMessage *msg, IOHandler *handler)
577 {
578     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
579     VFIOMSIVector *vector;
580     int ret;
581     bool resizing = !!(vdev->nr_vectors < nr + 1);
582 
583     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
584 
585     vector = &vdev->msi_vectors[nr];
586 
587     if (!vector->use) {
588         vfio_pci_vector_init(vdev, nr);
589     }
590 
591     qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
592                         handler, NULL, vector);
593 
594     /*
595      * Attempt to enable route through KVM irqchip,
596      * default to userspace handling if unavailable.
597      */
598     if (vector->virq >= 0) {
599         if (!msg) {
600             vfio_remove_kvm_msi_virq(vdev, vector, nr);
601         } else {
602             vfio_update_kvm_msi_virq(vector, *msg, pdev);
603         }
604     } else {
605         if (msg) {
606             if (vdev->defer_kvm_irq_routing) {
607                 vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
608             } else {
609                 vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
610                 vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
611                 kvm_irqchip_commit_route_changes(&vfio_route_change);
612                 vfio_connect_kvm_msi_virq(vector, nr);
613             }
614         }
615     }
616 
617     /*
618      * When dynamic allocation is not supported, we don't want to have the
619      * host allocate all possible MSI vectors for a device if they're not
620      * in use, so we shutdown and incrementally increase them as needed.
621      * nr_vectors represents the total number of vectors allocated.
622      *
623      * When dynamic allocation is supported, let the host only allocate
624      * and enable a vector when it is in use in guest. nr_vectors represents
625      * the upper bound of vectors being enabled (but not all of the ranges
626      * is allocated or enabled).
627      */
628     if (resizing) {
629         vdev->nr_vectors = nr + 1;
630     }
631 
632     if (!vdev->defer_kvm_irq_routing) {
633         if (vdev->msix->noresize && resizing) {
634             vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
635             ret = vfio_enable_vectors(vdev, true);
636             if (ret) {
637                 error_report("vfio: failed to enable vectors, %s",
638                              strerror(-ret));
639             }
640         } else {
641             set_irq_signalling(&vdev->vbasedev, vector, nr);
642         }
643     }
644 
645     /* Disable PBA emulation when nothing more is pending. */
646     clear_bit(nr, vdev->msix->pending);
647     if (find_first_bit(vdev->msix->pending,
648                        vdev->nr_vectors) == vdev->nr_vectors) {
649         memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
650         trace_vfio_msix_pba_disable(vdev->vbasedev.name);
651     }
652 
653     return 0;
654 }
655 
656 static int vfio_msix_vector_use(PCIDevice *pdev,
657                                 unsigned int nr, MSIMessage msg)
658 {
659     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
660 }
661 
662 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
663 {
664     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
665     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
666 
667     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
668 
669     /*
670      * There are still old guests that mask and unmask vectors on every
671      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
672      * the KVM setup in place, simply switch VFIO to use the non-bypass
673      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
674      * core will mask the interrupt and set pending bits, allowing it to
675      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
676      */
677     if (vector->virq >= 0) {
678         int32_t fd = event_notifier_get_fd(&vector->interrupt);
679         Error *err = NULL;
680 
681         if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
682                                     nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
683                                     &err)) {
684             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
685         }
686     }
687 }
688 
689 void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
690 {
691     assert(!vdev->defer_kvm_irq_routing);
692     vdev->defer_kvm_irq_routing = true;
693     vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
694 }
695 
696 void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
697 {
698     int i;
699 
700     assert(vdev->defer_kvm_irq_routing);
701     vdev->defer_kvm_irq_routing = false;
702 
703     kvm_irqchip_commit_route_changes(&vfio_route_change);
704 
705     for (i = 0; i < vdev->nr_vectors; i++) {
706         vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i);
707     }
708 }
709 
710 static void vfio_msix_enable(VFIOPCIDevice *vdev)
711 {
712     int ret;
713 
714     vfio_disable_interrupts(vdev);
715 
716     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
717 
718     vdev->interrupt = VFIO_INT_MSIX;
719 
720     /*
721      * Setting vector notifiers triggers synchronous vector-use
722      * callbacks for each active vector.  Deferring to commit the KVM
723      * routes once rather than per vector provides a substantial
724      * performance improvement.
725      */
726     vfio_pci_prepare_kvm_msi_virq_batch(vdev);
727 
728     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
729                                   vfio_msix_vector_release, NULL)) {
730         error_report("vfio: msix_set_vector_notifiers failed");
731     }
732 
733     vfio_pci_commit_kvm_msi_virq_batch(vdev);
734 
735     if (vdev->nr_vectors) {
736         ret = vfio_enable_vectors(vdev, true);
737         if (ret) {
738             error_report("vfio: failed to enable vectors, %s",
739                          strerror(-ret));
740         }
741     } else {
742         /*
743          * Some communication channels between VF & PF or PF & fw rely on the
744          * physical state of the device and expect that enabling MSI-X from the
745          * guest enables the same on the host.  When our guest is Linux, the
746          * guest driver call to pci_enable_msix() sets the enabling bit in the
747          * MSI-X capability, but leaves the vector table masked.  We therefore
748          * can't rely on a vector_use callback (from request_irq() in the guest)
749          * to switch the physical device into MSI-X mode because that may come a
750          * long time after pci_enable_msix().  This code sets vector 0 with an
751          * invalid fd to make the physical device MSI-X enabled, but with no
752          * vectors enabled, just like the guest view.
753          */
754         ret = vfio_enable_msix_no_vec(vdev);
755         if (ret) {
756             error_report("vfio: failed to enable MSI-X, %s",
757                          strerror(-ret));
758         }
759     }
760 
761     trace_vfio_msix_enable(vdev->vbasedev.name);
762 }
763 
764 static void vfio_msi_enable(VFIOPCIDevice *vdev)
765 {
766     int ret, i;
767 
768     vfio_disable_interrupts(vdev);
769 
770     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
771 retry:
772     /*
773      * Setting vector notifiers needs to enable route for each vector.
774      * Deferring to commit the KVM routes once rather than per vector
775      * provides a substantial performance improvement.
776      */
777     vfio_pci_prepare_kvm_msi_virq_batch(vdev);
778 
779     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
780 
781     for (i = 0; i < vdev->nr_vectors; i++) {
782         VFIOMSIVector *vector = &vdev->msi_vectors[i];
783         Error *local_err = NULL;
784 
785         vector->vdev = vdev;
786         vector->virq = -1;
787         vector->use = true;
788 
789         if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i,
790                                 &local_err)) {
791             error_report_err(local_err);
792         }
793 
794         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
795                             vfio_msi_interrupt, NULL, vector);
796 
797         /*
798          * Attempt to enable route through KVM irqchip,
799          * default to userspace handling if unavailable.
800          */
801         vfio_pci_add_kvm_msi_virq(vdev, vector, i, false);
802     }
803 
804     vfio_pci_commit_kvm_msi_virq_batch(vdev);
805 
806     /* Set interrupt type prior to possible interrupts */
807     vdev->interrupt = VFIO_INT_MSI;
808 
809     ret = vfio_enable_vectors(vdev, false);
810     if (ret) {
811         if (ret < 0) {
812             error_report("vfio: Error: Failed to setup MSI fds: %s",
813                          strerror(-ret));
814         } else {
815             error_report("vfio: Error: Failed to enable %d "
816                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
817         }
818 
819         vfio_msi_disable_common(vdev);
820 
821         if (ret > 0) {
822             vdev->nr_vectors = ret;
823             goto retry;
824         }
825 
826         /*
827          * Failing to setup MSI doesn't really fall within any specification.
828          * Let's try leaving interrupts disabled and hope the guest figures
829          * out to fall back to INTx for this device.
830          */
831         error_report("vfio: Error: Failed to enable MSI");
832 
833         return;
834     }
835 
836     trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
837 }
838 
839 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
840 {
841     int i;
842 
843     for (i = 0; i < vdev->nr_vectors; i++) {
844         VFIOMSIVector *vector = &vdev->msi_vectors[i];
845         if (vdev->msi_vectors[i].use) {
846             if (vector->virq >= 0) {
847                 vfio_remove_kvm_msi_virq(vdev, vector, i);
848             }
849             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
850                                 NULL, NULL, NULL);
851             vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i);
852         }
853     }
854 
855     g_free(vdev->msi_vectors);
856     vdev->msi_vectors = NULL;
857     vdev->nr_vectors = 0;
858     vdev->interrupt = VFIO_INT_NONE;
859 }
860 
861 static void vfio_msix_disable(VFIOPCIDevice *vdev)
862 {
863     Error *err = NULL;
864     int i;
865 
866     msix_unset_vector_notifiers(&vdev->pdev);
867 
868     /*
869      * MSI-X will only release vectors if MSI-X is still enabled on the
870      * device, check through the rest and release it ourselves if necessary.
871      */
872     for (i = 0; i < vdev->nr_vectors; i++) {
873         if (vdev->msi_vectors[i].use) {
874             vfio_msix_vector_release(&vdev->pdev, i);
875             msix_vector_unuse(&vdev->pdev, i);
876         }
877     }
878 
879     /*
880      * Always clear MSI-X IRQ index. A PF device could have enabled
881      * MSI-X with no vectors. See vfio_msix_enable().
882      */
883     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
884 
885     vfio_msi_disable_common(vdev);
886     if (!vfio_intx_enable(vdev, &err)) {
887         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
888     }
889 
890     memset(vdev->msix->pending, 0,
891            BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
892 
893     trace_vfio_msix_disable(vdev->vbasedev.name);
894 }
895 
896 static void vfio_msi_disable(VFIOPCIDevice *vdev)
897 {
898     Error *err = NULL;
899 
900     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
901     vfio_msi_disable_common(vdev);
902     vfio_intx_enable(vdev, &err);
903     if (err) {
904         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
905     }
906 
907     trace_vfio_msi_disable(vdev->vbasedev.name);
908 }
909 
910 static void vfio_update_msi(VFIOPCIDevice *vdev)
911 {
912     int i;
913 
914     for (i = 0; i < vdev->nr_vectors; i++) {
915         VFIOMSIVector *vector = &vdev->msi_vectors[i];
916         MSIMessage msg;
917 
918         if (!vector->use || vector->virq < 0) {
919             continue;
920         }
921 
922         msg = msi_get_message(&vdev->pdev, i);
923         vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
924     }
925 }
926 
927 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
928 {
929     VFIODevice *vbasedev = &vdev->vbasedev;
930     struct vfio_region_info *reg_info = NULL;
931     uint64_t size;
932     off_t off = 0;
933     ssize_t bytes;
934     int ret;
935 
936     ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX,
937                                       &reg_info);
938 
939     if (ret != 0) {
940         error_report("vfio: Error getting ROM info: %s", strerror(-ret));
941         return;
942     }
943 
944     trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size,
945                             (unsigned long)reg_info->offset,
946                             (unsigned long)reg_info->flags);
947 
948     vdev->rom_size = size = reg_info->size;
949     vdev->rom_offset = reg_info->offset;
950 
951     if (!vdev->rom_size) {
952         vdev->rom_read_failed = true;
953         error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name);
954         error_printf("Device option ROM contents are probably invalid "
955                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
956                     "or load from file with romfile=\n");
957         return;
958     }
959 
960     vdev->rom = g_malloc(size);
961     memset(vdev->rom, 0xff, size);
962 
963     while (size) {
964         bytes = vbasedev->io_ops->region_read(vbasedev,
965                                               VFIO_PCI_ROM_REGION_INDEX,
966                                               off, size, vdev->rom + off);
967 
968         if (bytes == 0) {
969             break;
970         } else if (bytes > 0) {
971             off += bytes;
972             size -= bytes;
973         } else {
974             if (bytes == -EINTR || bytes == -EAGAIN) {
975                 continue;
976             }
977             error_report("vfio: Error reading device ROM: %s",
978                          strreaderror(bytes));
979 
980             break;
981         }
982     }
983 
984     /*
985      * Test the ROM signature against our device, if the vendor is correct
986      * but the device ID doesn't match, store the correct device ID and
987      * recompute the checksum.  Intel IGD devices need this and are known
988      * to have bogus checksums so we can't simply adjust the checksum.
989      */
990     if (pci_get_word(vdev->rom) == 0xaa55 &&
991         pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
992         !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
993         uint16_t vid, did;
994 
995         vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
996         did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);
997 
998         if (vid == vdev->vendor_id && did != vdev->device_id) {
999             int i;
1000             uint8_t csum, *data = vdev->rom;
1001 
1002             pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
1003                          vdev->device_id);
1004             data[6] = 0;
1005 
1006             for (csum = 0, i = 0; i < vdev->rom_size; i++) {
1007                 csum += data[i];
1008             }
1009 
1010             data[6] = -csum;
1011         }
1012     }
1013 }
1014 
1015 /* "Raw" read of underlying config space. */
1016 static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
1017                                       uint32_t size, void *data)
1018 {
1019     return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev,
1020                                               VFIO_PCI_CONFIG_REGION_INDEX,
1021                                               offset, size, data);
1022 }
1023 
1024 /* "Raw" write of underlying config space. */
1025 static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
1026                                        uint32_t size, void *data)
1027 {
1028     return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
1029                                                VFIO_PCI_CONFIG_REGION_INDEX,
1030                                                offset, size, data, false);
1031 }
1032 
1033 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
1034 {
1035     VFIOPCIDevice *vdev = opaque;
1036     union {
1037         uint8_t byte;
1038         uint16_t word;
1039         uint32_t dword;
1040         uint64_t qword;
1041     } val;
1042     uint64_t data = 0;
1043 
1044     /* Load the ROM lazily when the guest tries to read it */
1045     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
1046         vfio_pci_load_rom(vdev);
1047     }
1048 
1049     memcpy(&val, vdev->rom + addr,
1050            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
1051 
1052     switch (size) {
1053     case 1:
1054         data = val.byte;
1055         break;
1056     case 2:
1057         data = le16_to_cpu(val.word);
1058         break;
1059     case 4:
1060         data = le32_to_cpu(val.dword);
1061         break;
1062     default:
1063         hw_error("vfio: unsupported read size, %d bytes\n", size);
1064         break;
1065     }
1066 
1067     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
1068 
1069     return data;
1070 }
1071 
1072 static void vfio_rom_write(void *opaque, hwaddr addr,
1073                            uint64_t data, unsigned size)
1074 {
1075 }
1076 
1077 static const MemoryRegionOps vfio_rom_ops = {
1078     .read = vfio_rom_read,
1079     .write = vfio_rom_write,
1080     .endianness = DEVICE_LITTLE_ENDIAN,
1081 };
1082 
1083 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
1084 {
1085     VFIODevice *vbasedev = &vdev->vbasedev;
1086     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
1087     char *name;
1088 
1089     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
1090         /* Since pci handles romfile, just print a message and return */
1091         if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) {
1092             warn_report("Device at %s is known to cause system instability"
1093                         " issues during option rom execution",
1094                         vdev->vbasedev.name);
1095             error_printf("Proceeding anyway since user specified romfile\n");
1096         }
1097         return;
1098     }
1099 
1100     /*
1101      * Use the same size ROM BAR as the physical device.  The contents
1102      * will get filled in later when the guest tries to read it.
1103      */
1104     if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 ||
1105         vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
1106         vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
1107         vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) {
1108 
1109         error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
1110         return;
1111     }
1112 
1113     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
1114 
1115     if (!size) {
1116         return;
1117     }
1118 
1119     if (vfio_opt_rom_in_denylist(vdev)) {
1120         if (vdev->pdev.rom_bar > 0) {
1121             warn_report("Device at %s is known to cause system instability"
1122                         " issues during option rom execution",
1123                         vdev->vbasedev.name);
1124             error_printf("Proceeding anyway since user specified"
1125                          " positive value for rombar\n");
1126         } else {
1127             warn_report("Rom loading for device at %s has been disabled"
1128                         " due to system instability issues",
1129                         vdev->vbasedev.name);
1130             error_printf("Specify rombar=1 or romfile to force\n");
1131             return;
1132         }
1133     }
1134 
1135     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
1136 
1137     name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);
1138 
1139     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
1140                           &vfio_rom_ops, vdev, name, size);
1141     g_free(name);
1142 
1143     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
1144                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
1145 
1146     vdev->rom_read_failed = false;
1147 }
1148 
1149 void vfio_vga_write(void *opaque, hwaddr addr,
1150                            uint64_t data, unsigned size)
1151 {
1152     VFIOVGARegion *region = opaque;
1153     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1154     union {
1155         uint8_t byte;
1156         uint16_t word;
1157         uint32_t dword;
1158         uint64_t qword;
1159     } buf;
1160     off_t offset = vga->fd_offset + region->offset + addr;
1161 
1162     switch (size) {
1163     case 1:
1164         buf.byte = data;
1165         break;
1166     case 2:
1167         buf.word = cpu_to_le16(data);
1168         break;
1169     case 4:
1170         buf.dword = cpu_to_le32(data);
1171         break;
1172     default:
1173         hw_error("vfio: unsupported write size, %d bytes", size);
1174         break;
1175     }
1176 
1177     if (pwrite(vga->fd, &buf, size, offset) != size) {
1178         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1179                      __func__, region->offset + addr, data, size);
1180     }
1181 
1182     trace_vfio_vga_write(region->offset + addr, data, size);
1183 }
1184 
1185 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1186 {
1187     VFIOVGARegion *region = opaque;
1188     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1189     union {
1190         uint8_t byte;
1191         uint16_t word;
1192         uint32_t dword;
1193         uint64_t qword;
1194     } buf;
1195     uint64_t data = 0;
1196     off_t offset = vga->fd_offset + region->offset + addr;
1197 
1198     if (pread(vga->fd, &buf, size, offset) != size) {
1199         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1200                      __func__, region->offset + addr, size);
1201         return (uint64_t)-1;
1202     }
1203 
1204     switch (size) {
1205     case 1:
1206         data = buf.byte;
1207         break;
1208     case 2:
1209         data = le16_to_cpu(buf.word);
1210         break;
1211     case 4:
1212         data = le32_to_cpu(buf.dword);
1213         break;
1214     default:
1215         hw_error("vfio: unsupported read size, %d bytes", size);
1216         break;
1217     }
1218 
1219     trace_vfio_vga_read(region->offset + addr, size, data);
1220 
1221     return data;
1222 }
1223 
1224 static const MemoryRegionOps vfio_vga_ops = {
1225     .read = vfio_vga_read,
1226     .write = vfio_vga_write,
1227     .endianness = DEVICE_LITTLE_ENDIAN,
1228 };
1229 
1230 /*
1231  * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page
1232  * size if the BAR is in an exclusive page in host so that we could map
1233  * this BAR to guest. But this sub-page BAR may not occupy an exclusive
1234  * page in guest. So we should set the priority of the expanded memory
1235  * region to zero in case of overlap with BARs which share the same page
1236  * with the sub-page BAR in guest. Besides, we should also recover the
1237  * size of this sub-page BAR when its base address is changed in guest
1238  * and not page aligned any more.
1239  */
1240 static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
1241 {
1242     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1243     VFIORegion *region = &vdev->bars[bar].region;
1244     MemoryRegion *mmap_mr, *region_mr, *base_mr;
1245     PCIIORegion *r;
1246     pcibus_t bar_addr;
1247     uint64_t size = region->size;
1248 
1249     /* Make sure that the whole region is allowed to be mmapped */
1250     if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
1251         region->mmaps[0].size != region->size) {
1252         return;
1253     }
1254 
1255     r = &pdev->io_regions[bar];
1256     bar_addr = r->addr;
1257     base_mr = vdev->bars[bar].mr;
1258     region_mr = region->mem;
1259     mmap_mr = &region->mmaps[0].mem;
1260 
1261     /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
1262     if (bar_addr != PCI_BAR_UNMAPPED &&
1263         !(bar_addr & ~qemu_real_host_page_mask())) {
1264         size = qemu_real_host_page_size();
1265     }
1266 
1267     memory_region_transaction_begin();
1268 
1269     if (vdev->bars[bar].size < size) {
1270         memory_region_set_size(base_mr, size);
1271     }
1272     memory_region_set_size(region_mr, size);
1273     memory_region_set_size(mmap_mr, size);
1274     if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
1275         memory_region_del_subregion(r->address_space, base_mr);
1276         memory_region_add_subregion_overlap(r->address_space,
1277                                             bar_addr, base_mr, 0);
1278     }
1279 
1280     memory_region_transaction_commit();
1281 }
1282 
1283 /*
1284  * PCI config space
1285  */
1286 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1287 {
1288     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1289     VFIODevice *vbasedev = &vdev->vbasedev;
1290     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1291 
1292     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1293     emu_bits = le32_to_cpu(emu_bits);
1294 
1295     if (emu_bits) {
1296         emu_val = pci_default_read_config(pdev, addr, len);
1297     }
1298 
1299     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1300         ssize_t ret;
1301 
1302         ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val);
1303         if (ret != len) {
1304             error_report("%s(%s, 0x%x, 0x%x) failed: %s",
1305                          __func__, vbasedev->name, addr, len,
1306                          strreaderror(ret));
1307             return -1;
1308         }
1309         phys_val = le32_to_cpu(phys_val);
1310     }
1311 
1312     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1313 
1314     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1315 
1316     return val;
1317 }
1318 
1319 void vfio_pci_write_config(PCIDevice *pdev,
1320                            uint32_t addr, uint32_t val, int len)
1321 {
1322     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1323     VFIODevice *vbasedev = &vdev->vbasedev;
1324     uint32_t val_le = cpu_to_le32(val);
1325     int ret;
1326 
1327     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1328 
1329     /* Write everything to VFIO, let it filter out what we can't write */
1330     ret = vfio_pci_config_space_write(vdev, addr, len, &val_le);
1331     if (ret != len) {
1332         error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
1333                      __func__, vbasedev->name, addr, val, len,
1334                     strwriteerror(ret));
1335     }
1336 
1337     /* MSI/MSI-X Enabling/Disabling */
1338     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1339         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1340         int is_enabled, was_enabled = msi_enabled(pdev);
1341 
1342         pci_default_write_config(pdev, addr, val, len);
1343 
1344         is_enabled = msi_enabled(pdev);
1345 
1346         if (!was_enabled) {
1347             if (is_enabled) {
1348                 vfio_msi_enable(vdev);
1349             }
1350         } else {
1351             if (!is_enabled) {
1352                 vfio_msi_disable(vdev);
1353             } else {
1354                 vfio_update_msi(vdev);
1355             }
1356         }
1357     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1358         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1359         int is_enabled, was_enabled = msix_enabled(pdev);
1360 
1361         pci_default_write_config(pdev, addr, val, len);
1362 
1363         is_enabled = msix_enabled(pdev);
1364 
1365         if (!was_enabled && is_enabled) {
1366             vfio_msix_enable(vdev);
1367         } else if (was_enabled && !is_enabled) {
1368             vfio_msix_disable(vdev);
1369         }
1370     } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
1371         range_covers_byte(addr, len, PCI_COMMAND)) {
1372         pcibus_t old_addr[PCI_NUM_REGIONS - 1];
1373         int bar;
1374 
1375         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1376             old_addr[bar] = pdev->io_regions[bar].addr;
1377         }
1378 
1379         pci_default_write_config(pdev, addr, val, len);
1380 
1381         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1382             if (old_addr[bar] != pdev->io_regions[bar].addr &&
1383                 vdev->bars[bar].region.size > 0 &&
1384                 vdev->bars[bar].region.size < qemu_real_host_page_size()) {
1385                 vfio_sub_page_bar_update_mapping(pdev, bar);
1386             }
1387         }
1388     } else {
1389         /* Write everything to QEMU to keep emulated bits correct */
1390         pci_default_write_config(pdev, addr, val, len);
1391     }
1392 }
1393 
1394 /*
1395  * Interrupt setup
1396  */
1397 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1398 {
1399     /*
1400      * More complicated than it looks.  Disabling MSI/X transitions the
1401      * device to INTx mode (if supported).  Therefore we need to first
1402      * disable MSI/X and then cleanup by disabling INTx.
1403      */
1404     if (vdev->interrupt == VFIO_INT_MSIX) {
1405         vfio_msix_disable(vdev);
1406     } else if (vdev->interrupt == VFIO_INT_MSI) {
1407         vfio_msi_disable(vdev);
1408     }
1409 
1410     if (vdev->interrupt == VFIO_INT_INTx) {
1411         vfio_intx_disable(vdev);
1412     }
1413 }
1414 
1415 static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1416 {
1417     uint16_t ctrl;
1418     bool msi_64bit, msi_maskbit;
1419     int ret, entries;
1420     Error *err = NULL;
1421 
1422     ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS,
1423                                      sizeof(ctrl), &ctrl);
1424     if (ret != sizeof(ctrl)) {
1425         error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s",
1426                    strreaderror(ret));
1427         return false;
1428     }
1429     ctrl = le16_to_cpu(ctrl);
1430 
1431     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1432     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1433     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1434 
1435     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1436 
1437     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
1438     if (ret < 0) {
1439         if (ret == -ENOTSUP) {
1440             return true;
1441         }
1442         error_propagate_prepend(errp, err, "msi_init failed: ");
1443         return false;
1444     }
1445     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
1446 
1447     return true;
1448 }
1449 
1450 static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
1451 {
1452     off_t start, end;
1453     VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
1454 
1455     /*
1456      * If the host driver allows mapping of a MSIX data, we are going to
1457      * do map the entire BAR and emulate MSIX table on top of that.
1458      */
1459     if (vfio_device_has_region_cap(&vdev->vbasedev, region->nr,
1460                                    VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
1461         return;
1462     }
1463 
1464     /*
1465      * We expect to find a single mmap covering the whole BAR, anything else
1466      * means it's either unsupported or already setup.
1467      */
1468     if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
1469         region->size != region->mmaps[0].size) {
1470         return;
1471     }
1472 
1473     /* MSI-X table start and end aligned to host page size */
1474     start = vdev->msix->table_offset & qemu_real_host_page_mask();
1475     end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
1476                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
1477 
1478     /*
1479      * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
1480      * NB - Host page size is necessarily a power of two and so is the PCI
1481      * BAR (not counting EA yet), therefore if we have host page aligned
1482      * @start and @end, then any remainder of the BAR before or after those
1483      * must be at least host page sized and therefore mmap'able.
1484      */
1485     if (!start) {
1486         if (end >= region->size) {
1487             region->nr_mmaps = 0;
1488             g_free(region->mmaps);
1489             region->mmaps = NULL;
1490             trace_vfio_msix_fixup(vdev->vbasedev.name,
1491                                   vdev->msix->table_bar, 0, 0);
1492         } else {
1493             region->mmaps[0].offset = end;
1494             region->mmaps[0].size = region->size - end;
1495             trace_vfio_msix_fixup(vdev->vbasedev.name,
1496                               vdev->msix->table_bar, region->mmaps[0].offset,
1497                               region->mmaps[0].offset + region->mmaps[0].size);
1498         }
1499 
1500     /* Maybe it's aligned at the end of the BAR */
1501     } else if (end >= region->size) {
1502         region->mmaps[0].size = start;
1503         trace_vfio_msix_fixup(vdev->vbasedev.name,
1504                               vdev->msix->table_bar, region->mmaps[0].offset,
1505                               region->mmaps[0].offset + region->mmaps[0].size);
1506 
1507     /* Otherwise it must split the BAR */
1508     } else {
1509         region->nr_mmaps = 2;
1510         region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
1511 
1512         memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));
1513 
1514         region->mmaps[0].size = start;
1515         trace_vfio_msix_fixup(vdev->vbasedev.name,
1516                               vdev->msix->table_bar, region->mmaps[0].offset,
1517                               region->mmaps[0].offset + region->mmaps[0].size);
1518 
1519         region->mmaps[1].offset = end;
1520         region->mmaps[1].size = region->size - end;
1521         trace_vfio_msix_fixup(vdev->vbasedev.name,
1522                               vdev->msix->table_bar, region->mmaps[1].offset,
1523                               region->mmaps[1].offset + region->mmaps[1].size);
1524     }
1525 }
1526 
1527 static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
1528 {
1529     int target_bar = -1;
1530     size_t msix_sz;
1531 
1532     if (!vdev->msix || vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1533         return true;
1534     }
1535 
1536     /* The actual minimum size of MSI-X structures */
1537     msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
1538               (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
1539     /* Round up to host pages, we don't want to share a page */
1540     msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
1541     /* PCI BARs must be a power of 2 */
1542     msix_sz = pow2ceil(msix_sz);
1543 
1544     if (vdev->msix_relo == OFF_AUTO_PCIBAR_AUTO) {
1545         /*
1546          * TODO: Lookup table for known devices.
1547          *
1548          * Logically we might use an algorithm here to select the BAR adding
1549          * the least additional MMIO space, but we cannot programmatically
1550          * predict the driver dependency on BAR ordering or sizing, therefore
1551          * 'auto' becomes a lookup for combinations reported to work.
1552          */
1553         if (target_bar < 0) {
1554             error_setg(errp, "No automatic MSI-X relocation available for "
1555                        "device %04x:%04x", vdev->vendor_id, vdev->device_id);
1556             return false;
1557         }
1558     } else {
1559         target_bar = (int)(vdev->msix_relo - OFF_AUTO_PCIBAR_BAR0);
1560     }
1561 
1562     /* I/O port BARs cannot host MSI-X structures */
1563     if (vdev->bars[target_bar].ioport) {
1564         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1565                    "I/O port BAR", target_bar);
1566         return false;
1567     }
1568 
1569     /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
1570     if (!vdev->bars[target_bar].size &&
1571          target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
1572         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1573                    "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
1574         return false;
1575     }
1576 
1577     /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
1578     if (vdev->bars[target_bar].size > 1 * GiB &&
1579         !vdev->bars[target_bar].mem64) {
1580         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1581                    "no space to extend 32-bit BAR", target_bar);
1582         return false;
1583     }
1584 
1585     /*
1586      * If adding a new BAR, test if we can make it 64bit.  We make it
1587      * prefetchable since QEMU MSI-X emulation has no read side effects
1588      * and doing so makes mapping more flexible.
1589      */
1590     if (!vdev->bars[target_bar].size) {
1591         if (target_bar < (PCI_ROM_SLOT - 1) &&
1592             !vdev->bars[target_bar + 1].size) {
1593             vdev->bars[target_bar].mem64 = true;
1594             vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
1595         }
1596         vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
1597         vdev->bars[target_bar].size = msix_sz;
1598         vdev->msix->table_offset = 0;
1599     } else {
1600         vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
1601                                           msix_sz * 2);
1602         /*
1603          * Due to above size calc, MSI-X always starts halfway into the BAR,
1604          * which will always be a separate host page.
1605          */
1606         vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
1607     }
1608 
1609     vdev->msix->table_bar = target_bar;
1610     vdev->msix->pba_bar = target_bar;
1611     /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
1612     vdev->msix->pba_offset = vdev->msix->table_offset +
1613                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
1614 
1615     trace_vfio_msix_relo(vdev->vbasedev.name,
1616                          vdev->msix->table_bar, vdev->msix->table_offset);
1617     return true;
1618 }
1619 
1620 /*
1621  * We don't have any control over how pci_add_capability() inserts
1622  * capabilities into the chain.  In order to setup MSI-X we need a
1623  * MemoryRegion for the BAR.  In order to setup the BAR and not
1624  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1625  * need to first look for where the MSI-X table lives.  So we
1626  * unfortunately split MSI-X setup across two functions.
1627  */
1628 static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
1629 {
1630     uint8_t pos;
1631     uint16_t ctrl;
1632     uint32_t table, pba;
1633     struct vfio_irq_info irq_info;
1634     VFIOMSIXInfo *msix;
1635     int ret;
1636 
1637     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
1638     if (!pos) {
1639         return true;
1640     }
1641 
1642     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS,
1643                                      sizeof(ctrl), &ctrl);
1644     if (ret != sizeof(ctrl)) {
1645         error_setg(errp, "failed to read PCI MSIX FLAGS: %s",
1646                    strreaderror(ret));
1647         return false;
1648     }
1649 
1650     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE,
1651                                      sizeof(table), &table);
1652     if (ret != sizeof(table)) {
1653         error_setg(errp, "failed to read PCI MSIX TABLE: %s",
1654                    strreaderror(ret));
1655         return false;
1656     }
1657 
1658     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA,
1659                                      sizeof(pba), &pba);
1660     if (ret != sizeof(pba)) {
1661         error_setg(errp, "failed to read PCI MSIX PBA: %s", strreaderror(ret));
1662         return false;
1663     }
1664 
1665     ctrl = le16_to_cpu(ctrl);
1666     table = le32_to_cpu(table);
1667     pba = le32_to_cpu(pba);
1668 
1669     msix = g_malloc0(sizeof(*msix));
1670     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
1671     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
1672     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
1673     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
1674     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
1675 
1676     ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
1677                                    &irq_info);
1678     if (ret < 0) {
1679         error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
1680         g_free(msix);
1681         return false;
1682     }
1683 
1684     msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
1685 
1686     /*
1687      * Test the size of the pba_offset variable and catch if it extends outside
1688      * of the specified BAR. If it is the case, we need to apply a hardware
1689      * specific quirk if the device is known or we have a broken configuration.
1690      */
1691     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
1692         /*
1693          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
1694          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
1695          * the VF PBA offset while the BAR itself is only 8k. The correct value
1696          * is 0x1000, so we hard code that here.
1697          */
1698         if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
1699             (vdev->device_id & 0xff00) == 0x5800) {
1700             msix->pba_offset = 0x1000;
1701         /*
1702          * BAIDU KUNLUN Virtual Function devices for KUNLUN AI processor
1703          * return an incorrect value of 0x460000 for the VF PBA offset while
1704          * the BAR itself is only 0x10000.  The correct value is 0xb400.
1705          */
1706         } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU,
1707                                PCI_DEVICE_ID_KUNLUN_VF)) {
1708             msix->pba_offset = 0xb400;
1709         } else if (vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1710             error_setg(errp, "hardware reports invalid configuration, "
1711                        "MSIX PBA outside of specified BAR");
1712             g_free(msix);
1713             return false;
1714         }
1715     }
1716 
1717     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
1718                                 msix->table_offset, msix->entries,
1719                                 msix->noresize);
1720     vdev->msix = msix;
1721 
1722     vfio_pci_fixup_msix_region(vdev);
1723 
1724     return vfio_pci_relocate_msix(vdev, errp);
1725 }
1726 
1727 static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1728 {
1729     int ret;
1730     Error *err = NULL;
1731 
1732     vdev->msix->pending = g_new0(unsigned long,
1733                                  BITS_TO_LONGS(vdev->msix->entries));
1734     ret = msix_init(&vdev->pdev, vdev->msix->entries,
1735                     vdev->bars[vdev->msix->table_bar].mr,
1736                     vdev->msix->table_bar, vdev->msix->table_offset,
1737                     vdev->bars[vdev->msix->pba_bar].mr,
1738                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
1739                     &err);
1740     if (ret < 0) {
1741         if (ret == -ENOTSUP) {
1742             warn_report_err(err);
1743             return true;
1744         }
1745 
1746         error_propagate(errp, err);
1747         return false;
1748     }
1749 
1750     /*
1751      * The PCI spec suggests that devices provide additional alignment for
1752      * MSI-X structures and avoid overlapping non-MSI-X related registers.
1753      * For an assigned device, this hopefully means that emulation of MSI-X
1754      * structures does not affect the performance of the device.  If devices
1755      * fail to provide that alignment, a significant performance penalty may
1756      * result, for instance Mellanox MT27500 VFs:
1757      * http://www.spinics.net/lists/kvm/msg125881.html
1758      *
1759      * The PBA is simply not that important for such a serious regression and
1760      * most drivers do not appear to look at it.  The solution for this is to
1761      * disable the PBA MemoryRegion unless it's being used.  We disable it
1762      * here and only enable it if a masked vector fires through QEMU.  As the
1763      * vector-use notifier is called, which occurs on unmask, we test whether
1764      * PBA emulation is needed and again disable if not.
1765      */
1766     memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
1767 
1768     /*
1769      * The emulated machine may provide a paravirt interface for MSIX setup
1770      * so it is not strictly necessary to emulate MSIX here. This becomes
1771      * helpful when frequently accessed MMIO registers are located in
1772      * subpages adjacent to the MSIX table but the MSIX data containing page
1773      * cannot be mapped because of a host page size bigger than the MSIX table
1774      * alignment.
1775      */
1776     if (object_property_get_bool(OBJECT(qdev_get_machine()),
1777                                  "vfio-no-msix-emulation", NULL)) {
1778         memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
1779     }
1780 
1781     return true;
1782 }
1783 
1784 void vfio_pci_teardown_msi(VFIOPCIDevice *vdev)
1785 {
1786     msi_uninit(&vdev->pdev);
1787 
1788     if (vdev->msix) {
1789         msix_uninit(&vdev->pdev,
1790                     vdev->bars[vdev->msix->table_bar].mr,
1791                     vdev->bars[vdev->msix->pba_bar].mr);
1792         g_free(vdev->msix->pending);
1793     }
1794 }
1795 
1796 /*
1797  * Resource setup
1798  */
1799 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
1800 {
1801     int i;
1802 
1803     for (i = 0; i < PCI_ROM_SLOT; i++) {
1804         vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
1805     }
1806 }
1807 
1808 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
1809 {
1810     VFIOBAR *bar = &vdev->bars[nr];
1811 
1812     uint32_t pci_bar;
1813     int ret;
1814 
1815     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
1816     if (!bar->region.size) {
1817         return;
1818     }
1819 
1820     /* Determine what type of BAR this is for registration */
1821     ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr),
1822                                      sizeof(pci_bar), &pci_bar);
1823     if (ret != sizeof(pci_bar)) {
1824         error_report("vfio: Failed to read BAR %d: %s", nr, strreaderror(ret));
1825         return;
1826     }
1827 
1828     pci_bar = le32_to_cpu(pci_bar);
1829     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
1830     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
1831     bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
1832                                          ~PCI_BASE_ADDRESS_MEM_MASK);
1833     bar->size = bar->region.size;
1834 
1835     /* IO regions are sync, memory can be async */
1836     bar->region.post_wr = (bar->ioport == 0);
1837 }
1838 
1839 static void vfio_bars_prepare(VFIOPCIDevice *vdev)
1840 {
1841     int i;
1842 
1843     for (i = 0; i < PCI_ROM_SLOT; i++) {
1844         vfio_bar_prepare(vdev, i);
1845     }
1846 }
1847 
1848 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
1849 {
1850     VFIOBAR *bar = &vdev->bars[nr];
1851     char *name;
1852 
1853     if (!bar->size) {
1854         return;
1855     }
1856 
1857     bar->mr = g_new0(MemoryRegion, 1);
1858     name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
1859     memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
1860     g_free(name);
1861 
1862     if (bar->region.size) {
1863         memory_region_add_subregion(bar->mr, 0, bar->region.mem);
1864 
1865         if (vfio_region_mmap(&bar->region)) {
1866             error_report("Failed to mmap %s BAR %d. Performance may be slow",
1867                          vdev->vbasedev.name, nr);
1868         }
1869     }
1870 
1871     pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
1872 }
1873 
1874 static void vfio_bars_register(VFIOPCIDevice *vdev)
1875 {
1876     int i;
1877 
1878     for (i = 0; i < PCI_ROM_SLOT; i++) {
1879         vfio_bar_register(vdev, i);
1880     }
1881 }
1882 
1883 void vfio_pci_bars_exit(VFIOPCIDevice *vdev)
1884 {
1885     int i;
1886 
1887     for (i = 0; i < PCI_ROM_SLOT; i++) {
1888         VFIOBAR *bar = &vdev->bars[i];
1889 
1890         vfio_bar_quirk_exit(vdev, i);
1891         vfio_region_exit(&bar->region);
1892         if (bar->region.size) {
1893             memory_region_del_subregion(bar->mr, bar->region.mem);
1894         }
1895     }
1896 
1897     if (vdev->vga) {
1898         pci_unregister_vga(&vdev->pdev);
1899         vfio_vga_quirk_exit(vdev);
1900     }
1901 }
1902 
1903 static void vfio_bars_finalize(VFIOPCIDevice *vdev)
1904 {
1905     int i;
1906 
1907     for (i = 0; i < PCI_ROM_SLOT; i++) {
1908         VFIOBAR *bar = &vdev->bars[i];
1909 
1910         vfio_bar_quirk_finalize(vdev, i);
1911         vfio_region_finalize(&bar->region);
1912         if (bar->mr) {
1913             assert(bar->size);
1914             object_unparent(OBJECT(bar->mr));
1915             g_free(bar->mr);
1916             bar->mr = NULL;
1917         }
1918     }
1919 
1920     if (vdev->vga) {
1921         vfio_vga_quirk_finalize(vdev);
1922         for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1923             object_unparent(OBJECT(&vdev->vga->region[i].mem));
1924         }
1925         g_free(vdev->vga);
1926     }
1927 }
1928 
1929 /*
1930  * General setup
1931  */
1932 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
1933 {
1934     uint8_t tmp;
1935     uint16_t next = PCI_CONFIG_SPACE_SIZE;
1936 
1937     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
1938          tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
1939         if (tmp > pos && tmp < next) {
1940             next = tmp;
1941         }
1942     }
1943 
1944     return next - pos;
1945 }
1946 
1947 
1948 static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
1949 {
1950     uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
1951 
1952     for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
1953         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
1954         if (tmp > pos && tmp < next) {
1955             next = tmp;
1956         }
1957     }
1958 
1959     return next - pos;
1960 }
1961 
/* Clear the @mask bits of the 16-bit word at @buf, then OR in @val. */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t cur = pci_get_word(buf);

    pci_set_word(buf, (cur & ~mask) | val);
}
1966 
1967 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
1968                                    uint16_t val, uint16_t mask)
1969 {
1970     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
1971     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
1972     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
1973 }
1974 
/* Clear the @mask bits of the 32-bit dword at @buf, then OR in @val. */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t cur = pci_get_long(buf);

    pci_set_long(buf, (cur & ~mask) | val);
}
1979 
1980 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
1981                                    uint32_t val, uint32_t mask)
1982 {
1983     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
1984     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
1985     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
1986 }
1987 
1988 static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev)
1989 {
1990     struct vfio_device_info_cap_pci_atomic_comp *cap;
1991     g_autofree struct vfio_device_info *info = NULL;
1992     PCIBus *bus = pci_get_bus(&vdev->pdev);
1993     PCIDevice *parent = bus->parent_dev;
1994     struct vfio_info_cap_header *hdr;
1995     uint32_t mask = 0;
1996     uint8_t *pos;
1997 
1998     /*
1999      * PCIe Atomic Ops completer support is only added automatically for single
2000      * function devices downstream of a root port supporting DEVCAP2.  Support
2001      * is added during realize and, if added, removed during device exit.  The
2002      * single function requirement avoids conflicting requirements should a
2003      * slot be composed of multiple devices with differing capabilities.
2004      */
2005     if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap ||
2006         pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT ||
2007         pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2 ||
2008         vdev->pdev.devfn ||
2009         vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
2010         return;
2011     }
2012 
2013     pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
2014 
2015     /* Abort if there'a already an Atomic Ops configuration on the root port */
2016     if (pci_get_long(pos) & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2017                              PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
2018                              PCI_EXP_DEVCAP2_ATOMIC_COMP128)) {
2019         return;
2020     }
2021 
2022     info = vfio_get_device_info(vdev->vbasedev.fd);
2023     if (!info) {
2024         return;
2025     }
2026 
2027     hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP);
2028     if (!hdr) {
2029         return;
2030     }
2031 
2032     cap = (void *)hdr;
2033     if (cap->flags & VFIO_PCI_ATOMIC_COMP32) {
2034         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP32;
2035     }
2036     if (cap->flags & VFIO_PCI_ATOMIC_COMP64) {
2037         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP64;
2038     }
2039     if (cap->flags & VFIO_PCI_ATOMIC_COMP128) {
2040         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP128;
2041     }
2042 
2043     if (!mask) {
2044         return;
2045     }
2046 
2047     pci_long_test_and_set_mask(pos, mask);
2048     vdev->clear_parent_atomics_on_exit = true;
2049 }
2050 
2051 static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev)
2052 {
2053     if (vdev->clear_parent_atomics_on_exit) {
2054         PCIDevice *parent = pci_get_bus(&vdev->pdev)->parent_dev;
2055         uint8_t *pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
2056 
2057         pci_long_test_and_clear_mask(pos, PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2058                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
2059                                           PCI_EXP_DEVCAP2_ATOMIC_COMP128);
2060     }
2061 }
2062 
/*
 * Expose (or deliberately hide) the PCI Express capability found at @pos,
 * adapting it to where the device sits in the guest topology.
 *
 * @vdev: the assigned device
 * @pos:  config space offset of the PCIe capability
 * @size: maximum size available for the capability
 * @errp: set when the device type is unsupported or the capability cannot
 *        be added
 *
 * Returns true when the capability was added, and also when it was
 * intentionally skipped (express device on a conventional bus below an
 * express root, or a legacy endpoint on a root complex bus).  Returns false
 * with @errp set on failure.
 */
static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
                                Error **errp)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    /* Only endpoint flavours can be assigned */
    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {

        error_setg(errp, "assignment of PCIe type 0x%x "
                   "devices is not currently supported", type);
        return false;
    }

    if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
        PCIBus *bus = pci_get_bus(&vdev->pdev);
        PCIDevice *bridge;

        /*
         * Traditionally PCI device assignment exposes the PCIe capability
         * as-is on non-express buses.  The reason being that some drivers
         * simply assume that it's there, for example tg3.  However when
         * we're running on a native PCIe machine type, like Q35, we need
         * to hide the PCIe capability.  The reason for this is twofold;
         * first Windows guests get a Code 10 error when the PCIe capability
         * is exposed in this configuration.  Therefore express devices won't
         * work at all unless they're attached to express buses in the VM.
         * Second, a native PCIe machine introduces the possibility of fine
         * granularity IOMMUs supporting both translation and isolation.
         * Guest code to discover the IOMMU visibility of a device, such as
         * IOMMU grouping code on Linux, is very aware of device types and
         * valid transitions between bus types.  An express device on a non-
         * express bus is not a valid combination on bare metal systems.
         *
         * Drivers that require a PCIe capability to make the device
         * functional are simply going to need to have their devices placed
         * on a PCIe bus in the VM.
         */
        /* Walk up through the bridges until the root bus is reached */
        while (!pci_bus_is_root(bus)) {
            bridge = pci_bridge_get_device(bus);
            bus = pci_get_bus(bridge);
        }

        /* Express root above a conventional bus: hide the capability */
        if (pci_bus_is_express(bus)) {
            return true;
        }

    } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control goes away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

/* Fallback offsets for headers that do not define these registers */
#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control goes away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return true;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                           QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
                           QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }

        /* Express, non-root bus: see whether Atomic Ops can be advertised */
        vfio_pci_enable_rp_atomics(vdev);
    }

    /*
     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
     * (Niantic errata #35) causing Windows to error with a Code 10 for the
     * device on Q35.  Fixup any such devices to report version 1.  If we
     * were to remove the capability entirely the guest would lose extended
     * config space.
     */
    if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
        vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                               1, PCI_EXP_FLAGS_VERS);
    }

    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
                             errp);
    if (pos < 0) {
        return false;
    }

    /* Record where the PCI core can find the express capability */
    vdev->pdev.exp.exp_cap = pos;

    return true;
}
2192 
2193 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
2194 {
2195     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2196 
2197     if (cap & PCI_EXP_DEVCAP_FLR) {
2198         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
2199         vdev->has_flr = true;
2200     }
2201 }
2202 
2203 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
2204 {
2205     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2206 
2207     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2208         trace_vfio_check_pm_reset(vdev->vbasedev.name);
2209         vdev->has_pm_reset = true;
2210     }
2211 }
2212 
2213 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
2214 {
2215     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2216 
2217     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2218         trace_vfio_check_af_flr(vdev->vbasedev.name);
2219         vdev->has_flr = true;
2220     }
2221 }
2222 
2223 static bool vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos,
2224                                          uint8_t size, Error **errp)
2225 {
2226     PCIDevice *pdev = &vdev->pdev;
2227 
2228     pos = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, size, errp);
2229     if (pos < 0) {
2230         return false;
2231     }
2232 
2233     /*
2234      * Exempt config space check for Vendor Specific Information during
2235      * restore/load.
2236      * Config space check is still enforced for 3 byte VSC header.
2237      */
2238     if (vdev->skip_vsc_check && size > 3) {
2239         memset(pdev->cmask + pos + 3, 0, size - 3);
2240     }
2241 
2242     return true;
2243 }
2244 
/*
 * Rebuild the standard capability chain starting at @pos into QEMU's
 * emulated config space.
 *
 * The function recurses to the end of the physical chain first and adds
 * capabilities while unwinding, so the resulting emulated chain has the
 * same order as the device's.  Each capability ID is dispatched to its
 * dedicated setup helper; unrecognized IDs are added verbatim.
 *
 * Returns true on success.  On failure returns false with @errp set and
 * prefixed with the failing capability's ID, size and position.
 */
static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
{
    ERRP_GUARD();
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    bool ret;

    /* Snapshot ID and next pointer before the chain gets rewritten */
    cap_id = pdev->config[pos];
    next = pdev->config[pos + PCI_CAP_LIST_NEXT];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't recognize.
     * Since QEMU doesn't actually handle many of the config accesses,
     * exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above as cached config space
     * will be changed as we unwind the stack.
     */
    if (next) {
        if (!vfio_add_std_cap(vdev, next, errp)) {
            return false;
        }
    } else {
        /* Begin the rebuild, use QEMU emulated list bits */
        pdev->config[PCI_CAPABILITY_LIST] = 0;
        vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
        vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;

        if (!vfio_add_virt_caps(vdev, errp)) {
            return false;
        }
    }

    /* Scale down size, esp in case virt caps were added above */
    size = MIN(size, vfio_std_cap_max_size(pdev, pos));

    /* Use emulated next pointer to allow dropping caps */
    pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);

    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_msi_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_EXP:
        vfio_check_pcie_flr(vdev, pos);
        ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_msix_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_PM:
        vfio_check_pm_reset(vdev, pos);
        ret = pci_pm_init(pdev, pos, errp) >= 0;
        /*
         * PCI-core config space emulation needs write access to the power
         * state enabled for tracking BAR mapping relative to PM state.
         */
        pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
        break;
    case PCI_CAP_ID_AF:
        vfio_check_af_flr(vdev, pos);
        ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
        break;
    case PCI_CAP_ID_VNDR:
        ret = vfio_add_vendor_specific_cap(vdev, pos, size, errp);
        break;
    default:
        /* Anything we don't recognize is exposed unmodified */
        ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
        break;
    }

    if (!ret) {
        error_prepend(errp,
                      "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
                      cap_id, size, pos);
    }

    return ret;
}
2331 
2332 static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
2333 {
2334     uint32_t ctrl;
2335     int i, nbar;
2336 
2337     ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL);
2338     nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT;
2339 
2340     for (i = 0; i < nbar; i++) {
2341         uint32_t cap;
2342         int size;
2343 
2344         ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8));
2345         size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT;
2346 
2347         /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */
2348         cap = size <= 27 ? 1U << (size + 4) : 0;
2349 
2350         /*
2351          * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one
2352          * size in the range 1MB to 512GB.  We intend to mask all sizes except
2353          * the one currently enabled in the size field, therefore if it's
2354          * outside the range, hide the whole capability as this virtualization
2355          * trick won't work.  If >512GB resizable BARs start to appear, we
2356          * might need an opt-in or reservation scheme in the kernel.
2357          */
2358         if (!(cap & PCI_REBAR_CAP_SIZES)) {
2359             return -EINVAL;
2360         }
2361 
2362         /* Hide all sizes reported in the ctrl reg per above requirement. */
2363         ctrl &= (PCI_REBAR_CTRL_BAR_SIZE |
2364                  PCI_REBAR_CTRL_NBAR_MASK |
2365                  PCI_REBAR_CTRL_BAR_IDX);
2366 
2367         /*
2368          * The BAR size field is RW, however we've mangled the capability
2369          * register such that we only report a single size, ie. the current
2370          * BAR size.  A write of an unsupported value is undefined, therefore
2371          * the register field is essentially RO.
2372          */
2373         vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0);
2374         vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0);
2375     }
2376 
2377     return 0;
2378 }
2379 
/*
 * Rebuild the PCIe extended capability chain in QEMU's emulated config
 * space from a snapshot of the device's config space.
 *
 * Nothing is done unless the device is express, sits on an express bus and
 * has a non-zero first extended capability dword.  Capabilities with ID 0,
 * SR-IOV and ARI are dropped; Resizable BAR is only added when
 * vfio_setup_rebar_ecap() succeeds; everything else is added as found.
 */
static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint32_t header;
    uint16_t cap_id, next, size;
    uint8_t cap_ver;
    uint8_t *config;

    /* Only add extended caps if we have them and the guest can see them */
    if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
        return;
    }

    /*
     * pcie_add_capability always inserts the new capability at the tail
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we cache the config space to avoid overwriting
     * the original config space when we parse the extended capabilities.
     */
    config = g_memdup(pdev->config, vdev->config_size);

    /*
     * Extended capabilities are chained with each pointing to the next, so we
     * can drop anything other than the head of the chain simply by modifying
     * the previous next pointer.  Seed the head of the chain here such that
     * we can simply skip any capabilities we want to drop below, regardless
     * of their position in the chain.  If this stub capability still exists
     * after we add the capabilities we want to expose, update the capability
     * ID to zero.  Note that we cannot seed with the capability header being
     * zero as this conflicts with definition of an absent capability chain
     * and prevents capabilities beyond the head of the list from being added.
     * By replacing the dummy capability ID with zero after walking the device
     * chain, we also transparently mark extended capabilities as absent if
     * no capabilities were added.  Note that the PCIe spec defines an absence
     * of extended capabilities to be determined by a value of zero for the
     * capability ID, version, AND next pointer.  A non-zero next pointer
     * should be sufficient to indicate additional capabilities are present,
     * which will occur if we call pcie_add_capability() below.  The entire
     * first dword is emulated to support this.
     *
     * NB. The kernel side does similar masking, so be prepared that our
     * view of the device may also contain a capability ID zero in the head
     * of the chain.  Skip it for the same reason that we cannot seed the
     * chain with a zero capability.
     */
    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
                 PCI_EXT_CAP(0xFFFF, 0, 0));
    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);

    /* Walk the chain in the snapshot, not in the config being rewritten */
    for (next = PCI_CONFIG_SPACE_SIZE; next;
         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
        header = pci_get_long(config + next);
        cap_id = PCI_EXT_CAP_ID(header);
        cap_ver = PCI_EXT_CAP_VER(header);

        /*
         * If it becomes important to configure extended capabilities to their
         * actual size, use this as the default when it's something we don't
         * recognize. Since QEMU doesn't actually handle many of the config
         * accesses, exact size doesn't seem worthwhile.
         */
        size = vfio_ext_cap_max_size(config, next);

        /* Use emulated next pointer to allow dropping extended caps */
        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
                                   PCI_EXT_CAP_NEXT_MASK);

        switch (cap_id) {
        case 0: /* kernel masked capability */
        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
        case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
            break;
        case PCI_EXT_CAP_ID_REBAR:
            /* Only expose ReBAR if it could be virtualized (returns 0) */
            if (!vfio_setup_rebar_ecap(vdev, next)) {
                pcie_add_capability(pdev, cap_id, cap_ver, next, size);
            }
            break;
        default:
            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
        }

    }

    /* Cleanup chain head ID if necessary */
    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
    }

    g_free(config);
}
2473 
2474 bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
2475 {
2476     PCIDevice *pdev = &vdev->pdev;
2477 
2478     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2479         !pdev->config[PCI_CAPABILITY_LIST]) {
2480         return true; /* Nothing to add */
2481     }
2482 
2483     if (!vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp)) {
2484         return false;
2485     }
2486 
2487     vfio_add_ext_cap(vdev);
2488     return true;
2489 }
2490 
2491 void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2492 {
2493     PCIDevice *pdev = &vdev->pdev;
2494     uint16_t cmd;
2495 
2496     vfio_disable_interrupts(vdev);
2497 
2498     /*
2499      * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2500      * Also put INTx Disable in known state.
2501      */
2502     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2503     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2504              PCI_COMMAND_INTX_DISABLE);
2505     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2506 
2507     /* Make sure the device is in D0 */
2508     if (pdev->pm_cap) {
2509         uint16_t pmcsr;
2510         uint8_t state;
2511 
2512         pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
2513         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2514         if (state) {
2515             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2516             vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2517             /* vfio handles the necessary delay here */
2518             pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
2519             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2520             if (state) {
2521                 error_report("vfio: Unable to power on device, stuck in D%d",
2522                              state);
2523             }
2524         }
2525     }
2526 }
2527 
2528 void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2529 {
2530     VFIODevice *vbasedev = &vdev->vbasedev;
2531     Error *err = NULL;
2532     int ret, nr;
2533 
2534     if (!vfio_intx_enable(vdev, &err)) {
2535         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2536     }
2537 
2538     for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
2539         off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
2540         uint32_t val = 0;
2541         uint32_t len = sizeof(val);
2542 
2543         ret = vfio_pci_config_space_write(vdev, addr, len, &val);
2544         if (ret != len) {
2545             error_report("%s(%s) reset bar %d failed: %s", __func__,
2546                          vbasedev->name, nr, strwriteerror(ret));
2547         }
2548     }
2549 
2550     vfio_quirk_reset(vdev);
2551 }
2552 
2553 bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
2554 {
2555     char tmp[13];
2556 
2557     sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
2558             addr->bus, addr->slot, addr->function);
2559 
2560     return (strcmp(tmp, name) == 0);
2561 }
2562 
2563 int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
2564                                     struct vfio_pci_hot_reset_info **info_p)
2565 {
2566     struct vfio_pci_hot_reset_info *info;
2567     int ret, count;
2568 
2569     assert(info_p && !*info_p);
2570 
2571     info = g_malloc0(sizeof(*info));
2572     info->argsz = sizeof(*info);
2573 
2574     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2575     if (ret && errno != ENOSPC) {
2576         ret = -errno;
2577         g_free(info);
2578         if (!vdev->has_pm_reset) {
2579             error_report("vfio: Cannot reset device %s, "
2580                          "no available reset mechanism.", vdev->vbasedev.name);
2581         }
2582         return ret;
2583     }
2584 
2585     count = info->count;
2586     info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0])));
2587     info->argsz = sizeof(*info) + (count * sizeof(info->devices[0]));
2588 
2589     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2590     if (ret) {
2591         ret = -errno;
2592         g_free(info);
2593         error_report("vfio: hot reset info failed: %m");
2594         return ret;
2595     }
2596 
2597     *info_p = info;
2598     return 0;
2599 }
2600 
2601 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2602 {
2603     VFIODevice *vbasedev = &vdev->vbasedev;
2604     const VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer);
2605 
2606     return vioc->pci_hot_reset(vbasedev, single);
2607 }
2608 
2609 /*
2610  * We want to differentiate hot reset of multiple in-use devices vs hot reset
2611  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2612  * of doing hot resets when there is only a single device per bus.  The in-use
2613  * here refers to how many VFIODevices are affected.  A hot reset that affects
2614  * multiple devices, but only a single in-use device, means that we can call
2615  * it from our bus ->reset() callback since the extent is effectively a single
2616  * device.  This allows us to make use of it in the hotplug path.  When there
2617  * are multiple in-use devices, we can only trigger the hot reset during a
2618  * system reset and thus from our reset handler.  We separate _one vs _multi
2619  * here so that we don't overlap and do a double reset on the system reset
2620  * path where both our reset handler and ->reset() callback are used.  Calling
 * _one() will only do a hot reset for the single in-use device case, calling
2622  * _multi() will do nothing if a _one() would have been sufficient.
2623  */
2624 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2625 {
2626     return vfio_pci_hot_reset(vdev, true);
2627 }
2628 
2629 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2630 {
2631     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2632     return vfio_pci_hot_reset(vdev, false);
2633 }
2634 
2635 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2636 {
2637     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2638     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2639         vbasedev->needs_reset = true;
2640     }
2641 }
2642 
2643 static Object *vfio_pci_get_object(VFIODevice *vbasedev)
2644 {
2645     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2646 
2647     return OBJECT(vdev);
2648 }
2649 
2650 static bool vfio_msix_present(void *opaque, int version_id)
2651 {
2652     PCIDevice *pdev = opaque;
2653 
2654     return msix_present(pdev);
2655 }
2656 
2657 static bool vfio_display_migration_needed(void *opaque)
2658 {
2659     VFIOPCIDevice *vdev = opaque;
2660 
2661     /*
2662      * We need to migrate the VFIODisplay object if ramfb *migration* was
2663      * explicitly requested (in which case we enforced both ramfb=on and
2664      * display=on), or ramfb migration was left at the default "auto"
2665      * setting, and *ramfb* was explicitly requested (in which case we
2666      * enforced display=on).
2667      */
2668     return vdev->ramfb_migrate == ON_OFF_AUTO_ON ||
2669         (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb);
2670 }
2671 
2672 static const VMStateDescription vmstate_vfio_display = {
2673     .name = "VFIOPCIDevice/VFIODisplay",
2674     .version_id = 1,
2675     .minimum_version_id = 1,
2676     .needed = vfio_display_migration_needed,
2677     .fields = (const VMStateField[]){
2678         VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate,
2679                                VFIODisplay),
2680         VMSTATE_END_OF_LIST()
2681     }
2682 };
2683 
2684 static const VMStateDescription vmstate_vfio_pci_config = {
2685     .name = "VFIOPCIDevice",
2686     .version_id = 1,
2687     .minimum_version_id = 1,
2688     .fields = (const VMStateField[]) {
2689         VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
2690         VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
2691         VMSTATE_END_OF_LIST()
2692     },
2693     .subsections = (const VMStateDescription * const []) {
2694         &vmstate_vfio_display,
2695         NULL
2696     }
2697 };
2698 
2699 static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp)
2700 {
2701     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2702 
2703     return vmstate_save_state_with_err(f, &vmstate_vfio_pci_config, vdev, NULL,
2704                                        errp);
2705 }
2706 
2707 static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
2708 {
2709     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2710     PCIDevice *pdev = &vdev->pdev;
2711     pcibus_t old_addr[PCI_NUM_REGIONS - 1];
2712     int bar, ret;
2713 
2714     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2715         old_addr[bar] = pdev->io_regions[bar].addr;
2716     }
2717 
2718     ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
2719     if (ret) {
2720         return ret;
2721     }
2722 
2723     vfio_pci_write_config(pdev, PCI_COMMAND,
2724                           pci_get_word(pdev->config + PCI_COMMAND), 2);
2725 
2726     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2727         /*
2728          * The address may not be changed in some scenarios
2729          * (e.g. the VF driver isn't loaded in VM).
2730          */
2731         if (old_addr[bar] != pdev->io_regions[bar].addr &&
2732             vdev->bars[bar].region.size > 0 &&
2733             vdev->bars[bar].region.size < qemu_real_host_page_size()) {
2734             vfio_sub_page_bar_update_mapping(pdev, bar);
2735         }
2736     }
2737 
2738     if (msi_enabled(pdev)) {
2739         vfio_msi_enable(vdev);
2740     } else if (msix_enabled(pdev)) {
2741         vfio_msix_enable(vdev);
2742     }
2743 
2744     return ret;
2745 }
2746 
2747 static VFIODeviceOps vfio_pci_ops = {
2748     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
2749     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
2750     .vfio_eoi = vfio_pci_intx_eoi,
2751     .vfio_get_object = vfio_pci_get_object,
2752     .vfio_save_config = vfio_pci_save_config,
2753     .vfio_load_config = vfio_pci_load_config,
2754 };
2755 
2756 bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
2757 {
2758     VFIODevice *vbasedev = &vdev->vbasedev;
2759     struct vfio_region_info *reg_info = NULL;
2760     int ret;
2761 
2762     ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
2763     if (ret) {
2764         error_setg_errno(errp, -ret,
2765                          "failed getting region info for VGA region index %d",
2766                          VFIO_PCI_VGA_REGION_INDEX);
2767         return false;
2768     }
2769 
2770     if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
2771         !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2772         reg_info->size < 0xbffff + 1) {
2773         error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
2774                    (unsigned long)reg_info->flags,
2775                    (unsigned long)reg_info->size);
2776         return false;
2777     }
2778 
2779     vdev->vga = g_new0(VFIOVGA, 1);
2780 
2781     vdev->vga->fd_offset = reg_info->offset;
2782     vdev->vga->fd = vdev->vbasedev.fd;
2783 
2784     vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2785     vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2786     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
2787 
2788     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2789                           OBJECT(vdev), &vfio_vga_ops,
2790                           &vdev->vga->region[QEMU_PCI_VGA_MEM],
2791                           "vfio-vga-mmio@0xa0000",
2792                           QEMU_PCI_VGA_MEM_SIZE);
2793 
2794     vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2795     vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2796     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
2797 
2798     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2799                           OBJECT(vdev), &vfio_vga_ops,
2800                           &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
2801                           "vfio-vga-io@0x3b0",
2802                           QEMU_PCI_VGA_IO_LO_SIZE);
2803 
2804     vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2805     vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2806     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
2807 
2808     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
2809                           OBJECT(vdev), &vfio_vga_ops,
2810                           &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
2811                           "vfio-vga-io@0x3c0",
2812                           QEMU_PCI_VGA_IO_HI_SIZE);
2813 
2814     pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2815                      &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2816                      &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
2817 
2818     return true;
2819 }
2820 
2821 bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
2822 {
2823     VFIODevice *vbasedev = &vdev->vbasedev;
2824     struct vfio_region_info *reg_info = NULL;
2825     struct vfio_irq_info irq_info;
2826     int i, ret = -1;
2827 
2828     /* Sanity check device */
2829     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2830         error_setg(errp, "this isn't a PCI device");
2831         return false;
2832     }
2833 
2834     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2835         error_setg(errp, "unexpected number of io regions %u",
2836                    vbasedev->num_regions);
2837         return false;
2838     }
2839 
2840     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2841         error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
2842         return false;
2843     }
2844 
2845     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2846         char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
2847 
2848         ret = vfio_region_setup(OBJECT(vdev), vbasedev,
2849                                 &vdev->bars[i].region, i, name);
2850         g_free(name);
2851 
2852         if (ret) {
2853             error_setg_errno(errp, -ret, "failed to get region %d info", i);
2854             return false;
2855         }
2856 
2857         QLIST_INIT(&vdev->bars[i].quirks);
2858     }
2859 
2860     ret = vfio_device_get_region_info(vbasedev,
2861                                       VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
2862     if (ret) {
2863         error_setg_errno(errp, -ret, "failed to get config info");
2864         return false;
2865     }
2866 
2867     trace_vfio_pci_populate_device_config(vdev->vbasedev.name,
2868                                       (unsigned long)reg_info->size,
2869                                       (unsigned long)reg_info->offset,
2870                                       (unsigned long)reg_info->flags);
2871 
2872     vdev->config_size = reg_info->size;
2873     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2874         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2875     }
2876     vdev->config_offset = reg_info->offset;
2877 
2878     if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
2879         if (!vfio_populate_vga(vdev, errp)) {
2880             error_append_hint(errp, "device does not support "
2881                               "requested feature x-vga\n");
2882             return false;
2883         }
2884     }
2885 
2886     ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
2887     if (ret) {
2888         /* This can fail for an old kernel or legacy PCI dev */
2889         trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret));
2890     } else if (irq_info.count == 1) {
2891         vdev->pci_aer = true;
2892     } else {
2893         warn_report(VFIO_MSG_PREFIX
2894                     "Could not enable error recovery for the device",
2895                     vbasedev->name);
2896     }
2897 
2898     return true;
2899 }
2900 
2901 void vfio_pci_put_device(VFIOPCIDevice *vdev)
2902 {
2903     vfio_display_finalize(vdev);
2904     vfio_bars_finalize(vdev);
2905     g_free(vdev->emulated_config_bits);
2906     g_free(vdev->rom);
2907     /*
2908      * XXX Leaking igd_opregion is not an oversight, we can't remove the
2909      * fw_cfg entry therefore leaking this allocation seems like the safest
2910      * option.
2911      *
2912      * g_free(vdev->igd_opregion);
2913      */
2914 
2915     vfio_device_detach(&vdev->vbasedev);
2916 
2917     g_free(vdev->vbasedev.name);
2918     g_free(vdev->msix);
2919 }
2920 
2921 static void vfio_err_notifier_handler(void *opaque)
2922 {
2923     VFIOPCIDevice *vdev = opaque;
2924 
2925     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
2926         return;
2927     }
2928 
2929     /*
2930      * TBD. Retrieve the error details and decide what action
2931      * needs to be taken. One of the actions could be to pass
2932      * the error to the guest and have the guest driver recover
2933      * from the error. This requires that PCIe capabilities be
2934      * exposed to the guest. For now, we just terminate the
2935      * guest to contain the error.
2936      */
2937 
2938     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
2939 
2940     vm_stop(RUN_STATE_INTERNAL_ERROR);
2941 }
2942 
2943 /*
2944  * Registers error notifier for devices supporting error recovery.
2945  * If we encounter a failure in this function, we report an error
2946  * and continue after disabling error recovery support for the
2947  * device.
2948  */
2949 void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev)
2950 {
2951     Error *err = NULL;
2952     int32_t fd;
2953 
2954     if (!vdev->pci_aer) {
2955         return;
2956     }
2957 
2958     if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0,
2959                             &err)) {
2960         error_report_err(err);
2961         vdev->pci_aer = false;
2962         return;
2963     }
2964 
2965     fd = event_notifier_get_fd(&vdev->err_notifier);
2966     qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
2967 
2968     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
2969                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
2970         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2971         qemu_set_fd_handler(fd, NULL, NULL, vdev);
2972         vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
2973         vdev->pci_aer = false;
2974     }
2975 }
2976 
2977 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
2978 {
2979     Error *err = NULL;
2980 
2981     if (!vdev->pci_aer) {
2982         return;
2983     }
2984 
2985     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
2986                                        VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
2987         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2988     }
2989     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
2990                         NULL, NULL, vdev);
2991     vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
2992 }
2993 
2994 static void vfio_req_notifier_handler(void *opaque)
2995 {
2996     VFIOPCIDevice *vdev = opaque;
2997     Error *err = NULL;
2998 
2999     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
3000         return;
3001     }
3002 
3003     qdev_unplug(DEVICE(vdev), &err);
3004     if (err) {
3005         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3006     }
3007 }
3008 
3009 void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev)
3010 {
3011     struct vfio_irq_info irq_info;
3012     Error *err = NULL;
3013     int32_t fd;
3014     int ret;
3015 
3016     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
3017         return;
3018     }
3019 
3020     ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX,
3021                                    &irq_info);
3022     if (ret < 0 || irq_info.count < 1) {
3023         return;
3024     }
3025 
3026     if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0,
3027                             &err)) {
3028         error_report_err(err);
3029         return;
3030     }
3031 
3032     fd = event_notifier_get_fd(&vdev->req_notifier);
3033     qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
3034 
3035     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
3036                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
3037         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3038         qemu_set_fd_handler(fd, NULL, NULL, vdev);
3039         vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
3040     } else {
3041         vdev->req_enabled = true;
3042     }
3043 }
3044 
3045 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
3046 {
3047     Error *err = NULL;
3048 
3049     if (!vdev->req_enabled) {
3050         return;
3051     }
3052 
3053     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
3054                                        VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
3055         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3056     }
3057     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
3058                         NULL, NULL, vdev);
3059     vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
3060 
3061     vdev->req_enabled = false;
3062 }
3063 
3064 bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
3065 {
3066     PCIDevice *pdev = &vdev->pdev;
3067     VFIODevice *vbasedev = &vdev->vbasedev;
3068     uint32_t config_space_size;
3069     int ret;
3070 
3071     config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);
3072 
3073     /* Get a copy of config space */
3074     ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
3075                                      vdev->pdev.config);
3076     if (ret < (int)config_space_size) {
3077         ret = ret < 0 ? -ret : EFAULT;
3078         error_setg_errno(errp, ret, "failed to read device config space");
3079         return false;
3080     }
3081 
3082     /* vfio emulates a lot for us, but some bits need extra love */
3083     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3084 
3085     /* QEMU can choose to expose the ROM or not */
3086     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3087     /* QEMU can also add or extend BARs */
3088     memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
3089 
3090     /*
3091      * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
3092      * device ID is managed by the vendor and need only be a 16-bit value.
3093      * Allow any 16-bit value for subsystem so they can be hidden or changed.
3094      */
3095     if (vdev->vendor_id != PCI_ANY_ID) {
3096         if (vdev->vendor_id >= 0xffff) {
3097             error_setg(errp, "invalid PCI vendor ID provided");
3098             return false;
3099         }
3100         vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
3101         trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
3102     } else {
3103         vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
3104     }
3105 
3106     if (vdev->device_id != PCI_ANY_ID) {
3107         if (vdev->device_id > 0xffff) {
3108             error_setg(errp, "invalid PCI device ID provided");
3109             return false;
3110         }
3111         vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
3112         trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
3113     } else {
3114         vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
3115     }
3116 
3117     if (vdev->sub_vendor_id != PCI_ANY_ID) {
3118         if (vdev->sub_vendor_id > 0xffff) {
3119             error_setg(errp, "invalid PCI subsystem vendor ID provided");
3120             return false;
3121         }
3122         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
3123                                vdev->sub_vendor_id, ~0);
3124         trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
3125                                               vdev->sub_vendor_id);
3126     }
3127 
3128     if (vdev->sub_device_id != PCI_ANY_ID) {
3129         if (vdev->sub_device_id > 0xffff) {
3130             error_setg(errp, "invalid PCI subsystem device ID provided");
3131             return false;
3132         }
3133         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
3134         trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
3135                                               vdev->sub_device_id);
3136     }
3137 
3138     /* QEMU can change multi-function devices to single function, or reverse */
3139     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3140                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
3141 
3142     /* Restore or clear multifunction, this is always controlled by QEMU */
3143     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3144         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3145     } else {
3146         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3147     }
3148 
3149     /*
3150      * Clear host resource mapping info.  If we choose not to register a
3151      * BAR, such as might be the case with the option ROM, we can get
3152      * confusing, unwritable, residual addresses from the host here.
3153      */
3154     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3155     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3156 
3157     vfio_pci_size_rom(vdev);
3158 
3159     vfio_bars_prepare(vdev);
3160 
3161     if (!vfio_msix_early_setup(vdev, errp)) {
3162         return false;
3163     }
3164 
3165     vfio_bars_register(vdev);
3166 
3167     return true;
3168 }
3169 
3170 bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
3171 {
3172     PCIDevice *pdev = &vdev->pdev;
3173 
3174     /* QEMU emulates all of MSI & MSIX */
3175     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3176         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3177                MSIX_CAP_LENGTH);
3178     }
3179 
3180     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3181         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3182                vdev->msi_cap_size);
3183     }
3184 
3185     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3186         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3187                                              vfio_intx_mmap_enable, vdev);
3188         pci_device_set_intx_routing_notifier(&vdev->pdev,
3189                                              vfio_intx_routing_notifier);
3190         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
3191         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
3192         if (!vfio_intx_enable(vdev, errp)) {
3193             timer_free(vdev->intx.mmap_timer);
3194             pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3195             kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3196             return false;
3197         }
3198     }
3199     return true;
3200 }
3201 
3202 static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
3203 {
3204     ERRP_GUARD();
3205     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
3206     VFIODevice *vbasedev = &vdev->vbasedev;
3207     int i;
3208     char uuid[UUID_STR_LEN];
3209     g_autofree char *name = NULL;
3210 
3211     if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
3212         if (!(~vdev->host.domain || ~vdev->host.bus ||
3213               ~vdev->host.slot || ~vdev->host.function)) {
3214             error_setg(errp, "No provided host device");
3215             error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
3216 #ifdef CONFIG_IOMMUFD
3217                               "or -device vfio-pci,fd=DEVICE_FD "
3218 #endif
3219                               "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
3220             return;
3221         }
3222         vbasedev->sysfsdev =
3223             g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
3224                             vdev->host.domain, vdev->host.bus,
3225                             vdev->host.slot, vdev->host.function);
3226     }
3227 
3228     if (!vfio_device_get_name(vbasedev, errp)) {
3229         return;
3230     }
3231 
3232     /*
3233      * Mediated devices *might* operate compatibly with discarding of RAM, but
3234      * we cannot know for certain, it depends on whether the mdev vendor driver
3235      * stays in sync with the active working set of the guest driver.  Prevent
3236      * the x-balloon-allowed option unless this is minimally an mdev device.
3237      */
3238     vbasedev->mdev = vfio_device_is_mdev(vbasedev);
3239 
3240     trace_vfio_mdev(vbasedev->name, vbasedev->mdev);
3241 
3242     if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) {
3243         error_setg(errp, "x-balloon-allowed only potentially compatible "
3244                    "with mdev devices");
3245         goto error;
3246     }
3247 
3248     if (!qemu_uuid_is_null(&vdev->vf_token)) {
3249         qemu_uuid_unparse(&vdev->vf_token, uuid);
3250         name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
3251     } else {
3252         name = g_strdup(vbasedev->name);
3253     }
3254 
3255     if (!vfio_device_attach(name, vbasedev,
3256                             pci_device_iommu_address_space(pdev), errp)) {
3257         goto error;
3258     }
3259 
3260     if (!vfio_pci_populate_device(vdev, errp)) {
3261         goto error;
3262     }
3263 
3264     if (!vfio_pci_config_setup(vdev, errp)) {
3265         goto error;
3266     }
3267 
3268     if (!vbasedev->mdev &&
3269         !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
3270         error_prepend(errp, "Failed to set vIOMMU: ");
3271         goto out_teardown;
3272     }
3273 
3274     if (!vfio_pci_add_capabilities(vdev, errp)) {
3275         goto out_unset_idev;
3276     }
3277 
3278     if (!vfio_config_quirk_setup(vdev, errp)) {
3279         goto out_unset_idev;
3280     }
3281 
3282     if (vdev->vga) {
3283         vfio_vga_quirk_setup(vdev);
3284     }
3285 
3286     for (i = 0; i < PCI_ROM_SLOT; i++) {
3287         vfio_bar_quirk_setup(vdev, i);
3288     }
3289 
3290     if (!vfio_pci_interrupt_setup(vdev, errp)) {
3291         goto out_unset_idev;
3292     }
3293 
3294     if (vdev->display != ON_OFF_AUTO_OFF) {
3295         if (!vfio_display_probe(vdev, errp)) {
3296             goto out_deregister;
3297         }
3298     }
3299     if (vdev->enable_ramfb && vdev->dpy == NULL) {
3300         error_setg(errp, "ramfb=on requires display=on");
3301         goto out_deregister;
3302     }
3303     if (vdev->display_xres || vdev->display_yres) {
3304         if (vdev->dpy == NULL) {
3305             error_setg(errp, "xres and yres properties require display=on");
3306             goto out_deregister;
3307         }
3308         if (vdev->dpy->edid_regs == NULL) {
3309             error_setg(errp, "xres and yres properties need edid support");
3310             goto out_deregister;
3311         }
3312     }
3313 
3314     if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) {
3315         warn_report("x-ramfb-migrate=on but ramfb=off. "
3316                     "Forcing x-ramfb-migrate to off.");
3317         vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
3318     }
3319     if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
3320         if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) {
3321             vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
3322         } else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) {
3323             error_setg(errp, "x-ramfb-migrate requires enable-migration");
3324             goto out_deregister;
3325         }
3326     }
3327 
3328     if (!pdev->failover_pair_id) {
3329         if (!vfio_migration_realize(vbasedev, errp)) {
3330             goto out_deregister;
3331         }
3332     }
3333 
3334     vfio_pci_register_err_notifier(vdev);
3335     vfio_pci_register_req_notifier(vdev);
3336     vfio_setup_resetfn_quirk(vdev);
3337 
3338     return;
3339 
3340 out_deregister:
3341     if (vdev->interrupt == VFIO_INT_INTx) {
3342         vfio_intx_disable(vdev);
3343     }
3344     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3345     if (vdev->irqchip_change_notifier.notify) {
3346         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3347     }
3348     if (vdev->intx.mmap_timer) {
3349         timer_free(vdev->intx.mmap_timer);
3350     }
3351 out_unset_idev:
3352     if (!vbasedev->mdev) {
3353         pci_device_unset_iommu_device(pdev);
3354     }
3355 out_teardown:
3356     vfio_pci_teardown_msi(vdev);
3357     vfio_pci_bars_exit(vdev);
3358 error:
3359     error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
3360 }
3361 
3362 static void vfio_instance_finalize(Object *obj)
3363 {
3364     VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
3365 
3366     vfio_pci_put_device(vdev);
3367 }
3368 
3369 static void vfio_exitfn(PCIDevice *pdev)
3370 {
3371     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
3372     VFIODevice *vbasedev = &vdev->vbasedev;
3373 
3374     vfio_unregister_req_notifier(vdev);
3375     vfio_unregister_err_notifier(vdev);
3376     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3377     if (vdev->irqchip_change_notifier.notify) {
3378         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3379     }
3380     vfio_disable_interrupts(vdev);
3381     if (vdev->intx.mmap_timer) {
3382         timer_free(vdev->intx.mmap_timer);
3383     }
3384     vfio_pci_teardown_msi(vdev);
3385     vfio_pci_disable_rp_atomics(vdev);
3386     vfio_pci_bars_exit(vdev);
3387     vfio_migration_exit(vbasedev);
3388     if (!vbasedev->mdev) {
3389         pci_device_unset_iommu_device(pdev);
3390     }
3391 }
3392 
3393 static void vfio_pci_reset(DeviceState *dev)
3394 {
3395     VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
3396 
3397     /* Do not reset the device during qemu_system_reset prior to cpr load */
3398     if (cpr_is_incoming()) {
3399         return;
3400     }
3401 
3402     trace_vfio_pci_reset(vdev->vbasedev.name);
3403 
3404     vfio_pci_pre_reset(vdev);
3405 
3406     if (vdev->display != ON_OFF_AUTO_OFF) {
3407         vfio_display_reset(vdev);
3408     }
3409 
3410     if (vdev->resetfn && !vdev->resetfn(vdev)) {
3411         goto post_reset;
3412     }
3413 
3414     if (vdev->vbasedev.reset_works &&
3415         (vdev->has_flr || !vdev->has_pm_reset) &&
3416         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3417         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3418         goto post_reset;
3419     }
3420 
3421     /* See if we can do our own bus reset */
3422     if (!vfio_pci_hot_reset_one(vdev)) {
3423         goto post_reset;
3424     }
3425 
3426     /* If nothing else works and the device supports PM reset, use it */
3427     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3428         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3429         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3430         goto post_reset;
3431     }
3432 
3433 post_reset:
3434     vfio_pci_post_reset(vdev);
3435 }
3436 
3437 static void vfio_instance_init(Object *obj)
3438 {
3439     PCIDevice *pci_dev = PCI_DEVICE(obj);
3440     VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
3441     VFIODevice *vbasedev = &vdev->vbasedev;
3442 
3443     device_add_bootindex_property(obj, &vdev->bootindex,
3444                                   "bootindex", NULL,
3445                                   &pci_dev->qdev);
3446     vdev->host.domain = ~0U;
3447     vdev->host.bus = ~0U;
3448     vdev->host.slot = ~0U;
3449     vdev->host.function = ~0U;
3450 
3451     vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
3452                      DEVICE(vdev), false);
3453 
3454     vdev->nv_gpudirect_clique = 0xFF;
3455 
3456     /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
3457      * line, therefore, no need to wait to realize like other devices */
3458     pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
3459 
3460     /*
3461      * A device that is resuming for cpr is already configured, so do not
3462      * reset it during qemu_system_reset prior to cpr load, else interrupts
3463      * may be lost.
3464      */
3465     pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR;
3466 }
3467 
3468 static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data)
3469 {
3470     DeviceClass *dc = DEVICE_CLASS(klass);
3471     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3472 
3473     dc->desc = "VFIO PCI base device";
3474     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3475     pdc->exit = vfio_exitfn;
3476     pdc->config_read = vfio_pci_read_config;
3477     pdc->config_write = vfio_pci_write_config;
3478 }
3479 
3480 static const TypeInfo vfio_pci_base_dev_info = {
3481     .name = TYPE_VFIO_PCI_BASE,
3482     .parent = TYPE_PCI_DEVICE,
3483     .instance_size = sizeof(VFIOPCIDevice),
3484     .abstract = true,
3485     .class_init = vfio_pci_base_dev_class_init,
3486     .interfaces = (const InterfaceInfo[]) {
3487         { INTERFACE_PCIE_DEVICE },
3488         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
3489         { }
3490     },
3491 };
3492 
3493 static PropertyInfo vfio_pci_migration_multifd_transfer_prop;
3494 
3495 static const Property vfio_pci_dev_properties[] = {
3496     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
3497     DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
3498     DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
3499     DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
3500                             vbasedev.pre_copy_dirty_page_tracking,
3501                             ON_OFF_AUTO_ON),
3502     DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
3503                             vbasedev.device_dirty_page_tracking,
3504                             ON_OFF_AUTO_ON),
3505     DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
3506                             display, ON_OFF_AUTO_OFF),
3507     DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
3508     DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
3509     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3510                        intx.mmap_timeout, 1100),
3511     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3512                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
3513     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
3514                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
3515     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
3516                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, true),
3517     DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
3518                     VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
3519     DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
3520                             igd_legacy_mode, ON_OFF_AUTO_AUTO),
3521     DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
3522                             vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
3523     DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
3524                 vbasedev.migration_multifd_transfer,
3525                 vfio_pci_migration_multifd_transfer_prop, OnOffAuto,
3526                 .set_default = true, .defval.i = ON_OFF_AUTO_AUTO),
3527     DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
3528                      vbasedev.migration_events, false),
3529     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3530     DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
3531                      vbasedev.ram_block_discard_allowed, false),
3532     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
3533     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
3534     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3535     DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
3536                      no_geforce_quirks, false),
3537     DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
3538                      false),
3539     DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
3540                      false),
3541     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
3542     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
3543     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
3544                        sub_vendor_id, PCI_ANY_ID),
3545     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
3546                        sub_device_id, PCI_ANY_ID),
3547     DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
3548     DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
3549                                    nv_gpudirect_clique,
3550                                    qdev_prop_nv_gpudirect_clique, uint8_t),
3551     DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
3552                                 OFF_AUTO_PCIBAR_OFF),
3553 #ifdef CONFIG_IOMMUFD
3554     DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
3555                      TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
3556 #endif
3557     DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
3558 };
3559 
#ifdef CONFIG_IOMMUFD
/*
 * Setter of the "fd" string property (registered in
 * vfio_pci_dev_class_init()): forwards @str and @errp to
 * vfio_device_set_fd() for the VFIODevice embedded in @obj.
 */
static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
{
    vfio_device_set_fd(&VFIO_PCI_BASE(obj)->vbasedev, str, errp);
}
#endif
3567 
3568 static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data)
3569 {
3570     DeviceClass *dc = DEVICE_CLASS(klass);
3571     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3572 
3573     device_class_set_legacy_reset(dc, vfio_pci_reset);
3574     device_class_set_props(dc, vfio_pci_dev_properties);
3575 #ifdef CONFIG_IOMMUFD
3576     object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
3577 #endif
3578     dc->vmsd = &vfio_cpr_pci_vmstate;
3579     dc->desc = "VFIO-based PCI device assignment";
3580     pdc->realize = vfio_pci_realize;
3581 
3582     object_class_property_set_description(klass, /* 1.3 */
3583                                           "host",
3584                                           "Host PCI address [domain:]<bus:slot.function> of assigned device");
3585     object_class_property_set_description(klass, /* 1.3 */
3586                                           "x-intx-mmap-timeout-ms",
3587                                           "When EOI is not provided by KVM/QEMU, wait time "
3588                                           "(milliseconds) to re-enable device direct access "
3589                                           "after INTx (DEBUG)");
3590     object_class_property_set_description(klass, /* 1.5 */
3591                                           "x-vga",
3592                                           "Expose VGA address spaces for device");
3593     object_class_property_set_description(klass, /* 2.3 */
3594                                           "x-req",
3595                                           "Disable device request notification support (DEBUG)");
3596     object_class_property_set_description(klass, /* 2.4 and 2.5 */
3597                                           "x-no-mmap",
3598                                           "Disable MMAP for device. Allows to trace MMIO "
3599                                           "accesses (DEBUG)");
3600     object_class_property_set_description(klass, /* 2.5 */
3601                                           "x-no-kvm-intx",
3602                                           "Disable direct VFIO->KVM INTx injection. Allows to "
3603                                           "trace INTx interrupts (DEBUG)");
3604     object_class_property_set_description(klass, /* 2.5 */
3605                                           "x-no-kvm-msi",
3606                                           "Disable direct VFIO->KVM MSI injection. Allows to "
3607                                           "trace MSI interrupts (DEBUG)");
3608     object_class_property_set_description(klass, /* 2.5 */
3609                                           "x-no-kvm-msix",
3610                                           "Disable direct VFIO->KVM MSIx injection. Allows to "
3611                                           "trace MSIx interrupts (DEBUG)");
3612     object_class_property_set_description(klass, /* 2.5 */
3613                                           "x-pci-vendor-id",
3614                                           "Override PCI Vendor ID with provided value (DEBUG)");
3615     object_class_property_set_description(klass, /* 2.5 */
3616                                           "x-pci-device-id",
3617                                           "Override PCI device ID with provided value (DEBUG)");
3618     object_class_property_set_description(klass, /* 2.5 */
3619                                           "x-pci-sub-vendor-id",
3620                                           "Override PCI Subsystem Vendor ID with provided value "
3621                                           "(DEBUG)");
3622     object_class_property_set_description(klass, /* 2.5 */
3623                                           "x-pci-sub-device-id",
3624                                           "Override PCI Subsystem Device ID with provided value "
3625                                           "(DEBUG)");
3626     object_class_property_set_description(klass, /* 2.6 */
3627                                           "sysfsdev",
3628                                           "Host sysfs path of assigned device");
3629     object_class_property_set_description(klass, /* 2.7 */
3630                                           "x-igd-opregion",
3631                                           "Expose host IGD OpRegion to guest");
3632     object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */
3633                                           "x-igd-gms",
3634                                           "Override IGD data stolen memory size (32MiB units)");
3635     object_class_property_set_description(klass, /* 2.11 */
3636                                           "x-nv-gpudirect-clique",
3637                                           "Add NVIDIA GPUDirect capability indicating P2P DMA "
3638                                           "clique for device [0-15]");
3639     object_class_property_set_description(klass, /* 2.12 */
3640                                           "x-no-geforce-quirks",
3641                                           "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). "
3642                                           "Improves performance");
3643     object_class_property_set_description(klass, /* 2.12 */
3644                                           "display",
3645                                           "Enable display support for device, ex. vGPU");
3646     object_class_property_set_description(klass, /* 2.12 */
3647                                           "x-msix-relocation",
3648                                           "Specify MSI-X MMIO relocation to the end of specified "
3649                                           "existing BAR or new BAR to avoid virtualization overhead "
3650                                           "due to adjacent device registers");
3651     object_class_property_set_description(klass, /* 3.0 */
3652                                           "x-no-kvm-ioeventfd",
3653                                           "Disable registration of ioeventfds with KVM (DEBUG)");
3654     object_class_property_set_description(klass, /* 3.0 */
3655                                           "x-no-vfio-ioeventfd",
3656                                           "Disable linking of KVM ioeventfds to VFIO ioeventfds "
3657                                           "(DEBUG)");
3658     object_class_property_set_description(klass, /* 3.1 */
3659                                           "x-balloon-allowed",
3660                                           "Override allowing ballooning with device (DEBUG, DANGER)");
3661     object_class_property_set_description(klass, /* 3.2 */
3662                                           "xres",
3663                                           "Set X display resolution the vGPU should use");
3664     object_class_property_set_description(klass, /* 3.2 */
3665                                           "yres",
3666                                           "Set Y display resolution the vGPU should use");
3667     object_class_property_set_description(klass, /* 5.2 */
3668                                           "x-pre-copy-dirty-page-tracking",
3669                                           "Disable dirty pages tracking during iterative phase "
3670                                           "(DEBUG)");
3671     object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */
3672                                           "enable-migration",
3673                                           "Enale device migration. Also requires a host VFIO PCI "
3674                                           "variant or mdev driver with migration support enabled");
3675     object_class_property_set_description(klass, /* 8.1 */
3676                                           "vf-token",
3677                                           "Specify UUID VF token. Required for VF when PF is owned "
3678                                           "by another VFIO driver");
3679 #ifdef CONFIG_IOMMUFD
3680     object_class_property_set_description(klass, /* 9.0 */
3681                                           "iommufd",
3682                                           "Set host IOMMUFD backend device");
3683 #endif
3684     object_class_property_set_description(klass, /* 9.1 */
3685                                           "x-device-dirty-page-tracking",
3686                                           "Disable device dirty page tracking and use "
3687                                           "container-based dirty page tracking");
3688     object_class_property_set_description(klass, /* 9.1 */
3689                                           "migration-events",
3690                                           "Emit VFIO migration QAPI event when a VFIO device "
3691                                           "changes its migration state. For management applications");
3692     object_class_property_set_description(klass, /* 9.1 */
3693                                           "skip-vsc-check",
3694                                           "Skip config space check for Vendor Specific Capability. "
3695                                           "Setting to false will enforce strict checking of VSC content "
3696                                           "(DEBUG)");
3697     object_class_property_set_description(klass, /* 10.0 */
3698                                           "x-migration-multifd-transfer",
3699                                           "Transfer this device state via "
3700                                           "multifd channels when live migrating it");
3701 }
3702 
3703 static const TypeInfo vfio_pci_dev_info = {
3704     .name = TYPE_VFIO_PCI,
3705     .parent = TYPE_VFIO_PCI_BASE,
3706     .class_init = vfio_pci_dev_class_init,
3707     .instance_init = vfio_instance_init,
3708     .instance_finalize = vfio_instance_finalize,
3709 };
3710 
3711 static const Property vfio_pci_dev_nohotplug_properties[] = {
3712     DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
3713     DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate,
3714                             ON_OFF_AUTO_AUTO),
3715 };
3716 
3717 static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass,
3718                                               const void *data)
3719 {
3720     DeviceClass *dc = DEVICE_CLASS(klass);
3721 
3722     device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
3723     dc->hotpluggable = false;
3724 
3725     object_class_property_set_description(klass, /* 3.1 */
3726                                           "ramfb",
3727                                           "Enable ramfb to provide pre-boot graphics for devices "
3728                                           "enabling display option");
3729     object_class_property_set_description(klass, /* 8.2 */
3730                                           "x-ramfb-migrate",
3731                                           "Override default migration support for ramfb support "
3732                                           "(DEBUG)");
3733 }
3734 
3735 static const TypeInfo vfio_pci_nohotplug_dev_info = {
3736     .name = TYPE_VFIO_PCI_NOHOTPLUG,
3737     .parent = TYPE_VFIO_PCI,
3738     .instance_size = sizeof(VFIOPCIDevice),
3739     .class_init = vfio_pci_nohotplug_dev_class_init,
3740 };
3741 
3742 static void register_vfio_pci_dev_type(void)
3743 {
3744     /*
3745      * Ordinary ON_OFF_AUTO property isn't runtime-mutable, but source VM can
3746      * run for a long time before being migrated so it is desirable to have a
3747      * fallback mechanism to the old way of transferring VFIO device state if
3748      * it turns to be necessary.
3749      * The following makes this type of property have the same mutability level
3750      * as ordinary migration parameters.
3751      */
3752     vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto;
3753     vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true;
3754 
3755     type_register_static(&vfio_pci_base_dev_info);
3756     type_register_static(&vfio_pci_dev_info);
3757     type_register_static(&vfio_pci_nohotplug_dev_info);
3758 }
3759 
3760 type_init(register_vfio_pci_dev_type)
3761