xref: /openbmc/qemu/hw/vfio/pci.c (revision a84e2e04e8b03c47ee9304d9b5158b212d11183a)
1 /*
2  * vfio based device assignment support
3  *
4  * Copyright Red Hat, Inc. 2012
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Based on qemu-kvm device-assignment:
13  *  Adapted for KVM by Qumranet.
14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19  */
20 
21 #include "qemu/osdep.h"
22 #include CONFIG_DEVICES /* CONFIG_IOMMUFD */
23 #include <linux/vfio.h>
24 #include <sys/ioctl.h>
25 
26 #include "hw/hw.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/pci/pci_bridge.h"
30 #include "hw/qdev-properties.h"
31 #include "hw/qdev-properties-system.h"
32 #include "hw/vfio/vfio-cpr.h"
33 #include "migration/vmstate.h"
34 #include "migration/cpr.h"
35 #include "qobject/qdict.h"
36 #include "qemu/error-report.h"
37 #include "qemu/main-loop.h"
38 #include "qemu/module.h"
39 #include "qemu/range.h"
40 #include "qemu/units.h"
41 #include "system/kvm.h"
42 #include "system/runstate.h"
43 #include "pci.h"
44 #include "trace.h"
45 #include "qapi/error.h"
46 #include "migration/blocker.h"
47 #include "migration/qemu-file.h"
48 #include "system/iommufd.h"
49 #include "vfio-migration-internal.h"
50 #include "vfio-helpers.h"
51 
52 /* Protected by BQL */
53 static KVMRouteChange vfio_route_change;
54 
55 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
56 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
57 static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
58 
59 /* Create new or reuse existing eventfd */
60 static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
61                                const char *name, int nr, Error **errp)
62 {
63     int fd, ret;
64 
65     fd = vfio_cpr_load_vector_fd(vdev, name, nr);
66     if (fd >= 0) {
67         event_notifier_init_fd(e, fd);
68         return true;
69     }
70 
71     ret = event_notifier_init(e, 0);
72     if (ret) {
73         error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name);
74         return false;
75     }
76 
77     fd = event_notifier_get_fd(e);
78     vfio_cpr_save_vector_fd(vdev, name, nr, fd);
79     return true;
80 }
81 
82 static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e,
83                                   const char *name, int nr)
84 {
85     vfio_cpr_delete_vector_fd(vdev, name, nr);
86     event_notifier_cleanup(e);
87 }
88 
89 /*
90  * Disabling BAR mmaping can be slow, but toggling it around INTx can
91  * also be a huge overhead.  We try to get the best of both worlds by
92  * waiting until an interrupt to disable mmaps (subsequent transitions
93  * to the same state are effectively no overhead).  If the interrupt has
94  * been serviced and the time gap is long enough, we re-enable mmaps for
95  * performance.  This works well for things like graphics cards, which
96  * may not use their interrupt at all and are penalized to an unusable
97  * level by read/write BAR traps.  Other devices, like NICs, have more
98  * regular interrupts and see much better latency by staying in non-mmap
99  * mode.  We therefore set the default mmap_timeout such that a ping
100  * is just enough to keep the mmap disabled.  Users can experiment with
101  * other options with the x-intx-mmap-timeout-ms parameter (a value of
102  * zero disables the timer).
103  */
104 static void vfio_intx_mmap_enable(void *opaque)
105 {
106     VFIOPCIDevice *vdev = opaque;
107 
108     if (vdev->intx.pending) {
109         timer_mod(vdev->intx.mmap_timer,
110                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
111         return;
112     }
113 
114     vfio_mmap_set_enabled(vdev, true);
115 }
116 
117 static void vfio_intx_interrupt(void *opaque)
118 {
119     VFIOPCIDevice *vdev = opaque;
120 
121     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
122         return;
123     }
124 
125     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
126 
127     vdev->intx.pending = true;
128     pci_irq_assert(&vdev->pdev);
129     vfio_mmap_set_enabled(vdev, false);
130     if (vdev->intx.mmap_timeout) {
131         timer_mod(vdev->intx.mmap_timer,
132                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
133     }
134 }
135 
136 void vfio_pci_intx_eoi(VFIODevice *vbasedev)
137 {
138     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
139 
140     if (!vdev->intx.pending) {
141         return;
142     }
143 
144     trace_vfio_pci_intx_eoi(vbasedev->name);
145 
146     vdev->intx.pending = false;
147     pci_irq_deassert(&vdev->pdev);
148     vfio_device_irq_unmask(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
149 }
150 
151 static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
152 {
153 #ifdef CONFIG_KVM
154     int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
155 
156     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
157         vdev->intx.route.mode != PCI_INTX_ENABLED ||
158         !kvm_resamplefds_enabled()) {
159         return true;
160     }
161 
162     /* Get to a known interrupt state */
163     qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
164     vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
165     vdev->intx.pending = false;
166     pci_irq_deassert(&vdev->pdev);
167 
168     /* Get an eventfd for resample/unmask */
169     if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
170         goto fail;
171     }
172 
173     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
174                                            &vdev->intx.interrupt,
175                                            &vdev->intx.unmask,
176                                            vdev->intx.route.irq)) {
177         error_setg_errno(errp, errno, "failed to setup resample irqfd");
178         goto fail_irqfd;
179     }
180 
181     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
182                                        VFIO_IRQ_SET_ACTION_UNMASK,
183                                        event_notifier_get_fd(&vdev->intx.unmask),
184                                        errp)) {
185         goto fail_vfio;
186     }
187 
188     /* Let'em rip */
189     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
190 
191     vdev->intx.kvm_accel = true;
192 
193     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
194 
195     return true;
196 
197 fail_vfio:
198     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
199                                           vdev->intx.route.irq);
200 fail_irqfd:
201     vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
202 fail:
203     qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
204     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
205     return false;
206 #else
207     return true;
208 #endif
209 }
210 
211 static bool vfio_cpr_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
212 {
213 #ifdef CONFIG_KVM
214     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
215         vdev->intx.route.mode != PCI_INTX_ENABLED ||
216         !kvm_resamplefds_enabled()) {
217         return true;
218     }
219 
220     if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
221         return false;
222     }
223 
224     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
225                                            &vdev->intx.interrupt,
226                                            &vdev->intx.unmask,
227                                            vdev->intx.route.irq)) {
228         error_setg_errno(errp, errno, "failed to setup resample irqfd");
229         vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
230         return false;
231     }
232 
233     vdev->intx.kvm_accel = true;
234     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
235     return true;
236 #else
237     return true;
238 #endif
239 }
240 
241 static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
242 {
243 #ifdef CONFIG_KVM
244     if (!vdev->intx.kvm_accel) {
245         return;
246     }
247 
248     /*
249      * Get to a known state, hardware masked, QEMU ready to accept new
250      * interrupts, QEMU IRQ de-asserted.
251      */
252     vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
253     vdev->intx.pending = false;
254     pci_irq_deassert(&vdev->pdev);
255 
256     /* Tell KVM to stop listening for an INTx irqfd */
257     if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
258                                               vdev->intx.route.irq)) {
259         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
260     }
261 
262     /* We only need to close the eventfd for VFIO to cleanup the kernel side */
263     vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
264 
265     /* QEMU starts listening for interrupt events. */
266     qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
267                         vfio_intx_interrupt, NULL, vdev);
268 
269     vdev->intx.kvm_accel = false;
270 
271     /* If we've missed an event, let it re-fire through QEMU */
272     vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
273 
274     trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
275 #endif
276 }
277 
278 static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
279 {
280     Error *err = NULL;
281 
282     trace_vfio_intx_update(vdev->vbasedev.name,
283                            vdev->intx.route.irq, route->irq);
284 
285     vfio_intx_disable_kvm(vdev);
286 
287     vdev->intx.route = *route;
288 
289     if (route->mode != PCI_INTX_ENABLED) {
290         return;
291     }
292 
293     if (!vfio_intx_enable_kvm(vdev, &err)) {
294         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
295     }
296 
297     /* Re-enable the interrupt in cased we missed an EOI */
298     vfio_pci_intx_eoi(&vdev->vbasedev);
299 }
300 
301 static void vfio_intx_routing_notifier(PCIDevice *pdev)
302 {
303     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
304     PCIINTxRoute route;
305 
306     if (vdev->interrupt != VFIO_INT_INTx) {
307         return;
308     }
309 
310     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
311 
312     if (pci_intx_route_changed(&vdev->intx.route, &route)) {
313         vfio_intx_update(vdev, &route);
314     }
315 }
316 
317 static void vfio_irqchip_change(Notifier *notify, void *data)
318 {
319     VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
320                                        irqchip_change_notifier);
321 
322     vfio_intx_update(vdev, &vdev->intx.route);
323 }
324 
325 static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
326 {
327     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
328     Error *err = NULL;
329     int32_t fd;
330 
331 
332     if (!pin) {
333         return true;
334     }
335 
336     /*
337      * Do not alter interrupt state during vfio_realize and cpr load.
338      * The incoming state is cleared thereafter.
339      */
340     if (!cpr_is_incoming()) {
341         vfio_disable_interrupts(vdev);
342     }
343 
344     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
345     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
346 
347 #ifdef CONFIG_KVM
348     /*
349      * Only conditional to avoid generating error messages on platforms
350      * where we won't actually use the result anyway.
351      */
352     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
353         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
354                                                         vdev->intx.pin);
355     }
356 #endif
357 
358     if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0,
359                             errp)) {
360         return false;
361     }
362     fd = event_notifier_get_fd(&vdev->intx.interrupt);
363     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
364 
365 
366     if (cpr_is_incoming()) {
367         if (!vfio_cpr_intx_enable_kvm(vdev, &err)) {
368             warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
369         }
370         goto skip_signaling;
371     }
372 
373     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
374                                 VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
375         qemu_set_fd_handler(fd, NULL, NULL, vdev);
376         vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
377         return false;
378     }
379 
380     if (!vfio_intx_enable_kvm(vdev, &err)) {
381         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
382     }
383 
384 skip_signaling:
385     vdev->interrupt = VFIO_INT_INTx;
386 
387     trace_vfio_intx_enable(vdev->vbasedev.name);
388     return true;
389 }
390 
391 static void vfio_intx_disable(VFIOPCIDevice *vdev)
392 {
393     int fd;
394 
395     timer_del(vdev->intx.mmap_timer);
396     vfio_intx_disable_kvm(vdev);
397     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
398     vdev->intx.pending = false;
399     pci_irq_deassert(&vdev->pdev);
400     vfio_mmap_set_enabled(vdev, true);
401 
402     fd = event_notifier_get_fd(&vdev->intx.interrupt);
403     qemu_set_fd_handler(fd, NULL, NULL, vdev);
404     vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0);
405 
406     vdev->interrupt = VFIO_INT_NONE;
407 
408     trace_vfio_intx_disable(vdev->vbasedev.name);
409 }
410 
411 bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp)
412 {
413     return vfio_intx_enable(vdev, errp);
414 }
415 
416 void vfio_pci_intx_set_handler(VFIOPCIDevice *vdev, bool enable)
417 {
418     int fd = event_notifier_get_fd(&vdev->intx.interrupt);
419     IOHandler *handler = (enable ? vfio_intx_interrupt : NULL);
420 
421     qemu_set_fd_handler(fd, handler, NULL, vdev);
422 }
423 
424 /*
425  * MSI/X
426  */
/*
 * fd handler for an MSI or MSI-X vector eventfd; @opaque is the
 * VFIOMSIVector that fired.
 *
 * Looks up the vector's message for tracing and injects the interrupt via
 * msi_notify() or msix_notify(), depending on the device's current
 * interrupt mode.  Aborts if the device is in neither MSI nor MSI-X mode.
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIOPCIDevice *vdev = vector->vdev;
    int nr = vector - vdev->msi_vectors;   /* index of this vector */
    MSIMessage (*fetch_msg)(PCIDevice *dev, unsigned idx);
    void (*deliver)(PCIDevice *dev, unsigned idx);
    MSIMessage msg;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

    switch (vdev->interrupt) {
    case VFIO_INT_MSIX:
        fetch_msg = msix_get_message;
        deliver = msix_notify;

        /* A masked vector firing needs to use the PBA, enable it */
        if (msix_is_masked(&vdev->pdev, nr)) {
            set_bit(nr, vdev->msix->pending);
            memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
            trace_vfio_msix_pba_enable(vdev->vbasedev.name);
        }
        break;
    case VFIO_INT_MSI:
        fetch_msg = msi_get_message;
        deliver = msi_notify;
        break;
    default:
        abort();
    }

    msg = fetch_msg(&vdev->pdev, nr);
    trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
    deliver(&vdev->pdev, nr);
}
461 
462 void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr, bool enable)
463 {
464     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
465     int fd = event_notifier_get_fd(&vector->interrupt);
466     IOHandler *handler = (enable ? vfio_msi_interrupt : NULL);
467 
468     qemu_set_fd_handler(fd, handler, NULL, vector);
469 }
470 
471 /*
472  * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid
473  * fd to kernel.
474  */
475 static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
476 {
477     g_autofree struct vfio_irq_set *irq_set = NULL;
478     int argsz;
479     int32_t *fd;
480 
481     argsz = sizeof(*irq_set) + sizeof(*fd);
482 
483     irq_set = g_malloc0(argsz);
484     irq_set->argsz = argsz;
485     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
486                      VFIO_IRQ_SET_ACTION_TRIGGER;
487     irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
488     irq_set->start = 0;
489     irq_set->count = 1;
490     fd = (int32_t *)&irq_set->data;
491     *fd = -1;
492 
493     return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
494 }
495 
496 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
497 {
498     struct vfio_irq_set *irq_set;
499     int ret = 0, i, argsz;
500     int32_t *fds;
501 
502     /*
503      * If dynamic MSI-X allocation is supported, the vectors to be allocated
504      * and enabled can be scattered. Before kernel enabling MSI-X, setting
505      * nr_vectors causes all these vectors to be allocated on host.
506      *
507      * To keep allocation as needed, use vector 0 with an invalid fd to get
508      * MSI-X enabled first, then set vectors with a potentially sparse set of
509      * eventfds to enable interrupts only when enabled in guest.
510      */
511     if (msix && !vdev->msix->noresize) {
512         ret = vfio_enable_msix_no_vec(vdev);
513 
514         if (ret) {
515             return ret;
516         }
517     }
518 
519     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
520 
521     irq_set = g_malloc0(argsz);
522     irq_set->argsz = argsz;
523     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
524     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
525     irq_set->start = 0;
526     irq_set->count = vdev->nr_vectors;
527     fds = (int32_t *)&irq_set->data;
528 
529     for (i = 0; i < vdev->nr_vectors; i++) {
530         int fd = -1;
531 
532         /*
533          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
534          * bits, therefore we always use the KVM signaling path when setup.
535          * MSI-X mask and pending bits are emulated, so we want to use the
536          * KVM signaling path only when configured and unmasked.
537          */
538         if (vdev->msi_vectors[i].use) {
539             if (vdev->msi_vectors[i].virq < 0 ||
540                 (msix && msix_is_masked(&vdev->pdev, i))) {
541                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
542             } else {
543                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
544             }
545         }
546 
547         fds[i] = fd;
548     }
549 
550     ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
551 
552     g_free(irq_set);
553 
554     return ret;
555 }
556 
557 void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
558                                int vector_n, bool msix)
559 {
560     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
561         return;
562     }
563 
564     vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change,
565                                              vector_n, &vdev->pdev);
566 }
567 
568 static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr)
569 {
570     const char *name = "kvm_interrupt";
571 
572     if (vector->virq < 0) {
573         return;
574     }
575 
576     if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr,
577                             NULL)) {
578         goto fail_notifier;
579     }
580 
581     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
582                                            NULL, vector->virq) < 0) {
583         goto fail_kvm;
584     }
585 
586     return;
587 
588 fail_kvm:
589     vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr);
590 fail_notifier:
591     kvm_irqchip_release_virq(kvm_state, vector->virq);
592     vector->virq = -1;
593 }
594 
595 static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
596                                      int nr)
597 {
598     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
599                                           vector->virq);
600     kvm_irqchip_release_virq(kvm_state, vector->virq);
601     vector->virq = -1;
602     vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr);
603 }
604 
605 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
606                                      PCIDevice *pdev)
607 {
608     kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
609     kvm_irqchip_commit_routes(kvm_state);
610 }
611 
612 static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector,
613                                unsigned int nr)
614 {
615     Error *err = NULL;
616     int32_t fd;
617 
618     if (vector->virq >= 0) {
619         fd = event_notifier_get_fd(&vector->kvm_interrupt);
620     } else {
621         fd = event_notifier_get_fd(&vector->interrupt);
622     }
623 
624     if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
625                                        VFIO_IRQ_SET_ACTION_TRIGGER,
626                                        fd, &err)) {
627         error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
628     }
629 }
630 
631 void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr)
632 {
633     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
634     PCIDevice *pdev = &vdev->pdev;
635     Error *local_err = NULL;
636 
637     vector->vdev = vdev;
638     vector->virq = -1;
639     if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr,
640                             &local_err)) {
641         error_report_err(local_err);
642     }
643     vector->use = true;
644     if (vdev->interrupt == VFIO_INT_MSIX) {
645         msix_vector_use(pdev, nr);
646     }
647 }
648 
/*
 * Start using MSI-X vector @nr of @pdev.
 *
 * @msg:     message for the vector, or NULL.  With a message, a KVM route
 *           is added or updated; with NULL, an existing KVM route is
 *           removed.
 * @handler: fd handler installed on the vector's QEMU-side eventfd.
 *
 * Always returns 0; failures along the way are only reported.
 */
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    bool resizing = !!(vdev->nr_vectors < nr + 1);

    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);

    if (!vector->use) {
        vfio_pci_vector_init(vdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (msg) {
            vfio_update_kvm_msi_virq(vector, *msg, pdev);
        } else {
            vfio_remove_kvm_msi_virq(vdev, vector, nr);
        }
    } else if (msg) {
        if (vdev->defer_kvm_irq_routing) {
            /* Part of a batch: the commit and connect happen later. */
            vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
        } else {
            vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
            vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true);
            kvm_irqchip_commit_route_changes(&vfio_route_change);
            vfio_connect_kvm_msi_virq(vector, nr);
        }
    }

    /*
     * When dynamic allocation is not supported, we don't want to have the
     * host allocate all possible MSI vectors for a device if they're not
     * in use, so we shutdown and incrementally increase them as needed.
     * nr_vectors represents the total number of vectors allocated.
     *
     * When dynamic allocation is supported, let the host only allocate
     * and enable a vector when it is in use in guest. nr_vectors represents
     * the upper bound of vectors being enabled (but not all of the ranges
     * is allocated or enabled).
     */
    if (resizing) {
        vdev->nr_vectors = nr + 1;
    }

    if (!vdev->defer_kvm_irq_routing) {
        if (vdev->msix->noresize && resizing) {
            int ret;

            vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
            ret = vfio_enable_vectors(vdev, true);
            if (ret) {
                error_report("vfio: failed to enable vectors, %s",
                             strerror(-ret));
            }
        } else {
            set_irq_signalling(&vdev->vbasedev, vector, nr);
        }
    }

    /* Disable PBA emulation when nothing more is pending. */
    clear_bit(nr, vdev->msix->pending);
    if (find_first_bit(vdev->msix->pending,
                       vdev->nr_vectors) == vdev->nr_vectors) {
        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
    }

    return 0;
}
729 
730 static int vfio_msix_vector_use(PCIDevice *pdev,
731                                 unsigned int nr, MSIMessage msg)
732 {
733     /*
734      * Ignore the callback from msix_set_vector_notifiers during resume.
735      * The necessary subset of these actions is called from
736      * vfio_cpr_claim_vectors during post load.
737      */
738     if (cpr_is_incoming()) {
739         return 0;
740     }
741 
742     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
743 }
744 
745 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
746 {
747     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
748     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
749 
750     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
751 
752     /*
753      * There are still old guests that mask and unmask vectors on every
754      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
755      * the KVM setup in place, simply switch VFIO to use the non-bypass
756      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
757      * core will mask the interrupt and set pending bits, allowing it to
758      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
759      */
760     if (vector->virq >= 0) {
761         int32_t fd = event_notifier_get_fd(&vector->interrupt);
762         Error *err = NULL;
763 
764         if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
765                                     nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
766                                     &err)) {
767             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
768         }
769     }
770 }
771 
772 void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev)
773 {
774     msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
775                               vfio_msix_vector_release, NULL);
776 }
777 
/*
 * Open a batch of KVM MSI route changes for @vdev: routing is marked as
 * deferred and a new route-change set is started.  Batches must not nest
 * (asserted).
 */
void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
    assert(!vdev->defer_kvm_irq_routing);

    vdev->defer_kvm_irq_routing = true;
    vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
}
784 
/*
 * Close the batch opened by vfio_pci_prepare_kvm_msi_virq_batch(): clear
 * the deferred flag, commit the accumulated route changes, then connect the
 * KVM eventfd of every vector in [0, nr_vectors).  A batch must be open
 * (asserted).
 */
void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
    int vec;

    assert(vdev->defer_kvm_irq_routing);
    vdev->defer_kvm_irq_routing = false;

    kvm_irqchip_commit_route_changes(&vfio_route_change);

    /* Vectors without a virq are skipped by the connect helper itself. */
    for (vec = 0; vec < vdev->nr_vectors; vec++) {
        vfio_connect_kvm_msi_virq(&vdev->msi_vectors[vec], vec);
    }
}
798 
/*
 * Put the device into MSI-X mode: disable whatever interrupt mode was
 * active, allocate per-vector state for every MSI-X table entry, register
 * the vector notifiers and program the device.  Failures are reported but
 * not propagated.
 */
static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    int ret;

    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Setting vector notifiers triggers synchronous vector-use
     * callbacks for each active vector.  Deferring to commit the KVM
     * routes once rather than per vector provides a substantial
     * performance improvement.
     */
    vfio_pci_prepare_kvm_msi_virq_batch(vdev);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    vfio_pci_commit_kvm_msi_virq_batch(vdev);

    if (!vdev->nr_vectors) {
        /*
         * Some communication channels between VF & PF or PF & fw rely on the
         * physical state of the device and expect that enabling MSI-X from the
         * guest enables the same on the host.  When our guest is Linux, the
         * guest driver call to pci_enable_msix() sets the enabling bit in the
         * MSI-X capability, but leaves the vector table masked.  We therefore
         * can't rely on a vector_use callback (from request_irq() in the guest)
         * to switch the physical device into MSI-X mode because that may come a
         * long time after pci_enable_msix().  This code sets vector 0 with an
         * invalid fd to make the physical device MSI-X enabled, but with no
         * vectors enabled, just like the guest view.
         */
        ret = vfio_enable_msix_no_vec(vdev);
        if (ret) {
            error_report("vfio: failed to enable MSI-X, %s",
                         strerror(-ret));
        }
    } else {
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %s",
                         strerror(-ret));
        }
    }

    trace_vfio_msix_enable(vdev->vbasedev.name);
}
852 
/*
 * Put the device into MSI mode.
 *
 * Starts with the number of vectors the guest allocated.  If programming
 * the device returns a positive value, everything is torn down and the
 * setup is retried with that value as the vector count.  On a negative
 * return the device is left with interrupts disabled.
 */
static void vfio_msi_enable(VFIOPCIDevice *vdev)
{
    int ret;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);

    for (;;) {
        int idx;

        /*
         * Setting vector notifiers needs to enable route for each vector.
         * Deferring to commit the KVM routes once rather than per vector
         * provides a substantial performance improvement.
         */
        vfio_pci_prepare_kvm_msi_virq_batch(vdev);

        vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);

        for (idx = 0; idx < vdev->nr_vectors; idx++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[idx];
            Error *local_err = NULL;

            vector->vdev = vdev;
            vector->virq = -1;
            vector->use = true;

            if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt",
                                    idx, &local_err)) {
                error_report_err(local_err);
            }

            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                vfio_msi_interrupt, NULL, vector);

            /*
             * Attempt to enable route through KVM irqchip,
             * default to userspace handling if unavailable.
             */
            vfio_pci_add_kvm_msi_virq(vdev, vector, idx, false);
        }

        vfio_pci_commit_kvm_msi_virq_batch(vdev);

        /* Set interrupt type prior to possible interrupts */
        vdev->interrupt = VFIO_INT_MSI;

        ret = vfio_enable_vectors(vdev, false);
        if (ret == 0) {
            break;
        }

        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %s",
                         strerror(-ret));
        } else {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        vfio_msi_disable_common(vdev);

        if (ret < 0) {
            /*
             * Failing to setup MSI doesn't really fall within any
             * specification.  Let's try leaving interrupts disabled and
             * hope the guest figures out to fall back to INTx for this
             * device.
             */
            error_report("vfio: Error: Failed to enable MSI");
            return;
        }

        /* Positive return: go around again with that many vectors. */
        vdev->nr_vectors = ret;
    }

    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
}
927 
928 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
929 {
930     int i;
931 
932     for (i = 0; i < vdev->nr_vectors; i++) {
933         VFIOMSIVector *vector = &vdev->msi_vectors[i];
934         if (vdev->msi_vectors[i].use) {
935             if (vector->virq >= 0) {
936                 vfio_remove_kvm_msi_virq(vdev, vector, i);
937             }
938             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
939                                 NULL, NULL, NULL);
940             vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i);
941         }
942     }
943 
944     g_free(vdev->msi_vectors);
945     vdev->msi_vectors = NULL;
946     vdev->nr_vectors = 0;
947     vdev->interrupt = VFIO_INT_NONE;
948 }
949 
950 static void vfio_msix_disable(VFIOPCIDevice *vdev)
951 {
952     Error *err = NULL;
953     int i;
954 
955     msix_unset_vector_notifiers(&vdev->pdev);
956 
957     /*
958      * MSI-X will only release vectors if MSI-X is still enabled on the
959      * device, check through the rest and release it ourselves if necessary.
960      */
961     for (i = 0; i < vdev->nr_vectors; i++) {
962         if (vdev->msi_vectors[i].use) {
963             vfio_msix_vector_release(&vdev->pdev, i);
964             msix_vector_unuse(&vdev->pdev, i);
965         }
966     }
967 
968     /*
969      * Always clear MSI-X IRQ index. A PF device could have enabled
970      * MSI-X with no vectors. See vfio_msix_enable().
971      */
972     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
973 
974     vfio_msi_disable_common(vdev);
975     if (!vfio_intx_enable(vdev, &err)) {
976         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
977     }
978 
979     memset(vdev->msix->pending, 0,
980            BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
981 
982     trace_vfio_msix_disable(vdev->vbasedev.name);
983 }
984 
985 static void vfio_msi_disable(VFIOPCIDevice *vdev)
986 {
987     Error *err = NULL;
988 
989     vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
990     vfio_msi_disable_common(vdev);
991     vfio_intx_enable(vdev, &err);
992     if (err) {
993         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
994     }
995 
996     trace_vfio_msi_disable(vdev->vbasedev.name);
997 }
998 
999 static void vfio_update_msi(VFIOPCIDevice *vdev)
1000 {
1001     int i;
1002 
1003     for (i = 0; i < vdev->nr_vectors; i++) {
1004         VFIOMSIVector *vector = &vdev->msi_vectors[i];
1005         MSIMessage msg;
1006 
1007         if (!vector->use || vector->virq < 0) {
1008             continue;
1009         }
1010 
1011         msg = msi_get_message(&vdev->pdev, i);
1012         vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
1013     }
1014 }
1015 
1016 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
1017 {
1018     VFIODevice *vbasedev = &vdev->vbasedev;
1019     struct vfio_region_info *reg_info = NULL;
1020     uint64_t size;
1021     off_t off = 0;
1022     ssize_t bytes;
1023     int ret;
1024 
1025     ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX,
1026                                       &reg_info);
1027 
1028     if (ret != 0) {
1029         error_report("vfio: Error getting ROM info: %s", strerror(-ret));
1030         return;
1031     }
1032 
1033     trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size,
1034                             (unsigned long)reg_info->offset,
1035                             (unsigned long)reg_info->flags);
1036 
1037     vdev->rom_size = size = reg_info->size;
1038     vdev->rom_offset = reg_info->offset;
1039 
1040     if (!vdev->rom_size) {
1041         vdev->rom_read_failed = true;
1042         error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name);
1043         error_printf("Device option ROM contents are probably invalid "
1044                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
1045                     "or load from file with romfile=\n");
1046         return;
1047     }
1048 
1049     vdev->rom = g_malloc(size);
1050     memset(vdev->rom, 0xff, size);
1051 
1052     while (size) {
1053         bytes = vbasedev->io_ops->region_read(vbasedev,
1054                                               VFIO_PCI_ROM_REGION_INDEX,
1055                                               off, size, vdev->rom + off);
1056 
1057         if (bytes == 0) {
1058             break;
1059         } else if (bytes > 0) {
1060             off += bytes;
1061             size -= bytes;
1062         } else {
1063             if (bytes == -EINTR || bytes == -EAGAIN) {
1064                 continue;
1065             }
1066             error_report("vfio: Error reading device ROM: %s",
1067                          strreaderror(bytes));
1068 
1069             break;
1070         }
1071     }
1072 
1073     /*
1074      * Test the ROM signature against our device, if the vendor is correct
1075      * but the device ID doesn't match, store the correct device ID and
1076      * recompute the checksum.  Intel IGD devices need this and are known
1077      * to have bogus checksums so we can't simply adjust the checksum.
1078      */
1079     if (pci_get_word(vdev->rom) == 0xaa55 &&
1080         pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
1081         !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
1082         uint16_t vid, did;
1083 
1084         vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
1085         did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);
1086 
1087         if (vid == vdev->vendor_id && did != vdev->device_id) {
1088             int i;
1089             uint8_t csum, *data = vdev->rom;
1090 
1091             pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
1092                          vdev->device_id);
1093             data[6] = 0;
1094 
1095             for (csum = 0, i = 0; i < vdev->rom_size; i++) {
1096                 csum += data[i];
1097             }
1098 
1099             data[6] = -csum;
1100         }
1101     }
1102 }
1103 
1104 /* "Raw" read of underlying config space. */
1105 static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
1106                                       uint32_t size, void *data)
1107 {
1108     return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev,
1109                                               VFIO_PCI_CONFIG_REGION_INDEX,
1110                                               offset, size, data);
1111 }
1112 
1113 /* "Raw" write of underlying config space. */
1114 static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
1115                                        uint32_t size, void *data)
1116 {
1117     return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
1118                                                VFIO_PCI_CONFIG_REGION_INDEX,
1119                                                offset, size, data, false);
1120 }
1121 
1122 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
1123 {
1124     VFIOPCIDevice *vdev = opaque;
1125     union {
1126         uint8_t byte;
1127         uint16_t word;
1128         uint32_t dword;
1129         uint64_t qword;
1130     } val;
1131     uint64_t data = 0;
1132 
1133     /* Load the ROM lazily when the guest tries to read it */
1134     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
1135         vfio_pci_load_rom(vdev);
1136     }
1137 
1138     memcpy(&val, vdev->rom + addr,
1139            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
1140 
1141     switch (size) {
1142     case 1:
1143         data = val.byte;
1144         break;
1145     case 2:
1146         data = le16_to_cpu(val.word);
1147         break;
1148     case 4:
1149         data = le32_to_cpu(val.dword);
1150         break;
1151     default:
1152         hw_error("vfio: unsupported read size, %d bytes\n", size);
1153         break;
1154     }
1155 
1156     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
1157 
1158     return data;
1159 }
1160 
1161 static void vfio_rom_write(void *opaque, hwaddr addr,
1162                            uint64_t data, unsigned size)
1163 {
1164 }
1165 
1166 static const MemoryRegionOps vfio_rom_ops = {
1167     .read = vfio_rom_read,
1168     .write = vfio_rom_write,
1169     .endianness = DEVICE_LITTLE_ENDIAN,
1170 };
1171 
1172 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
1173 {
1174     VFIODevice *vbasedev = &vdev->vbasedev;
1175     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
1176     char *name;
1177 
1178     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
1179         /* Since pci handles romfile, just print a message and return */
1180         if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) {
1181             warn_report("Device at %s is known to cause system instability"
1182                         " issues during option rom execution",
1183                         vdev->vbasedev.name);
1184             error_printf("Proceeding anyway since user specified romfile\n");
1185         }
1186         return;
1187     }
1188 
1189     /*
1190      * Use the same size ROM BAR as the physical device.  The contents
1191      * will get filled in later when the guest tries to read it.
1192      */
1193     if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 ||
1194         vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
1195         vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
1196         vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) {
1197 
1198         error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
1199         return;
1200     }
1201 
1202     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
1203 
1204     if (!size) {
1205         return;
1206     }
1207 
1208     if (vfio_opt_rom_in_denylist(vdev)) {
1209         if (vdev->pdev.rom_bar > 0) {
1210             warn_report("Device at %s is known to cause system instability"
1211                         " issues during option rom execution",
1212                         vdev->vbasedev.name);
1213             error_printf("Proceeding anyway since user specified"
1214                          " positive value for rombar\n");
1215         } else {
1216             warn_report("Rom loading for device at %s has been disabled"
1217                         " due to system instability issues",
1218                         vdev->vbasedev.name);
1219             error_printf("Specify rombar=1 or romfile to force\n");
1220             return;
1221         }
1222     }
1223 
1224     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
1225 
1226     name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);
1227 
1228     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
1229                           &vfio_rom_ops, vdev, name, size);
1230     g_free(name);
1231 
1232     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
1233                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
1234 
1235     vdev->rom_read_failed = false;
1236 }
1237 
1238 void vfio_vga_write(void *opaque, hwaddr addr,
1239                            uint64_t data, unsigned size)
1240 {
1241     VFIOVGARegion *region = opaque;
1242     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1243     union {
1244         uint8_t byte;
1245         uint16_t word;
1246         uint32_t dword;
1247         uint64_t qword;
1248     } buf;
1249     off_t offset = vga->fd_offset + region->offset + addr;
1250 
1251     switch (size) {
1252     case 1:
1253         buf.byte = data;
1254         break;
1255     case 2:
1256         buf.word = cpu_to_le16(data);
1257         break;
1258     case 4:
1259         buf.dword = cpu_to_le32(data);
1260         break;
1261     default:
1262         hw_error("vfio: unsupported write size, %d bytes", size);
1263         break;
1264     }
1265 
1266     if (pwrite(vga->fd, &buf, size, offset) != size) {
1267         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1268                      __func__, region->offset + addr, data, size);
1269     }
1270 
1271     trace_vfio_vga_write(region->offset + addr, data, size);
1272 }
1273 
1274 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1275 {
1276     VFIOVGARegion *region = opaque;
1277     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1278     union {
1279         uint8_t byte;
1280         uint16_t word;
1281         uint32_t dword;
1282         uint64_t qword;
1283     } buf;
1284     uint64_t data = 0;
1285     off_t offset = vga->fd_offset + region->offset + addr;
1286 
1287     if (pread(vga->fd, &buf, size, offset) != size) {
1288         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1289                      __func__, region->offset + addr, size);
1290         return (uint64_t)-1;
1291     }
1292 
1293     switch (size) {
1294     case 1:
1295         data = buf.byte;
1296         break;
1297     case 2:
1298         data = le16_to_cpu(buf.word);
1299         break;
1300     case 4:
1301         data = le32_to_cpu(buf.dword);
1302         break;
1303     default:
1304         hw_error("vfio: unsupported read size, %d bytes", size);
1305         break;
1306     }
1307 
1308     trace_vfio_vga_read(region->offset + addr, size, data);
1309 
1310     return data;
1311 }
1312 
1313 static const MemoryRegionOps vfio_vga_ops = {
1314     .read = vfio_vga_read,
1315     .write = vfio_vga_write,
1316     .endianness = DEVICE_LITTLE_ENDIAN,
1317 };
1318 
1319 /*
1320  * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page
1321  * size if the BAR is in an exclusive page in host so that we could map
1322  * this BAR to guest. But this sub-page BAR may not occupy an exclusive
1323  * page in guest. So we should set the priority of the expanded memory
1324  * region to zero in case of overlap with BARs which share the same page
1325  * with the sub-page BAR in guest. Besides, we should also recover the
1326  * size of this sub-page BAR when its base address is changed in guest
1327  * and not page aligned any more.
1328  */
1329 static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
1330 {
1331     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1332     VFIORegion *region = &vdev->bars[bar].region;
1333     MemoryRegion *mmap_mr, *region_mr, *base_mr;
1334     PCIIORegion *r;
1335     pcibus_t bar_addr;
1336     uint64_t size = region->size;
1337 
1338     /* Make sure that the whole region is allowed to be mmapped */
1339     if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
1340         region->mmaps[0].size != region->size) {
1341         return;
1342     }
1343 
1344     r = &pdev->io_regions[bar];
1345     bar_addr = r->addr;
1346     base_mr = vdev->bars[bar].mr;
1347     region_mr = region->mem;
1348     mmap_mr = &region->mmaps[0].mem;
1349 
1350     /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
1351     if (bar_addr != PCI_BAR_UNMAPPED &&
1352         !(bar_addr & ~qemu_real_host_page_mask())) {
1353         size = qemu_real_host_page_size();
1354     }
1355 
1356     memory_region_transaction_begin();
1357 
1358     if (vdev->bars[bar].size < size) {
1359         memory_region_set_size(base_mr, size);
1360     }
1361     memory_region_set_size(region_mr, size);
1362     memory_region_set_size(mmap_mr, size);
1363     if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
1364         memory_region_del_subregion(r->address_space, base_mr);
1365         memory_region_add_subregion_overlap(r->address_space,
1366                                             bar_addr, base_mr, 0);
1367     }
1368 
1369     memory_region_transaction_commit();
1370 }
1371 
1372 /*
1373  * PCI config space
1374  */
1375 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1376 {
1377     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1378     VFIODevice *vbasedev = &vdev->vbasedev;
1379     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1380 
1381     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1382     emu_bits = le32_to_cpu(emu_bits);
1383 
1384     if (emu_bits) {
1385         emu_val = pci_default_read_config(pdev, addr, len);
1386     }
1387 
1388     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1389         ssize_t ret;
1390 
1391         ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val);
1392         if (ret != len) {
1393             error_report("%s(%s, 0x%x, 0x%x) failed: %s",
1394                          __func__, vbasedev->name, addr, len,
1395                          strreaderror(ret));
1396             return -1;
1397         }
1398         phys_val = le32_to_cpu(phys_val);
1399     }
1400 
1401     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1402 
1403     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1404 
1405     return val;
1406 }
1407 
1408 void vfio_pci_write_config(PCIDevice *pdev,
1409                            uint32_t addr, uint32_t val, int len)
1410 {
1411     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
1412     VFIODevice *vbasedev = &vdev->vbasedev;
1413     uint32_t val_le = cpu_to_le32(val);
1414     int ret;
1415 
1416     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1417 
1418     /* Write everything to VFIO, let it filter out what we can't write */
1419     ret = vfio_pci_config_space_write(vdev, addr, len, &val_le);
1420     if (ret != len) {
1421         error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
1422                      __func__, vbasedev->name, addr, val, len,
1423                     strwriteerror(ret));
1424     }
1425 
1426     /* MSI/MSI-X Enabling/Disabling */
1427     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1428         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1429         int is_enabled, was_enabled = msi_enabled(pdev);
1430 
1431         pci_default_write_config(pdev, addr, val, len);
1432 
1433         is_enabled = msi_enabled(pdev);
1434 
1435         if (!was_enabled) {
1436             if (is_enabled) {
1437                 vfio_msi_enable(vdev);
1438             }
1439         } else {
1440             if (!is_enabled) {
1441                 vfio_msi_disable(vdev);
1442             } else {
1443                 vfio_update_msi(vdev);
1444             }
1445         }
1446     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1447         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1448         int is_enabled, was_enabled = msix_enabled(pdev);
1449 
1450         pci_default_write_config(pdev, addr, val, len);
1451 
1452         is_enabled = msix_enabled(pdev);
1453 
1454         if (!was_enabled && is_enabled) {
1455             vfio_msix_enable(vdev);
1456         } else if (was_enabled && !is_enabled) {
1457             vfio_msix_disable(vdev);
1458         }
1459     } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
1460         range_covers_byte(addr, len, PCI_COMMAND)) {
1461         pcibus_t old_addr[PCI_NUM_REGIONS - 1];
1462         int bar;
1463 
1464         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1465             old_addr[bar] = pdev->io_regions[bar].addr;
1466         }
1467 
1468         pci_default_write_config(pdev, addr, val, len);
1469 
1470         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1471             if (old_addr[bar] != pdev->io_regions[bar].addr &&
1472                 vdev->bars[bar].region.size > 0 &&
1473                 vdev->bars[bar].region.size < qemu_real_host_page_size()) {
1474                 vfio_sub_page_bar_update_mapping(pdev, bar);
1475             }
1476         }
1477     } else {
1478         /* Write everything to QEMU to keep emulated bits correct */
1479         pci_default_write_config(pdev, addr, val, len);
1480     }
1481 }
1482 
1483 /*
1484  * Interrupt setup
1485  */
1486 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1487 {
1488     /*
1489      * More complicated than it looks.  Disabling MSI/X transitions the
1490      * device to INTx mode (if supported).  Therefore we need to first
1491      * disable MSI/X and then cleanup by disabling INTx.
1492      */
1493     if (vdev->interrupt == VFIO_INT_MSIX) {
1494         vfio_msix_disable(vdev);
1495     } else if (vdev->interrupt == VFIO_INT_MSI) {
1496         vfio_msi_disable(vdev);
1497     }
1498 
1499     if (vdev->interrupt == VFIO_INT_INTx) {
1500         vfio_intx_disable(vdev);
1501     }
1502 }
1503 
1504 static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1505 {
1506     uint16_t ctrl;
1507     bool msi_64bit, msi_maskbit;
1508     int ret, entries;
1509     Error *err = NULL;
1510 
1511     ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS,
1512                                      sizeof(ctrl), &ctrl);
1513     if (ret != sizeof(ctrl)) {
1514         error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s",
1515                    strreaderror(ret));
1516         return false;
1517     }
1518     ctrl = le16_to_cpu(ctrl);
1519 
1520     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1521     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1522     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1523 
1524     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1525 
1526     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
1527     if (ret < 0) {
1528         if (ret == -ENOTSUP) {
1529             return true;
1530         }
1531         error_propagate_prepend(errp, err, "msi_init failed: ");
1532         return false;
1533     }
1534     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
1535 
1536     return true;
1537 }
1538 
1539 static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
1540 {
1541     off_t start, end;
1542     VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
1543 
1544     /*
1545      * If the host driver allows mapping of a MSIX data, we are going to
1546      * do map the entire BAR and emulate MSIX table on top of that.
1547      */
1548     if (vfio_device_has_region_cap(&vdev->vbasedev, region->nr,
1549                                    VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
1550         return;
1551     }
1552 
1553     /*
1554      * We expect to find a single mmap covering the whole BAR, anything else
1555      * means it's either unsupported or already setup.
1556      */
1557     if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
1558         region->size != region->mmaps[0].size) {
1559         return;
1560     }
1561 
1562     /* MSI-X table start and end aligned to host page size */
1563     start = vdev->msix->table_offset & qemu_real_host_page_mask();
1564     end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
1565                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
1566 
1567     /*
1568      * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
1569      * NB - Host page size is necessarily a power of two and so is the PCI
1570      * BAR (not counting EA yet), therefore if we have host page aligned
1571      * @start and @end, then any remainder of the BAR before or after those
1572      * must be at least host page sized and therefore mmap'able.
1573      */
1574     if (!start) {
1575         if (end >= region->size) {
1576             region->nr_mmaps = 0;
1577             g_free(region->mmaps);
1578             region->mmaps = NULL;
1579             trace_vfio_msix_fixup(vdev->vbasedev.name,
1580                                   vdev->msix->table_bar, 0, 0);
1581         } else {
1582             region->mmaps[0].offset = end;
1583             region->mmaps[0].size = region->size - end;
1584             trace_vfio_msix_fixup(vdev->vbasedev.name,
1585                               vdev->msix->table_bar, region->mmaps[0].offset,
1586                               region->mmaps[0].offset + region->mmaps[0].size);
1587         }
1588 
1589     /* Maybe it's aligned at the end of the BAR */
1590     } else if (end >= region->size) {
1591         region->mmaps[0].size = start;
1592         trace_vfio_msix_fixup(vdev->vbasedev.name,
1593                               vdev->msix->table_bar, region->mmaps[0].offset,
1594                               region->mmaps[0].offset + region->mmaps[0].size);
1595 
1596     /* Otherwise it must split the BAR */
1597     } else {
1598         region->nr_mmaps = 2;
1599         region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
1600 
1601         memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));
1602 
1603         region->mmaps[0].size = start;
1604         trace_vfio_msix_fixup(vdev->vbasedev.name,
1605                               vdev->msix->table_bar, region->mmaps[0].offset,
1606                               region->mmaps[0].offset + region->mmaps[0].size);
1607 
1608         region->mmaps[1].offset = end;
1609         region->mmaps[1].size = region->size - end;
1610         trace_vfio_msix_fixup(vdev->vbasedev.name,
1611                               vdev->msix->table_bar, region->mmaps[1].offset,
1612                               region->mmaps[1].offset + region->mmaps[1].size);
1613     }
1614 }
1615 
1616 static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
1617 {
1618     int target_bar = -1;
1619     size_t msix_sz;
1620 
1621     if (!vdev->msix || vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1622         return true;
1623     }
1624 
1625     /* The actual minimum size of MSI-X structures */
1626     msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
1627               (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
1628     /* Round up to host pages, we don't want to share a page */
1629     msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
1630     /* PCI BARs must be a power of 2 */
1631     msix_sz = pow2ceil(msix_sz);
1632 
1633     if (vdev->msix_relo == OFF_AUTO_PCIBAR_AUTO) {
1634         /*
1635          * TODO: Lookup table for known devices.
1636          *
1637          * Logically we might use an algorithm here to select the BAR adding
1638          * the least additional MMIO space, but we cannot programmatically
1639          * predict the driver dependency on BAR ordering or sizing, therefore
1640          * 'auto' becomes a lookup for combinations reported to work.
1641          */
1642         if (target_bar < 0) {
1643             error_setg(errp, "No automatic MSI-X relocation available for "
1644                        "device %04x:%04x", vdev->vendor_id, vdev->device_id);
1645             return false;
1646         }
1647     } else {
1648         target_bar = (int)(vdev->msix_relo - OFF_AUTO_PCIBAR_BAR0);
1649     }
1650 
1651     /* I/O port BARs cannot host MSI-X structures */
1652     if (vdev->bars[target_bar].ioport) {
1653         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1654                    "I/O port BAR", target_bar);
1655         return false;
1656     }
1657 
1658     /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
1659     if (!vdev->bars[target_bar].size &&
1660          target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
1661         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1662                    "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
1663         return false;
1664     }
1665 
1666     /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
1667     if (vdev->bars[target_bar].size > 1 * GiB &&
1668         !vdev->bars[target_bar].mem64) {
1669         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1670                    "no space to extend 32-bit BAR", target_bar);
1671         return false;
1672     }
1673 
1674     /*
1675      * If adding a new BAR, test if we can make it 64bit.  We make it
1676      * prefetchable since QEMU MSI-X emulation has no read side effects
1677      * and doing so makes mapping more flexible.
1678      */
1679     if (!vdev->bars[target_bar].size) {
1680         if (target_bar < (PCI_ROM_SLOT - 1) &&
1681             !vdev->bars[target_bar + 1].size) {
1682             vdev->bars[target_bar].mem64 = true;
1683             vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
1684         }
1685         vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
1686         vdev->bars[target_bar].size = msix_sz;
1687         vdev->msix->table_offset = 0;
1688     } else {
1689         vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
1690                                           msix_sz * 2);
1691         /*
1692          * Due to above size calc, MSI-X always starts halfway into the BAR,
1693          * which will always be a separate host page.
1694          */
1695         vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
1696     }
1697 
1698     vdev->msix->table_bar = target_bar;
1699     vdev->msix->pba_bar = target_bar;
1700     /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
1701     vdev->msix->pba_offset = vdev->msix->table_offset +
1702                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
1703 
1704     trace_vfio_msix_relo(vdev->vbasedev.name,
1705                          vdev->msix->table_bar, vdev->msix->table_offset);
1706     return true;
1707 }
1708 
1709 /*
1710  * We don't have any control over how pci_add_capability() inserts
1711  * capabilities into the chain.  In order to setup MSI-X we need a
1712  * MemoryRegion for the BAR.  In order to setup the BAR and not
1713  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1714  * need to first look for where the MSI-X table lives.  So we
1715  * unfortunately split MSI-X setup across two functions.
1716  */
1717 static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
1718 {
1719     uint8_t pos;
1720     uint16_t ctrl;
1721     uint32_t table, pba;
1722     struct vfio_irq_info irq_info;
1723     VFIOMSIXInfo *msix;
1724     int ret;
1725 
1726     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
1727     if (!pos) {
1728         return true;
1729     }
1730 
1731     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS,
1732                                      sizeof(ctrl), &ctrl);
1733     if (ret != sizeof(ctrl)) {
1734         error_setg(errp, "failed to read PCI MSIX FLAGS: %s",
1735                    strreaderror(ret));
1736         return false;
1737     }
1738 
1739     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE,
1740                                      sizeof(table), &table);
1741     if (ret != sizeof(table)) {
1742         error_setg(errp, "failed to read PCI MSIX TABLE: %s",
1743                    strreaderror(ret));
1744         return false;
1745     }
1746 
1747     ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA,
1748                                      sizeof(pba), &pba);
1749     if (ret != sizeof(pba)) {
1750         error_setg(errp, "failed to read PCI MSIX PBA: %s", strreaderror(ret));
1751         return false;
1752     }
1753 
1754     ctrl = le16_to_cpu(ctrl);
1755     table = le32_to_cpu(table);
1756     pba = le32_to_cpu(pba);
1757 
1758     msix = g_malloc0(sizeof(*msix));
1759     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
1760     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
1761     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
1762     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
1763     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
1764 
1765     ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
1766                                    &irq_info);
1767     if (ret < 0) {
1768         error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
1769         g_free(msix);
1770         return false;
1771     }
1772 
1773     msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
1774 
1775     /*
1776      * Test the size of the pba_offset variable and catch if it extends outside
1777      * of the specified BAR. If it is the case, we need to apply a hardware
1778      * specific quirk if the device is known or we have a broken configuration.
1779      */
1780     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
1781         /*
1782          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
1783          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
1784          * the VF PBA offset while the BAR itself is only 8k. The correct value
1785          * is 0x1000, so we hard code that here.
1786          */
1787         if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
1788             (vdev->device_id & 0xff00) == 0x5800) {
1789             msix->pba_offset = 0x1000;
1790         /*
1791          * BAIDU KUNLUN Virtual Function devices for KUNLUN AI processor
1792          * return an incorrect value of 0x460000 for the VF PBA offset while
1793          * the BAR itself is only 0x10000.  The correct value is 0xb400.
1794          */
1795         } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU,
1796                                PCI_DEVICE_ID_KUNLUN_VF)) {
1797             msix->pba_offset = 0xb400;
1798         } else if (vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1799             error_setg(errp, "hardware reports invalid configuration, "
1800                        "MSIX PBA outside of specified BAR");
1801             g_free(msix);
1802             return false;
1803         }
1804     }
1805 
1806     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
1807                                 msix->table_offset, msix->entries,
1808                                 msix->noresize);
1809     vdev->msix = msix;
1810 
1811     vfio_pci_fixup_msix_region(vdev);
1812 
1813     return vfio_pci_relocate_msix(vdev, errp);
1814 }
1815 
1816 static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1817 {
1818     int ret;
1819     Error *err = NULL;
1820 
1821     vdev->msix->pending = g_new0(unsigned long,
1822                                  BITS_TO_LONGS(vdev->msix->entries));
1823     ret = msix_init(&vdev->pdev, vdev->msix->entries,
1824                     vdev->bars[vdev->msix->table_bar].mr,
1825                     vdev->msix->table_bar, vdev->msix->table_offset,
1826                     vdev->bars[vdev->msix->pba_bar].mr,
1827                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
1828                     &err);
1829     if (ret < 0) {
1830         if (ret == -ENOTSUP) {
1831             warn_report_err(err);
1832             return true;
1833         }
1834 
1835         error_propagate(errp, err);
1836         return false;
1837     }
1838 
1839     /*
1840      * The PCI spec suggests that devices provide additional alignment for
1841      * MSI-X structures and avoid overlapping non-MSI-X related registers.
1842      * For an assigned device, this hopefully means that emulation of MSI-X
1843      * structures does not affect the performance of the device.  If devices
1844      * fail to provide that alignment, a significant performance penalty may
1845      * result, for instance Mellanox MT27500 VFs:
1846      * http://www.spinics.net/lists/kvm/msg125881.html
1847      *
1848      * The PBA is simply not that important for such a serious regression and
1849      * most drivers do not appear to look at it.  The solution for this is to
1850      * disable the PBA MemoryRegion unless it's being used.  We disable it
1851      * here and only enable it if a masked vector fires through QEMU.  As the
1852      * vector-use notifier is called, which occurs on unmask, we test whether
1853      * PBA emulation is needed and again disable if not.
1854      */
1855     memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
1856 
1857     /*
1858      * The emulated machine may provide a paravirt interface for MSIX setup
1859      * so it is not strictly necessary to emulate MSIX here. This becomes
1860      * helpful when frequently accessed MMIO registers are located in
1861      * subpages adjacent to the MSIX table but the MSIX data containing page
1862      * cannot be mapped because of a host page size bigger than the MSIX table
1863      * alignment.
1864      */
1865     if (object_property_get_bool(OBJECT(qdev_get_machine()),
1866                                  "vfio-no-msix-emulation", NULL)) {
1867         memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
1868     }
1869 
1870     return true;
1871 }
1872 
1873 void vfio_pci_teardown_msi(VFIOPCIDevice *vdev)
1874 {
1875     msi_uninit(&vdev->pdev);
1876 
1877     if (vdev->msix) {
1878         msix_uninit(&vdev->pdev,
1879                     vdev->bars[vdev->msix->table_bar].mr,
1880                     vdev->bars[vdev->msix->pba_bar].mr);
1881         g_free(vdev->msix->pending);
1882     }
1883 }
1884 
1885 /*
1886  * Resource setup
1887  */
1888 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
1889 {
1890     int i;
1891 
1892     for (i = 0; i < PCI_ROM_SLOT; i++) {
1893         vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
1894     }
1895 }
1896 
1897 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
1898 {
1899     VFIOBAR *bar = &vdev->bars[nr];
1900 
1901     uint32_t pci_bar;
1902     int ret;
1903 
1904     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
1905     if (!bar->region.size) {
1906         return;
1907     }
1908 
1909     /* Determine what type of BAR this is for registration */
1910     ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr),
1911                                      sizeof(pci_bar), &pci_bar);
1912     if (ret != sizeof(pci_bar)) {
1913         error_report("vfio: Failed to read BAR %d: %s", nr, strreaderror(ret));
1914         return;
1915     }
1916 
1917     pci_bar = le32_to_cpu(pci_bar);
1918     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
1919     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
1920     bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
1921                                          ~PCI_BASE_ADDRESS_MEM_MASK);
1922     bar->size = bar->region.size;
1923 
1924     /* IO regions are sync, memory can be async */
1925     bar->region.post_wr = (bar->ioport == 0);
1926 }
1927 
1928 static void vfio_bars_prepare(VFIOPCIDevice *vdev)
1929 {
1930     int i;
1931 
1932     for (i = 0; i < PCI_ROM_SLOT; i++) {
1933         vfio_bar_prepare(vdev, i);
1934     }
1935 }
1936 
1937 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
1938 {
1939     VFIOBAR *bar = &vdev->bars[nr];
1940     char *name;
1941 
1942     if (!bar->size) {
1943         return;
1944     }
1945 
1946     bar->mr = g_new0(MemoryRegion, 1);
1947     name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
1948     memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
1949     g_free(name);
1950 
1951     if (bar->region.size) {
1952         memory_region_add_subregion(bar->mr, 0, bar->region.mem);
1953 
1954         if (vfio_region_mmap(&bar->region)) {
1955             error_report("Failed to mmap %s BAR %d. Performance may be slow",
1956                          vdev->vbasedev.name, nr);
1957         }
1958     }
1959 
1960     pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
1961 }
1962 
1963 static void vfio_bars_register(VFIOPCIDevice *vdev)
1964 {
1965     int i;
1966 
1967     for (i = 0; i < PCI_ROM_SLOT; i++) {
1968         vfio_bar_register(vdev, i);
1969     }
1970 }
1971 
1972 void vfio_pci_bars_exit(VFIOPCIDevice *vdev)
1973 {
1974     int i;
1975 
1976     for (i = 0; i < PCI_ROM_SLOT; i++) {
1977         VFIOBAR *bar = &vdev->bars[i];
1978 
1979         vfio_bar_quirk_exit(vdev, i);
1980         vfio_region_exit(&bar->region);
1981         if (bar->region.size) {
1982             memory_region_del_subregion(bar->mr, bar->region.mem);
1983         }
1984     }
1985 
1986     if (vdev->vga) {
1987         pci_unregister_vga(&vdev->pdev);
1988         vfio_vga_quirk_exit(vdev);
1989     }
1990 }
1991 
1992 static void vfio_bars_finalize(VFIOPCIDevice *vdev)
1993 {
1994     int i;
1995 
1996     for (i = 0; i < PCI_ROM_SLOT; i++) {
1997         VFIOBAR *bar = &vdev->bars[i];
1998 
1999         vfio_bar_quirk_finalize(vdev, i);
2000         vfio_region_finalize(&bar->region);
2001         if (bar->mr) {
2002             assert(bar->size);
2003             object_unparent(OBJECT(bar->mr));
2004             g_free(bar->mr);
2005             bar->mr = NULL;
2006         }
2007     }
2008 
2009     if (vdev->vga) {
2010         vfio_vga_quirk_finalize(vdev);
2011         for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
2012             object_unparent(OBJECT(&vdev->vga->region[i].mem));
2013         }
2014         g_free(vdev->vga);
2015     }
2016 }
2017 
2018 /*
2019  * General setup
2020  */
2021 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2022 {
2023     uint8_t tmp;
2024     uint16_t next = PCI_CONFIG_SPACE_SIZE;
2025 
2026     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2027          tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
2028         if (tmp > pos && tmp < next) {
2029             next = tmp;
2030         }
2031     }
2032 
2033     return next - pos;
2034 }
2035 
2036 
2037 static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
2038 {
2039     uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
2040 
2041     for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
2042         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
2043         if (tmp > pos && tmp < next) {
2044             next = tmp;
2045         }
2046     }
2047 
2048     return next - pos;
2049 }
2050 
2051 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
2052 {
2053     pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
2054 }
2055 
2056 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
2057                                    uint16_t val, uint16_t mask)
2058 {
2059     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2060     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2061     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2062 }
2063 
2064 static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
2065 {
2066     pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
2067 }
2068 
2069 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
2070                                    uint32_t val, uint32_t mask)
2071 {
2072     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2073     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2074     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2075 }
2076 
2077 static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev)
2078 {
2079     struct vfio_device_info_cap_pci_atomic_comp *cap;
2080     g_autofree struct vfio_device_info *info = NULL;
2081     PCIBus *bus = pci_get_bus(&vdev->pdev);
2082     PCIDevice *parent = bus->parent_dev;
2083     struct vfio_info_cap_header *hdr;
2084     uint32_t mask = 0;
2085     uint8_t *pos;
2086 
2087     /*
2088      * PCIe Atomic Ops completer support is only added automatically for single
2089      * function devices downstream of a root port supporting DEVCAP2.  Support
2090      * is added during realize and, if added, removed during device exit.  The
2091      * single function requirement avoids conflicting requirements should a
2092      * slot be composed of multiple devices with differing capabilities.
2093      */
2094     if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap ||
2095         pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT ||
2096         pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2 ||
2097         vdev->pdev.devfn ||
2098         vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
2099         return;
2100     }
2101 
2102     pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
2103 
2104     /* Abort if there'a already an Atomic Ops configuration on the root port */
2105     if (pci_get_long(pos) & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2106                              PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
2107                              PCI_EXP_DEVCAP2_ATOMIC_COMP128)) {
2108         return;
2109     }
2110 
2111     info = vfio_get_device_info(vdev->vbasedev.fd);
2112     if (!info) {
2113         return;
2114     }
2115 
2116     hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP);
2117     if (!hdr) {
2118         return;
2119     }
2120 
2121     cap = (void *)hdr;
2122     if (cap->flags & VFIO_PCI_ATOMIC_COMP32) {
2123         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP32;
2124     }
2125     if (cap->flags & VFIO_PCI_ATOMIC_COMP64) {
2126         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP64;
2127     }
2128     if (cap->flags & VFIO_PCI_ATOMIC_COMP128) {
2129         mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP128;
2130     }
2131 
2132     if (!mask) {
2133         return;
2134     }
2135 
2136     pci_long_test_and_set_mask(pos, mask);
2137     vdev->clear_parent_atomics_on_exit = true;
2138 }
2139 
2140 static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev)
2141 {
2142     if (vdev->clear_parent_atomics_on_exit) {
2143         PCIDevice *parent = pci_get_bus(&vdev->pdev)->parent_dev;
2144         uint8_t *pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
2145 
2146         pci_long_test_and_clear_mask(pos, PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2147                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
2148                                           PCI_EXP_DEVCAP2_ATOMIC_COMP128);
2149     }
2150 }
2151 
/*
 * Adjust and register the PCI Express capability found at @pos (of at most
 * @size bytes) so that it is consistent with where the device sits in the
 * guest topology.
 *
 * Only endpoint, legacy endpoint and root complex integrated endpoint types
 * are accepted; any other type fails with @errp set.  Depending on the bus
 * the device is attached to, the capability is either not added at all
 * (the function then returns true without registering it), or its type and
 * link registers are emulated before it is added with pci_add_capability().
 * On success the final position is stored in vdev->pdev.exp.exp_cap.
 *
 * Returns true on success (including the "capability skipped" cases),
 * false with @errp set on failure.
 */
static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
                                Error **errp)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {

        error_setg(errp, "assignment of PCIe type 0x%x "
                   "devices is not currently supported", type);
        return false;
    }

    if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
        PCIBus *bus = pci_get_bus(&vdev->pdev);
        PCIDevice *bridge;

        /*
         * Traditionally PCI device assignment exposes the PCIe capability
         * as-is on non-express buses.  The reason being that some drivers
         * simply assume that it's there, for example tg3.  However when
         * we're running on a native PCIe machine type, like Q35, we need
         * to hide the PCIe capability.  The reason for this is twofold;
         * first Windows guests get a Code 10 error when the PCIe capability
         * is exposed in this configuration.  Therefore express devices won't
         * work at all unless they're attached to express buses in the VM.
         * Second, a native PCIe machine introduces the possibility of fine
         * granularity IOMMUs supporting both translation and isolation.
         * Guest code to discover the IOMMU visibility of a device, such as
         * IOMMU grouping code on Linux, is very aware of device types and
         * valid transitions between bus types.  An express device on a non-
         * express bus is not a valid combination on bare metal systems.
         *
         * Drivers that require a PCIe capability to make the device
         * functional are simply going to need to have their devices placed
         * on a PCIe bus in the VM.
         */
        /* Climb through the bridges until the root bus is reached */
        while (!pci_bus_is_root(bus)) {
            bridge = pci_bridge_get_device(bus);
            bus = pci_get_bus(bridge);
        }

        /* Express root bus: hide the capability by not adding it */
        if (pci_bus_is_express(bus)) {
            return true;
        }

    } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control goes away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

/* Fallback register offsets for headers that do not define them */
#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control goes away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return true;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                           QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
                           QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }

        /* Express, non-root bus: the parent may be able to expose atomics */
        vfio_pci_enable_rp_atomics(vdev);
    }

    /*
     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
     * (Niantic erratum #35) causing Windows to error with a Code 10 for the
     * device on Q35.  Fixup any such devices to report version 1.  If we
     * were to remove the capability entirely the guest would lose extended
     * config space.
     */
    if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
        vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                               1, PCI_EXP_FLAGS_VERS);
    }

    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
                             errp);
    if (pos < 0) {
        return false;
    }

    vdev->pdev.exp.exp_cap = pos;

    return true;
}
2281 
2282 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
2283 {
2284     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2285 
2286     if (cap & PCI_EXP_DEVCAP_FLR) {
2287         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
2288         vdev->has_flr = true;
2289     }
2290 }
2291 
2292 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
2293 {
2294     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2295 
2296     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2297         trace_vfio_check_pm_reset(vdev->vbasedev.name);
2298         vdev->has_pm_reset = true;
2299     }
2300 }
2301 
2302 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
2303 {
2304     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2305 
2306     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2307         trace_vfio_check_af_flr(vdev->vbasedev.name);
2308         vdev->has_flr = true;
2309     }
2310 }
2311 
2312 static bool vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos,
2313                                          uint8_t size, Error **errp)
2314 {
2315     PCIDevice *pdev = &vdev->pdev;
2316 
2317     pos = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, size, errp);
2318     if (pos < 0) {
2319         return false;
2320     }
2321 
2322     /*
2323      * Exempt config space check for Vendor Specific Information during
2324      * restore/load.
2325      * Config space check is still enforced for 3 byte VSC header.
2326      */
2327     if (vdev->skip_vsc_check && size > 3) {
2328         memset(pdev->cmask + pos + 3, 0, size - 3);
2329     }
2330 
2331     return true;
2332 }
2333 
2334 static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
2335 {
2336     ERRP_GUARD();
2337     PCIDevice *pdev = &vdev->pdev;
2338     uint8_t cap_id, next, size;
2339     bool ret;
2340 
2341     cap_id = pdev->config[pos];
2342     next = pdev->config[pos + PCI_CAP_LIST_NEXT];
2343 
2344     /*
2345      * If it becomes important to configure capabilities to their actual
2346      * size, use this as the default when it's something we don't recognize.
2347      * Since QEMU doesn't actually handle many of the config accesses,
2348      * exact size doesn't seem worthwhile.
2349      */
2350     size = vfio_std_cap_max_size(pdev, pos);
2351 
2352     /*
2353      * pci_add_capability always inserts the new capability at the head
2354      * of the chain.  Therefore to end up with a chain that matches the
2355      * physical device, we insert from the end by making this recursive.
2356      * This is also why we pre-calculate size above as cached config space
2357      * will be changed as we unwind the stack.
2358      */
2359     if (next) {
2360         if (!vfio_add_std_cap(vdev, next, errp)) {
2361             return false;
2362         }
2363     } else {
2364         /* Begin the rebuild, use QEMU emulated list bits */
2365         pdev->config[PCI_CAPABILITY_LIST] = 0;
2366         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2367         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2368 
2369         if (!vfio_add_virt_caps(vdev, errp)) {
2370             return false;
2371         }
2372     }
2373 
2374     /* Scale down size, esp in case virt caps were added above */
2375     size = MIN(size, vfio_std_cap_max_size(pdev, pos));
2376 
2377     /* Use emulated next pointer to allow dropping caps */
2378     pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);
2379 
2380     switch (cap_id) {
2381     case PCI_CAP_ID_MSI:
2382         ret = vfio_msi_setup(vdev, pos, errp);
2383         break;
2384     case PCI_CAP_ID_EXP:
2385         vfio_check_pcie_flr(vdev, pos);
2386         ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
2387         break;
2388     case PCI_CAP_ID_MSIX:
2389         ret = vfio_msix_setup(vdev, pos, errp);
2390         break;
2391     case PCI_CAP_ID_PM:
2392         vfio_check_pm_reset(vdev, pos);
2393         ret = pci_pm_init(pdev, pos, errp) >= 0;
2394         /*
2395          * PCI-core config space emulation needs write access to the power
2396          * state enabled for tracking BAR mapping relative to PM state.
2397          */
2398         pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
2399         break;
2400     case PCI_CAP_ID_AF:
2401         vfio_check_af_flr(vdev, pos);
2402         ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
2403         break;
2404     case PCI_CAP_ID_VNDR:
2405         ret = vfio_add_vendor_specific_cap(vdev, pos, size, errp);
2406         break;
2407     default:
2408         ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
2409         break;
2410     }
2411 
2412     if (!ret) {
2413         error_prepend(errp,
2414                       "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
2415                       cap_id, size, pos);
2416     }
2417 
2418     return ret;
2419 }
2420 
2421 static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
2422 {
2423     uint32_t ctrl;
2424     int i, nbar;
2425 
2426     ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL);
2427     nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT;
2428 
2429     for (i = 0; i < nbar; i++) {
2430         uint32_t cap;
2431         int size;
2432 
2433         ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8));
2434         size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT;
2435 
2436         /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */
2437         cap = size <= 27 ? 1U << (size + 4) : 0;
2438 
2439         /*
2440          * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one
2441          * size in the range 1MB to 512GB.  We intend to mask all sizes except
2442          * the one currently enabled in the size field, therefore if it's
2443          * outside the range, hide the whole capability as this virtualization
2444          * trick won't work.  If >512GB resizable BARs start to appear, we
2445          * might need an opt-in or reservation scheme in the kernel.
2446          */
2447         if (!(cap & PCI_REBAR_CAP_SIZES)) {
2448             return -EINVAL;
2449         }
2450 
2451         /* Hide all sizes reported in the ctrl reg per above requirement. */
2452         ctrl &= (PCI_REBAR_CTRL_BAR_SIZE |
2453                  PCI_REBAR_CTRL_NBAR_MASK |
2454                  PCI_REBAR_CTRL_BAR_IDX);
2455 
2456         /*
2457          * The BAR size field is RW, however we've mangled the capability
2458          * register such that we only report a single size, ie. the current
2459          * BAR size.  A write of an unsupported value is undefined, therefore
2460          * the register field is essentially RO.
2461          */
2462         vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0);
2463         vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0);
2464     }
2465 
2466     return 0;
2467 }
2468 
2469 static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
2470 {
2471     PCIDevice *pdev = &vdev->pdev;
2472     uint32_t header;
2473     uint16_t cap_id, next, size;
2474     uint8_t cap_ver;
2475     uint8_t *config;
2476 
2477     /* Only add extended caps if we have them and the guest can see them */
2478     if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
2479         !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
2480         return;
2481     }
2482 
2483     /*
2484      * pcie_add_capability always inserts the new capability at the tail
2485      * of the chain.  Therefore to end up with a chain that matches the
2486      * physical device, we cache the config space to avoid overwriting
2487      * the original config space when we parse the extended capabilities.
2488      */
2489     config = g_memdup(pdev->config, vdev->config_size);
2490 
2491     /*
2492      * Extended capabilities are chained with each pointing to the next, so we
2493      * can drop anything other than the head of the chain simply by modifying
2494      * the previous next pointer.  Seed the head of the chain here such that
2495      * we can simply skip any capabilities we want to drop below, regardless
2496      * of their position in the chain.  If this stub capability still exists
2497      * after we add the capabilities we want to expose, update the capability
2498      * ID to zero.  Note that we cannot seed with the capability header being
2499      * zero as this conflicts with definition of an absent capability chain
2500      * and prevents capabilities beyond the head of the list from being added.
2501      * By replacing the dummy capability ID with zero after walking the device
2502      * chain, we also transparently mark extended capabilities as absent if
2503      * no capabilities were added.  Note that the PCIe spec defines an absence
2504      * of extended capabilities to be determined by a value of zero for the
2505      * capability ID, version, AND next pointer.  A non-zero next pointer
2506      * should be sufficient to indicate additional capabilities are present,
2507      * which will occur if we call pcie_add_capability() below.  The entire
2508      * first dword is emulated to support this.
2509      *
2510      * NB. The kernel side does similar masking, so be prepared that our
2511      * view of the device may also contain a capability ID zero in the head
2512      * of the chain.  Skip it for the same reason that we cannot seed the
2513      * chain with a zero capability.
2514      */
2515     pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
2516                  PCI_EXT_CAP(0xFFFF, 0, 0));
2517     pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
2518     pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
2519 
2520     for (next = PCI_CONFIG_SPACE_SIZE; next;
2521          next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
2522         header = pci_get_long(config + next);
2523         cap_id = PCI_EXT_CAP_ID(header);
2524         cap_ver = PCI_EXT_CAP_VER(header);
2525 
2526         /*
2527          * If it becomes important to configure extended capabilities to their
2528          * actual size, use this as the default when it's something we don't
2529          * recognize. Since QEMU doesn't actually handle many of the config
2530          * accesses, exact size doesn't seem worthwhile.
2531          */
2532         size = vfio_ext_cap_max_size(config, next);
2533 
2534         /* Use emulated next pointer to allow dropping extended caps */
2535         pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
2536                                    PCI_EXT_CAP_NEXT_MASK);
2537 
2538         switch (cap_id) {
2539         case 0: /* kernel masked capability */
2540         case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
2541         case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
2542             trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
2543             break;
2544         case PCI_EXT_CAP_ID_REBAR:
2545             if (!vfio_setup_rebar_ecap(vdev, next)) {
2546                 pcie_add_capability(pdev, cap_id, cap_ver, next, size);
2547             }
2548             break;
2549         default:
2550             pcie_add_capability(pdev, cap_id, cap_ver, next, size);
2551         }
2552 
2553     }
2554 
2555     /* Cleanup chain head ID if necessary */
2556     if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
2557         pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
2558     }
2559 
2560     g_free(config);
2561 }
2562 
2563 bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
2564 {
2565     PCIDevice *pdev = &vdev->pdev;
2566 
2567     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2568         !pdev->config[PCI_CAPABILITY_LIST]) {
2569         return true; /* Nothing to add */
2570     }
2571 
2572     if (!vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp)) {
2573         return false;
2574     }
2575 
2576     vfio_add_ext_cap(vdev);
2577     return true;
2578 }
2579 
2580 void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2581 {
2582     PCIDevice *pdev = &vdev->pdev;
2583     uint16_t cmd;
2584 
2585     vfio_disable_interrupts(vdev);
2586 
2587     /*
2588      * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2589      * Also put INTx Disable in known state.
2590      */
2591     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2592     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2593              PCI_COMMAND_INTX_DISABLE);
2594     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2595 
2596     /* Make sure the device is in D0 */
2597     if (pdev->pm_cap) {
2598         uint16_t pmcsr;
2599         uint8_t state;
2600 
2601         pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
2602         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2603         if (state) {
2604             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2605             vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2606             /* vfio handles the necessary delay here */
2607             pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
2608             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2609             if (state) {
2610                 error_report("vfio: Unable to power on device, stuck in D%d",
2611                              state);
2612             }
2613         }
2614     }
2615 }
2616 
2617 void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2618 {
2619     VFIODevice *vbasedev = &vdev->vbasedev;
2620     Error *err = NULL;
2621     int ret, nr;
2622 
2623     if (!vfio_intx_enable(vdev, &err)) {
2624         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2625     }
2626 
2627     for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
2628         off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
2629         uint32_t val = 0;
2630         uint32_t len = sizeof(val);
2631 
2632         ret = vfio_pci_config_space_write(vdev, addr, len, &val);
2633         if (ret != len) {
2634             error_report("%s(%s) reset bar %d failed: %s", __func__,
2635                          vbasedev->name, nr, strwriteerror(ret));
2636         }
2637     }
2638 
2639     vfio_quirk_reset(vdev);
2640 }
2641 
2642 bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
2643 {
2644     char tmp[13];
2645 
2646     sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
2647             addr->bus, addr->slot, addr->function);
2648 
2649     return (strcmp(tmp, name) == 0);
2650 }
2651 
2652 int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
2653                                     struct vfio_pci_hot_reset_info **info_p)
2654 {
2655     struct vfio_pci_hot_reset_info *info;
2656     int ret, count;
2657 
2658     assert(info_p && !*info_p);
2659 
2660     info = g_malloc0(sizeof(*info));
2661     info->argsz = sizeof(*info);
2662 
2663     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2664     if (ret && errno != ENOSPC) {
2665         ret = -errno;
2666         g_free(info);
2667         if (!vdev->has_pm_reset) {
2668             error_report("vfio: Cannot reset device %s, "
2669                          "no available reset mechanism.", vdev->vbasedev.name);
2670         }
2671         return ret;
2672     }
2673 
2674     count = info->count;
2675     info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0])));
2676     info->argsz = sizeof(*info) + (count * sizeof(info->devices[0]));
2677 
2678     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2679     if (ret) {
2680         ret = -errno;
2681         g_free(info);
2682         error_report("vfio: hot reset info failed: %m");
2683         return ret;
2684     }
2685 
2686     *info_p = info;
2687     return 0;
2688 }
2689 
2690 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2691 {
2692     VFIODevice *vbasedev = &vdev->vbasedev;
2693     const VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer);
2694 
2695     return vioc->pci_hot_reset(vbasedev, single);
2696 }
2697 
2698 /*
2699  * We want to differentiate hot reset of multiple in-use devices vs hot reset
2700  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2701  * of doing hot resets when there is only a single device per bus.  The in-use
2702  * here refers to how many VFIODevices are affected.  A hot reset that affects
2703  * multiple devices, but only a single in-use device, means that we can call
2704  * it from our bus ->reset() callback since the extent is effectively a single
2705  * device.  This allows us to make use of it in the hotplug path.  When there
2706  * are multiple in-use devices, we can only trigger the hot reset during a
2707  * system reset and thus from our reset handler.  We separate _one vs _multi
2708  * here so that we don't overlap and do a double reset on the system reset
2709  * path where both our reset handler and ->reset() callback are used.  Calling
2710  * _one() will only do a hot reset for the one in-use devices case, calling
2711  * _multi() will do nothing if a _one() would have been sufficient.
2712  */
2713 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2714 {
2715     return vfio_pci_hot_reset(vdev, true);
2716 }
2717 
2718 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2719 {
2720     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2721     return vfio_pci_hot_reset(vdev, false);
2722 }
2723 
2724 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2725 {
2726     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2727     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2728         vbasedev->needs_reset = true;
2729     }
2730 }
2731 
2732 static Object *vfio_pci_get_object(VFIODevice *vbasedev)
2733 {
2734     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2735 
2736     return OBJECT(vdev);
2737 }
2738 
2739 static bool vfio_msix_present(void *opaque, int version_id)
2740 {
2741     PCIDevice *pdev = opaque;
2742 
2743     return msix_present(pdev);
2744 }
2745 
2746 static bool vfio_display_migration_needed(void *opaque)
2747 {
2748     VFIOPCIDevice *vdev = opaque;
2749 
2750     /*
2751      * We need to migrate the VFIODisplay object if ramfb *migration* was
2752      * explicitly requested (in which case we enforced both ramfb=on and
2753      * display=on), or ramfb migration was left at the default "auto"
2754      * setting, and *ramfb* was explicitly requested (in which case we
2755      * enforced display=on).
2756      */
2757     return vdev->ramfb_migrate == ON_OFF_AUTO_ON ||
2758         (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb);
2759 }
2760 
/*
 * Migration description for the VFIODisplay object referenced by
 * VFIOPCIDevice.dpy.  It is listed as a subsection of
 * vmstate_vfio_pci_config and is gated by vfio_display_migration_needed().
 */
static const VMStateDescription vmstate_vfio_display = {
    .name = "VFIOPCIDevice/VFIODisplay",
    .version_id = 1,
    .minimum_version_id = 1,
    /* Decides whether this subsection takes part in migration */
    .needed = vfio_display_migration_needed,
    .fields = (const VMStateField[]){
        /* The display state itself is described by vfio_display_vmstate */
        VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate,
                               VFIODisplay),
        VMSTATE_END_OF_LIST()
    }
};
2772 
/*
 * Migration description used by vfio_pci_save_config() and
 * vfio_pci_load_config(): the emulated PCI device state, the MSI-X state
 * when vfio_msix_present() says so, and the optional display subsection.
 */
static const VMStateDescription vmstate_vfio_pci_config = {
    .name = "VFIOPCIDevice",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]) {
        VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
        /* MSI-X field is conditional on vfio_msix_present() */
        VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
        VMSTATE_END_OF_LIST()
    },
    /* NULL-terminated list of subsections */
    .subsections = (const VMStateDescription * const []) {
        &vmstate_vfio_display,
        NULL
    }
};
2787 
2788 static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp)
2789 {
2790     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2791 
2792     return vmstate_save_state_with_err(f, &vmstate_vfio_pci_config, vdev, NULL,
2793                                        errp);
2794 }
2795 
2796 static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
2797 {
2798     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2799     PCIDevice *pdev = &vdev->pdev;
2800     pcibus_t old_addr[PCI_NUM_REGIONS - 1];
2801     int bar, ret;
2802 
2803     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2804         old_addr[bar] = pdev->io_regions[bar].addr;
2805     }
2806 
2807     ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
2808     if (ret) {
2809         return ret;
2810     }
2811 
2812     vfio_pci_write_config(pdev, PCI_COMMAND,
2813                           pci_get_word(pdev->config + PCI_COMMAND), 2);
2814 
2815     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2816         /*
2817          * The address may not be changed in some scenarios
2818          * (e.g. the VF driver isn't loaded in VM).
2819          */
2820         if (old_addr[bar] != pdev->io_regions[bar].addr &&
2821             vdev->bars[bar].region.size > 0 &&
2822             vdev->bars[bar].region.size < qemu_real_host_page_size()) {
2823             vfio_sub_page_bar_update_mapping(pdev, bar);
2824         }
2825     }
2826 
2827     if (msi_enabled(pdev)) {
2828         vfio_msi_enable(vdev);
2829     } else if (msix_enabled(pdev)) {
2830         vfio_msix_enable(vdev);
2831     }
2832 
2833     return ret;
2834 }
2835 
2836 void vfio_sub_page_bar_update_mappings(VFIOPCIDevice *vdev)
2837 {
2838     PCIDevice *pdev = &vdev->pdev;
2839     int page_size = qemu_real_host_page_size();
2840     int bar;
2841 
2842     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
2843         PCIIORegion *r = &pdev->io_regions[bar];
2844         if (r->addr != PCI_BAR_UNMAPPED && r->size > 0 && r->size < page_size) {
2845             vfio_sub_page_bar_update_mapping(pdev, bar);
2846         }
2847     }
2848 }
2849 
/* VFIODevice callbacks implemented by the PCI flavour of vfio devices. */
static VFIODeviceOps vfio_pci_ops = {
    /* Sets vbasedev->needs_reset based on the device's reset capabilities */
    .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
    /* Hot reset for the multiple in-use devices case */
    .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
    .vfio_eoi = vfio_pci_intx_eoi,
    /* Maps the VFIODevice back to its QOM object */
    .vfio_get_object = vfio_pci_get_object,
    /* Save/load of vmstate_vfio_pci_config */
    .vfio_save_config = vfio_pci_save_config,
    .vfio_load_config = vfio_pci_load_config,
};
2858 
2859 bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
2860 {
2861     VFIODevice *vbasedev = &vdev->vbasedev;
2862     struct vfio_region_info *reg_info = NULL;
2863     int ret;
2864 
2865     ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
2866     if (ret) {
2867         error_setg_errno(errp, -ret,
2868                          "failed getting region info for VGA region index %d",
2869                          VFIO_PCI_VGA_REGION_INDEX);
2870         return false;
2871     }
2872 
2873     if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
2874         !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2875         reg_info->size < 0xbffff + 1) {
2876         error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
2877                    (unsigned long)reg_info->flags,
2878                    (unsigned long)reg_info->size);
2879         return false;
2880     }
2881 
2882     vdev->vga = g_new0(VFIOVGA, 1);
2883 
2884     vdev->vga->fd_offset = reg_info->offset;
2885     vdev->vga->fd = vdev->vbasedev.fd;
2886 
2887     vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2888     vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2889     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
2890 
2891     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2892                           OBJECT(vdev), &vfio_vga_ops,
2893                           &vdev->vga->region[QEMU_PCI_VGA_MEM],
2894                           "vfio-vga-mmio@0xa0000",
2895                           QEMU_PCI_VGA_MEM_SIZE);
2896 
2897     vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2898     vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2899     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
2900 
2901     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2902                           OBJECT(vdev), &vfio_vga_ops,
2903                           &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
2904                           "vfio-vga-io@0x3b0",
2905                           QEMU_PCI_VGA_IO_LO_SIZE);
2906 
2907     vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2908     vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2909     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
2910 
2911     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
2912                           OBJECT(vdev), &vfio_vga_ops,
2913                           &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
2914                           "vfio-vga-io@0x3c0",
2915                           QEMU_PCI_VGA_IO_HI_SIZE);
2916 
2917     return true;
2918 }
2919 
2920 bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp)
2921 {
2922     VFIODevice *vbasedev = &vdev->vbasedev;
2923     struct vfio_region_info *reg_info = NULL;
2924     struct vfio_irq_info irq_info;
2925     int i, ret = -1;
2926 
2927     /* Sanity check device */
2928     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2929         error_setg(errp, "this isn't a PCI device");
2930         return false;
2931     }
2932 
2933     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2934         error_setg(errp, "unexpected number of io regions %u",
2935                    vbasedev->num_regions);
2936         return false;
2937     }
2938 
2939     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2940         error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
2941         return false;
2942     }
2943 
2944     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2945         char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
2946 
2947         ret = vfio_region_setup(OBJECT(vdev), vbasedev,
2948                                 &vdev->bars[i].region, i, name);
2949         g_free(name);
2950 
2951         if (ret) {
2952             error_setg_errno(errp, -ret, "failed to get region %d info", i);
2953             return false;
2954         }
2955 
2956         QLIST_INIT(&vdev->bars[i].quirks);
2957     }
2958 
2959     ret = vfio_device_get_region_info(vbasedev,
2960                                       VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
2961     if (ret) {
2962         error_setg_errno(errp, -ret, "failed to get config info");
2963         return false;
2964     }
2965 
2966     trace_vfio_pci_populate_device_config(vdev->vbasedev.name,
2967                                       (unsigned long)reg_info->size,
2968                                       (unsigned long)reg_info->offset,
2969                                       (unsigned long)reg_info->flags);
2970 
2971     vdev->config_size = reg_info->size;
2972     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2973         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2974     }
2975     vdev->config_offset = reg_info->offset;
2976 
2977     if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
2978         if (!vfio_populate_vga(vdev, errp)) {
2979             error_append_hint(errp, "device does not support "
2980                               "requested feature x-vga\n");
2981             return false;
2982         }
2983     }
2984 
2985     ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
2986     if (ret) {
2987         /* This can fail for an old kernel or legacy PCI dev */
2988         trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret));
2989     } else if (irq_info.count == 1) {
2990         vdev->pci_aer = true;
2991     } else {
2992         warn_report(VFIO_MSG_PREFIX
2993                     "Could not enable error recovery for the device",
2994                     vbasedev->name);
2995     }
2996 
2997     return true;
2998 }
2999 
/*
 * Release what the device holds: display and BAR state are finalized, the
 * device is unregistered from cpr, the emulated config bitmap and ROM
 * buffer are freed, then the device is detached and its name and MSI-X
 * state are freed.
 */
void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
    vfio_display_finalize(vdev);
    vfio_bars_finalize(vdev);
    vfio_cpr_pci_unregister_device(vdev);
    g_free(vdev->emulated_config_bits);
    g_free(vdev->rom);
    /*
     * XXX Leaking igd_opregion is not an oversight, we can't remove the
     * fw_cfg entry therefore leaking this allocation seems like the safest
     * option.
     *
     * g_free(vdev->igd_opregion);
     */

    vfio_device_detach(&vdev->vbasedev);

    /* The name and MSI-X state are released only after the detach */
    vfio_device_free_name(&vdev->vbasedev);
    g_free(vdev->msix);
}
3020 
3021 static void vfio_err_notifier_handler(void *opaque)
3022 {
3023     VFIOPCIDevice *vdev = opaque;
3024 
3025     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
3026         return;
3027     }
3028 
3029     /*
3030      * TBD. Retrieve the error details and decide what action
3031      * needs to be taken. One of the actions could be to pass
3032      * the error to the guest and have the guest driver recover
3033      * from the error. This requires that PCIe capabilities be
3034      * exposed to the guest. For now, we just terminate the
3035      * guest to contain the error.
3036      */
3037 
3038     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
3039 
3040     vm_stop(RUN_STATE_INTERNAL_ERROR);
3041 }
3042 
3043 /*
3044  * Registers error notifier for devices supporting error recovery.
3045  * If we encounter a failure in this function, we report an error
3046  * and continue after disabling error recovery support for the
3047  * device.
3048  */
3049 void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev)
3050 {
3051     Error *err = NULL;
3052     int32_t fd;
3053 
3054     if (!vdev->pci_aer) {
3055         return;
3056     }
3057 
3058     if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0,
3059                             &err)) {
3060         error_report_err(err);
3061         vdev->pci_aer = false;
3062         return;
3063     }
3064 
3065     fd = event_notifier_get_fd(&vdev->err_notifier);
3066     qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
3067 
3068     /* Do not alter irq_signaling during vfio_realize for cpr */
3069     if (cpr_is_incoming()) {
3070         return;
3071     }
3072 
3073     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
3074                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
3075         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3076         qemu_set_fd_handler(fd, NULL, NULL, vdev);
3077         vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
3078         vdev->pci_aer = false;
3079     }
3080 }
3081 
3082 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
3083 {
3084     Error *err = NULL;
3085 
3086     if (!vdev->pci_aer) {
3087         return;
3088     }
3089 
3090     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
3091                                        VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
3092         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3093     }
3094     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
3095                         NULL, NULL, vdev);
3096     vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0);
3097 }
3098 
3099 static void vfio_req_notifier_handler(void *opaque)
3100 {
3101     VFIOPCIDevice *vdev = opaque;
3102     Error *err = NULL;
3103 
3104     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
3105         return;
3106     }
3107 
3108     qdev_unplug(DEVICE(vdev), &err);
3109     if (err) {
3110         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3111     }
3112 }
3113 
3114 void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev)
3115 {
3116     struct vfio_irq_info irq_info;
3117     Error *err = NULL;
3118     int32_t fd;
3119     int ret;
3120 
3121     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
3122         return;
3123     }
3124 
3125     ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX,
3126                                    &irq_info);
3127     if (ret < 0 || irq_info.count < 1) {
3128         return;
3129     }
3130 
3131     if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0,
3132                             &err)) {
3133         error_report_err(err);
3134         return;
3135     }
3136 
3137     fd = event_notifier_get_fd(&vdev->req_notifier);
3138     qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
3139 
3140     /* Do not alter irq_signaling during vfio_realize for cpr */
3141     if (cpr_is_incoming()) {
3142         vdev->req_enabled = true;
3143         return;
3144     }
3145 
3146     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
3147                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
3148         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3149         qemu_set_fd_handler(fd, NULL, NULL, vdev);
3150         vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
3151     } else {
3152         vdev->req_enabled = true;
3153     }
3154 }
3155 
3156 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
3157 {
3158     Error *err = NULL;
3159 
3160     if (!vdev->req_enabled) {
3161         return;
3162     }
3163 
3164     if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
3165                                        VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
3166         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3167     }
3168     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
3169                         NULL, NULL, vdev);
3170     vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0);
3171 
3172     vdev->req_enabled = false;
3173 }
3174 
3175 void vfio_pci_config_register_vga(VFIOPCIDevice *vdev)
3176 {
3177     assert(vdev->vga != NULL);
3178 
3179     pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
3180                      &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
3181                      &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
3182 }
3183 
/*
 * Build the emulated view of the device's config space.
 *
 * A copy of the device's config space is read into vdev->pdev.config, the
 * emulated_config_bits bitmap is allocated, and the bits QEMU emulates
 * itself are marked: ROM address, BARs, the optional vendor/device/
 * subsystem/class overrides and the multi-function bit.  Host BAR and ROM
 * addresses are then cleared, the ROM is sized, and the BARs, MSI-X and
 * (when applicable) VGA regions are set up and registered.
 *
 * Returns true on success, false with @errp set otherwise.
 */
bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIODevice *vbasedev = &vdev->vbasedev;
    uint32_t config_space_size;
    int ret;

    /* Never read more than either side's config space can hold */
    config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);

    /* Get a copy of config space */
    ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
                                     vdev->pdev.config);
    if (ret < (int)config_space_size) {
        /* A short read without an error code is reported as EFAULT */
        ret = ret < 0 ? -ret : EFAULT;
        error_setg_errno(errp, ret, "failed to read device config space");
        return false;
    }

    /* vfio emulates a lot for us, but some bits need extra love */
    vdev->emulated_config_bits = g_malloc0(vdev->config_size);

    /* QEMU can choose to expose the ROM or not */
    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
    /* QEMU can also add or extend BARs */
    memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);

    /*
     * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
     * device ID is managed by the vendor and need only be a 16-bit value.
     * Allow any 16-bit value for subsystem so they can be hidden or changed.
     */
    if (vdev->vendor_id != PCI_ANY_ID) {
        /* Note ">=": 0xffff itself is rejected for the vendor ID only */
        if (vdev->vendor_id >= 0xffff) {
            error_setg(errp, "invalid PCI vendor ID provided");
            return false;
        }
        vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
        trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
    } else {
        /* No override: remember the value read from the device */
        vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
    }

    if (vdev->device_id != PCI_ANY_ID) {
        if (vdev->device_id > 0xffff) {
            error_setg(errp, "invalid PCI device ID provided");
            return false;
        }
        vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
        trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
    } else {
        /* No override: remember the value read from the device */
        vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
    }

    if (vdev->sub_vendor_id != PCI_ANY_ID) {
        if (vdev->sub_vendor_id > 0xffff) {
            error_setg(errp, "invalid PCI subsystem vendor ID provided");
            return false;
        }
        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
                               vdev->sub_vendor_id, ~0);
        trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
                                              vdev->sub_vendor_id);
    }

    if (vdev->sub_device_id != PCI_ANY_ID) {
        if (vdev->sub_device_id > 0xffff) {
            error_setg(errp, "invalid PCI subsystem device ID provided");
            return false;
        }
        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
        trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
                                              vdev->sub_device_id);
    }

    /*
     * Class code is a 24-bit value at config space 0x09. Allow overriding it
     * with any 24-bit value.
     */
    if (vdev->class_code != PCI_ANY_ID) {
        if (vdev->class_code > 0xffffff) {
            error_setg(errp, "invalid PCI class code provided");
            return false;
        }
        /* Higher 24 bits of PCI_CLASS_REVISION are class code */
        vfio_add_emulated_long(vdev, PCI_CLASS_REVISION,
                               vdev->class_code << 8, ~0xff);
        trace_vfio_pci_emulated_class_code(vbasedev->name, vdev->class_code);
    } else {
        /* No override: take the class code from the device's register */
        vdev->class_code = pci_get_long(pdev->config + PCI_CLASS_REVISION) >> 8;
    }

    /* QEMU can change multi-function devices to single function, or reverse */
    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
                                              PCI_HEADER_TYPE_MULTI_FUNCTION;

    /* Restore or clear multifunction, this is always controlled by QEMU */
    if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
        vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
    } else {
        vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
    }

    /*
     * Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);

    vfio_pci_size_rom(vdev);

    vfio_bars_prepare(vdev);

    /* MSI-X early setup runs between BAR preparation and registration */
    if (!vfio_msix_early_setup(vdev, errp)) {
        return false;
    }

    vfio_bars_register(vdev);

    /* VGA regions are registered only when populated and vfio_is_vga() */
    if (vdev->vga && vfio_is_vga(vdev)) {
        vfio_pci_config_register_vga(vdev);
    }

    return true;
}
3310 
3311 bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
3312 {
3313     PCIDevice *pdev = &vdev->pdev;
3314 
3315     /* QEMU emulates all of MSI & MSIX */
3316     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3317         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3318                MSIX_CAP_LENGTH);
3319     }
3320 
3321     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3322         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3323                vdev->msi_cap_size);
3324     }
3325 
3326     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3327         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3328                                              vfio_intx_mmap_enable, vdev);
3329         pci_device_set_intx_routing_notifier(&vdev->pdev,
3330                                              vfio_intx_routing_notifier);
3331         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
3332         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
3333 
3334         /*
3335          * During CPR, do not call vfio_intx_enable at this time.  Instead,
3336          * call it from vfio_pci_post_load after the intx routing data has
3337          * been loaded from vmstate.
3338          */
3339         if (!cpr_is_incoming() && !vfio_intx_enable(vdev, errp)) {
3340             timer_free(vdev->intx.mmap_timer);
3341             pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3342             kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3343             return false;
3344         }
3345     }
3346     return true;
3347 }
3348 
3349 static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
3350 {
3351     ERRP_GUARD();
3352     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
3353     VFIODevice *vbasedev = &vdev->vbasedev;
3354     int i;
3355     char uuid[UUID_STR_LEN];
3356     g_autofree char *name = NULL;
3357 
3358     if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
3359         if (!(~vdev->host.domain || ~vdev->host.bus ||
3360               ~vdev->host.slot || ~vdev->host.function)) {
3361             error_setg(errp, "No provided host device");
3362             error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
3363 #ifdef CONFIG_IOMMUFD
3364                               "or -device vfio-pci,fd=DEVICE_FD "
3365 #endif
3366                               "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
3367             return;
3368         }
3369         vbasedev->sysfsdev =
3370             g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
3371                             vdev->host.domain, vdev->host.bus,
3372                             vdev->host.slot, vdev->host.function);
3373     }
3374 
3375     if (!vfio_device_get_name(vbasedev, errp)) {
3376         return;
3377     }
3378 
3379     /*
3380      * Mediated devices *might* operate compatibly with discarding of RAM, but
3381      * we cannot know for certain, it depends on whether the mdev vendor driver
3382      * stays in sync with the active working set of the guest driver.  Prevent
3383      * the x-balloon-allowed option unless this is minimally an mdev device.
3384      */
3385     vbasedev->mdev = vfio_device_is_mdev(vbasedev);
3386 
3387     trace_vfio_mdev(vbasedev->name, vbasedev->mdev);
3388 
3389     if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) {
3390         error_setg(errp, "x-balloon-allowed only potentially compatible "
3391                    "with mdev devices");
3392         goto error;
3393     }
3394 
3395     if (!qemu_uuid_is_null(&vdev->vf_token)) {
3396         qemu_uuid_unparse(&vdev->vf_token, uuid);
3397         name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
3398     } else {
3399         name = g_strdup(vbasedev->name);
3400     }
3401 
3402     if (!vfio_device_attach(name, vbasedev,
3403                             pci_device_iommu_address_space(pdev), errp)) {
3404         goto error;
3405     }
3406 
3407     if (!vfio_pci_populate_device(vdev, errp)) {
3408         goto error;
3409     }
3410 
3411     if (!vfio_pci_config_setup(vdev, errp)) {
3412         goto error;
3413     }
3414 
3415     if (!vbasedev->mdev &&
3416         !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
3417         error_prepend(errp, "Failed to set vIOMMU: ");
3418         goto out_teardown;
3419     }
3420 
3421     if (!vfio_pci_add_capabilities(vdev, errp)) {
3422         goto out_unset_idev;
3423     }
3424 
3425     if (!vfio_config_quirk_setup(vdev, errp)) {
3426         goto out_unset_idev;
3427     }
3428 
3429     if (vdev->vga) {
3430         vfio_vga_quirk_setup(vdev);
3431     }
3432 
3433     for (i = 0; i < PCI_ROM_SLOT; i++) {
3434         vfio_bar_quirk_setup(vdev, i);
3435     }
3436 
3437     if (!vfio_pci_interrupt_setup(vdev, errp)) {
3438         goto out_unset_idev;
3439     }
3440 
3441     if (vdev->display != ON_OFF_AUTO_OFF) {
3442         if (!vfio_display_probe(vdev, errp)) {
3443             goto out_deregister;
3444         }
3445     }
3446     if (vdev->enable_ramfb && vdev->dpy == NULL) {
3447         error_setg(errp, "ramfb=on requires display=on");
3448         goto out_deregister;
3449     }
3450     if (vdev->display_xres || vdev->display_yres) {
3451         if (vdev->dpy == NULL) {
3452             error_setg(errp, "xres and yres properties require display=on");
3453             goto out_deregister;
3454         }
3455         if (vdev->dpy->edid_regs == NULL) {
3456             error_setg(errp, "xres and yres properties need edid support");
3457             goto out_deregister;
3458         }
3459     }
3460 
3461     if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) {
3462         warn_report("x-ramfb-migrate=on but ramfb=off. "
3463                     "Forcing x-ramfb-migrate to off.");
3464         vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
3465     }
3466     if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
3467         if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) {
3468             vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
3469         } else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) {
3470             error_setg(errp, "x-ramfb-migrate requires enable-migration");
3471             goto out_deregister;
3472         }
3473     }
3474 
3475     if (!pdev->failover_pair_id) {
3476         if (!vfio_migration_realize(vbasedev, errp)) {
3477             goto out_deregister;
3478         }
3479     }
3480 
3481     vfio_pci_register_err_notifier(vdev);
3482     vfio_pci_register_req_notifier(vdev);
3483     vfio_setup_resetfn_quirk(vdev);
3484     vfio_cpr_pci_register_device(vdev);
3485 
3486     return;
3487 
3488 out_deregister:
3489     if (vdev->interrupt == VFIO_INT_INTx) {
3490         vfio_intx_disable(vdev);
3491     }
3492     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3493     if (vdev->irqchip_change_notifier.notify) {
3494         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3495     }
3496     if (vdev->intx.mmap_timer) {
3497         timer_free(vdev->intx.mmap_timer);
3498     }
3499 out_unset_idev:
3500     if (!vbasedev->mdev) {
3501         pci_device_unset_iommu_device(pdev);
3502     }
3503 out_teardown:
3504     vfio_pci_teardown_msi(vdev);
3505     vfio_pci_bars_exit(vdev);
3506 error:
3507     error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
3508 }
3509 
3510 static void vfio_instance_finalize(Object *obj)
3511 {
3512     VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
3513 
3514     vfio_pci_put_device(vdev);
3515 }
3516 
3517 static void vfio_exitfn(PCIDevice *pdev)
3518 {
3519     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
3520     VFIODevice *vbasedev = &vdev->vbasedev;
3521 
3522     vfio_unregister_req_notifier(vdev);
3523     vfio_unregister_err_notifier(vdev);
3524     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3525     if (vdev->irqchip_change_notifier.notify) {
3526         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3527     }
3528     vfio_disable_interrupts(vdev);
3529     if (vdev->intx.mmap_timer) {
3530         timer_free(vdev->intx.mmap_timer);
3531     }
3532     vfio_pci_teardown_msi(vdev);
3533     vfio_pci_disable_rp_atomics(vdev);
3534     vfio_pci_bars_exit(vdev);
3535     vfio_migration_exit(vbasedev);
3536     if (!vbasedev->mdev) {
3537         pci_device_unset_iommu_device(pdev);
3538     }
3539 }
3540 
3541 static void vfio_pci_reset(DeviceState *dev)
3542 {
3543     VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
3544 
3545     /* Do not reset the device during qemu_system_reset prior to cpr load */
3546     if (cpr_is_incoming()) {
3547         return;
3548     }
3549 
3550     trace_vfio_pci_reset(vdev->vbasedev.name);
3551 
3552     vfio_pci_pre_reset(vdev);
3553 
3554     if (vdev->display != ON_OFF_AUTO_OFF) {
3555         vfio_display_reset(vdev);
3556     }
3557 
3558     if (vdev->resetfn && !vdev->resetfn(vdev)) {
3559         goto post_reset;
3560     }
3561 
3562     if (vdev->vbasedev.reset_works &&
3563         (vdev->has_flr || !vdev->has_pm_reset) &&
3564         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3565         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3566         goto post_reset;
3567     }
3568 
3569     /* See if we can do our own bus reset */
3570     if (!vfio_pci_hot_reset_one(vdev)) {
3571         goto post_reset;
3572     }
3573 
3574     /* If nothing else works and the device supports PM reset, use it */
3575     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3576         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3577         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3578         goto post_reset;
3579     }
3580 
3581 post_reset:
3582     vfio_pci_post_reset(vdev);
3583 }
3584 
3585 static void vfio_instance_init(Object *obj)
3586 {
3587     PCIDevice *pci_dev = PCI_DEVICE(obj);
3588     VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
3589     VFIODevice *vbasedev = &vdev->vbasedev;
3590 
3591     device_add_bootindex_property(obj, &vdev->bootindex,
3592                                   "bootindex", NULL,
3593                                   &pci_dev->qdev);
3594     vdev->host.domain = ~0U;
3595     vdev->host.bus = ~0U;
3596     vdev->host.slot = ~0U;
3597     vdev->host.function = ~0U;
3598 
3599     vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
3600                      DEVICE(vdev), false);
3601 
3602     vdev->nv_gpudirect_clique = 0xFF;
3603 
3604     /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
3605      * line, therefore, no need to wait to realize like other devices */
3606     pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
3607 
3608     /*
3609      * A device that is resuming for cpr is already configured, so do not
3610      * reset it during qemu_system_reset prior to cpr load, else interrupts
3611      * may be lost.
3612      */
3613     pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR;
3614 }
3615 
3616 static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data)
3617 {
3618     DeviceClass *dc = DEVICE_CLASS(klass);
3619     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3620 
3621     dc->desc = "VFIO PCI base device";
3622     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3623     pdc->exit = vfio_exitfn;
3624     pdc->config_read = vfio_pci_read_config;
3625     pdc->config_write = vfio_pci_write_config;
3626 }
3627 
3628 static const TypeInfo vfio_pci_base_dev_info = {
3629     .name = TYPE_VFIO_PCI_BASE,
3630     .parent = TYPE_PCI_DEVICE,
3631     .instance_size = sizeof(VFIOPCIDevice),
3632     .abstract = true,
3633     .class_init = vfio_pci_base_dev_class_init,
3634     .interfaces = (const InterfaceInfo[]) {
3635         { INTERFACE_PCIE_DEVICE },
3636         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
3637         { }
3638     },
3639 };
3640 
/*
 * PropertyInfo backing "x-migration-multifd-transfer".  It is filled in by
 * register_vfio_pci_dev_type() as a copy of qdev_prop_on_off_auto with
 * realized_set_allowed set to true.
 */
static PropertyInfo vfio_pci_migration_multifd_transfer_prop;
3642 
/*
 * qdev properties of TYPE_VFIO_PCI, installed by vfio_pci_dev_class_init()
 * through device_class_set_props().  The user-visible description of each
 * property is attached in that function as well.
 */
static const Property vfio_pci_dev_properties[] = {
    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
    DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
    DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
    DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
                            vbasedev.pre_copy_dirty_page_tracking,
                            ON_OFF_AUTO_ON),
    DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
                            vbasedev.device_dirty_page_tracking,
                            ON_OFF_AUTO_ON),
    DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
                            display, ON_OFF_AUTO_OFF),
    DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
    DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
                       intx.mmap_timeout, 1100),
    /* The following four properties are individual bits of vdev->features */
    DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
    DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_REQ_BIT, true),
    DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, true),
    DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
    DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
                            igd_legacy_mode, ON_OFF_AUTO_AUTO),
    DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
                            vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
    /*
     * Uses a dedicated PropertyInfo instead of the stock on/off/auto one;
     * see register_vfio_pci_dev_type() for how it is set up.
     */
    DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
                vbasedev.migration_multifd_transfer,
                vfio_pci_migration_multifd_transfer_prop, OnOffAuto,
                .set_default = true, .defval.i = ON_OFF_AUTO_AUTO),
    DEFINE_PROP_ON_OFF_AUTO("x-migration-load-config-after-iter", VFIOPCIDevice,
                            vbasedev.migration_load_config_after_iter,
                            ON_OFF_AUTO_AUTO),
    DEFINE_PROP_SIZE("x-migration-max-queued-buffers-size", VFIOPCIDevice,
                     vbasedev.migration_max_queued_buffers_size, UINT64_MAX),
    DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
                     vbasedev.migration_events, false),
    DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
    DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
                     vbasedev.ram_block_discard_allowed, false),
    DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
    DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
    DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
    DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
                     no_geforce_quirks, false),
    DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
                     false),
    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
                     false),
    /* PCI ID overrides; each defaults to PCI_ANY_ID */
    DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
                       sub_vendor_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                       sub_device_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-class-code", VFIOPCIDevice,
                       class_code, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
    /* No default here; vfio_instance_init() sets the field to 0xFF */
    DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
                                   nv_gpudirect_clique,
                                   qdev_prop_nv_gpudirect_clique, uint8_t),
    DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
                                OFF_AUTO_PCIBAR_OFF),
#ifdef CONFIG_IOMMUFD
    /* Only present when QEMU is built with CONFIG_IOMMUFD */
    DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
                     TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
#endif
    DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
};
3714 
#ifdef CONFIG_IOMMUFD
/*
 * Setter of the "fd" class property: forwards the string value to
 * vfio_device_set_fd() for the embedded VFIODevice.
 */
static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
{
    VFIODevice *vbasedev = &VFIO_PCI_BASE(obj)->vbasedev;

    vfio_device_set_fd(vbasedev, str, errp);
}
#endif
3722 
3723 static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data)
3724 {
3725     DeviceClass *dc = DEVICE_CLASS(klass);
3726     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3727 
3728     device_class_set_legacy_reset(dc, vfio_pci_reset);
3729     device_class_set_props(dc, vfio_pci_dev_properties);
3730 #ifdef CONFIG_IOMMUFD
3731     object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
3732 #endif
3733     dc->vmsd = &vfio_cpr_pci_vmstate;
3734     dc->desc = "VFIO-based PCI device assignment";
3735     pdc->realize = vfio_pci_realize;
3736 
3737     object_class_property_set_description(klass, /* 1.3 */
3738                                           "host",
3739                                           "Host PCI address [domain:]<bus:slot.function> of assigned device");
3740     object_class_property_set_description(klass, /* 1.3 */
3741                                           "x-intx-mmap-timeout-ms",
3742                                           "When EOI is not provided by KVM/QEMU, wait time "
3743                                           "(milliseconds) to re-enable device direct access "
3744                                           "after INTx (DEBUG)");
3745     object_class_property_set_description(klass, /* 1.5 */
3746                                           "x-vga",
3747                                           "Expose VGA address spaces for device");
3748     object_class_property_set_description(klass, /* 2.3 */
3749                                           "x-req",
3750                                           "Disable device request notification support (DEBUG)");
3751     object_class_property_set_description(klass, /* 2.4 and 2.5 */
3752                                           "x-no-mmap",
3753                                           "Disable MMAP for device. Allows to trace MMIO "
3754                                           "accesses (DEBUG)");
3755     object_class_property_set_description(klass, /* 2.5 */
3756                                           "x-no-kvm-intx",
3757                                           "Disable direct VFIO->KVM INTx injection. Allows to "
3758                                           "trace INTx interrupts (DEBUG)");
3759     object_class_property_set_description(klass, /* 2.5 */
3760                                           "x-no-kvm-msi",
3761                                           "Disable direct VFIO->KVM MSI injection. Allows to "
3762                                           "trace MSI interrupts (DEBUG)");
3763     object_class_property_set_description(klass, /* 2.5 */
3764                                           "x-no-kvm-msix",
3765                                           "Disable direct VFIO->KVM MSIx injection. Allows to "
3766                                           "trace MSIx interrupts (DEBUG)");
3767     object_class_property_set_description(klass, /* 2.5 */
3768                                           "x-pci-vendor-id",
3769                                           "Override PCI Vendor ID with provided value (DEBUG)");
3770     object_class_property_set_description(klass, /* 2.5 */
3771                                           "x-pci-device-id",
3772                                           "Override PCI device ID with provided value (DEBUG)");
3773     object_class_property_set_description(klass, /* 2.5 */
3774                                           "x-pci-sub-vendor-id",
3775                                           "Override PCI Subsystem Vendor ID with provided value "
3776                                           "(DEBUG)");
3777     object_class_property_set_description(klass, /* 2.5 */
3778                                           "x-pci-sub-device-id",
3779                                           "Override PCI Subsystem Device ID with provided value "
3780                                           "(DEBUG)");
3781     object_class_property_set_description(klass, /* 2.6 */
3782                                           "sysfsdev",
3783                                           "Host sysfs path of assigned device");
3784     object_class_property_set_description(klass, /* 2.7 */
3785                                           "x-igd-opregion",
3786                                           "Expose host IGD OpRegion to guest");
3787     object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */
3788                                           "x-igd-gms",
3789                                           "Override IGD data stolen memory size (32MiB units)");
3790     object_class_property_set_description(klass, /* 2.11 */
3791                                           "x-nv-gpudirect-clique",
3792                                           "Add NVIDIA GPUDirect capability indicating P2P DMA "
3793                                           "clique for device [0-15]");
3794     object_class_property_set_description(klass, /* 2.12 */
3795                                           "x-no-geforce-quirks",
3796                                           "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). "
3797                                           "Improves performance");
3798     object_class_property_set_description(klass, /* 2.12 */
3799                                           "display",
3800                                           "Enable display support for device, ex. vGPU");
3801     object_class_property_set_description(klass, /* 2.12 */
3802                                           "x-msix-relocation",
3803                                           "Specify MSI-X MMIO relocation to the end of specified "
3804                                           "existing BAR or new BAR to avoid virtualization overhead "
3805                                           "due to adjacent device registers");
3806     object_class_property_set_description(klass, /* 3.0 */
3807                                           "x-no-kvm-ioeventfd",
3808                                           "Disable registration of ioeventfds with KVM (DEBUG)");
3809     object_class_property_set_description(klass, /* 3.0 */
3810                                           "x-no-vfio-ioeventfd",
3811                                           "Disable linking of KVM ioeventfds to VFIO ioeventfds "
3812                                           "(DEBUG)");
3813     object_class_property_set_description(klass, /* 3.1 */
3814                                           "x-balloon-allowed",
3815                                           "Override allowing ballooning with device (DEBUG, DANGER)");
3816     object_class_property_set_description(klass, /* 3.2 */
3817                                           "xres",
3818                                           "Set X display resolution the vGPU should use");
3819     object_class_property_set_description(klass, /* 3.2 */
3820                                           "yres",
3821                                           "Set Y display resolution the vGPU should use");
3822     object_class_property_set_description(klass, /* 5.2 */
3823                                           "x-pre-copy-dirty-page-tracking",
3824                                           "Disable dirty pages tracking during iterative phase "
3825                                           "(DEBUG)");
3826     object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */
3827                                           "enable-migration",
3828                                           "Enale device migration. Also requires a host VFIO PCI "
3829                                           "variant or mdev driver with migration support enabled");
3830     object_class_property_set_description(klass, /* 8.1 */
3831                                           "vf-token",
3832                                           "Specify UUID VF token. Required for VF when PF is owned "
3833                                           "by another VFIO driver");
3834 #ifdef CONFIG_IOMMUFD
3835     object_class_property_set_description(klass, /* 9.0 */
3836                                           "iommufd",
3837                                           "Set host IOMMUFD backend device");
3838 #endif
3839     object_class_property_set_description(klass, /* 9.1 */
3840                                           "x-device-dirty-page-tracking",
3841                                           "Disable device dirty page tracking and use "
3842                                           "container-based dirty page tracking");
3843     object_class_property_set_description(klass, /* 9.1 */
3844                                           "migration-events",
3845                                           "Emit VFIO migration QAPI event when a VFIO device "
3846                                           "changes its migration state. For management applications");
3847     object_class_property_set_description(klass, /* 9.1 */
3848                                           "skip-vsc-check",
3849                                           "Skip config space check for Vendor Specific Capability. "
3850                                           "Setting to false will enforce strict checking of VSC content "
3851                                           "(DEBUG)");
3852     object_class_property_set_description(klass, /* 10.0 */
3853                                           "x-migration-multifd-transfer",
3854                                           "Transfer this device state via "
3855                                           "multifd channels when live migrating it");
3856     object_class_property_set_description(klass, /* 10.1 */
3857                                           "x-migration-load-config-after-iter",
3858                                           "Start the config load only after "
3859                                           "all iterables were loaded (during "
3860                                           "non-iterables loading phase) when "
3861                                           "doing live migration of device state "
3862                                           "via multifd channels");
3863     object_class_property_set_description(klass, /* 10.1 */
3864                                           "x-migration-max-queued-buffers-size",
3865                                           "Maximum size of in-flight VFIO "
3866                                           "device state buffers queued at the "
3867                                           "destination when doing live "
3868                                           "migration of device state via "
3869                                           "multifd channels");
3870 }
3871 
3872 static const TypeInfo vfio_pci_dev_info = {
3873     .name = TYPE_VFIO_PCI,
3874     .parent = TYPE_VFIO_PCI_BASE,
3875     .class_init = vfio_pci_dev_class_init,
3876     .instance_init = vfio_instance_init,
3877     .instance_finalize = vfio_instance_finalize,
3878 };
3879 
/*
 * Properties that exist only on TYPE_VFIO_PCI_NOHOTPLUG; installed by
 * vfio_pci_nohotplug_dev_class_init(), which also attaches their
 * descriptions.
 */
static const Property vfio_pci_dev_nohotplug_properties[] = {
    DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
    DEFINE_PROP_BOOL("use-legacy-x86-rom", VFIOPCIDevice,
                     use_legacy_x86_rom, false),
    DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate,
                            ON_OFF_AUTO_AUTO),
};
3887 
3888 static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass,
3889                                               const void *data)
3890 {
3891     DeviceClass *dc = DEVICE_CLASS(klass);
3892 
3893     device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
3894     dc->hotpluggable = false;
3895 
3896     object_class_property_set_description(klass, /* 3.1 */
3897                                           "ramfb",
3898                                           "Enable ramfb to provide pre-boot graphics for devices "
3899                                           "enabling display option");
3900     object_class_property_set_description(klass, /* 8.2 */
3901                                           "x-ramfb-migrate",
3902                                           "Override default migration support for ramfb support "
3903                                           "(DEBUG)");
3904     object_class_property_set_description(klass, /* 10.1 */
3905                                           "use-legacy-x86-rom",
3906                                           "Controls loading of a legacy VGA BIOS ROM");
3907 }
3908 
3909 static const TypeInfo vfio_pci_nohotplug_dev_info = {
3910     .name = TYPE_VFIO_PCI_NOHOTPLUG,
3911     .parent = TYPE_VFIO_PCI,
3912     .instance_size = sizeof(VFIOPCIDevice),
3913     .class_init = vfio_pci_nohotplug_dev_class_init,
3914 };
3915 
3916 static void register_vfio_pci_dev_type(void)
3917 {
3918     /*
3919      * Ordinary ON_OFF_AUTO property isn't runtime-mutable, but source VM can
3920      * run for a long time before being migrated so it is desirable to have a
3921      * fallback mechanism to the old way of transferring VFIO device state if
3922      * it turns to be necessary.
3923      * The following makes this type of property have the same mutability level
3924      * as ordinary migration parameters.
3925      */
3926     vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto;
3927     vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true;
3928 
3929     type_register_static(&vfio_pci_base_dev_info);
3930     type_register_static(&vfio_pci_dev_info);
3931     type_register_static(&vfio_pci_nohotplug_dev_info);
3932 }
3933 
3934 type_init(register_vfio_pci_dev_type)
3935