xref: /openbmc/qemu/hw/vfio/pci.c (revision 668f62ec)
1 /*
2  * vfio based device assignment support
3  *
4  * Copyright Red Hat, Inc. 2012
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Based on qemu-kvm device-assignment:
13  *  Adapted for KVM by Qumranet.
14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19  */
20 
21 #include "qemu/osdep.h"
22 #include <linux/vfio.h>
23 #include <sys/ioctl.h>
24 
25 #include "hw/hw.h"
26 #include "hw/pci/msi.h"
27 #include "hw/pci/msix.h"
28 #include "hw/pci/pci_bridge.h"
29 #include "hw/qdev-properties.h"
30 #include "migration/vmstate.h"
31 #include "qemu/error-report.h"
32 #include "qemu/main-loop.h"
33 #include "qemu/module.h"
34 #include "qemu/option.h"
35 #include "qemu/range.h"
36 #include "qemu/units.h"
37 #include "sysemu/kvm.h"
38 #include "sysemu/runstate.h"
39 #include "sysemu/sysemu.h"
40 #include "pci.h"
41 #include "trace.h"
42 #include "qapi/error.h"
43 #include "migration/blocker.h"
44 
45 #define TYPE_VFIO_PCI "vfio-pci"
46 #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
47 
48 #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
49 
50 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
51 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
52 
53 /*
54  * Disabling BAR mmapping can be slow, but toggling it around INTx can
55  * also be a huge overhead.  We try to get the best of both worlds by
56  * waiting until an interrupt occurs to disable mmaps (subsequent transitions
57  * to the same state are effectively no overhead).  If the interrupt has
58  * been serviced and the time gap is long enough, we re-enable mmaps for
59  * performance.  This works well for things like graphics cards, which
60  * may not use their interrupt at all and are penalized to an unusable
61  * level by read/write BAR traps.  Other devices, like NICs, have more
62  * regular interrupts and see much better latency by staying in non-mmap
63  * mode.  We therefore set the default mmap_timeout such that a ping
64  * is just enough to keep the mmap disabled.  Users can experiment with
65  * other options with the x-intx-mmap-timeout-ms parameter (a value of
66  * zero disables the timer).
67  */
68 static void vfio_intx_mmap_enable(void *opaque)
69 {
70     VFIOPCIDevice *vdev = opaque;
71 
72     if (vdev->intx.pending) {
73         timer_mod(vdev->intx.mmap_timer,
74                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
75         return;
76     }
77 
78     vfio_mmap_set_enabled(vdev, true);
79 }
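
/*
 * Usage sketch (illustrative, not part of this file): the mmap timeout
 * described above is tuned per device via the x-intx-mmap-timeout-ms
 * property on the QEMU command line; the device address below is a
 * placeholder.
 *
 *   -device vfio-pci,host=0000:01:00.0,x-intx-mmap-timeout-ms=0
 *
 * A value of zero disables the re-enable timer entirely.
 */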
80 
81 static void vfio_intx_interrupt(void *opaque)
82 {
83     VFIOPCIDevice *vdev = opaque;
84 
85     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
86         return;
87     }
88 
89     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
90 
91     vdev->intx.pending = true;
92     pci_irq_assert(&vdev->pdev);
93     vfio_mmap_set_enabled(vdev, false);
94     if (vdev->intx.mmap_timeout) {
95         timer_mod(vdev->intx.mmap_timer,
96                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
97     }
98 }
99 
100 static void vfio_intx_eoi(VFIODevice *vbasedev)
101 {
102     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
103 
104     if (!vdev->intx.pending) {
105         return;
106     }
107 
108     trace_vfio_intx_eoi(vbasedev->name);
109 
110     vdev->intx.pending = false;
111     pci_irq_deassert(&vdev->pdev);
112     vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
113 }
114 
115 static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
116 {
117 #ifdef CONFIG_KVM
118     int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
119 
120     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
121         vdev->intx.route.mode != PCI_INTX_ENABLED ||
122         !kvm_resamplefds_enabled()) {
123         return;
124     }
125 
126     /* Get to a known interrupt state */
127     qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
128     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
129     vdev->intx.pending = false;
130     pci_irq_deassert(&vdev->pdev);
131 
132     /* Get an eventfd for resample/unmask */
133     if (event_notifier_init(&vdev->intx.unmask, 0)) {
134         error_setg(errp, "event_notifier_init failed eoi");
135         goto fail;
136     }
137 
138     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
139                                            &vdev->intx.interrupt,
140                                            &vdev->intx.unmask,
141                                            vdev->intx.route.irq)) {
142         error_setg_errno(errp, errno, "failed to setup resample irqfd");
143         goto fail_irqfd;
144     }
145 
146     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
147                                VFIO_IRQ_SET_ACTION_UNMASK,
148                                event_notifier_get_fd(&vdev->intx.unmask),
149                                errp)) {
150         goto fail_vfio;
151     }
152 
153     /* Let'em rip */
154     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
155 
156     vdev->intx.kvm_accel = true;
157 
158     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
159 
160     return;
161 
162 fail_vfio:
163     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
164                                           vdev->intx.route.irq);
165 fail_irqfd:
166     event_notifier_cleanup(&vdev->intx.unmask);
167 fail:
168     qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
169     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
170 #endif
171 }
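
/*
 * Signal-path sketch of the KVM INTx bypass configured above (derived
 * from the calls in vfio_intx_enable_kvm(); not a normative diagram):
 *
 *   device INTx -> vfio interrupt eventfd -> KVM irqfd -> guest IRQ
 *   guest EOI   -> KVM resamplefd (intx.unmask) -> VFIO unmasks INTx
 *
 * QEMU stays out of the hot path; it only reclaims the eventfd handler
 * on the failure paths here or in vfio_intx_disable_kvm().
 */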
172 
173 static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
174 {
175 #ifdef CONFIG_KVM
176     if (!vdev->intx.kvm_accel) {
177         return;
178     }
179 
180     /*
181      * Get to a known state, hardware masked, QEMU ready to accept new
182      * interrupts, QEMU IRQ de-asserted.
183      */
184     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
185     vdev->intx.pending = false;
186     pci_irq_deassert(&vdev->pdev);
187 
188     /* Tell KVM to stop listening for an INTx irqfd */
189     if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
190                                               vdev->intx.route.irq)) {
191         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
192     }
193 
194     /* We only need to close the eventfd for VFIO to clean up the kernel side */
195     event_notifier_cleanup(&vdev->intx.unmask);
196 
197     /* QEMU starts listening for interrupt events. */
198     qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
199                         vfio_intx_interrupt, NULL, vdev);
200 
201     vdev->intx.kvm_accel = false;
202 
203     /* If we've missed an event, let it re-fire through QEMU */
204     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
205 
206     trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
207 #endif
208 }
209 
210 static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
211 {
212     Error *err = NULL;
213 
214     trace_vfio_intx_update(vdev->vbasedev.name,
215                            vdev->intx.route.irq, route->irq);
216 
217     vfio_intx_disable_kvm(vdev);
218 
219     vdev->intx.route = *route;
220 
221     if (route->mode != PCI_INTX_ENABLED) {
222         return;
223     }
224 
225     vfio_intx_enable_kvm(vdev, &err);
226     if (err) {
227         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
228     }
229 
230     /* Re-enable the interrupt in case we missed an EOI */
231     vfio_intx_eoi(&vdev->vbasedev);
232 }
233 
234 static void vfio_intx_routing_notifier(PCIDevice *pdev)
235 {
236     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
237     PCIINTxRoute route;
238 
239     if (vdev->interrupt != VFIO_INT_INTx) {
240         return;
241     }
242 
243     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
244 
245     if (pci_intx_route_changed(&vdev->intx.route, &route)) {
246         vfio_intx_update(vdev, &route);
247     }
248 }
249 
250 static void vfio_irqchip_change(Notifier *notify, void *data)
251 {
252     VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
253                                        irqchip_change_notifier);
254 
255     vfio_intx_update(vdev, &vdev->intx.route);
256 }
257 
258 static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
259 {
260     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
261     Error *err = NULL;
262     int32_t fd;
263     int ret;
264 
265 
266     if (!pin) {
267         return 0;
268     }
269 
270     vfio_disable_interrupts(vdev);
271 
272     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
273     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
274 
275 #ifdef CONFIG_KVM
276     /*
277      * This is only conditional to avoid generating error messages on platforms
278      * where we won't actually use the result anyway.
279      */
280     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
281         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
282                                                         vdev->intx.pin);
283     }
284 #endif
285 
286     ret = event_notifier_init(&vdev->intx.interrupt, 0);
287     if (ret) {
288         error_setg_errno(errp, -ret, "event_notifier_init failed");
289         return ret;
290     }
291     fd = event_notifier_get_fd(&vdev->intx.interrupt);
292     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
293 
294     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
295                                VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
296         error_propagate(errp, err);
297         qemu_set_fd_handler(fd, NULL, NULL, vdev);
298         event_notifier_cleanup(&vdev->intx.interrupt);
299         return -errno;
300     }
301 
302     vfio_intx_enable_kvm(vdev, &err);
303     if (err) {
304         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
305     }
306 
307     vdev->interrupt = VFIO_INT_INTx;
308 
309     trace_vfio_intx_enable(vdev->vbasedev.name);
310     return 0;
311 }
312 
313 static void vfio_intx_disable(VFIOPCIDevice *vdev)
314 {
315     int fd;
316 
317     timer_del(vdev->intx.mmap_timer);
318     vfio_intx_disable_kvm(vdev);
319     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
320     vdev->intx.pending = false;
321     pci_irq_deassert(&vdev->pdev);
322     vfio_mmap_set_enabled(vdev, true);
323 
324     fd = event_notifier_get_fd(&vdev->intx.interrupt);
325     qemu_set_fd_handler(fd, NULL, NULL, vdev);
326     event_notifier_cleanup(&vdev->intx.interrupt);
327 
328     vdev->interrupt = VFIO_INT_NONE;
329 
330     trace_vfio_intx_disable(vdev->vbasedev.name);
331 }
332 
333 /*
334  * MSI/X
335  */
336 static void vfio_msi_interrupt(void *opaque)
337 {
338     VFIOMSIVector *vector = opaque;
339     VFIOPCIDevice *vdev = vector->vdev;
340     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
341     void (*notify)(PCIDevice *dev, unsigned vector);
342     MSIMessage msg;
343     int nr = vector - vdev->msi_vectors;
344 
345     if (!event_notifier_test_and_clear(&vector->interrupt)) {
346         return;
347     }
348 
349     if (vdev->interrupt == VFIO_INT_MSIX) {
350         get_msg = msix_get_message;
351         notify = msix_notify;
352 
353         /* A masked vector firing needs to use the PBA, so enable it */
354         if (msix_is_masked(&vdev->pdev, nr)) {
355             set_bit(nr, vdev->msix->pending);
356             memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
357             trace_vfio_msix_pba_enable(vdev->vbasedev.name);
358         }
359     } else if (vdev->interrupt == VFIO_INT_MSI) {
360         get_msg = msi_get_message;
361         notify = msi_notify;
362     } else {
363         abort();
364     }
365 
366     msg = get_msg(&vdev->pdev, nr);
367     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
368     notify(&vdev->pdev, nr);
369 }
370 
371 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
372 {
373     struct vfio_irq_set *irq_set;
374     int ret = 0, i, argsz;
375     int32_t *fds;
376 
377     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
378 
379     irq_set = g_malloc0(argsz);
380     irq_set->argsz = argsz;
381     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
382     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
383     irq_set->start = 0;
384     irq_set->count = vdev->nr_vectors;
385     fds = (int32_t *)&irq_set->data;
386 
387     for (i = 0; i < vdev->nr_vectors; i++) {
388         int fd = -1;
389 
390         /*
391          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
392          * bits, therefore we always use the KVM signaling path when it is set up.
393          * MSI-X mask and pending bits are emulated, so we want to use the
394          * KVM signaling path only when configured and unmasked.
395          */
396         if (vdev->msi_vectors[i].use) {
397             if (vdev->msi_vectors[i].virq < 0 ||
398                 (msix && msix_is_masked(&vdev->pdev, i))) {
399                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
400             } else {
401                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
402             }
403         }
404 
405         fds[i] = fd;
406     }
407 
408     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
409 
410     g_free(irq_set);
411 
412     return ret;
413 }
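
/*
 * Layout sketch of the VFIO_DEVICE_SET_IRQS buffer built above (per
 * <linux/vfio.h>; the eventfds travel in the flexible data array):
 *
 *   struct vfio_irq_set hdr;   argsz / flags / index / start / count
 *   int32_t fds[count];        one eventfd per vector, -1 = unassigned
 *
 * Masked MSI-X vectors get the QEMU-side eventfd so a vector firing
 * while masked can set the emulated PBA; unmasked vectors get the KVM
 * irqfd for the bypass path.
 */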
414 
415 static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
416                                   int vector_n, bool msix)
417 {
418     int virq;
419 
420     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
421         return;
422     }
423 
424     if (event_notifier_init(&vector->kvm_interrupt, 0)) {
425         return;
426     }
427 
428     virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
429     if (virq < 0) {
430         event_notifier_cleanup(&vector->kvm_interrupt);
431         return;
432     }
433 
434     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
435                                        NULL, virq) < 0) {
436         kvm_irqchip_release_virq(kvm_state, virq);
437         event_notifier_cleanup(&vector->kvm_interrupt);
438         return;
439     }
440 
441     vector->virq = virq;
442 }
443 
444 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
445 {
446     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
447                                           vector->virq);
448     kvm_irqchip_release_virq(kvm_state, vector->virq);
449     vector->virq = -1;
450     event_notifier_cleanup(&vector->kvm_interrupt);
451 }
452 
453 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
454                                      PCIDevice *pdev)
455 {
456     kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
457     kvm_irqchip_commit_routes(kvm_state);
458 }
459 
460 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
461                                    MSIMessage *msg, IOHandler *handler)
462 {
463     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
464     VFIOMSIVector *vector;
465     int ret;
466 
467     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
468 
469     vector = &vdev->msi_vectors[nr];
470 
471     if (!vector->use) {
472         vector->vdev = vdev;
473         vector->virq = -1;
474         if (event_notifier_init(&vector->interrupt, 0)) {
475             error_report("vfio: Error: event_notifier_init failed");
476         }
477         vector->use = true;
478         msix_vector_use(pdev, nr);
479     }
480 
481     qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
482                         handler, NULL, vector);
483 
484     /*
485      * Attempt to enable route through KVM irqchip,
486      * default to userspace handling if unavailable.
487      */
488     if (vector->virq >= 0) {
489         if (!msg) {
490             vfio_remove_kvm_msi_virq(vector);
491         } else {
492             vfio_update_kvm_msi_virq(vector, *msg, pdev);
493         }
494     } else {
495         if (msg) {
496             vfio_add_kvm_msi_virq(vdev, vector, nr, true);
497         }
498     }
499 
500     /*
501      * We don't want to have the host allocate all possible MSI vectors
502      * for a device if they're not in use, so we shut down and incrementally
503      * increase them as needed.
504      */
505     if (vdev->nr_vectors < nr + 1) {
506         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
507         vdev->nr_vectors = nr + 1;
508         ret = vfio_enable_vectors(vdev, true);
509         if (ret) {
510             error_report("vfio: failed to enable vectors, %d", ret);
511         }
512     } else {
513         Error *err = NULL;
514         int32_t fd;
515 
516         if (vector->virq >= 0) {
517             fd = event_notifier_get_fd(&vector->kvm_interrupt);
518         } else {
519             fd = event_notifier_get_fd(&vector->interrupt);
520         }
521 
522         if (vfio_set_irq_signaling(&vdev->vbasedev,
523                                      VFIO_PCI_MSIX_IRQ_INDEX, nr,
524                                      VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
525             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
526         }
527     }
528 
529     /* Disable PBA emulation when nothing more is pending. */
530     clear_bit(nr, vdev->msix->pending);
531     if (find_first_bit(vdev->msix->pending,
532                        vdev->nr_vectors) == vdev->nr_vectors) {
533         memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
534         trace_vfio_msix_pba_disable(vdev->vbasedev.name);
535     }
536 
537     return 0;
538 }
539 
540 static int vfio_msix_vector_use(PCIDevice *pdev,
541                                 unsigned int nr, MSIMessage msg)
542 {
543     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
544 }
545 
546 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
547 {
548     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
549     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
550 
551     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
552 
553     /*
554      * There are still old guests that mask and unmask vectors on every
555      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
556      * the KVM setup in place, simply switch VFIO to use the non-bypass
557      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
558      * core will mask the interrupt and set pending bits, allowing it to
559      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
560      */
561     if (vector->virq >= 0) {
562         int32_t fd = event_notifier_get_fd(&vector->interrupt);
563         Error *err = NULL;
564 
565         if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
566                                    VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
567             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
568         }
569     }
570 }
571 
572 static void vfio_msix_enable(VFIOPCIDevice *vdev)
573 {
574     vfio_disable_interrupts(vdev);
575 
576     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
577 
578     vdev->interrupt = VFIO_INT_MSIX;
579 
580     /*
581      * Some communication channels between VF & PF or PF & fw rely on the
582      * physical state of the device and expect that enabling MSI-X from the
583      * guest enables the same on the host.  When our guest is Linux, the
584      * guest driver call to pci_enable_msix() sets the enabling bit in the
585      * MSI-X capability, but leaves the vector table masked.  We therefore
586      * can't rely on a vector_use callback (from request_irq() in the guest)
587      * to switch the physical device into MSI-X mode because that may come a
588      * long time after pci_enable_msix().  This code enables vector 0 with
589      * triggering to userspace, then immediately releases the vector, leaving
590      * the physical device with no vectors enabled, but MSI-X enabled, just
591      * like the guest view.
592      */
593     vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
594     vfio_msix_vector_release(&vdev->pdev, 0);
595 
596     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
597                                   vfio_msix_vector_release, NULL)) {
598         error_report("vfio: msix_set_vector_notifiers failed");
599     }
600 
601     trace_vfio_msix_enable(vdev->vbasedev.name);
602 }
603 
604 static void vfio_msi_enable(VFIOPCIDevice *vdev)
605 {
606     int ret, i;
607 
608     vfio_disable_interrupts(vdev);
609 
610     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
611 retry:
612     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
613 
614     for (i = 0; i < vdev->nr_vectors; i++) {
615         VFIOMSIVector *vector = &vdev->msi_vectors[i];
616 
617         vector->vdev = vdev;
618         vector->virq = -1;
619         vector->use = true;
620 
621         if (event_notifier_init(&vector->interrupt, 0)) {
622             error_report("vfio: Error: event_notifier_init failed");
623         }
624 
625         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
626                             vfio_msi_interrupt, NULL, vector);
627 
628         /*
629          * Attempt to enable route through KVM irqchip,
630          * default to userspace handling if unavailable.
631          */
632         vfio_add_kvm_msi_virq(vdev, vector, i, false);
633     }
634 
635     /* Set interrupt type prior to possible interrupts */
636     vdev->interrupt = VFIO_INT_MSI;
637 
638     ret = vfio_enable_vectors(vdev, false);
639     if (ret) {
640         if (ret < 0) {
641             error_report("vfio: Error: Failed to setup MSI fds: %m");
642         } else if (ret != vdev->nr_vectors) {
643             error_report("vfio: Error: Failed to enable %d "
644                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
645         }
646 
647         for (i = 0; i < vdev->nr_vectors; i++) {
648             VFIOMSIVector *vector = &vdev->msi_vectors[i];
649             if (vector->virq >= 0) {
650                 vfio_remove_kvm_msi_virq(vector);
651             }
652             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
653                                 NULL, NULL, NULL);
654             event_notifier_cleanup(&vector->interrupt);
655         }
656 
657         g_free(vdev->msi_vectors);
658         vdev->msi_vectors = NULL;
659 
660         if (ret > 0 && ret != vdev->nr_vectors) {
661             vdev->nr_vectors = ret;
662             goto retry;
663         }
664         vdev->nr_vectors = 0;
665 
666         /*
667      * Failing to set up MSI doesn't really fall within any specification.
668      * Let's try leaving interrupts disabled and hope the guest figures
669      * out how to fall back to INTx for this device.
670          */
671         error_report("vfio: Error: Failed to enable MSI");
672         vdev->interrupt = VFIO_INT_NONE;
673 
674         return;
675     }
676 
677     trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
678 }
679 
680 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
681 {
682     Error *err = NULL;
683     int i;
684 
685     for (i = 0; i < vdev->nr_vectors; i++) {
686         VFIOMSIVector *vector = &vdev->msi_vectors[i];
687         if (vdev->msi_vectors[i].use) {
688             if (vector->virq >= 0) {
689                 vfio_remove_kvm_msi_virq(vector);
690             }
691             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
692                                 NULL, NULL, NULL);
693             event_notifier_cleanup(&vector->interrupt);
694         }
695     }
696 
697     g_free(vdev->msi_vectors);
698     vdev->msi_vectors = NULL;
699     vdev->nr_vectors = 0;
700     vdev->interrupt = VFIO_INT_NONE;
701 
702     vfio_intx_enable(vdev, &err);
703     if (err) {
704         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
705     }
706 }
707 
708 static void vfio_msix_disable(VFIOPCIDevice *vdev)
709 {
710     int i;
711 
712     msix_unset_vector_notifiers(&vdev->pdev);
713 
714     /*
715      * MSI-X will only release vectors if MSI-X is still enabled on the
716      * device, so check through the rest and release them ourselves if necessary.
717      */
718     for (i = 0; i < vdev->nr_vectors; i++) {
719         if (vdev->msi_vectors[i].use) {
720             vfio_msix_vector_release(&vdev->pdev, i);
721             msix_vector_unuse(&vdev->pdev, i);
722         }
723     }
724 
725     if (vdev->nr_vectors) {
726         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
727     }
728 
729     vfio_msi_disable_common(vdev);
730 
731     memset(vdev->msix->pending, 0,
732            BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
733 
734     trace_vfio_msix_disable(vdev->vbasedev.name);
735 }
736 
737 static void vfio_msi_disable(VFIOPCIDevice *vdev)
738 {
739     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
740     vfio_msi_disable_common(vdev);
741 
742     trace_vfio_msi_disable(vdev->vbasedev.name);
743 }
744 
745 static void vfio_update_msi(VFIOPCIDevice *vdev)
746 {
747     int i;
748 
749     for (i = 0; i < vdev->nr_vectors; i++) {
750         VFIOMSIVector *vector = &vdev->msi_vectors[i];
751         MSIMessage msg;
752 
753         if (!vector->use || vector->virq < 0) {
754             continue;
755         }
756 
757         msg = msi_get_message(&vdev->pdev, i);
758         vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
759     }
760 }
761 
762 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
763 {
764     struct vfio_region_info *reg_info;
765     uint64_t size;
766     off_t off = 0;
767     ssize_t bytes;
768 
769     if (vfio_get_region_info(&vdev->vbasedev,
770                              VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
771         error_report("vfio: Error getting ROM info: %m");
772         return;
773     }
774 
775     trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
776                             (unsigned long)reg_info->offset,
777                             (unsigned long)reg_info->flags);
778 
779     vdev->rom_size = size = reg_info->size;
780     vdev->rom_offset = reg_info->offset;
781 
782     g_free(reg_info);
783 
784     if (!vdev->rom_size) {
785         vdev->rom_read_failed = true;
786         error_report("vfio-pci: Cannot read device rom at "
787                     "%s", vdev->vbasedev.name);
788         error_printf("Device option ROM contents are probably invalid "
789                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
790                     "or load from file with romfile=\n");
791         return;
792     }
793 
794     vdev->rom = g_malloc(size);
795     memset(vdev->rom, 0xff, size);
796 
797     while (size) {
798         bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
799                       size, vdev->rom_offset + off);
800         if (bytes == 0) {
801             break;
802         } else if (bytes > 0) {
803             off += bytes;
804             size -= bytes;
805         } else {
806             if (errno == EINTR || errno == EAGAIN) {
807                 continue;
808             }
809             error_report("vfio: Error reading device ROM: %m");
810             break;
811         }
812     }
813 
814     /*
815      * Test the ROM signature against our device, if the vendor is correct
816      * but the device ID doesn't match, store the correct device ID and
817      * recompute the checksum.  Intel IGD devices need this and are known
818      * to have bogus checksums so we can't simply adjust the checksum.
819      */
820     if (pci_get_word(vdev->rom) == 0xaa55 &&
821         pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
822         !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
823         uint16_t vid, did;
824 
825         vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
826         did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);
827 
828         if (vid == vdev->vendor_id && did != vdev->device_id) {
829             int i;
830             uint8_t csum, *data = vdev->rom;
831 
832             pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
833                          vdev->device_id);
834             data[6] = 0;
835 
836             for (csum = 0, i = 0; i < vdev->rom_size; i++) {
837                 csum += data[i];
838             }
839 
840             data[6] = -csum;
841         }
842     }
843 }
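
/*
 * Illustrative helper (hypothetical; this file open-codes the same loop
 * above): the option ROM checksum rule applied here is that all bytes
 * of the image sum to zero modulo 256, with the byte at offset 6 (as
 * used by the code above) reserved to make that hold.
 */
static inline uint8_t vfio_rom_checksum_byte(uint8_t *data, size_t len)
{
    uint8_t csum = 0;
    size_t i;

    data[6] = 0;                /* checksum byte contributes zero to the sum */
    for (i = 0; i < len; i++) {
        csum += data[i];
    }
    return -csum;               /* value that makes the total wrap to zero */
}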
844 
845 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
846 {
847     VFIOPCIDevice *vdev = opaque;
848     union {
849         uint8_t byte;
850         uint16_t word;
851         uint32_t dword;
852         uint64_t qword;
853     } val = { 0 }; /* zero-fill so bytes the copy below doesn't cover are defined */
854     uint64_t data = 0;
855 
856     /* Load the ROM lazily when the guest tries to read it */
857     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
858         vfio_pci_load_rom(vdev);
859     }
860 
861     memcpy(&val, vdev->rom + addr,
862            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
863 
864     switch (size) {
865     case 1:
866         data = val.byte;
867         break;
868     case 2:
869         data = le16_to_cpu(val.word);
870         break;
871     case 4:
872         data = le32_to_cpu(val.dword);
873         break;
874     default:
875         hw_error("vfio: unsupported read size, %d bytes\n", size);
876         break;
877     }
878 
879     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
880 
881     return data;
882 }
883 
884 static void vfio_rom_write(void *opaque, hwaddr addr,
885                            uint64_t data, unsigned size)
886 {
887 }
888 
889 static const MemoryRegionOps vfio_rom_ops = {
890     .read = vfio_rom_read,
891     .write = vfio_rom_write,
892     .endianness = DEVICE_LITTLE_ENDIAN,
893 };
894 
895 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
896 {
897     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
898     off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
899     DeviceState *dev = DEVICE(vdev);
900     char *name;
901     int fd = vdev->vbasedev.fd;
902 
903     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
904         /* Since pci handles romfile, just print a message and return */
905         if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
906             warn_report("Device at %s is known to cause system instability"
907                         " issues during option rom execution",
908                         vdev->vbasedev.name);
909             error_printf("Proceeding anyway since user specified romfile\n");
910         }
911         return;
912     }
913 
914     /*
915      * Use the same size ROM BAR as the physical device.  The contents
916      * will get filled in later when the guest tries to read it.
917      */
918     if (pread(fd, &orig, 4, offset) != 4 ||
919         pwrite(fd, &size, 4, offset) != 4 ||
920         pread(fd, &size, 4, offset) != 4 ||
921         pwrite(fd, &orig, 4, offset) != 4) {
922         error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
923         return;
924     }
925 
926     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
927 
928     if (!size) {
929         return;
930     }
931 
932     if (vfio_blacklist_opt_rom(vdev)) {
933         if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
934             warn_report("Device at %s is known to cause system instability"
935                         " issues during option rom execution",
936                         vdev->vbasedev.name);
937             error_printf("Proceeding anyway since user specified"
938                          " non zero value for rombar\n");
939         } else {
940             warn_report("Rom loading for device at %s has been disabled"
941                         " due to system instability issues",
942                         vdev->vbasedev.name);
943             error_printf("Specify rombar=1 or romfile to force\n");
944             return;
945         }
946     }
947 
948     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
949 
950     name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);
951 
952     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
953                           &vfio_rom_ops, vdev, name, size);
954     g_free(name);
955 
956     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
957                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
958 
959     vdev->rom_read_failed = false;
960 }
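
/*
 * Worked example of the sizing probe above (readback value hypothetical):
 * if the device returns 0xfffe0000 after the all-ones write, then
 *
 *   size = ~(0xfffe0000 & PCI_ROM_ADDRESS_MASK) + 1 = 0x20000 (128 KiB)
 *
 * i.e. the device hardwires the low address bits to zero and the first
 * writable bit reveals the BAR size.
 */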
961 
962 void vfio_vga_write(void *opaque, hwaddr addr,
963                            uint64_t data, unsigned size)
964 {
965     VFIOVGARegion *region = opaque;
966     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
967     union {
968         uint8_t byte;
969         uint16_t word;
970         uint32_t dword;
971         uint64_t qword;
972     } buf;
973     off_t offset = vga->fd_offset + region->offset + addr;
974 
975     switch (size) {
976     case 1:
977         buf.byte = data;
978         break;
979     case 2:
980         buf.word = cpu_to_le16(data);
981         break;
982     case 4:
983         buf.dword = cpu_to_le32(data);
984         break;
985     default:
986         hw_error("vfio: unsupported write size, %d bytes", size);
987         break;
988     }
989 
990     if (pwrite(vga->fd, &buf, size, offset) != size) {
991         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
992                      __func__, region->offset + addr, data, size);
993     }
994 
995     trace_vfio_vga_write(region->offset + addr, data, size);
996 }
997 
998 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
999 {
1000     VFIOVGARegion *region = opaque;
1001     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1002     union {
1003         uint8_t byte;
1004         uint16_t word;
1005         uint32_t dword;
1006         uint64_t qword;
1007     } buf;
1008     uint64_t data = 0;
1009     off_t offset = vga->fd_offset + region->offset + addr;
1010 
1011     if (pread(vga->fd, &buf, size, offset) != size) {
1012         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1013                      __func__, region->offset + addr, size);
1014         return (uint64_t)-1;
1015     }
1016 
1017     switch (size) {
1018     case 1:
1019         data = buf.byte;
1020         break;
1021     case 2:
1022         data = le16_to_cpu(buf.word);
1023         break;
1024     case 4:
1025         data = le32_to_cpu(buf.dword);
1026         break;
1027     default:
1028         hw_error("vfio: unsupported read size, %d bytes", size);
1029         break;
1030     }
1031 
1032     trace_vfio_vga_read(region->offset + addr, size, data);
1033 
1034     return data;
1035 }
1036 
1037 static const MemoryRegionOps vfio_vga_ops = {
1038     .read = vfio_vga_read,
1039     .write = vfio_vga_write,
1040     .endianness = DEVICE_LITTLE_ENDIAN,
1041 };
1042 
1043 /*
1044  * Expand the memory region of a sub-page (size < PAGE_SIZE) MMIO BAR to
1045  * page size if the BAR occupies an exclusive page on the host, so that
1046  * we can map this BAR into the guest.  This sub-page BAR may not occupy
1047  * an exclusive page in the guest, however, so we set the priority of the
1048  * expanded memory region to zero in case it overlaps BARs that share the
1049  * same page with the sub-page BAR in the guest.  We must also restore the
1050  * size of this sub-page BAR when its base address is changed in the guest
1051  * and is no longer page aligned.
1052  */
1053 static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
1054 {
1055     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
1056     VFIORegion *region = &vdev->bars[bar].region;
1057     MemoryRegion *mmap_mr, *region_mr, *base_mr;
1058     PCIIORegion *r;
1059     pcibus_t bar_addr;
1060     uint64_t size = region->size;
1061 
1062     /* Make sure that the whole region is allowed to be mmapped */
1063     if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
1064         region->mmaps[0].size != region->size) {
1065         return;
1066     }
1067 
1068     r = &pdev->io_regions[bar];
1069     bar_addr = r->addr;
1070     base_mr = vdev->bars[bar].mr;
1071     region_mr = region->mem;
1072     mmap_mr = &region->mmaps[0].mem;
1073 
1074     /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
1075     if (bar_addr != PCI_BAR_UNMAPPED &&
1076         !(bar_addr & ~qemu_real_host_page_mask)) {
1077         size = qemu_real_host_page_size;
1078     }
1079 
1080     memory_region_transaction_begin();
1081 
1082     if (vdev->bars[bar].size < size) {
1083         memory_region_set_size(base_mr, size);
1084     }
1085     memory_region_set_size(region_mr, size);
1086     memory_region_set_size(mmap_mr, size);
1087     if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
1088         memory_region_del_subregion(r->address_space, base_mr);
1089         memory_region_add_subregion_overlap(r->address_space,
1090                                             bar_addr, base_mr, 0);
1091     }
1092 
1093     memory_region_transaction_commit();
1094 }
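
/*
 * Alignment check example for the code above (assuming 4 KiB host pages,
 * so qemu_real_host_page_mask == ~0xfffULL; addresses hypothetical):
 *
 *   bar_addr 0xfe001000: (bar_addr & ~page_mask) == 0, page aligned,
 *                        so the region is expanded to page size.
 *   bar_addr 0xfe000800: low bits set, not page aligned, so the region
 *                        is restored to the original sub-page BAR size.
 */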
1095 
1096 /*
1097  * PCI config space
1098  */
1099 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1100 {
1101     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
1102     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1103 
1104     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1105     emu_bits = le32_to_cpu(emu_bits);
1106 
1107     if (emu_bits) {
1108         emu_val = pci_default_read_config(pdev, addr, len);
1109     }
1110 
1111     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1112         ssize_t ret;
1113 
1114         ret = pread(vdev->vbasedev.fd, &phys_val, len,
1115                     vdev->config_offset + addr);
1116         if (ret != len) {
1117             error_report("%s(%s, 0x%x, 0x%x) failed: %m",
1118                          __func__, vdev->vbasedev.name, addr, len);
1119             return -errno;
1120         }
1121         phys_val = le32_to_cpu(phys_val);
1122     }
1123 
1124     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1125 
1126     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1127 
1128     return val;
1129 }
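
/*
 * Merge example for the read path above (values hypothetical): for a
 * 4-byte read where emu_bits == 0x0000ffff, the low 16 bits come from
 * QEMU's emulated config space and the high 16 bits from the device:
 *
 *   val = (emu_val & 0x0000ffff) | (phys_val & 0xffff0000);
 */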
1130 
1131 void vfio_pci_write_config(PCIDevice *pdev,
1132                            uint32_t addr, uint32_t val, int len)
1133 {
1134     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
1135     uint32_t val_le = cpu_to_le32(val);
1136 
1137     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1138 
1139     /* Write everything to VFIO, let it filter out what we can't write */
1140     if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
1141                 != len) {
1142         error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
1143                      __func__, vdev->vbasedev.name, addr, val, len);
1144     }
1145 
1146     /* MSI/MSI-X Enabling/Disabling */
1147     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1148         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1149         int is_enabled, was_enabled = msi_enabled(pdev);
1150 
1151         pci_default_write_config(pdev, addr, val, len);
1152 
1153         is_enabled = msi_enabled(pdev);
1154 
1155         if (!was_enabled) {
1156             if (is_enabled) {
1157                 vfio_msi_enable(vdev);
1158             }
1159         } else {
1160             if (!is_enabled) {
1161                 vfio_msi_disable(vdev);
1162             } else {
1163                 vfio_update_msi(vdev);
1164             }
1165         }
1166     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1167         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1168         int is_enabled, was_enabled = msix_enabled(pdev);
1169 
1170         pci_default_write_config(pdev, addr, val, len);
1171 
1172         is_enabled = msix_enabled(pdev);
1173 
1174         if (!was_enabled && is_enabled) {
1175             vfio_msix_enable(vdev);
1176         } else if (was_enabled && !is_enabled) {
1177             vfio_msix_disable(vdev);
1178         }
1179     } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
1180         range_covers_byte(addr, len, PCI_COMMAND)) {
1181         pcibus_t old_addr[PCI_NUM_REGIONS - 1];
1182         int bar;
1183 
1184         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1185             old_addr[bar] = pdev->io_regions[bar].addr;
1186         }
1187 
1188         pci_default_write_config(pdev, addr, val, len);
1189 
1190         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1191             if (old_addr[bar] != pdev->io_regions[bar].addr &&
1192                 vdev->bars[bar].region.size > 0 &&
1193                 vdev->bars[bar].region.size < qemu_real_host_page_size) {
1194                 vfio_sub_page_bar_update_mapping(pdev, bar);
1195             }
1196         }
1197     } else {
1198         /* Write everything to QEMU to keep emulated bits correct */
1199         pci_default_write_config(pdev, addr, val, len);
1200     }
1201 }
1202 
1203 /*
1204  * Interrupt setup
1205  */
1206 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1207 {
1208     /*
1209      * More complicated than it looks.  Disabling MSI/X transitions the
1210      * device to INTx mode (if supported).  Therefore we need to first
1211      * disable MSI/X and then clean up by disabling INTx.
1212      */
1213     if (vdev->interrupt == VFIO_INT_MSIX) {
1214         vfio_msix_disable(vdev);
1215     } else if (vdev->interrupt == VFIO_INT_MSI) {
1216         vfio_msi_disable(vdev);
1217     }
1218 
1219     if (vdev->interrupt == VFIO_INT_INTx) {
1220         vfio_intx_disable(vdev);
1221     }
1222 }
1223 
1224 static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1225 {
1226     uint16_t ctrl;
1227     bool msi_64bit, msi_maskbit;
1228     int ret, entries;
1229     Error *err = NULL;
1230 
1231     if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
1232               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
1233         error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
1234         return -errno;
1235     }
1236     ctrl = le16_to_cpu(ctrl);
1237 
1238     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1239     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1240     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1241 
1242     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1243 
1244     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
1245     if (ret < 0) {
1246         if (ret == -ENOTSUP) {
1247             return 0;
1248         }
1249         error_propagate_prepend(errp, err, "msi_init failed: ");
1250         return ret;
1251     }
1252     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
1253 
1254     return 0;
1255 }
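
/*
 * Capability size examples for the msi_cap_size formula above (matching
 * the MSI capability layouts in the PCI spec):
 *
 *   32-bit, no masking:         0xa               = 10 bytes
 *   64-bit, no masking:         0xa + 0x4         = 14 bytes
 *   64-bit, per-vector masking: 0xa + 0xa + 0x4   = 24 bytes
 */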
1256 
1257 static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
1258 {
1259     off_t start, end;
1260     VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
1261 
1262     /*
1263      * If the host driver allows mapping of a MSIX data, we are going to
1264      * do map the entire BAR and emulate MSIX table on top of that.
1265      */
1266     if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
1267                             VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
1268         return;
1269     }
1270 
1271     /*
1272      * We expect to find a single mmap covering the whole BAR; anything else
1273      * means it's either unsupported or already set up.
1274      */
1275     if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
1276         region->size != region->mmaps[0].size) {
1277         return;
1278     }
1279 
1280     /* MSI-X table start and end aligned to host page size */
1281     start = vdev->msix->table_offset & qemu_real_host_page_mask;
1282     end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
1283                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
1284 
1285     /*
1286      * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
1287      * NB - Host page size is necessarily a power of two and so is the PCI
1288      * BAR (not counting EA yet), therefore if we have host page aligned
1289      * @start and @end, then any remainder of the BAR before or after those
1290      * must be at least host page sized and therefore mmap'able.
1291      */
1292     if (!start) {
1293         if (end >= region->size) {
1294             region->nr_mmaps = 0;
1295             g_free(region->mmaps);
1296             region->mmaps = NULL;
1297             trace_vfio_msix_fixup(vdev->vbasedev.name,
1298                                   vdev->msix->table_bar, 0, 0);
1299         } else {
1300             region->mmaps[0].offset = end;
1301             region->mmaps[0].size = region->size - end;
1302             trace_vfio_msix_fixup(vdev->vbasedev.name,
1303                               vdev->msix->table_bar, region->mmaps[0].offset,
1304                               region->mmaps[0].offset + region->mmaps[0].size);
1305         }
1306 
1307     /* Maybe it's aligned at the end of the BAR */
1308     } else if (end >= region->size) {
1309         region->mmaps[0].size = start;
1310         trace_vfio_msix_fixup(vdev->vbasedev.name,
1311                               vdev->msix->table_bar, region->mmaps[0].offset,
1312                               region->mmaps[0].offset + region->mmaps[0].size);
1313 
1314     /* Otherwise it must split the BAR */
1315     } else {
1316         region->nr_mmaps = 2;
1317         region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
1318 
1319         memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));
1320 
1321         region->mmaps[0].size = start;
1322         trace_vfio_msix_fixup(vdev->vbasedev.name,
1323                               vdev->msix->table_bar, region->mmaps[0].offset,
1324                               region->mmaps[0].offset + region->mmaps[0].size);
1325 
1326         region->mmaps[1].offset = end;
1327         region->mmaps[1].size = region->size - end;
1328         trace_vfio_msix_fixup(vdev->vbasedev.name,
1329                               vdev->msix->table_bar, region->mmaps[1].offset,
1330                               region->mmaps[1].offset + region->mmaps[1].size);
1331     }
1332 }
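
/*
 * Resulting mmap layout sketch for the split case above (start and end
 * are host-page aligned):
 *
 *   0          start             end              region->size
 *   | mmaps[0] | trapped MSI-X   |    mmaps[1]    |
 *
 * Only the page(s) containing the MSI-X table stay trapped; the rest of
 * the BAR remains directly mapped.
 */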
1333 
1334 static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
1335 {
1336     int target_bar = -1;
1337     size_t msix_sz;
1338 
1339     if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
1340         return;
1341     }
1342 
1343     /* The actual minimum size of MSI-X structures */
1344     msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
1345               (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
1346     /* Round up to host pages, we don't want to share a page */
1347     msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
1348     /* PCI BARs must be a power of 2 */
1349     msix_sz = pow2ceil(msix_sz);
1350 
1351     if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
1352         /*
1353          * TODO: Lookup table for known devices.
1354          *
1355          * Logically we might use an algorithm here to select the BAR adding
1356          * the least additional MMIO space, but we cannot programmatically
1357          * predict the driver dependency on BAR ordering or sizing, therefore
1358          * 'auto' becomes a lookup for combinations reported to work.
1359          */
1360         if (target_bar < 0) {
1361             error_setg(errp, "No automatic MSI-X relocation available for "
1362                        "device %04x:%04x", vdev->vendor_id, vdev->device_id);
1363             return;
1364         }
1365     } else {
1366         target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
1367     }
1368 
1369     /* I/O port BARs cannot host MSI-X structures */
1370     if (vdev->bars[target_bar].ioport) {
1371         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1372                    "I/O port BAR", target_bar);
1373         return;
1374     }
1375 
1376     /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
1377     if (!vdev->bars[target_bar].size &&
1378          target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
1379         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1380                    "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
1381         return;
1382     }
1383 
1384     /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
1385     if (vdev->bars[target_bar].size > 1 * GiB &&
1386         !vdev->bars[target_bar].mem64) {
1387         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1388                    "no space to extend 32-bit BAR", target_bar);
1389         return;
1390     }
1391 
1392     /*
1393      * If adding a new BAR, test if we can make it 64bit.  We make it
1394      * prefetchable since QEMU MSI-X emulation has no read side effects
1395      * and doing so makes mapping more flexible.
1396      */
1397     if (!vdev->bars[target_bar].size) {
1398         if (target_bar < (PCI_ROM_SLOT - 1) &&
1399             !vdev->bars[target_bar + 1].size) {
1400             vdev->bars[target_bar].mem64 = true;
1401             vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
1402         }
1403         vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
1404         vdev->bars[target_bar].size = msix_sz;
1405         vdev->msix->table_offset = 0;
1406     } else {
1407         vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
1408                                           msix_sz * 2);
1409         /*
1410          * Due to the size calculation above, MSI-X always starts halfway into the BAR,
1411          * which will always be a separate host page.
1412          */
1413         vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
1414     }
1415 
1416     vdev->msix->table_bar = target_bar;
1417     vdev->msix->pba_bar = target_bar;
1418     /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
1419     vdev->msix->pba_offset = vdev->msix->table_offset +
1420                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
1421 
1422     trace_vfio_msix_relo(vdev->vbasedev.name,
1423                          vdev->msix->table_bar, vdev->msix->table_offset);
1424 }
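
/*
 * Worked sizing example for the relocation above (assuming 4 KiB host
 * pages and a hypothetical device with 16 MSI-X entries):
 *
 *   table: 16 * PCI_MSIX_ENTRY_SIZE (16)  = 256 bytes
 *   PBA:   QEMU_ALIGN_UP(16, 64) / 8      =   8 bytes
 *   264 bytes -> page align -> 4096 -> pow2ceil -> msix_sz = 4 KiB
 *
 * An existing BAR would then grow to MAX(size * 2, 8 KiB), with the
 * MSI-X structures placed in its upper half.
 */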
1425 
1426 /*
1427  * We don't have any control over how pci_add_capability() inserts
1428  * capabilities into the chain.  In order to set up MSI-X we need a
1429  * MemoryRegion for the BAR.  In order to set up the BAR and not
1430  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1431  * need to first look for where the MSI-X table lives.  So we
1432  * unfortunately split MSI-X setup across two functions.
1433  */
1434 static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
1435 {
1436     uint8_t pos;
1437     uint16_t ctrl;
1438     uint32_t table, pba;
1439     int fd = vdev->vbasedev.fd;
1440     VFIOMSIXInfo *msix;
1441 
1442     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
1443     if (!pos) {
1444         return;
1445     }
1446 
1447     if (pread(fd, &ctrl, sizeof(ctrl),
1448               vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
1449         error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
1450         return;
1451     }
1452 
1453     if (pread(fd, &table, sizeof(table),
1454               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
1455         error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
1456         return;
1457     }
1458 
1459     if (pread(fd, &pba, sizeof(pba),
1460               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
1461         error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
1462         return;
1463     }
1464 
1465     ctrl = le16_to_cpu(ctrl);
1466     table = le32_to_cpu(table);
1467     pba = le32_to_cpu(pba);
1468 
1469     msix = g_malloc0(sizeof(*msix));
1470     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
1471     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
1472     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
1473     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
1474     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
1475 
1476     /*
1477      * Check whether pba_offset extends outside of the specified BAR.  If it
1478      * does, we need to apply a hardware-specific quirk if the device is
1479      * known; otherwise we have a broken configuration.
1480      */
1481     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
1482         /*
1483          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
1484          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
1485          * the VF PBA offset while the BAR itself is only 8k. The correct value
1486          * is 0x1000, so we hard code that here.
1487          */
1488         if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
1489             (vdev->device_id & 0xff00) == 0x5800) {
1490             msix->pba_offset = 0x1000;
1491         } else if (vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
1492             error_setg(errp, "hardware reports invalid configuration, "
1493                        "MSIX PBA outside of specified BAR");
1494             g_free(msix);
1495             return;
1496         }
1497     }
1498 
1499     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
1500                                 msix->table_offset, msix->entries);
1501     vdev->msix = msix;
1502 
1503     vfio_pci_fixup_msix_region(vdev);
1504 
1505     vfio_pci_relocate_msix(vdev, errp);
1506 }
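
/*
 * Register decode example for the reads above (value hypothetical): a
 * table register of 0x00002004 splits as
 *
 *   BIR    = 0x2004 & PCI_MSIX_FLAGS_BIRMASK (0x7) = 4  -> BAR 4
 *   offset = 0x2004 & ~0x7                         = 0x2000
 *
 * and the PBA register decodes the same way.
 */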
1507 
1508 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1509 {
1510     int ret;
1511     Error *err = NULL;
1512 
1513     vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
1514                                     sizeof(unsigned long));
1515     ret = msix_init(&vdev->pdev, vdev->msix->entries,
1516                     vdev->bars[vdev->msix->table_bar].mr,
1517                     vdev->msix->table_bar, vdev->msix->table_offset,
1518                     vdev->bars[vdev->msix->pba_bar].mr,
1519                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
1520                     &err);
1521     if (ret < 0) {
1522         if (ret == -ENOTSUP) {
1523             warn_report_err(err);
1524             return 0;
1525         }
1526 
1527         error_propagate(errp, err);
1528         return ret;
1529     }
1530 
1531     /*
1532      * The PCI spec suggests that devices provide additional alignment for
1533      * MSI-X structures and avoid overlapping non-MSI-X related registers.
1534      * For an assigned device, this hopefully means that emulation of MSI-X
1535      * structures does not affect the performance of the device.  If devices
1536      * fail to provide that alignment, a significant performance penalty may
1537      * result, for instance Mellanox MT27500 VFs:
1538      * http://www.spinics.net/lists/kvm/msg125881.html
1539      *
1540      * The PBA is simply not important enough to justify such a serious
1541      * regression, and most drivers do not appear to look at it.  The fix is
1542      * to disable the PBA MemoryRegion unless it's being used.  We disable it
1543      * here and only enable it if a masked vector fires through QEMU.  When
1544      * the vector-use notifier is called, which occurs on unmask, we test
1545      * whether PBA emulation is still needed and disable it again if not.
1546      */
1547     memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
1548 
1549     /*
1550      * The emulated machine may provide a paravirt interface for MSIX setup,
1551      * so it is not strictly necessary to emulate MSIX here.  This becomes
1552      * helpful when frequently accessed MMIO registers are located in
1553      * subpages adjacent to the MSIX table, but the page containing the MSIX
1554      * data cannot be mapped because the host page size is bigger than the
1555      * MSIX table alignment.
1556      */
1557     if (object_property_get_bool(OBJECT(qdev_get_machine()),
1558                                  "vfio-no-msix-emulation", NULL)) {
1559         memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
1560     }
1561 
1562     return 0;
1563 }
1564 
1565 static void vfio_teardown_msi(VFIOPCIDevice *vdev)
1566 {
1567     msi_uninit(&vdev->pdev);
1568 
1569     if (vdev->msix) {
1570         msix_uninit(&vdev->pdev,
1571                     vdev->bars[vdev->msix->table_bar].mr,
1572                     vdev->bars[vdev->msix->pba_bar].mr);
1573         g_free(vdev->msix->pending);
1574     }
1575 }
1576 
1577 /*
1578  * Resource setup
1579  */
1580 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
1581 {
1582     int i;
1583 
1584     for (i = 0; i < PCI_ROM_SLOT; i++) {
1585         vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
1586     }
1587 }
1588 
1589 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
1590 {
1591     VFIOBAR *bar = &vdev->bars[nr];
1592     uint32_t pci_bar;
1594     int ret;
1595 
1596     /* Skip both unimplemented BARs and the upper half of 64-bit BARs. */
1597     if (!bar->region.size) {
1598         return;
1599     }
1600 
1601     /* Determine what type of BAR this is for registration */
1602     ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
1603                 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
1604     if (ret != sizeof(pci_bar)) {
1605         error_report("vfio: Failed to read BAR %d (%m)", nr);
1606         return;
1607     }
1608 
1609     pci_bar = le32_to_cpu(pci_bar);
1610     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
1611     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
1612     bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
1613                                          ~PCI_BASE_ADDRESS_MEM_MASK);
1614     bar->size = bar->region.size;
1615 }
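
/*
 * Illustrative decode of the BAR low bits read above, for an assumed
 * example value of pci_bar = 0x0000000c (a prefetchable 64-bit memory BAR):
 *
 *   ioport = 0xc & PCI_BASE_ADDRESS_SPACE_IO    -> 0   (memory space)
 *   mem64  = 0xc & PCI_BASE_ADDRESS_MEM_TYPE_64 -> 0x4 (64-bit)
 *   type   = 0xc & ~PCI_BASE_ADDRESS_MEM_MASK   -> 0xc
 *
 * bar->type is later handed unmodified to pci_register_bar() in
 * vfio_bar_register().
 */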
1616 
1617 static void vfio_bars_prepare(VFIOPCIDevice *vdev)
1618 {
1619     int i;
1620 
1621     for (i = 0; i < PCI_ROM_SLOT; i++) {
1622         vfio_bar_prepare(vdev, i);
1623     }
1624 }
1625 
1626 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
1627 {
1628     VFIOBAR *bar = &vdev->bars[nr];
1629     char *name;
1630 
1631     if (!bar->size) {
1632         return;
1633     }
1634 
1635     bar->mr = g_new0(MemoryRegion, 1);
1636     name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
1637     memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
1638     g_free(name);
1639 
1640     if (bar->region.size) {
1641         memory_region_add_subregion(bar->mr, 0, bar->region.mem);
1642 
1643         if (vfio_region_mmap(&bar->region)) {
1644             error_report("Failed to mmap %s BAR %d. Performance may be slow",
1645                          vdev->vbasedev.name, nr);
1646         }
1647     }
1648 
1649     pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
1650 }
1651 
1652 static void vfio_bars_register(VFIOPCIDevice *vdev)
1653 {
1654     int i;
1655 
1656     for (i = 0; i < PCI_ROM_SLOT; i++) {
1657         vfio_bar_register(vdev, i);
1658     }
1659 }
1660 
1661 static void vfio_bars_exit(VFIOPCIDevice *vdev)
1662 {
1663     int i;
1664 
1665     for (i = 0; i < PCI_ROM_SLOT; i++) {
1666         VFIOBAR *bar = &vdev->bars[i];
1667 
1668         vfio_bar_quirk_exit(vdev, i);
1669         vfio_region_exit(&bar->region);
1670         if (bar->region.size) {
1671             memory_region_del_subregion(bar->mr, bar->region.mem);
1672         }
1673     }
1674 
1675     if (vdev->vga) {
1676         pci_unregister_vga(&vdev->pdev);
1677         vfio_vga_quirk_exit(vdev);
1678     }
1679 }
1680 
1681 static void vfio_bars_finalize(VFIOPCIDevice *vdev)
1682 {
1683     int i;
1684 
1685     for (i = 0; i < PCI_ROM_SLOT; i++) {
1686         VFIOBAR *bar = &vdev->bars[i];
1687 
1688         vfio_bar_quirk_finalize(vdev, i);
1689         vfio_region_finalize(&bar->region);
1690         if (bar->size) {
1691             object_unparent(OBJECT(bar->mr));
1692             g_free(bar->mr);
1693         }
1694     }
1695 
1696     if (vdev->vga) {
1697         vfio_vga_quirk_finalize(vdev);
1698         for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1699             object_unparent(OBJECT(&vdev->vga->region[i].mem));
1700         }
1701         g_free(vdev->vga);
1702     }
1703 }
1704 
1705 /*
1706  * General setup
1707  */
1708 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
1709 {
1710     uint8_t tmp;
1711     uint16_t next = PCI_CONFIG_SPACE_SIZE;
1712 
1713     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
1714          tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
1715         if (tmp > pos && tmp < next) {
1716             next = tmp;
1717         }
1718     }
1719 
1720     return next - pos;
1721 }
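
/*
 * For example, given a (hypothetical) capability list linked as
 * 0x40 -> 0x60 -> 0x50, vfio_std_cap_max_size(pdev, 0x40) walks the whole
 * list, finds 0x50 as the nearest capability above 0x40, and returns
 * 0x50 - 0x40 = 0x10.  For the topmost capability (0x60 here) nothing
 * follows, so its size runs to the end of standard config space:
 * 0x100 - 0x60 = 0xa0.
 */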
1722 
1724 static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
1725 {
1726     uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
1727 
1728     for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
1729         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
1730         if (tmp > pos && tmp < next) {
1731             next = tmp;
1732         }
1733     }
1734 
1735     return next - pos;
1736 }
1737 
1738 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
1739 {
1740     pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
1741 }
1742 
1743 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
1744                                    uint16_t val, uint16_t mask)
1745 {
1746     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
1747     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
1748     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
1749 }
1750 
1751 static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
1752 {
1753     pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
1754 }
1755 
1756 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
1757                                    uint32_t val, uint32_t mask)
1758 {
1759     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
1760     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
1761     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
1762 }
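
/*
 * Sketch of how the helpers above combine, modeled on the vendor ID
 * override performed later in vfio_realize() (0x1234 is an arbitrary
 * example value):
 *
 *   vfio_add_emulated_word(vdev, PCI_VENDOR_ID, 0x1234, ~0);
 *
 * With mask == ~0 this stores 0x1234 in the config copy the guest reads,
 * clears the wmask bits so the guest cannot write the field, and sets
 * emulated_config_bits so reads are serviced from QEMU's copy rather than
 * passed through to the physical device.
 */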
1763 
1764 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
1765                                Error **errp)
1766 {
1767     uint16_t flags;
1768     uint8_t type;
1769 
1770     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
1771     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
1772 
1773     if (type != PCI_EXP_TYPE_ENDPOINT &&
1774         type != PCI_EXP_TYPE_LEG_END &&
1775         type != PCI_EXP_TYPE_RC_END) {
1776 
1777         error_setg(errp, "assignment of PCIe type 0x%x "
1778                    "devices is not currently supported", type);
1779         return -EINVAL;
1780     }
1781 
1782     if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
1783         PCIBus *bus = pci_get_bus(&vdev->pdev);
1784         PCIDevice *bridge;
1785 
1786         /*
1787          * Traditionally PCI device assignment exposes the PCIe capability
1788          * as-is on non-express buses.  The reason being that some drivers
1789          * simply assume that it's there, for example tg3.  However when
1790          * we're running on a native PCIe machine type, like Q35, we need
1791          * to hide the PCIe capability.  The reason for this is twofold;
1792          * first Windows guests get a Code 10 error when the PCIe capability
1793          * is exposed in this configuration.  Therefore express devices won't
1794          * work at all unless they're attached to express buses in the VM.
1795          * Second, a native PCIe machine introduces the possibility of fine
1796          * granularity IOMMUs supporting both translation and isolation.
1797          * Guest code to discover the IOMMU visibility of a device, such as
1798          * IOMMU grouping code on Linux, is very aware of device types and
1799          * valid transitions between bus types.  An express device on a non-
1800          * express bus is not a valid combination on bare metal systems.
1801          *
1802          * Drivers that require a PCIe capability to make the device
1803          * functional are simply going to need to have their devices placed
1804          * on a PCIe bus in the VM.
1805          */
1806         while (!pci_bus_is_root(bus)) {
1807             bridge = pci_bridge_get_device(bus);
1808             bus = pci_get_bus(bridge);
1809         }
1810 
1811         if (pci_bus_is_express(bus)) {
1812             return 0;
1813         }
1814 
1815     } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
1816         /*
1817          * On a Root Complex bus Endpoints become Root Complex Integrated
1818          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
1819          */
1820         if (type == PCI_EXP_TYPE_ENDPOINT) {
1821             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
1822                                    PCI_EXP_TYPE_RC_END << 4,
1823                                    PCI_EXP_FLAGS_TYPE);
1824 
1825             /* Link Capabilities, Status, and Control go away */
1826             if (size > PCI_EXP_LNKCTL) {
1827                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
1828                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
1829                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
1830 
1831 #ifndef PCI_EXP_LNKCAP2
1832 #define PCI_EXP_LNKCAP2 44
1833 #endif
1834 #ifndef PCI_EXP_LNKSTA2
1835 #define PCI_EXP_LNKSTA2 50
1836 #endif
1837                 /* Link 2 Capabilities, Status, and Control go away */
1838                 if (size > PCI_EXP_LNKCAP2) {
1839                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
1840                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
1841                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
1842                 }
1843             }
1844 
1845         } else if (type == PCI_EXP_TYPE_LEG_END) {
1846             /*
1847              * Legacy endpoints don't belong on the root complex.  Windows
1848              * seems to be happier with devices if we skip the capability.
1849              */
1850             return 0;
1851         }
1852 
1853     } else {
1854         /*
1855          * Convert Root Complex Integrated Endpoints to regular endpoints.
1856          * These devices don't support LNK/LNK2 capabilities, so make them up.
1857          */
1858         if (type == PCI_EXP_TYPE_RC_END) {
1859             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
1860                                    PCI_EXP_TYPE_ENDPOINT << 4,
1861                                    PCI_EXP_FLAGS_TYPE);
1862             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
1863                            QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
1864                            QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
1865             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
1866         }
1867     }
1868 
1869     /*
1870      * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
1871      * (Niantic errata #35), causing Windows to error with a Code 10 for the
1872      * device on Q35.  Fix up any such devices to report version 1.  If we
1873      * were to remove the capability entirely the guest would lose extended
1874      * config space.
1875      */
1876     if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
1877         vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
1878                                1, PCI_EXP_FLAGS_VERS);
1879     }
1880 
1881     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
1882                              errp);
1883     if (pos < 0) {
1884         return pos;
1885     }
1886 
1887     vdev->pdev.exp.exp_cap = pos;
1888 
1889     return pos;
1890 }
1891 
1892 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
1893 {
1894     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
1895 
1896     if (cap & PCI_EXP_DEVCAP_FLR) {
1897         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
1898         vdev->has_flr = true;
1899     }
1900 }
1901 
1902 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
1903 {
1904     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
1905 
1906     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
1907         trace_vfio_check_pm_reset(vdev->vbasedev.name);
1908         vdev->has_pm_reset = true;
1909     }
1910 }
1911 
1912 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
1913 {
1914     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
1915 
1916     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
1917         trace_vfio_check_af_flr(vdev->vbasedev.name);
1918         vdev->has_flr = true;
1919     }
1920 }
1921 
1922 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
1923 {
1924     PCIDevice *pdev = &vdev->pdev;
1925     uint8_t cap_id, next, size;
1926     int ret;
1927 
1928     cap_id = pdev->config[pos];
1929     next = pdev->config[pos + PCI_CAP_LIST_NEXT];
1930 
1931     /*
1932      * If it becomes important to configure capabilities to their actual
1933      * size, use this as the default when it's something we don't recognize.
1934      * Since QEMU doesn't actually handle many of the config accesses,
1935      * exact size doesn't seem worthwhile.
1936      */
1937     size = vfio_std_cap_max_size(pdev, pos);
1938 
1939     /*
1940      * pci_add_capability always inserts the new capability at the head
1941      * of the chain.  Therefore to end up with a chain that matches the
1942      * physical device, we insert from the end by making this recursive.
1943      * This is also why we pre-calculate size above as cached config space
1944      * will be changed as we unwind the stack.
1945      */
1946     if (next) {
1947         ret = vfio_add_std_cap(vdev, next, errp);
1948         if (ret) {
1949             return ret;
1950         }
1951     } else {
1952         /* Begin the rebuild, use QEMU emulated list bits */
1953         pdev->config[PCI_CAPABILITY_LIST] = 0;
1954         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
1955         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
1956 
1957         ret = vfio_add_virt_caps(vdev, errp);
1958         if (ret) {
1959             return ret;
1960         }
1961     }
1962 
1963     /* Scale down size, especially in case virt caps were added above */
1964     size = MIN(size, vfio_std_cap_max_size(pdev, pos));
1965 
1966     /* Use emulated next pointer to allow dropping caps */
1967     pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);
1968 
1969     switch (cap_id) {
1970     case PCI_CAP_ID_MSI:
1971         ret = vfio_msi_setup(vdev, pos, errp);
1972         break;
1973     case PCI_CAP_ID_EXP:
1974         vfio_check_pcie_flr(vdev, pos);
1975         ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
1976         break;
1977     case PCI_CAP_ID_MSIX:
1978         ret = vfio_msix_setup(vdev, pos, errp);
1979         break;
1980     case PCI_CAP_ID_PM:
1981         vfio_check_pm_reset(vdev, pos);
1982         vdev->pm_cap = pos;
1983         ret = pci_add_capability(pdev, cap_id, pos, size, errp);
1984         break;
1985     case PCI_CAP_ID_AF:
1986         vfio_check_af_flr(vdev, pos);
1987         ret = pci_add_capability(pdev, cap_id, pos, size, errp);
1988         break;
1989     default:
1990         ret = pci_add_capability(pdev, cap_id, pos, size, errp);
1991         break;
1992     }
1993 
1994     if (ret < 0) {
1995         error_prepend(errp,
1996                       "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
1997                       cap_id, size, pos);
1998         return ret;
1999     }
2000 
2001     return 0;
2002 }
2003 
2004 static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
2005 {
2006     PCIDevice *pdev = &vdev->pdev;
2007     uint32_t header;
2008     uint16_t cap_id, next, size;
2009     uint8_t cap_ver;
2010     uint8_t *config;
2011 
2012     /* Only add extended caps if we have them and the guest can see them */
2013     if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
2014         !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
2015         return;
2016     }
2017 
2018     /*
2019      * pcie_add_capability always inserts the new capability at the tail
2020      * of the chain.  Therefore to end up with a chain that matches the
2021      * physical device, we cache the config space to avoid overwriting
2022      * the original config space when we parse the extended capabilities.
2023      */
2024     config = g_memdup(pdev->config, vdev->config_size);
2025 
2026     /*
2027      * Extended capabilities are chained with each pointing to the next, so we
2028      * can drop anything other than the head of the chain simply by modifying
2029      * the previous next pointer.  Seed the head of the chain here such that
2030      * we can simply skip any capabilities we want to drop below, regardless
2031      * of their position in the chain.  If this stub capability still exists
2032      * after we add the capabilities we want to expose, update the capability
2033      * ID to zero.  Note that we cannot seed with the capability header being
2034      * zero as this conflicts with the definition of an absent capability chain
2035      * and prevents capabilities beyond the head of the list from being added.
2036      * By replacing the dummy capability ID with zero after walking the device
2037      * chain, we also transparently mark extended capabilities as absent if
2038      * no capabilities were added.  Note that the PCIe spec defines an absence
2039      * of extended capabilities to be determined by a value of zero for the
2040      * capability ID, version, AND next pointer.  A non-zero next pointer
2041      * should be sufficient to indicate additional capabilities are present,
2042      * which will occur if we call pcie_add_capability() below.  The entire
2043      * first dword is emulated to support this.
2044      *
2045      * NB. The kernel side does similar masking, so be prepared that our
2046      * view of the device may also contain a capability ID zero in the head
2047      * of the chain.  Skip it for the same reason that we cannot seed the
2048      * chain with a zero capability.
2049      */
2050     pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
2051                  PCI_EXT_CAP(0xFFFF, 0, 0));
2052     pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
2053     pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
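
    /*
     * For reference, PCI_EXT_CAP(0xFFFF, 0, 0) encodes the dword
     * 0x0000ffff: capability ID 0xFFFF in bits [15:0], version 0 in bits
     * [19:16] and next pointer 0 in bits [31:20].  Capabilities added below
     * chain in through this header, and the cleanup at the end of this
     * function zeroes the 0xFFFF stub ID if it is still present.
     */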
2054 
2055     for (next = PCI_CONFIG_SPACE_SIZE; next;
2056          next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
2057         header = pci_get_long(config + next);
2058         cap_id = PCI_EXT_CAP_ID(header);
2059         cap_ver = PCI_EXT_CAP_VER(header);
2060 
2061         /*
2062          * If it becomes important to configure extended capabilities to their
2063          * actual size, use this as the default when it's something we don't
2064          * recognize. Since QEMU doesn't actually handle many of the config
2065          * accesses, exact size doesn't seem worthwhile.
2066          */
2067         size = vfio_ext_cap_max_size(config, next);
2068 
2069         /* Use emulated next pointer to allow dropping extended caps */
2070         pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
2071                                    PCI_EXT_CAP_NEXT_MASK);
2072 
2073         switch (cap_id) {
2074         case 0: /* kernel masked capability */
2075         case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
2076         case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
2077         case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */
2078             trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
2079             break;
2080         default:
2081             pcie_add_capability(pdev, cap_id, cap_ver, next, size);
2082         }
2084     }
2085 
2086     /* Cleanup chain head ID if necessary */
2087     if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
2088         pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
2089     }
2090 
2091     g_free(config);
2093 }
2094 
2095 static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
2096 {
2097     PCIDevice *pdev = &vdev->pdev;
2098     int ret;
2099 
2100     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2101         !pdev->config[PCI_CAPABILITY_LIST]) {
2102         return 0; /* Nothing to add */
2103     }
2104 
2105     ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp);
2106     if (ret) {
2107         return ret;
2108     }
2109 
2110     vfio_add_ext_cap(vdev);
2111     return 0;
2112 }
2113 
2114 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2115 {
2116     PCIDevice *pdev = &vdev->pdev;
2117     uint16_t cmd;
2118 
2119     vfio_disable_interrupts(vdev);
2120 
2121     /* Make sure the device is in D0 */
2122     if (vdev->pm_cap) {
2123         uint16_t pmcsr;
2124         uint8_t state;
2125 
2126         pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2127         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2128         if (state) {
2129             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2130             vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2131             /* vfio handles the necessary delay here */
2132             pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2133             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2134             if (state) {
2135                 error_report("vfio: Unable to power on device, stuck in D%d",
2136                              state);
2137             }
2138         }
2139     }
2140 
2141     /*
2142      * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2143      * Also put INTx Disable in a known state.
2144      */
2145     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2146     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2147              PCI_COMMAND_INTX_DISABLE);
2148     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2149 }
2150 
2151 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2152 {
2153     Error *err = NULL;
2154     int nr;
2155 
2156     vfio_intx_enable(vdev, &err);
2157     if (err) {
2158         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2159     }
2160 
2161     for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
2162         off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
2163         uint32_t val = 0;
2164         uint32_t len = sizeof(val);
2165 
2166         if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
2167             error_report("%s(%s) reset bar %d failed: %m", __func__,
2168                          vdev->vbasedev.name, nr);
2169         }
2170     }
2171 
2172     vfio_quirk_reset(vdev);
2173 }
2174 
2175 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
2176 {
2177     char tmp[13];
2178 
2179     sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
2180             addr->bus, addr->slot, addr->function);
2181 
2182     return (strcmp(tmp, name) == 0);
2183 }
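
/*
 * For example, a device at host address 0000:01:00.0 formats to the string
 * "0000:01:00.0", which is exactly the sysfs basename that vfio_realize()
 * stores in vbasedev.name, so a plain strcmp() suffices.
 */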
2184 
2185 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2186 {
2187     VFIOGroup *group;
2188     struct vfio_pci_hot_reset_info *info;
2189     struct vfio_pci_dependent_device *devices;
2190     struct vfio_pci_hot_reset *reset;
2191     int32_t *fds;
2192     int ret, i, count;
2193     bool multi = false;
2194 
2195     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
2196 
2197     if (!single) {
2198         vfio_pci_pre_reset(vdev);
2199     }
2200     vdev->vbasedev.needs_reset = false;
2201 
2202     info = g_malloc0(sizeof(*info));
2203     info->argsz = sizeof(*info);
2204 
2205     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2206     if (ret && errno != ENOSPC) {
2207         ret = -errno;
2208         if (!vdev->has_pm_reset) {
2209             error_report("vfio: Cannot reset device %s, "
2210                          "no available reset mechanism.", vdev->vbasedev.name);
2211         }
2212         goto out_single;
2213     }
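
    /*
     * The two ioctl calls implement the usual vfio argsz negotiation: the
     * probe above passes a struct with no room for the device array, the
     * kernel fills in info->count (hence ENOSPC is tolerated), and the
     * buffer is resized below before repeating the call to fetch the full
     * list of dependent devices.
     */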
2214 
2215     count = info->count;
2216     info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
2217     info->argsz = sizeof(*info) + (count * sizeof(*devices));
2218     devices = &info->devices[0];
2219 
2220     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2221     if (ret) {
2222         ret = -errno;
2223         error_report("vfio: hot reset info failed: %m");
2224         goto out_single;
2225     }
2226 
2227     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
2228 
2229     /* Verify that we have all the groups required */
2230     for (i = 0; i < info->count; i++) {
2231         PCIHostDeviceAddress host;
2232         VFIOPCIDevice *tmp;
2233         VFIODevice *vbasedev_iter;
2234 
2235         host.domain = devices[i].segment;
2236         host.bus = devices[i].bus;
2237         host.slot = PCI_SLOT(devices[i].devfn);
2238         host.function = PCI_FUNC(devices[i].devfn);
2239 
2240         trace_vfio_pci_hot_reset_dep_devices(host.domain,
2241                 host.bus, host.slot, host.function, devices[i].group_id);
2242 
2243         if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
2244             continue;
2245         }
2246 
2247         QLIST_FOREACH(group, &vfio_group_list, next) {
2248             if (group->groupid == devices[i].group_id) {
2249                 break;
2250             }
2251         }
2252 
2253         if (!group) {
2254             if (!vdev->has_pm_reset) {
2255                 error_report("vfio: Cannot reset device %s, "
2256                              "depends on group %d which is not owned.",
2257                              vdev->vbasedev.name, devices[i].group_id);
2258             }
2259             ret = -EPERM;
2260             goto out;
2261         }
2262 
2263         /* Prep dependent devices for reset and clear our marker. */
2264         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2265             if (!vbasedev_iter->dev->realized ||
2266                 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2267                 continue;
2268             }
2269             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2270             if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
2271                 if (single) {
2272                     ret = -EINVAL;
2273                     goto out_single;
2274                 }
2275                 vfio_pci_pre_reset(tmp);
2276                 tmp->vbasedev.needs_reset = false;
2277                 multi = true;
2278                 break;
2279             }
2280         }
2281     }
2282 
2283     if (!single && !multi) {
2284         ret = -EINVAL;
2285         goto out_single;
2286     }
2287 
2288     /* Determine how many group fds need to be passed */
2289     count = 0;
2290     QLIST_FOREACH(group, &vfio_group_list, next) {
2291         for (i = 0; i < info->count; i++) {
2292             if (group->groupid == devices[i].group_id) {
2293                 count++;
2294                 break;
2295             }
2296         }
2297     }
2298 
2299     reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
2300     reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
2301     fds = &reset->group_fds[0];
2302 
2303     /* Fill in group fds */
2304     QLIST_FOREACH(group, &vfio_group_list, next) {
2305         for (i = 0; i < info->count; i++) {
2306             if (group->groupid == devices[i].group_id) {
2307                 fds[reset->count++] = group->fd;
2308                 break;
2309             }
2310         }
2311     }
2312 
2313     /* Bus reset! */
2314     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
2315     g_free(reset);
2316 
2317     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
2318                                     ret ? "%m" : "Success");
2319 
2320 out:
2321     /* Re-enable INTx on affected devices */
2322     for (i = 0; i < info->count; i++) {
2323         PCIHostDeviceAddress host;
2324         VFIOPCIDevice *tmp;
2325         VFIODevice *vbasedev_iter;
2326 
2327         host.domain = devices[i].segment;
2328         host.bus = devices[i].bus;
2329         host.slot = PCI_SLOT(devices[i].devfn);
2330         host.function = PCI_FUNC(devices[i].devfn);
2331 
2332         if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
2333             continue;
2334         }
2335 
2336         QLIST_FOREACH(group, &vfio_group_list, next) {
2337             if (group->groupid == devices[i].group_id) {
2338                 break;
2339             }
2340         }
2341 
2342         if (!group) {
2343             break;
2344         }
2345 
2346         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2347             if (!vbasedev_iter->dev->realized ||
2348                 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2349                 continue;
2350             }
2351             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2352             if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
2353                 vfio_pci_post_reset(tmp);
2354                 break;
2355             }
2356         }
2357     }
2358 out_single:
2359     if (!single) {
2360         vfio_pci_post_reset(vdev);
2361     }
2362     g_free(info);
2363 
2364     return ret;
2365 }
2366 
2367 /*
2368  * We want to differentiate hot reset of multiple in-use devices vs hot reset
2369  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2370  * of doing hot resets when there is only a single device per bus.  The in-use
2371  * here refers to how many VFIODevices are affected.  A hot reset that affects
2372  * multiple devices, but only a single in-use device, means that we can call
2373  * it from our bus ->reset() callback since the extent is effectively a single
2374  * device.  This allows us to make use of it in the hotplug path.  When there
2375  * are multiple in-use devices, we can only trigger the hot reset during a
2376  * system reset and thus from our reset handler.  We separate _one vs _multi
2377  * here so that we don't overlap and do a double reset on the system reset
2378  * path where both our reset handler and ->reset() callback are used.  Calling
2379  * _one() will only do a hot reset for the one in-use devices case, calling
2380  * _one() will only do a hot reset for the one in-use device case, calling
2381  */
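/*
 * Hypothetical example: two ports of one assigned adapter share a bus that
 * can only be reset as a whole.  With both ports in use by this VM, a
 * _one() request finds a second affected VFIODevice and bails out with
 * -EINVAL, deferring the reset to the system reset path where _multi()
 * performs it.
 */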
2382 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2383 {
2384     return vfio_pci_hot_reset(vdev, true);
2385 }
2386 
2387 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2388 {
2389     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2390     return vfio_pci_hot_reset(vdev, false);
2391 }
2392 
2393 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2394 {
2395     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2396     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2397         vbasedev->needs_reset = true;
2398     }
2399 }
2400 
2401 static VFIODeviceOps vfio_pci_ops = {
2402     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
2403     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
2404     .vfio_eoi = vfio_intx_eoi,
2405 };
2406 
2407 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
2408 {
2409     VFIODevice *vbasedev = &vdev->vbasedev;
2410     struct vfio_region_info *reg_info;
2411     int ret;
2412 
2413     ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
2414     if (ret) {
2415         error_setg_errno(errp, -ret,
2416                          "failed getting region info for VGA region index %d",
2417                          VFIO_PCI_VGA_REGION_INDEX);
2418         return ret;
2419     }
2420 
2421     if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
2422         !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2423         reg_info->size < 0xbffff + 1) {
2424         error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
2425                    (unsigned long)reg_info->flags,
2426                    (unsigned long)reg_info->size);
2427         g_free(reg_info);
2428         return -EINVAL;
2429     }
2430 
2431     vdev->vga = g_new0(VFIOVGA, 1);
2432 
2433     vdev->vga->fd_offset = reg_info->offset;
2434     vdev->vga->fd = vdev->vbasedev.fd;
2435 
2436     g_free(reg_info);
2437 
2438     vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2439     vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2440     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
2441 
2442     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2443                           OBJECT(vdev), &vfio_vga_ops,
2444                           &vdev->vga->region[QEMU_PCI_VGA_MEM],
2445                           "vfio-vga-mmio@0xa0000",
2446                           QEMU_PCI_VGA_MEM_SIZE);
2447 
2448     vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2449     vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2450     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
2451 
2452     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2453                           OBJECT(vdev), &vfio_vga_ops,
2454                           &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
2455                           "vfio-vga-io@0x3b0",
2456                           QEMU_PCI_VGA_IO_LO_SIZE);
2457 
2458     vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2459     vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2460     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
2461 
2462     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
2463                           OBJECT(vdev), &vfio_vga_ops,
2464                           &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
2465                           "vfio-vga-io@0x3c0",
2466                           QEMU_PCI_VGA_IO_HI_SIZE);
2467 
2468     pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2469                      &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2470                      &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
2471 
2472     return 0;
2473 }
2474 
2475 static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
2476 {
2477     VFIODevice *vbasedev = &vdev->vbasedev;
2478     struct vfio_region_info *reg_info;
2479     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
2480     int i, ret = -1;
2481 
2482     /* Sanity check device */
2483     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2484         error_setg(errp, "this isn't a PCI device");
2485         return;
2486     }
2487 
2488     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2489         error_setg(errp, "unexpected number of io regions %u",
2490                    vbasedev->num_regions);
2491         return;
2492     }
2493 
2494     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2495         error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
2496         return;
2497     }
2498 
2499     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2500         char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
2501 
2502         ret = vfio_region_setup(OBJECT(vdev), vbasedev,
2503                                 &vdev->bars[i].region, i, name);
2504         g_free(name);
2505 
2506         if (ret) {
2507             error_setg_errno(errp, -ret, "failed to get region %d info", i);
2508             return;
2509         }
2510 
2511         QLIST_INIT(&vdev->bars[i].quirks);
2512     }
2513 
2514     ret = vfio_get_region_info(vbasedev,
2515                                VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
2516     if (ret) {
2517         error_setg_errno(errp, -ret, "failed to get config info");
2518         return;
2519     }
2520 
2521     trace_vfio_populate_device_config(vdev->vbasedev.name,
2522                                       (unsigned long)reg_info->size,
2523                                       (unsigned long)reg_info->offset,
2524                                       (unsigned long)reg_info->flags);
2525 
2526     vdev->config_size = reg_info->size;
2527     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2528         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2529     }
2530     vdev->config_offset = reg_info->offset;
2531 
2532     g_free(reg_info);
2533 
2534     if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
2535         ret = vfio_populate_vga(vdev, errp);
2536         if (ret) {
2537             error_append_hint(errp, "device does not support "
2538                               "requested feature x-vga\n");
2539             return;
2540         }
2541     }
2542 
2543     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
2544 
2545     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
2546     if (ret) {
2547         /* This can fail for an old kernel or legacy PCI dev */
2548         trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
2549     } else if (irq_info.count == 1) {
2550         vdev->pci_aer = true;
2551     } else {
2552         warn_report(VFIO_MSG_PREFIX
2553                     "Could not enable error recovery for the device",
2554                     vbasedev->name);
2555     }
2556 }
2557 
2558 static void vfio_put_device(VFIOPCIDevice *vdev)
2559 {
2560     g_free(vdev->vbasedev.name);
2561     g_free(vdev->msix);
2562 
2563     vfio_put_base_device(&vdev->vbasedev);
2564 }
2565 
2566 static void vfio_err_notifier_handler(void *opaque)
2567 {
2568     VFIOPCIDevice *vdev = opaque;
2569 
2570     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
2571         return;
2572     }
2573 
2574     /*
2575      * TBD. Retrieve the error details and decide what action
2576      * needs to be taken. One of the actions could be to pass
2577      * the error to the guest and have the guest driver recover
2578      * from the error. This requires that PCIe capabilities be
2579      * exposed to the guest. For now, we just terminate the
2580      * guest to contain the error.
2581      */
2582 
2583     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
2584 
2585     vm_stop(RUN_STATE_INTERNAL_ERROR);
2586 }
2587 
2588 /*
2589  * Registers error notifier for devices supporting error recovery.
2590  * If we encounter a failure in this function, we report an error
2591  * and continue after disabling error recovery support for the
2592  * device.
2593  */
2594 static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
2595 {
2596     Error *err = NULL;
2597     int32_t fd;
2598 
2599     if (!vdev->pci_aer) {
2600         return;
2601     }
2602 
2603     if (event_notifier_init(&vdev->err_notifier, 0)) {
2604         error_report("vfio: Unable to init event notifier for error detection");
2605         vdev->pci_aer = false;
2606         return;
2607     }
2608 
2609     fd = event_notifier_get_fd(&vdev->err_notifier);
2610     qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
2611 
2612     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
2613                                VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
2614         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2615         qemu_set_fd_handler(fd, NULL, NULL, vdev);
2616         event_notifier_cleanup(&vdev->err_notifier);
2617         vdev->pci_aer = false;
2618     }
2619 }
2620 
2621 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
2622 {
2623     Error *err = NULL;
2624 
2625     if (!vdev->pci_aer) {
2626         return;
2627     }
2628 
2629     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
2630                                VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
2631         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2632     }
2633     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
2634                         NULL, NULL, vdev);
2635     event_notifier_cleanup(&vdev->err_notifier);
2636 }
2637 
2638 static void vfio_req_notifier_handler(void *opaque)
2639 {
2640     VFIOPCIDevice *vdev = opaque;
2641     Error *err = NULL;
2642 
2643     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
2644         return;
2645     }
2646 
2647     qdev_unplug(DEVICE(vdev), &err);
2648     if (err) {
2649         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2650     }
2651 }
2652 
2653 static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
2654 {
2655     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
2656                                       .index = VFIO_PCI_REQ_IRQ_INDEX };
2657     Error *err = NULL;
2658     int32_t fd;
2659 
2660     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
2661         return;
2662     }
2663 
2664     if (ioctl(vdev->vbasedev.fd,
2665               VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
2666         return;
2667     }
2668 
2669     if (event_notifier_init(&vdev->req_notifier, 0)) {
2670         error_report("vfio: Unable to init event notifier for device request");
2671         return;
2672     }
2673 
2674     fd = event_notifier_get_fd(&vdev->req_notifier);
2675     qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
2676 
2677     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
2678                            VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
2679         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2680         qemu_set_fd_handler(fd, NULL, NULL, vdev);
2681         event_notifier_cleanup(&vdev->req_notifier);
2682     } else {
2683         vdev->req_enabled = true;
2684     }
2685 }
2686 
2687 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
2688 {
2689     Error *err = NULL;
2690 
2691     if (!vdev->req_enabled) {
2692         return;
2693     }
2694 
2695     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
2696                                VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
2697         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2698     }
2699     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
2700                         NULL, NULL, vdev);
2701     event_notifier_cleanup(&vdev->req_notifier);
2702 
2703     vdev->req_enabled = false;
2704 }
2705 
2706 static void vfio_realize(PCIDevice *pdev, Error **errp)
2707 {
2708     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
2709     VFIODevice *vbasedev_iter;
2710     VFIOGroup *group;
2711     char *tmp, *subsys, group_path[PATH_MAX], *group_name;
2712     Error *err = NULL;
2713     ssize_t len;
2714     struct stat st;
2715     int groupid;
2716     int i, ret;
2717     bool is_mdev;
2718 
2719     if (!vdev->vbasedev.sysfsdev) {
2720         if (!(~vdev->host.domain || ~vdev->host.bus ||
2721               ~vdev->host.slot || ~vdev->host.function)) {
2722             error_setg(errp, "No provided host device");
2723             error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
2724                               "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
2725             return;
2726         }
2727         vdev->vbasedev.sysfsdev =
2728             g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
2729                             vdev->host.domain, vdev->host.bus,
2730                             vdev->host.slot, vdev->host.function);
2731     }
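
    /*
     * e.g. -device vfio-pci,host=0000:01:00.0 resolves to
     * /sys/bus/pci/devices/0000:01:00.0, the same path a user could pass
     * directly through the sysfsdev= option.
     */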
2732 
2733     if (stat(vdev->vbasedev.sysfsdev, &st) < 0) {
2734         error_setg_errno(errp, errno, "no such host device");
2735         error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.sysfsdev);
2736         return;
2737     }
2738 
2739     if (!pdev->failover_pair_id) {
2740         error_setg(&vdev->migration_blocker,
2741                 "VFIO device doesn't support migration");
2742         ret = migrate_add_blocker(vdev->migration_blocker, errp);
2743         if (ret) {
2744             error_free(vdev->migration_blocker);
2745             vdev->migration_blocker = NULL;
2746             return;
2747         }
2748     }
2749 
2750     vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev);
2751     vdev->vbasedev.ops = &vfio_pci_ops;
2752     vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
2753     vdev->vbasedev.dev = DEVICE(vdev);
2754 
2755     tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
2756     len = readlink(tmp, group_path, sizeof(group_path));
2757     g_free(tmp);
2758 
2759     if (len <= 0 || len >= sizeof(group_path)) {
2760         error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG,
2761                          "no iommu_group found");
2762         goto error;
2763     }
2764 
2765     group_path[len] = 0;
2766 
2767     group_name = basename(group_path);
2768     if (sscanf(group_name, "%d", &groupid) != 1) {
2769         error_setg_errno(errp, errno, "failed to read %s", group_path);
2770         goto error;
2771     }
2772 
2773     trace_vfio_realize(vdev->vbasedev.name, groupid);
2774 
2775     group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp);
2776     if (!group) {
2777         goto error;
2778     }
2779 
2780     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2781         if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
2782             error_setg(errp, "device is already attached");
2783             vfio_put_group(group);
2784             goto error;
2785         }
2786     }
2787 
2788     /*
2789      * Mediated devices *might* operate compatibly with discarding of RAM, but
2790      * we cannot know for certain; it depends on whether the mdev vendor driver
2791      * stays in sync with the active working set of the guest driver.  Prevent
2792      * the x-balloon-allowed option unless this is minimally an mdev device.
2793      */
2794     tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev);
2795     subsys = realpath(tmp, NULL);
2796     g_free(tmp);
2797     is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
2798     free(subsys);
2799 
2800     trace_vfio_mdev(vdev->vbasedev.name, is_mdev);
2801 
2802     if (vdev->vbasedev.ram_block_discard_allowed && !is_mdev) {
2803         error_setg(errp, "x-balloon-allowed only potentially compatible "
2804                    "with mdev devices");
2805         vfio_put_group(group);
2806         goto error;
2807     }
2808 
2809     ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
2810     if (ret) {
2811         vfio_put_group(group);
2812         goto error;
2813     }
2814 
2815     vfio_populate_device(vdev, &err);
2816     if (err) {
2817         error_propagate(errp, err);
2818         goto error;
2819     }
2820 
2821     /* Get a copy of config space */
2822     ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
2823                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
2824                 vdev->config_offset);
2825     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
2826         ret = ret < 0 ? -errno : -EFAULT;
2827         error_setg_errno(errp, -ret, "failed to read device config space");
2828         goto error;
2829     }
2830 
2831     /* vfio emulates a lot for us, but some bits need extra love */
2832     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
2833 
2834     /* QEMU can choose to expose the ROM or not */
2835     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
2836     /* QEMU can also add or extend BARs */
2837     memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
2838 
2839     /*
2840      * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
2841      * device ID is managed by the vendor and need only be a 16-bit value.
2842      * Allow any 16-bit value for subsystem so they can be hidden or changed.
2843      */
2844     if (vdev->vendor_id != PCI_ANY_ID) {
2845         if (vdev->vendor_id >= 0xffff) {
2846             error_setg(errp, "invalid PCI vendor ID provided");
2847             goto error;
2848         }
2849         vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
2850         trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id);
2851     } else {
2852         vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
2853     }
2854 
2855     if (vdev->device_id != PCI_ANY_ID) {
2856         if (vdev->device_id > 0xffff) {
2857             error_setg(errp, "invalid PCI device ID provided");
2858             goto error;
2859         }
2860         vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
2861         trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id);
2862     } else {
2863         vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
2864     }
2865 
2866     if (vdev->sub_vendor_id != PCI_ANY_ID) {
2867         if (vdev->sub_vendor_id > 0xffff) {
2868             error_setg(errp, "invalid PCI subsystem vendor ID provided");
2869             goto error;
2870         }
2871         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
2872                                vdev->sub_vendor_id, ~0);
2873         trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name,
2874                                               vdev->sub_vendor_id);
2875     }
2876 
2877     if (vdev->sub_device_id != PCI_ANY_ID) {
2878         if (vdev->sub_device_id > 0xffff) {
2879             error_setg(errp, "invalid PCI subsystem device ID provided");
2880             goto error;
2881         }
2882         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
2883         trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name,
2884                                               vdev->sub_device_id);
2885     }
2886 
2887     /* QEMU can change multi-function devices to single function, or reverse */
2888     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
2889                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
2890 
2891     /* Restore or clear multifunction, this is always controlled by QEMU */
2892     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
2893         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
2894     } else {
2895         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
2896     }
2897 
2898     /*
2899      * Clear host resource mapping info.  If we choose not to register a
2900      * BAR, such as might be the case with the option ROM, we can get
2901      * confusing, unwritable, residual addresses from the host here.
2902      */
2903     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
2904     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
2905 
2906     vfio_pci_size_rom(vdev);
2907 
2908     vfio_bars_prepare(vdev);
2909 
2910     vfio_msix_early_setup(vdev, &err);
2911     if (err) {
2912         error_propagate(errp, err);
2913         goto error;
2914     }
2915 
2916     vfio_bars_register(vdev);
2917 
2918     ret = vfio_add_capabilities(vdev, errp);
2919     if (ret) {
2920         goto out_teardown;
2921     }
2922 
2923     if (vdev->vga) {
2924         vfio_vga_quirk_setup(vdev);
2925     }
2926 
2927     for (i = 0; i < PCI_ROM_SLOT; i++) {
2928         vfio_bar_quirk_setup(vdev, i);
2929     }
2930 
2931     if (!vdev->igd_opregion &&
2932         vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
2933         struct vfio_region_info *opregion;
2934 
2935         if (vdev->pdev.qdev.hotplugged) {
2936             error_setg(errp,
2937                        "cannot support IGD OpRegion feature on hotplugged "
2938                        "device");
2939             goto out_teardown;
2940         }
2941 
2942         ret = vfio_get_dev_region_info(&vdev->vbasedev,
2943                         VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
2944                         VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
2945         if (ret) {
2946             error_setg_errno(errp, -ret,
2947                              "does not support requested IGD OpRegion feature");
2948             goto out_teardown;
2949         }
2950 
2951         ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
2952         g_free(opregion);
2953         if (ret) {
2954             goto out_teardown;
2955         }
2956     }
2957 
2958     /* QEMU emulates all of MSI & MSIX */
2959     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
2960         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
2961                MSIX_CAP_LENGTH);
2962     }
2963 
2964     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
2965         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
2966                vdev->msi_cap_size);
2967     }
2968 
2969     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
2970         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
2971                                                   vfio_intx_mmap_enable, vdev);
2972         pci_device_set_intx_routing_notifier(&vdev->pdev,
2973                                              vfio_intx_routing_notifier);
2974         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
2975         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
2976         ret = vfio_intx_enable(vdev, errp);
2977         if (ret) {
2978             goto out_deregister;
2979         }
2980     }
2981 
2982     if (vdev->display != ON_OFF_AUTO_OFF) {
2983         ret = vfio_display_probe(vdev, errp);
2984         if (ret) {
2985             goto out_deregister;
2986         }
2987     }
2988     if (vdev->enable_ramfb && vdev->dpy == NULL) {
2989         error_setg(errp, "ramfb=on requires display=on");
2990         goto out_deregister;
2991     }
2992     if (vdev->display_xres || vdev->display_yres) {
2993         if (vdev->dpy == NULL) {
2994             error_setg(errp, "xres and yres properties require display=on");
2995             goto out_deregister;
2996         }
2997         if (vdev->dpy->edid_regs == NULL) {
2998             error_setg(errp, "xres and yres properties need edid support");
2999             goto out_deregister;
3000         }
3001     }
3002 
3003     if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) {
3004         ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
3005         if (ret && ret != -ENODEV) {
3006             error_report("Failed to setup NVIDIA V100 GPU RAM");
3007         }
3008     }
3009 
3010     if (vdev->vendor_id == PCI_VENDOR_ID_IBM) {
3011         ret = vfio_pci_nvlink2_init(vdev, errp);
3012         if (ret && ret != -ENODEV) {
3013             error_report("Failed to setup NVlink2 bridge");
3014         }
3015     }
3016 
3017     vfio_register_err_notifier(vdev);
3018     vfio_register_req_notifier(vdev);
3019     vfio_setup_resetfn_quirk(vdev);
3020 
3021     return;
3022 
3023 out_deregister:
3024     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3025     kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3026 out_teardown:
3027     vfio_teardown_msi(vdev);
3028     vfio_bars_exit(vdev);
3029 error:
3030     error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3031     if (vdev->migration_blocker) {
3032         migrate_del_blocker(vdev->migration_blocker);
3033         error_free(vdev->migration_blocker);
3034         vdev->migration_blocker = NULL;
3035     }
3036 }
3037 
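/*
 * Runs when the last reference to the object is dropped, after
 * vfio_exitfn(); only then is it safe to drop the vfio device and
 * group references.
 */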
3038 static void vfio_instance_finalize(Object *obj)
3039 {
3040     VFIOPCIDevice *vdev = PCI_VFIO(obj);
3041     VFIOGroup *group = vdev->vbasedev.group;
3042 
3043     vfio_display_finalize(vdev);
3044     vfio_bars_finalize(vdev);
3045     g_free(vdev->emulated_config_bits);
3046     g_free(vdev->rom);
3047     if (vdev->migration_blocker) {
3048         migrate_del_blocker(vdev->migration_blocker);
3049         error_free(vdev->migration_blocker);
3050     }
3051     /*
3052      * XXX Leaking igd_opregion is not an oversight; we can't remove the
3053      * fw_cfg entry, so leaking this allocation seems like the safest
3054      * option.
3055      *
3056      * g_free(vdev->igd_opregion);
3057      */
3058     vfio_put_device(vdev);
3059     vfio_put_group(group);
3060 }
3061 
3062 static void vfio_exitfn(PCIDevice *pdev)
3063 {
3064     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
3065 
3066     vfio_unregister_req_notifier(vdev);
3067     vfio_unregister_err_notifier(vdev);
3068     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3069     if (vdev->irqchip_change_notifier.notify) {
3070         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3071     }
3072     vfio_disable_interrupts(vdev);
3073     if (vdev->intx.mmap_timer) {
3074         timer_free(vdev->intx.mmap_timer);
3075     }
3076     vfio_teardown_msi(vdev);
3077     vfio_bars_exit(vdev);
3078 }
3079 
3080 static void vfio_pci_reset(DeviceState *dev)
3081 {
3082     VFIOPCIDevice *vdev = PCI_VFIO(dev);
3083 
3084     trace_vfio_pci_reset(vdev->vbasedev.name);
3085 
3086     vfio_pci_pre_reset(vdev);
3087 
3088     if (vdev->display != ON_OFF_AUTO_OFF) {
3089         vfio_display_reset(vdev);
3090     }
3091 
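    /* A device-specific quirk reset (see vfio_setup_resetfn_quirk) wins. */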
3092     if (vdev->resetfn && !vdev->resetfn(vdev)) {
3093         goto post_reset;
3094     }
3095 
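    /*
     * Prefer VFIO_DEVICE_RESET when it is backed by something stronger
     * than a PM reset, e.g. FLR; a bare PM reset remains the last
     * resort below.
     */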
3096     if (vdev->vbasedev.reset_works &&
3097         (vdev->has_flr || !vdev->has_pm_reset) &&
3098         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3099         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3100         goto post_reset;
3101     }
3102 
3103     /* See if we can do our own bus reset */
3104     if (!vfio_pci_hot_reset_one(vdev)) {
3105         goto post_reset;
3106     }
3107 
3108     /* If nothing else works and the device supports PM reset, use it */
3109     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3110         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3111         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3112         goto post_reset;
3113     }
3114 
3115 post_reset:
3116     vfio_pci_post_reset(vdev);
3117 }
3118 
3119 static void vfio_instance_init(Object *obj)
3120 {
3121     PCIDevice *pci_dev = PCI_DEVICE(obj);
3122     VFIOPCIDevice *vdev = PCI_VFIO(obj);
3123 
3124     device_add_bootindex_property(obj, &vdev->bootindex,
3125                                   "bootindex", NULL,
3126                                   &pci_dev->qdev);
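    /*
     * All-ones marks the host address as unset; vfio_realize requires
     * either the "host" or "sysfsdev" property to identify the device.
     */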
3127     vdev->host.domain = ~0U;
3128     vdev->host.bus = ~0U;
3129     vdev->host.slot = ~0U;
3130     vdev->host.function = ~0U;
3131 
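    /* 0xFF marks the x-nv-gpudirect-clique property as unset. */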
3132     vdev->nv_gpudirect_clique = 0xFF;
3133 
3134     /* QEMU_PCI_CAP_EXPRESS initialization does not depend on the QEMU
3135      * command line, so there is no need to defer it until realize. */
3136     pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
3137 }
3138 
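/*
 * These qdev properties map one-to-one to -device options, e.g., with a
 * hypothetical host address:
 *
 *   -device vfio-pci,host=0000:02:00.0,x-no-mmap=on
 *
 * Properties with an "x-" prefix are experimental and may change or
 * disappear without notice.
 */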
3139 static Property vfio_pci_dev_properties[] = {
3140     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
3141     DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
3142     DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
3143                             display, ON_OFF_AUTO_OFF),
3144     DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
3145     DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
3146     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3147                        intx.mmap_timeout, 1100),
3148     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3149                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
3150     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
3151                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
3152     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
3153                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
3154     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3155     DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
3156                      vbasedev.ram_block_discard_allowed, false),
3157     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
3158     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
3159     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3160     DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
3161                      no_geforce_quirks, false),
3162     DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
3163                      false),
3164     DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
3165                      false),
3166     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
3167     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
3168     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
3169                        sub_vendor_id, PCI_ANY_ID),
3170     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
3171                        sub_device_id, PCI_ANY_ID),
3172     DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
3173     DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
3174                                    nv_gpudirect_clique,
3175                                    qdev_prop_nv_gpudirect_clique, uint8_t),
3176     DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
3177                                 OFF_AUTOPCIBAR_OFF),
3178     /*
3179      * TODO - support passed fds... is this necessary?
3180      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
3181      * DEFINE_PROP_STRING("vfiogroupfd", VFIOPCIDevice, vfiogroupfd_name),
3182      */
3183     DEFINE_PROP_END_OF_LIST(),
3184 };
3185 
3186 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3187 {
3188     DeviceClass *dc = DEVICE_CLASS(klass);
3189     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3190 
3191     dc->reset = vfio_pci_reset;
3192     device_class_set_props(dc, vfio_pci_dev_properties);
3193     dc->desc = "VFIO-based PCI device assignment";
3194     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3195     pdc->realize = vfio_realize;
3196     pdc->exit = vfio_exitfn;
3197     pdc->config_read = vfio_pci_read_config;
3198     pdc->config_write = vfio_pci_write_config;
3199 }
3200 
3201 static const TypeInfo vfio_pci_dev_info = {
3202     .name = TYPE_VFIO_PCI,
3203     .parent = TYPE_PCI_DEVICE,
3204     .instance_size = sizeof(VFIOPCIDevice),
3205     .class_init = vfio_pci_dev_class_init,
3206     .instance_init = vfio_instance_init,
3207     .instance_finalize = vfio_instance_finalize,
3208     .interfaces = (InterfaceInfo[]) {
3209         { INTERFACE_PCIE_DEVICE },
3210         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
3211         { }
3212     },
3213 };
3214 
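/*
 * The nohotplug variant can additionally provide a ramfb boot
 * framebuffer, e.g., with a hypothetical host address:
 *
 *   -device vfio-pci-nohotplug,host=0000:02:00.0,display=on,ramfb=on
 *
 * As vfio_realize checks above, ramfb=on requires display support.
 */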
3215 static Property vfio_pci_dev_nohotplug_properties[] = {
3216     DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
3217     DEFINE_PROP_END_OF_LIST(),
3218 };
3219 
3220 static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data)
3221 {
3222     DeviceClass *dc = DEVICE_CLASS(klass);
3223 
3224     device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
3225     dc->hotpluggable = false;
3226 }
3227 
3228 static const TypeInfo vfio_pci_nohotplug_dev_info = {
3229     .name = TYPE_VFIO_PCI_NOHOTPLUG,
3230     .parent = TYPE_VFIO_PCI,
3231     .instance_size = sizeof(VFIOPCIDevice),
3232     .class_init = vfio_pci_nohotplug_dev_class_init,
3233 };
3234 
3235 static void register_vfio_pci_dev_type(void)
3236 {
3237     type_register_static(&vfio_pci_dev_info);
3238     type_register_static(&vfio_pci_nohotplug_dev_info);
3239 }
3240 
3241 type_init(register_vfio_pci_dev_type)
3242