xref: /openbmc/qemu/hw/vfio/pci.c (revision 28ae3179fc52d2e4d870b635c4a412aab99759e7)
1cf7087dbSKim Phillips /*
2cf7087dbSKim Phillips  * vfio based device assignment support
3cf7087dbSKim Phillips  *
4cf7087dbSKim Phillips  * Copyright Red Hat, Inc. 2012
5cf7087dbSKim Phillips  *
6cf7087dbSKim Phillips  * Authors:
7cf7087dbSKim Phillips  *  Alex Williamson <alex.williamson@redhat.com>
8cf7087dbSKim Phillips  *
9cf7087dbSKim Phillips  * This work is licensed under the terms of the GNU GPL, version 2.  See
10cf7087dbSKim Phillips  * the COPYING file in the top-level directory.
11cf7087dbSKim Phillips  *
12cf7087dbSKim Phillips  * Based on qemu-kvm device-assignment:
13cf7087dbSKim Phillips  *  Adapted for KVM by Qumranet.
14cf7087dbSKim Phillips  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15cf7087dbSKim Phillips  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16cf7087dbSKim Phillips  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17cf7087dbSKim Phillips  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18cf7087dbSKim Phillips  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19cf7087dbSKim Phillips  */
20cf7087dbSKim Phillips 
21c6eacb1aSPeter Maydell #include "qemu/osdep.h"
22ee42b261SEric Auger #include CONFIG_DEVICES /* CONFIG_IOMMUFD */
23cf7087dbSKim Phillips #include <linux/vfio.h>
24cf7087dbSKim Phillips #include <sys/ioctl.h>
25cf7087dbSKim Phillips 
26650d103dSMarkus Armbruster #include "hw/hw.h"
27cf7087dbSKim Phillips #include "hw/pci/msi.h"
28cf7087dbSKim Phillips #include "hw/pci/msix.h"
290282abf0SAlex Williamson #include "hw/pci/pci_bridge.h"
30a27bd6c7SMarkus Armbruster #include "hw/qdev-properties.h"
31ce35e229SEduardo Habkost #include "hw/qdev-properties-system.h"
32d6454270SMarkus Armbruster #include "migration/vmstate.h"
33f3558b1bSKevin Wolf #include "qapi/qmp/qdict.h"
34cf7087dbSKim Phillips #include "qemu/error-report.h"
35db725815SMarkus Armbruster #include "qemu/main-loop.h"
360b8fa32fSMarkus Armbruster #include "qemu/module.h"
37cf7087dbSKim Phillips #include "qemu/range.h"
38e0255bb1SPhilippe Mathieu-Daudé #include "qemu/units.h"
39cf7087dbSKim Phillips #include "sysemu/kvm.h"
4054d31236SMarkus Armbruster #include "sysemu/runstate.h"
4178f33d2bSAlex Williamson #include "pci.h"
42cf7087dbSKim Phillips #include "trace.h"
431108b2f8SCao jin #include "qapi/error.h"
44f045a010SJens Freimann #include "migration/blocker.h"
45c5e2fb3cSKirti Wankhede #include "migration/qemu-file.h"
46ee42b261SEric Auger #include "sysemu/iommufd.h"
47cf7087dbSKim Phillips 
48f75ca627SChen Zhang #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
490c0c8f8aSLi Qiang 
50dc580d51SLongpeng(Mike) /* Protected by BQL */
51dc580d51SLongpeng(Mike) static KVMRouteChange vfio_route_change;
52dc580d51SLongpeng(Mike) 
539ee27d73SEric Auger static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
549ee27d73SEric Auger static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
558ab217d5SLongpeng(Mike) static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
56cf7087dbSKim Phillips 
57cf7087dbSKim Phillips /*
58cf7087dbSKim Phillips  * Disabling BAR mmapping can be slow, but toggling it around INTx can
59cf7087dbSKim Phillips  * also be a huge overhead.  We try to get the best of both worlds by
60cf7087dbSKim Phillips  * waiting until an interrupt to disable mmaps (subsequent transitions
61cf7087dbSKim Phillips  * to the same state are effectively no overhead).  If the interrupt has
62cf7087dbSKim Phillips  * been serviced and the time gap is long enough, we re-enable mmaps for
63cf7087dbSKim Phillips  * performance.  This works well for things like graphics cards, which
64cf7087dbSKim Phillips  * may not use their interrupt at all and are penalized to an unusable
65cf7087dbSKim Phillips  * level by read/write BAR traps.  Other devices, like NICs, have more
66cf7087dbSKim Phillips  * regular interrupts and see much better latency by staying in non-mmap
67cf7087dbSKim Phillips  * mode.  We therefore set the default mmap_timeout such that a ping
68cf7087dbSKim Phillips  * is just enough to keep the mmap disabled.  Users can experiment with
69cf7087dbSKim Phillips  * other options with the x-intx-mmap-timeout-ms parameter (a value of
70cf7087dbSKim Phillips  * zero disables the timer).
71cf7087dbSKim Phillips  */
vfio_intx_mmap_enable(void * opaque)72cf7087dbSKim Phillips static void vfio_intx_mmap_enable(void *opaque)
73cf7087dbSKim Phillips {
749ee27d73SEric Auger     VFIOPCIDevice *vdev = opaque;
75cf7087dbSKim Phillips 
76cf7087dbSKim Phillips     if (vdev->intx.pending) {
77cf7087dbSKim Phillips         timer_mod(vdev->intx.mmap_timer,
78cf7087dbSKim Phillips                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
79cf7087dbSKim Phillips         return;
80cf7087dbSKim Phillips     }
81cf7087dbSKim Phillips 
82cf7087dbSKim Phillips     vfio_mmap_set_enabled(vdev, true);
83cf7087dbSKim Phillips }
84cf7087dbSKim Phillips 
vfio_intx_interrupt(void * opaque)85cf7087dbSKim Phillips static void vfio_intx_interrupt(void *opaque)
86cf7087dbSKim Phillips {
879ee27d73SEric Auger     VFIOPCIDevice *vdev = opaque;
88cf7087dbSKim Phillips 
89cf7087dbSKim Phillips     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
90cf7087dbSKim Phillips         return;
91cf7087dbSKim Phillips     }
92cf7087dbSKim Phillips 
93df92ee44SEric Auger     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
94cf7087dbSKim Phillips 
95cf7087dbSKim Phillips     vdev->intx.pending = true;
96cf7087dbSKim Phillips     pci_irq_assert(&vdev->pdev);
97cf7087dbSKim Phillips     vfio_mmap_set_enabled(vdev, false);
98cf7087dbSKim Phillips     if (vdev->intx.mmap_timeout) {
99cf7087dbSKim Phillips         timer_mod(vdev->intx.mmap_timer,
100cf7087dbSKim Phillips                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
101cf7087dbSKim Phillips     }
102cf7087dbSKim Phillips }
103cf7087dbSKim Phillips 
/*
 * End-of-interrupt handling for INTx.
 *
 * @vbasedev: base device embedded in the VFIOPCIDevice.
 *
 * If an INTx is currently pending, clear the pending flag, de-assert the
 * PCI INTx line and unmask the INTx irq index so it can fire again.
 * Does nothing when no interrupt is pending.
 */
static void vfio_intx_eoi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    if (vdev->intx.pending) {
        trace_vfio_intx_eoi(vbasedev->name);

        vdev->intx.pending = false;
        pci_irq_deassert(&vdev->pdev);
        vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    }
}
118cf7087dbSKim Phillips 
vfio_intx_enable_kvm(VFIOPCIDevice * vdev,Error ** errp)11944cd660aSZhenzhong Duan static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
120cf7087dbSKim Phillips {
121cf7087dbSKim Phillips #ifdef CONFIG_KVM
12297a37576SPeter Xu     int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
123cf7087dbSKim Phillips 
12446746dbaSAlex Williamson     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
125cf7087dbSKim Phillips         vdev->intx.route.mode != PCI_INTX_ENABLED ||
126cf7087dbSKim Phillips         !kvm_resamplefds_enabled()) {
12744cd660aSZhenzhong Duan         return true;
128cf7087dbSKim Phillips     }
129cf7087dbSKim Phillips 
130cf7087dbSKim Phillips     /* Get to a known interrupt state */
13197a37576SPeter Xu     qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
1325546a621SEric Auger     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
133cf7087dbSKim Phillips     vdev->intx.pending = false;
134cf7087dbSKim Phillips     pci_irq_deassert(&vdev->pdev);
135cf7087dbSKim Phillips 
136cf7087dbSKim Phillips     /* Get an eventfd for resample/unmask */
137cf7087dbSKim Phillips     if (event_notifier_init(&vdev->intx.unmask, 0)) {
1387dfb3424SEric Auger         error_setg(errp, "event_notifier_init failed eoi");
139cf7087dbSKim Phillips         goto fail;
140cf7087dbSKim Phillips     }
141cf7087dbSKim Phillips 
14297a37576SPeter Xu     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
14397a37576SPeter Xu                                            &vdev->intx.interrupt,
14497a37576SPeter Xu                                            &vdev->intx.unmask,
14597a37576SPeter Xu                                            vdev->intx.route.irq)) {
1467dfb3424SEric Auger         error_setg_errno(errp, errno, "failed to setup resample irqfd");
147cf7087dbSKim Phillips         goto fail_irqfd;
148cf7087dbSKim Phillips     }
149cf7087dbSKim Phillips 
15084e37d02SZhenzhong Duan     if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
151201a7331SEric Auger                                 VFIO_IRQ_SET_ACTION_UNMASK,
15297a37576SPeter Xu                                 event_notifier_get_fd(&vdev->intx.unmask),
153668f62ecSMarkus Armbruster                                 errp)) {
154cf7087dbSKim Phillips         goto fail_vfio;
155cf7087dbSKim Phillips     }
156cf7087dbSKim Phillips 
157cf7087dbSKim Phillips     /* Let'em rip */
1585546a621SEric Auger     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
159cf7087dbSKim Phillips 
160cf7087dbSKim Phillips     vdev->intx.kvm_accel = true;
161cf7087dbSKim Phillips 
162870cb6f1SAlex Williamson     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
163cf7087dbSKim Phillips 
16444cd660aSZhenzhong Duan     return true;
165cf7087dbSKim Phillips 
166cf7087dbSKim Phillips fail_vfio:
16797a37576SPeter Xu     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
16897a37576SPeter Xu                                           vdev->intx.route.irq);
169cf7087dbSKim Phillips fail_irqfd:
170cf7087dbSKim Phillips     event_notifier_cleanup(&vdev->intx.unmask);
171cf7087dbSKim Phillips fail:
17297a37576SPeter Xu     qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
1735546a621SEric Auger     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
17444cd660aSZhenzhong Duan     return false;
17544cd660aSZhenzhong Duan #else
17644cd660aSZhenzhong Duan     return true;
177cf7087dbSKim Phillips #endif
178cf7087dbSKim Phillips }
179cf7087dbSKim Phillips 
/*
 * Undo vfio_intx_enable_kvm(): move INTx delivery back to the QEMU
 * (userspace) eventfd handler.
 *
 * @vdev: device whose KVM INTx acceleration should be torn down.
 *
 * No-op when built without CONFIG_KVM or when intx.kvm_accel is not set.
 */
static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    int rc;

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    rc = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
                                               &vdev->intx.interrupt,
                                               vdev->intx.route.irq);
    if (rc) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
                        vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
}
216cf7087dbSKim Phillips 
/*
 * Apply a new INTx route to the device.
 *
 * @vdev:  device being updated.
 * @route: route to install; it is copied into vdev->intx.route.  Callers
 *         in this file may pass &vdev->intx.route itself.
 *
 * KVM acceleration is torn down against the old route, the new route is
 * stored, and if the new route is enabled, acceleration is attempted
 * again.  A failure to re-enable acceleration is only reported as a
 * warning.
 */
static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
{
    Error *local_err = NULL;

    trace_vfio_intx_update(vdev->vbasedev.name,
                           vdev->intx.route.irq, route->irq);

    /* Must happen before the route is overwritten */
    vfio_intx_disable_kvm(vdev);

    vdev->intx.route = *route;

    if (route->mode == PCI_INTX_ENABLED) {
        if (!vfio_intx_enable_kvm(vdev, &local_err)) {
            warn_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
        }

        /* Re-enable the interrupt in case we missed an EOI */
        vfio_intx_eoi(&vdev->vbasedev);
    }
}
239cf7087dbSKim Phillips 
/*
 * React to a change in PCI INTx routing.
 *
 * @pdev: the PCI device, which must be a VFIO PCI device.
 *
 * Only relevant while the device is in INTx mode.  The current route for
 * the device's pin is looked up, and vfio_intx_update() is called if it
 * differs from the cached one.
 */
static void vfio_intx_routing_notifier(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    PCIINTxRoute new_route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    new_route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &new_route)) {
        return;
    }

    vfio_intx_update(vdev, &new_route);
}
255ad54dbd8SDavid Gibson 
/*
 * Notifier callback for irqchip changes.
 *
 * @notify: the irqchip_change_notifier member embedded in a VFIOPCIDevice.
 * @data:   unused.
 *
 * Re-runs vfio_intx_update() with the device's currently cached route.
 */
static void vfio_irqchip_change(Notifier *notify, void *data)
{
    VFIOPCIDevice *vdev;

    vdev = container_of(notify, VFIOPCIDevice, irqchip_change_notifier);
    vfio_intx_update(vdev, &vdev->intx.route);
}
263c5478feaSDavid Gibson 
vfio_intx_enable(VFIOPCIDevice * vdev,Error ** errp)264c32bab07SZhenzhong Duan static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
265cf7087dbSKim Phillips {
266cf7087dbSKim Phillips     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
2677dfb3424SEric Auger     Error *err = NULL;
268201a7331SEric Auger     int32_t fd;
269201a7331SEric Auger     int ret;
270201a7331SEric Auger 
271cf7087dbSKim Phillips 
272cf7087dbSKim Phillips     if (!pin) {
273c32bab07SZhenzhong Duan         return true;
274cf7087dbSKim Phillips     }
275cf7087dbSKim Phillips 
276cf7087dbSKim Phillips     vfio_disable_interrupts(vdev);
277cf7087dbSKim Phillips 
278cf7087dbSKim Phillips     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
279cf7087dbSKim Phillips     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
280cf7087dbSKim Phillips 
281cf7087dbSKim Phillips #ifdef CONFIG_KVM
282cf7087dbSKim Phillips     /*
283cf7087dbSKim Phillips      * Only conditional to avoid generating error messages on platforms
284cf7087dbSKim Phillips      * where we won't actually use the result anyway.
285cf7087dbSKim Phillips      */
286cf7087dbSKim Phillips     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
287cf7087dbSKim Phillips         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
288cf7087dbSKim Phillips                                                         vdev->intx.pin);
289cf7087dbSKim Phillips     }
290cf7087dbSKim Phillips #endif
291cf7087dbSKim Phillips 
292cf7087dbSKim Phillips     ret = event_notifier_init(&vdev->intx.interrupt, 0);
293cf7087dbSKim Phillips     if (ret) {
2947dfb3424SEric Auger         error_setg_errno(errp, -ret, "event_notifier_init failed");
295c32bab07SZhenzhong Duan         return false;
296cf7087dbSKim Phillips     }
297201a7331SEric Auger     fd = event_notifier_get_fd(&vdev->intx.interrupt);
298201a7331SEric Auger     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
299cf7087dbSKim Phillips 
30084e37d02SZhenzhong Duan     if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
301af175e85SMarkus Armbruster                                 VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
302201a7331SEric Auger         qemu_set_fd_handler(fd, NULL, NULL, vdev);
303cf7087dbSKim Phillips         event_notifier_cleanup(&vdev->intx.interrupt);
304c32bab07SZhenzhong Duan         return false;
305cf7087dbSKim Phillips     }
306cf7087dbSKim Phillips 
30744cd660aSZhenzhong Duan     if (!vfio_intx_enable_kvm(vdev, &err)) {
308e1eb292aSMarkus Armbruster         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3097dfb3424SEric Auger     }
310cf7087dbSKim Phillips 
311cf7087dbSKim Phillips     vdev->interrupt = VFIO_INT_INTx;
312cf7087dbSKim Phillips 
313870cb6f1SAlex Williamson     trace_vfio_intx_enable(vdev->vbasedev.name);
314c32bab07SZhenzhong Duan     return true;
315cf7087dbSKim Phillips }
316cf7087dbSKim Phillips 
/*
 * Take the device out of INTx interrupt mode.
 *
 * @vdev: device to reconfigure.
 *
 * Stops the mmap timer, tears down KVM acceleration, disables the INTx
 * irq index, clears any pending interrupt state, re-enables BAR mmaps,
 * and releases the INTx event notifier.  Leaves vdev->interrupt set to
 * VFIO_INT_NONE.
 */
static void vfio_intx_disable(VFIOPCIDevice *vdev)
{
    timer_del(vdev->intx.mmap_timer);
    vfio_intx_disable_kvm(vdev);
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    /* Drop any interrupt state left over from the INTx path */
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, true);

    /* Remove the fd handler before the notifier is cleaned up */
    qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
                        NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    trace_vfio_intx_disable(vdev->vbasedev.name);
}
336cf7087dbSKim Phillips 
337cf7087dbSKim Phillips /*
338cf7087dbSKim Phillips  * MSI/X
339cf7087dbSKim Phillips  */
vfio_msi_interrupt(void * opaque)340cf7087dbSKim Phillips static void vfio_msi_interrupt(void *opaque)
341cf7087dbSKim Phillips {
342cf7087dbSKim Phillips     VFIOMSIVector *vector = opaque;
3439ee27d73SEric Auger     VFIOPCIDevice *vdev = vector->vdev;
3440de70dc7SAlex Williamson     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
3450de70dc7SAlex Williamson     void (*notify)(PCIDevice *dev, unsigned vector);
3460de70dc7SAlex Williamson     MSIMessage msg;
347cf7087dbSKim Phillips     int nr = vector - vdev->msi_vectors;
348cf7087dbSKim Phillips 
349cf7087dbSKim Phillips     if (!event_notifier_test_and_clear(&vector->interrupt)) {
350cf7087dbSKim Phillips         return;
351cf7087dbSKim Phillips     }
352cf7087dbSKim Phillips 
353cf7087dbSKim Phillips     if (vdev->interrupt == VFIO_INT_MSIX) {
3540de70dc7SAlex Williamson         get_msg = msix_get_message;
3550de70dc7SAlex Williamson         notify = msix_notify;
35695239e16SAlex Williamson 
35795239e16SAlex Williamson         /* A masked vector firing needs to use the PBA, enable it */
35895239e16SAlex Williamson         if (msix_is_masked(&vdev->pdev, nr)) {
35995239e16SAlex Williamson             set_bit(nr, vdev->msix->pending);
36095239e16SAlex Williamson             memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
36195239e16SAlex Williamson             trace_vfio_msix_pba_enable(vdev->vbasedev.name);
36295239e16SAlex Williamson         }
363cf7087dbSKim Phillips     } else if (vdev->interrupt == VFIO_INT_MSI) {
3640de70dc7SAlex Williamson         get_msg = msi_get_message;
3650de70dc7SAlex Williamson         notify = msi_notify;
366cf7087dbSKim Phillips     } else {
367cf7087dbSKim Phillips         abort();
368cf7087dbSKim Phillips     }
369cf7087dbSKim Phillips 
3700de70dc7SAlex Williamson     msg = get_msg(&vdev->pdev, nr);
371bc5baffaSAlexey Kardashevskiy     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
3720de70dc7SAlex Williamson     notify(&vdev->pdev, nr);
373cf7087dbSKim Phillips }
374cf7087dbSKim Phillips 
3755ebffa4eSJing Liu /*
3765ebffa4eSJing Liu  * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid
3775ebffa4eSJing Liu  * fd to kernel.
3785ebffa4eSJing Liu  */
vfio_enable_msix_no_vec(VFIOPCIDevice * vdev)3795ebffa4eSJing Liu static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
3805ebffa4eSJing Liu {
3815ebffa4eSJing Liu     g_autofree struct vfio_irq_set *irq_set = NULL;
3825ebffa4eSJing Liu     int ret = 0, argsz;
3835ebffa4eSJing Liu     int32_t *fd;
3845ebffa4eSJing Liu 
3855ebffa4eSJing Liu     argsz = sizeof(*irq_set) + sizeof(*fd);
3865ebffa4eSJing Liu 
3875ebffa4eSJing Liu     irq_set = g_malloc0(argsz);
3885ebffa4eSJing Liu     irq_set->argsz = argsz;
3895ebffa4eSJing Liu     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3905ebffa4eSJing Liu                      VFIO_IRQ_SET_ACTION_TRIGGER;
3915ebffa4eSJing Liu     irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
3925ebffa4eSJing Liu     irq_set->start = 0;
3935ebffa4eSJing Liu     irq_set->count = 1;
3945ebffa4eSJing Liu     fd = (int32_t *)&irq_set->data;
3955ebffa4eSJing Liu     *fd = -1;
3965ebffa4eSJing Liu 
3975ebffa4eSJing Liu     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
3985ebffa4eSJing Liu 
3995ebffa4eSJing Liu     return ret;
4005ebffa4eSJing Liu }
4015ebffa4eSJing Liu 
/*
 * Program the trigger eventfds for vectors [0, vdev->nr_vectors) of the
 * MSI or MSI-X irq index in a single VFIO_DEVICE_SET_IRQS call.
 *
 * @vdev: device to configure.
 * @msix: true to target the MSI-X index, false for MSI.
 *
 * Vectors not marked in use get an fd of -1.  Returns 0 on success or the
 * non-zero result of the failing ioctl.
 *
 * The request buffer is g_autofree, as in vfio_enable_msix_no_vec(), so it
 * is released on every return path.
 */
static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
{
    g_autofree struct vfio_irq_set *irq_set = NULL;
    int ret = 0, i, argsz;
    int32_t *fds;

    /*
     * If dynamic MSI-X allocation is supported, the vectors to be allocated
     * and enabled can be scattered. Before kernel enabling MSI-X, setting
     * nr_vectors causes all these vectors to be allocated on host.
     *
     * To keep allocation as needed, use vector 0 with an invalid fd to get
     * MSI-X enabled first, then set vectors with a potentially sparse set of
     * eventfds to enable interrupts only when enabled in guest.
     */
    if (msix && !vdev->msix->noresize) {
        ret = vfio_enable_msix_no_vec(vdev);

        if (ret) {
            return ret;
        }
    }

    /* Header followed by one int32_t eventfd slot per vector */
    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        int fd = -1;

        /*
         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
         * bits, therefore we always use the KVM signaling path when setup.
         * MSI-X mask and pending bits are emulated, so we want to use the
         * KVM signaling path only when configured and unmasked.
         */
        if (vdev->msi_vectors[i].use) {
            if (vdev->msi_vectors[i].virq < 0 ||
                (msix && msix_is_masked(&vdev->pdev, i))) {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
            } else {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
            }
        }

        fds[i] = fd;
    }

    return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
}
462cf7087dbSKim Phillips 
/*
 * Add a KVM MSI route for a vector as part of the current
 * vfio_route_change batch.
 *
 * @vdev:     owning device.
 * @vector:   vector whose virq field receives the result.
 * @vector_n: index of the vector.
 * @msix:     true for MSI-X, false for MSI.
 *
 * Does nothing (leaving vector->virq untouched) when KVM routing is
 * disabled for the given interrupt type via no_kvm_msix / no_kvm_msi.
 */
static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
                                  int vector_n, bool msix)
{
    bool kvm_disabled = msix ? vdev->no_kvm_msix : vdev->no_kvm_msi;

    if (kvm_disabled) {
        return;
    }

    vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change,
                                             vector_n, &vdev->pdev);
}
473dc580d51SLongpeng(Mike) 
/*
 * Attach a KVM irqfd to a vector that already has a virq.
 *
 * @vector: vector to connect; ignored when vector->virq is negative.
 *
 * Creates the kvm_interrupt event notifier and registers it with KVM for
 * vector->virq.  If either step fails, whatever was set up is undone, the
 * virq is released and vector->virq is reset to -1.
 */
static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector)
{
    if (vector->virq < 0) {
        return;
    }

    if (event_notifier_init(&vector->kvm_interrupt, 0) == 0) {
        if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
                                               &vector->kvm_interrupt,
                                               NULL, vector->virq) >= 0) {
            return;
        }

        /* irqfd registration failed: drop the notifier created above */
        event_notifier_cleanup(&vector->kvm_interrupt);
    }

    /* Either step failed: give the route back */
    kvm_irqchip_release_virq(kvm_state, vector->virq);
    vector->virq = -1;
}
497cf7087dbSKim Phillips 
/*
 * Detach a vector from KVM: remove its irqfd, release its virq and clean
 * up the kvm_interrupt notifier.
 *
 * @vector: vector to disconnect; vector->virq is -1 afterwards.
 */
static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
{
    int virq = vector->virq;

    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                          virq);
    kvm_irqchip_release_virq(kvm_state, virq);
    vector->virq = -1;

    event_notifier_cleanup(&vector->kvm_interrupt);
}
506cf7087dbSKim Phillips 
/*
 * Update the MSI message behind an existing KVM route and commit the
 * routing table.
 *
 * @vector: vector whose virq identifies the route to update.
 * @msg:    new MSI message.
 * @pdev:   PCI device the route belongs to.
 */
static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
                                     PCIDevice *pdev)
{
    int virq = vector->virq;

    kvm_irqchip_update_msi_route(kvm_state, virq, msg, pdev);
    kvm_irqchip_commit_routes(kvm_state);
}
513cf7087dbSKim Phillips 
vfio_msix_vector_do_use(PCIDevice * pdev,unsigned int nr,MSIMessage * msg,IOHandler * handler)514cf7087dbSKim Phillips static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
515cf7087dbSKim Phillips                                    MSIMessage *msg, IOHandler *handler)
516cf7087dbSKim Phillips {
51701b46064SEduardo Habkost     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
518cf7087dbSKim Phillips     VFIOMSIVector *vector;
519cf7087dbSKim Phillips     int ret;
520d9e6710dSJing Liu     bool resizing = !!(vdev->nr_vectors < nr + 1);
521cf7087dbSKim Phillips 
522df92ee44SEric Auger     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
523cf7087dbSKim Phillips 
524cf7087dbSKim Phillips     vector = &vdev->msi_vectors[nr];
525cf7087dbSKim Phillips 
526cf7087dbSKim Phillips     if (!vector->use) {
527cf7087dbSKim Phillips         vector->vdev = vdev;
528cf7087dbSKim Phillips         vector->virq = -1;
529cf7087dbSKim Phillips         if (event_notifier_init(&vector->interrupt, 0)) {
530cf7087dbSKim Phillips             error_report("vfio: Error: event_notifier_init failed");
531cf7087dbSKim Phillips         }
532cf7087dbSKim Phillips         vector->use = true;
533cf7087dbSKim Phillips         msix_vector_use(pdev, nr);
534cf7087dbSKim Phillips     }
535cf7087dbSKim Phillips 
536cf7087dbSKim Phillips     qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
537cf7087dbSKim Phillips                         handler, NULL, vector);
538cf7087dbSKim Phillips 
539cf7087dbSKim Phillips     /*
540cf7087dbSKim Phillips      * Attempt to enable route through KVM irqchip,
541cf7087dbSKim Phillips      * default to userspace handling if unavailable.
542cf7087dbSKim Phillips      */
543cf7087dbSKim Phillips     if (vector->virq >= 0) {
544cf7087dbSKim Phillips         if (!msg) {
545cf7087dbSKim Phillips             vfio_remove_kvm_msi_virq(vector);
546cf7087dbSKim Phillips         } else {
547dc9f06caSPavel Fedin             vfio_update_kvm_msi_virq(vector, *msg, pdev);
548cf7087dbSKim Phillips         }
549cf7087dbSKim Phillips     } else {
5506d17a018SDavid Gibson         if (msg) {
551dc580d51SLongpeng(Mike)             if (vdev->defer_kvm_irq_routing) {
552d1f6af6aSPeter Xu                 vfio_add_kvm_msi_virq(vdev, vector, nr, true);
553dc580d51SLongpeng(Mike)             } else {
554dc580d51SLongpeng(Mike)                 vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
555dc580d51SLongpeng(Mike)                 vfio_add_kvm_msi_virq(vdev, vector, nr, true);
556dc580d51SLongpeng(Mike)                 kvm_irqchip_commit_route_changes(&vfio_route_change);
557dc580d51SLongpeng(Mike)                 vfio_connect_kvm_msi_virq(vector);
558dc580d51SLongpeng(Mike)             }
559cf7087dbSKim Phillips         }
5606d17a018SDavid Gibson     }
561cf7087dbSKim Phillips 
562cf7087dbSKim Phillips     /*
563d9e6710dSJing Liu      * When dynamic allocation is not supported, we don't want to have the
564d9e6710dSJing Liu      * host allocate all possible MSI vectors for a device if they're not
565d9e6710dSJing Liu      * in use, so we shutdown and incrementally increase them as needed.
566d9e6710dSJing Liu      * nr_vectors represents the total number of vectors allocated.
567d9e6710dSJing Liu      *
568d9e6710dSJing Liu      * When dynamic allocation is supported, let the host only allocate
569d9e6710dSJing Liu      * and enable a vector when it is in use in guest. nr_vectors represents
570d9e6710dSJing Liu      * the upper bound of vectors being enabled (but not all of the ranges
571d9e6710dSJing Liu      * is allocated or enabled).
572cf7087dbSKim Phillips      */
573d9e6710dSJing Liu     if (resizing) {
574cf7087dbSKim Phillips         vdev->nr_vectors = nr + 1;
575d9e6710dSJing Liu     }
576d9e6710dSJing Liu 
577dc580d51SLongpeng(Mike)     if (!vdev->defer_kvm_irq_routing) {
578d9e6710dSJing Liu         if (vdev->msix->noresize && resizing) {
579dc580d51SLongpeng(Mike)             vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
580cf7087dbSKim Phillips             ret = vfio_enable_vectors(vdev, true);
581cf7087dbSKim Phillips             if (ret) {
582cf7087dbSKim Phillips                 error_report("vfio: failed to enable vectors, %d", ret);
583cf7087dbSKim Phillips             }
584cf7087dbSKim Phillips         } else {
585201a7331SEric Auger             Error *err = NULL;
586201a7331SEric Auger             int32_t fd;
587cf7087dbSKim Phillips 
588cf7087dbSKim Phillips             if (vector->virq >= 0) {
589201a7331SEric Auger                 fd = event_notifier_get_fd(&vector->kvm_interrupt);
590cf7087dbSKim Phillips             } else {
591201a7331SEric Auger                 fd = event_notifier_get_fd(&vector->interrupt);
592cf7087dbSKim Phillips             }
593cf7087dbSKim Phillips 
59484e37d02SZhenzhong Duan             if (!vfio_set_irq_signaling(&vdev->vbasedev,
595201a7331SEric Auger                                         VFIO_PCI_MSIX_IRQ_INDEX, nr,
59684e37d02SZhenzhong Duan                                         VFIO_IRQ_SET_ACTION_TRIGGER, fd,
59784e37d02SZhenzhong Duan                                         &err)) {
598201a7331SEric Auger                 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
599cf7087dbSKim Phillips             }
600cf7087dbSKim Phillips         }
601d9e6710dSJing Liu     }
602cf7087dbSKim Phillips 
60395239e16SAlex Williamson     /* Disable PBA emulation when nothing more is pending. */
60495239e16SAlex Williamson     clear_bit(nr, vdev->msix->pending);
60595239e16SAlex Williamson     if (find_first_bit(vdev->msix->pending,
60695239e16SAlex Williamson                        vdev->nr_vectors) == vdev->nr_vectors) {
60795239e16SAlex Williamson         memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
60895239e16SAlex Williamson         trace_vfio_msix_pba_disable(vdev->vbasedev.name);
60995239e16SAlex Williamson     }
61095239e16SAlex Williamson 
611cf7087dbSKim Phillips     return 0;
612cf7087dbSKim Phillips }
613cf7087dbSKim Phillips 
/*
 * Vector-use callback registered with msix_set_vector_notifiers().
 *
 * Hands the by-value @msg on to vfio_msix_vector_do_use() by address and
 * selects vfio_msi_interrupt as the handler argument.  The return value of
 * vfio_msix_vector_do_use() is passed straight back to the caller.
 */
static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    MSIMessage *route = &msg;

    return vfio_msix_vector_do_use(pdev, nr, route, vfio_msi_interrupt);
}
619cf7087dbSKim Phillips 
/*
 * Vector-release callback for MSI-X vector @nr of @pdev.
 *
 * There are still old guests that mask and unmask vectors on every
 * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
 * the KVM setup in place and simply switch VFIO over to the non-bypass
 * eventfd.  The interrupt is then fired through QEMU and the MSI-X core
 * will mask it and set pending bits, allowing it to be re-asserted on
 * unmask.  A vector without a virq is already in QEMU mode, so there is
 * nothing to do for it.
 */
static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    Error *err = NULL;
    int32_t fd;

    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);

    if (vector->virq < 0) {
        return;
    }

    fd = event_notifier_get_fd(&vector->interrupt);

    if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
                                nr, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
                                &err)) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }
}
646cf7087dbSKim Phillips 
/*
 * Open a batch of KVM MSI route updates for @vdev.
 *
 * Starts a new route-change transaction in the file-level
 * vfio_route_change and flags the device as deferring its KVM irq
 * routing.  Batches do not nest: the device must not already be
 * deferring (asserted).
 */
static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
    assert(!vdev->defer_kvm_irq_routing);

    vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
    vdev->defer_kvm_irq_routing = true;
}
653dc580d51SLongpeng(Mike) 
/*
 * Close the batch opened by vfio_prepare_kvm_msi_virq_batch().
 *
 * Clears the deferral flag (which must currently be set), commits the
 * route changes accumulated in vfio_route_change, and then calls
 * vfio_connect_kvm_msi_virq() on each of the first nr_vectors entries of
 * vdev->msi_vectors.
 */
static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
    int idx = 0;

    assert(vdev->defer_kvm_irq_routing);
    vdev->defer_kvm_irq_routing = false;

    kvm_irqchip_commit_route_changes(&vfio_route_change);

    while (idx < vdev->nr_vectors) {
        vfio_connect_kvm_msi_virq(&vdev->msi_vectors[idx]);
        idx++;
    }
}
667dc580d51SLongpeng(Mike) 
/*
 * Switch @vdev into MSI-X interrupt mode.
 *
 * Any currently configured interrupt mode is torn down first, a zeroed
 * vector array sized to the MSI-X table is allocated, and the vector
 * notifiers are installed inside a KVM route batch.  Failures are only
 * reported; the function has no return value.
 */
static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    int rc;

    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Setting vector notifiers triggers synchronous vector-use
     * callbacks for each active vector.  Deferring to commit the KVM
     * routes once rather than per vector provides a substantial
     * performance improvement.
     */
    vfio_prepare_kvm_msi_virq_batch(vdev);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    vfio_commit_kvm_msi_virq_batch(vdev);

    if (!vdev->nr_vectors) {
        /*
         * Some communication channels between VF & PF or PF & fw rely on the
         * physical state of the device and expect that enabling MSI-X from the
         * guest enables the same on the host.  When our guest is Linux, the
         * guest driver call to pci_enable_msix() sets the enabling bit in the
         * MSI-X capability, but leaves the vector table masked.  We therefore
         * can't rely on a vector_use callback (from request_irq() in the guest)
         * to switch the physical device into MSI-X mode because that may come a
         * long time after pci_enable_msix().  This code sets vector 0 with an
         * invalid fd to make the physical device MSI-X enabled, but with no
         * vectors enabled, just like the guest view.
         */
        rc = vfio_enable_msix_no_vec(vdev);
        if (rc) {
            error_report("vfio: failed to enable MSI-X, %d", rc);
        }
    } else {
        /* The notifier callbacks put vectors in use: enable them in one go. */
        rc = vfio_enable_vectors(vdev, true);
        if (rc) {
            error_report("vfio: failed to enable vectors, %d", rc);
        }
    }

    trace_vfio_msix_enable(vdev->vbasedev.name);
}
719cf7087dbSKim Phillips 
/*
 * Switch @vdev into MSI interrupt mode.
 *
 * Starts from the number of vectors reported by
 * msi_nr_vectors_allocated().  When vfio_enable_vectors() returns a
 * positive value, everything is torn down and the setup is repeated with
 * that value as the new vector count.  A negative return gives up and
 * leaves interrupts disabled.
 */
static void vfio_msi_enable(VFIOPCIDevice *vdev)
{
    int ret, n;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);

    for (;;) {
        /*
         * Setting vector notifiers needs to enable route for each vector.
         * Deferring to commit the KVM routes once rather than per vector
         * provides a substantial performance improvement.
         */
        vfio_prepare_kvm_msi_virq_batch(vdev);

        vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);

        for (n = 0; n < vdev->nr_vectors; n++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[n];

            vector->vdev = vdev;
            vector->virq = -1;
            vector->use = true;

            if (event_notifier_init(&vector->interrupt, 0)) {
                error_report("vfio: Error: event_notifier_init failed");
            }

            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                vfio_msi_interrupt, NULL, vector);

            /*
             * Attempt to enable route through KVM irqchip,
             * default to userspace handling if unavailable.
             */
            vfio_add_kvm_msi_virq(vdev, vector, n, false);
        }

        vfio_commit_kvm_msi_virq_batch(vdev);

        /* Set interrupt type prior to possible interrupts */
        vdev->interrupt = VFIO_INT_MSI;

        ret = vfio_enable_vectors(vdev, false);
        if (!ret) {
            break;
        }

        /* Report before tearing down so that %m still sees the errno. */
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        vfio_msi_disable_common(vdev);

        if (ret < 0) {
            /*
             * Failing to setup MSI doesn't really fall within any
             * specification.  Let's try leaving interrupts disabled and hope
             * the guest figures out to fall back to INTx for this device.
             */
            error_report("vfio: Error: Failed to enable MSI");
            return;
        }

        /* Positive return: go around again with that many vectors. */
        vdev->nr_vectors = ret;
    }

    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
}
791cf7087dbSKim Phillips 
/*
 * Teardown shared by the MSI and MSI-X disable paths.
 *
 * For every vector marked in use: drop its KVM virq (if it has one),
 * unregister the fd handler of its interrupt notifier and clean the
 * notifier up.  Afterwards the vector array is freed and the device is
 * left with no vectors and interrupt mode VFIO_INT_NONE.
 */
static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
{
    int n;

    for (n = 0; n < vdev->nr_vectors; n++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[n];

        if (!vector->use) {
            continue;
        }

        if (vector->virq >= 0) {
            vfio_remove_kvm_msi_virq(vector);
        }

        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            NULL, NULL, NULL);
        event_notifier_cleanup(&vector->interrupt);
    }

    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;
}
813cf7087dbSKim Phillips 
/*
 * Leave MSI-X mode on @vdev and try to go back to INTx.
 *
 * Removes the vector notifiers, releases any vector still in use,
 * disables the MSI-X IRQ index, runs the common MSI teardown, attempts to
 * re-enable INTx (reporting on failure) and finally clears the whole
 * pending bitmap.
 */
static void vfio_msix_disable(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    Error *err = NULL;
    int n;

    msix_unset_vector_notifiers(pdev);

    /*
     * MSI-X will only release vectors if MSI-X is still enabled on the
     * device, check through the rest and release it ourselves if necessary.
     */
    for (n = 0; n < vdev->nr_vectors; n++) {
        if (!vdev->msi_vectors[n].use) {
            continue;
        }
        vfio_msix_vector_release(pdev, n);
        msix_vector_unuse(pdev, n);
    }

    /*
     * Always clear MSI-X IRQ index. A PF device could have enabled
     * MSI-X with no vectors. See vfio_msix_enable().
     */
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);

    vfio_msi_disable_common(vdev);

    if (!vfio_intx_enable(vdev, &err)) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    memset(vdev->msix->pending, 0,
           BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));

    trace_vfio_msix_disable(vdev->vbasedev.name);
}
848cf7087dbSKim Phillips 
/*
 * Leave MSI mode on @vdev and try to go back to INTx.
 *
 * Disables the MSI IRQ index, runs the common MSI teardown and attempts
 * to re-enable INTx.  A failure to re-enable INTx is reported but not
 * propagated; the function has no return value.
 */
static void vfio_msi_disable(VFIOPCIDevice *vdev)
{
    Error *err = NULL;

    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_msi_disable_common(vdev);

    /*
     * Decide success from the boolean return value rather than by
     * inspecting @err afterwards, the same way vfio_msix_disable() does.
     */
    if (!vfio_intx_enable(vdev, &err)) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    trace_vfio_msi_disable(vdev->vbasedev.name);
}
862cf7087dbSKim Phillips 
/*
 * Refresh the KVM routes of the MSI vectors of @vdev.
 *
 * Each vector that is in use and has a virq gets the message currently
 * returned by msi_get_message() pushed to it through
 * vfio_update_kvm_msi_virq().  All other vectors are left untouched.
 */
static void vfio_update_msi(VFIOPCIDevice *vdev)
{
    int n;

    for (n = 0; n < vdev->nr_vectors; n++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[n];

        if (vector->use && vector->virq >= 0) {
            MSIMessage msg = msi_get_message(&vdev->pdev, n);

            vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
        }
    }
}
879cf7087dbSKim Phillips 
/*
 * Read the option ROM of @vdev from the device fd into vdev->rom.
 *
 * The ROM region's size and offset are taken from vfio_get_region_info()
 * and recorded in vdev->rom_size / vdev->rom_offset.  A zero-sized region
 * sets vdev->rom_read_failed and returns without allocating.  Otherwise
 * the buffer is allocated, pre-filled with 0xff and read with pread();
 * a short or failed read leaves the remaining bytes at 0xff.
 *
 * Afterwards the image is checked for a matching vendor ID with a
 * mismatching device ID; in that case the device ID in the image is
 * replaced and byte 6 is rewritten so the byte-sum of the image is zero.
 */
static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
{
    g_autofree struct vfio_region_info *reg_info = NULL;
    uint64_t size;
    off_t off = 0;
    ssize_t bytes;
    uint8_t *rom;

    if (vfio_get_region_info(&vdev->vbasedev,
                             VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
        error_report("vfio: Error getting ROM info: %m");
        return;
    }

    trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
                            (unsigned long)reg_info->offset,
                            (unsigned long)reg_info->flags);

    vdev->rom_size = size = reg_info->size;
    vdev->rom_offset = reg_info->offset;

    if (!vdev->rom_size) {
        vdev->rom_read_failed = true;
        error_report("vfio-pci: Cannot read device rom at "
                    "%s", vdev->vbasedev.name);
        error_printf("Device option ROM contents are probably invalid "
                    "(check dmesg).\nSkip option ROM probe with rombar=0, "
                    "or load from file with romfile=\n");
        return;
    }

    rom = g_malloc(size);
    vdev->rom = rom;
    memset(rom, 0xff, size);

    while (size) {
        bytes = pread(vdev->vbasedev.fd, rom + off,
                      size, vdev->rom_offset + off);
        if (bytes == 0) {
            break;
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            break;
        }
    }

    /*
     * Test the ROM signature against our device, if the vendor is correct
     * but the device ID doesn't match, store the correct device ID and
     * recompute the checksum.  Intel IGD devices need this and are known
     * to have bogus checksums so we can't simply adjust the checksum.
     *
     * The header words at offsets 0 and 0x18 are only inspected once the
     * image is known to be large enough to contain them (0x18 + 2 bytes).
     */
    if (vdev->rom_size >= 0x1a && pci_get_word(rom) == 0xaa55) {
        /* Offset of the "PCIR" data structure, read once from 0x18. */
        uint16_t pcir = pci_get_word(rom + 0x18);

        if (pcir + 8 < vdev->rom_size && !memcmp(rom + pcir, "PCIR", 4)) {
            uint16_t vid = pci_get_word(rom + pcir + 4);
            uint16_t did = pci_get_word(rom + pcir + 6);

            if (vid == vdev->vendor_id && did != vdev->device_id) {
                uint8_t csum = 0;
                size_t i;

                pci_set_word(rom + pcir + 6, vdev->device_id);

                /* Zero the adjustment byte, sum the image, then fix it up. */
                rom[6] = 0;
                for (i = 0; i < vdev->rom_size; i++) {
                    csum += rom[i];
                }
                rom[6] = -csum;
            }
        }
    }
}
960cf7087dbSKim Phillips 
/*
 * MemoryRegionOps read handler for the option ROM BAR.
 *
 * @opaque: the VFIOPCIDevice the region was registered with
 * @addr:   byte offset into the ROM
 * @size:   access width in bytes; 1, 2 and 4 are handled, anything else
 *          ends in hw_error()
 *
 * Returns the little-endian value found at @addr.  Bytes that are not
 * backed by ROM data (no ROM loaded, or the access reaches past
 * vdev->rom_size) read as 0xff, matching the fill pattern
 * vfio_pci_load_rom() uses for unread ROM bytes.
 */
static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOPCIDevice *vdev = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } val;
    uint64_t data = 0;

    /* Load the ROM lazily when the guest tries to read it */
    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
        vfio_pci_load_rom(vdev);
    }

    /*
     * Start from a defined pattern so that a missing ROM or an access at or
     * beyond the end never hands indeterminate stack bytes to the guest.
     */
    memset(&val, 0xff, sizeof(val));

    /* Copy only what the ROM buffer actually backs; skip if there is none. */
    if (vdev->rom && addr < vdev->rom_size) {
        memcpy(&val, vdev->rom + addr, MIN(size, vdev->rom_size - addr));
    }

    switch (size) {
    case 1:
        data = val.byte;
        break;
    case 2:
        data = le16_to_cpu(val.word);
        break;
    case 4:
        data = le32_to_cpu(val.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);

    return data;
}
999cf7087dbSKim Phillips 
/*
 * MemoryRegionOps write handler for the option ROM BAR.
 *
 * Deliberately empty: all arguments are ignored and the write is
 * discarded.
 */
static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    /* nothing to do */
}
1004cf7087dbSKim Phillips 
1005cf7087dbSKim Phillips static const MemoryRegionOps vfio_rom_ops = {
1006cf7087dbSKim Phillips     .read = vfio_rom_read,
1007cf7087dbSKim Phillips     .write = vfio_rom_write,
1008cf7087dbSKim Phillips     .endianness = DEVICE_LITTLE_ENDIAN,
1009cf7087dbSKim Phillips };
1010cf7087dbSKim Phillips 
/*
 * Size the option ROM BAR of @vdev and register an I/O memory region for it.
 *
 * Nothing is registered when a romfile is set or rom_bar is zero, when the
 * config-space accesses fail, when the computed size is zero, or when the
 * device is on the option ROM denylist and the "rombar" option was not
 * given.  On success vdev->rom_read_failed is reset to false.
 */
static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
{
    uint32_t saved, probe = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    uint32_t rom_bytes;
    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    DeviceState *dev = DEVICE(vdev);
    char *mr_name;
    int fd = vdev->vbasedev.fd;

    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
        /* Since pci handles romfile, just print a message and return */
        if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) {
            warn_report("Device at %s is known to cause system instability"
                        " issues during option rom execution",
                        vdev->vbasedev.name);
            error_printf("Proceeding anyway since user specified romfile\n");
        }
        return;
    }

    /*
     * Use the same size ROM BAR as the physical device.  The contents
     * will get filled in later when the guest tries to read it.
     *
     * Sequence: save the register, write the address mask, read back what
     * the device accepted, then restore the saved value.
     */
    if (pread(fd, &saved, 4, offset) != 4 ||
        pwrite(fd, &probe, 4, offset) != 4 ||
        pread(fd, &probe, 4, offset) != 4 ||
        pwrite(fd, &saved, 4, offset) != 4) {
        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
        return;
    }

    rom_bytes = ~(le32_to_cpu(probe) & PCI_ROM_ADDRESS_MASK) + 1;

    if (!rom_bytes) {
        return;
    }

    if (vfio_opt_rom_in_denylist(vdev)) {
        bool rombar_given = dev->opts && qdict_haskey(dev->opts, "rombar");

        if (!rombar_given) {
            warn_report("Rom loading for device at %s has been disabled"
                        " due to system instability issues",
                        vdev->vbasedev.name);
            error_printf("Specify rombar=1 or romfile to force\n");
            return;
        }

        warn_report("Device at %s is known to cause system instability"
                    " issues during option rom execution",
                    vdev->vbasedev.name);
        error_printf("Proceeding anyway since user specified"
                     " non zero value for rombar\n");
    }

    trace_vfio_pci_size_rom(vdev->vbasedev.name, rom_bytes);

    mr_name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);

    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
                          &vfio_rom_ops, vdev, mr_name, rom_bytes);
    g_free(mr_name);

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);

    vdev->rom_read_failed = false;
}
1077cf7087dbSKim Phillips 
/*
 * Write handler for a VFIO VGA region.
 *
 * @opaque: the VFIOVGARegion being accessed
 * @addr:   offset within that region
 * @data:   value to write
 * @size:   access width in bytes; 1, 2 and 4 are handled, anything else
 *          ends in hw_error()
 *
 * The value is converted to little-endian and written with pwrite() to
 * vga->fd at fd_offset + region offset + @addr.  A short or failed write
 * is reported; the access is traced either way.
 */
void vfio_vga_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    off_t offset = vga->fd_offset + region->offset + addr;

    if (size == 1) {
        buf.byte = data;
    } else if (size == 2) {
        buf.word = cpu_to_le16(data);
    } else if (size == 4) {
        buf.dword = cpu_to_le32(data);
    } else {
        hw_error("vfio: unsupported write size, %d bytes", size);
    }

    if (pwrite(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, region->offset + addr, data, size);
    }

    trace_vfio_vga_write(region->offset + addr, data, size);
}
1113cf7087dbSKim Phillips 
/*
 * MemoryRegionOps read callback for a VFIO VGA range.
 *
 * @opaque: the VFIOVGARegion that is being accessed
 * @addr:   offset of the access within that range
 * @size:   access width in bytes; 1, 2 and 4 are handled, anything else is
 *          reported through hw_error()
 *
 * Reads @size bytes from the VGA file descriptor at
 * fd_offset + range offset + @addr and converts them from little-endian.
 * Returns the value read, or all-ones if the read failed or was short.
 */
uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    off_t offset = vga->fd_offset + region->offset + addr;
    uint64_t data = 0;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } raw;

    if (pread(vga->fd, &raw, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, region->offset + addr, size);
        return (uint64_t)-1;
    }

    if (size == 1) {
        data = raw.byte;
    } else if (size == 2) {
        data = le16_to_cpu(raw.word);
    } else if (size == 4) {
        data = le32_to_cpu(raw.dword);
    } else {
        hw_error("vfio: unsupported read size, %d bytes", size);
    }

    trace_vfio_vga_read(region->offset + addr, size, data);

    return data;
}
1152cf7087dbSKim Phillips 
1153cf7087dbSKim Phillips static const MemoryRegionOps vfio_vga_ops = {
1154cf7087dbSKim Phillips     .read = vfio_vga_read,
1155cf7087dbSKim Phillips     .write = vfio_vga_write,
1156cf7087dbSKim Phillips     .endianness = DEVICE_LITTLE_ENDIAN,
1157cf7087dbSKim Phillips };
1158cf7087dbSKim Phillips 
1159cf7087dbSKim Phillips /*
116095251725SYongji Xie  * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page
116195251725SYongji Xie  * size if the BAR is in an exclusive page in host so that we could map
116295251725SYongji Xie  * this BAR to guest. But this sub-page BAR may not occupy an exclusive
116395251725SYongji Xie  * page in guest. So we should set the priority of the expanded memory
116495251725SYongji Xie  * region to zero in case of overlap with BARs which share the same page
116595251725SYongji Xie  * with the sub-page BAR in guest. Besides, we should also recover the
116695251725SYongji Xie  * size of this sub-page BAR when its base address is changed in guest
116795251725SYongji Xie  * and not page aligned any more.
116895251725SYongji Xie  */
vfio_sub_page_bar_update_mapping(PCIDevice * pdev,int bar)116995251725SYongji Xie static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
117095251725SYongji Xie {
117101b46064SEduardo Habkost     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
117295251725SYongji Xie     VFIORegion *region = &vdev->bars[bar].region;
11733a286732SAlex Williamson     MemoryRegion *mmap_mr, *region_mr, *base_mr;
117495251725SYongji Xie     PCIIORegion *r;
117595251725SYongji Xie     pcibus_t bar_addr;
117695251725SYongji Xie     uint64_t size = region->size;
117795251725SYongji Xie 
117895251725SYongji Xie     /* Make sure that the whole region is allowed to be mmapped */
117995251725SYongji Xie     if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
118095251725SYongji Xie         region->mmaps[0].size != region->size) {
118195251725SYongji Xie         return;
118295251725SYongji Xie     }
118395251725SYongji Xie 
118495251725SYongji Xie     r = &pdev->io_regions[bar];
118595251725SYongji Xie     bar_addr = r->addr;
11863a286732SAlex Williamson     base_mr = vdev->bars[bar].mr;
11873a286732SAlex Williamson     region_mr = region->mem;
118895251725SYongji Xie     mmap_mr = &region->mmaps[0].mem;
118995251725SYongji Xie 
119095251725SYongji Xie     /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
119195251725SYongji Xie     if (bar_addr != PCI_BAR_UNMAPPED &&
11928e3b0cbbSMarc-André Lureau         !(bar_addr & ~qemu_real_host_page_mask())) {
11938e3b0cbbSMarc-André Lureau         size = qemu_real_host_page_size();
119495251725SYongji Xie     }
119595251725SYongji Xie 
119695251725SYongji Xie     memory_region_transaction_begin();
119795251725SYongji Xie 
11983a286732SAlex Williamson     if (vdev->bars[bar].size < size) {
11993a286732SAlex Williamson         memory_region_set_size(base_mr, size);
12003a286732SAlex Williamson     }
12013a286732SAlex Williamson     memory_region_set_size(region_mr, size);
120295251725SYongji Xie     memory_region_set_size(mmap_mr, size);
12033a286732SAlex Williamson     if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
12043a286732SAlex Williamson         memory_region_del_subregion(r->address_space, base_mr);
120595251725SYongji Xie         memory_region_add_subregion_overlap(r->address_space,
12063a286732SAlex Williamson                                             bar_addr, base_mr, 0);
120795251725SYongji Xie     }
120895251725SYongji Xie 
120995251725SYongji Xie     memory_region_transaction_commit();
121095251725SYongji Xie }
121195251725SYongji Xie 
121295251725SYongji Xie /*
1213cf7087dbSKim Phillips  * PCI config space
1214cf7087dbSKim Phillips  */
/*
 * Read @len bytes of PCI config space at @addr.
 *
 * Bits flagged in vdev->emulated_config_bits come from QEMU's emulated
 * config space, every other bit is read from the device through the VFIO
 * file descriptor.  The two sources are merged bit by bit.
 *
 * Returns the merged value, or the negated errno converted to uint32_t if
 * the read from the device fails or is short.
 */
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;

    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
    emu_bits = le32_to_cpu(emu_bits);

    if (emu_bits) {
        emu_val = pci_default_read_config(pdev, addr, len);
    }

    /* Only touch the device when at least one accessed bit is not emulated */
    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
        ssize_t ret;

        ret = pread(vdev->vbasedev.fd, &phys_val, len,
                    vdev->config_offset + addr);
        if (ret != len) {
            /*
             * Latch errno before reporting: error_report() may itself
             * change errno, and the value is still needed for the return.
             */
            int saved_errno = errno;

            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
                         __func__, vdev->vbasedev.name, addr, len);
            return -saved_errno;
        }
        phys_val = le32_to_cpu(phys_val);
    }

    val = (emu_val & emu_bits) | (phys_val & ~emu_bits);

    trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);

    return val;
}
1246cf7087dbSKim Phillips 
/*
 * Write @len bytes of @val to PCI config space at @addr.
 *
 * The write is always forwarded to the device through VFIO first, then
 * applied to QEMU's emulated config space.  Depending on what the access
 * overlaps, follow-up work is done:
 *  - MSI capability:   enable, disable or update MSI
 *  - MSI-X capability: enable or disable MSI-X
 *  - BARs / command:   refresh the mapping of sub-page BARs that moved
 */
void vfio_pci_write_config(PCIDevice *pdev,
                           uint32_t addr, uint32_t val, int len)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    uint32_t val_le = cpu_to_le32(val);

    trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);

    /* Hand the whole write to VFIO and let it drop what is not writable */
    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
                != len) {
        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
                     __func__, vdev->vbasedev.name, addr, val, len);
    }

    /* MSI capability touched: follow the enable bit */
    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        bool before = msi_enabled(pdev);
        bool after;

        pci_default_write_config(pdev, addr, val, len);
        after = msi_enabled(pdev);

        if (before && after) {
            vfio_update_msi(vdev);
        } else if (before) {
            vfio_msi_disable(vdev);
        } else if (after) {
            vfio_msi_enable(vdev);
        }
        return;
    }

    /* MSI-X capability touched: only enable state transitions matter */
    if ((pdev->cap_present & QEMU_PCI_CAP_MSIX) &&
        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        bool before = msix_enabled(pdev);
        bool after;

        pci_default_write_config(pdev, addr, val, len);
        after = msix_enabled(pdev);

        if (after && !before) {
            vfio_msix_enable(vdev);
        } else if (before && !after) {
            vfio_msix_disable(vdev);
        }
        return;
    }

    /* BAR or command register touched: sub-page BARs may have moved */
    if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
        range_covers_byte(addr, len, PCI_COMMAND)) {
        pcibus_t prev_addr[PCI_NUM_REGIONS - 1];
        int i;

        for (i = 0; i < PCI_ROM_SLOT; i++) {
            prev_addr[i] = pdev->io_regions[i].addr;
        }

        pci_default_write_config(pdev, addr, val, len);

        for (i = 0; i < PCI_ROM_SLOT; i++) {
            if (prev_addr[i] == pdev->io_regions[i].addr) {
                continue;
            }
            if (vdev->bars[i].region.size > 0 &&
                vdev->bars[i].region.size < qemu_real_host_page_size()) {
                vfio_sub_page_bar_update_mapping(pdev, i);
            }
        }
        return;
    }

    /* Anything else: mirror the write into QEMU's emulated config space */
    pci_default_write_config(pdev, addr, val, len);
}
1318cf7087dbSKim Phillips 
1319cf7087dbSKim Phillips /*
1320cf7087dbSKim Phillips  * Interrupt setup
1321cf7087dbSKim Phillips  */
/*
 * Tear down whichever interrupt mode the device is currently using.
 *
 * This is a two-step process: disabling MSI or MSI-X moves the device back
 * to INTx mode (if supported), so vdev->interrupt has to be looked at again
 * afterwards to clean up INTx as well.
 */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
{
    switch (vdev->interrupt) {
    case VFIO_INT_MSIX:
        vfio_msix_disable(vdev);
        break;
    case VFIO_INT_MSI:
        vfio_msi_disable(vdev);
        break;
    default:
        break;
    }

    /* Re-read the mode: the calls above may have switched it to INTx */
    if (vdev->interrupt == VFIO_INT_INTx) {
        vfio_intx_disable(vdev);
    }
}
1339cf7087dbSKim Phillips 
vfio_msi_setup(VFIOPCIDevice * vdev,int pos,Error ** errp)1340b771a40fSZhenzhong Duan static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1341cf7087dbSKim Phillips {
1342cf7087dbSKim Phillips     uint16_t ctrl;
1343cf7087dbSKim Phillips     bool msi_64bit, msi_maskbit;
1344cf7087dbSKim Phillips     int ret, entries;
13451108b2f8SCao jin     Error *err = NULL;
1346cf7087dbSKim Phillips 
13475546a621SEric Auger     if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
1348cf7087dbSKim Phillips               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
13497ef165b9SEric Auger         error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
1350b771a40fSZhenzhong Duan         return false;
1351cf7087dbSKim Phillips     }
1352cf7087dbSKim Phillips     ctrl = le16_to_cpu(ctrl);
1353cf7087dbSKim Phillips 
1354cf7087dbSKim Phillips     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1355cf7087dbSKim Phillips     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1356cf7087dbSKim Phillips     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1357cf7087dbSKim Phillips 
13580de70dc7SAlex Williamson     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1359cf7087dbSKim Phillips 
13601108b2f8SCao jin     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
1361cf7087dbSKim Phillips     if (ret < 0) {
1362cf7087dbSKim Phillips         if (ret == -ENOTSUP) {
1363b771a40fSZhenzhong Duan             return true;
1364cf7087dbSKim Phillips         }
13654b576648SMarkus Armbruster         error_propagate_prepend(errp, err, "msi_init failed: ");
1366b771a40fSZhenzhong Duan         return false;
1367cf7087dbSKim Phillips     }
1368cf7087dbSKim Phillips     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
1369cf7087dbSKim Phillips 
1370b771a40fSZhenzhong Duan     return true;
1371cf7087dbSKim Phillips }
1372cf7087dbSKim Phillips 
/*
 * Carve the host pages holding the MSI-X table out of the mmap list of the
 * BAR that contains the table.
 *
 * Nothing is changed when the host driver reports the MSI-X area as
 * mappable, or when the BAR does not currently have exactly one mmap
 * covering it from offset 0.  Otherwise the single mmap is dropped, trimmed
 * at one end, or split in two, depending on where the page aligned table
 * range lies within the BAR.
 */
static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
{
    VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
    off_t start, end;
    bool at_head, to_tail;

    /*
     * If the host driver allows the MSI-X data to be mapped, the entire
     * BAR is mapped and the MSI-X table is emulated on top of it.
     */
    if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
                            VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
        return;
    }

    /*
     * Anything other than a single mmap covering the whole BAR means it is
     * either unsupported or has been set up already.
     */
    if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
        region->size != region->mmaps[0].size) {
        return;
    }

    /* Host page aligned start and end of the MSI-X table */
    start = vdev->msix->table_offset & qemu_real_host_page_mask();
    end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
                               (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

    /*
     * NB - Host page size is necessarily a power of two and so is the PCI
     * BAR (not counting EA yet), therefore with host page aligned @start
     * and @end, any remainder of the BAR before or after those must be at
     * least host page sized and therefore mmap'able.
     */
    at_head = !start;
    to_tail = end >= region->size;

    /* The table pages cover the whole BAR: nothing left to mmap */
    if (at_head && to_tail) {
        region->nr_mmaps = 0;
        g_free(region->mmaps);
        region->mmaps = NULL;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, 0, 0);
        return;
    }

    /* The table sits at the beginning: mmap only what follows it */
    if (at_head) {
        region->mmaps[0].offset = end;
        region->mmaps[0].size = region->size - end;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);
        return;
    }

    /* The table runs up to the end: mmap only what precedes it */
    if (to_tail) {
        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);
        return;
    }

    /* The table is in the middle: one mmap before it and one after it */
    region->nr_mmaps = 2;
    region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
    region->mmaps[1] = region->mmaps[0];

    region->mmaps[0].size = start;
    trace_vfio_msix_fixup(vdev->vbasedev.name,
                          vdev->msix->table_bar, region->mmaps[0].offset,
                          region->mmaps[0].offset + region->mmaps[0].size);

    region->mmaps[1].offset = end;
    region->mmaps[1].size = region->size - end;
    trace_vfio_msix_fixup(vdev->vbasedev.name,
                          vdev->msix->table_bar, region->mmaps[1].offset,
                          region->mmaps[1].offset + region->mmaps[1].size);
}
1449db0da029SAlex Williamson 
vfio_pci_relocate_msix(VFIOPCIDevice * vdev,Error ** errp)1450713b59a6SZhenzhong Duan static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
145189d5202eSAlex Williamson {
145289d5202eSAlex Williamson     int target_bar = -1;
145389d5202eSAlex Williamson     size_t msix_sz;
145489d5202eSAlex Williamson 
145555872c70SMarkus Armbruster     if (!vdev->msix || vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1456713b59a6SZhenzhong Duan         return true;
145789d5202eSAlex Williamson     }
145889d5202eSAlex Williamson 
145989d5202eSAlex Williamson     /* The actual minimum size of MSI-X structures */
146089d5202eSAlex Williamson     msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
146189d5202eSAlex Williamson               (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
146289d5202eSAlex Williamson     /* Round up to host pages, we don't want to share a page */
146389d5202eSAlex Williamson     msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
146489d5202eSAlex Williamson     /* PCI BARs must be a power of 2 */
146589d5202eSAlex Williamson     msix_sz = pow2ceil(msix_sz);
146689d5202eSAlex Williamson 
146755872c70SMarkus Armbruster     if (vdev->msix_relo == OFF_AUTO_PCIBAR_AUTO) {
146889d5202eSAlex Williamson         /*
146989d5202eSAlex Williamson          * TODO: Lookup table for known devices.
147089d5202eSAlex Williamson          *
147189d5202eSAlex Williamson          * Logically we might use an algorithm here to select the BAR adding
1472631ba5a1SCai Huoqing          * the least additional MMIO space, but we cannot programmatically
147389d5202eSAlex Williamson          * predict the driver dependency on BAR ordering or sizing, therefore
147489d5202eSAlex Williamson          * 'auto' becomes a lookup for combinations reported to work.
147589d5202eSAlex Williamson          */
147689d5202eSAlex Williamson         if (target_bar < 0) {
147789d5202eSAlex Williamson             error_setg(errp, "No automatic MSI-X relocation available for "
147889d5202eSAlex Williamson                        "device %04x:%04x", vdev->vendor_id, vdev->device_id);
1479713b59a6SZhenzhong Duan             return false;
148089d5202eSAlex Williamson         }
148189d5202eSAlex Williamson     } else {
148255872c70SMarkus Armbruster         target_bar = (int)(vdev->msix_relo - OFF_AUTO_PCIBAR_BAR0);
148389d5202eSAlex Williamson     }
148489d5202eSAlex Williamson 
148589d5202eSAlex Williamson     /* I/O port BARs cannot host MSI-X structures */
148689d5202eSAlex Williamson     if (vdev->bars[target_bar].ioport) {
148789d5202eSAlex Williamson         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
148889d5202eSAlex Williamson                    "I/O port BAR", target_bar);
1489713b59a6SZhenzhong Duan         return false;
149089d5202eSAlex Williamson     }
149189d5202eSAlex Williamson 
149289d5202eSAlex Williamson     /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
149389d5202eSAlex Williamson     if (!vdev->bars[target_bar].size &&
149489d5202eSAlex Williamson          target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
149589d5202eSAlex Williamson         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
149689d5202eSAlex Williamson                    "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
1497713b59a6SZhenzhong Duan         return false;
149889d5202eSAlex Williamson     }
149989d5202eSAlex Williamson 
150089d5202eSAlex Williamson     /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
1501e0255bb1SPhilippe Mathieu-Daudé     if (vdev->bars[target_bar].size > 1 * GiB &&
150289d5202eSAlex Williamson         !vdev->bars[target_bar].mem64) {
150389d5202eSAlex Williamson         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
150489d5202eSAlex Williamson                    "no space to extend 32-bit BAR", target_bar);
1505713b59a6SZhenzhong Duan         return false;
150689d5202eSAlex Williamson     }
150789d5202eSAlex Williamson 
150889d5202eSAlex Williamson     /*
150989d5202eSAlex Williamson      * If adding a new BAR, test if we can make it 64bit.  We make it
151089d5202eSAlex Williamson      * prefetchable since QEMU MSI-X emulation has no read side effects
151189d5202eSAlex Williamson      * and doing so makes mapping more flexible.
151289d5202eSAlex Williamson      */
151389d5202eSAlex Williamson     if (!vdev->bars[target_bar].size) {
151489d5202eSAlex Williamson         if (target_bar < (PCI_ROM_SLOT - 1) &&
151589d5202eSAlex Williamson             !vdev->bars[target_bar + 1].size) {
151689d5202eSAlex Williamson             vdev->bars[target_bar].mem64 = true;
151789d5202eSAlex Williamson             vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
151889d5202eSAlex Williamson         }
151989d5202eSAlex Williamson         vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
152089d5202eSAlex Williamson         vdev->bars[target_bar].size = msix_sz;
152189d5202eSAlex Williamson         vdev->msix->table_offset = 0;
152289d5202eSAlex Williamson     } else {
152389d5202eSAlex Williamson         vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
152489d5202eSAlex Williamson                                           msix_sz * 2);
152589d5202eSAlex Williamson         /*
152689d5202eSAlex Williamson          * Due to above size calc, MSI-X always starts halfway into the BAR,
152789d5202eSAlex Williamson          * which will always be a separate host page.
152889d5202eSAlex Williamson          */
152989d5202eSAlex Williamson         vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
153089d5202eSAlex Williamson     }
153189d5202eSAlex Williamson 
153289d5202eSAlex Williamson     vdev->msix->table_bar = target_bar;
153389d5202eSAlex Williamson     vdev->msix->pba_bar = target_bar;
153489d5202eSAlex Williamson     /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
153589d5202eSAlex Williamson     vdev->msix->pba_offset = vdev->msix->table_offset +
153689d5202eSAlex Williamson                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
153789d5202eSAlex Williamson 
153889d5202eSAlex Williamson     trace_vfio_msix_relo(vdev->vbasedev.name,
153989d5202eSAlex Williamson                          vdev->msix->table_bar, vdev->msix->table_offset);
1540713b59a6SZhenzhong Duan     return true;
154189d5202eSAlex Williamson }
154289d5202eSAlex Williamson 
1543cf7087dbSKim Phillips /*
1544cf7087dbSKim Phillips  * We don't have any control over how pci_add_capability() inserts
1545cf7087dbSKim Phillips  * capabilities into the chain.  In order to setup MSI-X we need a
1546cf7087dbSKim Phillips  * MemoryRegion for the BAR.  In order to setup the BAR and not
1547cf7087dbSKim Phillips  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1548cf7087dbSKim Phillips  * need to first look for where the MSI-X table lives.  So we
1549cf7087dbSKim Phillips  * unfortunately split MSI-X setup across two functions.
1550cf7087dbSKim Phillips  */
vfio_msix_early_setup(VFIOPCIDevice * vdev,Error ** errp)1551713b59a6SZhenzhong Duan static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
1552cf7087dbSKim Phillips {
1553cf7087dbSKim Phillips     uint8_t pos;
1554cf7087dbSKim Phillips     uint16_t ctrl;
1555cf7087dbSKim Phillips     uint32_t table, pba;
155645d85f62SJing Liu     int ret, fd = vdev->vbasedev.fd;
155745d85f62SJing Liu     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
155845d85f62SJing Liu                                       .index = VFIO_PCI_MSIX_IRQ_INDEX };
1559b5bd049fSAlex Williamson     VFIOMSIXInfo *msix;
1560cf7087dbSKim Phillips 
1561cf7087dbSKim Phillips     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
1562cf7087dbSKim Phillips     if (!pos) {
1563713b59a6SZhenzhong Duan         return true;
1564cf7087dbSKim Phillips     }
1565cf7087dbSKim Phillips 
15665546a621SEric Auger     if (pread(fd, &ctrl, sizeof(ctrl),
1567b58b17f7SWei Yang               vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
1568008d0e2dSEric Auger         error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
1569713b59a6SZhenzhong Duan         return false;
1570cf7087dbSKim Phillips     }
1571cf7087dbSKim Phillips 
15725546a621SEric Auger     if (pread(fd, &table, sizeof(table),
1573cf7087dbSKim Phillips               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
1574008d0e2dSEric Auger         error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
1575713b59a6SZhenzhong Duan         return false;
1576cf7087dbSKim Phillips     }
1577cf7087dbSKim Phillips 
15785546a621SEric Auger     if (pread(fd, &pba, sizeof(pba),
1579cf7087dbSKim Phillips               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
1580008d0e2dSEric Auger         error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
1581713b59a6SZhenzhong Duan         return false;
1582cf7087dbSKim Phillips     }
1583cf7087dbSKim Phillips 
1584cf7087dbSKim Phillips     ctrl = le16_to_cpu(ctrl);
1585cf7087dbSKim Phillips     table = le32_to_cpu(table);
1586cf7087dbSKim Phillips     pba = le32_to_cpu(pba);
1587cf7087dbSKim Phillips 
1588b5bd049fSAlex Williamson     msix = g_malloc0(sizeof(*msix));
1589b5bd049fSAlex Williamson     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
1590b5bd049fSAlex Williamson     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
1591b5bd049fSAlex Williamson     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
1592b5bd049fSAlex Williamson     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
1593b5bd049fSAlex Williamson     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
1594cf7087dbSKim Phillips 
159545d85f62SJing Liu     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
159645d85f62SJing Liu     if (ret < 0) {
159745d85f62SJing Liu         error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
159845d85f62SJing Liu         g_free(msix);
1599713b59a6SZhenzhong Duan         return false;
160045d85f62SJing Liu     }
160145d85f62SJing Liu 
160245d85f62SJing Liu     msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
160345d85f62SJing Liu 
160443302969SGabriel Laupre     /*
160543302969SGabriel Laupre      * Test the size of the pba_offset variable and catch if it extends outside
160643302969SGabriel Laupre      * of the specified BAR. If it is the case, we need to apply a hardware
160743302969SGabriel Laupre      * specific quirk if the device is known or we have a broken configuration.
160843302969SGabriel Laupre      */
1609b5bd049fSAlex Williamson     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
161043302969SGabriel Laupre         /*
161143302969SGabriel Laupre          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
161243302969SGabriel Laupre          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
161343302969SGabriel Laupre          * the VF PBA offset while the BAR itself is only 8k. The correct value
161443302969SGabriel Laupre          * is 0x1000, so we hard code that here.
161543302969SGabriel Laupre          */
1616ff635e37SAlex Williamson         if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
1617ff635e37SAlex Williamson             (vdev->device_id & 0xff00) == 0x5800) {
1618b5bd049fSAlex Williamson             msix->pba_offset = 0x1000;
16191bd9f1b1SCai Huoqing         /*
16201bd9f1b1SCai Huoqing          * BAIDU KUNLUN Virtual Function devices for KUNLUN AI processor
16211bd9f1b1SCai Huoqing          * return an incorrect value of 0x460000 for the VF PBA offset while
16221bd9f1b1SCai Huoqing          * the BAR itself is only 0x10000.  The correct value is 0xb400.
16231bd9f1b1SCai Huoqing          */
16241bd9f1b1SCai Huoqing         } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU,
16251bd9f1b1SCai Huoqing                                PCI_DEVICE_ID_KUNLUN_VF)) {
16261bd9f1b1SCai Huoqing             msix->pba_offset = 0xb400;
162755872c70SMarkus Armbruster         } else if (vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
1628008d0e2dSEric Auger             error_setg(errp, "hardware reports invalid configuration, "
162943302969SGabriel Laupre                        "MSIX PBA outside of specified BAR");
1630b5bd049fSAlex Williamson             g_free(msix);
1631713b59a6SZhenzhong Duan             return false;
163243302969SGabriel Laupre         }
163343302969SGabriel Laupre     }
163443302969SGabriel Laupre 
16350de70dc7SAlex Williamson     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
163645d85f62SJing Liu                                 msix->table_offset, msix->entries,
163745d85f62SJing Liu                                 msix->noresize);
1638b5bd049fSAlex Williamson     vdev->msix = msix;
1639cf7087dbSKim Phillips 
1640db0da029SAlex Williamson     vfio_pci_fixup_msix_region(vdev);
164189d5202eSAlex Williamson 
1642713b59a6SZhenzhong Duan     return vfio_pci_relocate_msix(vdev, errp);
1643cf7087dbSKim Phillips }
1644cf7087dbSKim Phillips 
vfio_msix_setup(VFIOPCIDevice * vdev,int pos,Error ** errp)1645b771a40fSZhenzhong Duan static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1646cf7087dbSKim Phillips {
1647cf7087dbSKim Phillips     int ret;
1648ee640c62SCao jin     Error *err = NULL;
1649cf7087dbSKim Phillips 
1650b21e2380SMarkus Armbruster     vdev->msix->pending = g_new0(unsigned long,
1651b21e2380SMarkus Armbruster                                  BITS_TO_LONGS(vdev->msix->entries));
1652cf7087dbSKim Phillips     ret = msix_init(&vdev->pdev, vdev->msix->entries,
16533a286732SAlex Williamson                     vdev->bars[vdev->msix->table_bar].mr,
1654cf7087dbSKim Phillips                     vdev->msix->table_bar, vdev->msix->table_offset,
16553a286732SAlex Williamson                     vdev->bars[vdev->msix->pba_bar].mr,
1656ee640c62SCao jin                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
1657ee640c62SCao jin                     &err);
1658cf7087dbSKim Phillips     if (ret < 0) {
1659cf7087dbSKim Phillips         if (ret == -ENOTSUP) {
1660e1eb292aSMarkus Armbruster             warn_report_err(err);
1661b771a40fSZhenzhong Duan             return true;
1662cf7087dbSKim Phillips         }
1663ee640c62SCao jin 
1664ee640c62SCao jin         error_propagate(errp, err);
1665b771a40fSZhenzhong Duan         return false;
1666cf7087dbSKim Phillips     }
1667cf7087dbSKim Phillips 
166895239e16SAlex Williamson     /*
166995239e16SAlex Williamson      * The PCI spec suggests that devices provide additional alignment for
167095239e16SAlex Williamson      * MSI-X structures and avoid overlapping non-MSI-X related registers.
167195239e16SAlex Williamson      * For an assigned device, this hopefully means that emulation of MSI-X
167295239e16SAlex Williamson      * structures does not affect the performance of the device.  If devices
167395239e16SAlex Williamson      * fail to provide that alignment, a significant performance penalty may
167495239e16SAlex Williamson      * result, for instance Mellanox MT27500 VFs:
167595239e16SAlex Williamson      * http://www.spinics.net/lists/kvm/msg125881.html
167695239e16SAlex Williamson      *
167795239e16SAlex Williamson      * The PBA is simply not that important for such a serious regression and
167895239e16SAlex Williamson      * most drivers do not appear to look at it.  The solution for this is to
167995239e16SAlex Williamson      * disable the PBA MemoryRegion unless it's being used.  We disable it
168095239e16SAlex Williamson      * here and only enable it if a masked vector fires through QEMU.  As the
168195239e16SAlex Williamson      * vector-use notifier is called, which occurs on unmask, we test whether
168295239e16SAlex Williamson      * PBA emulation is needed and again disable if not.
168395239e16SAlex Williamson      */
168495239e16SAlex Williamson     memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
168595239e16SAlex Williamson 
1686fcad0d21SAlexey Kardashevskiy     /*
1687fcad0d21SAlexey Kardashevskiy      * The emulated machine may provide a paravirt interface for MSIX setup
1688fcad0d21SAlexey Kardashevskiy      * so it is not strictly necessary to emulate MSIX here. This becomes
1689fcad0d21SAlexey Kardashevskiy      * helpful when frequently accessed MMIO registers are located in
1690fcad0d21SAlexey Kardashevskiy      * subpages adjacent to the MSIX table but the MSIX data containing page
1691fcad0d21SAlexey Kardashevskiy      * cannot be mapped because of a host page size bigger than the MSIX table
1692fcad0d21SAlexey Kardashevskiy      * alignment.
1693fcad0d21SAlexey Kardashevskiy      */
1694fcad0d21SAlexey Kardashevskiy     if (object_property_get_bool(OBJECT(qdev_get_machine()),
1695fcad0d21SAlexey Kardashevskiy                                  "vfio-no-msix-emulation", NULL)) {
1696fcad0d21SAlexey Kardashevskiy         memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
1697fcad0d21SAlexey Kardashevskiy     }
1698fcad0d21SAlexey Kardashevskiy 
1699b771a40fSZhenzhong Duan     return true;
1700cf7087dbSKim Phillips }
1701cf7087dbSKim Phillips 
/*
 * Tear down MSI for @vdev and, when MSI-X state exists (vdev->msix is set),
 * tear down MSI-X as well and free the MSI-X pending bitmap.
 */
static void vfio_teardown_msi(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;

    msi_uninit(pdev);

    /* Nothing more to do for devices without MSI-X state. */
    if (!vdev->msix) {
        return;
    }

    msix_uninit(pdev,
                vdev->bars[vdev->msix->table_bar].mr,
                vdev->bars[vdev->msix->pba_bar].mr);
    g_free(vdev->msix->pending);
}
1713cf7087dbSKim Phillips 
1714cf7087dbSKim Phillips /*
1715cf7087dbSKim Phillips  * Resource setup
1716cf7087dbSKim Phillips  */
/*
 * Enable or disable (per @enabled) the mmaps of every BAR region of @vdev,
 * i.e. BAR slots 0 .. PCI_ROM_SLOT - 1.
 */
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
{
    int nr = 0;

    while (nr < PCI_ROM_SLOT) {
        vfio_region_mmaps_set_enabled(&vdev->bars[nr].region, enabled);
        nr++;
    }
}
1725cf7087dbSKim Phillips 
/*
 * Classify BAR @nr of @vdev ahead of registration.
 *
 * The BAR register is read from the device's config space through the vfio
 * device fd.  From it we record whether the BAR is an I/O port BAR, whether
 * it is a 64-bit memory BAR, and its type flag bits; bar->size is then set to
 * the size of the underlying vfio region.
 *
 * BARs whose region has zero size are left untouched.  If the config read
 * fails an error is reported and the BAR is left unprepared (bar->size is
 * not set).
 */
static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    uint32_t raw;
    int nread;

    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
    if (!bar->region.size) {
        return;
    }

    /* Determine what type of BAR this is for registration */
    nread = pread(vdev->vbasedev.fd, &raw, sizeof(raw),
                  vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (nread != sizeof(raw)) {
        error_report("vfio: Failed to read BAR %d (%m)", nr);
        return;
    }

    /* Config space is little-endian. */
    raw = le32_to_cpu(raw);

    bar->ioport = (raw & PCI_BASE_ADDRESS_SPACE_IO);
    if (bar->ioport) {
        bar->mem64 = 0;
        bar->type = raw & ~PCI_BASE_ADDRESS_IO_MASK;
    } else {
        bar->mem64 = (raw & PCI_BASE_ADDRESS_MEM_TYPE_64);
        bar->type = raw & ~PCI_BASE_ADDRESS_MEM_MASK;
    }

    bar->size = bar->region.size;
}
17533a286732SAlex Williamson 
/* Run vfio_bar_prepare() on every BAR slot below PCI_ROM_SLOT of @vdev. */
static void vfio_bars_prepare(VFIOPCIDevice *vdev)
{
    int nr = 0;

    while (nr < PCI_ROM_SLOT) {
        vfio_bar_prepare(vdev, nr);
        nr++;
    }
}
17623a286732SAlex Williamson 
/*
 * Create the container MemoryRegion for BAR @nr of @vdev and register it
 * with the PCI core.
 *
 * BARs with a zero bar->size are skipped.  Otherwise a "<name> base BAR <nr>"
 * I/O region of bar->size bytes is allocated; if the BAR has a backing vfio
 * region, that region is added at offset 0 of the container and mmapped.  A
 * failed mmap is only reported, since accesses can still work, just slowly.
 */
static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    MemoryRegion *container;
    char *label;

    if (!bar->size) {
        return;
    }

    container = g_new0(MemoryRegion, 1);
    bar->mr = container;

    label = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
    memory_region_init_io(container, OBJECT(vdev), NULL, NULL, label,
                          bar->size);
    g_free(label);

    if (bar->region.size) {
        memory_region_add_subregion(container, 0, bar->region.mem);

        if (vfio_region_mmap(&bar->region)) {
            error_report("Failed to mmap %s BAR %d. Performance may be slow",
                         vdev->vbasedev.name, nr);
        }
    }

    pci_register_bar(&vdev->pdev, nr, bar->type, container);
}
17883a286732SAlex Williamson 
/* Run vfio_bar_register() on every BAR slot below PCI_ROM_SLOT of @vdev. */
static void vfio_bars_register(VFIOPCIDevice *vdev)
{
    int nr = 0;

    while (nr < PCI_ROM_SLOT) {
        vfio_bar_register(vdev, nr);
        nr++;
    }
}
1797cf7087dbSKim Phillips 
/*
 * Exit-time teardown of the BARs (and VGA, if present) of @vdev.
 *
 * For each BAR slot below PCI_ROM_SLOT the BAR quirks and the vfio region are
 * exited, and the region is removed from the BAR's container MemoryRegion.
 * If the device has VGA state, VGA is unregistered from the PCI core and the
 * VGA quirks are exited.
 */
static void vfio_bars_exit(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOBAR *bar = &vdev->bars[i];

        vfio_bar_quirk_exit(vdev, i);
        vfio_region_exit(&bar->region);

        /*
         * The region is only a subregion of bar->mr if vfio_bar_register()
         * created the container.  That is skipped when bar->size is zero,
         * which happens for a BAR with a non-zero region size whose config
         * read failed in vfio_bar_prepare().  Don't hand a NULL container to
         * memory_region_del_subregion() in that case.
         */
        if (bar->mr && bar->region.size) {
            memory_region_del_subregion(bar->mr, bar->region.mem);
        }
    }

    if (vdev->vga) {
        pci_unregister_vga(&vdev->pdev);
        vfio_vga_quirk_exit(vdev);
    }
}
1817ba5e6bfaSPaolo Bonzini 
/*
 * Finalize-time cleanup of the BARs (and VGA, if present) of @vdev.
 *
 * For each BAR slot below PCI_ROM_SLOT the quirks and the vfio region are
 * finalized, and the container MemoryRegion, if one was allocated, is
 * unparented, freed and cleared.  If the device has VGA state, its quirks
 * are finalized, each VGA region is unparented, and the state is freed.
 */
static void vfio_bars_finalize(VFIOPCIDevice *vdev)
{
    int nr;

    for (nr = 0; nr < PCI_ROM_SLOT; nr++) {
        VFIOBAR *bar = &vdev->bars[nr];

        vfio_bar_quirk_finalize(vdev, nr);
        vfio_region_finalize(&bar->region);

        /* No container was ever created for this BAR. */
        if (!bar->mr) {
            continue;
        }

        assert(bar->size);
        object_unparent(OBJECT(bar->mr));
        g_free(bar->mr);
        bar->mr = NULL;
    }

    if (!vdev->vga) {
        return;
    }

    vfio_vga_quirk_finalize(vdev);
    for (nr = 0; nr < ARRAY_SIZE(vdev->vga->region); nr++) {
        object_unparent(OBJECT(&vdev->vga->region[nr].mem));
    }
    g_free(vdev->vga);
}
1843cf7087dbSKim Phillips 
1844cf7087dbSKim Phillips /*
1845cf7087dbSKim Phillips  * General setup
1846cf7087dbSKim Phillips  */
/*
 * Return the maximum size the standard capability at offset @pos can have:
 * the distance from @pos to the closest capability that starts after it in
 * @pdev's capability list, or to PCI_CONFIG_SPACE_SIZE if there is none.
 */
static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
{
    uint16_t limit = PCI_CONFIG_SPACE_SIZE;
    uint8_t cap = pdev->config[PCI_CAPABILITY_LIST];

    /* Walk the whole chain; capabilities need not be in address order. */
    while (cap) {
        if (cap > pos && cap < limit) {
            limit = cap;
        }
        cap = pdev->config[cap + PCI_CAP_LIST_NEXT];
    }

    return limit - pos;
}
1861cf7087dbSKim Phillips 
1862325ae8d5SChen Fan 
vfio_ext_cap_max_size(const uint8_t * config,uint16_t pos)1863325ae8d5SChen Fan static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
1864325ae8d5SChen Fan {
1865325ae8d5SChen Fan     uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
1866325ae8d5SChen Fan 
1867325ae8d5SChen Fan     for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
1868325ae8d5SChen Fan         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
1869325ae8d5SChen Fan         if (tmp > pos && tmp < next) {
1870325ae8d5SChen Fan             next = tmp;
1871325ae8d5SChen Fan         }
1872325ae8d5SChen Fan     }
1873325ae8d5SChen Fan 
1874325ae8d5SChen Fan     return next - pos;
1875325ae8d5SChen Fan }
1876325ae8d5SChen Fan 
/*
 * Update the 16-bit config word at @buf: clear the bits selected by @mask,
 * then OR in @val.  Note that @val is not itself masked.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t kept = pci_get_word(buf) & ~mask;

    pci_set_word(buf, kept | val);
}
1881cf7087dbSKim Phillips 
/*
 * Mark the @mask bits of the 16-bit config word at @pos as emulated.
 *
 * Three per-device byte arrays are updated at @pos through
 * vfio_set_word_bits(): the config contents receive @val, the write mask is
 * rewritten with ~@mask as the value, and the emulated-bits map gets the
 * @mask bits set.
 */
static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
                                   uint16_t val, uint16_t mask)
{
    uint8_t *cfg = vdev->pdev.config + pos;
    uint8_t *wmask = vdev->pdev.wmask + pos;
    uint8_t *emulated = vdev->emulated_config_bits + pos;

    vfio_set_word_bits(cfg, val, mask);
    vfio_set_word_bits(wmask, ~mask, mask);
    vfio_set_word_bits(emulated, mask, mask);
}
1889cf7087dbSKim Phillips 
/*
 * Update the 32-bit config dword at @buf: clear the bits selected by @mask,
 * then OR in @val.  Note that @val is not itself masked.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t kept = pci_get_long(buf) & ~mask;

    pci_set_long(buf, kept | val);
}
1894cf7087dbSKim Phillips 
/*
 * Mark the @mask bits of the 32-bit config dword at @pos as emulated.
 *
 * Three per-device byte arrays are updated at @pos through
 * vfio_set_long_bits(): the config contents receive @val, the write mask is
 * rewritten with ~@mask as the value, and the emulated-bits map gets the
 * @mask bits set.
 */
static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
                                   uint32_t val, uint32_t mask)
{
    uint8_t *cfg = vdev->pdev.config + pos;
    uint8_t *wmask = vdev->pdev.wmask + pos;
    uint8_t *emulated = vdev->emulated_config_bits + pos;

    vfio_set_long_bits(cfg, val, mask);
    vfio_set_long_bits(wmask, ~mask, mask);
    vfio_set_long_bits(emulated, mask, mask);
}
1902cf7087dbSKim Phillips 
/*
 * Advertise PCIe Atomic Ops completer support on the root port above @vdev,
 * based on what the host reports for the assigned device.
 *
 * The vfio device info is queried for the PCI atomic-completer capability and
 * its 32/64/128-bit flags are translated into the matching DEVCAP2 bits of
 * the parent root port.  When any bit is set, clear_parent_atomics_on_exit is
 * raised so the bits get removed again when the device goes away.
 */
static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev)
{
    struct vfio_device_info_cap_pci_atomic_comp *atomic_cap;
    g_autofree struct vfio_device_info *info = NULL;
    PCIBus *bus = pci_get_bus(&vdev->pdev);
    PCIDevice *parent = bus->parent_dev;
    struct vfio_info_cap_header *hdr;
    uint32_t devcap2_bits = 0;
    uint8_t *devcap2;

    /*
     * PCIe Atomic Ops completer support is only added automatically for single
     * function devices downstream of a root port supporting DEVCAP2.  Support
     * is added during realize and, if added, removed during device exit.  The
     * single function requirement avoids conflicting requirements should a
     * slot be composed of multiple devices with differing capabilities.
     */
    if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap) {
        return;
    }
    if (pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT ||
        pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2) {
        return;
    }
    if (vdev->pdev.devfn ||
        (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION)) {
        return;
    }

    devcap2 = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;

    /* Abort if there's already an Atomic Ops configuration on the root port */
    if (pci_get_long(devcap2) & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
                                 PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
                                 PCI_EXP_DEVCAP2_ATOMIC_COMP128)) {
        return;
    }

    info = vfio_get_device_info(vdev->vbasedev.fd);
    if (!info) {
        return;
    }

    hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP);
    if (!hdr) {
        return;
    }

    /* Translate the vfio capability flags into DEVCAP2 completer bits. */
    atomic_cap = (void *)hdr;
    devcap2_bits |= (atomic_cap->flags & VFIO_PCI_ATOMIC_COMP32) ?
                    PCI_EXP_DEVCAP2_ATOMIC_COMP32 : 0;
    devcap2_bits |= (atomic_cap->flags & VFIO_PCI_ATOMIC_COMP64) ?
                    PCI_EXP_DEVCAP2_ATOMIC_COMP64 : 0;
    devcap2_bits |= (atomic_cap->flags & VFIO_PCI_ATOMIC_COMP128) ?
                    PCI_EXP_DEVCAP2_ATOMIC_COMP128 : 0;

    if (!devcap2_bits) {
        return;
    }

    pci_long_test_and_set_mask(devcap2, devcap2_bits);
    vdev->clear_parent_atomics_on_exit = true;
}
1965c00aac6fSAlex Williamson 
/*
 * Clear the Atomic Ops completer bits in DEVCAP2 of the parent device of
 * @vdev's bus.  This is a no-op unless clear_parent_atomics_on_exit is set.
 */
static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev)
{
    PCIDevice *parent;
    uint8_t *devcap2;

    if (!vdev->clear_parent_atomics_on_exit) {
        return;
    }

    parent = pci_get_bus(&vdev->pdev)->parent_dev;
    devcap2 = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;

    pci_long_test_and_clear_mask(devcap2, PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
                                          PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
                                          PCI_EXP_DEVCAP2_ATOMIC_COMP128);
}
1977c00aac6fSAlex Williamson 
vfio_setup_pcie_cap(VFIOPCIDevice * vdev,int pos,uint8_t size,Error ** errp)1978b771a40fSZhenzhong Duan static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
19797ef165b9SEric Auger                                 Error **errp)
1980cf7087dbSKim Phillips {
1981cf7087dbSKim Phillips     uint16_t flags;
1982cf7087dbSKim Phillips     uint8_t type;
1983cf7087dbSKim Phillips 
1984cf7087dbSKim Phillips     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
1985cf7087dbSKim Phillips     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
1986cf7087dbSKim Phillips 
1987cf7087dbSKim Phillips     if (type != PCI_EXP_TYPE_ENDPOINT &&
1988cf7087dbSKim Phillips         type != PCI_EXP_TYPE_LEG_END &&
1989cf7087dbSKim Phillips         type != PCI_EXP_TYPE_RC_END) {
1990cf7087dbSKim Phillips 
19917ef165b9SEric Auger         error_setg(errp, "assignment of PCIe type 0x%x "
1992cf7087dbSKim Phillips                    "devices is not currently supported", type);
1993b771a40fSZhenzhong Duan         return false;
1994cf7087dbSKim Phillips     }
1995cf7087dbSKim Phillips 
1996fd56e061SDavid Gibson     if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
1997fd56e061SDavid Gibson         PCIBus *bus = pci_get_bus(&vdev->pdev);
19980282abf0SAlex Williamson         PCIDevice *bridge;
19990282abf0SAlex Williamson 
2000cf7087dbSKim Phillips         /*
20010282abf0SAlex Williamson          * Traditionally PCI device assignment exposes the PCIe capability
20020282abf0SAlex Williamson          * as-is on non-express buses.  The reason being that some drivers
20030282abf0SAlex Williamson          * simply assume that it's there, for example tg3.  However when
20040282abf0SAlex Williamson          * we're running on a native PCIe machine type, like Q35, we need
20050282abf0SAlex Williamson          * to hide the PCIe capability.  The reason for this is twofold;
20060282abf0SAlex Williamson          * first Windows guests get a Code 10 error when the PCIe capability
20070282abf0SAlex Williamson          * is exposed in this configuration.  Therefore express devices won't
20080282abf0SAlex Williamson          * work at all unless they're attached to express buses in the VM.
20090282abf0SAlex Williamson          * Second, a native PCIe machine introduces the possibility of fine
20100282abf0SAlex Williamson          * granularity IOMMUs supporting both translation and isolation.
20110282abf0SAlex Williamson          * Guest code to discover the IOMMU visibility of a device, such as
20120282abf0SAlex Williamson          * IOMMU grouping code on Linux, is very aware of device types and
20130282abf0SAlex Williamson          * valid transitions between bus types.  An express device on a non-
20140282abf0SAlex Williamson          * express bus is not a valid combination on bare metal systems.
20150282abf0SAlex Williamson          *
20160282abf0SAlex Williamson          * Drivers that require a PCIe capability to make the device
20170282abf0SAlex Williamson          * functional are simply going to need to have their devices placed
20180282abf0SAlex Williamson          * on a PCIe bus in the VM.
2019cf7087dbSKim Phillips          */
20200282abf0SAlex Williamson         while (!pci_bus_is_root(bus)) {
20210282abf0SAlex Williamson             bridge = pci_bridge_get_device(bus);
2022fd56e061SDavid Gibson             bus = pci_get_bus(bridge);
20230282abf0SAlex Williamson         }
20240282abf0SAlex Williamson 
20250282abf0SAlex Williamson         if (pci_bus_is_express(bus)) {
2026b771a40fSZhenzhong Duan             return true;
20270282abf0SAlex Williamson         }
20280282abf0SAlex Williamson 
2029fd56e061SDavid Gibson     } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
2030cf7087dbSKim Phillips         /*
2031cf7087dbSKim Phillips          * On a Root Complex bus Endpoints become Root Complex Integrated
2032cf7087dbSKim Phillips          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
2033cf7087dbSKim Phillips          */
2034cf7087dbSKim Phillips         if (type == PCI_EXP_TYPE_ENDPOINT) {
2035cf7087dbSKim Phillips             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2036cf7087dbSKim Phillips                                    PCI_EXP_TYPE_RC_END << 4,
2037cf7087dbSKim Phillips                                    PCI_EXP_FLAGS_TYPE);
2038cf7087dbSKim Phillips 
2039cf7087dbSKim Phillips             /* Link Capabilities, Status, and Control goes away */
2040cf7087dbSKim Phillips             if (size > PCI_EXP_LNKCTL) {
2041cf7087dbSKim Phillips                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
2042cf7087dbSKim Phillips                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2043cf7087dbSKim Phillips                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
2044cf7087dbSKim Phillips 
2045cf7087dbSKim Phillips #ifndef PCI_EXP_LNKCAP2
2046cf7087dbSKim Phillips #define PCI_EXP_LNKCAP2 44
2047cf7087dbSKim Phillips #endif
2048cf7087dbSKim Phillips #ifndef PCI_EXP_LNKSTA2
2049cf7087dbSKim Phillips #define PCI_EXP_LNKSTA2 50
2050cf7087dbSKim Phillips #endif
2051cf7087dbSKim Phillips                 /* Link 2 Capabilities, Status, and Control goes away */
2052cf7087dbSKim Phillips                 if (size > PCI_EXP_LNKCAP2) {
2053cf7087dbSKim Phillips                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
2054cf7087dbSKim Phillips                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
2055cf7087dbSKim Phillips                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
2056cf7087dbSKim Phillips                 }
2057cf7087dbSKim Phillips             }
2058cf7087dbSKim Phillips 
2059cf7087dbSKim Phillips         } else if (type == PCI_EXP_TYPE_LEG_END) {
2060cf7087dbSKim Phillips             /*
2061cf7087dbSKim Phillips              * Legacy endpoints don't belong on the root complex.  Windows
2062cf7087dbSKim Phillips              * seems to be happier with devices if we skip the capability.
2063cf7087dbSKim Phillips              */
2064b771a40fSZhenzhong Duan             return true;
2065cf7087dbSKim Phillips         }
2066cf7087dbSKim Phillips 
2067cf7087dbSKim Phillips     } else {
2068cf7087dbSKim Phillips         /*
2069cf7087dbSKim Phillips          * Convert Root Complex Integrated Endpoints to regular endpoints.
2070cf7087dbSKim Phillips          * These devices don't support LNK/LNK2 capabilities, so make them up.
2071cf7087dbSKim Phillips          */
2072cf7087dbSKim Phillips         if (type == PCI_EXP_TYPE_RC_END) {
2073cf7087dbSKim Phillips             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2074cf7087dbSKim Phillips                                    PCI_EXP_TYPE_ENDPOINT << 4,
2075cf7087dbSKim Phillips                                    PCI_EXP_FLAGS_TYPE);
2076cf7087dbSKim Phillips             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
2077d96a0ac7SAlex Williamson                            QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
2078d96a0ac7SAlex Williamson                            QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
2079cf7087dbSKim Phillips             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2080cf7087dbSKim Phillips         }
2081c00aac6fSAlex Williamson 
2082c00aac6fSAlex Williamson         vfio_pci_enable_rp_atomics(vdev);
2083cf7087dbSKim Phillips     }
2084cf7087dbSKim Phillips 
208547985727SAlex Williamson     /*
208647985727SAlex Williamson      * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
208747985727SAlex Williamson      * (Niantic errate #35) causing Windows to error with a Code 10 for the
208847985727SAlex Williamson      * device on Q35.  Fixup any such devices to report version 1.  If we
208947985727SAlex Williamson      * were to remove the capability entirely the guest would lose extended
209047985727SAlex Williamson      * config space.
209147985727SAlex Williamson      */
209247985727SAlex Williamson     if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
209347985727SAlex Williamson         vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
209447985727SAlex Williamson                                1, PCI_EXP_FLAGS_VERS);
209547985727SAlex Williamson     }
209647985727SAlex Williamson 
20979a7c2a59SMao Zhongyi     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
20989a7c2a59SMao Zhongyi                              errp);
20999a7c2a59SMao Zhongyi     if (pos < 0) {
2100b771a40fSZhenzhong Duan         return false;
2101cf7087dbSKim Phillips     }
2102cf7087dbSKim Phillips 
21039a7c2a59SMao Zhongyi     vdev->pdev.exp.exp_cap = pos;
21049a7c2a59SMao Zhongyi 
2105b771a40fSZhenzhong Duan     return true;
2106cf7087dbSKim Phillips }
2107cf7087dbSKim Phillips 
/*
 * Set vdev->has_flr if the Device Capabilities register of the PCIe
 * capability at config offset @pos has the FLR bit set.
 */
static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
{
    uint32_t devcap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);

    if (!(devcap & PCI_EXP_DEVCAP_FLR)) {
        return;
    }

    trace_vfio_check_pcie_flr(vdev->vbasedev.name);
    vdev->has_flr = true;
}
2117cf7087dbSKim Phillips 
/*
 * Set vdev->has_pm_reset unless the control/status register of the power
 * management capability at config offset @pos has the No_Soft_Reset bit set.
 */
static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
{
    uint16_t pmcsr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);

    if (pmcsr & PCI_PM_CTRL_NO_SOFT_RESET) {
        return;
    }

    trace_vfio_check_pm_reset(vdev->vbasedev.name);
    vdev->has_pm_reset = true;
}
2127cf7087dbSKim Phillips 
/*
 * Set vdev->has_flr if the Advanced Features capability at config offset
 * @pos reports both the TP and the FLR capability bits.
 */
static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
{
    uint8_t af_cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);

    /* Both bits are required. */
    if (!(af_cap & PCI_AF_CAP_TP) || !(af_cap & PCI_AF_CAP_FLR)) {
        return;
    }

    trace_vfio_check_af_flr(vdev->vbasedev.name);
    vdev->has_flr = true;
}
2137cf7087dbSKim Phillips 
vfio_add_vendor_specific_cap(VFIOPCIDevice * vdev,int pos,uint8_t size,Error ** errp)2138b771a40fSZhenzhong Duan static bool vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos,
2139187716feSVinayak Kale                                          uint8_t size, Error **errp)
2140187716feSVinayak Kale {
2141187716feSVinayak Kale     PCIDevice *pdev = &vdev->pdev;
2142187716feSVinayak Kale 
2143187716feSVinayak Kale     pos = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, size, errp);
2144187716feSVinayak Kale     if (pos < 0) {
2145b771a40fSZhenzhong Duan         return false;
2146187716feSVinayak Kale     }
2147187716feSVinayak Kale 
2148187716feSVinayak Kale     /*
2149187716feSVinayak Kale      * Exempt config space check for Vendor Specific Information during
2150187716feSVinayak Kale      * restore/load.
2151187716feSVinayak Kale      * Config space check is still enforced for 3 byte VSC header.
2152187716feSVinayak Kale      */
2153187716feSVinayak Kale     if (vdev->skip_vsc_check && size > 3) {
2154187716feSVinayak Kale         memset(pdev->cmask + pos + 3, 0, size - 3);
2155187716feSVinayak Kale     }
2156187716feSVinayak Kale 
2157b771a40fSZhenzhong Duan     return true;
2158187716feSVinayak Kale }
2159187716feSVinayak Kale 
vfio_add_std_cap(VFIOPCIDevice * vdev,uint8_t pos,Error ** errp)2160b771a40fSZhenzhong Duan static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
2161cf7087dbSKim Phillips {
2162cf8afdfaSZhao Liu     ERRP_GUARD();
2163cf7087dbSKim Phillips     PCIDevice *pdev = &vdev->pdev;
2164cf7087dbSKim Phillips     uint8_t cap_id, next, size;
2165b771a40fSZhenzhong Duan     bool ret;
2166cf7087dbSKim Phillips 
2167cf7087dbSKim Phillips     cap_id = pdev->config[pos];
21683fc1c182SWei Yang     next = pdev->config[pos + PCI_CAP_LIST_NEXT];
2169cf7087dbSKim Phillips 
2170cf7087dbSKim Phillips     /*
2171cf7087dbSKim Phillips      * If it becomes important to configure capabilities to their actual
2172cf7087dbSKim Phillips      * size, use this as the default when it's something we don't recognize.
2173cf7087dbSKim Phillips      * Since QEMU doesn't actually handle many of the config accesses,
2174cf7087dbSKim Phillips      * exact size doesn't seem worthwhile.
2175cf7087dbSKim Phillips      */
2176cf7087dbSKim Phillips     size = vfio_std_cap_max_size(pdev, pos);
2177cf7087dbSKim Phillips 
2178cf7087dbSKim Phillips     /*
2179cf7087dbSKim Phillips      * pci_add_capability always inserts the new capability at the head
2180cf7087dbSKim Phillips      * of the chain.  Therefore to end up with a chain that matches the
2181cf7087dbSKim Phillips      * physical device, we insert from the end by making this recursive.
21823fc1c182SWei Yang      * This is also why we pre-calculate size above as cached config space
2183cf7087dbSKim Phillips      * will be changed as we unwind the stack.
2184cf7087dbSKim Phillips      */
2185cf7087dbSKim Phillips     if (next) {
2186b771a40fSZhenzhong Duan         if (!vfio_add_std_cap(vdev, next, errp)) {
2187b771a40fSZhenzhong Duan             return false;
2188cf7087dbSKim Phillips         }
2189cf7087dbSKim Phillips     } else {
2190cf7087dbSKim Phillips         /* Begin the rebuild, use QEMU emulated list bits */
2191cf7087dbSKim Phillips         pdev->config[PCI_CAPABILITY_LIST] = 0;
2192cf7087dbSKim Phillips         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2193cf7087dbSKim Phillips         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2194e3f79f3bSAlex Williamson 
21950a0bda0aSZhenzhong Duan         if (!vfio_add_virt_caps(vdev, errp)) {
2196b771a40fSZhenzhong Duan             return false;
2197cf7087dbSKim Phillips         }
2198e3f79f3bSAlex Williamson     }
2199e3f79f3bSAlex Williamson 
2200e3f79f3bSAlex Williamson     /* Scale down size, esp in case virt caps were added above */
2201e3f79f3bSAlex Williamson     size = MIN(size, vfio_std_cap_max_size(pdev, pos));
2202cf7087dbSKim Phillips 
2203cf7087dbSKim Phillips     /* Use emulated next pointer to allow dropping caps */
22043fc1c182SWei Yang     pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);
2205cf7087dbSKim Phillips 
2206cf7087dbSKim Phillips     switch (cap_id) {
2207cf7087dbSKim Phillips     case PCI_CAP_ID_MSI:
22087ef165b9SEric Auger         ret = vfio_msi_setup(vdev, pos, errp);
2209cf7087dbSKim Phillips         break;
2210cf7087dbSKim Phillips     case PCI_CAP_ID_EXP:
2211cf7087dbSKim Phillips         vfio_check_pcie_flr(vdev, pos);
22127ef165b9SEric Auger         ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
2213cf7087dbSKim Phillips         break;
2214cf7087dbSKim Phillips     case PCI_CAP_ID_MSIX:
22157ef165b9SEric Auger         ret = vfio_msix_setup(vdev, pos, errp);
2216cf7087dbSKim Phillips         break;
2217cf7087dbSKim Phillips     case PCI_CAP_ID_PM:
2218cf7087dbSKim Phillips         vfio_check_pm_reset(vdev, pos);
2219cf7087dbSKim Phillips         vdev->pm_cap = pos;
2220b771a40fSZhenzhong Duan         ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
2221cf7087dbSKim Phillips         break;
2222cf7087dbSKim Phillips     case PCI_CAP_ID_AF:
2223cf7087dbSKim Phillips         vfio_check_af_flr(vdev, pos);
2224b771a40fSZhenzhong Duan         ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
2225cf7087dbSKim Phillips         break;
2226187716feSVinayak Kale     case PCI_CAP_ID_VNDR:
2227187716feSVinayak Kale         ret = vfio_add_vendor_specific_cap(vdev, pos, size, errp);
2228187716feSVinayak Kale         break;
2229cf7087dbSKim Phillips     default:
2230b771a40fSZhenzhong Duan         ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
2231cf7087dbSKim Phillips         break;
2232cf7087dbSKim Phillips     }
22335b31c822SAlex Williamson 
2234b771a40fSZhenzhong Duan     if (!ret) {
22357ef165b9SEric Auger         error_prepend(errp,
22367ef165b9SEric Auger                       "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
22377ef165b9SEric Auger                       cap_id, size, pos);
2238cf7087dbSKim Phillips     }
2239cf7087dbSKim Phillips 
2240b771a40fSZhenzhong Duan     return ret;
2241cf7087dbSKim Phillips }
2242cf7087dbSKim Phillips 
/*
 * Virtualize the Resizable BAR extended capability located at @pos so that
 * each entry only advertises the size currently selected in its control
 * register.
 *
 * Returns 0 when all entries were set up, -EINVAL when an entry's current
 * size cannot be expressed in the capability register (the caller is then
 * expected to hide the capability).
 */
static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
{
    uint32_t ctrl;
    int nbar, idx;

    /* The number of entries is taken from the first control register */
    ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL);
    nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT;

    for (idx = 0; idx < nbar; idx++) {
        /* Each entry is a cap/ctrl register pair, 8 bytes apart */
        int entry = idx * 8;
        uint32_t sizes;
        int cur_size;

        ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + entry);
        cur_size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT;

        /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */
        if (cur_size <= 27) {
            sizes = 1U << (cur_size + 4);
        } else {
            sizes = 0;
        }

        /*
         * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one
         * size in the range 1MB to 512GB.  Every size but the one currently
         * enabled in the size field is going to be masked, so if that one
         * falls outside the range this virtualization trick won't work and
         * the whole capability has to be hidden.  If >512GB resizable BARs
         * start to appear, we might need an opt-in or reservation scheme in
         * the kernel.
         */
        if (!(sizes & PCI_REBAR_CAP_SIZES)) {
            return -EINVAL;
        }

        /* Hide all sizes reported in the ctrl reg per above requirement. */
        ctrl &= (PCI_REBAR_CTRL_BAR_SIZE |
                 PCI_REBAR_CTRL_NBAR_MASK |
                 PCI_REBAR_CTRL_BAR_IDX);

        /*
         * The BAR size field is RW, but the capability register has been
         * mangled to report a single size, ie. the current BAR size.  A
         * write of an unsupported value is undefined, therefore the register
         * field is essentially RO and both registers are fully emulated.
         */
        vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + entry, sizes, ~0);
        vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + entry, ctrl, ~0);
    }

    return 0;
}
2290b5048a4cSAlex Williamson 
/*
 * Rebuild the extended capability chain of @vdev in the emulated config
 * space, dropping the capabilities that must not be exposed to the guest.
 */
static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint8_t *shadow;
    uint32_t header;
    uint16_t cap_id, next, size;
    uint8_t cap_ver;

    /* Only add extended caps if we have them and the guest can see them */
    if (!pci_is_express(pdev)) {
        return;
    }
    if (!pci_bus_is_express(pci_get_bus(pdev))) {
        return;
    }
    if (!pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
        return;
    }

    /*
     * pcie_add_capability always inserts the new capability at the tail
     * of the chain.  To end up with a chain that matches the physical
     * device, parse the extended capabilities from a private copy of the
     * config space so the original contents aren't overwritten under us.
     */
    shadow = g_memdup(pdev->config, vdev->config_size);

    /*
     * Extended capabilities are chained with each pointing to the next, so we
     * can drop anything other than the head of the chain simply by modifying
     * the previous next pointer.  Seed the head of the chain here such that
     * we can simply skip any capabilities we want to drop below, regardless
     * of their position in the chain.  If this stub capability still exists
     * after we add the capabilities we want to expose, update the capability
     * ID to zero.  Note that we cannot seed with the capability header being
     * zero as this conflicts with definition of an absent capability chain
     * and prevents capabilities beyond the head of the list from being added.
     * By replacing the dummy capability ID with zero after walking the device
     * chain, we also transparently mark extended capabilities as absent if
     * no capabilities were added.  Note that the PCIe spec defines an absence
     * of extended capabilities to be determined by a value of zero for the
     * capability ID, version, AND next pointer.  A non-zero next pointer
     * should be sufficient to indicate additional capabilities are present,
     * which will occur if we call pcie_add_capability() below.  The entire
     * first dword is emulated to support this.
     *
     * NB. The kernel side does similar masking, so be prepared that our
     * view of the device may also contain a capability ID zero in the head
     * of the chain.  Skip it for the same reason that we cannot seed the
     * chain with a zero capability.
     */
    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
                 PCI_EXT_CAP(0xFFFF, 0, 0));
    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);

    next = PCI_CONFIG_SPACE_SIZE;
    while (next) {
        header = pci_get_long(shadow + next);
        cap_id = PCI_EXT_CAP_ID(header);
        cap_ver = PCI_EXT_CAP_VER(header);

        /*
         * We don't try to configure extended capabilities to their exact
         * size since QEMU doesn't actually handle many of the config
         * accesses; the maximum available size serves as the default,
         * including for capabilities we don't recognize.
         */
        size = vfio_ext_cap_max_size(shadow, next);

        /* Use emulated next pointer to allow dropping extended caps */
        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
                                   PCI_EXT_CAP_NEXT_MASK);

        switch (cap_id) {
        case 0: /* kernel masked capability */
        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
        case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
            break;
        case PCI_EXT_CAP_ID_REBAR:
            /* Only exposed when the resizable BAR setup succeeded */
            if (vfio_setup_rebar_ecap(vdev, next) == 0) {
                pcie_add_capability(pdev, cap_id, cap_ver, next, size);
            }
            break;
        default:
            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
            break;
        }

        /* The private copy is never modified, so the header is still valid */
        next = PCI_EXT_CAP_NEXT(header);
    }

    /* Cleanup chain head ID if necessary */
    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
    }

    g_free(shadow);
}
2385325ae8d5SChen Fan 
vfio_add_capabilities(VFIOPCIDevice * vdev,Error ** errp)2386b771a40fSZhenzhong Duan static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
2387cf7087dbSKim Phillips {
2388cf7087dbSKim Phillips     PCIDevice *pdev = &vdev->pdev;
2389cf7087dbSKim Phillips 
2390cf7087dbSKim Phillips     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2391cf7087dbSKim Phillips         !pdev->config[PCI_CAPABILITY_LIST]) {
2392b771a40fSZhenzhong Duan         return true; /* Nothing to add */
2393cf7087dbSKim Phillips     }
2394cf7087dbSKim Phillips 
2395b771a40fSZhenzhong Duan     if (!vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp)) {
2396b771a40fSZhenzhong Duan         return false;
2397325ae8d5SChen Fan     }
2398325ae8d5SChen Fan 
23997ef165b9SEric Auger     vfio_add_ext_cap(vdev);
2400b771a40fSZhenzhong Duan     return true;
2401cf7087dbSKim Phillips }
2402cf7087dbSKim Phillips 
/*
 * Quiesce @vdev ahead of a reset: interrupts are disabled, the device is
 * brought back to D0 if it has a PM capability, and I/O, memory, bus master
 * and INTx disable are all cleared in the command register.
 */
void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint16_t cmd;

    vfio_disable_interrupts(vdev);

    /* Make sure the device is in D0 */
    if (vdev->pm_cap) {
        uint32_t pm_ctrl = vdev->pm_cap + PCI_PM_CTRL;
        uint16_t pmcsr = vfio_pci_read_config(pdev, pm_ctrl, 2);
        uint8_t state = pmcsr & PCI_PM_CTRL_STATE_MASK;

        if (state) {
            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
            vfio_pci_write_config(pdev, pm_ctrl, pmcsr, 2);

            /* vfio handles the necessary delay here */
            pmcsr = vfio_pci_read_config(pdev, pm_ctrl, 2);
            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
            if (state) {
                error_report("vfio: Unable to power on device, stuck in D%d",
                             state);
            }
        }
    }

    /*
     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
     * Also put INTx Disable in known state.
     */
    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
             PCI_COMMAND_INTX_DISABLE);
    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
}
2439cf7087dbSKim Phillips 
/*
 * Bring @vdev back into service after a reset: INTx is re-enabled, a zero
 * is written to each of the BAR registers through the device fd, and the
 * device quirks are reset.  Failures are reported but not propagated.
 */
void vfio_pci_post_reset(VFIOPCIDevice *vdev)
{
    Error *local_err = NULL;
    int bar;

    if (!vfio_intx_enable(vdev, &local_err)) {
        error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    for (bar = 0; bar < PCI_NUM_REGIONS - 1; bar++) {
        uint32_t zero = 0;
        uint32_t nbytes = sizeof(zero);
        off_t bar_reg = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * bar);

        if (pwrite(vdev->vbasedev.fd, &zero, nbytes, bar_reg) == nbytes) {
            continue;
        }

        error_report("%s(%s) reset bar %d failed: %m", __func__,
                     vdev->vbasedev.name, bar);
    }

    vfio_quirk_reset(vdev);
}
2462cf7087dbSKim Phillips 
/*
 * Check whether @name is the textual form of the host PCI address @addr.
 *
 * The address is rendered as "domain:bus:slot.function" in lower-case hex
 * using the "%04x:%02x:%02x.%1x" layout and compared against @name with
 * strcmp(), so @name must match that exact spelling.
 *
 * Returns true on an exact match, false otherwise.
 */
bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
{
    /*
     * The widths in the format string are minimums, not limits: a field
     * larger than its width (e.g. a domain above 0xffff) produces more
     * digits.  Size the buffer for four fully expanded values (8 hex digits
     * each, assuming 32-bit unsigned fields) plus separators and the NUL,
     * and bound the write regardless.
     */
    char tmp[40];
    int len;

    len = snprintf(tmp, sizeof(tmp), "%04x:%02x:%02x.%1x", addr->domain,
                   addr->bus, addr->slot, addr->function);
    if (len < 0 || (size_t)len >= sizeof(tmp)) {
        /* Could not be rendered in full, hence cannot equal @name */
        return false;
    }

    return strcmp(tmp, name) == 0;
}
2472cf7087dbSKim Phillips 
/*
 * Query the hot reset information of @vdev through
 * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO.
 *
 * @info_p must point to a NULL pointer (asserted).  The ioctl is issued
 * twice: first with a header-only buffer, where ENOSPC is tolerated and
 * info->count is used to size the full request, then with a buffer large
 * enough for @count device entries.
 *
 * Returns 0 on success, with the buffer stored in *@info_p; it is not freed
 * here, so the caller is expected to g_free() it.  Returns a negative errno
 * value on failure, in which case *@info_p is left untouched.
 */
int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
                                    struct vfio_pci_hot_reset_info **info_p)
{
    struct vfio_pci_hot_reset_info *info;
    size_t argsz;
    int ret, count;

    assert(info_p && !*info_p);

    info = g_malloc0(sizeof(*info));
    info->argsz = sizeof(*info);

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret && errno != ENOSPC) {
        ret = -errno;
        g_free(info);
        if (!vdev->has_pm_reset) {
            error_report("vfio: Cannot reset device %s, "
                         "no available reset mechanism.", vdev->vbasedev.name);
        }
        return ret;
    }

    /* Grow the buffer to hold the number of entries reported above */
    count = info->count;
    argsz = sizeof(*info) + (count * sizeof(info->devices[0]));
    info = g_realloc(info, argsz);
    info->argsz = argsz;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret) {
        ret = -errno;
        /*
         * "%m" is expanded from errno when the message is formatted, so
         * report before g_free() gets a chance to modify errno.
         */
        error_report("vfio: hot reset info failed: %m");
        g_free(info);
        return ret;
    }

    *info_p = info;
    return 0;
}
25104d36ec23SZhenzhong Duan 
/*
 * Forward a hot reset request for @vdev to the pci_hot_reset() handler of
 * the IOMMU class backing the device's container, and return its result.
 * @single is passed through unchanged.
 */
static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
{
    VFIODevice *dev = &vdev->vbasedev;
    const VFIOIOMMUClass *iommu_class;

    iommu_class = VFIO_IOMMU_GET_CLASS(dev->bcontainer);

    return iommu_class->pci_hot_reset(dev, single);
}
2518cf7087dbSKim Phillips 
2519cf7087dbSKim Phillips /*
2520631ba5a1SCai Huoqing  * We want to differentiate hot reset of multiple in-use devices vs hot reset
2521cf7087dbSKim Phillips  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2522cf7087dbSKim Phillips  * of doing hot resets when there is only a single device per bus.  The in-use
2523cf7087dbSKim Phillips  * here refers to how many VFIODevices are affected.  A hot reset that affects
2524cf7087dbSKim Phillips  * multiple devices, but only a single in-use device, means that we can call
2525cf7087dbSKim Phillips  * it from our bus ->reset() callback since the extent is effectively a single
2526cf7087dbSKim Phillips  * device.  This allows us to make use of it in the hotplug path.  When there
2527cf7087dbSKim Phillips  * are multiple in-use devices, we can only trigger the hot reset during a
2528cf7087dbSKim Phillips  * system reset and thus from our reset handler.  We separate _one vs _multi
2529cf7087dbSKim Phillips  * here so that we don't overlap and do a double reset on the system reset
2530cf7087dbSKim Phillips  * path where both our reset handler and ->reset() callback are used.  Calling
2531cf7087dbSKim Phillips  * _one() will only do a hot reset for the one in-use devices case, calling
2532cf7087dbSKim Phillips  * _multi() will do nothing if a _one() would have been sufficient.
2533cf7087dbSKim Phillips  */
vfio_pci_hot_reset_one(VFIOPCIDevice * vdev)25349ee27d73SEric Auger static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2535cf7087dbSKim Phillips {
2536cf7087dbSKim Phillips     return vfio_pci_hot_reset(vdev, true);
2537cf7087dbSKim Phillips }
2538cf7087dbSKim Phillips 
/*
 * Multi-device flavour of the hot reset: resolve the VFIOPCIDevice that
 * embeds @vbasedev and request a hot reset with single == false.
 */
static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
{
    const bool single = false;
    VFIOPCIDevice *pci_dev;

    pci_dev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    return vfio_pci_hot_reset(pci_dev, single);
}
2544cf7087dbSKim Phillips 
/*
 * Set vbasedev->needs_reset when the device reset is not known to work, or
 * when the device has a PM reset but no FLR.  The flag is never cleared
 * here.
 */
static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
{
    VFIOPCIDevice *pci_dev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    bool pm_reset_without_flr = !pci_dev->has_flr && pci_dev->has_pm_reset;

    if (vbasedev->reset_works && !pm_reset_without_flr) {
        return;
    }

    vbasedev->needs_reset = true;
}
2552b47d8efaSEric Auger 
vfio_pci_get_object(VFIODevice * vbasedev)2553e93b733bSKirti Wankhede static Object *vfio_pci_get_object(VFIODevice *vbasedev)
2554e93b733bSKirti Wankhede {
2555e93b733bSKirti Wankhede     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2556e93b733bSKirti Wankhede 
2557e93b733bSKirti Wankhede     return OBJECT(vdev);
2558e93b733bSKirti Wankhede }
2559e93b733bSKirti Wankhede 
vfio_msix_present(void * opaque,int version_id)2560c5e2fb3cSKirti Wankhede static bool vfio_msix_present(void *opaque, int version_id)
2561c5e2fb3cSKirti Wankhede {
2562c5e2fb3cSKirti Wankhede     PCIDevice *pdev = opaque;
2563c5e2fb3cSKirti Wankhede 
2564c5e2fb3cSKirti Wankhede     return msix_present(pdev);
2565c5e2fb3cSKirti Wankhede }
2566c5e2fb3cSKirti Wankhede 
vfio_display_migration_needed(void * opaque)256787417811SMarc-André Lureau static bool vfio_display_migration_needed(void *opaque)
256887417811SMarc-André Lureau {
256987417811SMarc-André Lureau     VFIOPCIDevice *vdev = opaque;
257087417811SMarc-André Lureau 
257187417811SMarc-André Lureau     /*
257287417811SMarc-André Lureau      * We need to migrate the VFIODisplay object if ramfb *migration* was
257387417811SMarc-André Lureau      * explicitly requested (in which case we enforced both ramfb=on and
257487417811SMarc-André Lureau      * display=on), or ramfb migration was left at the default "auto"
257587417811SMarc-André Lureau      * setting, and *ramfb* was explicitly requested (in which case we
257687417811SMarc-André Lureau      * enforced display=on).
257787417811SMarc-André Lureau      */
257887417811SMarc-André Lureau     return vdev->ramfb_migrate == ON_OFF_AUTO_ON ||
257987417811SMarc-André Lureau         (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb);
258087417811SMarc-André Lureau }
258187417811SMarc-André Lureau 
258275d5a5feSFrediano Ziglio static const VMStateDescription vmstate_vfio_display = {
258387417811SMarc-André Lureau     .name = "VFIOPCIDevice/VFIODisplay",
258487417811SMarc-André Lureau     .version_id = 1,
258587417811SMarc-André Lureau     .minimum_version_id = 1,
258687417811SMarc-André Lureau     .needed = vfio_display_migration_needed,
258765bd53e8SRichard Henderson     .fields = (const VMStateField[]){
258887417811SMarc-André Lureau         VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate,
258987417811SMarc-André Lureau                                VFIODisplay),
259087417811SMarc-André Lureau         VMSTATE_END_OF_LIST()
259187417811SMarc-André Lureau     }
259287417811SMarc-André Lureau };
259387417811SMarc-André Lureau 
259475d5a5feSFrediano Ziglio static const VMStateDescription vmstate_vfio_pci_config = {
2595c5e2fb3cSKirti Wankhede     .name = "VFIOPCIDevice",
2596c5e2fb3cSKirti Wankhede     .version_id = 1,
2597c5e2fb3cSKirti Wankhede     .minimum_version_id = 1,
259865bd53e8SRichard Henderson     .fields = (const VMStateField[]) {
2599c5e2fb3cSKirti Wankhede         VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
2600c5e2fb3cSKirti Wankhede         VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
2601c5e2fb3cSKirti Wankhede         VMSTATE_END_OF_LIST()
260287417811SMarc-André Lureau     },
260365bd53e8SRichard Henderson     .subsections = (const VMStateDescription * const []) {
260487417811SMarc-André Lureau         &vmstate_vfio_display,
260587417811SMarc-André Lureau         NULL
2606c5e2fb3cSKirti Wankhede     }
2607c5e2fb3cSKirti Wankhede };
2608c5e2fb3cSKirti Wankhede 
vfio_pci_save_config(VFIODevice * vbasedev,QEMUFile * f,Error ** errp)26093783f814SCédric Le Goater static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp)
2610c5e2fb3cSKirti Wankhede {
2611c5e2fb3cSKirti Wankhede     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2612c5e2fb3cSKirti Wankhede 
26133783f814SCédric Le Goater     return vmstate_save_state_with_err(f, &vmstate_vfio_pci_config, vdev, NULL,
26143783f814SCédric Le Goater                                        errp);
2615c5e2fb3cSKirti Wankhede }
2616c5e2fb3cSKirti Wankhede 
/*
 * Load the PCI config state of the VFIOPCIDevice embedding @vbasedev from
 * @f and re-apply it: the command register is written back, the mappings
 * of sub-page BARs whose address changed are updated, and MSI or MSI-X is
 * enabled again if the loaded state has it enabled.
 *
 * Returns 0 on success, or the non-zero result of vmstate_load_state().
 */
static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    PCIDevice *pdev = &vdev->pdev;
    pcibus_t prev_addr[PCI_NUM_REGIONS - 1];
    int i, rc;

    /* Snapshot the BAR addresses so changes made by the load can be seen */
    for (i = 0; i < PCI_ROM_SLOT; i++) {
        prev_addr[i] = pdev->io_regions[i].addr;
    }

    rc = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
    if (rc) {
        return rc;
    }

    vfio_pci_write_config(pdev, PCI_COMMAND,
                          pci_get_word(pdev->config + PCI_COMMAND), 2);

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        /*
         * The address may not be changed in some scenarios
         * (e.g. the VF driver isn't loaded in VM).
         */
        if (prev_addr[i] == pdev->io_regions[i].addr) {
            continue;
        }

        /* Only non-empty BARs smaller than a host page need the update */
        if (!(vdev->bars[i].region.size > 0)) {
            continue;
        }
        if (!(vdev->bars[i].region.size < qemu_real_host_page_size())) {
            continue;
        }

        vfio_sub_page_bar_update_mapping(pdev, i);
    }

    /* MSI takes precedence; MSI-X is only considered when MSI is off */
    if (msi_enabled(pdev)) {
        vfio_msi_enable(vdev);
    } else if (msix_enabled(pdev)) {
        vfio_msix_enable(vdev);
    }

    return 0;
}
2656c5e2fb3cSKirti Wankhede 
2657b47d8efaSEric Auger static VFIODeviceOps vfio_pci_ops = {
2658b47d8efaSEric Auger     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
2659b47d8efaSEric Auger     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
2660870cb6f1SAlex Williamson     .vfio_eoi = vfio_intx_eoi,
2661e93b733bSKirti Wankhede     .vfio_get_object = vfio_pci_get_object,
2662c5e2fb3cSKirti Wankhede     .vfio_save_config = vfio_pci_save_config,
2663c5e2fb3cSKirti Wankhede     .vfio_load_config = vfio_pci_load_config,
2664b47d8efaSEric Auger };
2665b47d8efaSEric Auger 
vfio_populate_vga(VFIOPCIDevice * vdev,Error ** errp)266664410a74SZhenzhong Duan bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
2667e593c021SAlex Williamson {
2668e593c021SAlex Williamson     VFIODevice *vbasedev = &vdev->vbasedev;
26690d3e89beSZhenzhong Duan     g_autofree struct vfio_region_info *reg_info = NULL;
2670e593c021SAlex Williamson     int ret;
2671e593c021SAlex Williamson 
26724225f2b6SAlex Williamson     ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
2673e593c021SAlex Williamson     if (ret) {
2674cde4279bSEric Auger         error_setg_errno(errp, -ret,
2675cde4279bSEric Auger                          "failed getting region info for VGA region index %d",
2676cde4279bSEric Auger                          VFIO_PCI_VGA_REGION_INDEX);
267764410a74SZhenzhong Duan         return false;
2678e593c021SAlex Williamson     }
2679e593c021SAlex Williamson 
2680e593c021SAlex Williamson     if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
2681e593c021SAlex Williamson         !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2682e593c021SAlex Williamson         reg_info->size < 0xbffff + 1) {
2683cde4279bSEric Auger         error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
2684e593c021SAlex Williamson                    (unsigned long)reg_info->flags,
2685e593c021SAlex Williamson                    (unsigned long)reg_info->size);
268664410a74SZhenzhong Duan         return false;
2687e593c021SAlex Williamson     }
2688e593c021SAlex Williamson 
2689e593c021SAlex Williamson     vdev->vga = g_new0(VFIOVGA, 1);
2690e593c021SAlex Williamson 
2691e593c021SAlex Williamson     vdev->vga->fd_offset = reg_info->offset;
2692e593c021SAlex Williamson     vdev->vga->fd = vdev->vbasedev.fd;
2693e593c021SAlex Williamson 
2694e593c021SAlex Williamson     vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2695e593c021SAlex Williamson     vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2696e593c021SAlex Williamson     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
2697e593c021SAlex Williamson 
2698182bca45SAlex Williamson     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2699182bca45SAlex Williamson                           OBJECT(vdev), &vfio_vga_ops,
2700182bca45SAlex Williamson                           &vdev->vga->region[QEMU_PCI_VGA_MEM],
2701182bca45SAlex Williamson                           "vfio-vga-mmio@0xa0000",
2702182bca45SAlex Williamson                           QEMU_PCI_VGA_MEM_SIZE);
2703182bca45SAlex Williamson 
2704e593c021SAlex Williamson     vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2705e593c021SAlex Williamson     vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2706e593c021SAlex Williamson     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
2707e593c021SAlex Williamson 
2708182bca45SAlex Williamson     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2709182bca45SAlex Williamson                           OBJECT(vdev), &vfio_vga_ops,
2710182bca45SAlex Williamson                           &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
2711182bca45SAlex Williamson                           "vfio-vga-io@0x3b0",
2712182bca45SAlex Williamson                           QEMU_PCI_VGA_IO_LO_SIZE);
2713182bca45SAlex Williamson 
2714e593c021SAlex Williamson     vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2715e593c021SAlex Williamson     vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2716e593c021SAlex Williamson     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
2717e593c021SAlex Williamson 
2718182bca45SAlex Williamson     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
2719182bca45SAlex Williamson                           OBJECT(vdev), &vfio_vga_ops,
2720182bca45SAlex Williamson                           &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
2721182bca45SAlex Williamson                           "vfio-vga-io@0x3c0",
2722182bca45SAlex Williamson                           QEMU_PCI_VGA_IO_HI_SIZE);
2723182bca45SAlex Williamson 
2724182bca45SAlex Williamson     pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2725182bca45SAlex Williamson                      &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2726182bca45SAlex Williamson                      &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
2727182bca45SAlex Williamson 
272864410a74SZhenzhong Duan     return true;
2729e593c021SAlex Williamson }
2730e593c021SAlex Williamson 
vfio_populate_device(VFIOPCIDevice * vdev,Error ** errp)2731e942d8f0SZhenzhong Duan static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
2732cf7087dbSKim Phillips {
2733217e9fdcSPaolo Bonzini     VFIODevice *vbasedev = &vdev->vbasedev;
27340d3e89beSZhenzhong Duan     g_autofree struct vfio_region_info *reg_info = NULL;
2735cf7087dbSKim Phillips     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
2736d13dd2d7SEric Auger     int i, ret = -1;
2737cf7087dbSKim Phillips 
2738cf7087dbSKim Phillips     /* Sanity check device */
2739d13dd2d7SEric Auger     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
27402312d907SEric Auger         error_setg(errp, "this isn't a PCI device");
2741e942d8f0SZhenzhong Duan         return false;
2742cf7087dbSKim Phillips     }
2743cf7087dbSKim Phillips 
2744d13dd2d7SEric Auger     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
27452312d907SEric Auger         error_setg(errp, "unexpected number of io regions %u",
2746d13dd2d7SEric Auger                    vbasedev->num_regions);
2747e942d8f0SZhenzhong Duan         return false;
2748cf7087dbSKim Phillips     }
2749cf7087dbSKim Phillips 
2750d13dd2d7SEric Auger     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
27512312d907SEric Auger         error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
2752e942d8f0SZhenzhong Duan         return false;
2753cf7087dbSKim Phillips     }
2754cf7087dbSKim Phillips 
2755cf7087dbSKim Phillips     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2756db0da029SAlex Williamson         char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
2757db0da029SAlex Williamson 
2758db0da029SAlex Williamson         ret = vfio_region_setup(OBJECT(vdev), vbasedev,
2759db0da029SAlex Williamson                                 &vdev->bars[i].region, i, name);
2760db0da029SAlex Williamson         g_free(name);
2761db0da029SAlex Williamson 
2762cf7087dbSKim Phillips         if (ret) {
27632312d907SEric Auger             error_setg_errno(errp, -ret, "failed to get region %d info", i);
2764e942d8f0SZhenzhong Duan             return false;
2765cf7087dbSKim Phillips         }
2766cf7087dbSKim Phillips 
2767cf7087dbSKim Phillips         QLIST_INIT(&vdev->bars[i].quirks);
2768cf7087dbSKim Phillips     }
2769cf7087dbSKim Phillips 
277046900226SAlex Williamson     ret = vfio_get_region_info(vbasedev,
277146900226SAlex Williamson                                VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
2772cf7087dbSKim Phillips     if (ret) {
27732312d907SEric Auger         error_setg_errno(errp, -ret, "failed to get config info");
2774e942d8f0SZhenzhong Duan         return false;
2775cf7087dbSKim Phillips     }
2776cf7087dbSKim Phillips 
2777d13dd2d7SEric Auger     trace_vfio_populate_device_config(vdev->vbasedev.name,
277846900226SAlex Williamson                                       (unsigned long)reg_info->size,
277946900226SAlex Williamson                                       (unsigned long)reg_info->offset,
278046900226SAlex Williamson                                       (unsigned long)reg_info->flags);
2781cf7087dbSKim Phillips 
278246900226SAlex Williamson     vdev->config_size = reg_info->size;
2783cf7087dbSKim Phillips     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2784cf7087dbSKim Phillips         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2785cf7087dbSKim Phillips     }
278646900226SAlex Williamson     vdev->config_offset = reg_info->offset;
278746900226SAlex Williamson 
2788e593c021SAlex Williamson     if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
278964410a74SZhenzhong Duan         if (!vfio_populate_vga(vdev, errp)) {
27902312d907SEric Auger             error_append_hint(errp, "device does not support "
2791cde4279bSEric Auger                               "requested feature x-vga\n");
2792e942d8f0SZhenzhong Duan             return false;
2793cf7087dbSKim Phillips         }
2794cf7087dbSKim Phillips     }
279547cbe50cSAlex Williamson 
2796cf7087dbSKim Phillips     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
2797cf7087dbSKim Phillips 
27985546a621SEric Auger     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
2799cf7087dbSKim Phillips     if (ret) {
2800cf7087dbSKim Phillips         /* This can fail for an old kernel or legacy PCI dev */
2801772f1b37SDaniel P. Berrangé         trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
2802cf7087dbSKim Phillips     } else if (irq_info.count == 1) {
2803cf7087dbSKim Phillips         vdev->pci_aer = true;
2804cf7087dbSKim Phillips     } else {
2805e1eb292aSMarkus Armbruster         warn_report(VFIO_MSG_PREFIX
2806cf7087dbSKim Phillips                     "Could not enable error recovery for the device",
2807df92ee44SEric Auger                     vbasedev->name);
2808cf7087dbSKim Phillips     }
2809e942d8f0SZhenzhong Duan 
2810e942d8f0SZhenzhong Duan     return true;
2811d13dd2d7SEric Auger }
2812d13dd2d7SEric Auger 
/*
 * Release what the VFIO PCI device holds: detach the base device via
 * vfio_detach_device(), then free the base device name and the msix
 * allocation.  The pointers themselves are not reset here.
 */
static void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
    VFIODevice *vbasedev = &vdev->vbasedev;

    vfio_detach_device(vbasedev);

    g_free(vbasedev->name);
    g_free(vdev->msix);
}
2820cf7087dbSKim Phillips 
vfio_err_notifier_handler(void * opaque)2821cf7087dbSKim Phillips static void vfio_err_notifier_handler(void *opaque)
2822cf7087dbSKim Phillips {
28239ee27d73SEric Auger     VFIOPCIDevice *vdev = opaque;
2824cf7087dbSKim Phillips 
2825cf7087dbSKim Phillips     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
2826cf7087dbSKim Phillips         return;
2827cf7087dbSKim Phillips     }
2828cf7087dbSKim Phillips 
2829cf7087dbSKim Phillips     /*
2830cf7087dbSKim Phillips      * TBD. Retrieve the error details and decide what action
2831cf7087dbSKim Phillips      * needs to be taken. One of the actions could be to pass
2832cf7087dbSKim Phillips      * the error to the guest and have the guest driver recover
2833cf7087dbSKim Phillips      * from the error. This requires that PCIe capabilities be
2834cf7087dbSKim Phillips      * exposed to the guest. For now, we just terminate the
2835cf7087dbSKim Phillips      * guest to contain the error.
2836cf7087dbSKim Phillips      */
2837cf7087dbSKim Phillips 
28387df9381bSAlex Williamson     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
2839cf7087dbSKim Phillips 
2840cf7087dbSKim Phillips     vm_stop(RUN_STATE_INTERNAL_ERROR);
2841cf7087dbSKim Phillips }
2842cf7087dbSKim Phillips 
2843cf7087dbSKim Phillips /*
2844cf7087dbSKim Phillips  * Registers error notifier for devices supporting error recovery.
2845cf7087dbSKim Phillips  * If we encounter a failure in this function, we report an error
2846cf7087dbSKim Phillips  * and continue after disabling error recovery support for the
2847cf7087dbSKim Phillips  * device.
2848cf7087dbSKim Phillips  */
vfio_register_err_notifier(VFIOPCIDevice * vdev)28499ee27d73SEric Auger static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
2850cf7087dbSKim Phillips {
2851201a7331SEric Auger     Error *err = NULL;
2852201a7331SEric Auger     int32_t fd;
2853cf7087dbSKim Phillips 
2854cf7087dbSKim Phillips     if (!vdev->pci_aer) {
2855cf7087dbSKim Phillips         return;
2856cf7087dbSKim Phillips     }
2857cf7087dbSKim Phillips 
2858cf7087dbSKim Phillips     if (event_notifier_init(&vdev->err_notifier, 0)) {
2859cf7087dbSKim Phillips         error_report("vfio: Unable to init event notifier for error detection");
2860cf7087dbSKim Phillips         vdev->pci_aer = false;
2861cf7087dbSKim Phillips         return;
2862cf7087dbSKim Phillips     }
2863cf7087dbSKim Phillips 
2864201a7331SEric Auger     fd = event_notifier_get_fd(&vdev->err_notifier);
2865201a7331SEric Auger     qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
2866cf7087dbSKim Phillips 
286784e37d02SZhenzhong Duan     if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
2868201a7331SEric Auger                                 VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
2869201a7331SEric Auger         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2870201a7331SEric Auger         qemu_set_fd_handler(fd, NULL, NULL, vdev);
2871cf7087dbSKim Phillips         event_notifier_cleanup(&vdev->err_notifier);
2872cf7087dbSKim Phillips         vdev->pci_aer = false;
2873cf7087dbSKim Phillips     }
2874cf7087dbSKim Phillips }
2875cf7087dbSKim Phillips 
/*
 * Undo vfio_register_err_notifier(): ask VFIO to stop signalling the error
 * IRQ (fd -1), remove the fd handler and clean up the notifier.
 *
 * No-op when vdev->pci_aer is not set.  A failure to change the IRQ
 * signalling is reported but does not stop the rest of the teardown.
 */
static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
{
    EventNotifier *notifier = &vdev->err_notifier;
    Error *local_err = NULL;
    bool ok;

    if (!vdev->pci_aer) {
        return;
    }

    ok = vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
                                VFIO_IRQ_SET_ACTION_TRIGGER, -1, &local_err);
    if (!ok) {
        error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    qemu_set_fd_handler(event_notifier_get_fd(notifier), NULL, NULL, vdev);
    event_notifier_cleanup(notifier);
}
2892cf7087dbSKim Phillips 
vfio_req_notifier_handler(void * opaque)289347cbe50cSAlex Williamson static void vfio_req_notifier_handler(void *opaque)
289447cbe50cSAlex Williamson {
289547cbe50cSAlex Williamson     VFIOPCIDevice *vdev = opaque;
289635c7cb4cSAlex Williamson     Error *err = NULL;
289747cbe50cSAlex Williamson 
289847cbe50cSAlex Williamson     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
289947cbe50cSAlex Williamson         return;
290047cbe50cSAlex Williamson     }
290147cbe50cSAlex Williamson 
2902a2596aeeSPhilippe Mathieu-Daudé     qdev_unplug(DEVICE(vdev), &err);
290335c7cb4cSAlex Williamson     if (err) {
2904e1eb292aSMarkus Armbruster         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
290535c7cb4cSAlex Williamson     }
290647cbe50cSAlex Williamson }
290747cbe50cSAlex Williamson 
/*
 * Set up the device-request notifier when VFIO_FEATURE_ENABLE_REQ is set
 * and the kernel reports at least one request IRQ.
 *
 * On success vdev->req_enabled becomes true.  Every failure path returns
 * quietly or after reporting an error, leaving req_enabled untouched and
 * undoing whatever was already set up.
 */
static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
{
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
                                      .index = VFIO_PCI_REQ_IRQ_INDEX };
    EventNotifier *notifier = &vdev->req_notifier;
    Error *local_err = NULL;
    int32_t notify_fd;

    if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
        return;
    }

    /* Silently give up if the IRQ cannot be queried or does not exist. */
    if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0) {
        return;
    }
    if (irq_info.count < 1) {
        return;
    }

    if (event_notifier_init(notifier, 0)) {
        error_report("vfio: Unable to init event notifier for device request");
        return;
    }

    /* Install the handler before asking VFIO to signal the fd. */
    notify_fd = event_notifier_get_fd(notifier);
    qemu_set_fd_handler(notify_fd, vfio_req_notifier_handler, NULL, vdev);

    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
                               VFIO_IRQ_SET_ACTION_TRIGGER, notify_fd,
                               &local_err)) {
        vdev->req_enabled = true;
        return;
    }

    /* Signalling could not be set up: report and roll everything back. */
    error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    qemu_set_fd_handler(notify_fd, NULL, NULL, vdev);
    event_notifier_cleanup(notifier);
}
294147cbe50cSAlex Williamson 
/*
 * Undo vfio_register_req_notifier(): ask VFIO to stop signalling the
 * request IRQ (fd -1), remove the fd handler, clean up the notifier and
 * clear vdev->req_enabled.
 *
 * No-op when req_enabled is not set.  A failure to change the IRQ
 * signalling is reported but does not stop the rest of the teardown.
 */
static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
{
    EventNotifier *notifier = &vdev->req_notifier;
    Error *local_err = NULL;
    bool ok;

    if (!vdev->req_enabled) {
        return;
    }

    ok = vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
                                VFIO_IRQ_SET_ACTION_TRIGGER, -1, &local_err);
    if (!ok) {
        error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    qemu_set_fd_handler(event_notifier_get_fd(notifier), NULL, NULL, vdev);
    event_notifier_cleanup(notifier);

    vdev->req_enabled = false;
}
296047cbe50cSAlex Williamson 
vfio_realize(PCIDevice * pdev,Error ** errp)29611a22aca1SEric Auger static void vfio_realize(PCIDevice *pdev, Error **errp)
2962cf7087dbSKim Phillips {
2963cf8afdfaSZhao Liu     ERRP_GUARD();
296401b46064SEduardo Habkost     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
29650d570a25SEric Auger     VFIODevice *vbasedev = &vdev->vbasedev;
2966581406e0SAlex Williamson     int i, ret;
2967f8d6f3b1SCédric Le Goater     char uuid[UUID_STR_LEN];
296881987bd5SZhenzhong Duan     g_autofree char *name = NULL;
2969cf7087dbSKim Phillips 
2970da3e04b2SZhenzhong Duan     if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
29714a946268SEric Auger         if (!(~vdev->host.domain || ~vdev->host.bus ||
29724a946268SEric Auger               ~vdev->host.slot || ~vdev->host.function)) {
29734a946268SEric Auger             error_setg(errp, "No provided host device");
29746e4e6f0dSDong Jia Shi             error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
2975da3e04b2SZhenzhong Duan #ifdef CONFIG_IOMMUFD
2976da3e04b2SZhenzhong Duan                               "or -device vfio-pci,fd=DEVICE_FD "
2977da3e04b2SZhenzhong Duan #endif
29786e4e6f0dSDong Jia Shi                               "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
29794a946268SEric Auger             return;
29804a946268SEric Auger         }
29810d570a25SEric Auger         vbasedev->sysfsdev =
29827df9381bSAlex Williamson             g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
29837df9381bSAlex Williamson                             vdev->host.domain, vdev->host.bus,
29847df9381bSAlex Williamson                             vdev->host.slot, vdev->host.function);
29857df9381bSAlex Williamson     }
29867df9381bSAlex Williamson 
2987c6c6cf91SZhenzhong Duan     if (!vfio_device_get_name(vbasedev, errp)) {
29881a22aca1SEric Auger         return;
2989cf7087dbSKim Phillips     }
2990462037c9SEric Auger 
2991238e9172SAlex Williamson     /*
2992aff92b82SDavid Hildenbrand      * Mediated devices *might* operate compatibly with discarding of RAM, but
2993238e9172SAlex Williamson      * we cannot know for certain, it depends on whether the mdev vendor driver
2994238e9172SAlex Williamson      * stays in sync with the active working set of the guest driver.  Prevent
2995238e9172SAlex Williamson      * the x-balloon-allowed option unless this is minimally an mdev device.
2996238e9172SAlex Williamson      */
299713e522f6SJoao Martins     vbasedev->mdev = vfio_device_is_mdev(vbasedev);
2998238e9172SAlex Williamson 
299913e522f6SJoao Martins     trace_vfio_mdev(vbasedev->name, vbasedev->mdev);
3000238e9172SAlex Williamson 
300113e522f6SJoao Martins     if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) {
3002238e9172SAlex Williamson         error_setg(errp, "x-balloon-allowed only potentially compatible "
3003238e9172SAlex Williamson                    "with mdev devices");
3004238e9172SAlex Williamson         goto error;
3005238e9172SAlex Williamson     }
3006238e9172SAlex Williamson 
30072dca1b37SMinwoo Im     if (!qemu_uuid_is_null(&vdev->vf_token)) {
30082dca1b37SMinwoo Im         qemu_uuid_unparse(&vdev->vf_token, uuid);
30092dca1b37SMinwoo Im         name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
30102dca1b37SMinwoo Im     } else {
3011b83b40b6SZhenzhong Duan         name = g_strdup(vbasedev->name);
30122dca1b37SMinwoo Im     }
30132dca1b37SMinwoo Im 
3014b7754835SZhenzhong Duan     if (!vfio_attach_device(name, vbasedev,
3015b7754835SZhenzhong Duan                             pci_device_iommu_address_space(pdev), errp)) {
3016426ec904SEric Auger         goto error;
3017cf7087dbSKim Phillips     }
3018cf7087dbSKim Phillips 
3019e942d8f0SZhenzhong Duan     if (!vfio_populate_device(vdev, errp)) {
3020c0f527f4SEric Auger         goto error;
3021217e9fdcSPaolo Bonzini     }
3022217e9fdcSPaolo Bonzini 
3023cf7087dbSKim Phillips     /* Get a copy of config space */
30240d570a25SEric Auger     ret = pread(vbasedev->fd, vdev->pdev.config,
3025cf7087dbSKim Phillips                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
3026cf7087dbSKim Phillips                 vdev->config_offset);
3027cf7087dbSKim Phillips     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
3028cf7087dbSKim Phillips         ret = ret < 0 ? -errno : -EFAULT;
30291a22aca1SEric Auger         error_setg_errno(errp, -ret, "failed to read device config space");
3030c0f527f4SEric Auger         goto error;
3031cf7087dbSKim Phillips     }
3032cf7087dbSKim Phillips 
3033cf7087dbSKim Phillips     /* vfio emulates a lot for us, but some bits need extra love */
3034cf7087dbSKim Phillips     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3035cf7087dbSKim Phillips 
3036cf7087dbSKim Phillips     /* QEMU can choose to expose the ROM or not */
3037cf7087dbSKim Phillips     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
303804f336b0SAlex Williamson     /* QEMU can also add or extend BARs */
303904f336b0SAlex Williamson     memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
3040cf7087dbSKim Phillips 
304189dcccc5SAlex Williamson     /*
304289dcccc5SAlex Williamson      * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
304389dcccc5SAlex Williamson      * device ID is managed by the vendor and need only be a 16-bit value.
304489dcccc5SAlex Williamson      * Allow any 16-bit value for subsystem so they can be hidden or changed.
304589dcccc5SAlex Williamson      */
304689dcccc5SAlex Williamson     if (vdev->vendor_id != PCI_ANY_ID) {
304789dcccc5SAlex Williamson         if (vdev->vendor_id >= 0xffff) {
30481a22aca1SEric Auger             error_setg(errp, "invalid PCI vendor ID provided");
3049c0f527f4SEric Auger             goto error;
305089dcccc5SAlex Williamson         }
305189dcccc5SAlex Williamson         vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
30520d570a25SEric Auger         trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
305389dcccc5SAlex Williamson     } else {
3054ff635e37SAlex Williamson         vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
305589dcccc5SAlex Williamson     }
305689dcccc5SAlex Williamson 
305789dcccc5SAlex Williamson     if (vdev->device_id != PCI_ANY_ID) {
305889dcccc5SAlex Williamson         if (vdev->device_id > 0xffff) {
30591a22aca1SEric Auger             error_setg(errp, "invalid PCI device ID provided");
3060c0f527f4SEric Auger             goto error;
306189dcccc5SAlex Williamson         }
306289dcccc5SAlex Williamson         vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
30630d570a25SEric Auger         trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
306489dcccc5SAlex Williamson     } else {
3065ff635e37SAlex Williamson         vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
306689dcccc5SAlex Williamson     }
306789dcccc5SAlex Williamson 
306889dcccc5SAlex Williamson     if (vdev->sub_vendor_id != PCI_ANY_ID) {
306989dcccc5SAlex Williamson         if (vdev->sub_vendor_id > 0xffff) {
30701a22aca1SEric Auger             error_setg(errp, "invalid PCI subsystem vendor ID provided");
3071c0f527f4SEric Auger             goto error;
307289dcccc5SAlex Williamson         }
307389dcccc5SAlex Williamson         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
307489dcccc5SAlex Williamson                                vdev->sub_vendor_id, ~0);
30750d570a25SEric Auger         trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
307689dcccc5SAlex Williamson                                               vdev->sub_vendor_id);
307789dcccc5SAlex Williamson     }
307889dcccc5SAlex Williamson 
307989dcccc5SAlex Williamson     if (vdev->sub_device_id != PCI_ANY_ID) {
308089dcccc5SAlex Williamson         if (vdev->sub_device_id > 0xffff) {
30811a22aca1SEric Auger             error_setg(errp, "invalid PCI subsystem device ID provided");
3082c0f527f4SEric Auger             goto error;
308389dcccc5SAlex Williamson         }
308489dcccc5SAlex Williamson         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
30850d570a25SEric Auger         trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
308689dcccc5SAlex Williamson                                               vdev->sub_device_id);
308789dcccc5SAlex Williamson     }
3088ff635e37SAlex Williamson 
3089cf7087dbSKim Phillips     /* QEMU can change multi-function devices to single function, or reverse */
3090cf7087dbSKim Phillips     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3091cf7087dbSKim Phillips                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
3092cf7087dbSKim Phillips 
3093cf7087dbSKim Phillips     /* Restore or clear multifunction, this is always controlled by QEMU */
3094cf7087dbSKim Phillips     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3095cf7087dbSKim Phillips         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3096cf7087dbSKim Phillips     } else {
3097cf7087dbSKim Phillips         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3098cf7087dbSKim Phillips     }
3099cf7087dbSKim Phillips 
3100cf7087dbSKim Phillips     /*
3101cf7087dbSKim Phillips      * Clear host resource mapping info.  If we choose not to register a
3102cf7087dbSKim Phillips      * BAR, such as might be the case with the option ROM, we can get
3103cf7087dbSKim Phillips      * confusing, unwritable, residual addresses from the host here.
3104cf7087dbSKim Phillips      */
3105cf7087dbSKim Phillips     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3106cf7087dbSKim Phillips     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3107cf7087dbSKim Phillips 
3108cf7087dbSKim Phillips     vfio_pci_size_rom(vdev);
3109cf7087dbSKim Phillips 
311089d5202eSAlex Williamson     vfio_bars_prepare(vdev);
311189d5202eSAlex Williamson 
3112713b59a6SZhenzhong Duan     if (!vfio_msix_early_setup(vdev, errp)) {
3113c0f527f4SEric Auger         goto error;
3114cf7087dbSKim Phillips     }
3115cf7087dbSKim Phillips 
31163a286732SAlex Williamson     vfio_bars_register(vdev);
3117cf7087dbSKim Phillips 
31189f176041SJoao Martins     if (!vbasedev->mdev &&
31199f176041SJoao Martins         !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
3120ee26474dSZhenzhong Duan         error_prepend(errp, "Failed to set iommu_device: ");
3121cf7087dbSKim Phillips         goto out_teardown;
3122cf7087dbSKim Phillips     }
3123cf7087dbSKim Phillips 
3124ee26474dSZhenzhong Duan     if (!vfio_add_capabilities(vdev, errp)) {
3125ee26474dSZhenzhong Duan         goto out_unset_idev;
3126ee26474dSZhenzhong Duan     }
3127ee26474dSZhenzhong Duan 
3128182bca45SAlex Williamson     if (vdev->vga) {
3129182bca45SAlex Williamson         vfio_vga_quirk_setup(vdev);
3130182bca45SAlex Williamson     }
3131182bca45SAlex Williamson 
3132581406e0SAlex Williamson     for (i = 0; i < PCI_ROM_SLOT; i++) {
3133581406e0SAlex Williamson         vfio_bar_quirk_setup(vdev, i);
3134581406e0SAlex Williamson     }
3135581406e0SAlex Williamson 
31366ced0bbaSAlex Williamson     if (!vdev->igd_opregion &&
31376ced0bbaSAlex Williamson         vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
3138514855e1SZhenzhong Duan         g_autofree struct vfio_region_info *opregion = NULL;
31396ced0bbaSAlex Williamson 
31406ced0bbaSAlex Williamson         if (vdev->pdev.qdev.hotplugged) {
31411a22aca1SEric Auger             error_setg(errp,
3142426ec904SEric Auger                        "cannot support IGD OpRegion feature on hotplugged "
3143426ec904SEric Auger                        "device");
3144ee26474dSZhenzhong Duan             goto out_unset_idev;
31456ced0bbaSAlex Williamson         }
31466ced0bbaSAlex Williamson 
31470d570a25SEric Auger         ret = vfio_get_dev_region_info(vbasedev,
31486ced0bbaSAlex Williamson                         VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
31496ced0bbaSAlex Williamson                         VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
31506ced0bbaSAlex Williamson         if (ret) {
31511a22aca1SEric Auger             error_setg_errno(errp, -ret,
3152426ec904SEric Auger                              "does not support requested IGD OpRegion feature");
3153ee26474dSZhenzhong Duan             goto out_unset_idev;
31546ced0bbaSAlex Williamson         }
31556ced0bbaSAlex Williamson 
3156d3c6a18bSZhenzhong Duan         if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) {
3157ee26474dSZhenzhong Duan             goto out_unset_idev;
31586ced0bbaSAlex Williamson         }
31596ced0bbaSAlex Williamson     }
31606ced0bbaSAlex Williamson 
3161cf7087dbSKim Phillips     /* QEMU emulates all of MSI & MSIX */
3162cf7087dbSKim Phillips     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3163cf7087dbSKim Phillips         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3164cf7087dbSKim Phillips                MSIX_CAP_LENGTH);
3165cf7087dbSKim Phillips     }
3166cf7087dbSKim Phillips 
3167cf7087dbSKim Phillips     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3168cf7087dbSKim Phillips         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3169cf7087dbSKim Phillips                vdev->msi_cap_size);
3170cf7087dbSKim Phillips     }
3171cf7087dbSKim Phillips 
3172cf7087dbSKim Phillips     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3173cf7087dbSKim Phillips         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3174cf7087dbSKim Phillips                                                   vfio_intx_mmap_enable, vdev);
3175ad54dbd8SDavid Gibson         pci_device_set_intx_routing_notifier(&vdev->pdev,
3176ad54dbd8SDavid Gibson                                              vfio_intx_routing_notifier);
3177c5478feaSDavid Gibson         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
3178c5478feaSDavid Gibson         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
3179c32bab07SZhenzhong Duan         if (!vfio_intx_enable(vdev, errp)) {
3180c5478feaSDavid Gibson             goto out_deregister;
3181cf7087dbSKim Phillips         }
3182cf7087dbSKim Phillips     }
3183cf7087dbSKim Phillips 
3184a9994687SGerd Hoffmann     if (vdev->display != ON_OFF_AUTO_OFF) {
3185455c009dSZhenzhong Duan         if (!vfio_display_probe(vdev, errp)) {
3186c5478feaSDavid Gibson             goto out_deregister;
3187a9994687SGerd Hoffmann         }
3188a9994687SGerd Hoffmann     }
3189b290659fSGerd Hoffmann     if (vdev->enable_ramfb && vdev->dpy == NULL) {
3190b290659fSGerd Hoffmann         error_setg(errp, "ramfb=on requires display=on");
3191c5478feaSDavid Gibson         goto out_deregister;
3192b290659fSGerd Hoffmann     }
3193c62a0c7cSGerd Hoffmann     if (vdev->display_xres || vdev->display_yres) {
3194c62a0c7cSGerd Hoffmann         if (vdev->dpy == NULL) {
3195c62a0c7cSGerd Hoffmann             error_setg(errp, "xres and yres properties require display=on");
3196c5478feaSDavid Gibson             goto out_deregister;
3197c62a0c7cSGerd Hoffmann         }
3198c62a0c7cSGerd Hoffmann         if (vdev->dpy->edid_regs == NULL) {
3199c62a0c7cSGerd Hoffmann             error_setg(errp, "xres and yres properties need edid support");
3200c5478feaSDavid Gibson             goto out_deregister;
3201c62a0c7cSGerd Hoffmann         }
3202c62a0c7cSGerd Hoffmann     }
3203a9994687SGerd Hoffmann 
320487417811SMarc-André Lureau     if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) {
320587417811SMarc-André Lureau         warn_report("x-ramfb-migrate=on but ramfb=off. "
320687417811SMarc-André Lureau                     "Forcing x-ramfb-migrate to off.");
320787417811SMarc-André Lureau         vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
320887417811SMarc-André Lureau     }
320987417811SMarc-André Lureau     if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
321087417811SMarc-André Lureau         if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) {
321187417811SMarc-André Lureau             vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
321287417811SMarc-André Lureau         } else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) {
321387417811SMarc-André Lureau             error_setg(errp, "x-ramfb-migrate requires enable-migration");
321487417811SMarc-André Lureau             goto out_deregister;
321587417811SMarc-André Lureau         }
321687417811SMarc-André Lureau     }
321787417811SMarc-André Lureau 
3218a2265105SKirti Wankhede     if (!pdev->failover_pair_id) {
3219d4a2af74SZhenzhong Duan         if (!vfio_migration_realize(vbasedev, errp)) {
32202b43b299SZhenzhong Duan             goto out_deregister;
3221a2265105SKirti Wankhede         }
3222a2265105SKirti Wankhede     }
3223a2265105SKirti Wankhede 
3224cf7087dbSKim Phillips     vfio_register_err_notifier(vdev);
322547cbe50cSAlex Williamson     vfio_register_req_notifier(vdev);
3226c9c50009SAlex Williamson     vfio_setup_resetfn_quirk(vdev);
3227cf7087dbSKim Phillips 
32281a22aca1SEric Auger     return;
3229cf7087dbSKim Phillips 
3230c5478feaSDavid Gibson out_deregister:
3231adee0da0SZhenzhong Duan     if (vdev->interrupt == VFIO_INT_INTx) {
3232adee0da0SZhenzhong Duan         vfio_intx_disable(vdev);
3233adee0da0SZhenzhong Duan     }
3234cf7087dbSKim Phillips     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3235357bd793SZhenzhong Duan     if (vdev->irqchip_change_notifier.notify) {
3236c5478feaSDavid Gibson         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3237357bd793SZhenzhong Duan     }
32380cc889c8SZhenzhong Duan     if (vdev->intx.mmap_timer) {
32390cc889c8SZhenzhong Duan         timer_free(vdev->intx.mmap_timer);
32400cc889c8SZhenzhong Duan     }
3241ee26474dSZhenzhong Duan out_unset_idev:
32429f176041SJoao Martins     if (!vbasedev->mdev) {
3243ee26474dSZhenzhong Duan         pci_device_unset_iommu_device(pdev);
32449f176041SJoao Martins     }
3245c5478feaSDavid Gibson out_teardown:
3246cf7087dbSKim Phillips     vfio_teardown_msi(vdev);
32472d82f8a3SAlex Williamson     vfio_bars_exit(vdev);
3248426ec904SEric Auger error:
32490d570a25SEric Auger     error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
325077a10d04SPaolo Bonzini }
325177a10d04SPaolo Bonzini 
/*
 * QOM instance_finalize callback for the vfio-pci type (wired up through
 * vfio_pci_dev_info.instance_finalize).
 *
 * Finalizes the display and BAR state, frees the emulated config bitmap
 * and the ROM buffer, and finally drops the underlying VFIO device.
 */
static void vfio_instance_finalize(Object *obj)
{
    VFIOPCIDevice *vpci = VFIO_PCI(obj);

    vfio_display_finalize(vpci);
    vfio_bars_finalize(vpci);

    g_free(vpci->emulated_config_bits);
    g_free(vpci->rom);

    /*
     * XXX vpci->igd_opregion is intentionally NOT freed here.  The fw_cfg
     * entry referring to it cannot be removed, so leaking the allocation
     * seems like the safest option.
     */

    vfio_pci_put_device(vpci);
}
3269cf7087dbSKim Phillips 
/*
 * PCIDeviceClass exit callback (installed as pdc->exit in
 * vfio_pci_dev_class_init).
 *
 * Unregisters the notifiers, disables interrupts, releases the INTx mmap
 * timer and tears down MSI, BAR, migration and IOMMU device state.  The
 * call order below is deliberate and must be kept.
 */
static void vfio_exitfn(PCIDevice *pdev)
{
    VFIOPCIDevice *vpci = VFIO_PCI(pdev);

    /* Stop device request / error notifications first. */
    vfio_unregister_req_notifier(vpci);
    vfio_unregister_err_notifier(vpci);

    /* Detach the INTx routing and irqchip change notifiers. */
    pci_device_set_intx_routing_notifier(&vpci->pdev, NULL);
    if (vpci->irqchip_change_notifier.notify != NULL) {
        kvm_irqchip_remove_change_notifier(&vpci->irqchip_change_notifier);
    }

    vfio_disable_interrupts(vpci);

    /* The timer only exists when realize created it. */
    if (vpci->intx.mmap_timer != NULL) {
        timer_free(vpci->intx.mmap_timer);
    }

    vfio_teardown_msi(vpci);
    vfio_pci_disable_rp_atomics(vpci);
    vfio_bars_exit(vpci);
    vfio_migration_exit(&vpci->vbasedev);

    /* Realize only sets an IOMMU device for non-mdev devices. */
    if (!vpci->vbasedev.mdev) {
        pci_device_unset_iommu_device(pdev);
    }
}
3293cf7087dbSKim Phillips 
vfio_pci_reset(DeviceState * dev)3294cf7087dbSKim Phillips static void vfio_pci_reset(DeviceState *dev)
3295cf7087dbSKim Phillips {
329601b46064SEduardo Habkost     VFIOPCIDevice *vdev = VFIO_PCI(dev);
3297cf7087dbSKim Phillips 
3298df92ee44SEric Auger     trace_vfio_pci_reset(vdev->vbasedev.name);
3299cf7087dbSKim Phillips 
3300cf7087dbSKim Phillips     vfio_pci_pre_reset(vdev);
3301cf7087dbSKim Phillips 
33028983e3e3STina Zhang     if (vdev->display != ON_OFF_AUTO_OFF) {
33038983e3e3STina Zhang         vfio_display_reset(vdev);
33048983e3e3STina Zhang     }
33058983e3e3STina Zhang 
33065655f931SAlex Williamson     if (vdev->resetfn && !vdev->resetfn(vdev)) {
33075655f931SAlex Williamson         goto post_reset;
33085655f931SAlex Williamson     }
33095655f931SAlex Williamson 
3310b47d8efaSEric Auger     if (vdev->vbasedev.reset_works &&
3311b47d8efaSEric Auger         (vdev->has_flr || !vdev->has_pm_reset) &&
33125546a621SEric Auger         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3313df92ee44SEric Auger         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3314cf7087dbSKim Phillips         goto post_reset;
3315cf7087dbSKim Phillips     }
3316cf7087dbSKim Phillips 
3317cf7087dbSKim Phillips     /* See if we can do our own bus reset */
3318cf7087dbSKim Phillips     if (!vfio_pci_hot_reset_one(vdev)) {
3319cf7087dbSKim Phillips         goto post_reset;
3320cf7087dbSKim Phillips     }
3321cf7087dbSKim Phillips 
3322cf7087dbSKim Phillips     /* If nothing else works and the device supports PM reset, use it */
3323b47d8efaSEric Auger     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
33245546a621SEric Auger         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3325df92ee44SEric Auger         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3326cf7087dbSKim Phillips         goto post_reset;
3327cf7087dbSKim Phillips     }
3328cf7087dbSKim Phillips 
3329cf7087dbSKim Phillips post_reset:
3330cf7087dbSKim Phillips     vfio_pci_post_reset(vdev);
3331cf7087dbSKim Phillips }
3332cf7087dbSKim Phillips 
/*
 * QOM instance_init callback for the vfio-pci type (wired up through
 * vfio_pci_dev_info.instance_init).
 *
 * Adds the "bootindex" property and seeds the fields that have no default
 * in vfio_pci_dev_properties before any user supplied property is applied.
 */
static void vfio_instance_init(Object *obj)
{
    VFIOPCIDevice *vpci = VFIO_PCI(obj);
    PCIDevice *pci = PCI_DEVICE(obj);

    device_add_bootindex_property(obj, &vpci->bootindex, "bootindex", NULL,
                                  &pci->qdev);

    /*
     * All-ones host address; presumably this marks "not specified" until
     * the "host" property is set -- confirm against the realize path.
     */
    vpci->host.domain = ~0U;
    vpci->host.bus = ~0U;
    vpci->host.slot = ~0U;
    vpci->host.function = ~0U;

    vfio_device_init(&vpci->vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
                     DEVICE(vpci), false);

    /*
     * "x-nv-gpudirect-clique" is declared without a default, so give it an
     * initial value here (0xFF presumably means "no clique" -- confirm).
     */
    vpci->nv_gpudirect_clique = 0xFF;

    /*
     * QEMU_PCI_CAP_EXPRESS does not depend on the QEMU command line, so
     * unlike other devices there is no need to defer it until realize.
     */
    pci->cap_present |= QEMU_PCI_CAP_EXPRESS;
}
3356cf7087dbSKim Phillips 
3357cf7087dbSKim Phillips static Property vfio_pci_dev_properties[] = {
33589ee27d73SEric Auger     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
33592dca1b37SMinwoo Im     DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
33607df9381bSAlex Williamson     DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
3361bb0990d1SKirti Wankhede     DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
3362bb0990d1SKirti Wankhede                             vbasedev.pre_copy_dirty_page_tracking,
3363bb0990d1SKirti Wankhede                             ON_OFF_AUTO_ON),
336430b91677SJoao Martins     DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
336530b91677SJoao Martins                             vbasedev.device_dirty_page_tracking,
336630b91677SJoao Martins                             ON_OFF_AUTO_ON),
3367a9994687SGerd Hoffmann     DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
33688151a9c5SAlex Williamson                             display, ON_OFF_AUTO_OFF),
3369c62a0c7cSGerd Hoffmann     DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
3370c62a0c7cSGerd Hoffmann     DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
33719ee27d73SEric Auger     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3372cf7087dbSKim Phillips                        intx.mmap_timeout, 1100),
33739ee27d73SEric Auger     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3374cf7087dbSKim Phillips                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
337547cbe50cSAlex Williamson     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
337647cbe50cSAlex Williamson                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
33776ced0bbaSAlex Williamson     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
33786ced0bbaSAlex Williamson                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
33798bbcb64aSAvihai Horon     DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
33808bbcb64aSAvihai Horon                             vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
33815e1f8905SAvihai Horon     DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
33825e1f8905SAvihai Horon                      vbasedev.migration_events, false),
33835e15d79bSAlex Williamson     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3384238e9172SAlex Williamson     DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
3385aff92b82SDavid Hildenbrand                      vbasedev.ram_block_discard_allowed, false),
338646746dbaSAlex Williamson     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
338746746dbaSAlex Williamson     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
338846746dbaSAlex Williamson     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3389db32d0f4SAlex Williamson     DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
3390db32d0f4SAlex Williamson                      no_geforce_quirks, false),
3391c958c51dSAlex Williamson     DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
3392c958c51dSAlex Williamson                      false),
33932b1dbd0dSAlex Williamson     DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
33942b1dbd0dSAlex Williamson                      false),
339589dcccc5SAlex Williamson     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
339689dcccc5SAlex Williamson     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
339789dcccc5SAlex Williamson     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
339889dcccc5SAlex Williamson                        sub_vendor_id, PCI_ANY_ID),
339989dcccc5SAlex Williamson     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
340089dcccc5SAlex Williamson                        sub_device_id, PCI_ANY_ID),
3401c4c45e94SAlex Williamson     DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
3402dfbee78dSAlex Williamson     DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
3403dfbee78dSAlex Williamson                                    nv_gpudirect_clique,
3404dfbee78dSAlex Williamson                                    qdev_prop_nv_gpudirect_clique, uint8_t),
340589d5202eSAlex Williamson     DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
340655872c70SMarkus Armbruster                                 OFF_AUTO_PCIBAR_OFF),
3407ee42b261SEric Auger #ifdef CONFIG_IOMMUFD
3408ee42b261SEric Auger     DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
3409ee42b261SEric Auger                      TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
3410ee42b261SEric Auger #endif
3411187716feSVinayak Kale     DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
3412cf7087dbSKim Phillips     DEFINE_PROP_END_OF_LIST(),
3413cf7087dbSKim Phillips };
3414cf7087dbSKim Phillips 
#ifdef CONFIG_IOMMUFD
/*
 * Setter for the string class property "fd" (registered in
 * vfio_pci_dev_class_init).  Forwards the string to vfio_device_set_fd()
 * for the embedded VFIODevice; errors are reported through @errp.
 */
static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
{
    VFIOPCIDevice *vpci = VFIO_PCI(obj);

    vfio_device_set_fd(&vpci->vbasedev, str, errp);
}
#endif
3421da3e04b2SZhenzhong Duan 
/*
 * Class initializer for the vfio-pci type: installs the PCI callbacks,
 * the legacy reset handler, the qdev property table and the class
 * description / category.
 */
static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(klass);
    DeviceClass *dev_class = DEVICE_CLASS(klass);

    /* PCI level callbacks */
    pci_class->realize = vfio_realize;
    pci_class->exit = vfio_exitfn;
    pci_class->config_read = vfio_pci_read_config;
    pci_class->config_write = vfio_pci_write_config;

    /* Generic device level setup */
    device_class_set_legacy_reset(dev_class, vfio_pci_reset);
    device_class_set_props(dev_class, vfio_pci_dev_properties);
#ifdef CONFIG_IOMMUFD
    /* Write-only "fd" property: no getter, vfio_pci_set_fd as setter */
    object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
#endif
    dev_class->desc = "VFIO-based PCI device assignment";
    set_bit(DEVICE_CATEGORY_MISC, dev_class->categories);
}
3439cf7087dbSKim Phillips 
3440cf7087dbSKim Phillips static const TypeInfo vfio_pci_dev_info = {
34412683ccd5SLi Qiang     .name = TYPE_VFIO_PCI,
3442cf7087dbSKim Phillips     .parent = TYPE_PCI_DEVICE,
34439ee27d73SEric Auger     .instance_size = sizeof(VFIOPCIDevice),
3444cf7087dbSKim Phillips     .class_init = vfio_pci_dev_class_init,
3445cf7087dbSKim Phillips     .instance_init = vfio_instance_init,
344677a10d04SPaolo Bonzini     .instance_finalize = vfio_instance_finalize,
3447a5fa336fSEduardo Habkost     .interfaces = (InterfaceInfo[]) {
3448a5fa336fSEduardo Habkost         { INTERFACE_PCIE_DEVICE },
3449a5fa336fSEduardo Habkost         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
3450a5fa336fSEduardo Habkost         { }
3451a5fa336fSEduardo Habkost     },
3452cf7087dbSKim Phillips };
3453cf7087dbSKim Phillips 
3454b290659fSGerd Hoffmann static Property vfio_pci_dev_nohotplug_properties[] = {
3455b290659fSGerd Hoffmann     DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
345687417811SMarc-André Lureau     DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate,
345787417811SMarc-André Lureau                             ON_OFF_AUTO_AUTO),
3458b290659fSGerd Hoffmann     DEFINE_PROP_END_OF_LIST(),
3459b290659fSGerd Hoffmann };
3460b290659fSGerd Hoffmann 
/*
 * Class initializer for the non-hotpluggable vfio-pci variant: marks the
 * class as not hotpluggable and installs
 * vfio_pci_dev_nohotplug_properties.
 */
static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dev_class = DEVICE_CLASS(klass);

    dev_class->hotpluggable = false;
    device_class_set_props(dev_class, vfio_pci_dev_nohotplug_properties);
}
3468b290659fSGerd Hoffmann 
3469b290659fSGerd Hoffmann static const TypeInfo vfio_pci_nohotplug_dev_info = {
3470f75ca627SChen Zhang     .name = TYPE_VFIO_PCI_NOHOTPLUG,
34710c0c8f8aSLi Qiang     .parent = TYPE_VFIO_PCI,
3472b290659fSGerd Hoffmann     .instance_size = sizeof(VFIOPCIDevice),
3473b290659fSGerd Hoffmann     .class_init = vfio_pci_nohotplug_dev_class_init,
3474b290659fSGerd Hoffmann };
3475b290659fSGerd Hoffmann 
register_vfio_pci_dev_type(void)3476cf7087dbSKim Phillips static void register_vfio_pci_dev_type(void)
3477cf7087dbSKim Phillips {
3478cf7087dbSKim Phillips     type_register_static(&vfio_pci_dev_info);
3479b290659fSGerd Hoffmann     type_register_static(&vfio_pci_nohotplug_dev_info);
3480cf7087dbSKim Phillips }
3481cf7087dbSKim Phillips 
3482cf7087dbSKim Phillips type_init(register_vfio_pci_dev_type)
3483