1cf7087dbSKim Phillips /*
2cf7087dbSKim Phillips * vfio based device assignment support
3cf7087dbSKim Phillips *
4cf7087dbSKim Phillips * Copyright Red Hat, Inc. 2012
5cf7087dbSKim Phillips *
6cf7087dbSKim Phillips * Authors:
7cf7087dbSKim Phillips * Alex Williamson <alex.williamson@redhat.com>
8cf7087dbSKim Phillips *
9cf7087dbSKim Phillips * This work is licensed under the terms of the GNU GPL, version 2. See
10cf7087dbSKim Phillips * the COPYING file in the top-level directory.
11cf7087dbSKim Phillips *
12cf7087dbSKim Phillips * Based on qemu-kvm device-assignment:
13cf7087dbSKim Phillips * Adapted for KVM by Qumranet.
14cf7087dbSKim Phillips * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15cf7087dbSKim Phillips * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16cf7087dbSKim Phillips * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17cf7087dbSKim Phillips * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18cf7087dbSKim Phillips * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19cf7087dbSKim Phillips */
20cf7087dbSKim Phillips
21c6eacb1aSPeter Maydell #include "qemu/osdep.h"
22ee42b261SEric Auger #include CONFIG_DEVICES /* CONFIG_IOMMUFD */
23cf7087dbSKim Phillips #include <linux/vfio.h>
24cf7087dbSKim Phillips #include <sys/ioctl.h>
25cf7087dbSKim Phillips
26650d103dSMarkus Armbruster #include "hw/hw.h"
27cf7087dbSKim Phillips #include "hw/pci/msi.h"
28cf7087dbSKim Phillips #include "hw/pci/msix.h"
290282abf0SAlex Williamson #include "hw/pci/pci_bridge.h"
30a27bd6c7SMarkus Armbruster #include "hw/qdev-properties.h"
31ce35e229SEduardo Habkost #include "hw/qdev-properties-system.h"
32d6454270SMarkus Armbruster #include "migration/vmstate.h"
33f3558b1bSKevin Wolf #include "qapi/qmp/qdict.h"
34cf7087dbSKim Phillips #include "qemu/error-report.h"
35db725815SMarkus Armbruster #include "qemu/main-loop.h"
360b8fa32fSMarkus Armbruster #include "qemu/module.h"
37cf7087dbSKim Phillips #include "qemu/range.h"
38e0255bb1SPhilippe Mathieu-Daudé #include "qemu/units.h"
39cf7087dbSKim Phillips #include "sysemu/kvm.h"
4054d31236SMarkus Armbruster #include "sysemu/runstate.h"
4178f33d2bSAlex Williamson #include "pci.h"
42cf7087dbSKim Phillips #include "trace.h"
431108b2f8SCao jin #include "qapi/error.h"
44f045a010SJens Freimann #include "migration/blocker.h"
45c5e2fb3cSKirti Wankhede #include "migration/qemu-file.h"
46ee42b261SEric Auger #include "sysemu/iommufd.h"
47cf7087dbSKim Phillips
48f75ca627SChen Zhang #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
490c0c8f8aSLi Qiang
50dc580d51SLongpeng(Mike) /* Protected by BQL */
51dc580d51SLongpeng(Mike) static KVMRouteChange vfio_route_change;
52dc580d51SLongpeng(Mike)
539ee27d73SEric Auger static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
549ee27d73SEric Auger static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
558ab217d5SLongpeng(Mike) static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
56cf7087dbSKim Phillips
/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead. We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead). If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance. This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps. Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode. We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled. Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
vfio_intx_mmap_enable(void * opaque)72cf7087dbSKim Phillips static void vfio_intx_mmap_enable(void *opaque)
73cf7087dbSKim Phillips {
749ee27d73SEric Auger VFIOPCIDevice *vdev = opaque;
75cf7087dbSKim Phillips
76cf7087dbSKim Phillips if (vdev->intx.pending) {
77cf7087dbSKim Phillips timer_mod(vdev->intx.mmap_timer,
78cf7087dbSKim Phillips qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
79cf7087dbSKim Phillips return;
80cf7087dbSKim Phillips }
81cf7087dbSKim Phillips
82cf7087dbSKim Phillips vfio_mmap_set_enabled(vdev, true);
83cf7087dbSKim Phillips }
84cf7087dbSKim Phillips
vfio_intx_interrupt(void * opaque)85cf7087dbSKim Phillips static void vfio_intx_interrupt(void *opaque)
86cf7087dbSKim Phillips {
879ee27d73SEric Auger VFIOPCIDevice *vdev = opaque;
88cf7087dbSKim Phillips
89cf7087dbSKim Phillips if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
90cf7087dbSKim Phillips return;
91cf7087dbSKim Phillips }
92cf7087dbSKim Phillips
93df92ee44SEric Auger trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
94cf7087dbSKim Phillips
95cf7087dbSKim Phillips vdev->intx.pending = true;
96cf7087dbSKim Phillips pci_irq_assert(&vdev->pdev);
97cf7087dbSKim Phillips vfio_mmap_set_enabled(vdev, false);
98cf7087dbSKim Phillips if (vdev->intx.mmap_timeout) {
99cf7087dbSKim Phillips timer_mod(vdev->intx.mmap_timer,
100cf7087dbSKim Phillips qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
101cf7087dbSKim Phillips }
102cf7087dbSKim Phillips }
103cf7087dbSKim Phillips
/*
 * End-of-interrupt for userspace INTx: de-assert the guest IRQ and unmask
 * the host interrupt so the device can signal again.  No-op when no
 * interrupt is pending.
 */
static void vfio_intx_eoi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    if (vdev->intx.pending) {
        trace_vfio_intx_eoi(vbasedev->name);
        vdev->intx.pending = false;
        pci_irq_deassert(&vdev->pdev);
        vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    }
}
118cf7087dbSKim Phillips
vfio_intx_enable_kvm(VFIOPCIDevice * vdev,Error ** errp)11944cd660aSZhenzhong Duan static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
120cf7087dbSKim Phillips {
121cf7087dbSKim Phillips #ifdef CONFIG_KVM
12297a37576SPeter Xu int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
123cf7087dbSKim Phillips
12446746dbaSAlex Williamson if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
125cf7087dbSKim Phillips vdev->intx.route.mode != PCI_INTX_ENABLED ||
126cf7087dbSKim Phillips !kvm_resamplefds_enabled()) {
12744cd660aSZhenzhong Duan return true;
128cf7087dbSKim Phillips }
129cf7087dbSKim Phillips
130cf7087dbSKim Phillips /* Get to a known interrupt state */
13197a37576SPeter Xu qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
1325546a621SEric Auger vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
133cf7087dbSKim Phillips vdev->intx.pending = false;
134cf7087dbSKim Phillips pci_irq_deassert(&vdev->pdev);
135cf7087dbSKim Phillips
136cf7087dbSKim Phillips /* Get an eventfd for resample/unmask */
137cf7087dbSKim Phillips if (event_notifier_init(&vdev->intx.unmask, 0)) {
1387dfb3424SEric Auger error_setg(errp, "event_notifier_init failed eoi");
139cf7087dbSKim Phillips goto fail;
140cf7087dbSKim Phillips }
141cf7087dbSKim Phillips
14297a37576SPeter Xu if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
14397a37576SPeter Xu &vdev->intx.interrupt,
14497a37576SPeter Xu &vdev->intx.unmask,
14597a37576SPeter Xu vdev->intx.route.irq)) {
1467dfb3424SEric Auger error_setg_errno(errp, errno, "failed to setup resample irqfd");
147cf7087dbSKim Phillips goto fail_irqfd;
148cf7087dbSKim Phillips }
149cf7087dbSKim Phillips
15084e37d02SZhenzhong Duan if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
151201a7331SEric Auger VFIO_IRQ_SET_ACTION_UNMASK,
15297a37576SPeter Xu event_notifier_get_fd(&vdev->intx.unmask),
153668f62ecSMarkus Armbruster errp)) {
154cf7087dbSKim Phillips goto fail_vfio;
155cf7087dbSKim Phillips }
156cf7087dbSKim Phillips
157cf7087dbSKim Phillips /* Let'em rip */
1585546a621SEric Auger vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
159cf7087dbSKim Phillips
160cf7087dbSKim Phillips vdev->intx.kvm_accel = true;
161cf7087dbSKim Phillips
162870cb6f1SAlex Williamson trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
163cf7087dbSKim Phillips
16444cd660aSZhenzhong Duan return true;
165cf7087dbSKim Phillips
166cf7087dbSKim Phillips fail_vfio:
16797a37576SPeter Xu kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
16897a37576SPeter Xu vdev->intx.route.irq);
169cf7087dbSKim Phillips fail_irqfd:
170cf7087dbSKim Phillips event_notifier_cleanup(&vdev->intx.unmask);
171cf7087dbSKim Phillips fail:
17297a37576SPeter Xu qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
1735546a621SEric Auger vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
17444cd660aSZhenzhong Duan return false;
17544cd660aSZhenzhong Duan #else
17644cd660aSZhenzhong Duan return true;
177cf7087dbSKim Phillips #endif
178cf7087dbSKim Phillips }
179cf7087dbSKim Phillips
/*
 * Tear down the KVM irqfd/resamplefd bypass and hand INTx handling back
 * to QEMU.  No-op when KVM acceleration is not active.
 */
static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
                                              vdev->intx.route.irq)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
                        vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
}
216cf7087dbSKim Phillips
/*
 * Adopt a new INTx-to-IRQ route: drop any KVM bypass for the old route,
 * record the new one, and re-establish the bypass if the route is active.
 */
static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
{
    Error *err = NULL;

    trace_vfio_intx_update(vdev->vbasedev.name,
                           vdev->intx.route.irq, route->irq);

    /* The old bypass targets the old GSI; tear it down first. */
    vfio_intx_disable_kvm(vdev);

    vdev->intx.route = *route;

    if (route->mode != PCI_INTX_ENABLED) {
        return;
    }

    /* Failure to re-enable KVM acceleration is only a warning. */
    if (!vfio_intx_enable_kvm(vdev, &err)) {
        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_intx_eoi(&vdev->vbasedev);
}
239cf7087dbSKim Phillips
/*
 * PCI core callback invoked when INTx routing may have changed; updates
 * the VFIO route only if the device is in INTx mode and the route differs.
 */
static void vfio_intx_routing_notifier(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    PCIINTxRoute route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
    if (pci_intx_route_changed(&vdev->intx.route, &route)) {
        vfio_intx_update(vdev, &route);
    }
}
255ad54dbd8SDavid Gibson
/*
 * irqchip-change notifier: re-apply the current INTx route so the KVM
 * bypass is rebuilt against the new irqchip.
 */
static void vfio_irqchip_change(Notifier *notify, void *data)
{
    VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
                                       irqchip_change_notifier);

    vfio_intx_update(vdev, &vdev->intx.route);
}
263c5478feaSDavid Gibson
vfio_intx_enable(VFIOPCIDevice * vdev,Error ** errp)264c32bab07SZhenzhong Duan static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
265cf7087dbSKim Phillips {
266cf7087dbSKim Phillips uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
2677dfb3424SEric Auger Error *err = NULL;
268201a7331SEric Auger int32_t fd;
269201a7331SEric Auger int ret;
270201a7331SEric Auger
271cf7087dbSKim Phillips
272cf7087dbSKim Phillips if (!pin) {
273c32bab07SZhenzhong Duan return true;
274cf7087dbSKim Phillips }
275cf7087dbSKim Phillips
276cf7087dbSKim Phillips vfio_disable_interrupts(vdev);
277cf7087dbSKim Phillips
278cf7087dbSKim Phillips vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
279cf7087dbSKim Phillips pci_config_set_interrupt_pin(vdev->pdev.config, pin);
280cf7087dbSKim Phillips
281cf7087dbSKim Phillips #ifdef CONFIG_KVM
282cf7087dbSKim Phillips /*
283cf7087dbSKim Phillips * Only conditional to avoid generating error messages on platforms
284cf7087dbSKim Phillips * where we won't actually use the result anyway.
285cf7087dbSKim Phillips */
286cf7087dbSKim Phillips if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
287cf7087dbSKim Phillips vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
288cf7087dbSKim Phillips vdev->intx.pin);
289cf7087dbSKim Phillips }
290cf7087dbSKim Phillips #endif
291cf7087dbSKim Phillips
292cf7087dbSKim Phillips ret = event_notifier_init(&vdev->intx.interrupt, 0);
293cf7087dbSKim Phillips if (ret) {
2947dfb3424SEric Auger error_setg_errno(errp, -ret, "event_notifier_init failed");
295c32bab07SZhenzhong Duan return false;
296cf7087dbSKim Phillips }
297201a7331SEric Auger fd = event_notifier_get_fd(&vdev->intx.interrupt);
298201a7331SEric Auger qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
299cf7087dbSKim Phillips
30084e37d02SZhenzhong Duan if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
301af175e85SMarkus Armbruster VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
302201a7331SEric Auger qemu_set_fd_handler(fd, NULL, NULL, vdev);
303cf7087dbSKim Phillips event_notifier_cleanup(&vdev->intx.interrupt);
304c32bab07SZhenzhong Duan return false;
305cf7087dbSKim Phillips }
306cf7087dbSKim Phillips
30744cd660aSZhenzhong Duan if (!vfio_intx_enable_kvm(vdev, &err)) {
308e1eb292aSMarkus Armbruster warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3097dfb3424SEric Auger }
310cf7087dbSKim Phillips
311cf7087dbSKim Phillips vdev->interrupt = VFIO_INT_INTx;
312cf7087dbSKim Phillips
313870cb6f1SAlex Williamson trace_vfio_intx_enable(vdev->vbasedev.name);
314c32bab07SZhenzhong Duan return true;
315cf7087dbSKim Phillips }
316cf7087dbSKim Phillips
/*
 * Leave INTx mode: stop the mmap timer, drop any KVM bypass, disable the
 * host interrupt, and release the eventfd.  BAR mmaps are re-enabled.
 */
static void vfio_intx_disable(VFIOPCIDevice *vdev)
{
    int fd;

    timer_del(vdev->intx.mmap_timer);
    vfio_intx_disable_kvm(vdev);
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, true);

    /* Detach the QEMU handler and free the interrupt eventfd. */
    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    trace_vfio_intx_disable(vdev->vbasedev.name);
}
336cf7087dbSKim Phillips
337cf7087dbSKim Phillips /*
338cf7087dbSKim Phillips * MSI/X
339cf7087dbSKim Phillips */
vfio_msi_interrupt(void * opaque)340cf7087dbSKim Phillips static void vfio_msi_interrupt(void *opaque)
341cf7087dbSKim Phillips {
342cf7087dbSKim Phillips VFIOMSIVector *vector = opaque;
3439ee27d73SEric Auger VFIOPCIDevice *vdev = vector->vdev;
3440de70dc7SAlex Williamson MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
3450de70dc7SAlex Williamson void (*notify)(PCIDevice *dev, unsigned vector);
3460de70dc7SAlex Williamson MSIMessage msg;
347cf7087dbSKim Phillips int nr = vector - vdev->msi_vectors;
348cf7087dbSKim Phillips
349cf7087dbSKim Phillips if (!event_notifier_test_and_clear(&vector->interrupt)) {
350cf7087dbSKim Phillips return;
351cf7087dbSKim Phillips }
352cf7087dbSKim Phillips
353cf7087dbSKim Phillips if (vdev->interrupt == VFIO_INT_MSIX) {
3540de70dc7SAlex Williamson get_msg = msix_get_message;
3550de70dc7SAlex Williamson notify = msix_notify;
35695239e16SAlex Williamson
35795239e16SAlex Williamson /* A masked vector firing needs to use the PBA, enable it */
35895239e16SAlex Williamson if (msix_is_masked(&vdev->pdev, nr)) {
35995239e16SAlex Williamson set_bit(nr, vdev->msix->pending);
36095239e16SAlex Williamson memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
36195239e16SAlex Williamson trace_vfio_msix_pba_enable(vdev->vbasedev.name);
36295239e16SAlex Williamson }
363cf7087dbSKim Phillips } else if (vdev->interrupt == VFIO_INT_MSI) {
3640de70dc7SAlex Williamson get_msg = msi_get_message;
3650de70dc7SAlex Williamson notify = msi_notify;
366cf7087dbSKim Phillips } else {
367cf7087dbSKim Phillips abort();
368cf7087dbSKim Phillips }
369cf7087dbSKim Phillips
3700de70dc7SAlex Williamson msg = get_msg(&vdev->pdev, nr);
371bc5baffaSAlexey Kardashevskiy trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
3720de70dc7SAlex Williamson notify(&vdev->pdev, nr);
373cf7087dbSKim Phillips }
374cf7087dbSKim Phillips
3755ebffa4eSJing Liu /*
3765ebffa4eSJing Liu * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid
3775ebffa4eSJing Liu * fd to kernel.
3785ebffa4eSJing Liu */
/*
 * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an
 * invalid fd to kernel.  Returns the ioctl result (0 on success).
 */
static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
{
    g_autofree struct vfio_irq_set *irq_set = NULL;
    int32_t *fd;
    int argsz = sizeof(*irq_set) + sizeof(*fd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    fd = (int32_t *)&irq_set->data;
    *fd = -1;   /* invalid fd: enable the index without a trigger */

    return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
}
4015ebffa4eSJing Liu
/*
 * Program the kernel with eventfds for all nr_vectors MSI/MSI-X vectors
 * in one VFIO_DEVICE_SET_IRQS call.  Unused vectors get fd -1.
 * Returns 0 on success or the negative ioctl result.
 */
static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int32_t *fds;
    int ret, i, argsz;

    /*
     * If dynamic MSI-X allocation is supported, the vectors to be allocated
     * and enabled can be scattered. Before kernel enabling MSI-X, setting
     * nr_vectors causes all these vectors to be allocated on host.
     *
     * To keep allocation as needed, use vector 0 with an invalid fd to get
     * MSI-X enabled first, then set vectors with a potentially sparse set of
     * eventfds to enable interrupts only when enabled in guest.
     */
    if (msix && !vdev->msix->noresize) {
        ret = vfio_enable_msix_no_vec(vdev);
        if (ret) {
            return ret;
        }
    }

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        int fd = -1;

        /*
         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
         * bits, therefore we always use the KVM signaling path when setup.
         * MSI-X mask and pending bits are emulated, so we want to use the
         * KVM signaling path only when configured and unmasked.
         */
        if (vector->use) {
            if (vector->virq >= 0 &&
                !(msix && msix_is_masked(&vdev->pdev, i))) {
                fd = event_notifier_get_fd(&vector->kvm_interrupt);
            } else {
                fd = event_notifier_get_fd(&vector->interrupt);
            }
        }

        fds[i] = fd;
    }

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}
462cf7087dbSKim Phillips
/*
 * Allocate a KVM MSI route for this vector within the pending route-change
 * transaction (vfio_route_change).  Skipped when KVM bypass is disabled
 * for the relevant interrupt type; vector->virq then stays -1.
 */
static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
                                  int vector_n, bool msix)
{
    bool bypass_disabled = msix ? vdev->no_kvm_msix : vdev->no_kvm_msi;

    if (bypass_disabled) {
        return;
    }

    vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change,
                                             vector_n, &vdev->pdev);
}
473dc580d51SLongpeng(Mike)
/*
 * Attach an irqfd to the vector's previously allocated KVM route so the
 * kernel can inject it directly.  On any failure the route is released
 * and virq reset to -1, leaving the vector on the userspace path.
 */
static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector)
{
    /* No route was allocated (KVM bypass disabled or allocation failed). */
    if (vector->virq < 0) {
        return;
    }

    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
        kvm_irqchip_release_virq(kvm_state, vector->virq);
        vector->virq = -1;
        return;
    }

    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                           NULL, vector->virq) < 0) {
        event_notifier_cleanup(&vector->kvm_interrupt);
        kvm_irqchip_release_virq(kvm_state, vector->virq);
        vector->virq = -1;
    }
}
497cf7087dbSKim Phillips
/*
 * Undo vfio_connect_kvm_msi_virq(): detach the irqfd, release the KVM
 * route, and free the kvm_interrupt eventfd.  Caller must ensure
 * vector->virq >= 0.
 */
static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
{
    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                          vector->virq);
    kvm_irqchip_release_virq(kvm_state, vector->virq);
    vector->virq = -1;
    event_notifier_cleanup(&vector->kvm_interrupt);
}
506cf7087dbSKim Phillips
/*
 * Rewrite the MSI message on an existing KVM route (e.g. after the guest
 * reprograms address/data) and commit the routing table immediately.
 */
static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
                                     PCIDevice *pdev)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
    kvm_irqchip_commit_routes(kvm_state);
}
513cf7087dbSKim Phillips
/*
 * Core MSI-X vector-use path, called when the guest enables/unmasks a
 * vector (msg != NULL) or when releasing reuses it (msg == NULL).
 * Sets up the per-vector eventfd and QEMU handler, attempts a KVM irqfd
 * bypass, grows nr_vectors as needed, and programs the kernel trigger.
 * Always returns 0 (the msix core ignores per-vector setup failures,
 * which are reported via error_report instead).
 */
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    VFIOMSIVector *vector;
    int ret;
    /*
     * Fix: the comparison already yields a bool; the original applied a
     * redundant double negation (!!) to it.
     */
    bool resizing = vdev->nr_vectors < nr + 1;

    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);

    vector = &vdev->msi_vectors[nr];

    /* First use of this vector: lazily initialize its eventfd. */
    if (!vector->use) {
        vector->vdev = vdev;
        vector->virq = -1;
        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }
        vector->use = true;
        msix_vector_use(pdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vector);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg, pdev);
        }
    } else {
        if (msg) {
            if (vdev->defer_kvm_irq_routing) {
                /* Batched: route committed/connected later by the caller. */
                vfio_add_kvm_msi_virq(vdev, vector, nr, true);
            } else {
                vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
                vfio_add_kvm_msi_virq(vdev, vector, nr, true);
                kvm_irqchip_commit_route_changes(&vfio_route_change);
                vfio_connect_kvm_msi_virq(vector);
            }
        }
    }

    /*
     * When dynamic allocation is not supported, we don't want to have the
     * host allocate all possible MSI vectors for a device if they're not
     * in use, so we shutdown and incrementally increase them as needed.
     * nr_vectors represents the total number of vectors allocated.
     *
     * When dynamic allocation is supported, let the host only allocate
     * and enable a vector when it is in use in guest. nr_vectors represents
     * the upper bound of vectors being enabled (but not all of the ranges
     * is allocated or enabled).
     */
    if (resizing) {
        vdev->nr_vectors = nr + 1;
    }

    if (!vdev->defer_kvm_irq_routing) {
        if (vdev->msix->noresize && resizing) {
            /* No dynamic resize: re-enable the whole index at the new size. */
            vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
            ret = vfio_enable_vectors(vdev, true);
            if (ret) {
                error_report("vfio: failed to enable vectors, %d", ret);
            }
        } else {
            Error *err = NULL;
            int32_t fd;

            /* Prefer the KVM bypass eventfd when a route is connected. */
            if (vector->virq >= 0) {
                fd = event_notifier_get_fd(&vector->kvm_interrupt);
            } else {
                fd = event_notifier_get_fd(&vector->interrupt);
            }

            if (!vfio_set_irq_signaling(&vdev->vbasedev,
                                        VFIO_PCI_MSIX_IRQ_INDEX, nr,
                                        VFIO_IRQ_SET_ACTION_TRIGGER, fd,
                                        &err)) {
                error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
            }
        }
    }

    /* Disable PBA emulation when nothing more is pending. */
    clear_bit(nr, vdev->msix->pending);
    if (find_first_bit(vdev->msix->pending,
                       vdev->nr_vectors) == vdev->nr_vectors) {
        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
    }

    return 0;
}
613cf7087dbSKim Phillips
/* msix core "vector use" hook: thin wrapper over the common do_use path. */
static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}
619cf7087dbSKim Phillips
/*
 * MSI-X vector-release notifier: the guest has masked or stopped using
 * vector @nr.  We keep any KVM irqfd setup in place and merely reroute
 * the VFIO trigger to the QEMU-owned eventfd.
 */
static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    Error *err = NULL;
    int32_t fd;

    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);

    /*
     * There are still old guests that mask and unmask vectors on every
     * interrupt. If we're using QEMU bypass with a KVM irqfd, leave all of
     * the KVM setup in place, simply switch VFIO to use the non-bypass
     * eventfd. We'll then fire the interrupt through QEMU and the MSI-X
     * core will mask the interrupt and set pending bits, allowing it to
     * be re-asserted on unmask. Nothing to do if already using QEMU mode.
     */
    if (vector->virq < 0) {
        return;
    }

    fd = event_notifier_get_fd(&vector->interrupt);
    if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
                                VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }
}
646cf7087dbSKim Phillips
/*
 * Enter deferred KVM IRQ-routing mode: route additions are accumulated in
 * vfio_route_change and applied in one batch by
 * vfio_commit_kvm_msi_virq_batch() instead of per vector.
 */
static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
    /* Batches must not nest; commit must run before preparing again. */
    assert(!vdev->defer_kvm_irq_routing);
    vdev->defer_kvm_irq_routing = true;
    vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
}
653dc580d51SLongpeng(Mike)
/*
 * Leave deferred KVM IRQ-routing mode: push the accumulated route changes
 * to KVM in a single commit, then hook up the irqfd for every vector that
 * obtained a route during the batch.
 */
static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
    int nr;

    assert(vdev->defer_kvm_irq_routing);
    vdev->defer_kvm_irq_routing = false;

    kvm_irqchip_commit_route_changes(&vfio_route_change);

    /* Connect each vector now that its route is actually in place. */
    for (nr = 0; nr < vdev->nr_vectors; nr++) {
        vfio_connect_kvm_msi_virq(&vdev->msi_vectors[nr]);
    }
}
667dc580d51SLongpeng(Mike)
/*
 * Switch the device into MSI-X mode.  Vector state is created lazily by
 * the vector-use notifier callbacks; KVM route updates are batched around
 * the notifier registration for performance.
 */
static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    int ret;

    vfio_disable_interrupts(vdev);

    /* One slot per possible MSI-X table entry, zero-initialized. */
    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);

    /* Set mode before notifiers fire so callbacks see MSI-X active. */
    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Setting vector notifiers triggers synchronous vector-use
     * callbacks for each active vector. Deferring to commit the KVM
     * routes once rather than per vector provides a substantial
     * performance improvement.
     */
    vfio_prepare_kvm_msi_virq_batch(vdev);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    vfio_commit_kvm_msi_virq_batch(vdev);

    if (vdev->nr_vectors) {
        /* At least one vector is in use; program them all at once. */
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        /*
         * Some communication channels between VF & PF or PF & fw rely on the
         * physical state of the device and expect that enabling MSI-X from the
         * guest enables the same on the host. When our guest is Linux, the
         * guest driver call to pci_enable_msix() sets the enabling bit in the
         * MSI-X capability, but leaves the vector table masked. We therefore
         * can't rely on a vector_use callback (from request_irq() in the guest)
         * to switch the physical device into MSI-X mode because that may come a
         * long time after pci_enable_msix(). This code sets vector 0 with an
         * invalid fd to make the physical device MSI-X enabled, but with no
         * vectors enabled, just like the guest view.
         */
        ret = vfio_enable_msix_no_vec(vdev);
        if (ret) {
            error_report("vfio: failed to enable MSI-X, %d", ret);
        }
    }

    trace_vfio_msix_enable(vdev->vbasedev.name);
}
719cf7087dbSKim Phillips
/*
 * Switch the device into MSI mode.  Unlike MSI-X, all allocated vectors
 * are set up eagerly.  If the host cannot support the requested vector
 * count, vfio_enable_vectors() returns the supported count (> 0) and we
 * retry with that; a negative result leaves interrupts disabled.
 */
static void vfio_msi_enable(VFIOPCIDevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    /*
     * Setting vector notifiers needs to enable route for each vector.
     * Deferring to commit the KVM routes once rather than per vector
     * provides a substantial performance improvement.
     */
    vfio_prepare_kvm_msi_virq_batch(vdev);

    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        vector->vdev = vdev;
        vector->virq = -1;          /* no KVM route yet */
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }

        /* Userspace fallback handler; bypassed once a KVM irqfd exists. */
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            vfio_msi_interrupt, NULL, vector);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vfio_add_kvm_msi_virq(vdev, vector, i, false);
    }

    vfio_commit_kvm_msi_virq_batch(vdev);

    /* Set interrupt type prior to possible interrupts */
    vdev->interrupt = VFIO_INT_MSI;

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else {
            /* ret > 0: host supports only 'ret' vectors; retry with that. */
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        /* Tear down everything built above before retrying or bailing. */
        vfio_msi_disable_common(vdev);

        if (ret > 0) {
            vdev->nr_vectors = ret;
            goto retry;
        }

        /*
         * Failing to setup MSI doesn't really fall within any specification.
         * Let's try leaving interrupts disabled and hope the guest figures
         * out to fall back to INTx for this device.
         */
        error_report("vfio: Error: Failed to enable MSI");

        return;
    }

    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
}
791cf7087dbSKim Phillips
/*
 * Tear down per-vector state shared by the MSI and MSI-X disable paths:
 * drop any KVM routes, detach userspace fd handlers, destroy the event
 * notifiers and release the vector array.
 */
static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        if (!vector->use) {
            continue;
        }
        if (vector->virq >= 0) {
            vfio_remove_kvm_msi_virq(vector);
        }
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            NULL, NULL, NULL);
        event_notifier_cleanup(&vector->interrupt);
    }

    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;
}
813cf7087dbSKim Phillips
/*
 * Leave MSI-X mode: unhook notifiers, release every in-use vector, clear
 * the VFIO MSI-X IRQ index on the host, free common state and fall back
 * to INTx.
 */
static void vfio_msix_disable(VFIOPCIDevice *vdev)
{
    Error *err = NULL;
    int i;

    msix_unset_vector_notifiers(&vdev->pdev);

    /*
     * MSI-X will only release vectors if MSI-X is still enabled on the
     * device, check through the rest and release it ourselves if necessary.
     */
    for (i = 0; i < vdev->nr_vectors; i++) {
        if (vdev->msi_vectors[i].use) {
            vfio_msix_vector_release(&vdev->pdev, i);
            msix_vector_unuse(&vdev->pdev, i);
        }
    }

    /*
     * Always clear MSI-X IRQ index. A PF device could have enabled
     * MSI-X with no vectors. See vfio_msix_enable().
     */
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);

    vfio_msi_disable_common(vdev);
    if (!vfio_intx_enable(vdev, &err)) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    /* Clear any stale PBA emulation state left from the MSI-X session. */
    memset(vdev->msix->pending, 0,
           BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));

    trace_vfio_msix_disable(vdev->vbasedev.name);
}
848cf7087dbSKim Phillips
/*
 * Leave MSI mode: clear the VFIO MSI IRQ index on the host, free common
 * vector state and fall back to INTx.
 *
 * Consistency fix: check vfio_intx_enable()'s bool return directly, as
 * vfio_msix_disable() already does, rather than testing 'err' afterwards.
 */
static void vfio_msi_disable(VFIOPCIDevice *vdev)
{
    Error *err = NULL;

    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_msi_disable_common(vdev);
    if (!vfio_intx_enable(vdev, &err)) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    trace_vfio_msi_disable(vdev->vbasedev.name);
}
862cf7087dbSKim Phillips
/*
 * Re-read the guest-programmed MSI message for every active vector and
 * refresh the corresponding KVM route.  Vectors without a KVM route
 * (virq < 0) are handled in userspace and need no update.
 */
static void vfio_update_msi(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        if (vector->use && vector->virq >= 0) {
            MSIMessage msg = msi_get_message(&vdev->pdev, i);

            vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
        }
    }
}
879cf7087dbSKim Phillips
/*
 * Read the device's expansion ROM from the VFIO ROM region into a host
 * buffer (vdev->rom).  On failure, rom_read_failed is set so the guest
 * read path does not retry forever.
 */
static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
{
    g_autofree struct vfio_region_info *reg_info = NULL;
    uint64_t size;
    off_t off = 0;
    ssize_t bytes;

    if (vfio_get_region_info(&vdev->vbasedev,
                             VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
        error_report("vfio: Error getting ROM info: %m");
        return;
    }

    trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
                            (unsigned long)reg_info->offset,
                            (unsigned long)reg_info->flags);

    vdev->rom_size = size = reg_info->size;
    vdev->rom_offset = reg_info->offset;

    if (!vdev->rom_size) {
        vdev->rom_read_failed = true;
        error_report("vfio-pci: Cannot read device rom at "
                     "%s", vdev->vbasedev.name);
        error_printf("Device option ROM contents are probably invalid "
                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
                     "or load from file with romfile=\n");
        return;
    }

    /* Fill with 0xff so unread tail bytes look like unbacked ROM space. */
    vdev->rom = g_malloc(size);
    memset(vdev->rom, 0xff, size);

    /* pread loop: retry on EINTR/EAGAIN, stop on EOF or hard error. */
    while (size) {
        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
                      size, vdev->rom_offset + off);
        if (bytes == 0) {
            break;
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            break;
        }
    }

    /*
     * Test the ROM signature against our device, if the vendor is correct
     * but the device ID doesn't match, store the correct device ID and
     * recompute the checksum. Intel IGD devices need this and are known
     * to have bogus checksums so we can't simply adjust the checksum.
     */
    if (pci_get_word(vdev->rom) == 0xaa55 &&
        pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
        !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
        /* Word at 0x18 points to the PCI data structure ("PCIR" block). */
        uint16_t vid, did;

        vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
        did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);

        if (vid == vdev->vendor_id && did != vdev->device_id) {
            int i;
            uint8_t csum, *data = vdev->rom;

            pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
                         vdev->device_id);
            /* Zero the checksum byte, sum the image, then set it so the
             * bytes sum to zero (mod 256). */
            data[6] = 0;

            for (csum = 0, i = 0; i < vdev->rom_size; i++) {
                csum += data[i];
            }

            data[6] = -csum;
        }
    }
}
960cf7087dbSKim Phillips
vfio_rom_read(void * opaque,hwaddr addr,unsigned size)961cf7087dbSKim Phillips static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
962cf7087dbSKim Phillips {
9639ee27d73SEric Auger VFIOPCIDevice *vdev = opaque;
964cf7087dbSKim Phillips union {
965cf7087dbSKim Phillips uint8_t byte;
966cf7087dbSKim Phillips uint16_t word;
967cf7087dbSKim Phillips uint32_t dword;
968cf7087dbSKim Phillips uint64_t qword;
969cf7087dbSKim Phillips } val;
970cf7087dbSKim Phillips uint64_t data = 0;
971cf7087dbSKim Phillips
972cf7087dbSKim Phillips /* Load the ROM lazily when the guest tries to read it */
973cf7087dbSKim Phillips if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
974cf7087dbSKim Phillips vfio_pci_load_rom(vdev);
975cf7087dbSKim Phillips }
976cf7087dbSKim Phillips
977cf7087dbSKim Phillips memcpy(&val, vdev->rom + addr,
978cf7087dbSKim Phillips (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
979cf7087dbSKim Phillips
980cf7087dbSKim Phillips switch (size) {
981cf7087dbSKim Phillips case 1:
982cf7087dbSKim Phillips data = val.byte;
983cf7087dbSKim Phillips break;
984cf7087dbSKim Phillips case 2:
985cf7087dbSKim Phillips data = le16_to_cpu(val.word);
986cf7087dbSKim Phillips break;
987cf7087dbSKim Phillips case 4:
988cf7087dbSKim Phillips data = le32_to_cpu(val.dword);
989cf7087dbSKim Phillips break;
990cf7087dbSKim Phillips default:
991cf7087dbSKim Phillips hw_error("vfio: unsupported read size, %d bytes\n", size);
992cf7087dbSKim Phillips break;
993cf7087dbSKim Phillips }
994cf7087dbSKim Phillips
995df92ee44SEric Auger trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
996cf7087dbSKim Phillips
997cf7087dbSKim Phillips return data;
998cf7087dbSKim Phillips }
999cf7087dbSKim Phillips
/* The emulated ROM BAR is read-only; guest writes are silently dropped. */
static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}
1004cf7087dbSKim Phillips
/* MemoryRegion callbacks backing the emulated expansion ROM BAR. */
static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1010cf7087dbSKim Phillips
/*
 * Size and register the emulated ROM BAR.  The size is probed by writing
 * the all-ones mask to the physical ROM BAR register and reading it back
 * (standard PCI BAR sizing), then the original value is restored.  ROM
 * contents are supplied lazily via vfio_rom_ops.
 */
static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
{
    uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    DeviceState *dev = DEVICE(vdev);
    char *name;
    int fd = vdev->vbasedev.fd;

    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
        /* Since pci handles romfile, just print a message and return */
        if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) {
            warn_report("Device at %s is known to cause system instability"
                        " issues during option rom execution",
                        vdev->vbasedev.name);
            error_printf("Proceeding anyway since user specified romfile\n");
        }
        return;
    }

    /*
     * Use the same size ROM BAR as the physical device. The contents
     * will get filled in later when the guest tries to read it.
     */
    if (pread(fd, &orig, 4, offset) != 4 ||
        pwrite(fd, &size, 4, offset) != 4 ||
        pread(fd, &size, 4, offset) != 4 ||
        pwrite(fd, &orig, 4, offset) != 4) {
        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
        return;
    }

    /* Decode the BAR sizing mask into a byte count. */
    size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;

    if (!size) {
        return;
    }

    if (vfio_opt_rom_in_denylist(vdev)) {
        /* Honor an explicit user rombar= request, otherwise refuse. */
        if (dev->opts && qdict_haskey(dev->opts, "rombar")) {
            warn_report("Device at %s is known to cause system instability"
                        " issues during option rom execution",
                        vdev->vbasedev.name);
            error_printf("Proceeding anyway since user specified"
                         " non zero value for rombar\n");
        } else {
            warn_report("Rom loading for device at %s has been disabled"
                        " due to system instability issues",
                        vdev->vbasedev.name);
            error_printf("Specify rombar=1 or romfile to force\n");
            return;
        }
    }

    trace_vfio_pci_size_rom(vdev->vbasedev.name, size);

    name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);

    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
                          &vfio_rom_ops, vdev, name, size);
    g_free(name);

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);

    vdev->rom_read_failed = false;
}
1077cf7087dbSKim Phillips
/*
 * Guest write to a VFIO VGA region: marshal the value into a
 * little-endian buffer of the access width and forward it to the host
 * device via pwrite() at the region's file offset.
 */
void vfio_vga_write(void *opaque, hwaddr addr,
                    uint64_t data, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    off_t offset = vga->fd_offset + region->offset + addr;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    if (size == 1) {
        buf.byte = data;
    } else if (size == 2) {
        buf.word = cpu_to_le16(data);
    } else if (size == 4) {
        buf.dword = cpu_to_le32(data);
    } else {
        hw_error("vfio: unsupported write size, %d bytes", size);
    }

    if (pwrite(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, region->offset + addr, data, size);
    }

    trace_vfio_vga_write(region->offset + addr, data, size);
}
1113cf7087dbSKim Phillips
vfio_vga_read(void * opaque,hwaddr addr,unsigned size)1114c00d61d8SAlex Williamson uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1115cf7087dbSKim Phillips {
1116cf7087dbSKim Phillips VFIOVGARegion *region = opaque;
1117cf7087dbSKim Phillips VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1118cf7087dbSKim Phillips union {
1119cf7087dbSKim Phillips uint8_t byte;
1120cf7087dbSKim Phillips uint16_t word;
1121cf7087dbSKim Phillips uint32_t dword;
1122cf7087dbSKim Phillips uint64_t qword;
1123cf7087dbSKim Phillips } buf;
1124cf7087dbSKim Phillips uint64_t data = 0;
1125cf7087dbSKim Phillips off_t offset = vga->fd_offset + region->offset + addr;
1126cf7087dbSKim Phillips
1127cf7087dbSKim Phillips if (pread(vga->fd, &buf, size, offset) != size) {
1128cf7087dbSKim Phillips error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1129cf7087dbSKim Phillips __func__, region->offset + addr, size);
1130cf7087dbSKim Phillips return (uint64_t)-1;
1131cf7087dbSKim Phillips }
1132cf7087dbSKim Phillips
1133cf7087dbSKim Phillips switch (size) {
1134cf7087dbSKim Phillips case 1:
1135cf7087dbSKim Phillips data = buf.byte;
1136cf7087dbSKim Phillips break;
1137cf7087dbSKim Phillips case 2:
1138cf7087dbSKim Phillips data = le16_to_cpu(buf.word);
1139cf7087dbSKim Phillips break;
1140cf7087dbSKim Phillips case 4:
1141cf7087dbSKim Phillips data = le32_to_cpu(buf.dword);
1142cf7087dbSKim Phillips break;
1143cf7087dbSKim Phillips default:
1144cf7087dbSKim Phillips hw_error("vfio: unsupported read size, %d bytes", size);
1145cf7087dbSKim Phillips break;
1146cf7087dbSKim Phillips }
1147cf7087dbSKim Phillips
1148cf7087dbSKim Phillips trace_vfio_vga_read(region->offset + addr, size, data);
1149cf7087dbSKim Phillips
1150cf7087dbSKim Phillips return data;
1151cf7087dbSKim Phillips }
1152cf7087dbSKim Phillips
/* MemoryRegion callbacks for VFIO VGA regions. */
static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1158cf7087dbSKim Phillips
1159cf7087dbSKim Phillips /*
116095251725SYongji Xie * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page
116195251725SYongji Xie * size if the BAR is in an exclusive page in host so that we could map
116295251725SYongji Xie * this BAR to guest. But this sub-page BAR may not occupy an exclusive
116395251725SYongji Xie * page in guest. So we should set the priority of the expanded memory
116495251725SYongji Xie * region to zero in case of overlap with BARs which share the same page
116595251725SYongji Xie * with the sub-page BAR in guest. Besides, we should also recover the
116695251725SYongji Xie * size of this sub-page BAR when its base address is changed in guest
116795251725SYongji Xie * and not page aligned any more.
116895251725SYongji Xie */
static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    VFIORegion *region = &vdev->bars[bar].region;
    MemoryRegion *mmap_mr, *region_mr, *base_mr;
    PCIIORegion *r;
    pcibus_t bar_addr;
    uint64_t size = region->size;

    /* Make sure that the whole region is allowed to be mmapped */
    if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
        region->mmaps[0].size != region->size) {
        return;
    }

    r = &pdev->io_regions[bar];
    bar_addr = r->addr;
    base_mr = vdev->bars[bar].mr;
    region_mr = region->mem;
    mmap_mr = &region->mmaps[0].mem;

    /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
    if (bar_addr != PCI_BAR_UNMAPPED &&
        !(bar_addr & ~qemu_real_host_page_mask())) {
        size = qemu_real_host_page_size();
    }

    /* Batch the resizes/remap so guests never observe a partial state. */
    memory_region_transaction_begin();

    /* Grow the base container if needed; never shrink it below 'size'. */
    if (vdev->bars[bar].size < size) {
        memory_region_set_size(base_mr, size);
    }
    memory_region_set_size(region_mr, size);
    memory_region_set_size(mmap_mr, size);
    if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
        /*
         * Re-add with priority 0 (overlap) since the expanded region may
         * now collide with other BARs sharing the same guest page; see
         * the comment above this function.
         */
        memory_region_del_subregion(r->address_space, base_mr);
        memory_region_add_subregion_overlap(r->address_space,
                                            bar_addr, base_mr, 0);
    }

    memory_region_transaction_commit();
}
121195251725SYongji Xie
121295251725SYongji Xie /*
1213cf7087dbSKim Phillips * PCI config space
1214cf7087dbSKim Phillips */
/*
 * Guest config-space read: merge QEMU-emulated bits (per the
 * emulated_config_bits bitmap) with bits read from the physical device
 * through the VFIO config region.
 */
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;

    /* Bitmap of which bits in this range are emulated by QEMU. */
    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
    emu_bits = le32_to_cpu(emu_bits);

    if (emu_bits) {
        emu_val = pci_default_read_config(pdev, addr, len);
    }

    /* Any non-emulated bits within the access width? Read the device. */
    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
        ssize_t ret;

        ret = pread(vdev->vbasedev.fd, &phys_val, len,
                    vdev->config_offset + addr);
        if (ret != len) {
            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
                         __func__, vdev->vbasedev.name, addr, len);
            /*
             * NOTE(review): -errno is returned through a uint32_t; callers
             * receive it as config data rather than an error indication.
             */
            return -errno;
        }
        phys_val = le32_to_cpu(phys_val);
    }

    val = (emu_val & emu_bits) | (phys_val & ~emu_bits);

    trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);

    return val;
}
1246cf7087dbSKim Phillips
/*
 * Config space write: the write is always handed to the kernel vfio driver
 * first (which filters what the device may actually see), then mirrored
 * into QEMU's emulated config space.  Writes touching the MSI/MSI-X
 * capabilities or the BARs/COMMAND register get extra handling for their
 * side effects.
 */
void vfio_pci_write_config(PCIDevice *pdev,
                           uint32_t addr, uint32_t val, int len)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    uint32_t val_le = cpu_to_le32(val);

    trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);

    /* Write everything to VFIO, let it filter out what we can't write */
    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
        != len) {
        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
                     __func__, vdev->vbasedev.name, addr, val, len);
    }

    /* MSI/MSI-X Enabling/Disabling */
    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        int is_enabled, was_enabled = msi_enabled(pdev);

        /* Update emulated state before sampling the new enable bit */
        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msi_enabled(pdev);

        if (!was_enabled) {
            if (is_enabled) {
                vfio_msi_enable(vdev);
            }
        } else {
            if (!is_enabled) {
                vfio_msi_disable(vdev);
            } else {
                /* Still enabled: vector count/addresses may have changed */
                vfio_update_msi(vdev);
            }
        }
    } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        int is_enabled, was_enabled = msix_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msix_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_msix_enable(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_msix_disable(vdev);
        }
    } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
               range_covers_byte(addr, len, PCI_COMMAND)) {
        /*
         * BAR programming (or COMMAND toggling memory/IO decode): detect
         * which BARs moved so sub-page BAR mappings can be refreshed.
         */
        pcibus_t old_addr[PCI_NUM_REGIONS - 1];
        int bar;

        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
            old_addr[bar] = pdev->io_regions[bar].addr;
        }

        pci_default_write_config(pdev, addr, val, len);

        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
            /* Only sub-host-page sized BARs need the mapping fixup */
            if (old_addr[bar] != pdev->io_regions[bar].addr &&
                vdev->bars[bar].region.size > 0 &&
                vdev->bars[bar].region.size < qemu_real_host_page_size()) {
                vfio_sub_page_bar_update_mapping(pdev, bar);
            }
        }
    } else {
        /* Write everything to QEMU to keep emulated bits correct */
        pci_default_write_config(pdev, addr, val, len);
    }
}
1318cf7087dbSKim Phillips
1319cf7087dbSKim Phillips /*
1320cf7087dbSKim Phillips * Interrupt setup
1321cf7087dbSKim Phillips */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
{
    /*
     * Subtle ordering: disabling MSI/X can transition the device back to
     * INTx mode (when supported), so MSI/X must be torn down first and
     * INTx cleaned up afterwards.
     */
    switch (vdev->interrupt) {
    case VFIO_INT_MSIX:
        vfio_msix_disable(vdev);
        break;
    case VFIO_INT_MSI:
        vfio_msi_disable(vdev);
        break;
    default:
        break;
    }

    /* Re-check: the disable above may have left us in INTx mode */
    if (vdev->interrupt == VFIO_INT_INTx) {
        vfio_intx_disable(vdev);
    }
}
1339cf7087dbSKim Phillips
vfio_msi_setup(VFIOPCIDevice * vdev,int pos,Error ** errp)1340b771a40fSZhenzhong Duan static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1341cf7087dbSKim Phillips {
1342cf7087dbSKim Phillips uint16_t ctrl;
1343cf7087dbSKim Phillips bool msi_64bit, msi_maskbit;
1344cf7087dbSKim Phillips int ret, entries;
13451108b2f8SCao jin Error *err = NULL;
1346cf7087dbSKim Phillips
13475546a621SEric Auger if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
1348cf7087dbSKim Phillips vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
13497ef165b9SEric Auger error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
1350b771a40fSZhenzhong Duan return false;
1351cf7087dbSKim Phillips }
1352cf7087dbSKim Phillips ctrl = le16_to_cpu(ctrl);
1353cf7087dbSKim Phillips
1354cf7087dbSKim Phillips msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1355cf7087dbSKim Phillips msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1356cf7087dbSKim Phillips entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1357cf7087dbSKim Phillips
13580de70dc7SAlex Williamson trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1359cf7087dbSKim Phillips
13601108b2f8SCao jin ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
1361cf7087dbSKim Phillips if (ret < 0) {
1362cf7087dbSKim Phillips if (ret == -ENOTSUP) {
1363b771a40fSZhenzhong Duan return true;
1364cf7087dbSKim Phillips }
13654b576648SMarkus Armbruster error_propagate_prepend(errp, err, "msi_init failed: ");
1366b771a40fSZhenzhong Duan return false;
1367cf7087dbSKim Phillips }
1368cf7087dbSKim Phillips vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
1369cf7087dbSKim Phillips
1370b771a40fSZhenzhong Duan return true;
1371cf7087dbSKim Phillips }
1372cf7087dbSKim Phillips
/*
 * The MSI-X table area of a BAR cannot be mmap'd through vfio, so trim the
 * region's mmap ranges such that only the host pages outside of the MSI-X
 * table remain directly mapped.  Depending on where the table sits, this
 * drops the mmap entirely, shrinks it from one end, or splits it in two.
 */
static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
{
    off_t start, end;
    VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;

    /*
     * If the host driver allows mapping of MSI-X data, map the entire BAR
     * and emulate the MSI-X table on top of it.
     */
    if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
                            VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
        return;
    }

    /*
     * We expect to find a single mmap covering the whole BAR, anything else
     * means it's either unsupported or already setup.
     */
    if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
        region->size != region->mmaps[0].size) {
        return;
    }

    /* MSI-X table start and end aligned to host page size */
    start = vdev->msix->table_offset & qemu_real_host_page_mask();
    end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
                               (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

    /*
     * Does the MSI-X table cover the beginning of the BAR? The whole BAR?
     * NB - Host page size is necessarily a power of two and so is the PCI
     * BAR (not counting EA yet), therefore if we have host page aligned
     * @start and @end, then any remainder of the BAR before or after those
     * must be at least host page sized and therefore mmap'able.
     */
    if (!start) {
        if (end >= region->size) {
            /* Table covers the whole BAR: nothing left to mmap */
            region->nr_mmaps = 0;
            g_free(region->mmaps);
            region->mmaps = NULL;
            trace_vfio_msix_fixup(vdev->vbasedev.name,
                                  vdev->msix->table_bar, 0, 0);
        } else {
            /* Table at the start: mmap only the tail of the BAR */
            region->mmaps[0].offset = end;
            region->mmaps[0].size = region->size - end;
            trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);
        }

        /* Maybe it's aligned at the end of the BAR */
    } else if (end >= region->size) {
        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);

        /* Otherwise it must split the BAR */
    } else {
        region->nr_mmaps = 2;
        region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);

        memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));

        region->mmaps[0].size = start;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[0].offset,
                              region->mmaps[0].offset + region->mmaps[0].size);

        region->mmaps[1].offset = end;
        region->mmaps[1].size = region->size - end;
        trace_vfio_msix_fixup(vdev->vbasedev.name,
                              vdev->msix->table_bar, region->mmaps[1].offset,
                              region->mmaps[1].offset + region->mmaps[1].size);
    }
}
1449db0da029SAlex Williamson
/*
 * Optionally relocate the MSI-X structures (table and PBA) into another or
 * a brand-new BAR, per the vdev->msix_relo policy.  Relocation avoids the
 * performance penalty of emulating registers that share host pages with
 * the MSI-X table.  Returns true on success or when no relocation is
 * requested/needed; false with @errp set otherwise.
 */
static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
{
    int target_bar = -1;
    size_t msix_sz;

    /* Nothing to do without MSI-X, or when relocation is disabled */
    if (!vdev->msix || vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
        return true;
    }

    /* The actual minimum size of MSI-X structures */
    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
    /* Round up to host pages, we don't want to share a page */
    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
    /* PCI BARs must be a power of 2 */
    msix_sz = pow2ceil(msix_sz);

    if (vdev->msix_relo == OFF_AUTO_PCIBAR_AUTO) {
        /*
         * TODO: Lookup table for known devices.
         *
         * Logically we might use an algorithm here to select the BAR adding
         * the least additional MMIO space, but we cannot programmatically
         * predict the driver dependency on BAR ordering or sizing, therefore
         * 'auto' becomes a lookup for combinations reported to work.
         */
        if (target_bar < 0) {
            error_setg(errp, "No automatic MSI-X relocation available for "
                       "device %04x:%04x", vdev->vendor_id, vdev->device_id);
            return false;
        }
    } else {
        /* Explicit policy: BAR index encoded in the enum value */
        target_bar = (int)(vdev->msix_relo - OFF_AUTO_PCIBAR_BAR0);
    }

    /* I/O port BARs cannot host MSI-X structures */
    if (vdev->bars[target_bar].ioport) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "I/O port BAR", target_bar);
        return false;
    }

    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
    if (!vdev->bars[target_bar].size &&
         target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
        return false;
    }

    /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
    if (vdev->bars[target_bar].size > 1 * GiB &&
        !vdev->bars[target_bar].mem64) {
        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
                   "no space to extend 32-bit BAR", target_bar);
        return false;
    }

    /*
     * If adding a new BAR, test if we can make it 64bit.  We make it
     * prefetchable since QEMU MSI-X emulation has no read side effects
     * and doing so makes mapping more flexible.
     */
    if (!vdev->bars[target_bar].size) {
        if (target_bar < (PCI_ROM_SLOT - 1) &&
            !vdev->bars[target_bar + 1].size) {
            vdev->bars[target_bar].mem64 = true;
            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
        }
        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
        vdev->bars[target_bar].size = msix_sz;
        vdev->msix->table_offset = 0;
    } else {
        /* Existing BAR: double it (at least) to fit the MSI-X structures */
        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
                                          msix_sz * 2);
        /*
         * Due to above size calc, MSI-X always starts halfway into the BAR,
         * which will always be a separate host page.
         */
        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
    }

    vdev->msix->table_bar = target_bar;
    vdev->msix->pba_bar = target_bar;
    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
    vdev->msix->pba_offset = vdev->msix->table_offset +
                             (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);

    trace_vfio_msix_relo(vdev->vbasedev.name,
                         vdev->msix->table_bar, vdev->msix->table_offset);
    return true;
}
154289d5202eSAlex Williamson
1543cf7087dbSKim Phillips /*
1544cf7087dbSKim Phillips * We don't have any control over how pci_add_capability() inserts
1545cf7087dbSKim Phillips * capabilities into the chain. In order to setup MSI-X we need a
1546cf7087dbSKim Phillips * MemoryRegion for the BAR. In order to setup the BAR and not
1547cf7087dbSKim Phillips * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1548cf7087dbSKim Phillips * need to first look for where the MSI-X table lives. So we
1549cf7087dbSKim Phillips * unfortunately split MSI-X setup across two functions.
1550cf7087dbSKim Phillips */
static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
{
    uint8_t pos;
    uint16_t ctrl;
    uint32_t table, pba;
    int ret, fd = vdev->vbasedev.fd;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
                                      .index = VFIO_PCI_MSIX_IRQ_INDEX };
    VFIOMSIXInfo *msix;

    /* No MSI-X capability means nothing to set up; not an error */
    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
    if (!pos) {
        return true;
    }

    /* Read the raw capability registers through the vfio config region */
    if (pread(fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
        return false;
    }

    if (pread(fd, &table, sizeof(table),
              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
        return false;
    }

    if (pread(fd, &pba, sizeof(pba),
              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
        error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
        return false;
    }

    /* PCI config space is little-endian on the wire */
    ctrl = le16_to_cpu(ctrl);
    table = le32_to_cpu(table);
    pba = le32_to_cpu(pba);

    /* Decode BAR indicator (BIR) and offset for both table and PBA */
    msix = g_malloc0(sizeof(*msix));
    msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
    msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
    msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
    msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
    msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
        g_free(msix);
        return false;
    }

    /* NORESIZE: the kernel cannot grow the vector set once enabled */
    msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);

    /*
     * Test the size of the pba_offset variable and catch if it extends outside
     * of the specified BAR. If it is the case, we need to apply a hardware
     * specific quirk if the device is known or we have a broken configuration.
     */
    if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
        /*
         * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
         * adapters. The T5 hardware returns an incorrect value of 0x8000 for
         * the VF PBA offset while the BAR itself is only 8k. The correct value
         * is 0x1000, so we hard code that here.
         */
        if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
            (vdev->device_id & 0xff00) == 0x5800) {
            msix->pba_offset = 0x1000;
        /*
         * BAIDU KUNLUN Virtual Function devices for KUNLUN AI processor
         * return an incorrect value of 0x460000 for the VF PBA offset while
         * the BAR itself is only 0x10000. The correct value is 0xb400.
         */
        } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU,
                               PCI_DEVICE_ID_KUNLUN_VF)) {
            msix->pba_offset = 0xb400;
        } else if (vdev->msix_relo == OFF_AUTO_PCIBAR_OFF) {
            /* Unknown device and relocation disabled: refuse to continue */
            error_setg(errp, "hardware reports invalid configuration, "
                       "MSIX PBA outside of specified BAR");
            g_free(msix);
            return false;
        }
    }

    trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
                                msix->table_offset, msix->entries,
                                msix->noresize);
    vdev->msix = msix;

    /* Adjust mmaps around the table, then apply any relocation policy */
    vfio_pci_fixup_msix_region(vdev);

    return vfio_pci_relocate_msix(vdev, errp);
}
1644cf7087dbSKim Phillips
vfio_msix_setup(VFIOPCIDevice * vdev,int pos,Error ** errp)1645b771a40fSZhenzhong Duan static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1646cf7087dbSKim Phillips {
1647cf7087dbSKim Phillips int ret;
1648ee640c62SCao jin Error *err = NULL;
1649cf7087dbSKim Phillips
1650b21e2380SMarkus Armbruster vdev->msix->pending = g_new0(unsigned long,
1651b21e2380SMarkus Armbruster BITS_TO_LONGS(vdev->msix->entries));
1652cf7087dbSKim Phillips ret = msix_init(&vdev->pdev, vdev->msix->entries,
16533a286732SAlex Williamson vdev->bars[vdev->msix->table_bar].mr,
1654cf7087dbSKim Phillips vdev->msix->table_bar, vdev->msix->table_offset,
16553a286732SAlex Williamson vdev->bars[vdev->msix->pba_bar].mr,
1656ee640c62SCao jin vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
1657ee640c62SCao jin &err);
1658cf7087dbSKim Phillips if (ret < 0) {
1659cf7087dbSKim Phillips if (ret == -ENOTSUP) {
1660e1eb292aSMarkus Armbruster warn_report_err(err);
1661b771a40fSZhenzhong Duan return true;
1662cf7087dbSKim Phillips }
1663ee640c62SCao jin
1664ee640c62SCao jin error_propagate(errp, err);
1665b771a40fSZhenzhong Duan return false;
1666cf7087dbSKim Phillips }
1667cf7087dbSKim Phillips
166895239e16SAlex Williamson /*
166995239e16SAlex Williamson * The PCI spec suggests that devices provide additional alignment for
167095239e16SAlex Williamson * MSI-X structures and avoid overlapping non-MSI-X related registers.
167195239e16SAlex Williamson * For an assigned device, this hopefully means that emulation of MSI-X
167295239e16SAlex Williamson * structures does not affect the performance of the device. If devices
167395239e16SAlex Williamson * fail to provide that alignment, a significant performance penalty may
167495239e16SAlex Williamson * result, for instance Mellanox MT27500 VFs:
167595239e16SAlex Williamson * http://www.spinics.net/lists/kvm/msg125881.html
167695239e16SAlex Williamson *
167795239e16SAlex Williamson * The PBA is simply not that important for such a serious regression and
167895239e16SAlex Williamson * most drivers do not appear to look at it. The solution for this is to
167995239e16SAlex Williamson * disable the PBA MemoryRegion unless it's being used. We disable it
168095239e16SAlex Williamson * here and only enable it if a masked vector fires through QEMU. As the
168195239e16SAlex Williamson * vector-use notifier is called, which occurs on unmask, we test whether
168295239e16SAlex Williamson * PBA emulation is needed and again disable if not.
168395239e16SAlex Williamson */
168495239e16SAlex Williamson memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
168595239e16SAlex Williamson
1686fcad0d21SAlexey Kardashevskiy /*
1687fcad0d21SAlexey Kardashevskiy * The emulated machine may provide a paravirt interface for MSIX setup
1688fcad0d21SAlexey Kardashevskiy * so it is not strictly necessary to emulate MSIX here. This becomes
1689fcad0d21SAlexey Kardashevskiy * helpful when frequently accessed MMIO registers are located in
1690fcad0d21SAlexey Kardashevskiy * subpages adjacent to the MSIX table but the MSIX data containing page
1691fcad0d21SAlexey Kardashevskiy * cannot be mapped because of a host page size bigger than the MSIX table
1692fcad0d21SAlexey Kardashevskiy * alignment.
1693fcad0d21SAlexey Kardashevskiy */
1694fcad0d21SAlexey Kardashevskiy if (object_property_get_bool(OBJECT(qdev_get_machine()),
1695fcad0d21SAlexey Kardashevskiy "vfio-no-msix-emulation", NULL)) {
1696fcad0d21SAlexey Kardashevskiy memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
1697fcad0d21SAlexey Kardashevskiy }
1698fcad0d21SAlexey Kardashevskiy
1699b771a40fSZhenzhong Duan return true;
1700cf7087dbSKim Phillips }
1701cf7087dbSKim Phillips
/* Undo MSI and (if present) MSI-X emulation registered at setup time. */
static void vfio_teardown_msi(VFIOPCIDevice *vdev)
{
    msi_uninit(&vdev->pdev);

    if (!vdev->msix) {
        return;
    }

    msix_uninit(&vdev->pdev,
                vdev->bars[vdev->msix->table_bar].mr,
                vdev->bars[vdev->msix->pba_bar].mr);
    g_free(vdev->msix->pending);
}
1713cf7087dbSKim Phillips
1714cf7087dbSKim Phillips /*
1715cf7087dbSKim Phillips * Resource setup
1716cf7087dbSKim Phillips */
/* Toggle the mmap'd windows of every BAR region in one pass. */
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
{
    int nr;

    for (nr = 0; nr < PCI_ROM_SLOT; nr++) {
        vfio_region_mmaps_set_enabled(&vdev->bars[nr].region, enabled);
    }
}
1725cf7087dbSKim Phillips
/*
 * Record a BAR's registration properties (I/O vs memory, 64-bit flag,
 * type bits, size) by reading its config space register via vfio.
 */
static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    uint32_t bar_val;
    ssize_t ret;

    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
    if (!bar->region.size) {
        return;
    }

    ret = pread(vdev->vbasedev.fd, &bar_val, sizeof(bar_val),
                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (ret != sizeof(bar_val)) {
        error_report("vfio: Failed to read BAR %d (%m)", nr);
        return;
    }

    bar_val = le32_to_cpu(bar_val);
    bar->ioport = (bar_val & PCI_BASE_ADDRESS_SPACE_IO);
    bar->mem64 = bar->ioport ? 0 : (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64);
    bar->type = bar_val & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
                                         ~PCI_BASE_ADDRESS_MEM_MASK);
    bar->size = bar->region.size;
}
17533a286732SAlex Williamson
/* Prepare every standard BAR (the ROM slot is handled separately). */
static void vfio_bars_prepare(VFIOPCIDevice *vdev)
{
    int nr;

    for (nr = 0; nr < PCI_ROM_SLOT; nr++) {
        vfio_bar_prepare(vdev, nr);
    }
}
17623a286732SAlex Williamson
/*
 * Create the container MemoryRegion for a BAR, nest the vfio region inside
 * it, attempt the direct mmap, and register the BAR with the PCI core.
 */
static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    char *mr_name;

    if (!bar->size) {
        return;
    }

    bar->mr = g_new0(MemoryRegion, 1);
    mr_name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, mr_name,
                          bar->size);
    g_free(mr_name);

    if (bar->region.size) {
        memory_region_add_subregion(bar->mr, 0, bar->region.mem);

        /* mmap failure is non-fatal: accesses fall back to read/write */
        if (vfio_region_mmap(&bar->region) != 0) {
            error_report("Failed to mmap %s BAR %d. Performance may be slow",
                         vdev->vbasedev.name, nr);
        }
    }

    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
}
17883a286732SAlex Williamson
/* Register every standard BAR (ROM excluded) with the PCI core. */
static void vfio_bars_register(VFIOPCIDevice *vdev)
{
    int nr;

    for (nr = 0; nr < PCI_ROM_SLOT; nr++) {
        vfio_bar_register(vdev, nr);
    }
}
1797cf7087dbSKim Phillips
/* Undo vfio_bars_register(): tear down quirks, regions, and VGA mappings. */
static void vfio_bars_exit(VFIOPCIDevice *vdev)
{
    int nr;

    for (nr = 0; nr < PCI_ROM_SLOT; nr++) {
        VFIOBAR *bar = &vdev->bars[nr];

        /* Quirks go first, then the underlying vfio region. */
        vfio_bar_quirk_exit(vdev, nr);
        vfio_region_exit(&bar->region);
        if (bar->region.size) {
            memory_region_del_subregion(bar->mr, bar->region.mem);
        }
    }

    if (vdev->vga) {
        pci_unregister_vga(&vdev->pdev);
        vfio_vga_quirk_exit(vdev);
    }
}
1817ba5e6bfaSPaolo Bonzini
/* Final teardown: free BAR container regions, quirks, and VGA state. */
static void vfio_bars_finalize(VFIOPCIDevice *vdev)
{
    int nr;

    for (nr = 0; nr < PCI_ROM_SLOT; nr++) {
        VFIOBAR *bar = &vdev->bars[nr];

        vfio_bar_quirk_finalize(vdev, nr);
        vfio_region_finalize(&bar->region);

        /* Release the container MemoryRegion created at register time. */
        if (bar->mr) {
            assert(bar->size);
            object_unparent(OBJECT(bar->mr));
            g_free(bar->mr);
            bar->mr = NULL;
        }
    }

    if (vdev->vga) {
        int i;

        vfio_vga_quirk_finalize(vdev);
        for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
            object_unparent(OBJECT(&vdev->vga->region[i].mem));
        }
        g_free(vdev->vga);
    }
}
1843cf7087dbSKim Phillips
1844cf7087dbSKim Phillips /*
1845cf7087dbSKim Phillips * General setup
1846cf7087dbSKim Phillips */
/*
 * Upper bound on the size of the standard capability at @pos: the distance
 * to the nearest capability located after it, or to the end of standard
 * config space if none follows.
 */
static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
{
    uint16_t bound = PCI_CONFIG_SPACE_SIZE;
    uint8_t cap;

    for (cap = pdev->config[PCI_CAPABILITY_LIST]; cap;
         cap = pdev->config[cap + PCI_CAP_LIST_NEXT]) {
        if (cap > pos && cap < bound) {
            bound = cap;
        }
    }

    return bound - pos;
}
1861cf7087dbSKim Phillips
1862325ae8d5SChen Fan
/*
 * Upper bound on the size of the extended capability at @pos: the distance
 * to the nearest extended capability after it, or to the end of extended
 * config space if none follows.
 */
static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
{
    uint16_t cap, bound = PCIE_CONFIG_SPACE_SIZE;

    for (cap = PCI_CONFIG_SPACE_SIZE; cap;
         cap = PCI_EXT_CAP_NEXT(pci_get_long(config + cap))) {
        if (cap > pos && cap < bound) {
            bound = cap;
        }
    }

    return bound - pos;
}
1876325ae8d5SChen Fan
/* Replace the @mask bits of the little-endian word at @buf with @val. */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t word = pci_get_word(buf);

    word &= ~mask;
    word |= val;
    pci_set_word(buf, word);
}
1881cf7087dbSKim Phillips
/*
 * Emulate the @mask bits of the config word at @pos: the guest sees @val,
 * the emulated bits become read-only, and reads of them are serviced from
 * emulation rather than the physical device.
 */
static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
                                   uint16_t val, uint16_t mask)
{
    PCIDevice *pdev = &vdev->pdev;

    vfio_set_word_bits(pdev->config + pos, val, mask);
    vfio_set_word_bits(pdev->wmask + pos, ~mask, mask);
    vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
}
1889cf7087dbSKim Phillips
/* Replace the @mask bits of the little-endian dword at @buf with @val. */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t dword = pci_get_long(buf);

    dword &= ~mask;
    dword |= val;
    pci_set_long(buf, dword);
}
1894cf7087dbSKim Phillips
/*
 * Emulate the @mask bits of the config dword at @pos: the guest sees @val,
 * the emulated bits become read-only, and reads of them are serviced from
 * emulation rather than the physical device.
 */
static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
                                   uint32_t val, uint32_t mask)
{
    PCIDevice *pdev = &vdev->pdev;

    vfio_set_long_bits(pdev->config + pos, val, mask);
    vfio_set_long_bits(pdev->wmask + pos, ~mask, mask);
    vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
}
1902cf7087dbSKim Phillips
/*
 * Advertise the physical device's AtomicOp completer support on the
 * emulated root port's DEVCAP2 register, when safe to do so.  Queries the
 * host support via VFIO_DEVICE_GET_INFO and, if any bits are set, records
 * that vfio_pci_disable_rp_atomics() must clear them on exit.
 */
static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev)
{
    struct vfio_device_info_cap_pci_atomic_comp *cap;
    g_autofree struct vfio_device_info *info = NULL;
    PCIBus *bus = pci_get_bus(&vdev->pdev);
    PCIDevice *parent = bus->parent_dev;
    struct vfio_info_cap_header *hdr;
    uint32_t mask = 0;
    uint8_t *pos;

    /*
     * PCIe Atomic Ops completer support is only added automatically for single
     * function devices downstream of a root port supporting DEVCAP2. Support
     * is added during realize and, if added, removed during device exit. The
     * single function requirement avoids conflicting requirements should a
     * slot be composed of multiple devices with differing capabilities.
     */
    if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap ||
        pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT ||
        pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2 ||
        vdev->pdev.devfn ||
        vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
        return;
    }

    pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;

    /* Abort if there's already an Atomic Ops configuration on the root port */
    if (pci_get_long(pos) & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
                             PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
                             PCI_EXP_DEVCAP2_ATOMIC_COMP128)) {
        return;
    }

    /* Ask the kernel what the physical device can complete. */
    info = vfio_get_device_info(vdev->vbasedev.fd);
    if (!info) {
        return;
    }

    hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP);
    if (!hdr) {
        return;
    }

    /* Translate VFIO capability flags into DEVCAP2 completer bits. */
    cap = (void *)hdr;
    if (cap->flags & VFIO_PCI_ATOMIC_COMP32) {
        mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP32;
    }
    if (cap->flags & VFIO_PCI_ATOMIC_COMP64) {
        mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP64;
    }
    if (cap->flags & VFIO_PCI_ATOMIC_COMP128) {
        mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP128;
    }

    if (!mask) {
        return;
    }

    pci_long_test_and_set_mask(pos, mask);
    /* Remember to undo this on device exit; see vfio_pci_disable_rp_atomics() */
    vdev->clear_parent_atomics_on_exit = true;
}
1965c00aac6fSAlex Williamson
/* Clear the DEVCAP2 AtomicOp bits vfio_pci_enable_rp_atomics() set, if any. */
static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev)
{
    PCIDevice *parent;
    uint8_t *devcap2;

    if (!vdev->clear_parent_atomics_on_exit) {
        return;
    }

    parent = pci_get_bus(&vdev->pdev)->parent_dev;
    devcap2 = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;

    pci_long_test_and_clear_mask(devcap2, PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
                                          PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
                                          PCI_EXP_DEVCAP2_ATOMIC_COMP128);
}
1977c00aac6fSAlex Williamson
/*
 * Virtualize the PCI Express capability found at config offset @pos
 * (@size bytes) so that the exposed device/port type is consistent with
 * the type of bus the device occupies in the VM, then add the capability
 * to the emulated device.  Returns true on success, false with @errp set.
 */
static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
                                Error **errp)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    /* Only endpoint types are assignable; switch/bridge ports are not. */
    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {

        error_setg(errp, "assignment of PCIe type 0x%x "
                   "devices is not currently supported", type);
        return false;
    }

    if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
        PCIBus *bus = pci_get_bus(&vdev->pdev);
        PCIDevice *bridge;

        /*
         * Traditionally PCI device assignment exposes the PCIe capability
         * as-is on non-express buses.  The reason being that some drivers
         * simply assume that it's there, for example tg3.  However when
         * we're running on a native PCIe machine type, like Q35, we need
         * to hide the PCIe capability.  The reason for this is twofold;
         * first Windows guests get a Code 10 error when the PCIe capability
         * is exposed in this configuration.  Therefore express devices won't
         * work at all unless they're attached to express buses in the VM.
         * Second, a native PCIe machine introduces the possibility of fine
         * granularity IOMMUs supporting both translation and isolation.
         * Guest code to discover the IOMMU visibility of a device, such as
         * IOMMU grouping code on Linux, is very aware of device types and
         * valid transitions between bus types.  An express device on a non-
         * express bus is not a valid combination on bare metal systems.
         *
         * Drivers that require a PCIe capability to make the device
         * functional are simply going to need to have their devices placed
         * on a PCIe bus in the VM.
         */
        while (!pci_bus_is_root(bus)) {
            bridge = pci_bridge_get_device(bus);
            bus = pci_get_bus(bridge);
        }

        /* Root bus is express (e.g. Q35): drop the capability entirely. */
        if (pci_bus_is_express(bus)) {
            return true;
        }

    } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control goes away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

/* Fallback definitions for older kernel headers lacking these offsets. */
#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control goes away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return true;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            /* Fabricate a minimal x1 / 2.5GT/s link */
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                           QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
                           QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }

        vfio_pci_enable_rp_atomics(vdev);
    }

    /*
     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
     * (Niantic errata #35) causing Windows to error with a Code 10 for the
     * device on Q35.  Fixup any such devices to report version 1.  If we
     * were to remove the capability entirely the guest would lose extended
     * config space.
     */
    if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
        vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                               1, PCI_EXP_FLAGS_VERS);
    }

    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
                             errp);
    if (pos < 0) {
        return false;
    }

    vdev->pdev.exp.exp_cap = pos;

    return true;
}
2107cf7087dbSKim Phillips
/* Record whether the PCIe DEVCAP at @pos advertises Function Level Reset. */
static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
{
    uint32_t devcap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);

    if (!(devcap & PCI_EXP_DEVCAP_FLR)) {
        return;
    }

    trace_vfio_check_pcie_flr(vdev->vbasedev.name);
    vdev->has_flr = true;
}
2117cf7087dbSKim Phillips
/* Record whether the PM capability at @pos allows reset via D3hot->D0. */
static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
{
    uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);

    /* NoSoftRst clear means the power transition acts as a soft reset. */
    if (csr & PCI_PM_CTRL_NO_SOFT_RESET) {
        return;
    }

    trace_vfio_check_pm_reset(vdev->vbasedev.name);
    vdev->has_pm_reset = true;
}
2127cf7087dbSKim Phillips
/* Record whether the Advanced Features capability at @pos supports FLR. */
static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
{
    const uint8_t required = PCI_AF_CAP_TP | PCI_AF_CAP_FLR;
    uint8_t af_cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);

    /* Both Transactions Pending and FLR bits must be advertised. */
    if ((af_cap & required) == required) {
        trace_vfio_check_af_flr(vdev->vbasedev.name);
        vdev->has_flr = true;
    }
}
2137cf7087dbSKim Phillips
vfio_add_vendor_specific_cap(VFIOPCIDevice * vdev,int pos,uint8_t size,Error ** errp)2138b771a40fSZhenzhong Duan static bool vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos,
2139187716feSVinayak Kale uint8_t size, Error **errp)
2140187716feSVinayak Kale {
2141187716feSVinayak Kale PCIDevice *pdev = &vdev->pdev;
2142187716feSVinayak Kale
2143187716feSVinayak Kale pos = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, size, errp);
2144187716feSVinayak Kale if (pos < 0) {
2145b771a40fSZhenzhong Duan return false;
2146187716feSVinayak Kale }
2147187716feSVinayak Kale
2148187716feSVinayak Kale /*
2149187716feSVinayak Kale * Exempt config space check for Vendor Specific Information during
2150187716feSVinayak Kale * restore/load.
2151187716feSVinayak Kale * Config space check is still enforced for 3 byte VSC header.
2152187716feSVinayak Kale */
2153187716feSVinayak Kale if (vdev->skip_vsc_check && size > 3) {
2154187716feSVinayak Kale memset(pdev->cmask + pos + 3, 0, size - 3);
2155187716feSVinayak Kale }
2156187716feSVinayak Kale
2157b771a40fSZhenzhong Duan return true;
2158187716feSVinayak Kale }
2159187716feSVinayak Kale
/*
 * Recursively walk the physical device's standard capability chain starting
 * at @pos and rebuild it in the emulated config space, adding capabilities
 * from the tail of the chain back to the head so the emulated order matches
 * the physical device.  Returns true on success, false with @errp set.
 */
static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
{
    ERRP_GUARD();
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    bool ret;

    cap_id = pdev->config[pos];
    next = pdev->config[pos + PCI_CAP_LIST_NEXT];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't recognize.
     * Since QEMU doesn't actually handle many of the config accesses,
     * exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above as cached config space
     * will be changed as we unwind the stack.
     */
    if (next) {
        if (!vfio_add_std_cap(vdev, next, errp)) {
            return false;
        }
    } else {
        /* Begin the rebuild, use QEMU emulated list bits */
        pdev->config[PCI_CAPABILITY_LIST] = 0;
        vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
        vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;

        /* QEMU-internal virtual capabilities go in before the real chain. */
        if (!vfio_add_virt_caps(vdev, errp)) {
            return false;
        }
    }

    /* Scale down size, esp in case virt caps were added above */
    size = MIN(size, vfio_std_cap_max_size(pdev, pos));

    /* Use emulated next pointer to allow dropping caps */
    pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);

    /* Dispatch on capability ID; unrecognized caps are passed through. */
    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_msi_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_EXP:
        vfio_check_pcie_flr(vdev, pos);
        ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_msix_setup(vdev, pos, errp);
        break;
    case PCI_CAP_ID_PM:
        vfio_check_pm_reset(vdev, pos);
        vdev->pm_cap = pos;
        ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
        break;
    case PCI_CAP_ID_AF:
        vfio_check_af_flr(vdev, pos);
        ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
        break;
    case PCI_CAP_ID_VNDR:
        ret = vfio_add_vendor_specific_cap(vdev, pos, size, errp);
        break;
    default:
        ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
        break;
    }

    if (!ret) {
        error_prepend(errp,
                      "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
                      cap_id, size, pos);
    }

    return ret;
}
2242cf7087dbSKim Phillips
/*
 * Virtualize the Resizable BAR extended capability at @pos so that, for each
 * BAR it covers, only the currently programmed size is reported as supported,
 * making the size field effectively read-only.  Returns 0 on success or
 * -EINVAL if a current size falls outside the virtualizable range, in which
 * case the caller drops the capability entirely.
 */
static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
{
    uint32_t ctrl;
    int i, nbar;

    ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL);
    nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT;

    /* Each BAR's cap/ctrl register pair is 8 bytes after the header. */
    for (i = 0; i < nbar; i++) {
        uint32_t cap;
        int size;

        ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8));
        size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT;

        /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */
        cap = size <= 27 ? 1U << (size + 4) : 0;

        /*
         * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one
         * size in the range 1MB to 512GB.  We intend to mask all sizes except
         * the one currently enabled in the size field, therefore if it's
         * outside the range, hide the whole capability as this virtualization
         * trick won't work.  If >512GB resizable BARs start to appear, we
         * might need an opt-in or reservation scheme in the kernel.
         */
        if (!(cap & PCI_REBAR_CAP_SIZES)) {
            return -EINVAL;
        }

        /* Hide all sizes reported in the ctrl reg per above requirement. */
        ctrl &= (PCI_REBAR_CTRL_BAR_SIZE |
                 PCI_REBAR_CTRL_NBAR_MASK |
                 PCI_REBAR_CTRL_BAR_IDX);

        /*
         * The BAR size field is RW, however we've mangled the capability
         * register such that we only report a single size, ie. the current
         * BAR size.  A write of an unsupported value is undefined, therefore
         * the register field is essentially RO.
         */
        vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0);
        vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0);
    }

    return 0;
}
2290b5048a4cSAlex Williamson
vfio_add_ext_cap(VFIOPCIDevice * vdev)22917ef165b9SEric Auger static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
2292325ae8d5SChen Fan {
2293325ae8d5SChen Fan PCIDevice *pdev = &vdev->pdev;
2294325ae8d5SChen Fan uint32_t header;
2295325ae8d5SChen Fan uint16_t cap_id, next, size;
2296325ae8d5SChen Fan uint8_t cap_ver;
2297325ae8d5SChen Fan uint8_t *config;
2298325ae8d5SChen Fan
2299e37dac06SAlex Williamson /* Only add extended caps if we have them and the guest can see them */
2300fd56e061SDavid Gibson if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
2301e37dac06SAlex Williamson !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
23027ef165b9SEric Auger return;
2303e37dac06SAlex Williamson }
2304e37dac06SAlex Williamson
2305325ae8d5SChen Fan /*
2306325ae8d5SChen Fan * pcie_add_capability always inserts the new capability at the tail
2307325ae8d5SChen Fan * of the chain. Therefore to end up with a chain that matches the
2308325ae8d5SChen Fan * physical device, we cache the config space to avoid overwriting
2309325ae8d5SChen Fan * the original config space when we parse the extended capabilities.
2310325ae8d5SChen Fan */
2311325ae8d5SChen Fan config = g_memdup(pdev->config, vdev->config_size);
2312325ae8d5SChen Fan
2313e37dac06SAlex Williamson /*
2314e37dac06SAlex Williamson * Extended capabilities are chained with each pointing to the next, so we
2315e37dac06SAlex Williamson * can drop anything other than the head of the chain simply by modifying
2316d0d1cd70SAlex Williamson * the previous next pointer. Seed the head of the chain here such that
2317d0d1cd70SAlex Williamson * we can simply skip any capabilities we want to drop below, regardless
2318d0d1cd70SAlex Williamson * of their position in the chain. If this stub capability still exists
2319d0d1cd70SAlex Williamson * after we add the capabilities we want to expose, update the capability
2320d0d1cd70SAlex Williamson * ID to zero. Note that we cannot seed with the capability header being
2321d0d1cd70SAlex Williamson * zero as this conflicts with definition of an absent capability chain
2322d0d1cd70SAlex Williamson * and prevents capabilities beyond the head of the list from being added.
2323d0d1cd70SAlex Williamson * By replacing the dummy capability ID with zero after walking the device
2324d0d1cd70SAlex Williamson * chain, we also transparently mark extended capabilities as absent if
2325d0d1cd70SAlex Williamson * no capabilities were added. Note that the PCIe spec defines an absence
2326d0d1cd70SAlex Williamson * of extended capabilities to be determined by a value of zero for the
2327d0d1cd70SAlex Williamson * capability ID, version, AND next pointer. A non-zero next pointer
2328d0d1cd70SAlex Williamson * should be sufficient to indicate additional capabilities are present,
2329d0d1cd70SAlex Williamson * which will occur if we call pcie_add_capability() below. The entire
2330d0d1cd70SAlex Williamson * first dword is emulated to support this.
2331d0d1cd70SAlex Williamson *
2332d0d1cd70SAlex Williamson * NB. The kernel side does similar masking, so be prepared that our
2333d0d1cd70SAlex Williamson * view of the device may also contain a capability ID zero in the head
2334d0d1cd70SAlex Williamson * of the chain. Skip it for the same reason that we cannot seed the
2335d0d1cd70SAlex Williamson * chain with a zero capability.
2336e37dac06SAlex Williamson */
2337e37dac06SAlex Williamson pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
2338e37dac06SAlex Williamson PCI_EXT_CAP(0xFFFF, 0, 0));
2339e37dac06SAlex Williamson pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
2340e37dac06SAlex Williamson pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
2341e37dac06SAlex Williamson
2342325ae8d5SChen Fan for (next = PCI_CONFIG_SPACE_SIZE; next;
2343325ae8d5SChen Fan next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
2344325ae8d5SChen Fan header = pci_get_long(config + next);
2345325ae8d5SChen Fan cap_id = PCI_EXT_CAP_ID(header);
2346325ae8d5SChen Fan cap_ver = PCI_EXT_CAP_VER(header);
2347325ae8d5SChen Fan
2348325ae8d5SChen Fan /*
2349325ae8d5SChen Fan * If it becomes important to configure extended capabilities to their
2350325ae8d5SChen Fan * actual size, use this as the default when it's something we don't
2351325ae8d5SChen Fan * recognize. Since QEMU doesn't actually handle many of the config
2352325ae8d5SChen Fan * accesses, exact size doesn't seem worthwhile.
2353325ae8d5SChen Fan */
2354325ae8d5SChen Fan size = vfio_ext_cap_max_size(config, next);
2355325ae8d5SChen Fan
2356325ae8d5SChen Fan /* Use emulated next pointer to allow dropping extended caps */
2357325ae8d5SChen Fan pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
2358325ae8d5SChen Fan PCI_EXT_CAP_NEXT_MASK);
2359e37dac06SAlex Williamson
2360e37dac06SAlex Williamson switch (cap_id) {
2361d0d1cd70SAlex Williamson case 0: /* kernel masked capability */
2362e37dac06SAlex Williamson case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
2363383a7af7SAlex Williamson case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
2364e37dac06SAlex Williamson trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
2365e37dac06SAlex Williamson break;
2366b5048a4cSAlex Williamson case PCI_EXT_CAP_ID_REBAR:
2367b5048a4cSAlex Williamson if (!vfio_setup_rebar_ecap(vdev, next)) {
2368b5048a4cSAlex Williamson pcie_add_capability(pdev, cap_id, cap_ver, next, size);
2369b5048a4cSAlex Williamson }
2370b5048a4cSAlex Williamson break;
2371e37dac06SAlex Williamson default:
2372e37dac06SAlex Williamson pcie_add_capability(pdev, cap_id, cap_ver, next, size);
2373e37dac06SAlex Williamson }
2374e37dac06SAlex Williamson
2375e37dac06SAlex Williamson }
2376e37dac06SAlex Williamson
2377e37dac06SAlex Williamson /* Cleanup chain head ID if necessary */
2378e37dac06SAlex Williamson if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
2379e37dac06SAlex Williamson pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
2380325ae8d5SChen Fan }
2381325ae8d5SChen Fan
2382325ae8d5SChen Fan g_free(config);
23837ef165b9SEric Auger return;
2384325ae8d5SChen Fan }
2385325ae8d5SChen Fan
vfio_add_capabilities(VFIOPCIDevice * vdev,Error ** errp)2386b771a40fSZhenzhong Duan static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
2387cf7087dbSKim Phillips {
2388cf7087dbSKim Phillips PCIDevice *pdev = &vdev->pdev;
2389cf7087dbSKim Phillips
2390cf7087dbSKim Phillips if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2391cf7087dbSKim Phillips !pdev->config[PCI_CAPABILITY_LIST]) {
2392b771a40fSZhenzhong Duan return true; /* Nothing to add */
2393cf7087dbSKim Phillips }
2394cf7087dbSKim Phillips
2395b771a40fSZhenzhong Duan if (!vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp)) {
2396b771a40fSZhenzhong Duan return false;
2397325ae8d5SChen Fan }
2398325ae8d5SChen Fan
23997ef165b9SEric Auger vfio_add_ext_cap(vdev);
2400b771a40fSZhenzhong Duan return true;
2401cf7087dbSKim Phillips }
2402cf7087dbSKim Phillips
/*
 * Quiesce the device ahead of a reset: disable interrupts, force the
 * device out of a low-power state into D0, and stop any ongoing DMA by
 * clearing the I/O, memory, and bus-master enables.  The statement order
 * here matters: interrupts first, then power state, then command bits.
 */
void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint16_t cmd;

    vfio_disable_interrupts(vdev);

    /* Make sure the device is in D0 */
    if (vdev->pm_cap) {
        uint16_t pmcsr;
        uint8_t state;

        pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
        state = pmcsr & PCI_PM_CTRL_STATE_MASK;
        if (state) {
            /* Request D0 by clearing the power-state field. */
            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
            vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
            /* vfio handles the necessary delay here */
            pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
            if (state) {
                /* Non-fatal: report and continue with the reset anyway. */
                error_report("vfio: Unable to power on device, stuck in D%d",
                             state);
            }
        }
    }

    /*
     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
     * Also put INTx Disable in known state.
     */
    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
             PCI_COMMAND_INTX_DISABLE);
    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
}
2439cf7087dbSKim Phillips
/*
 * Undo the pre-reset quiesce after the device has been reset: restore
 * INTx, zero all BAR registers through the vfio fd, and re-apply quirks.
 */
void vfio_pci_post_reset(VFIOPCIDevice *vdev)
{
    Error *local_err = NULL;
    int bar;

    if (!vfio_intx_enable(vdev, &local_err)) {
        error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    /* Clear every BAR register in config space via the device fd. */
    for (bar = 0; bar < PCI_NUM_REGIONS - 1; ++bar) {
        uint32_t zero = 0;
        off_t offset = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * bar);

        if (pwrite(vdev->vbasedev.fd, &zero, sizeof(zero), offset) !=
            sizeof(zero)) {
            error_report("%s(%s) reset bar %d failed: %m", __func__,
                         vdev->vbasedev.name, bar);
        }
    }

    vfio_quirk_reset(vdev);
}
2462cf7087dbSKim Phillips
/*
 * Compare a parsed PCI host address with a vfio device name of the form
 * "dddd:bb:ss.f".  Returns true when both identify the same device.
 */
bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
{
    char tmp[13];

    /*
     * Use snprintf rather than sprintf so that an out-of-range address
     * component can never overflow the fixed-size buffer; a truncated
     * rendering simply fails the comparison below.
     */
    snprintf(tmp, sizeof(tmp), "%04x:%02x:%02x.%1x", addr->domain,
             addr->bus, addr->slot, addr->function);

    return (strcmp(tmp, name) == 0);
}
2472cf7087dbSKim Phillips
/*
 * Query the kernel for the set of devices affected by a hot reset of
 * this device.  Uses the standard two-call VFIO pattern: a first ioctl
 * with only the header to learn the device count (which the kernel
 * reports with ENOSPC when the array is too small), then a second ioctl
 * with the array grown to hold all entries.
 *
 * Returns 0 on success with a caller-freed info struct stored in
 * *info_p, or -errno on failure.
 */
int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
                                    struct vfio_pci_hot_reset_info **info_p)
{
    struct vfio_pci_hot_reset_info *info;
    int ret, count;

    assert(info_p && !*info_p);

    info = g_malloc0(sizeof(*info));
    info->argsz = sizeof(*info);

    /* Probe call: expected to fail with ENOSPC, filling info->count. */
    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret && errno != ENOSPC) {
        ret = -errno;
        g_free(info);
        if (!vdev->has_pm_reset) {
            error_report("vfio: Cannot reset device %s, "
                         "no available reset mechanism.", vdev->vbasedev.name);
        }
        return ret;
    }

    /* Grow the buffer to fit the reported number of affected devices. */
    count = info->count;
    info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0])));
    info->argsz = sizeof(*info) + (count * sizeof(info->devices[0]));

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret) {
        ret = -errno;
        g_free(info);
        error_report("vfio: hot reset info failed: %m");
        return ret;
    }

    *info_p = info;
    return 0;
}
25104d36ec23SZhenzhong Duan
/* Delegate the hot reset to the IOMMU backend owning this device. */
static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
{
    VFIODevice *vbasedev = &vdev->vbasedev;

    return VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->pci_hot_reset(vbasedev,
                                                                     single);
}
2518cf7087dbSKim Phillips
2519cf7087dbSKim Phillips /*
2520631ba5a1SCai Huoqing * We want to differentiate hot reset of multiple in-use devices vs hot reset
2521cf7087dbSKim Phillips * of a single in-use device. VFIO_DEVICE_RESET will already handle the case
2522cf7087dbSKim Phillips * of doing hot resets when there is only a single device per bus. The in-use
2523cf7087dbSKim Phillips * here refers to how many VFIODevices are affected. A hot reset that affects
2524cf7087dbSKim Phillips * multiple devices, but only a single in-use device, means that we can call
2525cf7087dbSKim Phillips * it from our bus ->reset() callback since the extent is effectively a single
2526cf7087dbSKim Phillips * device. This allows us to make use of it in the hotplug path. When there
2527cf7087dbSKim Phillips * are multiple in-use devices, we can only trigger the hot reset during a
2528cf7087dbSKim Phillips * system reset and thus from our reset handler. We separate _one vs _multi
2529cf7087dbSKim Phillips * here so that we don't overlap and do a double reset on the system reset
2530cf7087dbSKim Phillips * path where both our reset handler and ->reset() callback are used. Calling
2531cf7087dbSKim Phillips * _one() will only do a hot reset for the one in-use devices case, calling
2532cf7087dbSKim Phillips * _multi() will do nothing if a _one() would have been sufficient.
2533cf7087dbSKim Phillips */
/* Hot reset for the single in-use device case; see the comment above. */
static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
{
    return vfio_pci_hot_reset(vdev, true);
}
2538cf7087dbSKim Phillips
/* Multi-device hot reset entry point; wired into vfio_pci_ops below. */
static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    return vfio_pci_hot_reset(vdev, false);
}
2544cf7087dbSKim Phillips
/*
 * Decide whether this device must be reset: either it has no working
 * reset mechanism at all, or its only mechanism short of FLR is a PM
 * reset.
 */
static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    if (vbasedev->reset_works && (vdev->has_flr || !vdev->has_pm_reset)) {
        return;
    }
    vbasedev->needs_reset = true;
}
2552b47d8efaSEric Auger
/* VFIODeviceOps callback: return the QOM object backing this device. */
static Object *vfio_pci_get_object(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    return OBJECT(vdev);
}
2559e93b733bSKirti Wankhede
vfio_msix_present(void * opaque,int version_id)2560c5e2fb3cSKirti Wankhede static bool vfio_msix_present(void *opaque, int version_id)
2561c5e2fb3cSKirti Wankhede {
2562c5e2fb3cSKirti Wankhede PCIDevice *pdev = opaque;
2563c5e2fb3cSKirti Wankhede
2564c5e2fb3cSKirti Wankhede return msix_present(pdev);
2565c5e2fb3cSKirti Wankhede }
2566c5e2fb3cSKirti Wankhede
vfio_display_migration_needed(void * opaque)256787417811SMarc-André Lureau static bool vfio_display_migration_needed(void *opaque)
256887417811SMarc-André Lureau {
256987417811SMarc-André Lureau VFIOPCIDevice *vdev = opaque;
257087417811SMarc-André Lureau
257187417811SMarc-André Lureau /*
257287417811SMarc-André Lureau * We need to migrate the VFIODisplay object if ramfb *migration* was
257387417811SMarc-André Lureau * explicitly requested (in which case we enforced both ramfb=on and
257487417811SMarc-André Lureau * display=on), or ramfb migration was left at the default "auto"
257587417811SMarc-André Lureau * setting, and *ramfb* was explicitly requested (in which case we
257687417811SMarc-André Lureau * enforced display=on).
257787417811SMarc-André Lureau */
257887417811SMarc-André Lureau return vdev->ramfb_migrate == ON_OFF_AUTO_ON ||
257987417811SMarc-André Lureau (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb);
258087417811SMarc-André Lureau }
258187417811SMarc-André Lureau
/*
 * Optional migration subsection carrying the VFIODisplay object; only
 * emitted when vfio_display_migration_needed() says ramfb/display
 * migration was requested.
 */
static const VMStateDescription vmstate_vfio_display = {
    .name = "VFIOPCIDevice/VFIODisplay",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vfio_display_migration_needed,
    .fields = (const VMStateField[]){
        VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate,
                               VFIODisplay),
        VMSTATE_END_OF_LIST()
    }
};
259387417811SMarc-André Lureau
/*
 * Migration state for the emulated PCI config space, plus MSI-X state
 * when present (gated by vfio_msix_present).  The display subsection is
 * appended conditionally.  Changing these fields changes the wire format.
 */
static const VMStateDescription vmstate_vfio_pci_config = {
    .name = "VFIOPCIDevice",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]) {
        VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
        VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription * const []) {
        &vmstate_vfio_display,
        NULL
    }
};
2608c5e2fb3cSKirti Wankhede
/*
 * VFIODeviceOps callback: serialize the PCI config vmstate into the
 * migration stream, propagating any error through errp.
 */
static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    return vmstate_save_state_with_err(f, &vmstate_vfio_pci_config, vdev, NULL,
                                       errp);
}
2616c5e2fb3cSKirti Wankhede
/*
 * VFIODeviceOps callback: restore the PCI config vmstate on the
 * migration destination and re-apply it to the physical device.  The
 * order matters: load the vmstate, push the command register through
 * vfio, remap any sub-page BARs whose address changed, then re-enable
 * MSI or MSI-X according to the restored config.
 */
static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    PCIDevice *pdev = &vdev->pdev;
    pcibus_t old_addr[PCI_NUM_REGIONS - 1];
    int bar, ret;

    /* Snapshot BAR addresses so we can tell which ones the load moved. */
    for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
        old_addr[bar] = pdev->io_regions[bar].addr;
    }

    ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
    if (ret) {
        return ret;
    }

    /* Propagate the restored command register to the device via vfio. */
    vfio_pci_write_config(pdev, PCI_COMMAND,
                          pci_get_word(pdev->config + PCI_COMMAND), 2);

    for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
        /*
         * The address may not be changed in some scenarios
         * (e.g. the VF driver isn't loaded in VM).
         */
        if (old_addr[bar] != pdev->io_regions[bar].addr &&
            vdev->bars[bar].region.size > 0 &&
            vdev->bars[bar].region.size < qemu_real_host_page_size()) {
            /* Sub-page BARs need their host mapping moved with them. */
            vfio_sub_page_bar_update_mapping(pdev, bar);
        }
    }

    /* Re-arm whichever interrupt mode the restored config enables. */
    if (msi_enabled(pdev)) {
        vfio_msi_enable(vdev);
    } else if (msix_enabled(pdev)) {
        vfio_msix_enable(vdev);
    }

    return ret;
}
2656c5e2fb3cSKirti Wankhede
/* Callbacks wiring the generic VFIO core to this PCI implementation. */
static VFIODeviceOps vfio_pci_ops = {
    .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
    .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
    .vfio_eoi = vfio_intx_eoi,
    .vfio_get_object = vfio_pci_get_object,
    .vfio_save_config = vfio_pci_save_config,
    .vfio_load_config = vfio_pci_load_config,
};
2665b47d8efaSEric Auger
vfio_populate_vga(VFIOPCIDevice * vdev,Error ** errp)266664410a74SZhenzhong Duan bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
2667e593c021SAlex Williamson {
2668e593c021SAlex Williamson VFIODevice *vbasedev = &vdev->vbasedev;
26690d3e89beSZhenzhong Duan g_autofree struct vfio_region_info *reg_info = NULL;
2670e593c021SAlex Williamson int ret;
2671e593c021SAlex Williamson
26724225f2b6SAlex Williamson ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, ®_info);
2673e593c021SAlex Williamson if (ret) {
2674cde4279bSEric Auger error_setg_errno(errp, -ret,
2675cde4279bSEric Auger "failed getting region info for VGA region index %d",
2676cde4279bSEric Auger VFIO_PCI_VGA_REGION_INDEX);
267764410a74SZhenzhong Duan return false;
2678e593c021SAlex Williamson }
2679e593c021SAlex Williamson
2680e593c021SAlex Williamson if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
2681e593c021SAlex Williamson !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2682e593c021SAlex Williamson reg_info->size < 0xbffff + 1) {
2683cde4279bSEric Auger error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
2684e593c021SAlex Williamson (unsigned long)reg_info->flags,
2685e593c021SAlex Williamson (unsigned long)reg_info->size);
268664410a74SZhenzhong Duan return false;
2687e593c021SAlex Williamson }
2688e593c021SAlex Williamson
2689e593c021SAlex Williamson vdev->vga = g_new0(VFIOVGA, 1);
2690e593c021SAlex Williamson
2691e593c021SAlex Williamson vdev->vga->fd_offset = reg_info->offset;
2692e593c021SAlex Williamson vdev->vga->fd = vdev->vbasedev.fd;
2693e593c021SAlex Williamson
2694e593c021SAlex Williamson vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2695e593c021SAlex Williamson vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2696e593c021SAlex Williamson QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
2697e593c021SAlex Williamson
2698182bca45SAlex Williamson memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2699182bca45SAlex Williamson OBJECT(vdev), &vfio_vga_ops,
2700182bca45SAlex Williamson &vdev->vga->region[QEMU_PCI_VGA_MEM],
2701182bca45SAlex Williamson "vfio-vga-mmio@0xa0000",
2702182bca45SAlex Williamson QEMU_PCI_VGA_MEM_SIZE);
2703182bca45SAlex Williamson
2704e593c021SAlex Williamson vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2705e593c021SAlex Williamson vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2706e593c021SAlex Williamson QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
2707e593c021SAlex Williamson
2708182bca45SAlex Williamson memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2709182bca45SAlex Williamson OBJECT(vdev), &vfio_vga_ops,
2710182bca45SAlex Williamson &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
2711182bca45SAlex Williamson "vfio-vga-io@0x3b0",
2712182bca45SAlex Williamson QEMU_PCI_VGA_IO_LO_SIZE);
2713182bca45SAlex Williamson
2714e593c021SAlex Williamson vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2715e593c021SAlex Williamson vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2716e593c021SAlex Williamson QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
2717e593c021SAlex Williamson
2718182bca45SAlex Williamson memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
2719182bca45SAlex Williamson OBJECT(vdev), &vfio_vga_ops,
2720182bca45SAlex Williamson &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
2721182bca45SAlex Williamson "vfio-vga-io@0x3c0",
2722182bca45SAlex Williamson QEMU_PCI_VGA_IO_HI_SIZE);
2723182bca45SAlex Williamson
2724182bca45SAlex Williamson pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2725182bca45SAlex Williamson &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2726182bca45SAlex Williamson &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
2727182bca45SAlex Williamson
272864410a74SZhenzhong Duan return true;
2729e593c021SAlex Williamson }
2730e593c021SAlex Williamson
/*
 * Probe and record the device's vfio resources: sanity-check that the
 * kernel exposed a PCI device with the expected region/irq counts, set
 * up each BAR region, locate config space, optionally populate VGA, and
 * detect error (AER) interrupt support.  Returns true on success; on
 * failure sets errp and returns false.
 */
static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
{
    VFIODevice *vbasedev = &vdev->vbasedev;
    g_autofree struct vfio_region_info *reg_info = NULL;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
    int i, ret = -1;

    /* Sanity check device */
    if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
        error_setg(errp, "this isn't a PCI device");
        return false;
    }

    if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
        error_setg(errp, "unexpected number of io regions %u",
                   vbasedev->num_regions);
        return false;
    }

    if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
        error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
        return false;
    }

    /* Set up a region (and quirk list) for each BAR. */
    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
        char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);

        ret = vfio_region_setup(OBJECT(vdev), vbasedev,
                                &vdev->bars[i].region, i, name);
        g_free(name);

        if (ret) {
            error_setg_errno(errp, -ret, "failed to get region %d info", i);
            return false;
        }

        QLIST_INIT(&vdev->bars[i].quirks);
    }

    ret = vfio_get_region_info(vbasedev,
                               VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
    if (ret) {
        error_setg_errno(errp, -ret, "failed to get config info");
        return false;
    }

    trace_vfio_populate_device_config(vdev->vbasedev.name,
                                      (unsigned long)reg_info->size,
                                      (unsigned long)reg_info->offset,
                                      (unsigned long)reg_info->flags);

    vdev->config_size = reg_info->size;
    /* A conventional-sized config space means no PCIe extended space. */
    if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
        vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
    }
    vdev->config_offset = reg_info->offset;

    if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
        if (!vfio_populate_vga(vdev, errp)) {
            error_append_hint(errp, "device does not support "
                              "requested feature x-vga\n");
            return false;
        }
    }

    /* Probe for an error IRQ; its absence is tolerated (old kernels). */
    irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
    if (ret) {
        /* This can fail for an old kernel or legacy PCI dev */
        trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
    } else if (irq_info.count == 1) {
        vdev->pci_aer = true;
    } else {
        warn_report(VFIO_MSG_PREFIX
                    "Could not enable error recovery for the device",
                    vbasedev->name);
    }

    return true;
}
2812d13dd2d7SEric Auger
/* Detach from the vfio device and release per-device allocations. */
static void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
    vfio_detach_device(&vdev->vbasedev);

    g_free(vdev->vbasedev.name);
    g_free(vdev->msix);
}
2820cf7087dbSKim Phillips
/*
 * Fired when the kernel signals a device error through err_notifier.
 * For now the response is to stop the guest; see the comment below.
 */
static void vfio_err_notifier_handler(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    /* Spurious wakeup: nothing pending on the eventfd. */
    if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
        return;
    }

    /*
     * TBD. Retrieve the error details and decide what action
     * needs to be taken. One of the actions could be to pass
     * the error to the guest and have the guest driver recover
     * from the error. This requires that PCIe capabilities be
     * exposed to the guest. For now, we just terminate the
     * guest to contain the error.
     */

    error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);

    vm_stop(RUN_STATE_INTERNAL_ERROR);
}
2842cf7087dbSKim Phillips
2843cf7087dbSKim Phillips /*
2844cf7087dbSKim Phillips * Registers error notifier for devices supporting error recovery.
2845cf7087dbSKim Phillips * If we encounter a failure in this function, we report an error
2846cf7087dbSKim Phillips * and continue after disabling error recovery support for the
2847cf7087dbSKim Phillips * device.
2848cf7087dbSKim Phillips */
/*
 * Registers error notifier for devices supporting error recovery.
 * On any failure we report the problem and continue with error
 * recovery support disabled for the device.
 */
static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
{
    Error *local_err = NULL;
    int32_t notifier_fd;

    if (!vdev->pci_aer) {
        return;
    }

    if (event_notifier_init(&vdev->err_notifier, 0)) {
        error_report("vfio: Unable to init event notifier for error detection");
        vdev->pci_aer = false;
        return;
    }

    notifier_fd = event_notifier_get_fd(&vdev->err_notifier);
    qemu_set_fd_handler(notifier_fd, vfio_err_notifier_handler, NULL, vdev);

    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
                               VFIO_IRQ_SET_ACTION_TRIGGER, notifier_fd,
                               &local_err)) {
        return; /* success */
    }

    /* Signaling setup failed: report and tear the notifier back down. */
    error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    qemu_set_fd_handler(notifier_fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->err_notifier);
    vdev->pci_aer = false;
}
2875cf7087dbSKim Phillips
/* Tear down the error notifier set up by vfio_register_err_notifier(). */
static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
{
    Error *local_err = NULL;
    int notifier_fd;

    if (!vdev->pci_aer) {
        return;
    }

    /* Detach the trigger from the kernel before closing it down. */
    if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
                                VFIO_IRQ_SET_ACTION_TRIGGER, -1, &local_err)) {
        error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    notifier_fd = event_notifier_get_fd(&vdev->err_notifier);
    qemu_set_fd_handler(notifier_fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->err_notifier);
}
2892cf7087dbSKim Phillips
vfio_req_notifier_handler(void * opaque)289347cbe50cSAlex Williamson static void vfio_req_notifier_handler(void *opaque)
289447cbe50cSAlex Williamson {
289547cbe50cSAlex Williamson VFIOPCIDevice *vdev = opaque;
289635c7cb4cSAlex Williamson Error *err = NULL;
289747cbe50cSAlex Williamson
289847cbe50cSAlex Williamson if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
289947cbe50cSAlex Williamson return;
290047cbe50cSAlex Williamson }
290147cbe50cSAlex Williamson
2902a2596aeeSPhilippe Mathieu-Daudé qdev_unplug(DEVICE(vdev), &err);
290335c7cb4cSAlex Williamson if (err) {
2904e1eb292aSMarkus Armbruster warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
290535c7cb4cSAlex Williamson }
290647cbe50cSAlex Williamson }
290747cbe50cSAlex Williamson
/*
 * Wire up the VFIO device request IRQ (if the feature is enabled and the
 * kernel exposes it) to an eventfd serviced by vfio_req_notifier_handler().
 * On any failure the device simply runs without request support.
 */
static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
{
    struct vfio_irq_info irq_info = {
        .argsz = sizeof(irq_info),
        .index = VFIO_PCI_REQ_IRQ_INDEX,
    };
    Error *local_err = NULL;
    int32_t notifier_fd;

    if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
        return;
    }

    /* The kernel must report a REQ IRQ index with at least one vector. */
    if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 ||
        irq_info.count < 1) {
        return;
    }

    if (event_notifier_init(&vdev->req_notifier, 0)) {
        error_report("vfio: Unable to init event notifier for device request");
        return;
    }

    notifier_fd = event_notifier_get_fd(&vdev->req_notifier);
    qemu_set_fd_handler(notifier_fd, vfio_req_notifier_handler, NULL, vdev);

    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
                               VFIO_IRQ_SET_ACTION_TRIGGER, notifier_fd,
                               &local_err)) {
        vdev->req_enabled = true;
        return;
    }

    /* Signaling setup failed: undo the fd handler and the notifier. */
    error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    qemu_set_fd_handler(notifier_fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->req_notifier);
}
294147cbe50cSAlex Williamson
/*
 * Undo vfio_register_req_notifier(): detach the eventfd from the REQ IRQ,
 * drop the fd handler and release the notifier.  No-op unless request
 * support was successfully enabled.
 */
static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
{
    Error *local_err = NULL;
    int fd;

    if (!vdev->req_enabled) {
        return;
    }

    /* Detach the eventfd from the REQ IRQ before dropping our handler. */
    if (!vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
                                VFIO_IRQ_SET_ACTION_TRIGGER, -1, &local_err)) {
        error_reportf_err(local_err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

    fd = event_notifier_get_fd(&vdev->req_notifier);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->req_notifier);

    vdev->req_enabled = false;
}
296047cbe50cSAlex Williamson
/*
 * PCIDeviceClass::realize handler for vfio-pci.
 *
 * Resolves the host device (from "host", "sysfsdev" or an "fd" property),
 * attaches it to a VFIO container, reads the hardware config space, sets up
 * emulated config bits, BARs, capabilities, MSI/MSI-X, INTx, optional
 * display/IGD support, migration, and the error/request notifiers.
 *
 * Error unwinding is strictly layered via the labels at the bottom
 * (out_deregister -> out_unset_idev -> out_teardown -> error); each goto
 * target undoes everything set up before the failure point.  On failure,
 * *errp is set and prefixed with the device name.
 */
static void vfio_realize(PCIDevice *pdev, Error **errp)
{
    ERRP_GUARD();
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    VFIODevice *vbasedev = &vdev->vbasedev;
    int i, ret;
    char uuid[UUID_STR_LEN];
    g_autofree char *name = NULL;

    /*
     * No fd and no sysfsdev: fall back to the "host" DDDD:BB:DD.F address.
     * All-ones host fields mean it was never set (see vfio_instance_init).
     */
    if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
        if (!(~vdev->host.domain || ~vdev->host.bus ||
              ~vdev->host.slot || ~vdev->host.function)) {
            error_setg(errp, "No provided host device");
            error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
#ifdef CONFIG_IOMMUFD
                              "or -device vfio-pci,fd=DEVICE_FD "
#endif
                              "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
            return;
        }
        vbasedev->sysfsdev =
            g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
                            vdev->host.domain, vdev->host.bus,
                            vdev->host.slot, vdev->host.function);
    }

    if (!vfio_device_get_name(vbasedev, errp)) {
        return;
    }

    /*
     * Mediated devices *might* operate compatibly with discarding of RAM, but
     * we cannot know for certain, it depends on whether the mdev vendor driver
     * stays in sync with the active working set of the guest driver.  Prevent
     * the x-balloon-allowed option unless this is minimally an mdev device.
     */
    vbasedev->mdev = vfio_device_is_mdev(vbasedev);

    trace_vfio_mdev(vbasedev->name, vbasedev->mdev);

    if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) {
        error_setg(errp, "x-balloon-allowed only potentially compatible "
                   "with mdev devices");
        goto error;
    }

    /* If a VF token was given, append it to the name passed to the kernel. */
    if (!qemu_uuid_is_null(&vdev->vf_token)) {
        qemu_uuid_unparse(&vdev->vf_token, uuid);
        name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
    } else {
        name = g_strdup(vbasedev->name);
    }

    if (!vfio_attach_device(name, vbasedev,
                            pci_device_iommu_address_space(pdev), errp)) {
        goto error;
    }

    if (!vfio_populate_device(vdev, errp)) {
        goto error;
    }

    /* Get a copy of config space */
    ret = pread(vbasedev->fd, vdev->pdev.config,
                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
                vdev->config_offset);
    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
        ret = ret < 0 ? -errno : -EFAULT;
        error_setg_errno(errp, -ret, "failed to read device config space");
        goto error;
    }

    /* vfio emulates a lot for us, but some bits need extra love */
    vdev->emulated_config_bits = g_malloc0(vdev->config_size);

    /* QEMU can choose to expose the ROM or not */
    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
    /* QEMU can also add or extend BARs */
    memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);

    /*
     * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
     * device ID is managed by the vendor and need only be a 16-bit value.
     * Allow any 16-bit value for subsystem so they can be hidden or changed.
     */
    if (vdev->vendor_id != PCI_ANY_ID) {
        if (vdev->vendor_id >= 0xffff) {
            error_setg(errp, "invalid PCI vendor ID provided");
            goto error;
        }
        vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
        trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
    } else {
        vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
    }

    if (vdev->device_id != PCI_ANY_ID) {
        if (vdev->device_id > 0xffff) {
            error_setg(errp, "invalid PCI device ID provided");
            goto error;
        }
        vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
        trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
    } else {
        vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
    }

    if (vdev->sub_vendor_id != PCI_ANY_ID) {
        if (vdev->sub_vendor_id > 0xffff) {
            error_setg(errp, "invalid PCI subsystem vendor ID provided");
            goto error;
        }
        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
                               vdev->sub_vendor_id, ~0);
        trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
                                              vdev->sub_vendor_id);
    }

    if (vdev->sub_device_id != PCI_ANY_ID) {
        if (vdev->sub_device_id > 0xffff) {
            error_setg(errp, "invalid PCI subsystem device ID provided");
            goto error;
        }
        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
        trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
                                              vdev->sub_device_id);
    }

    /* QEMU can change multi-function devices to single function, or reverse */
    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
                                              PCI_HEADER_TYPE_MULTI_FUNCTION;

    /* Restore or clear multifunction, this is always controlled by QEMU */
    if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
        vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
    } else {
        vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
    }

    /*
     * Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);

    vfio_pci_size_rom(vdev);

    vfio_bars_prepare(vdev);

    if (!vfio_msix_early_setup(vdev, errp)) {
        goto error;
    }

    vfio_bars_register(vdev);

    /* mdev devices do not get an iommu_device; see also out_unset_idev. */
    if (!vbasedev->mdev &&
        !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
        error_prepend(errp, "Failed to set iommu_device: ");
        goto out_teardown;
    }

    if (!vfio_add_capabilities(vdev, errp)) {
        goto out_unset_idev;
    }

    if (vdev->vga) {
        vfio_vga_quirk_setup(vdev);
    }

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_bar_quirk_setup(vdev, i);
    }

    /* Optional Intel IGD OpRegion support; not available for hotplug. */
    if (!vdev->igd_opregion &&
        vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
        g_autofree struct vfio_region_info *opregion = NULL;

        if (vdev->pdev.qdev.hotplugged) {
            error_setg(errp,
                       "cannot support IGD OpRegion feature on hotplugged "
                       "device");
            goto out_unset_idev;
        }

        ret = vfio_get_dev_region_info(vbasedev,
                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "does not support requested IGD OpRegion feature");
            goto out_unset_idev;
        }

        if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) {
            goto out_unset_idev;
        }
    }

    /* QEMU emulates all of MSI & MSIX */
    if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
        memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
               MSIX_CAP_LENGTH);
    }

    if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
        memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
               vdev->msi_cap_size);
    }

    /* Only set up INTx machinery if the device actually has an INTx pin. */
    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
        vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
                                             vfio_intx_mmap_enable, vdev);
        pci_device_set_intx_routing_notifier(&vdev->pdev,
                                             vfio_intx_routing_notifier);
        vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
        kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
        if (!vfio_intx_enable(vdev, errp)) {
            goto out_deregister;
        }
    }

    /* Display, ramfb and EDID (xres/yres) option validation. */
    if (vdev->display != ON_OFF_AUTO_OFF) {
        if (!vfio_display_probe(vdev, errp)) {
            goto out_deregister;
        }
    }
    if (vdev->enable_ramfb && vdev->dpy == NULL) {
        error_setg(errp, "ramfb=on requires display=on");
        goto out_deregister;
    }
    if (vdev->display_xres || vdev->display_yres) {
        if (vdev->dpy == NULL) {
            error_setg(errp, "xres and yres properties require display=on");
            goto out_deregister;
        }
        if (vdev->dpy->edid_regs == NULL) {
            error_setg(errp, "xres and yres properties need edid support");
            goto out_deregister;
        }
    }

    /* Reconcile x-ramfb-migrate with ramfb and enable-migration settings. */
    if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) {
        warn_report("x-ramfb-migrate=on but ramfb=off. "
                    "Forcing x-ramfb-migrate to off.");
        vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
    }
    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
        if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) {
            vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
        } else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) {
            error_setg(errp, "x-ramfb-migrate requires enable-migration");
            goto out_deregister;
        }
    }

    if (!pdev->failover_pair_id) {
        if (!vfio_migration_realize(vbasedev, errp)) {
            goto out_deregister;
        }
    }

    vfio_register_err_notifier(vdev);
    vfio_register_req_notifier(vdev);
    vfio_setup_resetfn_quirk(vdev);

    return;

out_deregister:
    if (vdev->interrupt == VFIO_INT_INTx) {
        vfio_intx_disable(vdev);
    }
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    if (vdev->irqchip_change_notifier.notify) {
        kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
    }
    if (vdev->intx.mmap_timer) {
        timer_free(vdev->intx.mmap_timer);
    }
out_unset_idev:
    if (!vbasedev->mdev) {
        pci_device_unset_iommu_device(pdev);
    }
out_teardown:
    vfio_teardown_msi(vdev);
    vfio_bars_exit(vdev);
error:
    error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
}
325177a10d04SPaolo Bonzini
/*
 * QOM instance finalizer: release per-instance allocations that are not
 * tied to realize/exit (display state, BAR bookkeeping, config shadows)
 * and drop the underlying VFIO device.
 */
static void vfio_instance_finalize(Object *obj)
{
    VFIOPCIDevice *vdev = VFIO_PCI(obj);

    vfio_display_finalize(vdev);
    vfio_bars_finalize(vdev);
    g_free(vdev->rom);
    g_free(vdev->emulated_config_bits);
    /*
     * Note: vdev->igd_opregion is never freed on purpose.  The fw_cfg
     * entry that references it cannot be removed, so leaking the
     * allocation is the safest option.
     *
     * g_free(vdev->igd_opregion);
     */
    vfio_pci_put_device(vdev);
}
3269cf7087dbSKim Phillips
/*
 * PCIDeviceClass::exit handler: unwind everything vfio_realize() set up.
 * The order matters — notifiers first, then interrupts, then BARs, then
 * migration and the iommu_device binding.
 */
static void vfio_exitfn(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    VFIODevice *vbasedev = &vdev->vbasedev;

    vfio_unregister_req_notifier(vdev);
    vfio_unregister_err_notifier(vdev);
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    /* Only registered when the device exposed an INTx pin in realize. */
    if (vdev->irqchip_change_notifier.notify) {
        kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
    }
    vfio_disable_interrupts(vdev);
    if (vdev->intx.mmap_timer) {
        timer_free(vdev->intx.mmap_timer);
    }
    vfio_teardown_msi(vdev);
    vfio_pci_disable_rp_atomics(vdev);
    vfio_bars_exit(vdev);
    vfio_migration_exit(vbasedev);
    /* mdev devices never had an iommu_device set (see vfio_realize). */
    if (!vbasedev->mdev) {
        pci_device_unset_iommu_device(pdev);
    }
}
3293cf7087dbSKim Phillips
/*
 * DeviceState reset handler.  Tries reset mechanisms in order of
 * preference and stops at the first one that succeeds:
 *   1. device-specific quirk resetfn,
 *   2. VFIO_DEVICE_RESET when FLR is present (or PM reset is not),
 *   3. a hot reset of the bus,
 *   4. VFIO_DEVICE_RESET via PM reset as a last resort.
 */
static void vfio_pci_reset(DeviceState *dev)
{
    VFIOPCIDevice *vdev = VFIO_PCI(dev);
    VFIODevice *vbasedev = &vdev->vbasedev;

    trace_vfio_pci_reset(vbasedev->name);

    vfio_pci_pre_reset(vdev);

    if (vdev->display != ON_OFF_AUTO_OFF) {
        vfio_display_reset(vdev);
    }

    if (vdev->resetfn && !vdev->resetfn(vdev)) {
        /* Quirk reset handled it. */
    } else if (vbasedev->reset_works &&
               (vdev->has_flr || !vdev->has_pm_reset) &&
               !ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        trace_vfio_pci_reset_flr(vbasedev->name);
    } else if (!vfio_pci_hot_reset_one(vdev)) {
        /* Our own bus reset worked. */
    } else if (vbasedev->reset_works && vdev->has_pm_reset &&
               !ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        trace_vfio_pci_reset_pm(vbasedev->name);
    }

    vfio_pci_post_reset(vdev);
}
3332cf7087dbSKim Phillips
/*
 * QOM instance initializer: set per-instance defaults before properties
 * are applied.  The host address fields start as all-ones so realize can
 * detect that no "host" property was supplied.
 */
static void vfio_instance_init(Object *obj)
{
    VFIOPCIDevice *vdev = VFIO_PCI(obj);
    VFIODevice *vbasedev = &vdev->vbasedev;
    PCIDevice *pdev = PCI_DEVICE(obj);

    vdev->host.domain = ~0U;
    vdev->host.bus = ~0U;
    vdev->host.slot = ~0U;
    vdev->host.function = ~0U;

    device_add_bootindex_property(obj, &vdev->bootindex,
                                  "bootindex", NULL,
                                  &pdev->qdev);

    vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
                     DEVICE(vdev), false);

    vdev->nv_gpudirect_clique = 0xFF;

    /*
     * QEMU_PCI_CAP_EXPRESS initialization does not depend on the QEMU
     * command line, therefore there is no need to wait until realize as
     * other devices do.
     */
    pdev->cap_present |= QEMU_PCI_CAP_EXPRESS;
}
3356cf7087dbSKim Phillips
3357cf7087dbSKim Phillips static Property vfio_pci_dev_properties[] = {
33589ee27d73SEric Auger DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
33592dca1b37SMinwoo Im DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
33607df9381bSAlex Williamson DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
3361bb0990d1SKirti Wankhede DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
3362bb0990d1SKirti Wankhede vbasedev.pre_copy_dirty_page_tracking,
3363bb0990d1SKirti Wankhede ON_OFF_AUTO_ON),
336430b91677SJoao Martins DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
336530b91677SJoao Martins vbasedev.device_dirty_page_tracking,
336630b91677SJoao Martins ON_OFF_AUTO_ON),
3367a9994687SGerd Hoffmann DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
33688151a9c5SAlex Williamson display, ON_OFF_AUTO_OFF),
3369c62a0c7cSGerd Hoffmann DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
3370c62a0c7cSGerd Hoffmann DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
33719ee27d73SEric Auger DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3372cf7087dbSKim Phillips intx.mmap_timeout, 1100),
33739ee27d73SEric Auger DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3374cf7087dbSKim Phillips VFIO_FEATURE_ENABLE_VGA_BIT, false),
337547cbe50cSAlex Williamson DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
337647cbe50cSAlex Williamson VFIO_FEATURE_ENABLE_REQ_BIT, true),
33776ced0bbaSAlex Williamson DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
33786ced0bbaSAlex Williamson VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
33798bbcb64aSAvihai Horon DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
33808bbcb64aSAvihai Horon vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
33815e1f8905SAvihai Horon DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
33825e1f8905SAvihai Horon vbasedev.migration_events, false),
33835e15d79bSAlex Williamson DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3384238e9172SAlex Williamson DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
3385aff92b82SDavid Hildenbrand vbasedev.ram_block_discard_allowed, false),
338646746dbaSAlex Williamson DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
338746746dbaSAlex Williamson DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
338846746dbaSAlex Williamson DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3389db32d0f4SAlex Williamson DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
3390db32d0f4SAlex Williamson no_geforce_quirks, false),
3391c958c51dSAlex Williamson DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
3392c958c51dSAlex Williamson false),
33932b1dbd0dSAlex Williamson DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
33942b1dbd0dSAlex Williamson false),
339589dcccc5SAlex Williamson DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
339689dcccc5SAlex Williamson DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
339789dcccc5SAlex Williamson DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
339889dcccc5SAlex Williamson sub_vendor_id, PCI_ANY_ID),
339989dcccc5SAlex Williamson DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
340089dcccc5SAlex Williamson sub_device_id, PCI_ANY_ID),
3401c4c45e94SAlex Williamson DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
3402dfbee78dSAlex Williamson DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
3403dfbee78dSAlex Williamson nv_gpudirect_clique,
3404dfbee78dSAlex Williamson qdev_prop_nv_gpudirect_clique, uint8_t),
340589d5202eSAlex Williamson DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
340655872c70SMarkus Armbruster OFF_AUTO_PCIBAR_OFF),
3407ee42b261SEric Auger #ifdef CONFIG_IOMMUFD
3408ee42b261SEric Auger DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
3409ee42b261SEric Auger TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
3410ee42b261SEric Auger #endif
3411187716feSVinayak Kale DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
3412cf7087dbSKim Phillips DEFINE_PROP_END_OF_LIST(),
3413cf7087dbSKim Phillips };
3414cf7087dbSKim Phillips
#ifdef CONFIG_IOMMUFD
/* Setter for the "fd" property: hand a pre-opened device fd to the backend. */
static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
{
    VFIOPCIDevice *vdev = VFIO_PCI(obj);

    vfio_device_set_fd(&vdev->vbasedev, str, errp);
}
#endif
3421da3e04b2SZhenzhong Duan
/* Class initializer for TYPE_VFIO_PCI: wire up device and PCI callbacks. */
static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(klass);
    DeviceClass *dc = DEVICE_CLASS(klass);

    dc->desc = "VFIO-based PCI device assignment";
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    device_class_set_legacy_reset(dc, vfio_pci_reset);
    device_class_set_props(dc, vfio_pci_dev_properties);
#ifdef CONFIG_IOMMUFD
    object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
#endif

    pci_class->realize = vfio_realize;
    pci_class->exit = vfio_exitfn;
    pci_class->config_read = vfio_pci_read_config;
    pci_class->config_write = vfio_pci_write_config;
}
3439cf7087dbSKim Phillips
3440cf7087dbSKim Phillips static const TypeInfo vfio_pci_dev_info = {
34412683ccd5SLi Qiang .name = TYPE_VFIO_PCI,
3442cf7087dbSKim Phillips .parent = TYPE_PCI_DEVICE,
34439ee27d73SEric Auger .instance_size = sizeof(VFIOPCIDevice),
3444cf7087dbSKim Phillips .class_init = vfio_pci_dev_class_init,
3445cf7087dbSKim Phillips .instance_init = vfio_instance_init,
344677a10d04SPaolo Bonzini .instance_finalize = vfio_instance_finalize,
3447a5fa336fSEduardo Habkost .interfaces = (InterfaceInfo[]) {
3448a5fa336fSEduardo Habkost { INTERFACE_PCIE_DEVICE },
3449a5fa336fSEduardo Habkost { INTERFACE_CONVENTIONAL_PCI_DEVICE },
3450a5fa336fSEduardo Habkost { }
3451a5fa336fSEduardo Habkost },
3452cf7087dbSKim Phillips };
3453cf7087dbSKim Phillips
3454b290659fSGerd Hoffmann static Property vfio_pci_dev_nohotplug_properties[] = {
3455b290659fSGerd Hoffmann DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
345687417811SMarc-André Lureau DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate,
345787417811SMarc-André Lureau ON_OFF_AUTO_AUTO),
3458b290659fSGerd Hoffmann DEFINE_PROP_END_OF_LIST(),
3459b290659fSGerd Hoffmann };
3460b290659fSGerd Hoffmann
/* Class initializer for the nohotplug variant: same device, no hotplug. */
static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);

    dc->hotpluggable = false;
    device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
}
3468b290659fSGerd Hoffmann
3469b290659fSGerd Hoffmann static const TypeInfo vfio_pci_nohotplug_dev_info = {
3470f75ca627SChen Zhang .name = TYPE_VFIO_PCI_NOHOTPLUG,
34710c0c8f8aSLi Qiang .parent = TYPE_VFIO_PCI,
3472b290659fSGerd Hoffmann .instance_size = sizeof(VFIOPCIDevice),
3473b290659fSGerd Hoffmann .class_init = vfio_pci_nohotplug_dev_class_init,
3474b290659fSGerd Hoffmann };
3475b290659fSGerd Hoffmann
/* Register both vfio-pci and vfio-pci-nohotplug types with QOM at startup. */
static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
    type_register_static(&vfio_pci_nohotplug_dev_info);
}

type_init(register_vfio_pci_dev_type)
3483