xref: /openbmc/qemu/hw/vfio/cpr.c (revision 21901731410305f66e44f3437c5306cd77b93d95)
1 /*
2  * Copyright (c) 2021-2024 Oracle and/or its affiliates.
3  *
4  * This work is licensed under the terms of the GNU GPL, version 2 or later.
5  * See the COPYING file in the top-level directory.
6  */
7 
8 #include "qemu/osdep.h"
9 #include "hw/vfio/vfio-device.h"
10 #include "hw/vfio/vfio-cpr.h"
11 #include "hw/vfio/pci.h"
12 #include "hw/pci/msix.h"
13 #include "hw/pci/msi.h"
14 #include "migration/cpr.h"
15 #include "qapi/error.h"
16 #include "system/runstate.h"
17 
/*
 * Migration notifier for cpr-reboot mode: refuse to start precopy setup
 * unless the VM is suspended, since VFIO devices only support cpr-reboot
 * from the suspended runstate.  Returns 0 to allow, -1 (errp set) to block.
 */
int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier,
                             MigrationEvent *e, Error **errp)
{
    if (e->type != MIG_EVENT_PRECOPY_SETUP) {
        return 0;
    }
    if (runstate_check(RUN_STATE_SUSPENDED) || vm_get_suspended()) {
        return 0;
    }
    error_setg(errp,
        "VFIO device only supports cpr-reboot for runstate suspended");
    return -1;
}
31 
/*
 * Build the per-device fd-registry key "<vbasedev-name>_<name>" used to
 * save/find/delete a vector's fd across CPR.  Returns a newly allocated
 * string; callers below free it via g_autofree.
 */
#define STRDUP_VECTOR_FD_NAME(vdev, name)   \
    g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name))
34 
/* Record vector nr's fd in the CPR fd registry under the device's name. */
void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr,
                             int fd)
{
    g_autofree char *id = g_strdup_printf("%s_%s", vdev->vbasedev.name, name);

    cpr_save_fd(id, nr, fd);
}
41 
/*
 * Look up vector nr's fd in the CPR fd registry.  Returns the fd, or a
 * negative value when none was saved (callers check for >= 0).
 */
int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
{
    g_autofree char *id = g_strdup_printf("%s_%s", vdev->vbasedev.name, name);

    return cpr_find_fd(id, nr);
}
47 
/* Remove vector nr's fd from the CPR fd registry. */
void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
{
    g_autofree char *id = g_strdup_printf("%s_%s", vdev->vbasedev.name, name);

    cpr_delete_fd(id, nr);
}
53 
/*
 * Rebuild MSI/MSI-X vector state after CPR load by reclaiming the
 * eventfds that old QEMU saved in the CPR fd registry (see
 * vfio_cpr_save_vector_fd).  Called from vfio_cpr_pci_post_load once
 * the interrupt mode (msix true/false) and vector count are known.
 */
static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors,
                                   bool msix)
{
    int i, fd;
    bool pending = false;
    PCIDevice *pdev = &vdev->pdev;

    vdev->nr_vectors = nr_vectors;
    vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
    vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;

    /* Defer KVM virq setup so it can be committed as one batch below. */
    vfio_pci_prepare_kvm_msi_virq_batch(vdev);

    for (i = 0; i < nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        /* Reclaim the QEMU-side interrupt eventfd, if one was saved. */
        fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i);
        if (fd >= 0) {
            vfio_pci_vector_init(vdev, i);
            vfio_pci_msi_set_handler(vdev, i, true);
        }

        /* Reclaim the KVM-bypass eventfd; otherwise mark no virq. */
        if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) {
            vfio_pci_add_kvm_msi_virq(vdev, vector, i, msix);
        } else {
            vdev->msi_vectors[i].virq = -1;
        }

        /* Re-mark MSI-X vectors that were pending-while-masked. */
        if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
            set_bit(i, vdev->msix->pending);
            pending = true;
        }
    }

    vfio_pci_commit_kvm_msi_virq_batch(vdev);

    if (msix) {
        /* Expose the PBA region only while some vector is pending. */
        memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
    }
}
94 
95 /*
96  * The kernel may change non-emulated config bits.  Exclude them from the
97  * changed-bits check in get_pci_config_device.
98  */
vfio_cpr_pci_pre_load(void * opaque)99 static int vfio_cpr_pci_pre_load(void *opaque)
100 {
101     VFIOPCIDevice *vdev = opaque;
102     PCIDevice *pdev = &vdev->pdev;
103     int size = MIN(pci_config_size(pdev), vdev->config_size);
104     int i;
105 
106     for (i = 0; i < size; i++) {
107         pdev->cmask[i] &= vdev->emulated_config_bits[i];
108     }
109 
110     return 0;
111 }
112 
/*
 * Re-establish interrupt delivery after CPR load, based on which interrupt
 * mode the restored config space shows enabled: MSI-X takes priority over
 * MSI, which takes priority over legacy INTx.  Returns 0 on success, -1 on
 * INTx enable failure (error already reported).
 */
static int vfio_cpr_pci_post_load(void *opaque, int version_id)
{
    VFIOPCIDevice *vdev = opaque;
    PCIDevice *pdev = &vdev->pdev;
    int nr_vectors;

    vfio_sub_page_bar_update_mappings(vdev);

    if (msix_enabled(pdev)) {
        vfio_pci_msix_set_notifiers(vdev);
        nr_vectors = vdev->msix->entries;
        vfio_cpr_claim_vectors(vdev, nr_vectors, true);

    } else if (msi_enabled(pdev)) {
        nr_vectors = msi_nr_vectors_allocated(pdev);
        vfio_cpr_claim_vectors(vdev, nr_vectors, false);

    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
        /* Device reports an INTx pin; re-enable legacy interrupts. */
        Error *local_err = NULL;
        if (!vfio_pci_intx_enable(vdev, &local_err)) {
            error_report_err(local_err);
            return -1;
        }
    }

    return 0;
}
140 
pci_msix_present(void * opaque,int version_id)141 static bool pci_msix_present(void *opaque, int version_id)
142 {
143     PCIDevice *pdev = opaque;
144 
145     return msix_present(pdev);
146 }
147 
/* Saved INTx state: pending flag plus the KVM irqchip route (mode, irq). */
static const VMStateDescription vfio_intx_vmstate = {
    .name = "vfio-cpr-intx",
    .version_id = 0,
    .minimum_version_id = 0,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(pending, VFIOINTx),
        VMSTATE_UINT32(route.mode, VFIOINTx),
        VMSTATE_INT32(route.irq, VFIOINTx),
        VMSTATE_END_OF_LIST()
    }
};
159 
/* Embed a VFIOINTx field (as vfio_intx_vmstate) in an enclosing vmstate. */
#define VMSTATE_VFIO_INTX(_field, _state) {                         \
    .name       = (stringify(_field)),                              \
    .size       = sizeof(VFIOINTx),                                 \
    .vmsd       = &vfio_intx_vmstate,                               \
    .flags      = VMS_STRUCT,                                       \
    .offset     = vmstate_offset_value(_state, _field, VFIOINTx),   \
}
167 
/*
 * CPR state for a VFIO PCI device: config space, MSI-X state (only when
 * the device has MSI-X), and INTx state.  Only loaded for CPR incoming
 * (.needed = cpr_incoming_needed); pre_load masks non-emulated config
 * bits and post_load re-establishes interrupts.
 */
const VMStateDescription vfio_cpr_pci_vmstate = {
    .name = "vfio-cpr-pci",
    .version_id = 0,
    .minimum_version_id = 0,
    .pre_load = vfio_cpr_pci_pre_load,
    .post_load = vfio_cpr_pci_post_load,
    .needed = cpr_incoming_needed,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
        VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present),
        VMSTATE_VFIO_INTX(intx, VFIOPCIDevice),
        VMSTATE_END_OF_LIST()
    }
};
182 
/* Registered at most once (see vfio_cpr_add_kvm_notifier); .notify doubles
 * as the "already registered" flag. */
static NotifierWithReturn kvm_close_notifier;
184 
/* Close the VFIO-KVM device once CPR-transfer precopy completes. */
static int vfio_cpr_kvm_close_notifier(NotifierWithReturn *notifier,
                                       MigrationEvent *e,
                                       Error **errp)
{
    if (e->type != MIG_EVENT_PRECOPY_DONE) {
        return 0;
    }
    vfio_kvm_device_close();
    return 0;
}
194 
vfio_cpr_add_kvm_notifier(void)195 void vfio_cpr_add_kvm_notifier(void)
196 {
197     if (!kvm_close_notifier.notify) {
198         migration_add_notifier_mode(&kvm_close_notifier,
199                                     vfio_cpr_kvm_close_notifier,
200                                     MIG_MODE_CPR_TRANSFER);
201     }
202 }
203 
/* Add or remove an irqfd notifier for virq, depending on enable. */
static int set_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                  EventNotifier *rn, int virq, bool enable)
{
    return enable ? kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, virq)
                  : kvm_irqchip_remove_irqfd_notifier_gsi(s, n, virq);
}
213 
/*
 * Attach (enable=true) or detach (enable=false) the device's KVM irqfds
 * for whichever interrupt mode is active: MSI-X, MSI, or legacy INTx.
 * Returns 0 on success or a negative error code (errp set) on failure.
 */
static int vfio_cpr_set_msi_virq(VFIOPCIDevice *vdev, Error **errp, bool enable)
{
    const char *op = (enable ? "enable" : "disable");
    PCIDevice *pdev = &vdev->pdev;
    int i, nr_vectors, ret = 0;

    if (msix_enabled(pdev)) {
        nr_vectors = vdev->msix->entries;

    } else if (msi_enabled(pdev)) {
        nr_vectors = msi_nr_vectors_allocated(pdev);

    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
        /* INTx: a single irqfd with an unmask notifier; no vector loop. */
        ret = set_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
                                     &vdev->intx.unmask, vdev->intx.route.irq,
                                     enable);
        if (ret) {
            error_setg_errno(errp, -ret, "failed to %s INTx irq %d",
                             op, vdev->intx.route.irq);
            return ret;
        }
        vfio_pci_intx_set_handler(vdev, enable);
        return ret;

    } else {
        /* No interrupt mode active; nothing to do. */
        return 0;
    }

    /* MSI/MSI-X: toggle the irqfd for each vector that is in use. */
    for (i = 0; i < nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        if (vector->use) {
            ret = set_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                         NULL, vector->virq, enable);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "failed to %s msi vector %d virq %d",
                                 op, i, vector->virq);
                return ret;
            }
            vfio_pci_msi_set_handler(vdev, i, enable);
        }
    }

    return ret;
}
259 
260 /*
261  * When CPR starts, detach IRQs from the VFIO device so future interrupts
262  * are posted to kvm_interrupt, which is preserved in new QEMU.  Interrupts
263  * that were already posted to the old KVM instance, but not delivered to the
264  * VCPU, are recovered via KVM_GET_LAPIC and pushed to the new KVM instance
265  * in new QEMU.
266  *
267  * If CPR fails, reattach the IRQs.
268  */
vfio_cpr_pci_notifier(NotifierWithReturn * notifier,MigrationEvent * e,Error ** errp)269 static int vfio_cpr_pci_notifier(NotifierWithReturn *notifier,
270                                  MigrationEvent *e, Error **errp)
271 {
272     VFIOPCIDevice *vdev =
273         container_of(notifier, VFIOPCIDevice, cpr.transfer_notifier);
274 
275     if (e->type == MIG_EVENT_PRECOPY_SETUP) {
276         return vfio_cpr_set_msi_virq(vdev, errp, false);
277     } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
278         return vfio_cpr_set_msi_virq(vdev, errp, true);
279     }
280     return 0;
281 }
282 
/* Subscribe this device to CPR-transfer migration events. */
void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev)
{
    NotifierWithReturn *n = &vdev->cpr.transfer_notifier;

    migration_add_notifier_mode(n, vfio_cpr_pci_notifier,
                                MIG_MODE_CPR_TRANSFER);
}
289 
/* Undo vfio_cpr_pci_register_device. */
void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev)
{
    migration_remove_notifier(&vdev->cpr.transfer_notifier);
}
294