/*
 * Copyright (c) 2021-2024 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "hw/vfio/vfio-device.h"
#include "hw/vfio/vfio-cpr.h"
#include "hw/vfio/pci.h"
#include "hw/pci/msix.h"
#include "hw/pci/msi.h"
#include "migration/cpr.h"
#include "qapi/error.h"
#include "system/runstate.h"

int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier,
                             MigrationEvent *e, Error **errp)
{
    if (e->type == MIG_EVENT_PRECOPY_SETUP &&
        !runstate_check(RUN_STATE_SUSPENDED) && !vm_get_suspended()) {

        error_setg(errp,
            "VFIO device only supports cpr-reboot for runstate suspended");

        return -1;
    }
    return 0;
}

#define STRDUP_VECTOR_FD_NAME(vdev, name)   \
    g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name))

void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr,
                             int fd)
{
    g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);

    cpr_save_fd(fdname, nr, fd);
}

int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
{
    g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);

    return cpr_find_fd(fdname, nr);
}

void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
{
    g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);

    cpr_delete_fd(fdname, nr);
}

static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors,
                                   bool msix)
{
    int i, fd;
    bool pending = false;
    PCIDevice *pdev = &vdev->pdev;

    vdev->nr_vectors = nr_vectors;
    vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
    vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;

    vfio_pci_prepare_kvm_msi_virq_batch(vdev);

    for (i = 0; i < nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i);
        if (fd >= 0) {
            vfio_pci_vector_init(vdev, i);
            vfio_pci_msi_set_handler(vdev, i, true);
        }

        if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) {
            vfio_pci_add_kvm_msi_virq(vdev, vector, i, msix);
        } else {
            vdev->msi_vectors[i].virq = -1;
        }

        if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
            set_bit(i, vdev->msix->pending);
            pending = true;
        }
    }

    vfio_pci_commit_kvm_msi_virq_batch(vdev);

    if (msix) {
        memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
    }
}
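
/*
 * Illustrative note: cpr_save_fd() keys each preserved eventfd by the name
 * built with STRDUP_VECTOR_FD_NAME plus the vector number.  For a device
 * named "0000:3b:00.0" (hypothetical), vector 2's KVM irqfd would be saved
 * under ("0000:3b:00.0_kvm_interrupt", 2).  On the incoming side,
 * vfio_cpr_claim_vectors() above checks those keys per vector: a preserved
 * "interrupt" fd re-establishes the userspace eventfd handler, a preserved
 * "kvm_interrupt" fd re-adds the KVM irqfd route, and a vector without a
 * preserved kvm_interrupt fd keeps virq == -1.
 */
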
/*
 * The kernel may change non-emulated config bits. Exclude them from the
 * changed-bits check in get_pci_config_device.
 */
static int vfio_cpr_pci_pre_load(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;
    PCIDevice *pdev = &vdev->pdev;
    int size = MIN(pci_config_size(pdev), vdev->config_size);
    int i;

    for (i = 0; i < size; i++) {
        pdev->cmask[i] &= vdev->emulated_config_bits[i];
    }

    return 0;
}

static int vfio_cpr_pci_post_load(void *opaque, int version_id)
{
    VFIOPCIDevice *vdev = opaque;
    PCIDevice *pdev = &vdev->pdev;
    int nr_vectors;

    vfio_sub_page_bar_update_mappings(vdev);

    if (msix_enabled(pdev)) {
        vfio_pci_msix_set_notifiers(vdev);
        nr_vectors = vdev->msix->entries;
        vfio_cpr_claim_vectors(vdev, nr_vectors, true);

    } else if (msi_enabled(pdev)) {
        nr_vectors = msi_nr_vectors_allocated(pdev);
        vfio_cpr_claim_vectors(vdev, nr_vectors, false);

    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
        Error *local_err = NULL;

        if (!vfio_pci_intx_enable(vdev, &local_err)) {
            error_report_err(local_err);
            return -1;
        }
    }

    return 0;
}

static bool pci_msix_present(void *opaque, int version_id)
{
    PCIDevice *pdev = opaque;

    return msix_present(pdev);
}

static const VMStateDescription vfio_intx_vmstate = {
    .name = "vfio-cpr-intx",
    .version_id = 0,
    .minimum_version_id = 0,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(pending, VFIOINTx),
        VMSTATE_UINT32(route.mode, VFIOINTx),
        VMSTATE_INT32(route.irq, VFIOINTx),
        VMSTATE_END_OF_LIST()
    }
};

#define VMSTATE_VFIO_INTX(_field, _state) {                         \
    .name       = (stringify(_field)),                              \
    .size       = sizeof(VFIOINTx),                                 \
    .vmsd       = &vfio_intx_vmstate,                               \
    .flags      = VMS_STRUCT,                                       \
    .offset     = vmstate_offset_value(_state, _field, VFIOINTx),   \
}

const VMStateDescription vfio_cpr_pci_vmstate = {
    .name = "vfio-cpr-pci",
    .version_id = 0,
    .minimum_version_id = 0,
    .pre_load = vfio_cpr_pci_pre_load,
    .post_load = vfio_cpr_pci_post_load,
    .needed = cpr_incoming_needed,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
        VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present),
        VMSTATE_VFIO_INTX(intx, VFIOPCIDevice),
        VMSTATE_END_OF_LIST()
    }
};

static NotifierWithReturn kvm_close_notifier;

static int vfio_cpr_kvm_close_notifier(NotifierWithReturn *notifier,
                                       MigrationEvent *e,
                                       Error **errp)
{
    if (e->type == MIG_EVENT_PRECOPY_DONE) {
        vfio_kvm_device_close();
    }
    return 0;
}

void vfio_cpr_add_kvm_notifier(void)
{
    if (!kvm_close_notifier.notify) {
        migration_add_notifier_mode(&kvm_close_notifier,
                                    vfio_cpr_kvm_close_notifier,
                                    MIG_MODE_CPR_TRANSFER);
    }
}

static int set_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                  EventNotifier *rn, int virq, bool enable)
{
    if (enable) {
        return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, virq);
    } else {
        return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, virq);
    }
}

static int vfio_cpr_set_msi_virq(VFIOPCIDevice *vdev, Error **errp, bool enable)
{
    const char *op = (enable ? "enable" : "disable");
    PCIDevice *pdev = &vdev->pdev;
    int i, nr_vectors, ret = 0;

    if (msix_enabled(pdev)) {
        nr_vectors = vdev->msix->entries;

    } else if (msi_enabled(pdev)) {
        nr_vectors = msi_nr_vectors_allocated(pdev);

    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
        ret = set_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
                                     &vdev->intx.unmask, vdev->intx.route.irq,
                                     enable);
        if (ret) {
            error_setg_errno(errp, -ret, "failed to %s INTx irq %d",
                             op, vdev->intx.route.irq);
            return ret;
        }
        vfio_pci_intx_set_handler(vdev, enable);
        return ret;

    } else {
        return 0;
    }

    for (i = 0; i < nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        if (vector->use) {
            ret = set_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                         NULL, vector->virq, enable);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "failed to %s msi vector %d virq %d",
                                 op, i, vector->virq);
                return ret;
            }
            vfio_pci_msi_set_handler(vdev, i, enable);
        }
    }

    return ret;
}

/*
 * When CPR starts, detach IRQs from the VFIO device so future interrupts
 * are posted to kvm_interrupt, which is preserved in new QEMU.  Interrupts
 * that were already posted to the old KVM instance, but not delivered to the
 * VCPU, are recovered via KVM_GET_LAPIC and pushed to the new KVM instance
 * in new QEMU.
 *
 * If CPR fails, reattach the IRQs.
 */
static int vfio_cpr_pci_notifier(NotifierWithReturn *notifier,
                                 MigrationEvent *e, Error **errp)
{
    VFIOPCIDevice *vdev =
        container_of(notifier, VFIOPCIDevice, cpr.transfer_notifier);

    if (e->type == MIG_EVENT_PRECOPY_SETUP) {
        return vfio_cpr_set_msi_virq(vdev, errp, false);
    } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        return vfio_cpr_set_msi_virq(vdev, errp, true);
    }
    return 0;
}

void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev)
{
    migration_add_notifier_mode(&vdev->cpr.transfer_notifier,
                                vfio_cpr_pci_notifier,
                                MIG_MODE_CPR_TRANSFER);
}

void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev)
{
    migration_remove_notifier(&vdev->cpr.transfer_notifier);
}