/*
 * Migration support for VFIO devices
 *
 * Copyright NVIDIA, Inc. 2020
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include "sysemu/runstate.h"
#include "hw/vfio/vfio-common.h"
#include "migration/migration.h"
#include "migration/vmstate.h"
#include "migration/qemu-file.h"
#include "migration/register.h"
#include "migration/blocker.h"
#include "migration/misc.h"
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/ram_addr.h"
#include "pci.h"
#include "trace.h"
#include "hw/hw.h"

/*
 * Flags to be used as unique delimiters for VFIO devices in the migration
 * stream. These flags are composed as:
 * 0xffffffff => MSB 32-bit all 1s
 * 0xef10     => Magic ID, represents emulated (virtual) function IO
 * 0x0000     => 16 bits reserved for flags
 *
 * The beginning of a section of state information is marked by
 * _DEV_CONFIG_STATE, _DEV_SETUP_STATE, or _DEV_DATA_STATE. The end of each
 * such section is marked by _END_OF_STATE.
 */
#define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
#define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)

/*
 * This is an arbitrary size based on migration of mlx5 devices, where the
 * total device migration size is typically on the order of 100s of MB.
 * Testing with larger values, e.g. 128MB and 1GB, did not show a performance
 * improvement.
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

static int64_t bytes_transferred;

static const char *mig_state_to_str(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_ERROR:
        return "ERROR";
    case VFIO_DEVICE_STATE_STOP:
        return "STOP";
    case VFIO_DEVICE_STATE_RUNNING:
        return "RUNNING";
    case VFIO_DEVICE_STATE_STOP_COPY:
        return "STOP_COPY";
    case VFIO_DEVICE_STATE_RESUMING:
        return "RESUMING";
    default:
        return "UNKNOWN STATE";
    }
}

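/*
 * Move the device to new_state via the VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
 * ioctl. On failure, try to put the device into recover_state instead; if
 * that also fails, or if recover_state is ERROR, reset the device, which
 * brings it back to RUNNING. Returns 0 on success or a negative errno value.
 */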
static int vfio_migration_set_state(VFIODevice *vbasedev,
                                    enum vfio_device_mig_state new_state,
                                    enum vfio_device_mig_state recover_state)
{
    VFIOMigration *migration = vbasedev->migration;
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_report("%s: Failed setting device state to %s, err: %s. "
                         "Recover state is ERROR. Resetting device",
                         vbasedev->name, mig_state_to_str(new_state),
                         strerror(errno));

            goto reset_device;
        }

        error_report("%s: Failed setting device state to %s, err: %s. "
                     "Setting device in recover state %s",
                     vbasedev->name, mig_state_to_str(new_state),
                     strerror(errno), mig_state_to_str(recover_state));

        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            error_report("%s: Failed setting device in recover state, "
                         "err: %s. Resetting device",
                         vbasedev->name, strerror(errno));

            goto reset_device;
        }

        migration->device_state = recover_state;

        return ret;
    }

    migration->device_state = new_state;
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_report("%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        migration->data_fd = mig_state->data_fd;
    }

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    migration->device_state = VFIO_DEVICE_STATE_RUNNING;

    return ret;
}

static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}

static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        vbasedev->ops->vfio_save_config(vbasedev, f);
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    return qemu_file_get_error(f);
}

static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t data;

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret;

        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    data = qemu_get_be64(f);
    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
        error_report("%s: Failed loading device config space, "
                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
        return -EINVAL;
    }

    trace_vfio_load_device_config_state(vbasedev->name);
    return qemu_file_get_error(f);
}

static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    close(migration->data_fd);
    migration->data_fd = -1;
}

static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}

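/*
 * Each device data block written by vfio_save_block() is framed in the
 * migration stream as:
 *
 *   VFIO_MIG_FLAG_DEV_DATA_STATE  (be64)
 *   data_size                     (be64)
 *   data_size bytes of device state read from the migration data_fd
 *
 * vfio_load_state() consumes these blocks in order and feeds their payload
 * back to the device until it reaches VFIO_MIG_FLAG_END_OF_STATE.
 */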
/* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */
static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
    ssize_t data_size;

    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
        return -errno;
    }
    if (data_size == 0) {
        return 1;
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
    qemu_put_be64(f, data_size);
    qemu_put_buffer(f, migration->data_buffer, data_size);
    bytes_transferred += data_size;

    trace_vfio_save_block(migration->vbasedev->name, data_size);

    return qemu_file_get_error(f);
}

/* ---------------------------------------------------------------------- */

static int vfio_save_setup(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_report("%s: Failed to allocate migration data buffer",
                     vbasedev->name);
        return -ENOMEM;
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    return qemu_file_get_error(f);
}

static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}

/*
 * The migration size of a VFIO device can be as little as a few KBs or as big
 * as many GBs. This value should be big enough to cover the worst case.
 */
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)

/*
 * Only the exact state-pending handler is implemented, not the estimate one.
 * The reason is that during the pre-copy phase of migration the estimate
 * handler is called repeatedly while the pending RAM size is over the
 * threshold, so migration can't converge and querying the VFIO device
 * pending data size is useless.
 */
static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If querying the pending migration size fails, VFIO_MIG_STOP_COPY_SIZE
     * is reported instead so the downtime limit won't be violated.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size);
}

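/*
 * Called once the VM has been stopped: move the device to STOP_COPY, drain
 * its complete state into the stream with vfio_save_block(), then move it
 * back to STOP.
 */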
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    /* We reach here with device state STOP only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP);
    if (ret) {
        return ret;
    }

    do {
        ret = vfio_save_block(f, vbasedev->migration);
        if (ret < 0) {
            return ret;
        }
    } while (!ret);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);
    if (ret) {
        return ret;
    }

    /*
     * If setting the device in STOP state fails, the device should be reset.
     * To do so, use ERROR state as a recover state.
     */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
                                   VFIO_DEVICE_STATE_ERROR);
    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}

static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    ret = vfio_save_device_config_state(f, opaque);
    if (ret) {
        error_report("%s: Failed to save device config space",
                     vbasedev->name);
        qemu_file_set_error(f, ret);
    }
}

static int vfio_load_setup(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
                                    vbasedev->migration->device_state);
}

static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}

static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}

static const SaveVMHandlers savevm_vfio_handlers = {
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_exact = vfio_state_pending_exact,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
};

/* ---------------------------------------------------------------------- */

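/*
 * VM run state change handler: mirror the VM run state into the device
 * migration state (RUNNING when the VM starts running, STOP otherwise).
 */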
static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
    VFIODevice *vbasedev = opaque;
    enum vfio_device_mig_state new_state;
    int ret;

    if (running) {
        new_state = VFIO_DEVICE_STATE_RUNNING;
    } else {
        new_state = VFIO_DEVICE_STATE_STOP;
    }

    /*
     * If setting the device in new_state fails, the device should be reset.
     * To do so, use ERROR state as a recover state.
     */
    ret = vfio_migration_set_state(vbasedev, new_state,
                                   VFIO_DEVICE_STATE_ERROR);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        if (migrate_get_current()->to_dst_file) {
            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        }
    }

    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
                              mig_state_to_str(new_state));
}

static void vfio_migration_state_notifier(Notifier *notifier, void *data)
{
    MigrationState *s = data;
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;

    trace_vfio_migration_state_notifier(vbasedev->name,
                                        MigrationStatus_str(s->state));

    switch (s->state) {
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_FAILED:
        bytes_transferred = 0;
        /*
         * If setting the device in RUNNING state fails, the device should
         * be reset. To do so, use ERROR state as a recover state.
         */
        vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
                                 VFIO_DEVICE_STATE_ERROR);
    }
}

static void vfio_migration_free(VFIODevice *vbasedev)
{
    g_free(vbasedev->migration);
    vbasedev->migration = NULL;
}

static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        if (errno == ENOTTY) {
            error_report("%s: VFIO migration is not supported in kernel",
                         vbasedev->name);
        } else {
            error_report("%s: Failed to query VFIO migration support, err: %s",
                         vbasedev->name, strerror(errno));
        }

        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}

static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}

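/*
 * Query kernel migration support for this device, allocate its VFIOMigration
 * state and register the savevm handlers, the VM run state change handler
 * and the migration state notifier.
 */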
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    migration->vm_state = qdev_add_vm_change_state_handler(vbasedev->dev,
                                                           vfio_vmstate_change,
                                                           vbasedev);
    migration->migration_state.notify = vfio_migration_state_notifier;
    add_migration_state_change_notifier(&migration->migration_state);

    return 0;
}

/* ---------------------------------------------------------------------- */

int64_t vfio_mig_bytes_transferred(void)
{
    return bytes_transferred;
}

int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
    int ret = -ENOTSUP;

    if (!vbasedev->enable_migration) {
        goto add_blocker;
    }

    ret = vfio_migration_init(vbasedev);
    if (ret) {
        goto add_blocker;
    }

    ret = vfio_block_multiple_devices_migration(errp);
    if (ret) {
        return ret;
    }

    ret = vfio_block_giommu_migration(errp);
    if (ret) {
        return ret;
    }

    trace_vfio_migration_probe(vbasedev->name);
    return 0;

add_blocker:
    error_setg(&vbasedev->migration_blocker,
               "VFIO device doesn't support migration");

    ret = migrate_add_blocker(vbasedev->migration_blocker, errp);
    if (ret < 0) {
        error_free(vbasedev->migration_blocker);
        vbasedev->migration_blocker = NULL;
    }
    return ret;
}

void vfio_migration_exit(VFIODevice *vbasedev)
{
    if (vbasedev->migration) {
        VFIOMigration *migration = vbasedev->migration;

        remove_migration_state_change_notifier(&migration->migration_state);
        qemu_del_vm_change_state_handler(migration->vm_state);
        unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
        vfio_migration_free(vbasedev);
        vfio_unblock_multiple_devices_migration();
    }

    if (vbasedev->migration_blocker) {
        migrate_del_blocker(vbasedev->migration_blocker);
        error_free(vbasedev->migration_blocker);
        vbasedev->migration_blocker = NULL;
    }
}