1 /* 2 * Migration support for VFIO devices 3 * 4 * Copyright NVIDIA, Inc. 2020 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2. See 7 * the COPYING file in the top-level directory. 8 */ 9 10 #include "qemu/osdep.h" 11 #include "qemu/main-loop.h" 12 #include "qemu/cutils.h" 13 #include "qemu/units.h" 14 #include "qemu/error-report.h" 15 #include <linux/vfio.h> 16 #include <sys/ioctl.h> 17 18 #include "sysemu/runstate.h" 19 #include "hw/vfio/vfio-common.h" 20 #include "migration/misc.h" 21 #include "migration/savevm.h" 22 #include "migration/vmstate.h" 23 #include "migration/qemu-file.h" 24 #include "migration/register.h" 25 #include "migration/blocker.h" 26 #include "qapi/error.h" 27 #include "exec/ramlist.h" 28 #include "exec/ram_addr.h" 29 #include "pci.h" 30 #include "trace.h" 31 #include "hw/hw.h" 32 33 /* 34 * Flags to be used as unique delimiters for VFIO devices in the migration 35 * stream. These flags are composed as: 36 * 0xffffffff => MSB 32-bit all 1s 37 * 0xef10 => Magic ID, represents emulated (virtual) function IO 38 * 0x0000 => 16-bits reserved for flags 39 * 40 * The beginning of state information is marked by _DEV_CONFIG_STATE, 41 * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a 42 * certain state information is marked by _END_OF_STATE. 43 */ 44 #define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) 45 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) 46 #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) 47 #define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) 48 #define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) 49 50 /* 51 * This is an arbitrary size based on migration of mlx5 devices, where typically 52 * total device migration size is on the order of 100s of MB. Testing with 53 * larger values, e.g. 128MB and 1GB, did not show a performance improvement. 
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

/* Running total of device state bytes written to the migration stream. */
static int64_t bytes_transferred;

/* Map a vfio_device_mig_state value to a human-readable name for logging. */
static const char *mig_state_to_str(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_ERROR:
        return "ERROR";
    case VFIO_DEVICE_STATE_STOP:
        return "STOP";
    case VFIO_DEVICE_STATE_RUNNING:
        return "RUNNING";
    case VFIO_DEVICE_STATE_STOP_COPY:
        return "STOP_COPY";
    case VFIO_DEVICE_STATE_RESUMING:
        return "RESUMING";
    case VFIO_DEVICE_STATE_RUNNING_P2P:
        return "RUNNING_P2P";
    case VFIO_DEVICE_STATE_PRE_COPY:
        return "PRE_COPY";
    case VFIO_DEVICE_STATE_PRE_COPY_P2P:
        return "PRE_COPY_P2P";
    default:
        return "UNKNOWN STATE";
    }
}

/*
 * Move the device to @new_state via the VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
 * ioctl. If that fails, fall back to @recover_state; if the fallback also
 * fails (or @recover_state is ERROR), reset the device outright to get it
 * back to a known-good state.
 *
 * Returns 0 on success, a negative errno value on failure. On failure the
 * original errno of the first failing ioctl is returned, even if recovery
 * succeeded.
 */
static int vfio_migration_set_state(VFIODevice *vbasedev,
                                    enum vfio_device_mig_state new_state,
                                    enum vfio_device_mig_state recover_state)
{
    VFIOMigration *migration = vbasedev->migration;
    /*
     * The feature struct carries a variable-size payload; allocate a
     * uint64_t-aligned buffer big enough for header + mig_state payload.
     */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_report("%s: Failed setting device state to %s, err: %s. "
                         "Recover state is ERROR. Resetting device",
                         vbasedev->name, mig_state_to_str(new_state),
                         strerror(errno));

            goto reset_device;
        }

        error_report(
            "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s",
            vbasedev->name, mig_state_to_str(new_state),
            strerror(errno), mig_state_to_str(recover_state));

        /* Retry the same ioctl, this time targeting the recover state. */
        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            error_report(
                "%s: Failed setting device in recover state, err: %s. Resetting device",
                vbasedev->name, strerror(errno));

            goto reset_device;
        }

        migration->device_state = recover_state;

        /* Still report the original failure to the caller. */
        return ret;
    }

    migration->device_state = new_state;
    /*
     * The kernel may hand back a data FD for states that involve a data
     * transfer; adopt it. NOTE(review): presumably -1 means "no data FD for
     * this transition" — matches the kernel migration uAPI, confirm there.
     */
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_report("%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        migration->data_fd = mig_state->data_fd;
    }

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    /* A successful reset leaves the device in RUNNING. */
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;

    return ret;
}

/*
 * Some device state transitions require resetting the device if they fail.
 * This function sets the device in new_state and resets the device if that
 * fails. Reset is done by using ERROR as the recover state.
 */
static int
vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
                                  enum vfio_device_mig_state new_state)
{
    return vfio_migration_set_state(vbasedev, new_state,
                                    VFIO_DEVICE_STATE_ERROR);
}

/*
 * Copy @data_size bytes of device state from the migration stream @f into
 * the device's migration data FD. Returns 0 on success, negative on error.
 */
static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}

/*
 * Emit the device config section: DEV_CONFIG_STATE flag, the device-specific
 * config payload (if the device provides a vfio_save_config op), then an
 * END_OF_STATE delimiter. Returns the QEMUFile error status.
 */
static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        vbasedev->ops->vfio_save_config(vbasedev, f);
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    return qemu_file_get_error(f);
}

/*
 * Counterpart of vfio_save_device_config_state(): load the device-specific
 * config payload and verify the trailing END_OF_STATE delimiter.
 * Returns 0 on success, -EINVAL on a malformed stream, or a stream error.
 */
static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t data;

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret;

        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    data = qemu_get_be64(f);
    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
        error_report("%s: Failed loading device config space, "
                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
        return -EINVAL;
    }

    trace_vfio_load_device_config_state(vbasedev->name);
    return qemu_file_get_error(f);
}

/* Close the migration data FD (shared by save and load cleanup paths). */
static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    close(migration->data_fd);
    migration->data_fd = -1;
}

/*
 * Ask the kernel how much data the device will produce in STOP_COPY state
 * (VFIO_DEVICE_FEATURE_MIG_DATA_SIZE). On failure *stop_copy_size is left
 * untouched, so callers may pre-load it with a default.
 */
static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}

/*
 * Refresh the cached pre-copy size estimates from the kernel
 * (VFIO_MIG_GET_PRECOPY_INFO on the data FD). The cached values are zeroed
 * first so they are consistent even if the ioctl fails.
 */
static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info precopy = {
        .argsz = sizeof(precopy),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
        return -errno;
    }

    migration->precopy_init_size = precopy.initial_bytes;
    migration->precopy_dirty_size = precopy.dirty_bytes;

    return 0;
}

/* Returns the size of saved data on success and -errno on error */
static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
    ssize_t data_size;

    /* Read one buffer's worth of device state from the kernel data FD. */
    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
        /*
         * Pre-copy emptied all the device state for now. For more information,
         * please refer to the Linux kernel VFIO uAPI.
         */
        if (errno == ENOMSG) {
            return 0;
        }

        return -errno;
    }
    if (data_size == 0) {
        return 0;
    }

    /* Frame the chunk: DEV_DATA_STATE flag, length, then payload. */
    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
    qemu_put_be64(f, data_size);
    qemu_put_buffer(f, migration->data_buffer, data_size);
    bytes_transferred += data_size;

    trace_vfio_save_block(migration->vbasedev->name, data_size);

    /* Report any stream error in preference to the byte count. */
    return qemu_file_get_error(f) ?: data_size;
}

/*
 * After sending @data_size bytes, shrink the cached pre-copy estimates:
 * consume from precopy_init_size first, then precopy_dirty_size, clamping
 * both at zero. data_size == 0 means pre-copy is drained, so zero both.
 */
static void vfio_update_estimated_pending_data(VFIOMigration *migration,
                                               uint64_t data_size)
{
    if (!data_size) {
        /*
         * Pre-copy emptied all the device state for now, update estimated sizes
         * accordingly.
         */
        migration->precopy_init_size = 0;
        migration->precopy_dirty_size = 0;

        return;
    }

    if (migration->precopy_init_size) {
        uint64_t init_size = MIN(migration->precopy_init_size, data_size);

        migration->precopy_init_size -= init_size;
        data_size -= init_size;
    }

    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
                                         data_size);
}

/* True if the kernel driver advertises pre-copy support for this device. */
static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
}

/* ---------------------------------------------------------------------- */

/*
 * SaveVMHandlers.save_prepare: refuse migration modes VFIO cannot support
 * (postcopy RAM, background snapshot). Returns 0 or -EOPNOTSUPP.
 */
static int vfio_save_prepare(void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;

    /*
     * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
     * even if they are on.
     */
    if (runstate_check(RUN_STATE_SAVE_VM)) {
        return 0;
    }

    if (migrate_postcopy_ram()) {
        error_setg(
            errp, "%s: VFIO migration is not supported with postcopy migration",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    if (migrate_background_snapshot()) {
        error_setg(
            errp,
            "%s: VFIO migration is not supported with background snapshot",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    return 0;
}

/*
 * SaveVMHandlers.save_setup: allocate the transfer buffer, enter PRE_COPY
 * if supported and the device is RUNNING, and emit the SETUP_STATE /
 * END_OF_STATE markers into the stream. Returns 0 or a negative error.
 */
static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
    int ret;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    /*
     * Failure is tolerated here: stop_copy_size keeps its default and the
     * buffer is simply capped at VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_setg(errp, "%s: Failed to allocate migration data buffer",
                   vbasedev->name);
        return -ENOMEM;
    }

    if (vfio_precopy_supported(vbasedev)) {
        switch (migration->device_state) {
        case VFIO_DEVICE_STATE_RUNNING:
            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
                                           VFIO_DEVICE_STATE_RUNNING);
            if (ret) {
                error_setg(errp, "%s: Failed to set new PRE_COPY state",
                           vbasedev->name);
                return ret;
            }

            /* Best effort; estimates stay zero if the query fails. */
            vfio_query_precopy_size(migration);

            break;
        case VFIO_DEVICE_STATE_STOP:
            /* vfio_save_complete_precopy() will go to STOP_COPY */
            break;
        default:
            error_setg(errp, "%s: Invalid device state %d", vbasedev->name,
                       migration->device_state);
            return -EINVAL;
        }
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "%s: save setup failed", vbasedev->name);
    }

    return ret;
}

/*
 * SaveVMHandlers.save_cleanup: leave STOP_COPY, free the transfer buffer,
 * reset pre-copy bookkeeping and close the data FD.
 */
static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    /*
     * Changing device state from STOP_COPY to STOP can take time. Do it here,
     * after migration has completed, so it won't increase downtime.
     */
    if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
        vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_STOP);
    }

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;
    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}

/*
 * SaveVMHandlers.state_pending_estimate: cheap estimate using the cached
 * pre-copy sizes; contributes nothing unless the device is in pre-copy.
 */
static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
                                        uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    if (!vfio_device_state_is_precopy(vbasedev)) {
        return;
    }

    *must_precopy +=
        migration->precopy_init_size + migration->precopy_dirty_size;

    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
                                      *can_postcopy,
                                      migration->precopy_init_size,
                                      migration->precopy_dirty_size);
}

/*
 * Migration size of VFIO devices can be as little as a few KBs or as big as
 * many GBs. This value should be big enough to cover the worst case.
 */
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)

/*
 * SaveVMHandlers.state_pending_exact: query the kernel for the precise
 * STOP_COPY size (plus fresh pre-copy sizes when in pre-copy).
 */
static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
     * reported so downtime limit won't be violated.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    if (vfio_device_state_is_precopy(vbasedev)) {
        vfio_query_precopy_size(migration);

        *must_precopy +=
            migration->precopy_init_size + migration->precopy_dirty_size;
    }

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size, migration->precopy_init_size,
                                   migration->precopy_dirty_size);
}

/* Iterate only while the device is actively in a pre-copy state. */
static bool vfio_is_active_iterate(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_device_state_is_precopy(vbasedev);
}

/*
 * Note about migration rate limiting: VFIO migration buffer size is currently
 * limited to 1MB, so there is no need to check if migration rate exceeded (as
 * in the worst case it will exceed by 1MB). However, if the buffer size is
 * later changed to a bigger value, migration rate should be enforced here.
 */
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t data_size;

    data_size = vfio_save_block(f, migration);
    if (data_size < 0) {
        return data_size;
    }

    vfio_update_estimated_pending_data(migration, data_size);

    /*
     * With switchover-ack, signal the destination once all initial device
     * data has gone out; otherwise just terminate this section normally.
     */
    if (migrate_switchover_ack() && !migration->precopy_init_size &&
        !migration->initial_data_sent) {
        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
        migration->initial_data_sent = true;
    } else {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    }

    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                            migration->precopy_dirty_size);

    /* Non-zero return tells the migration core this device is drained. */
    return !migration->precopy_init_size && !migration->precopy_dirty_size;
}

/*
 * SaveVMHandlers.save_live_complete_precopy: enter STOP_COPY and drain all
 * remaining device state into the stream. Returns 0 or a negative error.
 */
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    ssize_t data_size;
    int ret;

    /* We reach here with device state STOP or
STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP);
    if (ret) {
        return ret;
    }

    /* Keep pulling blocks until the kernel reports no more data. */
    do {
        data_size = vfio_save_block(f, vbasedev->migration);
        if (data_size < 0) {
            return data_size;
        }
    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);
    if (ret) {
        return ret;
    }

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}

/*
 * SaveVMHandlers.save_state: emit the device config section; on failure the
 * error is latched into the QEMUFile since this hook cannot return one.
 */
static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    ret = vfio_save_device_config_state(f, opaque);
    if (ret) {
        error_report("%s: Failed to save device config space",
                     vbasedev->name);
        qemu_file_set_error(f, ret);
    }
}

/* SaveVMHandlers.load_setup: put the destination device into RESUMING. */
static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
                                   vbasedev->migration->device_state);
    if (ret) {
        error_setg(errp, "%s: Failed to set RESUMING state", vbasedev->name);
    }
    return ret;
}

/* SaveVMHandlers.load_cleanup: close the data FD. Always succeeds. */
static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}

/*
 * SaveVMHandlers.load_state: parse VFIO_MIG_FLAG_* framed sections from the
 * stream until END_OF_STATE. Returns 0 on success, -EINVAL on an unknown or
 * malformed section, or a stream/load error.
 */
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            /* Config is the last section; delegate and return directly. */
            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
        {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}

/* Switchover-ack is only useful when the device supports pre-copy. */
static bool vfio_switchover_ack_needed(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_precopy_supported(vbasedev);
}

/* VFIO implementation of the live-migration SaveVMHandlers interface. */
static const SaveVMHandlers savevm_vfio_handlers = {
    .save_prepare = vfio_save_prepare,
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
    .is_active_iterate = vfio_is_active_iterate,
    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
    .switchover_ack_needed = vfio_switchover_ack_needed,
};

/* ---------------------------------------------------------------------- */

/*
 * VM state-change "prepare" callback (registered only when the device
 * supports VFIO_MIGRATION_P2P): move to the matching *_P2P state so peer
 * devices can quiesce before the final stop.
 */
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    int ret;

    new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
                    VFIO_DEVICE_STATE_PRE_COPY_P2P :
                    VFIO_DEVICE_STATE_RUNNING_P2P;

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret);
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(new_state));
}

/*
 * VM state-change callback: follow the VM into RUNNING, or into STOP_COPY
 * (when stopping mid pre-copy for switchover) / STOP otherwise.
 */
static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
    VFIODevice *vbasedev = opaque;
    enum vfio_device_mig_state new_state;
    int ret;

    if (running) {
        new_state = VFIO_DEVICE_STATE_RUNNING;
    } else {
        new_state =
            (vfio_device_state_is_precopy(vbasedev) &&
             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
                VFIO_DEVICE_STATE_STOP_COPY :
                VFIO_DEVICE_STATE_STOP;
    }

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret);
    }

    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
                              mig_state_to_str(new_state));
}

/*
 * Migration event notifier: on a pre-copy failure, bring the device back to
 * RUNNING (resetting it if that fails). Always returns 0.
 */
static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
                                         MigrationEvent *e, Error **errp)
{
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;

    trace_vfio_migration_state_notifier(vbasedev->name, e->type);

    if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_RUNNING);
    }
    return 0;
}

/* Release the per-device VFIOMigration allocation. */
static void vfio_migration_free(VFIODevice *vbasedev)
{
    g_free(vbasedev->migration);
    vbasedev->migration = NULL;
}

/*
 * Read the device's migration capability flags via
 * VFIO_DEVICE_FEATURE_MIGRATION. Returns 0 and fills *mig_flags on success,
 * -errno on failure (e.g. -ENOTTY when the kernel lacks migration support).
 */
static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}

/*
 * Probe (without enabling) whether the device supports DMA dirty-page
 * logging, using VFIO_DEVICE_FEATURE_PROBE.
 */
static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}

/*
 * Initialize migration support for one VFIO device: query capabilities,
 * allocate VFIOMigration state, register the SaveVMHandlers under a
 * per-device "vfio" id, and hook VM state-change and migration notifiers.
 * Returns 0 on success, negative errno on failure.
 */
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;
    VMChangeStateHandler *prepare_cb;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    /* Build a stable savevm id: "<vmstate-id>/vfio", or just "vfio". */
    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    /* The P2P prepare callback is only meaningful with P2P support. */
    prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
                     vfio_vmstate_change_prepare :
                     NULL;
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}

/* Undo everything vfio_migration_init() set up, in reverse order. */
static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    migration_remove_notifier(&migration->migration_state);
    qemu_del_vm_change_state_handler(migration->vm_state);
    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}

/*
 * Register @err as a migration blocker for this device (takes ownership of
 * @err). Fails with -EINVAL if the user explicitly required migration
 * (enable_migration == ON). Returns 0 on success.
 */
static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
{
    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_propagate(errp, err);
        return -EINVAL;
    }

    vbasedev->migration_blocker = error_copy(err);
    error_free(err);

    return migrate_add_blocker_normal(&vbasedev->migration_blocker, errp);
}

/* ---------------------------------------------------------------------- */

/* Total device-state bytes written to the stream so far (save side). */
int64_t vfio_mig_bytes_transferred(void)
{
    return bytes_transferred;
}

/* Reset the transferred-bytes counter (e.g. at the start of a migration). */
void vfio_reset_bytes_transferred(void)
{
    bytes_transferred = 0;
}

/*
 * Return true when either migration initialized or blocker registered.
 * Currently only return false when adding blocker fails which will
 * de-register vfio device.
 */
bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
    Error *err = NULL;
    int ret;

    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
        error_setg(&err, "%s: Migration is disabled for VFIO device",
                   vbasedev->name);
        return !vfio_block_migration(vbasedev, err, errp);
    }

    ret = vfio_migration_init(vbasedev);
    if (ret) {
        if (ret == -ENOTTY) {
            error_setg(&err, "%s: VFIO migration is not supported in kernel",
                       vbasedev->name);
        } else {
            error_setg(&err,
                       "%s: Migration couldn't be initialized for VFIO device, "
                       "err: %d (%s)",
                       vbasedev->name, ret, strerror(-ret));
        }

        return !vfio_block_migration(vbasedev, err, errp);
    }

    /*
     * Without device dirty tracking, migration is blocked in AUTO mode but
     * only warned about when the user explicitly enabled migration.
     */
    if (!vbasedev->dirty_pages_supported) {
        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
            error_setg(&err,
                       "%s: VFIO device doesn't support device dirty tracking",
                       vbasedev->name);
            goto add_blocker;
        }

        warn_report("%s: VFIO device doesn't support device dirty tracking",
                    vbasedev->name);
    }

    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
    if (ret) {
        goto out_deinit;
    }

    if (vfio_viommu_preset(vbasedev)) {
        error_setg(&err, "%s: Migration is currently not supported "
                   "with vIOMMU enabled", vbasedev->name);
        goto add_blocker;
    }

    trace_vfio_migration_realize(vbasedev->name);
    return true;

add_blocker:
    ret = vfio_block_migration(vbasedev, err, errp);
out_deinit:
    if (ret) {
        vfio_migration_deinit(vbasedev);
    }
    return !ret;
}

/* Tear down migration support and remove any blocker on device unrealize. */
void vfio_migration_exit(VFIODevice *vbasedev)
{
    if (vbasedev->migration) {
        vfio_migration_deinit(vbasedev);
    }

    migrate_del_blocker(&vbasedev->migration_blocker);
}