1 /* 2 * Migration support for VFIO devices 3 * 4 * Copyright NVIDIA, Inc. 2020 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2. See 7 * the COPYING file in the top-level directory. 8 */ 9 10 #include "qemu/osdep.h" 11 #include "qemu/main-loop.h" 12 #include "qemu/cutils.h" 13 #include "qemu/units.h" 14 #include "qemu/error-report.h" 15 #include <linux/vfio.h> 16 #include <sys/ioctl.h> 17 18 #include "sysemu/runstate.h" 19 #include "hw/vfio/vfio-common.h" 20 #include "migration/migration.h" 21 #include "migration/savevm.h" 22 #include "migration/vmstate.h" 23 #include "migration/qemu-file.h" 24 #include "migration/register.h" 25 #include "migration/blocker.h" 26 #include "migration/misc.h" 27 #include "qapi/error.h" 28 #include "exec/ramlist.h" 29 #include "exec/ram_addr.h" 30 #include "pci.h" 31 #include "trace.h" 32 #include "hw/hw.h" 33 34 /* 35 * Flags to be used as unique delimiters for VFIO devices in the migration 36 * stream. These flags are composed as: 37 * 0xffffffff => MSB 32-bit all 1s 38 * 0xef10 => Magic ID, represents emulated (virtual) function IO 39 * 0x0000 => 16-bits reserved for flags 40 * 41 * The beginning of state information is marked by _DEV_CONFIG_STATE, 42 * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a 43 * certain state information is marked by _END_OF_STATE. 44 */ 45 #define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) 46 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) 47 #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) 48 #define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) 49 #define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) 50 51 /* 52 * This is an arbitrary size based on migration of mlx5 devices, where typically 53 * total device migration size is on the order of 100s of MB. Testing with 54 * larger values, e.g. 128MB and 1GB, did not show a performance improvement. 
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

/* Total device state bytes written to the migration stream (see vfio_save_block) */
static int64_t bytes_transferred;

/* Human-readable name for a vfio_device_mig_state value, for traces/errors */
static const char *mig_state_to_str(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_ERROR:
        return "ERROR";
    case VFIO_DEVICE_STATE_STOP:
        return "STOP";
    case VFIO_DEVICE_STATE_RUNNING:
        return "RUNNING";
    case VFIO_DEVICE_STATE_STOP_COPY:
        return "STOP_COPY";
    case VFIO_DEVICE_STATE_RESUMING:
        return "RESUMING";
    case VFIO_DEVICE_STATE_RUNNING_P2P:
        return "RUNNING_P2P";
    case VFIO_DEVICE_STATE_PRE_COPY:
        return "PRE_COPY";
    case VFIO_DEVICE_STATE_PRE_COPY_P2P:
        return "PRE_COPY_P2P";
    default:
        return "UNKNOWN STATE";
    }
}

/*
 * Set the device migration state to new_state via the
 * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE ioctl.
 *
 * On failure, try to put the device into recover_state instead; if
 * recover_state is ERROR, or setting recover_state also fails, reset the
 * device and mark it RUNNING. If the transition returns a data_fd, take
 * ownership of it in migration->data_fd.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int vfio_migration_set_state(VFIODevice *vbasedev,
                                    enum vfio_device_mig_state new_state,
                                    enum vfio_device_mig_state recover_state)
{
    VFIOMigration *migration = vbasedev->migration;
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_report("%s: Failed setting device state to %s, err: %s. "
                         "Recover state is ERROR. Resetting device",
                         vbasedev->name, mig_state_to_str(new_state),
                         strerror(errno));

            goto reset_device;
        }

        error_report(
            "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s",
            vbasedev->name, mig_state_to_str(new_state),
            strerror(errno), mig_state_to_str(recover_state));

        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            error_report(
                "%s: Failed setting device in recover state, err: %s. Resetting device",
                vbasedev->name, strerror(errno));

            goto reset_device;
        }

        migration->device_state = recover_state;

        return ret;
    }

    migration->device_state = new_state;
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_report("%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        migration->data_fd = mig_state->data_fd;
    }

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    migration->device_state = VFIO_DEVICE_STATE_RUNNING;

    return ret;
}

/*
 * Some device state transitions require resetting the device if they fail.
 * This function sets the device in new_state and resets the device if that
 * fails. Reset is done by using ERROR as the recover state.
 */
static int
vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
                                  enum vfio_device_mig_state new_state)
{
    return vfio_migration_set_state(vbasedev, new_state,
                                    VFIO_DEVICE_STATE_ERROR);
}

/*
 * Copy data_size bytes of device state from the migration stream into the
 * device's data_fd. Returns the qemu_file_get_to_fd() result.
 */
static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}

/*
 * Emit the device config section: DEV_CONFIG_STATE marker, the device's
 * config state (via the vfio_save_config op, if provided), END_OF_STATE.
 */
static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        vbasedev->ops->vfio_save_config(vbasedev, f);
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    return qemu_file_get_error(f);
}

/*
 * Counterpart of vfio_save_device_config_state(): restore config state via
 * the vfio_load_config op and verify the END_OF_STATE marker follows.
 */
static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t data;

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret;

        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    data = qemu_get_be64(f);
    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
        error_report("%s: Failed loading device config space, "
                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
        return -EINVAL;
    }

    trace_vfio_load_device_config_state(vbasedev->name);
    return qemu_file_get_error(f);
}

/* Close the migration data_fd (if any) and mark it invalid */
static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    close(migration->data_fd);
    migration->data_fd = -1;
}

/*
 * Query the device's estimated stop-copy data size via
 * VFIO_DEVICE_FEATURE_MIG_DATA_SIZE. On failure *stop_copy_size is left
 * untouched and -errno is returned.
 */
static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}

/*
 * Refresh migration->precopy_init_size/precopy_dirty_size from the data_fd
 * via VFIO_MIG_GET_PRECOPY_INFO. Sizes are zeroed first so they are 0 on
 * ioctl failure.
 */
static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info precopy = {
        .argsz = sizeof(precopy),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
        return -errno;
    }

    migration->precopy_init_size = precopy.initial_bytes;
    migration->precopy_dirty_size = precopy.dirty_bytes;

    return 0;
}

/* Returns the size of saved data on success and -errno on error */
static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
    ssize_t data_size;

    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
        /*
         * Pre-copy emptied all the device state for now. For more information,
         * please refer to the Linux kernel VFIO uAPI.
 */
        if (errno == ENOMSG) {
            return 0;
        }

        return -errno;
    }
    if (data_size == 0) {
        return 0;
    }

    /* Frame the chunk: DEV_DATA_STATE marker, length, then the raw bytes */
    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
    qemu_put_be64(f, data_size);
    qemu_put_buffer(f, migration->data_buffer, data_size);
    bytes_transferred += data_size;

    trace_vfio_save_block(migration->vbasedev->name, data_size);

    return qemu_file_get_error(f) ?: data_size;
}

/*
 * Account data_size bytes just saved against the estimated pending sizes:
 * initial bytes are consumed first, the remainder against dirty bytes.
 * data_size == 0 means pre-copy emptied the device state, so both estimates
 * are cleared.
 */
static void vfio_update_estimated_pending_data(VFIOMigration *migration,
                                               uint64_t data_size)
{
    if (!data_size) {
        /*
         * Pre-copy emptied all the device state for now, update estimated sizes
         * accordingly.
         */
        migration->precopy_init_size = 0;
        migration->precopy_dirty_size = 0;

        return;
    }

    if (migration->precopy_init_size) {
        uint64_t init_size = MIN(migration->precopy_init_size, data_size);

        migration->precopy_init_size -= init_size;
        data_size -= init_size;
    }

    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
                                         data_size);
}

/* True when the device advertised VFIO_MIGRATION_PRE_COPY support */
static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
}

/* ---------------------------------------------------------------------- */

/*
 * SaveVMHandlers.save_prepare: refuse to start migration with postcopy or
 * background snapshot enabled, which VFIO migration does not support.
 */
static int vfio_save_prepare(void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;

    /*
     * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
     * even if they are on.
 */
    if (runstate_check(RUN_STATE_SAVE_VM)) {
        return 0;
    }

    if (migrate_postcopy_ram()) {
        error_setg(
            errp, "%s: VFIO migration is not supported with postcopy migration",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    if (migrate_background_snapshot()) {
        error_setg(
            errp,
            "%s: VFIO migration is not supported with background snapshot",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    return 0;
}

/*
 * SaveVMHandlers.save_setup: emit the setup section markers, allocate the
 * data buffer (capped at VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE), and, when
 * pre-copy is supported and the device is RUNNING, move it to PRE_COPY.
 */
static int vfio_save_setup(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    /* Best effort; on failure stop_copy_size keeps its default value */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_report("%s: Failed to allocate migration data buffer",
                     vbasedev->name);
        return -ENOMEM;
    }

    if (vfio_precopy_supported(vbasedev)) {
        int ret;

        switch (migration->device_state) {
        case VFIO_DEVICE_STATE_RUNNING:
            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
                                           VFIO_DEVICE_STATE_RUNNING);
            if (ret) {
                return ret;
            }

            /* Best effort; sizes are zeroed by the callee on failure */
            vfio_query_precopy_size(migration);

            break;
        case VFIO_DEVICE_STATE_STOP:
            /* vfio_save_complete_precopy() will go to STOP_COPY */
            break;
        default:
            return -EINVAL;
        }
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    return qemu_file_get_error(f);
}

/*
 * SaveVMHandlers.save_cleanup: return the device to STOP if needed, free the
 * data buffer, reset pre-copy bookkeeping, and close the data_fd.
 */
static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    /*
     * Changing device state from STOP_COPY to STOP can take
 time. Do it here,
     * after migration has completed, so it won't increase downtime.
     */
    if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
        vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_STOP);
    }

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;
    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}

/*
 * SaveVMHandlers.state_pending_estimate: add the cached pre-copy estimates to
 * *must_precopy. Cheap - no ioctls; only meaningful in a PRE_COPY state.
 */
static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
                                        uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    if (!vfio_device_state_is_precopy(vbasedev)) {
        return;
    }

    *must_precopy +=
        migration->precopy_init_size + migration->precopy_dirty_size;

    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
                                      *can_postcopy,
                                      migration->precopy_init_size,
                                      migration->precopy_dirty_size);
}

/*
 * Migration size of VFIO devices can be as little as a few KBs or as big as
 * many GBs. This value should be big enough to cover the worst case.
 */
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)

/*
 * SaveVMHandlers.state_pending_exact: query the device for up-to-date
 * stop-copy and pre-copy sizes and add them to *must_precopy.
 */
static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
     * reported so downtime limit won't be violated.
 */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    if (vfio_device_state_is_precopy(vbasedev)) {
        /* Best effort; sizes are zeroed by the callee on failure */
        vfio_query_precopy_size(migration);

        *must_precopy +=
            migration->precopy_init_size + migration->precopy_dirty_size;
    }

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size, migration->precopy_init_size,
                                   migration->precopy_dirty_size);
}

/* Iterate only while the device is in a pre-copy state */
static bool vfio_is_active_iterate(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_device_state_is_precopy(vbasedev);
}

/*
 * Note about migration rate limiting: VFIO migration buffer size is currently
 * limited to 1MB, so there is no need to check if migration rate exceeded (as
 * in the worst case it will exceed by 1MB). However, if the buffer size is
 * later changed to a bigger value, migration rate should be enforced here.
 */
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t data_size;

    data_size = vfio_save_block(f, migration);
    if (data_size < 0) {
        return data_size;
    }

    vfio_update_estimated_pending_data(migration, data_size);

    /*
     * With switchover-ack, signal the destination once the initial bytes are
     * drained so it can approve the switchover; otherwise just end the
     * section as usual.
     */
    if (migrate_switchover_ack() && !migration->precopy_init_size &&
        !migration->initial_data_sent) {
        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
        migration->initial_data_sent = true;
    } else {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    }

    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                            migration->precopy_dirty_size);

    /* Non-zero (done) once both estimated pending sizes reach zero */
    return !migration->precopy_init_size && !migration->precopy_dirty_size;
}

/*
 * SaveVMHandlers.save_live_complete_precopy: move the device to STOP_COPY and
 * drain the remaining device state into the stream.
 */
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    ssize_t data_size;
    int ret;

    /* We reach here with device state STOP or
 STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP);
    if (ret) {
        return ret;
    }

    /* Drain device state until the device reports there is no more */
    do {
        data_size = vfio_save_block(f, vbasedev->migration);
        if (data_size < 0) {
            return data_size;
        }
    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);
    if (ret) {
        return ret;
    }

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}

/*
 * SaveVMHandlers.save_state: save the device config section; on failure the
 * error is propagated through the QEMUFile (save_state returns void).
 */
static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    ret = vfio_save_device_config_state(f, opaque);
    if (ret) {
        error_report("%s: Failed to save device config space",
                     vbasedev->name);
        qemu_file_set_error(f, ret);
    }
}

/* SaveVMHandlers.load_setup: put the destination device into RESUMING */
static int vfio_load_setup(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
                                    vbasedev->migration->device_state);
}

/* SaveVMHandlers.load_cleanup: close the incoming data_fd */
static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}

/*
 * SaveVMHandlers.load_state: dispatch on the VFIO_MIG_FLAG_* markers written
 * by the save side until END_OF_STATE is reached. Unknown tags fail the load.
 */
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            /* Config is the last section; its helper consumes END_OF_STATE */
            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
        {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}

/* Switchover-ack is used iff the device supports pre-copy */
static bool vfio_switchover_ack_needed(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_precopy_supported(vbasedev);
}

static const SaveVMHandlers savevm_vfio_handlers = {
    .save_prepare = vfio_save_prepare,
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
    .is_active_iterate = vfio_is_active_iterate,
    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
    .switchover_ack_needed = vfio_switchover_ack_needed,
};

/* ---------------------------------------------------------------------- */

/*
 * VM change state "prepare" callback (P2P support): move the device to the
 * matching *_P2P quiescent state before the final state change.
 */
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    int ret;

    new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
                    VFIO_DEVICE_STATE_PRE_COPY_P2P :
                    VFIO_DEVICE_STATE_RUNNING_P2P;

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        if (migrate_get_current()->to_dst_file) {
            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        }
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(new_state));
}

/*
 * VM change state callback: follow the VM run state with the device migration
 * state (RUNNING when the VM runs; STOP_COPY or STOP when it stops).
 */
static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
    VFIODevice *vbasedev = opaque;
    enum vfio_device_mig_state new_state;
    int ret;

    if (running) {
        new_state = VFIO_DEVICE_STATE_RUNNING;
    } else {
        new_state =
            (vfio_device_state_is_precopy(vbasedev) &&
             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
                VFIO_DEVICE_STATE_STOP_COPY :
                VFIO_DEVICE_STATE_STOP;
    }

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
 */
        if (migrate_get_current()->to_dst_file) {
            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        }
    }

    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
                              mig_state_to_str(new_state));
}

/*
 * Migration state notifier: on pre-copy failure, return the device to
 * RUNNING (resetting it if that fails).
 */
static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
                                         MigrationEvent *e, Error **errp)
{
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;

    trace_vfio_migration_state_notifier(vbasedev->name, e->type);

    if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_RUNNING);
    }
    return 0;
}

/* Free the VFIOMigration object and clear the back-pointer */
static void vfio_migration_free(VFIODevice *vbasedev)
{
    g_free(vbasedev->migration);
    vbasedev->migration = NULL;
}

/*
 * Read the device's migration capability flags via
 * VFIO_DEVICE_FEATURE_MIGRATION. Returns 0 on success, -errno on failure.
 */
static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}

/* Probe (without enabling) whether the device supports DMA dirty logging */
static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}

/*
 * Initialize migration support for the device: query capabilities, allocate
 * the VFIOMigration state, register the SaveVM handlers, the VM change state
 * handler (with a P2P prepare callback when supported) and the migration
 * state notifier. Returns 0 on success, negative errno on failure.
 */
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;
    VMChangeStateHandler *prepare_cb;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    /* Build a stable savevm id: "<vmstate-id>/vfio" or plain "vfio" */
    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
                     vfio_vmstate_change_prepare :
                     NULL;
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}

/* Undo everything vfio_migration_init() registered, then free the state */
static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    migration_remove_notifier(&migration->migration_state);
    qemu_del_vm_change_state_handler(migration->vm_state);
    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}

/*
 * Register err as a migration blocker for this device, unless migration was
 * explicitly requested (enable_migration == ON), in which case err is
 * propagated as a hard failure. Takes ownership of err.
 */
static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
{
    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_propagate(errp, err);
        return -EINVAL;
    }

    vbasedev->migration_blocker = error_copy(err);
    error_free(err);

    return migrate_add_blocker(&vbasedev->migration_blocker, errp);
}

/* ---------------------------------------------------------------------- */

int64_t vfio_mig_bytes_transferred(void)
{
    return bytes_transferred;
}

void vfio_reset_bytes_transferred(void)
{
    bytes_transferred = 0;
}

/*
 * Return true when either migration initialized or blocker registered.
 * Currently only return false when adding blocker fails which will
 * de-register vfio device.
 */
bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
    Error *err = NULL;
    int ret;

    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
        error_setg(&err, "%s: Migration is disabled for VFIO device",
                   vbasedev->name);
        return !vfio_block_migration(vbasedev, err, errp);
    }

    ret = vfio_migration_init(vbasedev);
    if (ret) {
        if (ret == -ENOTTY) {
            error_setg(&err, "%s: VFIO migration is not supported in kernel",
                       vbasedev->name);
        } else {
            error_setg(&err,
                       "%s: Migration couldn't be initialized for VFIO device, "
                       "err: %d (%s)",
                       vbasedev->name, ret, strerror(-ret));
        }

        return !vfio_block_migration(vbasedev, err, errp);
    }

    if (!vbasedev->dirty_pages_supported) {
        /* Without device dirty tracking, AUTO blocks; explicit ON only warns */
        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
            error_setg(&err,
                       "%s: VFIO device doesn't support device dirty tracking",
                       vbasedev->name);
            goto add_blocker;
        }

        warn_report("%s: VFIO device doesn't support device dirty tracking",
                    vbasedev->name);
    }

    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
    if (ret) {
        goto out_deinit;
    }

    if (vfio_viommu_preset(vbasedev)) {
        error_setg(&err, "%s: Migration is currently not supported "
                   "with vIOMMU enabled", vbasedev->name);
        goto add_blocker;
    }

    trace_vfio_migration_realize(vbasedev->name);
    return true;

add_blocker:
    ret = vfio_block_migration(vbasedev, err, errp);
out_deinit:
    if (ret) {
        vfio_migration_deinit(vbasedev);
    }
    return !ret;
}

/* Tear down migration support (if initialized) and drop any blocker */
void vfio_migration_exit(VFIODevice *vbasedev)
{
    if (vbasedev->migration) {
        vfio_migration_deinit(vbasedev);
    }

    migrate_del_blocker(&vbasedev->migration_blocker);
}