/*
 * Migration support for VFIO devices
 *
 * Copyright NVIDIA, Inc. 2020
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include "sysemu/runstate.h"
#include "hw/vfio/vfio-common.h"
#include "migration/migration.h"
#include "migration/options.h"
#include "migration/savevm.h"
#include "migration/vmstate.h"
#include "migration/qemu-file.h"
#include "migration/register.h"
#include "migration/blocker.h"
#include "migration/misc.h"
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/ram_addr.h"
#include "pci.h"
#include "trace.h"
#include "hw/hw.h"

/*
 * Flags to be used as unique delimiters for VFIO devices in the migration
 * stream. These flags are composed as:
 * 0xffffffff => MSB 32-bit all 1s
 * 0xef10     => Magic ID, represents emulated (virtual) function IO
 * 0x0000     => 16-bits reserved for flags
 *
 * The beginning of state information is marked by _DEV_CONFIG_STATE,
 * _DEV_SETUP_STATE, or _DEV_DATA_STATE; the end of a state section is
 * marked by _END_OF_STATE.
 */
#define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
#define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)

/*
 * This is an arbitrary size based on migration of mlx5 devices, where typically
 * total device migration size is on the order of 100s of MB. Testing with
 * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

static int64_t bytes_transferred;

static const char *mig_state_to_str(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_ERROR:
        return "ERROR";
    case VFIO_DEVICE_STATE_STOP:
        return "STOP";
    case VFIO_DEVICE_STATE_RUNNING:
        return "RUNNING";
    case VFIO_DEVICE_STATE_STOP_COPY:
        return "STOP_COPY";
    case VFIO_DEVICE_STATE_RESUMING:
        return "RESUMING";
    case VFIO_DEVICE_STATE_RUNNING_P2P:
        return "RUNNING_P2P";
    case VFIO_DEVICE_STATE_PRE_COPY:
        return "PRE_COPY";
    case VFIO_DEVICE_STATE_PRE_COPY_P2P:
        return "PRE_COPY_P2P";
    default:
        return "UNKNOWN STATE";
    }
}
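
/*
 * Set the device to new_state via the VFIO_DEVICE_FEATURE ioctl. The feature
 * argument is a variable-length struct, so it is built inside a uint64_t
 * array to get storage that is large enough and suitably aligned for the
 * header plus the trailing vfio_device_feature_mig_state payload.
 *
 * On failure, the device is moved to recover_state instead; if that also
 * fails, or if recover_state is ERROR, the device is reset to bring it back
 * to a known good state (RUNNING).
 */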
static int vfio_migration_set_state(VFIODevice *vbasedev,
                                    enum vfio_device_mig_state new_state,
                                    enum vfio_device_mig_state recover_state)
{
    VFIOMigration *migration = vbasedev->migration;
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_report("%s: Failed setting device state to %s, err: %s. "
                         "Recover state is ERROR. Resetting device",
                         vbasedev->name, mig_state_to_str(new_state),
                         strerror(errno));

            goto reset_device;
        }

        error_report(
            "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s",
            vbasedev->name, mig_state_to_str(new_state),
            strerror(errno), mig_state_to_str(recover_state));

        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            error_report(
                "%s: Failed setting device in recover state, err: %s. Resetting device",
                vbasedev->name, strerror(errno));

            goto reset_device;
        }

        migration->device_state = recover_state;

        return ret;
    }

    migration->device_state = new_state;
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_report("%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        migration->data_fd = mig_state->data_fd;
    }

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    migration->device_state = VFIO_DEVICE_STATE_RUNNING;

    return ret;
}

/*
 * Some device state transitions require resetting the device if they fail.
 * This function sets the device in new_state and resets the device if that
 * fails. Reset is done by using ERROR as the recover state.
 */
static int
vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
                                  enum vfio_device_mig_state new_state)
{
    return vfio_migration_set_state(vbasedev, new_state,
                                    VFIO_DEVICE_STATE_ERROR);
}

static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}

static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        vbasedev->ops->vfio_save_config(vbasedev, f);
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    return qemu_file_get_error(f);
}

static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t data;

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret;

        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    data = qemu_get_be64(f);
    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
        error_report("%s: Failed loading device config space, "
                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
        return -EINVAL;
    }

    trace_vfio_load_device_config_state(vbasedev->name);
    return qemu_file_get_error(f);
}

static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    close(migration->data_fd);
    migration->data_fd = -1;
}

static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}

static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info precopy = {
        .argsz = sizeof(precopy),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
        return -errno;
    }

    migration->precopy_init_size = precopy.initial_bytes;
    migration->precopy_dirty_size = precopy.dirty_bytes;

    return 0;
}
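
/*
 * Each device data record in the migration stream is framed as:
 *   VFIO_MIG_FLAG_DEV_DATA_STATE | data_size (be64) | data_size bytes
 * where the payload is whatever one read() of the device's data_fd
 * returned, up to data_buffer_size bytes.
 */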
/* Returns the size of saved data on success and -errno on error */
static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
    ssize_t data_size;

    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
        /*
         * Pre-copy emptied all the device state for now. For more
         * information, please refer to the Linux kernel VFIO uAPI.
         */
        if (errno == ENOMSG) {
            return 0;
        }

        return -errno;
    }
    if (data_size == 0) {
        return 0;
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
    qemu_put_be64(f, data_size);
    qemu_put_buffer(f, migration->data_buffer, data_size);
    bytes_transferred += data_size;

    trace_vfio_save_block(migration->vbasedev->name, data_size);

    return qemu_file_get_error(f) ?: data_size;
}

static void vfio_update_estimated_pending_data(VFIOMigration *migration,
                                               uint64_t data_size)
{
    if (!data_size) {
        /*
         * Pre-copy emptied all the device state for now, update estimated sizes
         * accordingly.
         */
        migration->precopy_init_size = 0;
        migration->precopy_dirty_size = 0;

        return;
    }

    if (migration->precopy_init_size) {
        uint64_t init_size = MIN(migration->precopy_init_size, data_size);

        migration->precopy_init_size -= init_size;
        data_size -= init_size;
    }

    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
                                         data_size);
}

static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
}

/* ---------------------------------------------------------------------- */

static int vfio_save_prepare(void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;

    /*
     * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
     * even if they are on.
     */
    if (runstate_check(RUN_STATE_SAVE_VM)) {
        return 0;
    }

    if (migrate_postcopy_ram()) {
        error_setg(
            errp, "%s: VFIO migration is not supported with postcopy migration",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    if (migrate_background_snapshot()) {
        error_setg(
            errp,
            "%s: VFIO migration is not supported with background snapshot",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    return 0;
}
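
/*
 * The data buffer is sized to MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
 * stop_copy_size), so a device whose complete state is smaller than the
 * default doesn't get a needlessly large buffer. If pre-copy is supported
 * and the device is RUNNING, it is moved to PRE_COPY here so it can start
 * producing state data while the VM is still live.
 */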
static int vfio_save_setup(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_report("%s: Failed to allocate migration data buffer",
                     vbasedev->name);
        return -ENOMEM;
    }

    if (vfio_precopy_supported(vbasedev)) {
        int ret;

        switch (migration->device_state) {
        case VFIO_DEVICE_STATE_RUNNING:
            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
                                           VFIO_DEVICE_STATE_RUNNING);
            if (ret) {
                return ret;
            }

            vfio_query_precopy_size(migration);

            break;
        case VFIO_DEVICE_STATE_STOP:
            /* vfio_save_complete_precopy() will go to STOP_COPY */
            break;
        default:
            return -EINVAL;
        }
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    return qemu_file_get_error(f);
}

static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    /*
     * Changing device state from STOP_COPY to STOP can take time. Do it here,
     * after migration has completed, so it won't increase downtime.
     */
    if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
        vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_STOP);
    }

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;
    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}
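
/*
 * The "estimate" callback only reports the cached pre-copy sizes, which are
 * cheap to read; the "exact" callback below re-queries the device (including
 * the stop-copy size) with ioctls and is therefore more expensive.
 */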
static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
                                        uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    if (!vfio_device_state_is_precopy(vbasedev)) {
        return;
    }

    *must_precopy +=
        migration->precopy_init_size + migration->precopy_dirty_size;

    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
                                      *can_postcopy,
                                      migration->precopy_init_size,
                                      migration->precopy_dirty_size);
}

/*
 * Migration size of VFIO devices can be as little as a few KBs or as big as
 * many GBs. This value should be big enough to cover the worst case.
 */
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)

static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
     * reported so downtime limit won't be violated.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    if (vfio_device_state_is_precopy(vbasedev)) {
        vfio_query_precopy_size(migration);

        *must_precopy +=
            migration->precopy_init_size + migration->precopy_dirty_size;
    }

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size, migration->precopy_init_size,
                                   migration->precopy_dirty_size);
}

static bool vfio_is_active_iterate(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_device_state_is_precopy(vbasedev);
}
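
/*
 * With switchover-ack enabled, a one-time DEV_INIT_DATA_SENT marker is sent
 * in place of the usual END_OF_STATE once all of the device's initial
 * pre-copy data has been sent (i.e. precopy_init_size has reached zero).
 * The destination approves the switchover only after processing this marker,
 * see vfio_load_state() below.
 */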
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t data_size;

    data_size = vfio_save_block(f, migration);
    if (data_size < 0) {
        return data_size;
    }

    vfio_update_estimated_pending_data(migration, data_size);

    if (migrate_switchover_ack() && !migration->precopy_init_size &&
        !migration->initial_data_sent) {
        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
        migration->initial_data_sent = true;
    } else {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    }

    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                            migration->precopy_dirty_size);

    /*
     * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero.
     * Return 1 so following handlers will not be potentially blocked.
     */
    return 1;
}

static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    ssize_t data_size;
    int ret;

    /* We reach here with device state STOP or STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP);
    if (ret) {
        return ret;
    }

    do {
        data_size = vfio_save_block(f, vbasedev->migration);
        if (data_size < 0) {
            return data_size;
        }
    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);
    if (ret) {
        return ret;
    }

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}

static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    ret = vfio_save_device_config_state(f, opaque);
    if (ret) {
        error_report("%s: Failed to save device config space",
                     vbasedev->name);
        qemu_file_set_error(f, ret);
    }
}

static int vfio_load_setup(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
                                    vbasedev->migration->device_state);
}

static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}
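
/*
 * Counterpart of the save handlers above: records are consumed until the
 * outer END_OF_STATE delimiter is reached, dispatching on the
 * VFIO_MIG_FLAG_* tag that starts each record.
 */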
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
        {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}

static bool vfio_switchover_ack_needed(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_precopy_supported(vbasedev);
}

static const SaveVMHandlers savevm_vfio_handlers = {
    .save_prepare = vfio_save_prepare,
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
    .is_active_iterate = vfio_is_active_iterate,
    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
    .switchover_ack_needed = vfio_switchover_ack_needed,
};

/* ---------------------------------------------------------------------- */
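
/*
 * The "prepare" callback runs before the VM stop is delivered to devices.
 * A device that advertises VFIO_MIGRATION_P2P is first moved to a *_P2P
 * quiescent state in which, per the VFIO uAPI, it stops initiating new P2P
 * DMA transactions, so that multiple devices can then be fully stopped in
 * any order by the regular handler below.
 */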
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    int ret;

    new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
                    VFIO_DEVICE_STATE_PRE_COPY_P2P :
                    VFIO_DEVICE_STATE_RUNNING_P2P;

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        if (migrate_get_current()->to_dst_file) {
            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        }
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(new_state));
}

static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
    VFIODevice *vbasedev = opaque;
    enum vfio_device_mig_state new_state;
    int ret;

    if (running) {
        new_state = VFIO_DEVICE_STATE_RUNNING;
    } else {
        new_state =
            (vfio_device_state_is_precopy(vbasedev) &&
             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
                VFIO_DEVICE_STATE_STOP_COPY :
                VFIO_DEVICE_STATE_STOP;
    }

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        if (migrate_get_current()->to_dst_file) {
            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        }
    }

    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
                              mig_state_to_str(new_state));
}

static void vfio_migration_state_notifier(Notifier *notifier, void *data)
{
    MigrationState *s = data;
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;

    trace_vfio_migration_state_notifier(vbasedev->name,
                                        MigrationStatus_str(s->state));

    switch (s->state) {
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_FAILED:
        vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_RUNNING);
    }
}

static void vfio_migration_free(VFIODevice *vbasedev)
{
    g_free(vbasedev->migration);
    vbasedev->migration = NULL;
}

static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}

static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}
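
/*
 * Query the device's migration capabilities and, if the mandatory
 * VFIO_MIGRATION_STOP_COPY is supported, set everything up: register the
 * savevm handlers under a "<vmstate-id>/vfio" instance id, hook VM run
 * state changes (with a prepare callback only for P2P-capable devices), and
 * watch the overall migration state so the device can be returned to
 * RUNNING on cancellation or failure.
 */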
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;
    VMChangeStateHandler *prepare_cb;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
                     vfio_vmstate_change_prepare :
                     NULL;
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}

static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    migration_remove_notifier(&migration->migration_state);
    qemu_del_vm_change_state_handler(migration->vm_state);
    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}

static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
{
    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_propagate(errp, err);
        return -EINVAL;
    }

    vbasedev->migration_blocker = error_copy(err);
    error_free(err);

    return migrate_add_blocker(&vbasedev->migration_blocker, errp);
}

/* ---------------------------------------------------------------------- */

int64_t vfio_mig_bytes_transferred(void)
{
    return bytes_transferred;
}

void vfio_reset_bytes_transferred(void)
{
    bytes_transferred = 0;
}

/*
 * Return true when either migration is initialized or a blocker is
 * registered. Currently we only return false when adding the blocker
 * fails, which will de-register the VFIO device.
 */
bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
    Error *err = NULL;
    int ret;

    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
        error_setg(&err, "%s: Migration is disabled for VFIO device",
                   vbasedev->name);
        return !vfio_block_migration(vbasedev, err, errp);
    }

    ret = vfio_migration_init(vbasedev);
    if (ret) {
        if (ret == -ENOTTY) {
            error_setg(&err, "%s: VFIO migration is not supported in kernel",
                       vbasedev->name);
        } else {
            error_setg(&err,
                       "%s: Migration couldn't be initialized for VFIO device, "
                       "err: %d (%s)",
                       vbasedev->name, ret, strerror(-ret));
        }

        return !vfio_block_migration(vbasedev, err, errp);
    }

    if (!vbasedev->dirty_pages_supported) {
        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
            error_setg(&err,
                       "%s: VFIO device doesn't support device dirty tracking",
                       vbasedev->name);
            goto add_blocker;
        }

        warn_report("%s: VFIO device doesn't support device dirty tracking",
                    vbasedev->name);
    }

    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
    if (ret) {
        goto out_deinit;
    }

    if (vfio_viommu_preset(vbasedev)) {
        error_setg(&err, "%s: Migration is currently not supported "
                   "with vIOMMU enabled", vbasedev->name);
        goto add_blocker;
    }

    trace_vfio_migration_realize(vbasedev->name);
    return true;

add_blocker:
    ret = vfio_block_migration(vbasedev, err, errp);
out_deinit:
    if (ret) {
        vfio_migration_deinit(vbasedev);
    }
    return !ret;
}

void vfio_migration_exit(VFIODevice *vbasedev)
{
    if (vbasedev->migration) {
        vfio_migration_deinit(vbasedev);
    }

    migrate_del_blocker(&vbasedev->migration_blocker);
}