1 /* 2 * Migration support for VFIO devices 3 * 4 * Copyright NVIDIA, Inc. 2020 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2. See 7 * the COPYING file in the top-level directory. 8 */ 9 10 #include "qemu/osdep.h" 11 #include "qemu/main-loop.h" 12 #include "qemu/cutils.h" 13 #include "qemu/units.h" 14 #include "qemu/error-report.h" 15 #include <linux/vfio.h> 16 #include <sys/ioctl.h> 17 18 #include "sysemu/runstate.h" 19 #include "hw/vfio/vfio-common.h" 20 #include "migration/migration.h" 21 #include "migration/options.h" 22 #include "migration/savevm.h" 23 #include "migration/vmstate.h" 24 #include "migration/qemu-file.h" 25 #include "migration/register.h" 26 #include "migration/blocker.h" 27 #include "migration/misc.h" 28 #include "qapi/error.h" 29 #include "exec/ramlist.h" 30 #include "exec/ram_addr.h" 31 #include "pci.h" 32 #include "trace.h" 33 #include "hw/hw.h" 34 35 /* 36 * Flags to be used as unique delimiters for VFIO devices in the migration 37 * stream. These flags are composed as: 38 * 0xffffffff => MSB 32-bit all 1s 39 * 0xef10 => Magic ID, represents emulated (virtual) function IO 40 * 0x0000 => 16-bits reserved for flags 41 * 42 * The beginning of state information is marked by _DEV_CONFIG_STATE, 43 * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a 44 * certain state information is marked by _END_OF_STATE. 45 */ 46 #define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) 47 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) 48 #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) 49 #define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) 50 #define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) 51 52 /* 53 * This is an arbitrary size based on migration of mlx5 devices, where typically 54 * total device migration size is on the order of 100s of MB. Testing with 55 * larger values, e.g. 128MB and 1GB, did not show a performance improvement. 56 */ 57 #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB) 58 59 static int64_t bytes_transferred; 60 61 static const char *mig_state_to_str(enum vfio_device_mig_state state) 62 { 63 switch (state) { 64 case VFIO_DEVICE_STATE_ERROR: 65 return "ERROR"; 66 case VFIO_DEVICE_STATE_STOP: 67 return "STOP"; 68 case VFIO_DEVICE_STATE_RUNNING: 69 return "RUNNING"; 70 case VFIO_DEVICE_STATE_STOP_COPY: 71 return "STOP_COPY"; 72 case VFIO_DEVICE_STATE_RESUMING: 73 return "RESUMING"; 74 case VFIO_DEVICE_STATE_PRE_COPY: 75 return "PRE_COPY"; 76 default: 77 return "UNKNOWN STATE"; 78 } 79 } 80 81 static int vfio_migration_set_state(VFIODevice *vbasedev, 82 enum vfio_device_mig_state new_state, 83 enum vfio_device_mig_state recover_state) 84 { 85 VFIOMigration *migration = vbasedev->migration; 86 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + 87 sizeof(struct vfio_device_feature_mig_state), 88 sizeof(uint64_t))] = {}; 89 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 90 struct vfio_device_feature_mig_state *mig_state = 91 (struct vfio_device_feature_mig_state *)feature->data; 92 int ret; 93 94 feature->argsz = sizeof(buf); 95 feature->flags = 96 VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE; 97 mig_state->device_state = new_state; 98 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 99 /* Try to set the device in some good state */ 100 ret = -errno; 101 102 if (recover_state == VFIO_DEVICE_STATE_ERROR) { 103 error_report("%s: Failed setting device state to %s, err: %s. " 104 "Recover state is ERROR. Resetting device", 105 vbasedev->name, mig_state_to_str(new_state), 106 strerror(errno)); 107 108 goto reset_device; 109 } 110 111 error_report( 112 "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s", 113 vbasedev->name, mig_state_to_str(new_state), 114 strerror(errno), mig_state_to_str(recover_state)); 115 116 mig_state->device_state = recover_state; 117 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 118 ret = -errno; 119 error_report( 120 "%s: Failed setting device in recover state, err: %s. Resetting device", 121 vbasedev->name, strerror(errno)); 122 123 goto reset_device; 124 } 125 126 migration->device_state = recover_state; 127 128 return ret; 129 } 130 131 migration->device_state = new_state; 132 if (mig_state->data_fd != -1) { 133 if (migration->data_fd != -1) { 134 /* 135 * This can happen if the device is asynchronously reset and 136 * terminates a data transfer. 137 */ 138 error_report("%s: data_fd out of sync", vbasedev->name); 139 close(mig_state->data_fd); 140 141 return -EBADF; 142 } 143 144 migration->data_fd = mig_state->data_fd; 145 } 146 147 trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state)); 148 149 return 0; 150 151 reset_device: 152 if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) { 153 hw_error("%s: Failed resetting device, err: %s", vbasedev->name, 154 strerror(errno)); 155 } 156 157 migration->device_state = VFIO_DEVICE_STATE_RUNNING; 158 159 return ret; 160 } 161 162 static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev, 163 uint64_t data_size) 164 { 165 VFIOMigration *migration = vbasedev->migration; 166 int ret; 167 168 ret = qemu_file_get_to_fd(f, migration->data_fd, data_size); 169 trace_vfio_load_state_device_data(vbasedev->name, data_size, ret); 170 171 return ret; 172 } 173 174 static int vfio_save_device_config_state(QEMUFile *f, void *opaque) 175 { 176 VFIODevice *vbasedev = opaque; 177 178 qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE); 179 180 if (vbasedev->ops && vbasedev->ops->vfio_save_config) { 181 vbasedev->ops->vfio_save_config(vbasedev, f); 182 } 183 184 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); 185 186 trace_vfio_save_device_config_state(vbasedev->name); 187 188 return qemu_file_get_error(f); 189 } 190 191 static int vfio_load_device_config_state(QEMUFile *f, void *opaque) 192 { 193 VFIODevice *vbasedev = opaque; 194 uint64_t data; 195 196 if (vbasedev->ops && vbasedev->ops->vfio_load_config) { 197 int ret; 198 199 ret = vbasedev->ops->vfio_load_config(vbasedev, f); 200 if (ret) { 201 error_report("%s: Failed to load device config space", 202 vbasedev->name); 203 return ret; 204 } 205 } 206 207 data = qemu_get_be64(f); 208 if (data != VFIO_MIG_FLAG_END_OF_STATE) { 209 error_report("%s: Failed loading device config space, " 210 "end flag incorrect 0x%"PRIx64, vbasedev->name, data); 211 return -EINVAL; 212 } 213 214 trace_vfio_load_device_config_state(vbasedev->name); 215 return qemu_file_get_error(f); 216 } 217 218 static void vfio_migration_cleanup(VFIODevice *vbasedev) 219 { 220 VFIOMigration *migration = vbasedev->migration; 221 222 close(migration->data_fd); 223 migration->data_fd = -1; 224 } 225 226 static int vfio_query_stop_copy_size(VFIODevice *vbasedev, 227 uint64_t *stop_copy_size) 228 { 229 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + 230 sizeof(struct vfio_device_feature_mig_data_size), 231 sizeof(uint64_t))] = {}; 232 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 233 struct vfio_device_feature_mig_data_size *mig_data_size = 234 (struct vfio_device_feature_mig_data_size *)feature->data; 235 236 feature->argsz = sizeof(buf); 237 feature->flags = 238 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE; 239 240 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 241 return -errno; 242 } 243 244 *stop_copy_size = mig_data_size->stop_copy_length; 245 246 return 0; 247 } 248 249 static int vfio_query_precopy_size(VFIOMigration *migration) 250 { 251 struct vfio_precopy_info precopy = { 252 .argsz = sizeof(precopy), 253 }; 254 255 migration->precopy_init_size = 0; 256 migration->precopy_dirty_size = 0; 257 258 if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) { 259 return -errno; 260 } 261 262 migration->precopy_init_size = precopy.initial_bytes; 263 migration->precopy_dirty_size = precopy.dirty_bytes; 264 265 return 0; 266 } 267 268 /* Returns the size of saved data on success and -errno on error */ 269 static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) 270 { 271 ssize_t data_size; 272 273 data_size = read(migration->data_fd, migration->data_buffer, 274 migration->data_buffer_size); 275 if (data_size < 0) { 276 /* 277 * Pre-copy emptied all the device state for now. For more information, 278 * please refer to the Linux kernel VFIO uAPI. 279 */ 280 if (errno == ENOMSG) { 281 return 0; 282 } 283 284 return -errno; 285 } 286 if (data_size == 0) { 287 return 0; 288 } 289 290 qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); 291 qemu_put_be64(f, data_size); 292 qemu_put_buffer(f, migration->data_buffer, data_size); 293 bytes_transferred += data_size; 294 295 trace_vfio_save_block(migration->vbasedev->name, data_size); 296 297 return qemu_file_get_error(f) ?: data_size; 298 } 299 300 static void vfio_update_estimated_pending_data(VFIOMigration *migration, 301 uint64_t data_size) 302 { 303 if (!data_size) { 304 /* 305 * Pre-copy emptied all the device state for now, update estimated sizes 306 * accordingly. 307 */ 308 migration->precopy_init_size = 0; 309 migration->precopy_dirty_size = 0; 310 311 return; 312 } 313 314 if (migration->precopy_init_size) { 315 uint64_t init_size = MIN(migration->precopy_init_size, data_size); 316 317 migration->precopy_init_size -= init_size; 318 data_size -= init_size; 319 } 320 321 migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size, 322 data_size); 323 } 324 325 static bool vfio_precopy_supported(VFIODevice *vbasedev) 326 { 327 VFIOMigration *migration = vbasedev->migration; 328 329 return migration->mig_flags & VFIO_MIGRATION_PRE_COPY; 330 } 331 332 /* ---------------------------------------------------------------------- */ 333 334 static int vfio_save_setup(QEMUFile *f, void *opaque) 335 { 336 VFIODevice *vbasedev = opaque; 337 VFIOMigration *migration = vbasedev->migration; 338 uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; 339 340 qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); 341 342 vfio_query_stop_copy_size(vbasedev, &stop_copy_size); 343 migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE, 344 stop_copy_size); 345 migration->data_buffer = g_try_malloc0(migration->data_buffer_size); 346 if (!migration->data_buffer) { 347 error_report("%s: Failed to allocate migration data buffer", 348 vbasedev->name); 349 return -ENOMEM; 350 } 351 352 if (vfio_precopy_supported(vbasedev)) { 353 int ret; 354 355 switch (migration->device_state) { 356 case VFIO_DEVICE_STATE_RUNNING: 357 ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY, 358 VFIO_DEVICE_STATE_RUNNING); 359 if (ret) { 360 return ret; 361 } 362 363 vfio_query_precopy_size(migration); 364 365 break; 366 case VFIO_DEVICE_STATE_STOP: 367 /* vfio_save_complete_precopy() will go to STOP_COPY */ 368 break; 369 default: 370 return -EINVAL; 371 } 372 } 373 374 trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size); 375 376 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); 377 378 return qemu_file_get_error(f); 379 } 380 381 static void vfio_save_cleanup(void *opaque) 382 { 383 VFIODevice *vbasedev = opaque; 384 VFIOMigration *migration = vbasedev->migration; 385 386 g_free(migration->data_buffer); 387 migration->data_buffer = NULL; 388 migration->precopy_init_size = 0; 389 migration->precopy_dirty_size = 0; 390 migration->initial_data_sent = false; 391 vfio_migration_cleanup(vbasedev); 392 trace_vfio_save_cleanup(vbasedev->name); 393 } 394 395 static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy, 396 uint64_t *can_postcopy) 397 { 398 VFIODevice *vbasedev = opaque; 399 VFIOMigration *migration = vbasedev->migration; 400 401 if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) { 402 return; 403 } 404 405 *must_precopy += 406 migration->precopy_init_size + migration->precopy_dirty_size; 407 408 trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy, 409 *can_postcopy, 410 migration->precopy_init_size, 411 migration->precopy_dirty_size); 412 } 413 414 /* 415 * Migration size of VFIO devices can be as little as a few KBs or as big as 416 * many GBs. This value should be big enough to cover the worst case. 417 */ 418 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB) 419 420 static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy, 421 uint64_t *can_postcopy) 422 { 423 VFIODevice *vbasedev = opaque; 424 VFIOMigration *migration = vbasedev->migration; 425 uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE; 426 427 /* 428 * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is 429 * reported so downtime limit won't be violated. 430 */ 431 vfio_query_stop_copy_size(vbasedev, &stop_copy_size); 432 *must_precopy += stop_copy_size; 433 434 if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) { 435 vfio_query_precopy_size(migration); 436 437 *must_precopy += 438 migration->precopy_init_size + migration->precopy_dirty_size; 439 } 440 441 trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy, 442 stop_copy_size, migration->precopy_init_size, 443 migration->precopy_dirty_size); 444 } 445 446 static bool vfio_is_active_iterate(void *opaque) 447 { 448 VFIODevice *vbasedev = opaque; 449 VFIOMigration *migration = vbasedev->migration; 450 451 return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY; 452 } 453 454 static int vfio_save_iterate(QEMUFile *f, void *opaque) 455 { 456 VFIODevice *vbasedev = opaque; 457 VFIOMigration *migration = vbasedev->migration; 458 ssize_t data_size; 459 460 data_size = vfio_save_block(f, migration); 461 if (data_size < 0) { 462 return data_size; 463 } 464 465 vfio_update_estimated_pending_data(migration, data_size); 466 467 if (migrate_switchover_ack() && !migration->precopy_init_size && 468 !migration->initial_data_sent) { 469 qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT); 470 migration->initial_data_sent = true; 471 } else { 472 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); 473 } 474 475 trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size, 476 migration->precopy_dirty_size); 477 478 /* 479 * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero. 480 * Return 1 so following handlers will not be potentially blocked. 481 */ 482 return 1; 483 } 484 485 static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) 486 { 487 VFIODevice *vbasedev = opaque; 488 ssize_t data_size; 489 int ret; 490 491 /* We reach here with device state STOP or STOP_COPY only */ 492 ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, 493 VFIO_DEVICE_STATE_STOP); 494 if (ret) { 495 return ret; 496 } 497 498 do { 499 data_size = vfio_save_block(f, vbasedev->migration); 500 if (data_size < 0) { 501 return data_size; 502 } 503 } while (data_size); 504 505 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); 506 ret = qemu_file_get_error(f); 507 if (ret) { 508 return ret; 509 } 510 511 /* 512 * If setting the device in STOP state fails, the device should be reset. 513 * To do so, use ERROR state as a recover state. 514 */ 515 ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP, 516 VFIO_DEVICE_STATE_ERROR); 517 trace_vfio_save_complete_precopy(vbasedev->name, ret); 518 519 return ret; 520 } 521 522 static void vfio_save_state(QEMUFile *f, void *opaque) 523 { 524 VFIODevice *vbasedev = opaque; 525 int ret; 526 527 ret = vfio_save_device_config_state(f, opaque); 528 if (ret) { 529 error_report("%s: Failed to save device config space", 530 vbasedev->name); 531 qemu_file_set_error(f, ret); 532 } 533 } 534 535 static int vfio_load_setup(QEMUFile *f, void *opaque) 536 { 537 VFIODevice *vbasedev = opaque; 538 539 return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING, 540 vbasedev->migration->device_state); 541 } 542 543 static int vfio_load_cleanup(void *opaque) 544 { 545 VFIODevice *vbasedev = opaque; 546 547 vfio_migration_cleanup(vbasedev); 548 trace_vfio_load_cleanup(vbasedev->name); 549 550 return 0; 551 } 552 553 static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) 554 { 555 VFIODevice *vbasedev = opaque; 556 int ret = 0; 557 uint64_t data; 558 559 data = qemu_get_be64(f); 560 while (data != VFIO_MIG_FLAG_END_OF_STATE) { 561 562 trace_vfio_load_state(vbasedev->name, data); 563 564 switch (data) { 565 case VFIO_MIG_FLAG_DEV_CONFIG_STATE: 566 { 567 return vfio_load_device_config_state(f, opaque); 568 } 569 case VFIO_MIG_FLAG_DEV_SETUP_STATE: 570 { 571 data = qemu_get_be64(f); 572 if (data == VFIO_MIG_FLAG_END_OF_STATE) { 573 return ret; 574 } else { 575 error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64, 576 vbasedev->name, data); 577 return -EINVAL; 578 } 579 break; 580 } 581 case VFIO_MIG_FLAG_DEV_DATA_STATE: 582 { 583 uint64_t data_size = qemu_get_be64(f); 584 585 if (data_size) { 586 ret = vfio_load_buffer(f, vbasedev, data_size); 587 if (ret < 0) { 588 return ret; 589 } 590 } 591 break; 592 } 593 case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT: 594 { 595 if (!vfio_precopy_supported(vbasedev) || 596 !migrate_switchover_ack()) { 597 error_report("%s: Received INIT_DATA_SENT but switchover ack " 598 "is not used", vbasedev->name); 599 return -EINVAL; 600 } 601 602 ret = qemu_loadvm_approve_switchover(); 603 if (ret) { 604 error_report( 605 "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)", 606 vbasedev->name, ret, strerror(-ret)); 607 } 608 609 return ret; 610 } 611 default: 612 error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data); 613 return -EINVAL; 614 } 615 616 data = qemu_get_be64(f); 617 ret = qemu_file_get_error(f); 618 if (ret) { 619 return ret; 620 } 621 } 622 return ret; 623 } 624 625 static bool vfio_switchover_ack_needed(void *opaque) 626 { 627 VFIODevice *vbasedev = opaque; 628 629 return vfio_precopy_supported(vbasedev); 630 } 631 632 static const SaveVMHandlers savevm_vfio_handlers = { 633 .save_setup = vfio_save_setup, 634 .save_cleanup = vfio_save_cleanup, 635 .state_pending_estimate = vfio_state_pending_estimate, 636 .state_pending_exact = vfio_state_pending_exact, 637 .is_active_iterate = vfio_is_active_iterate, 638 .save_live_iterate = vfio_save_iterate, 639 .save_live_complete_precopy = vfio_save_complete_precopy, 640 .save_state = vfio_save_state, 641 .load_setup = vfio_load_setup, 642 .load_cleanup = vfio_load_cleanup, 643 .load_state = vfio_load_state, 644 .switchover_ack_needed = vfio_switchover_ack_needed, 645 }; 646 647 /* ---------------------------------------------------------------------- */ 648 649 static void vfio_vmstate_change(void *opaque, bool running, RunState state) 650 { 651 VFIODevice *vbasedev = opaque; 652 VFIOMigration *migration = vbasedev->migration; 653 enum vfio_device_mig_state new_state; 654 int ret; 655 656 if (running) { 657 new_state = VFIO_DEVICE_STATE_RUNNING; 658 } else { 659 new_state = 660 (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY && 661 (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ? 662 VFIO_DEVICE_STATE_STOP_COPY : 663 VFIO_DEVICE_STATE_STOP; 664 } 665 666 /* 667 * If setting the device in new_state fails, the device should be reset. 668 * To do so, use ERROR state as a recover state. 669 */ 670 ret = vfio_migration_set_state(vbasedev, new_state, 671 VFIO_DEVICE_STATE_ERROR); 672 if (ret) { 673 /* 674 * Migration should be aborted in this case, but vm_state_notify() 675 * currently does not support reporting failures. 676 */ 677 if (migrate_get_current()->to_dst_file) { 678 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 679 } 680 } 681 682 trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state), 683 mig_state_to_str(new_state)); 684 } 685 686 static void vfio_migration_state_notifier(Notifier *notifier, void *data) 687 { 688 MigrationState *s = data; 689 VFIOMigration *migration = container_of(notifier, VFIOMigration, 690 migration_state); 691 VFIODevice *vbasedev = migration->vbasedev; 692 693 trace_vfio_migration_state_notifier(vbasedev->name, 694 MigrationStatus_str(s->state)); 695 696 switch (s->state) { 697 case MIGRATION_STATUS_CANCELLING: 698 case MIGRATION_STATUS_CANCELLED: 699 case MIGRATION_STATUS_FAILED: 700 /* 701 * If setting the device in RUNNING state fails, the device should 702 * be reset. To do so, use ERROR state as a recover state. 703 */ 704 vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING, 705 VFIO_DEVICE_STATE_ERROR); 706 } 707 } 708 709 static void vfio_migration_free(VFIODevice *vbasedev) 710 { 711 g_free(vbasedev->migration); 712 vbasedev->migration = NULL; 713 } 714 715 static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags) 716 { 717 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + 718 sizeof(struct vfio_device_feature_migration), 719 sizeof(uint64_t))] = {}; 720 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 721 struct vfio_device_feature_migration *mig = 722 (struct vfio_device_feature_migration *)feature->data; 723 724 feature->argsz = sizeof(buf); 725 feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION; 726 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { 727 return -errno; 728 } 729 730 *mig_flags = mig->flags; 731 732 return 0; 733 } 734 735 static bool vfio_dma_logging_supported(VFIODevice *vbasedev) 736 { 737 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), 738 sizeof(uint64_t))] = {}; 739 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; 740 741 feature->argsz = sizeof(buf); 742 feature->flags = VFIO_DEVICE_FEATURE_PROBE | 743 VFIO_DEVICE_FEATURE_DMA_LOGGING_START; 744 745 return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); 746 } 747 748 static int vfio_migration_init(VFIODevice *vbasedev) 749 { 750 int ret; 751 Object *obj; 752 VFIOMigration *migration; 753 char id[256] = ""; 754 g_autofree char *path = NULL, *oid = NULL; 755 uint64_t mig_flags = 0; 756 757 if (!vbasedev->ops->vfio_get_object) { 758 return -EINVAL; 759 } 760 761 obj = vbasedev->ops->vfio_get_object(vbasedev); 762 if (!obj) { 763 return -EINVAL; 764 } 765 766 ret = vfio_migration_query_flags(vbasedev, &mig_flags); 767 if (ret) { 768 return ret; 769 } 770 771 /* Basic migration functionality must be supported */ 772 if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) { 773 return -EOPNOTSUPP; 774 } 775 776 vbasedev->migration = g_new0(VFIOMigration, 1); 777 migration = vbasedev->migration; 778 migration->vbasedev = vbasedev; 779 migration->device_state = VFIO_DEVICE_STATE_RUNNING; 780 migration->data_fd = -1; 781 migration->mig_flags = mig_flags; 782 783 vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev); 784 785 oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); 786 if (oid) { 787 path = g_strdup_printf("%s/vfio", oid); 788 } else { 789 path = g_strdup("vfio"); 790 } 791 strpadcpy(id, sizeof(id), path, '\0'); 792 793 register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers, 794 vbasedev); 795 796 migration->vm_state = qdev_add_vm_change_state_handler(vbasedev->dev, 797 vfio_vmstate_change, 798 vbasedev); 799 migration->migration_state.notify = vfio_migration_state_notifier; 800 add_migration_state_change_notifier(&migration->migration_state); 801 802 return 0; 803 } 804 805 static void vfio_migration_deinit(VFIODevice *vbasedev) 806 { 807 VFIOMigration *migration = vbasedev->migration; 808 809 remove_migration_state_change_notifier(&migration->migration_state); 810 qemu_del_vm_change_state_handler(migration->vm_state); 811 unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); 812 vfio_migration_free(vbasedev); 813 vfio_unblock_multiple_devices_migration(); 814 } 815 816 static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp) 817 { 818 int ret; 819 820 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { 821 error_propagate(errp, err); 822 return -EINVAL; 823 } 824 825 vbasedev->migration_blocker = error_copy(err); 826 error_free(err); 827 828 ret = migrate_add_blocker(vbasedev->migration_blocker, errp); 829 if (ret < 0) { 830 error_free(vbasedev->migration_blocker); 831 vbasedev->migration_blocker = NULL; 832 } 833 834 return ret; 835 } 836 837 /* ---------------------------------------------------------------------- */ 838 839 int64_t vfio_mig_bytes_transferred(void) 840 { 841 return bytes_transferred; 842 } 843 844 void vfio_reset_bytes_transferred(void) 845 { 846 bytes_transferred = 0; 847 } 848 849 /* 850 * Return true when either migration initialized or blocker registered. 851 * Currently only return false when adding blocker fails which will 852 * de-register vfio device. 853 */ 854 bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) 855 { 856 Error *err = NULL; 857 int ret; 858 859 if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) { 860 error_setg(&err, "%s: Migration is disabled for VFIO device", 861 vbasedev->name); 862 return !vfio_block_migration(vbasedev, err, errp); 863 } 864 865 ret = vfio_migration_init(vbasedev); 866 if (ret) { 867 if (ret == -ENOTTY) { 868 error_setg(&err, "%s: VFIO migration is not supported in kernel", 869 vbasedev->name); 870 } else { 871 error_setg(&err, 872 "%s: Migration couldn't be initialized for VFIO device, " 873 "err: %d (%s)", 874 vbasedev->name, ret, strerror(-ret)); 875 } 876 877 return !vfio_block_migration(vbasedev, err, errp); 878 } 879 880 if (!vbasedev->dirty_pages_supported) { 881 if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { 882 error_setg(&err, 883 "%s: VFIO device doesn't support device dirty tracking", 884 vbasedev->name); 885 goto add_blocker; 886 } 887 888 warn_report("%s: VFIO device doesn't support device dirty tracking", 889 vbasedev->name); 890 } 891 892 ret = vfio_block_multiple_devices_migration(vbasedev, errp); 893 if (ret) { 894 goto out_deinit; 895 } 896 897 if (vfio_viommu_preset(vbasedev)) { 898 error_setg(&err, "%s: Migration is currently not supported " 899 "with vIOMMU enabled", vbasedev->name); 900 goto add_blocker; 901 } 902 903 trace_vfio_migration_realize(vbasedev->name); 904 return true; 905 906 add_blocker: 907 ret = vfio_block_migration(vbasedev, err, errp); 908 out_deinit: 909 if (ret) { 910 vfio_migration_deinit(vbasedev); 911 } 912 return !ret; 913 } 914 915 void vfio_migration_exit(VFIODevice *vbasedev) 916 { 917 if (vbasedev->migration) { 918 vfio_migration_deinit(vbasedev); 919 } 920 921 if (vbasedev->migration_blocker) { 922 migrate_del_blocker(vbasedev->migration_blocker); 923 error_free(vbasedev->migration_blocker); 924 vbasedev->migration_blocker = NULL; 925 } 926 } 927