xref: /openbmc/qemu/hw/vfio/migration.c (revision a0359b56)
1 /*
2  * Migration support for VFIO devices
3  *
4  * Copyright NVIDIA, Inc. 2020
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2. See
7  * the COPYING file in the top-level directory.
8  */
9 
#include "qemu/osdep.h"
#include "qemu/atomic.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include "sysemu/runstate.h"
#include "hw/vfio/vfio-common.h"
#include "migration/misc.h"
#include "migration/savevm.h"
#include "migration/vmstate.h"
#include "migration/qemu-file.h"
#include "migration/register.h"
#include "migration/blocker.h"
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/ram_addr.h"
#include "pci.h"
#include "trace.h"
#include "hw/hw.h"
32 
33 /*
34  * Flags to be used as unique delimiters for VFIO devices in the migration
35  * stream. These flags are composed as:
36  * 0xffffffff => MSB 32-bit all 1s
37  * 0xef10     => Magic ID, represents emulated (virtual) function IO
38  * 0x0000     => 16-bits reserved for flags
39  *
40  * The beginning of state information is marked by _DEV_CONFIG_STATE,
41  * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
42  * certain state information is marked by _END_OF_STATE.
43  */
44 #define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
45 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
46 #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
47 #define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
48 #define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
49 
50 /*
51  * This is an arbitrary size based on migration of mlx5 devices, where typically
52  * total device migration size is on the order of 100s of MB. Testing with
53  * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
54  */
55 #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)
56 
57 static int64_t bytes_transferred;
58 
59 static const char *mig_state_to_str(enum vfio_device_mig_state state)
60 {
61     switch (state) {
62     case VFIO_DEVICE_STATE_ERROR:
63         return "ERROR";
64     case VFIO_DEVICE_STATE_STOP:
65         return "STOP";
66     case VFIO_DEVICE_STATE_RUNNING:
67         return "RUNNING";
68     case VFIO_DEVICE_STATE_STOP_COPY:
69         return "STOP_COPY";
70     case VFIO_DEVICE_STATE_RESUMING:
71         return "RESUMING";
72     case VFIO_DEVICE_STATE_RUNNING_P2P:
73         return "RUNNING_P2P";
74     case VFIO_DEVICE_STATE_PRE_COPY:
75         return "PRE_COPY";
76     case VFIO_DEVICE_STATE_PRE_COPY_P2P:
77         return "PRE_COPY_P2P";
78     default:
79         return "UNKNOWN STATE";
80     }
81 }
82 
/*
 * Set the device's migration state to @new_state using the
 * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE ioctl.
 *
 * On failure, try to put the device into @recover_state instead. If
 * @recover_state is ERROR, or entering it also fails, reset the device
 * (see the reset_device label), which leaves it in RUNNING.
 *
 * Returns 0 on success and a negative errno on failure; the errno of the
 * first failing ioctl is the value propagated to the caller. On success,
 * any data_fd returned by the kernel is stored in migration->data_fd.
 */
static int vfio_migration_set_state(VFIODevice *vbasedev,
                                    enum vfio_device_mig_state new_state,
                                    enum vfio_device_mig_state recover_state,
                                    Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    /*
     * uint64_t-aligned storage large enough for the feature header plus
     * the mig_state payload that follows it.
     */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;
    g_autofree char *error_prefix =
        g_strdup_printf("%s: Failed setting device state to %s.",
                        vbasedev->name, mig_state_to_str(new_state));

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_setg_errno(errp, errno,
                             "%s Recover state is ERROR. Resetting device",
                             error_prefix);

            goto reset_device;
        }

        error_setg_errno(errp, errno,
                         "%s Setting device in recover state %s",
                         error_prefix, mig_state_to_str(recover_state));

        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            /*
             * If setting the device in recover state fails, report
             * the error here and propagate the first error.
             */
            error_report(
                "%s: Failed setting device in recover state, err: %s. Resetting device",
                         vbasedev->name, strerror(errno));

            goto reset_device;
        }

        migration->device_state = recover_state;

        return ret;
    }

    migration->device_state = new_state;
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_setg(errp, "%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        /* The kernel returned a new fd for transferring migration data */
        migration->data_fd = mig_state->data_fd;
    }

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    /* A successful reset leaves the device in the RUNNING state */
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;

    return ret;
}
169 
170 /*
171  * Some device state transitions require resetting the device if they fail.
172  * This function sets the device in new_state and resets the device if that
173  * fails. Reset is done by using ERROR as the recover state.
174  */
static int
vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
                                  enum vfio_device_mig_state new_state,
                                  Error **errp)
{
    /* ERROR as recover state makes vfio_migration_set_state() reset on failure */
    return vfio_migration_set_state(vbasedev, new_state,
                                    VFIO_DEVICE_STATE_ERROR, errp);
}
183 
184 static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
185                             uint64_t data_size)
186 {
187     VFIOMigration *migration = vbasedev->migration;
188     int ret;
189 
190     ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
191     trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);
192 
193     return ret;
194 }
195 
196 static int vfio_save_device_config_state(QEMUFile *f, void *opaque,
197                                          Error **errp)
198 {
199     VFIODevice *vbasedev = opaque;
200     int ret;
201 
202     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);
203 
204     if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
205         ret = vbasedev->ops->vfio_save_config(vbasedev, f, errp);
206         if (ret) {
207             return ret;
208         }
209     }
210 
211     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
212 
213     trace_vfio_save_device_config_state(vbasedev->name);
214 
215     ret = qemu_file_get_error(f);
216     if (ret < 0) {
217         error_setg_errno(errp, -ret, "Failed to save state");
218     }
219     return ret;
220 }
221 
222 static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
223 {
224     VFIODevice *vbasedev = opaque;
225     uint64_t data;
226 
227     if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
228         int ret;
229 
230         ret = vbasedev->ops->vfio_load_config(vbasedev, f);
231         if (ret) {
232             error_report("%s: Failed to load device config space",
233                          vbasedev->name);
234             return ret;
235         }
236     }
237 
238     data = qemu_get_be64(f);
239     if (data != VFIO_MIG_FLAG_END_OF_STATE) {
240         error_report("%s: Failed loading device config space, "
241                      "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
242         return -EINVAL;
243     }
244 
245     trace_vfio_load_device_config_state(vbasedev->name);
246     return qemu_file_get_error(f);
247 }
248 
249 static void vfio_migration_cleanup(VFIODevice *vbasedev)
250 {
251     VFIOMigration *migration = vbasedev->migration;
252 
253     close(migration->data_fd);
254     migration->data_fd = -1;
255 }
256 
/*
 * Query the total device state size for the STOP_COPY phase via
 * VFIO_DEVICE_FEATURE_MIG_DATA_SIZE.
 *
 * Returns 0 and fills @stop_copy_size on success; returns -errno on
 * failure, leaving *stop_copy_size untouched (callers rely on this to
 * keep a default value).
 */
static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    /* uint64_t-aligned storage for the feature header plus its payload */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}
279 
/*
 * Refresh migration->precopy_init_size and ->precopy_dirty_size from the
 * device's data_fd via VFIO_MIG_GET_PRECOPY_INFO.
 *
 * Both estimates are zeroed before the ioctl so that they are left in a
 * consistent (zero) state if the query fails. Returns 0 or -errno.
 */
static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info precopy = {
        .argsz = sizeof(precopy),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
        return -errno;
    }

    migration->precopy_init_size = precopy.initial_bytes;
    migration->precopy_dirty_size = precopy.dirty_bytes;

    return 0;
}
298 
299 /* Returns the size of saved data on success and -errno on error */
300 static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
301 {
302     ssize_t data_size;
303 
304     data_size = read(migration->data_fd, migration->data_buffer,
305                      migration->data_buffer_size);
306     if (data_size < 0) {
307         /*
308          * Pre-copy emptied all the device state for now. For more information,
309          * please refer to the Linux kernel VFIO uAPI.
310          */
311         if (errno == ENOMSG) {
312             return 0;
313         }
314 
315         return -errno;
316     }
317     if (data_size == 0) {
318         return 0;
319     }
320 
321     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
322     qemu_put_be64(f, data_size);
323     qemu_put_buffer(f, migration->data_buffer, data_size);
324     bytes_transferred += data_size;
325 
326     trace_vfio_save_block(migration->vbasedev->name, data_size);
327 
328     return qemu_file_get_error(f) ?: data_size;
329 }
330 
331 static void vfio_update_estimated_pending_data(VFIOMigration *migration,
332                                                uint64_t data_size)
333 {
334     if (!data_size) {
335         /*
336          * Pre-copy emptied all the device state for now, update estimated sizes
337          * accordingly.
338          */
339         migration->precopy_init_size = 0;
340         migration->precopy_dirty_size = 0;
341 
342         return;
343     }
344 
345     if (migration->precopy_init_size) {
346         uint64_t init_size = MIN(migration->precopy_init_size, data_size);
347 
348         migration->precopy_init_size -= init_size;
349         data_size -= init_size;
350     }
351 
352     migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
353                                          data_size);
354 }
355 
356 static bool vfio_precopy_supported(VFIODevice *vbasedev)
357 {
358     VFIOMigration *migration = vbasedev->migration;
359 
360     return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
361 }
362 
363 /* ---------------------------------------------------------------------- */
364 
365 static int vfio_save_prepare(void *opaque, Error **errp)
366 {
367     VFIODevice *vbasedev = opaque;
368 
369     /*
370      * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
371      * even if they are on.
372      */
373     if (runstate_check(RUN_STATE_SAVE_VM)) {
374         return 0;
375     }
376 
377     if (migrate_postcopy_ram()) {
378         error_setg(
379             errp, "%s: VFIO migration is not supported with postcopy migration",
380             vbasedev->name);
381         return -EOPNOTSUPP;
382     }
383 
384     if (migrate_background_snapshot()) {
385         error_setg(
386             errp,
387             "%s: VFIO migration is not supported with background snapshot",
388             vbasedev->name);
389         return -EOPNOTSUPP;
390     }
391 
392     return 0;
393 }
394 
/*
 * .save_setup handler: allocate the device-state transfer buffer and, for
 * pre-copy capable devices, move the device into the PRE_COPY state.
 * Returns 0 on success, negative errno on failure.
 */
static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
    int ret;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    /*
     * The return value is deliberately ignored: on failure stop_copy_size
     * keeps its default, which is a valid buffer size.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_setg(errp, "%s: Failed to allocate migration data buffer",
                   vbasedev->name);
        return -ENOMEM;
    }

    if (vfio_precopy_supported(vbasedev)) {
        switch (migration->device_state) {
        case VFIO_DEVICE_STATE_RUNNING:
            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
                                           VFIO_DEVICE_STATE_RUNNING, errp);
            if (ret) {
                return ret;
            }

            /* On query failure both estimates stay 0, which is harmless */
            vfio_query_precopy_size(migration);

            break;
        case VFIO_DEVICE_STATE_STOP:
            /* vfio_save_complete_precopy() will go to STOP_COPY */
            break;
        default:
            error_setg(errp, "%s: Invalid device state %d", vbasedev->name,
                       migration->device_state);
            return -EINVAL;
        }
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "%s: save setup failed", vbasedev->name);
    }

    return ret;
}
447 
/*
 * .save_cleanup handler: move the device out of STOP_COPY, free the
 * transfer buffer, reset the pre-copy bookkeeping and close the data_fd.
 */
static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    Error *local_err = NULL;
    int ret;

    /*
     * Changing device state from STOP_COPY to STOP can take time. Do it here,
     * after migration has completed, so it won't increase downtime.
     */
    if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
        ret = vfio_migration_set_state_or_reset(vbasedev,
                                                VFIO_DEVICE_STATE_STOP,
                                                &local_err);
        if (ret) {
            error_report_err(local_err);
        }
    }

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;
    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}
476 
477 static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
478                                         uint64_t *can_postcopy)
479 {
480     VFIODevice *vbasedev = opaque;
481     VFIOMigration *migration = vbasedev->migration;
482 
483     if (!vfio_device_state_is_precopy(vbasedev)) {
484         return;
485     }
486 
487     *must_precopy +=
488         migration->precopy_init_size + migration->precopy_dirty_size;
489 
490     trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
491                                       *can_postcopy,
492                                       migration->precopy_init_size,
493                                       migration->precopy_dirty_size);
494 }
495 
496 /*
497  * Migration size of VFIO devices can be as little as a few KBs or as big as
498  * many GBs. This value should be big enough to cover the worst case.
499  */
500 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
501 
/*
 * .state_pending_exact handler: report the precise remaining device data,
 * used by the migration core to decide whether the downtime limit allows
 * completing the migration.
 */
static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
     * reported so downtime limit won't be violated.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    if (vfio_device_state_is_precopy(vbasedev)) {
        /* Also add the up-to-date pre-copy estimates */
        vfio_query_precopy_size(migration);

        *must_precopy +=
            migration->precopy_init_size + migration->precopy_dirty_size;
    }

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size, migration->precopy_init_size,
                                   migration->precopy_dirty_size);
}
527 
528 static bool vfio_is_active_iterate(void *opaque)
529 {
530     VFIODevice *vbasedev = opaque;
531 
532     return vfio_device_state_is_precopy(vbasedev);
533 }
534 
535 /*
536  * Note about migration rate limiting: VFIO migration buffer size is currently
537  * limited to 1MB, so there is no need to check if migration rate exceeded (as
538  * in the worst case it will exceed by 1MB). However, if the buffer size is
539  * later changed to a bigger value, migration rate should be enforced here.
540  */
/*
 * .save_live_iterate handler: send one block of pre-copy device data.
 *
 * Returns a negative errno on error. Otherwise returns 0 while more
 * pre-copy data is estimated to remain, and nonzero once both estimates
 * reach zero, letting the migration core move on.
 */
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t data_size;

    data_size = vfio_save_block(f, migration);
    if (data_size < 0) {
        return data_size;
    }

    vfio_update_estimated_pending_data(migration, data_size);

    /*
     * With switchover-ack enabled, announce (only once) that all initial
     * data has been sent, so the destination can approve the switchover.
     */
    if (migrate_switchover_ack() && !migration->precopy_init_size &&
        !migration->initial_data_sent) {
        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
        migration->initial_data_sent = true;
    } else {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    }

    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                            migration->precopy_dirty_size);

    return !migration->precopy_init_size && !migration->precopy_dirty_size;
}
567 
/*
 * .save_live_complete_precopy handler: with the VM stopped, move the
 * device to STOP_COPY and drain all remaining device state into the
 * stream. Returns 0 on success, negative errno on failure.
 */
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    ssize_t data_size;
    int ret;
    Error *local_err = NULL;

    /* We reach here with device state STOP or STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP, &local_err);
    if (ret) {
        error_report_err(local_err);
        return ret;
    }

    /* Keep sending blocks until the device reports no more data */
    do {
        data_size = vfio_save_block(f, vbasedev->migration);
        if (data_size < 0) {
            return data_size;
        }
    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}
597 
/*
 * .save_state handler: emit the device config state. This handler returns
 * void, so failures are stored into the QEMUFile instead.
 */
static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    Error *local_err = NULL;
    int ret;

    ret = vfio_save_device_config_state(f, opaque, &local_err);
    if (ret) {
        error_prepend(&local_err,
                      "vfio: Failed to save device config space of %s - ",
                      vbasedev->name);
        qemu_file_set_error_obj(f, ret, local_err);
    }
}
612 
613 static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
614 {
615     VFIODevice *vbasedev = opaque;
616 
617     return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
618                                     vbasedev->migration->device_state, errp);
619 }
620 
/* .load_cleanup handler: close the data_fd opened for resuming */
static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}
630 
/*
 * .load_state handler: consume records delimited by the VFIO_MIG_FLAG_*
 * markers until END_OF_STATE is reached. Returns 0 on success, negative
 * errno on malformed input or device errors.
 */
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            /* Config state is the final record; return from here */
            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            /* The setup record carries no payload, only its end marker */
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            /* Feed the chunk straight into the device's data_fd */
            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
        {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            /* All initial data has arrived; allow the switchover */
            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}
702 
703 static bool vfio_switchover_ack_needed(void *opaque)
704 {
705     VFIODevice *vbasedev = opaque;
706 
707     return vfio_precopy_supported(vbasedev);
708 }
709 
/*
 * SaveVMHandlers for VFIO devices, registered per device by
 * vfio_migration_init() via register_savevm_live().
 */
static const SaveVMHandlers savevm_vfio_handlers = {
    .save_prepare = vfio_save_prepare,
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
    .is_active_iterate = vfio_is_active_iterate,
    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
    .switchover_ack_needed = vfio_switchover_ack_needed,
};
725 
726 /* ---------------------------------------------------------------------- */
727 
/*
 * "prepare" VM state change callback, installed only for devices that
 * advertise VFIO_MIGRATION_P2P (see vfio_migration_init()). Moves the
 * device into the matching *_P2P intermediate state ahead of the final
 * vfio_vmstate_change() transition.
 */
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    Error *local_err = NULL;
    int ret;

    /* Stay in pre-copy (PRE_COPY_P2P) if a pre-copy session is active */
    new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
                    VFIO_DEVICE_STATE_PRE_COPY_P2P :
                    VFIO_DEVICE_STATE_RUNNING_P2P;

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret, local_err);
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(new_state));
}
754 
755 static void vfio_vmstate_change(void *opaque, bool running, RunState state)
756 {
757     VFIODevice *vbasedev = opaque;
758     enum vfio_device_mig_state new_state;
759     Error *local_err = NULL;
760     int ret;
761 
762     if (running) {
763         new_state = VFIO_DEVICE_STATE_RUNNING;
764     } else {
765         new_state =
766             (vfio_device_state_is_precopy(vbasedev) &&
767              (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
768                 VFIO_DEVICE_STATE_STOP_COPY :
769                 VFIO_DEVICE_STATE_STOP;
770     }
771 
772     ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
773     if (ret) {
774         /*
775          * Migration should be aborted in this case, but vm_state_notify()
776          * currently does not support reporting failures.
777          */
778         migration_file_set_error(ret, local_err);
779     }
780 
781     trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
782                               mig_state_to_str(new_state));
783 }
784 
/*
 * Migration state notifier: on pre-copy failure, return the device to
 * RUNNING (resetting it if even that fails) so the VM keeps working on
 * the source. Always returns 0.
 */
static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
                                         MigrationEvent *e, Error **errp)
{
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;
    Error *local_err = NULL;
    int ret;

    trace_vfio_migration_state_notifier(vbasedev->name, e->type);

    if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        /*
         * MigrationNotifyFunc may not return an error code and an Error
         * object for MIG_EVENT_PRECOPY_FAILED. Hence, report the error
         * locally and ignore the errp argument.
         */
        ret = vfio_migration_set_state_or_reset(vbasedev,
                                                VFIO_DEVICE_STATE_RUNNING,
                                                &local_err);
        if (ret) {
            error_report_err(local_err);
        }
    }
    return 0;
}
811 
812 static void vfio_migration_free(VFIODevice *vbasedev)
813 {
814     g_free(vbasedev->migration);
815     vbasedev->migration = NULL;
816 }
817 
/*
 * Query the device's supported migration features via
 * VFIO_DEVICE_FEATURE_MIGRATION. Returns 0 and stores the feature flags
 * in @mig_flags on success, -errno on failure.
 */
static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    /* uint64_t-aligned storage for the feature header plus its payload */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                                  sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}
837 
/*
 * Probe, without enabling, whether the device supports DMA logging
 * (device dirty page tracking) using VFIO_DEVICE_FEATURE_PROBE.
 */
static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    /* A successful probe (ioctl returns 0) means the feature is supported */
    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}
850 
/*
 * Query migration support and, if present, set up the VFIOMigration
 * state, register the savevm handlers, the VM state change handlers and
 * the migration state notifier. Returns 0 on success, negative errno when
 * migration is unsupported or initialization fails.
 */
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;
    VMChangeStateHandler *prepare_cb;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;  /* no data transfer in progress yet */
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    /* Build the savevm id, prefixed with the device's vmstate id if any */
    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    /* The "prepare" callback is installed only for P2P-capable devices */
    prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
                     vfio_vmstate_change_prepare :
                     NULL;
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}
910 
/* Tear down, in reverse order, everything vfio_migration_init() set up */
static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    migration_remove_notifier(&migration->migration_state);
    qemu_del_vm_change_state_handler(migration->vm_state);
    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}
921 
/*
 * Turn @err into a migration blocker, unless migration was explicitly
 * requested (ON), in which case @err is propagated to @errp as a hard
 * error. Always consumes @err. Returns 0 when the blocker was added.
 */
static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
{
    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_propagate(errp, err);
        return -EINVAL;
    }

    vbasedev->migration_blocker = error_copy(err);
    error_free(err);

    return migrate_add_blocker_normal(&vbasedev->migration_blocker, errp);
}
934 
935 /* ---------------------------------------------------------------------- */
936 
937 int64_t vfio_mig_bytes_transferred(void)
938 {
939     return bytes_transferred;
940 }
941 
942 void vfio_reset_bytes_transferred(void)
943 {
944     bytes_transferred = 0;
945 }
946 
947 /*
948  * Return true when either migration initialized or blocker registered.
949  * Currently only return false when adding blocker fails which will
950  * de-register vfio device.
951  */
bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
    Error *err = NULL;
    int ret;

    /* Migration explicitly disabled: just register a blocker */
    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
        error_setg(&err, "%s: Migration is disabled for VFIO device",
                   vbasedev->name);
        return !vfio_block_migration(vbasedev, err, errp);
    }

    ret = vfio_migration_init(vbasedev);
    if (ret) {
        if (ret == -ENOTTY) {
            error_setg(&err, "%s: VFIO migration is not supported in kernel",
                       vbasedev->name);
        } else {
            error_setg(&err,
                       "%s: Migration couldn't be initialized for VFIO device, "
                       "err: %d (%s)",
                       vbasedev->name, ret, strerror(-ret));
        }

        return !vfio_block_migration(vbasedev, err, errp);
    }

    if (!vbasedev->dirty_pages_supported) {
        /* Without dirty tracking, migration must be explicitly enabled */
        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
            error_setg(&err,
                       "%s: VFIO device doesn't support device dirty tracking",
                       vbasedev->name);
            goto add_blocker;
        }

        warn_report("%s: VFIO device doesn't support device dirty tracking",
                    vbasedev->name);
    }

    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
    if (ret) {
        goto out_deinit;
    }

    if (vfio_viommu_preset(vbasedev)) {
        error_setg(&err, "%s: Migration is currently not supported "
                   "with vIOMMU enabled", vbasedev->name);
        goto add_blocker;
    }

    trace_vfio_migration_realize(vbasedev->name);
    return true;

add_blocker:
    ret = vfio_block_migration(vbasedev, err, errp);
out_deinit:
    /* On failure, undo migration init before failing the realize */
    if (ret) {
        vfio_migration_deinit(vbasedev);
    }
    return !ret;
}
1012 
/* Counterpart of vfio_migration_realize(): tear down migration support */
void vfio_migration_exit(VFIODevice *vbasedev)
{
    /* migration is only set when vfio_migration_init() succeeded */
    if (vbasedev->migration) {
        vfio_migration_deinit(vbasedev);
    }

    migrate_del_blocker(&vbasedev->migration_blocker);
}
1021