xref: /openbmc/qemu/hw/vfio/migration.c (revision 433fcea40c31ff355f84da22a46977c2a1b596c3)
1  /*
2   * Migration support for VFIO devices
3   *
4   * Copyright NVIDIA, Inc. 2020
5   *
6   * This work is licensed under the terms of the GNU GPL, version 2. See
7   * the COPYING file in the top-level directory.
8   */
9  
10  #include "qemu/osdep.h"
11  #include "qemu/main-loop.h"
12  #include "qemu/cutils.h"
13  #include "qemu/units.h"
14  #include "qemu/error-report.h"
15  #include <linux/vfio.h>
16  #include <sys/ioctl.h>
17  
18  #include "sysemu/runstate.h"
19  #include "hw/vfio/vfio-common.h"
20  #include "migration/migration.h"
21  #include "migration/options.h"
22  #include "migration/savevm.h"
23  #include "migration/vmstate.h"
24  #include "migration/qemu-file.h"
25  #include "migration/register.h"
26  #include "migration/blocker.h"
27  #include "migration/misc.h"
28  #include "qapi/error.h"
29  #include "exec/ramlist.h"
30  #include "exec/ram_addr.h"
31  #include "pci.h"
32  #include "trace.h"
33  #include "hw/hw.h"
34  
35  /*
36   * Flags to be used as unique delimiters for VFIO devices in the migration
37   * stream. These flags are composed as:
38   * 0xffffffff => MSB 32-bit all 1s
39   * 0xef10     => Magic ID, represents emulated (virtual) function IO
40   * 0x0000     => 16 bits reserved for flags
41   *
42   * The beginning of a device's config, setup, or data state information is
43   * marked by _DEV_CONFIG_STATE, _DEV_SETUP_STATE, or _DEV_DATA_STATE
44   * respectively. The end of each such section is marked by _END_OF_STATE.
45   */
46  #define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
47  #define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
48  #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
49  #define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
50  #define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
51  
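/*
 * Illustrative stream layout (a summary of the save handlers below, not an
 * additional format definition): the setup and config sections are each
 * bracketed by _DEV_SETUP_STATE/_DEV_CONFIG_STATE and _END_OF_STATE, while
 * device data is sent as a sequence of
 *   { _DEV_DATA_STATE, be64 data size, data bytes }
 * records. Each save iteration is terminated by _END_OF_STATE, or by
 * _DEV_INIT_DATA_SENT once the initial pre-copy data has been sent and
 * switchover-ack is in use.
 */
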
52  /*
53   * This is an arbitrary size based on migration of mlx5 devices, where typically
54   * total device migration size is on the order of 100s of MB. Testing with
55   * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
56   */
57  #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)
58  
59  static int64_t bytes_transferred;
60  
61  static const char *mig_state_to_str(enum vfio_device_mig_state state)
62  {
63      switch (state) {
64      case VFIO_DEVICE_STATE_ERROR:
65          return "ERROR";
66      case VFIO_DEVICE_STATE_STOP:
67          return "STOP";
68      case VFIO_DEVICE_STATE_RUNNING:
69          return "RUNNING";
70      case VFIO_DEVICE_STATE_STOP_COPY:
71          return "STOP_COPY";
72      case VFIO_DEVICE_STATE_RESUMING:
73          return "RESUMING";
74      case VFIO_DEVICE_STATE_RUNNING_P2P:
75          return "RUNNING_P2P";
76      case VFIO_DEVICE_STATE_PRE_COPY:
77          return "PRE_COPY";
78      case VFIO_DEVICE_STATE_PRE_COPY_P2P:
79          return "PRE_COPY_P2P";
80      default:
81          return "UNKNOWN STATE";
82      }
83  }
84  
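/*
 * Change the device migration state with the VFIO_DEVICE_FEATURE ioctl.
 * On failure, try to put the device into @recover_state instead; if that
 * also fails, or if @recover_state is ERROR, reset the device and mark it
 * as RUNNING. On success, take ownership of the data_fd returned by the
 * kernel for the new state, if any.
 */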
85  static int vfio_migration_set_state(VFIODevice *vbasedev,
86                                      enum vfio_device_mig_state new_state,
87                                      enum vfio_device_mig_state recover_state)
88  {
89      VFIOMigration *migration = vbasedev->migration;
90      uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
91                                sizeof(struct vfio_device_feature_mig_state),
92                                sizeof(uint64_t))] = {};
93      struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
94      struct vfio_device_feature_mig_state *mig_state =
95          (struct vfio_device_feature_mig_state *)feature->data;
96      int ret;
97  
98      feature->argsz = sizeof(buf);
99      feature->flags =
100          VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
101      mig_state->device_state = new_state;
102      if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
103          /* Try to set the device in some good state */
104          ret = -errno;
105  
106          if (recover_state == VFIO_DEVICE_STATE_ERROR) {
107              error_report("%s: Failed setting device state to %s, err: %s. "
108                           "Recover state is ERROR. Resetting device",
109                           vbasedev->name, mig_state_to_str(new_state),
110                           strerror(errno));
111  
112              goto reset_device;
113          }
114  
115          error_report(
116              "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s",
117                       vbasedev->name, mig_state_to_str(new_state),
118                       strerror(errno), mig_state_to_str(recover_state));
119  
120          mig_state->device_state = recover_state;
121          if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
122              ret = -errno;
123              error_report(
124                  "%s: Failed setting device in recover state, err: %s. Resetting device",
125                           vbasedev->name, strerror(errno));
126  
127              goto reset_device;
128          }
129  
130          migration->device_state = recover_state;
131  
132          return ret;
133      }
134  
135      migration->device_state = new_state;
136      if (mig_state->data_fd != -1) {
137          if (migration->data_fd != -1) {
138              /*
139               * This can happen if the device is asynchronously reset and
140               * terminates a data transfer.
141               */
142              error_report("%s: data_fd out of sync", vbasedev->name);
143              close(mig_state->data_fd);
144  
145              return -EBADF;
146          }
147  
148          migration->data_fd = mig_state->data_fd;
149      }
150  
151      trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));
152  
153      return 0;
154  
155  reset_device:
156      if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
157          hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
158                   strerror(errno));
159      }
160  
161      migration->device_state = VFIO_DEVICE_STATE_RUNNING;
162  
163      return ret;
164  }
165  
166  static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
167                              uint64_t data_size)
168  {
169      VFIOMigration *migration = vbasedev->migration;
170      int ret;
171  
172      ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
173      trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);
174  
175      return ret;
176  }
177  
178  static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
179  {
180      VFIODevice *vbasedev = opaque;
181  
182      qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);
183  
184      if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
185          vbasedev->ops->vfio_save_config(vbasedev, f);
186      }
187  
188      qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
189  
190      trace_vfio_save_device_config_state(vbasedev->name);
191  
192      return qemu_file_get_error(f);
193  }
194  
195  static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
196  {
197      VFIODevice *vbasedev = opaque;
198      uint64_t data;
199  
200      if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
201          int ret;
202  
203          ret = vbasedev->ops->vfio_load_config(vbasedev, f);
204          if (ret) {
205              error_report("%s: Failed to load device config space",
206                           vbasedev->name);
207              return ret;
208          }
209      }
210  
211      data = qemu_get_be64(f);
212      if (data != VFIO_MIG_FLAG_END_OF_STATE) {
213          error_report("%s: Failed loading device config space, "
214                       "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
215          return -EINVAL;
216      }
217  
218      trace_vfio_load_device_config_state(vbasedev->name);
219      return qemu_file_get_error(f);
220  }
221  
222  static void vfio_migration_cleanup(VFIODevice *vbasedev)
223  {
224      VFIOMigration *migration = vbasedev->migration;
225  
226      close(migration->data_fd);
227      migration->data_fd = -1;
228  }
229  
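/*
 * Ask the kernel (VFIO_DEVICE_FEATURE_MIG_DATA_SIZE) how much data the
 * device expects to deliver in STOP_COPY state.
 */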
230  static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
231                                       uint64_t *stop_copy_size)
232  {
233      uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
234                                sizeof(struct vfio_device_feature_mig_data_size),
235                                sizeof(uint64_t))] = {};
236      struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
237      struct vfio_device_feature_mig_data_size *mig_data_size =
238          (struct vfio_device_feature_mig_data_size *)feature->data;
239  
240      feature->argsz = sizeof(buf);
241      feature->flags =
242          VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;
243  
244      if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
245          return -errno;
246      }
247  
248      *stop_copy_size = mig_data_size->stop_copy_length;
249  
250      return 0;
251  }
252  
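/*
 * Refresh the pre-copy estimates from the data_fd with
 * VFIO_MIG_GET_PRECOPY_INFO. The cached values are zeroed first so that
 * stale numbers are not reused if the ioctl fails.
 */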
253  static int vfio_query_precopy_size(VFIOMigration *migration)
254  {
255      struct vfio_precopy_info precopy = {
256          .argsz = sizeof(precopy),
257      };
258  
259      migration->precopy_init_size = 0;
260      migration->precopy_dirty_size = 0;
261  
262      if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
263          return -errno;
264      }
265  
266      migration->precopy_init_size = precopy.initial_bytes;
267      migration->precopy_dirty_size = precopy.dirty_bytes;
268  
269      return 0;
270  }
271  
272  /* Returns the size of saved data on success and -errno on error */
273  static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
274  {
275      ssize_t data_size;
276  
277      data_size = read(migration->data_fd, migration->data_buffer,
278                       migration->data_buffer_size);
279      if (data_size < 0) {
280          /*
281           * ENOMSG means pre-copy has emptied all the device state for now.
282           * For more details, refer to the Linux kernel VFIO uAPI.
283           */
284          if (errno == ENOMSG) {
285              return 0;
286          }
287  
288          return -errno;
289      }
290      if (data_size == 0) {
291          return 0;
292      }
293  
294      qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
295      qemu_put_be64(f, data_size);
296      qemu_put_buffer(f, migration->data_buffer, data_size);
297      bytes_transferred += data_size;
298  
299      trace_vfio_save_block(migration->vbasedev->name, data_size);
300  
301      return qemu_file_get_error(f) ?: data_size;
302  }
303  
304  static void vfio_update_estimated_pending_data(VFIOMigration *migration,
305                                                 uint64_t data_size)
306  {
307      if (!data_size) {
308          /*
309           * Pre-copy emptied all the device state for now, update estimated sizes
310           * accordingly.
311           */
312          migration->precopy_init_size = 0;
313          migration->precopy_dirty_size = 0;
314  
315          return;
316      }
317  
318      if (migration->precopy_init_size) {
319          uint64_t init_size = MIN(migration->precopy_init_size, data_size);
320  
321          migration->precopy_init_size -= init_size;
322          data_size -= init_size;
323      }
324  
325      migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
326                                           data_size);
327  }
328  
329  static bool vfio_precopy_supported(VFIODevice *vbasedev)
330  {
331      VFIOMigration *migration = vbasedev->migration;
332  
333      return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
334  }
335  
336  /* ---------------------------------------------------------------------- */
337  
338  static int vfio_save_prepare(void *opaque, Error **errp)
339  {
340      VFIODevice *vbasedev = opaque;
341  
342      /*
343       * Saving a snapshot uses neither postcopy nor background snapshot, so
344       * allow it even if those capabilities are enabled.
345       */
346      if (runstate_check(RUN_STATE_SAVE_VM)) {
347          return 0;
348      }
349  
350      if (migrate_postcopy_ram()) {
351          error_setg(
352              errp, "%s: VFIO migration is not supported with postcopy migration",
353              vbasedev->name);
354          return -EOPNOTSUPP;
355      }
356  
357      if (migrate_background_snapshot()) {
358          error_setg(
359              errp,
360              "%s: VFIO migration is not supported with background snapshot",
361              vbasedev->name);
362          return -EOPNOTSUPP;
363      }
364  
365      return 0;
366  }
367  
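/*
 * Allocate the per-device data buffer, capped at the smaller of
 * VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE and the device's reported stop-copy
 * size, and move a RUNNING device into PRE_COPY when the device supports
 * pre-copy. A device already in STOP stays there until
 * vfio_save_complete_precopy() moves it to STOP_COPY.
 */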
368  static int vfio_save_setup(QEMUFile *f, void *opaque)
369  {
370      VFIODevice *vbasedev = opaque;
371      VFIOMigration *migration = vbasedev->migration;
372      uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
373  
374      qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
375  
376      vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
377      migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
378                                        stop_copy_size);
379      migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
380      if (!migration->data_buffer) {
381          error_report("%s: Failed to allocate migration data buffer",
382                       vbasedev->name);
383          return -ENOMEM;
384      }
385  
386      if (vfio_precopy_supported(vbasedev)) {
387          int ret;
388  
389          switch (migration->device_state) {
390          case VFIO_DEVICE_STATE_RUNNING:
391              ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
392                                             VFIO_DEVICE_STATE_RUNNING);
393              if (ret) {
394                  return ret;
395              }
396  
397              vfio_query_precopy_size(migration);
398  
399              break;
400          case VFIO_DEVICE_STATE_STOP:
401              /* vfio_save_complete_precopy() will go to STOP_COPY */
402              break;
403          default:
404              return -EINVAL;
405          }
406      }
407  
408      trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);
409  
410      qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
411  
412      return qemu_file_get_error(f);
413  }
414  
415  static void vfio_save_cleanup(void *opaque)
416  {
417      VFIODevice *vbasedev = opaque;
418      VFIOMigration *migration = vbasedev->migration;
419  
420      /*
421       * Changing device state from STOP_COPY to STOP can take time. Do it here,
422       * after migration has completed, so it won't increase downtime.
423       */
424      if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
425          /*
426           * If setting the device in STOP state fails, the device should be
427           * reset. To do so, use ERROR state as a recover state.
428           */
429          vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
430                                   VFIO_DEVICE_STATE_ERROR);
431      }
432  
433      g_free(migration->data_buffer);
434      migration->data_buffer = NULL;
435      migration->precopy_init_size = 0;
436      migration->precopy_dirty_size = 0;
437      migration->initial_data_sent = false;
438      vfio_migration_cleanup(vbasedev);
439      trace_vfio_save_cleanup(vbasedev->name);
440  }
441  
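/*
 * Cheap estimate used between iterations: only the cached pre-copy sizes
 * are reported, and only while the device is in a PRE_COPY state. The
 * stop-copy size is accounted for in vfio_state_pending_exact().
 */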
442  static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
443                                          uint64_t *can_postcopy)
444  {
445      VFIODevice *vbasedev = opaque;
446      VFIOMigration *migration = vbasedev->migration;
447  
448      if (!vfio_device_state_is_precopy(vbasedev)) {
449          return;
450      }
451  
452      *must_precopy +=
453          migration->precopy_init_size + migration->precopy_dirty_size;
454  
455      trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
456                                        *can_postcopy,
457                                        migration->precopy_init_size,
458                                        migration->precopy_dirty_size);
459  }
460  
461  /*
462   * Migration size of VFIO devices can be as little as a few KBs or as big as
463   * many GBs. This value should be big enough to cover the worst case.
464   */
465  #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
466  
467  static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
468                                       uint64_t *can_postcopy)
469  {
470      VFIODevice *vbasedev = opaque;
471      VFIOMigration *migration = vbasedev->migration;
472      uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
473  
474      /*
475       * If querying the pending migration size fails, report the worst-case
476       * VFIO_MIG_STOP_COPY_SIZE so that the downtime limit is not violated.
477       */
478      vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
479      *must_precopy += stop_copy_size;
480  
481      if (vfio_device_state_is_precopy(vbasedev)) {
482          vfio_query_precopy_size(migration);
483  
484          *must_precopy +=
485              migration->precopy_init_size + migration->precopy_dirty_size;
486      }
487  
488      trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
489                                     stop_copy_size, migration->precopy_init_size,
490                                     migration->precopy_dirty_size);
491  }
492  
493  static bool vfio_is_active_iterate(void *opaque)
494  {
495      VFIODevice *vbasedev = opaque;
496  
497      return vfio_device_state_is_precopy(vbasedev);
498  }
499  
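/*
 * Send one data buffer's worth of pre-copy device state and update the
 * pending-data estimates. Once the initial data has been fully sent and
 * switchover-ack is in use, close the iteration with _DEV_INIT_DATA_SENT
 * instead of _END_OF_STATE.
 */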
500  static int vfio_save_iterate(QEMUFile *f, void *opaque)
501  {
502      VFIODevice *vbasedev = opaque;
503      VFIOMigration *migration = vbasedev->migration;
504      ssize_t data_size;
505  
506      data_size = vfio_save_block(f, migration);
507      if (data_size < 0) {
508          return data_size;
509      }
510  
511      vfio_update_estimated_pending_data(migration, data_size);
512  
513      if (migrate_switchover_ack() && !migration->precopy_init_size &&
514          !migration->initial_data_sent) {
515          qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
516          migration->initial_data_sent = true;
517      } else {
518          qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
519      }
520  
521      trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
522                              migration->precopy_dirty_size);
523  
524      /*
525       * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero.
526       * Return 1 so the handlers that follow are not potentially blocked.
527       */
528      return 1;
529  }
530  
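/*
 * Stop-copy phase: move the device to STOP_COPY (recovering to STOP on
 * failure) and drain the remaining device state into the stream.
 */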
531  static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
532  {
533      VFIODevice *vbasedev = opaque;
534      ssize_t data_size;
535      int ret;
536  
537      /* We reach here with device state STOP or STOP_COPY only */
538      ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
539                                     VFIO_DEVICE_STATE_STOP);
540      if (ret) {
541          return ret;
542      }
543  
544      do {
545          data_size = vfio_save_block(f, vbasedev->migration);
546          if (data_size < 0) {
547              return data_size;
548          }
549      } while (data_size);
550  
551      qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
552      ret = qemu_file_get_error(f);
553      if (ret) {
554          return ret;
555      }
556  
557      trace_vfio_save_complete_precopy(vbasedev->name, ret);
558  
559      return ret;
560  }
561  
562  static void vfio_save_state(QEMUFile *f, void *opaque)
563  {
564      VFIODevice *vbasedev = opaque;
565      int ret;
566  
567      ret = vfio_save_device_config_state(f, opaque);
568      if (ret) {
569          error_report("%s: Failed to save device config space",
570                       vbasedev->name);
571          qemu_file_set_error(f, ret);
572      }
573  }
574  
575  static int vfio_load_setup(QEMUFile *f, void *opaque)
576  {
577      VFIODevice *vbasedev = opaque;
578  
579      return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
580                                     vbasedev->migration->device_state);
581  }
582  
583  static int vfio_load_cleanup(void *opaque)
584  {
585      VFIODevice *vbasedev = opaque;
586  
587      vfio_migration_cleanup(vbasedev);
588      trace_vfio_load_cleanup(vbasedev->name);
589  
590      return 0;
591  }
592  
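/*
 * Destination side: consume the flags written by the save handlers above
 * until _END_OF_STATE, dispatching on config state, setup state, device
 * data and the optional _DEV_INIT_DATA_SENT switchover-ack marker.
 */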
593  static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
594  {
595      VFIODevice *vbasedev = opaque;
596      int ret = 0;
597      uint64_t data;
598  
599      data = qemu_get_be64(f);
600      while (data != VFIO_MIG_FLAG_END_OF_STATE) {
601  
602          trace_vfio_load_state(vbasedev->name, data);
603  
604          switch (data) {
605          case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
606          {
607              return vfio_load_device_config_state(f, opaque);
608          }
609          case VFIO_MIG_FLAG_DEV_SETUP_STATE:
610          {
611              data = qemu_get_be64(f);
612              if (data == VFIO_MIG_FLAG_END_OF_STATE) {
613                  return ret;
614              } else {
615                  error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
616                               vbasedev->name, data);
617                  return -EINVAL;
618              }
619              break;
620          }
621          case VFIO_MIG_FLAG_DEV_DATA_STATE:
622          {
623              uint64_t data_size = qemu_get_be64(f);
624  
625              if (data_size) {
626                  ret = vfio_load_buffer(f, vbasedev, data_size);
627                  if (ret < 0) {
628                      return ret;
629                  }
630              }
631              break;
632          }
633          case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
634          {
635              if (!vfio_precopy_supported(vbasedev) ||
636                  !migrate_switchover_ack()) {
637                  error_report("%s: Received INIT_DATA_SENT but switchover ack "
638                               "is not used", vbasedev->name);
639                  return -EINVAL;
640              }
641  
642              ret = qemu_loadvm_approve_switchover();
643              if (ret) {
644                  error_report(
645                      "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
646                      vbasedev->name, ret, strerror(-ret));
647              }
648  
649              return ret;
650          }
651          default:
652              error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
653              return -EINVAL;
654          }
655  
656          data = qemu_get_be64(f);
657          ret = qemu_file_get_error(f);
658          if (ret) {
659              return ret;
660          }
661      }
662      return ret;
663  }
664  
665  static bool vfio_switchover_ack_needed(void *opaque)
666  {
667      VFIODevice *vbasedev = opaque;
668  
669      return vfio_precopy_supported(vbasedev);
670  }
671  
672  static const SaveVMHandlers savevm_vfio_handlers = {
673      .save_prepare = vfio_save_prepare,
674      .save_setup = vfio_save_setup,
675      .save_cleanup = vfio_save_cleanup,
676      .state_pending_estimate = vfio_state_pending_estimate,
677      .state_pending_exact = vfio_state_pending_exact,
678      .is_active_iterate = vfio_is_active_iterate,
679      .save_live_iterate = vfio_save_iterate,
680      .save_live_complete_precopy = vfio_save_complete_precopy,
681      .save_state = vfio_save_state,
682      .load_setup = vfio_load_setup,
683      .load_cleanup = vfio_load_cleanup,
684      .load_state = vfio_load_state,
685      .switchover_ack_needed = vfio_switchover_ack_needed,
686  };
687  
688  /* ---------------------------------------------------------------------- */
689  
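/*
 * Registered only when the device reports VFIO_MIGRATION_P2P. Per the VFIO
 * uAPI, the *_P2P states quiesce the device's outgoing peer-to-peer DMA, so
 * the device is moved there in the prepare phase, before other devices are
 * stopped.
 */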
690  static void vfio_vmstate_change_prepare(void *opaque, bool running,
691                                          RunState state)
692  {
693      VFIODevice *vbasedev = opaque;
694      VFIOMigration *migration = vbasedev->migration;
695      enum vfio_device_mig_state new_state;
696      int ret;
697  
698      new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
699                      VFIO_DEVICE_STATE_PRE_COPY_P2P :
700                      VFIO_DEVICE_STATE_RUNNING_P2P;
701  
702      /*
703       * If setting the device in new_state fails, the device should be reset.
704       * To do so, use ERROR state as a recover state.
705       */
706      ret = vfio_migration_set_state(vbasedev, new_state,
707                                     VFIO_DEVICE_STATE_ERROR);
708      if (ret) {
709          /*
710           * Migration should be aborted in this case, but vm_state_notify()
711           * currently does not support reporting failures.
712           */
713          if (migrate_get_current()->to_dst_file) {
714              qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
715          }
716      }
717  
718      trace_vfio_vmstate_change_prepare(vbasedev->name, running,
719                                        RunState_str(state),
720                                        mig_state_to_str(new_state));
721  }
722  
723  static void vfio_vmstate_change(void *opaque, bool running, RunState state)
724  {
725      VFIODevice *vbasedev = opaque;
726      enum vfio_device_mig_state new_state;
727      int ret;
728  
729      if (running) {
730          new_state = VFIO_DEVICE_STATE_RUNNING;
731      } else {
732          new_state =
733              (vfio_device_state_is_precopy(vbasedev) &&
734               (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
735                  VFIO_DEVICE_STATE_STOP_COPY :
736                  VFIO_DEVICE_STATE_STOP;
737      }
738  
739      /*
740       * If setting the device in new_state fails, the device should be reset.
741       * To do so, use ERROR state as a recover state.
742       */
743      ret = vfio_migration_set_state(vbasedev, new_state,
744                                     VFIO_DEVICE_STATE_ERROR);
745      if (ret) {
746          /*
747           * Migration should be aborted in this case, but vm_state_notify()
748           * currently does not support reporting failures.
749           */
750          if (migrate_get_current()->to_dst_file) {
751              qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
752          }
753      }
754  
755      trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
756                                mig_state_to_str(new_state));
757  }
758  
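/*
 * If the migration is cancelled or fails, return the device to the RUNNING
 * state (resetting it if even that fails).
 */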
759  static void vfio_migration_state_notifier(Notifier *notifier, void *data)
760  {
761      MigrationState *s = data;
762      VFIOMigration *migration = container_of(notifier, VFIOMigration,
763                                              migration_state);
764      VFIODevice *vbasedev = migration->vbasedev;
765  
766      trace_vfio_migration_state_notifier(vbasedev->name,
767                                          MigrationStatus_str(s->state));
768  
769      switch (s->state) {
770      case MIGRATION_STATUS_CANCELLING:
771      case MIGRATION_STATUS_CANCELLED:
772      case MIGRATION_STATUS_FAILED:
773          /*
774           * If setting the device in RUNNING state fails, the device should
775           * be reset. To do so, use ERROR state as a recover state.
776           */
777          vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
778                                   VFIO_DEVICE_STATE_ERROR);
779      }
780  }
781  
782  static void vfio_migration_free(VFIODevice *vbasedev)
783  {
784      g_free(vbasedev->migration);
785      vbasedev->migration = NULL;
786  }
787  
788  static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
789  {
790      uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
791                                    sizeof(struct vfio_device_feature_migration),
792                                sizeof(uint64_t))] = {};
793      struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
794      struct vfio_device_feature_migration *mig =
795          (struct vfio_device_feature_migration *)feature->data;
796  
797      feature->argsz = sizeof(buf);
798      feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
799      if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
800          return -errno;
801      }
802  
803      *mig_flags = mig->flags;
804  
805      return 0;
806  }
807  
808  static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
809  {
810      uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
811                                sizeof(uint64_t))] = {};
812      struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
813  
814      feature->argsz = sizeof(buf);
815      feature->flags = VFIO_DEVICE_FEATURE_PROBE |
816                       VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
817  
818      return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
819  }
820  
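/*
 * Query the device's migration capabilities, require at least
 * VFIO_MIGRATION_STOP_COPY, probe DMA dirty-page logging support, and
 * register the savevm handlers, VM state change handlers and the migration
 * state notifier.
 */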
821  static int vfio_migration_init(VFIODevice *vbasedev)
822  {
823      int ret;
824      Object *obj;
825      VFIOMigration *migration;
826      char id[256] = "";
827      g_autofree char *path = NULL, *oid = NULL;
828      uint64_t mig_flags = 0;
829      VMChangeStateHandler *prepare_cb;
830  
831      if (!vbasedev->ops->vfio_get_object) {
832          return -EINVAL;
833      }
834  
835      obj = vbasedev->ops->vfio_get_object(vbasedev);
836      if (!obj) {
837          return -EINVAL;
838      }
839  
840      ret = vfio_migration_query_flags(vbasedev, &mig_flags);
841      if (ret) {
842          return ret;
843      }
844  
845      /* Basic migration functionality must be supported */
846      if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
847          return -EOPNOTSUPP;
848      }
849  
850      vbasedev->migration = g_new0(VFIOMigration, 1);
851      migration = vbasedev->migration;
852      migration->vbasedev = vbasedev;
853      migration->device_state = VFIO_DEVICE_STATE_RUNNING;
854      migration->data_fd = -1;
855      migration->mig_flags = mig_flags;
856  
857      vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
858  
859      oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
860      if (oid) {
861          path = g_strdup_printf("%s/vfio", oid);
862      } else {
863          path = g_strdup("vfio");
864      }
865      strpadcpy(id, sizeof(id), path, '\0');
866  
867      register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
868                           vbasedev);
869  
870      prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
871                       vfio_vmstate_change_prepare :
872                       NULL;
873      migration->vm_state = qdev_add_vm_change_state_handler_full(
874          vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
875      migration_add_notifier(&migration->migration_state,
876                             vfio_migration_state_notifier);
877  
878      return 0;
879  }
880  
881  static void vfio_migration_deinit(VFIODevice *vbasedev)
882  {
883      VFIOMigration *migration = vbasedev->migration;
884  
885      migration_remove_notifier(&migration->migration_state);
886      qemu_del_vm_change_state_handler(migration->vm_state);
887      unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
888      vfio_migration_free(vbasedev);
889      vfio_unblock_multiple_devices_migration();
890  }
891  
892  static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
893  {
894      if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
895          error_propagate(errp, err);
896          return -EINVAL;
897      }
898  
899      vbasedev->migration_blocker = error_copy(err);
900      error_free(err);
901  
902      return migrate_add_blocker(&vbasedev->migration_blocker, errp);
903  }
904  
905  /* ---------------------------------------------------------------------- */
906  
907  int64_t vfio_mig_bytes_transferred(void)
908  {
909      return bytes_transferred;
910  }
911  
912  void vfio_reset_bytes_transferred(void)
913  {
914      bytes_transferred = 0;
915  }
916  
917  /*
918   * Return true when either migration is initialized or a migration blocker
919   * is registered. Currently the only false return is when adding the blocker
920   * fails, which will cause the VFIO device to be de-registered.
921   */
922  bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
923  {
924      Error *err = NULL;
925      int ret;
926  
927      if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
928          error_setg(&err, "%s: Migration is disabled for VFIO device",
929                     vbasedev->name);
930          return !vfio_block_migration(vbasedev, err, errp);
931      }
932  
933      ret = vfio_migration_init(vbasedev);
934      if (ret) {
935          if (ret == -ENOTTY) {
936              error_setg(&err, "%s: VFIO migration is not supported in kernel",
937                         vbasedev->name);
938          } else {
939              error_setg(&err,
940                         "%s: Migration couldn't be initialized for VFIO device, "
941                         "err: %d (%s)",
942                         vbasedev->name, ret, strerror(-ret));
943          }
944  
945          return !vfio_block_migration(vbasedev, err, errp);
946      }
947  
948      if (!vbasedev->dirty_pages_supported) {
949          if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
950              error_setg(&err,
951                         "%s: VFIO device doesn't support device dirty tracking",
952                         vbasedev->name);
953              goto add_blocker;
954          }
955  
956          warn_report("%s: VFIO device doesn't support device dirty tracking",
957                      vbasedev->name);
958      }
959  
960      ret = vfio_block_multiple_devices_migration(vbasedev, errp);
961      if (ret) {
962          goto out_deinit;
963      }
964  
965      if (vfio_viommu_preset(vbasedev)) {
966          error_setg(&err, "%s: Migration is currently not supported "
967                     "with vIOMMU enabled", vbasedev->name);
968          goto add_blocker;
969      }
970  
971      trace_vfio_migration_realize(vbasedev->name);
972      return true;
973  
974  add_blocker:
975      ret = vfio_block_migration(vbasedev, err, errp);
976  out_deinit:
977      if (ret) {
978          vfio_migration_deinit(vbasedev);
979      }
980      return !ret;
981  }
982  
983  void vfio_migration_exit(VFIODevice *vbasedev)
984  {
985      if (vbasedev->migration) {
986          vfio_migration_deinit(vbasedev);
987      }
988  
989      migrate_del_blocker(&vbasedev->migration_blocker);
990  }
991