xref: /openbmc/qemu/hw/vfio/migration.c (revision 51d7495ed9901966d90517032d9b9ae8faebe1d5)
1  /*
2   * Migration support for VFIO devices
3   *
4   * Copyright NVIDIA, Inc. 2020
5   *
6   * This work is licensed under the terms of the GNU GPL, version 2. See
7   * the COPYING file in the top-level directory.
8   */
9  
10  #include "qemu/osdep.h"
11  #include "qemu/main-loop.h"
12  #include "qemu/cutils.h"
13  #include "qemu/units.h"
14  #include "qemu/error-report.h"
15  #include <linux/vfio.h>
16  #include <sys/ioctl.h>
17  
18  #include "sysemu/runstate.h"
19  #include "hw/vfio/vfio-common.h"
20  #include "migration/misc.h"
21  #include "migration/savevm.h"
22  #include "migration/vmstate.h"
23  #include "migration/qemu-file.h"
24  #include "migration/register.h"
25  #include "migration/blocker.h"
26  #include "qapi/error.h"
27  #include "qapi/qapi-events-vfio.h"
28  #include "exec/ramlist.h"
29  #include "exec/ram_addr.h"
30  #include "pci.h"
31  #include "trace.h"
32  #include "hw/hw.h"
33  
/*
 * 64-bit tags that delimit VFIO device sections in the migration stream.
 * Every tag is built from three fields:
 *   0xffffffff => upper 32 bits, all ones
 *   0xef10     => magic ID, represents emulated (virtual) function IO
 *   0x0000     => lower 16 bits, reserved for flags
 *
 * _DEV_CONFIG_STATE, _DEV_SETUP_STATE and _DEV_DATA_STATE each mark the
 * beginning of the corresponding state information; _END_OF_STATE marks
 * the end of a certain state information. _DEV_INIT_DATA_SENT is written by
 * vfio_save_iterate() in place of _END_OF_STATE once, when switchover-ack is
 * in use and no initial pre-copy data remains.
 */
#define VFIO_MIG_FLAG_END_OF_STATE       (0xffffffffef100001ULL)
#define VFIO_MIG_FLAG_DEV_CONFIG_STATE   (0xffffffffef100002ULL)
#define VFIO_MIG_FLAG_DEV_SETUP_STATE    (0xffffffffef100003ULL)
#define VFIO_MIG_FLAG_DEV_DATA_STATE     (0xffffffffef100004ULL)
#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
50  
/*
 * Upper bound for the buffer used to read device state from the kernel.
 * The value is arbitrary and was chosen from migration of mlx5 devices, whose
 * total device migration size is typically on the order of 100s of MB.
 * Testing with larger values, e.g. 128MB and 1GB, did not show a performance
 * improvement.
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

/* Count of device data bytes put on the stream by vfio_save_block() */
static int64_t bytes_transferred;
59  
mig_state_to_str(enum vfio_device_mig_state state)60  static const char *mig_state_to_str(enum vfio_device_mig_state state)
61  {
62      switch (state) {
63      case VFIO_DEVICE_STATE_ERROR:
64          return "ERROR";
65      case VFIO_DEVICE_STATE_STOP:
66          return "STOP";
67      case VFIO_DEVICE_STATE_RUNNING:
68          return "RUNNING";
69      case VFIO_DEVICE_STATE_STOP_COPY:
70          return "STOP_COPY";
71      case VFIO_DEVICE_STATE_RESUMING:
72          return "RESUMING";
73      case VFIO_DEVICE_STATE_RUNNING_P2P:
74          return "RUNNING_P2P";
75      case VFIO_DEVICE_STATE_PRE_COPY:
76          return "PRE_COPY";
77      case VFIO_DEVICE_STATE_PRE_COPY_P2P:
78          return "PRE_COPY_P2P";
79      default:
80          return "UNKNOWN STATE";
81      }
82  }
83  
84  static QapiVfioMigrationState
mig_state_to_qapi_state(enum vfio_device_mig_state state)85  mig_state_to_qapi_state(enum vfio_device_mig_state state)
86  {
87      switch (state) {
88      case VFIO_DEVICE_STATE_STOP:
89          return QAPI_VFIO_MIGRATION_STATE_STOP;
90      case VFIO_DEVICE_STATE_RUNNING:
91          return QAPI_VFIO_MIGRATION_STATE_RUNNING;
92      case VFIO_DEVICE_STATE_STOP_COPY:
93          return QAPI_VFIO_MIGRATION_STATE_STOP_COPY;
94      case VFIO_DEVICE_STATE_RESUMING:
95          return QAPI_VFIO_MIGRATION_STATE_RESUMING;
96      case VFIO_DEVICE_STATE_RUNNING_P2P:
97          return QAPI_VFIO_MIGRATION_STATE_RUNNING_P2P;
98      case VFIO_DEVICE_STATE_PRE_COPY:
99          return QAPI_VFIO_MIGRATION_STATE_PRE_COPY;
100      case VFIO_DEVICE_STATE_PRE_COPY_P2P:
101          return QAPI_VFIO_MIGRATION_STATE_PRE_COPY_P2P;
102      default:
103          g_assert_not_reached();
104      }
105  }
106  
/*
 * Emit a VFIO migration QAPI event carrying the device's currently recorded
 * migration state.
 *
 * Nothing is sent unless vbasedev->migration_events is set. The device ops
 * must provide vfio_get_object() and it must return a non-NULL object; both
 * conditions are asserted.
 */
static void vfio_migration_send_event(VFIODevice *vbasedev)
{
    g_autofree char *path = NULL;
    Object *owner;

    if (!vbasedev->migration_events) {
        return;
    }

    g_assert(vbasedev->ops->vfio_get_object);
    owner = vbasedev->ops->vfio_get_object(vbasedev);
    g_assert(owner);

    /* The path string is released automatically when it goes out of scope */
    path = object_get_canonical_path(owner);

    qapi_event_send_vfio_migration(
        vbasedev->dev->id, path,
        mig_state_to_qapi_state(vbasedev->migration->device_state));
}
126  
/*
 * Record @state as the device's migration state and announce the change.
 *
 * This only updates the state cached in vbasedev->migration (plus trace and
 * event emission); it performs no ioctl on the device itself.
 */
static void vfio_migration_set_device_state(VFIODevice *vbasedev,
                                            enum vfio_device_mig_state state)
{
    const char *state_name = mig_state_to_str(state);

    trace_vfio_migration_set_device_state(vbasedev->name, state_name);

    vbasedev->migration->device_state = state;
    vfio_migration_send_event(vbasedev);
}
138  
/*
 * Move the device to @new_state through VFIO_DEVICE_FEATURE.
 *
 * If the transition fails, the device is put into @recover_state instead.
 * When @recover_state is VFIO_DEVICE_STATE_ERROR, or when entering the
 * recover state fails as well, the device is reset and its recorded state
 * becomes RUNNING.
 *
 * A data fd handed back by a successful transition is stored in
 * migration->data_fd. If one is already stored there, the new fd is closed
 * and the call fails.
 *
 * Returns 0 on success, including when the device is already in @new_state.
 * Returns -EBADF when a data fd was returned while one was already stored.
 * Otherwise returns -errno from the failing ioctl: the first failure if the
 * recover state was entered, the recover failure if it was not. @errp is
 * set from the first failure (or the fd mismatch).
 */
static int vfio_migration_set_state(VFIODevice *vbasedev,
                                    enum vfio_device_mig_state new_state,
                                    enum vfio_device_mig_state recover_state,
                                    Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    const char *new_name = mig_state_to_str(new_state);
    const char *recover_name = mig_state_to_str(recover_state);
    g_autofree char *error_prefix =
        g_strdup_printf("%s: Failed setting device state to %s.",
                        vbasedev->name, new_name);
    int ret;

    trace_vfio_migration_set_state(vbasedev->name, new_name, recover_name);

    if (migration->device_state == new_state) {
        return 0;
    }

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;

    if (!ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        vfio_migration_set_device_state(vbasedev, new_state);

        if (mig_state->data_fd == -1) {
            return 0;
        }

        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_setg(errp, "%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        migration->data_fd = mig_state->data_fd;

        return 0;
    }

    /* The transition failed: try to set the device in some good state */
    ret = -errno;

    if (recover_state == VFIO_DEVICE_STATE_ERROR) {
        error_setg_errno(errp, -ret,
                         "%s Recover state is ERROR. Resetting device",
                         error_prefix);

        goto reset_device;
    }

    error_setg_errno(errp, -ret,
                     "%s Setting device in recover state %s",
                     error_prefix, recover_name);

    mig_state->device_state = recover_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        ret = -errno;
        /*
         * Entering the recover state failed too. Report that failure here;
         * the error already stored in errp is the one that gets propagated.
         */
        error_report(
            "%s: Failed setting device in recover state, err: %s. Resetting device",
                     vbasedev->name, strerror(-ret));

        goto reset_device;
    }

    vfio_migration_set_device_state(vbasedev, recover_state);

    return ret;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    vfio_migration_set_device_state(vbasedev, VFIO_DEVICE_STATE_RUNNING);

    return ret;
}
230  
231  /*
232   * Some device state transitions require resetting the device if they fail.
233   * This function sets the device in new_state and resets the device if that
234   * fails. Reset is done by using ERROR as the recover state.
235   */
236  static int
vfio_migration_set_state_or_reset(VFIODevice * vbasedev,enum vfio_device_mig_state new_state,Error ** errp)237  vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
238                                    enum vfio_device_mig_state new_state,
239                                    Error **errp)
240  {
241      return vfio_migration_set_state(vbasedev, new_state,
242                                      VFIO_DEVICE_STATE_ERROR, errp);
243  }
244  
/*
 * Transfer @data_size bytes from the migration stream @f to the device's
 * migration data fd.
 *
 * Returns the result of qemu_file_get_to_fd() unchanged.
 */
static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    int ret = qemu_file_get_to_fd(f, vbasedev->migration->data_fd, data_size);

    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}
256  
/*
 * Write the device config section to @f: a _DEV_CONFIG_STATE tag, whatever
 * the optional vfio_save_config() device op writes, and an _END_OF_STATE tag.
 *
 * Returns the non-zero result of vfio_save_config() if it fails (no end tag
 * is written in that case), otherwise the stream error state from
 * qemu_file_get_error(); @errp is set when that state is negative.
 */
static int vfio_save_device_config_state(QEMUFile *f, void *opaque,
                                         Error **errp)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        ret = vbasedev->ops->vfio_save_config(vbasedev, f, errp);
        if (ret != 0) {
            return ret;
        }
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    ret = qemu_file_get_error(f);
    if (ret >= 0) {
        return ret;
    }

    error_setg_errno(errp, -ret, "Failed to save state");
    return ret;
}
282  
/*
 * Read the device config section from @f. The opening _DEV_CONFIG_STATE tag
 * has already been consumed by the caller.
 *
 * The optional vfio_load_config() device op consumes the payload, after
 * which an _END_OF_STATE tag is expected.
 *
 * Returns the non-zero result of vfio_load_config() if it fails, -EINVAL if
 * the end tag is wrong, otherwise the stream error state.
 */
static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t end_flag;

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret = vbasedev->ops->vfio_load_config(vbasedev, f);

        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    end_flag = qemu_get_be64(f);
    if (end_flag == VFIO_MIG_FLAG_END_OF_STATE) {
        trace_vfio_load_device_config_state(vbasedev->name);
        return qemu_file_get_error(f);
    }

    error_report("%s: Failed loading device config space, "
                 "end flag incorrect 0x%"PRIx64, vbasedev->name, end_flag);
    return -EINVAL;
}
309  
/*
 * Release the migration data fd, if one is open, and mark it as closed.
 *
 * -1 is the "no fd" marker used throughout this file. This function may run
 * when no data fd was ever obtained, so close() is skipped in that case
 * rather than issued on an invalid descriptor (which would only fail with
 * EBADF and clobber errno).
 */
static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    if (migration->data_fd == -1) {
        return;
    }

    close(migration->data_fd);
    migration->data_fd = -1;
}
317  
/*
 * Query the device's stop-copy data size through the
 * VFIO_DEVICE_FEATURE_MIG_DATA_SIZE feature.
 *
 * On success stores the reported length in *@stop_copy_size and returns 0.
 * On failure returns -errno and leaves *@stop_copy_size untouched, so
 * callers can pre-load it with a fallback value.
 */
static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *size_info =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature) != 0) {
        return -errno;
    }

    *stop_copy_size = size_info->stop_copy_length;

    return 0;
}
340  
/*
 * Refresh the cached pre-copy size estimates from the kernel using
 * VFIO_MIG_GET_PRECOPY_INFO on the migration data fd.
 *
 * Both estimates are zeroed first, so they remain zero if the ioctl fails.
 * Returns 0 on success and -errno on failure.
 */
static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info info = {
        .argsz = sizeof(info),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info) != 0) {
        return -errno;
    }

    migration->precopy_init_size = info.initial_bytes;
    migration->precopy_dirty_size = info.dirty_bytes;

    return 0;
}
359  
360  /* Returns the size of saved data on success and -errno on error */
vfio_save_block(QEMUFile * f,VFIOMigration * migration)361  static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
362  {
363      ssize_t data_size;
364  
365      data_size = read(migration->data_fd, migration->data_buffer,
366                       migration->data_buffer_size);
367      if (data_size < 0) {
368          /*
369           * Pre-copy emptied all the device state for now. For more information,
370           * please refer to the Linux kernel VFIO uAPI.
371           */
372          if (errno == ENOMSG) {
373              if (!migration->event_precopy_empty_hit) {
374                  trace_vfio_save_block_precopy_empty_hit(migration->vbasedev->name);
375                  migration->event_precopy_empty_hit = true;
376              }
377              return 0;
378          }
379  
380          return -errno;
381      }
382      if (data_size == 0) {
383          return 0;
384      }
385  
386      /* Non-empty read: re-arm the trace event */
387      migration->event_precopy_empty_hit = false;
388  
389      qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
390      qemu_put_be64(f, data_size);
391      qemu_put_buffer(f, migration->data_buffer, data_size);
392      bytes_transferred += data_size;
393  
394      trace_vfio_save_block(migration->vbasedev->name, data_size);
395  
396      return qemu_file_get_error(f) ?: data_size;
397  }
398  
/*
 * Account for @data_size bytes that were just saved by lowering the cached
 * pre-copy estimates.
 *
 * The bytes are charged to the initial-data estimate first and any remainder
 * to the dirty-data estimate; neither estimate goes below zero. A @data_size
 * of 0 means pre-copy has no device state left for now, so both estimates
 * are cleared.
 */
static void vfio_update_estimated_pending_data(VFIOMigration *migration,
                                               uint64_t data_size)
{
    uint64_t remaining = data_size;

    if (remaining == 0) {
        migration->precopy_init_size = 0;
        migration->precopy_dirty_size = 0;

        return;
    }

    if (migration->precopy_init_size) {
        uint64_t from_init = MIN(migration->precopy_init_size, remaining);

        migration->precopy_init_size -= from_init;
        remaining -= from_init;
    }

    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
                                         remaining);
}
423  
/* Tell whether the device's migration flags include VFIO_MIGRATION_PRE_COPY */
static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
    return (vbasedev->migration->mig_flags & VFIO_MIGRATION_PRE_COPY) != 0;
}
430  
431  /* ---------------------------------------------------------------------- */
432  
vfio_save_prepare(void * opaque,Error ** errp)433  static int vfio_save_prepare(void *opaque, Error **errp)
434  {
435      VFIODevice *vbasedev = opaque;
436  
437      /*
438       * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
439       * even if they are on.
440       */
441      if (runstate_check(RUN_STATE_SAVE_VM)) {
442          return 0;
443      }
444  
445      if (migrate_postcopy_ram()) {
446          error_setg(
447              errp, "%s: VFIO migration is not supported with postcopy migration",
448              vbasedev->name);
449          return -EOPNOTSUPP;
450      }
451  
452      if (migrate_background_snapshot()) {
453          error_setg(
454              errp,
455              "%s: VFIO migration is not supported with background snapshot",
456              vbasedev->name);
457          return -EOPNOTSUPP;
458      }
459  
460      return 0;
461  }
462  
vfio_save_setup(QEMUFile * f,void * opaque,Error ** errp)463  static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
464  {
465      VFIODevice *vbasedev = opaque;
466      VFIOMigration *migration = vbasedev->migration;
467      uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
468      int ret;
469  
470      qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
471  
472      vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
473      migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
474                                        stop_copy_size);
475      migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
476      if (!migration->data_buffer) {
477          error_setg(errp, "%s: Failed to allocate migration data buffer",
478                     vbasedev->name);
479          return -ENOMEM;
480      }
481  
482      migration->event_save_iterate_started = false;
483      migration->event_precopy_empty_hit = false;
484  
485      if (vfio_precopy_supported(vbasedev)) {
486          switch (migration->device_state) {
487          case VFIO_DEVICE_STATE_RUNNING:
488              ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
489                                             VFIO_DEVICE_STATE_RUNNING, errp);
490              if (ret) {
491                  return ret;
492              }
493  
494              vfio_query_precopy_size(migration);
495  
496              break;
497          case VFIO_DEVICE_STATE_STOP:
498              /* vfio_save_complete_precopy() will go to STOP_COPY */
499              break;
500          default:
501              error_setg(errp, "%s: Invalid device state %d", vbasedev->name,
502                         migration->device_state);
503              return -EINVAL;
504          }
505      }
506  
507      trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);
508  
509      qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
510  
511      ret = qemu_file_get_error(f);
512      if (ret < 0) {
513          error_setg_errno(errp, -ret, "%s: save setup failed", vbasedev->name);
514      }
515  
516      return ret;
517  }
518  
vfio_save_cleanup(void * opaque)519  static void vfio_save_cleanup(void *opaque)
520  {
521      VFIODevice *vbasedev = opaque;
522      VFIOMigration *migration = vbasedev->migration;
523      Error *local_err = NULL;
524      int ret;
525  
526      /*
527       * Changing device state from STOP_COPY to STOP can take time. Do it here,
528       * after migration has completed, so it won't increase downtime.
529       */
530      if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
531          ret = vfio_migration_set_state_or_reset(vbasedev,
532                                                  VFIO_DEVICE_STATE_STOP,
533                                                  &local_err);
534          if (ret) {
535              error_report_err(local_err);
536          }
537      }
538  
539      g_free(migration->data_buffer);
540      migration->data_buffer = NULL;
541      migration->precopy_init_size = 0;
542      migration->precopy_dirty_size = 0;
543      migration->initial_data_sent = false;
544      vfio_migration_cleanup(vbasedev);
545      trace_vfio_save_cleanup(vbasedev->name);
546  }
547  
vfio_state_pending_estimate(void * opaque,uint64_t * must_precopy,uint64_t * can_postcopy)548  static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
549                                          uint64_t *can_postcopy)
550  {
551      VFIODevice *vbasedev = opaque;
552      VFIOMigration *migration = vbasedev->migration;
553  
554      if (!vfio_device_state_is_precopy(vbasedev)) {
555          return;
556      }
557  
558      *must_precopy +=
559          migration->precopy_init_size + migration->precopy_dirty_size;
560  
561      trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
562                                        *can_postcopy,
563                                        migration->precopy_init_size,
564                                        migration->precopy_dirty_size);
565  }
566  
/*
 * Fallback stop-copy size, reported when the real size cannot be queried.
 * Migration size of VFIO devices can be as little as a few KBs or as big as
 * many GBs, so this value should be big enough to cover the worst case.
 */
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
572  
vfio_state_pending_exact(void * opaque,uint64_t * must_precopy,uint64_t * can_postcopy)573  static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
574                                       uint64_t *can_postcopy)
575  {
576      VFIODevice *vbasedev = opaque;
577      VFIOMigration *migration = vbasedev->migration;
578      uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
579  
580      /*
581       * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
582       * reported so downtime limit won't be violated.
583       */
584      vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
585      *must_precopy += stop_copy_size;
586  
587      if (vfio_device_state_is_precopy(vbasedev)) {
588          vfio_query_precopy_size(migration);
589      }
590  
591      trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
592                                     stop_copy_size, migration->precopy_init_size,
593                                     migration->precopy_dirty_size);
594  }
595  
/*
 * is_active_iterate handler: iteration is active exactly while the device
 * is in a pre-copy state. @opaque is the VFIODevice.
 */
static bool vfio_is_active_iterate(void *opaque)
{
    return vfio_device_state_is_precopy(opaque);
}
602  
603  /*
604   * Note about migration rate limiting: VFIO migration buffer size is currently
605   * limited to 1MB, so there is no need to check if migration rate exceeded (as
606   * in the worst case it will exceed by 1MB). However, if the buffer size is
607   * later changed to a bigger value, migration rate should be enforced here.
608   */
vfio_save_iterate(QEMUFile * f,void * opaque)609  static int vfio_save_iterate(QEMUFile *f, void *opaque)
610  {
611      VFIODevice *vbasedev = opaque;
612      VFIOMigration *migration = vbasedev->migration;
613      ssize_t data_size;
614  
615      if (!migration->event_save_iterate_started) {
616          trace_vfio_save_iterate_start(vbasedev->name);
617          migration->event_save_iterate_started = true;
618      }
619  
620      data_size = vfio_save_block(f, migration);
621      if (data_size < 0) {
622          return data_size;
623      }
624  
625      vfio_update_estimated_pending_data(migration, data_size);
626  
627      if (migrate_switchover_ack() && !migration->precopy_init_size &&
628          !migration->initial_data_sent) {
629          qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
630          migration->initial_data_sent = true;
631      } else {
632          qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
633      }
634  
635      trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
636                              migration->precopy_dirty_size);
637  
638      return !migration->precopy_init_size && !migration->precopy_dirty_size;
639  }
640  
/*
 * save_live_complete_precopy handler: enter STOP_COPY, drain all remaining
 * device state into @f and terminate it with _END_OF_STATE.
 *
 * Returns the error from the state change (also reported locally), a
 * negative error from vfio_save_block(), or the stream error state.
 */
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    Error *local_err = NULL;
    ssize_t saved;
    int ret;

    trace_vfio_save_complete_precopy_start(vbasedev->name);

    /* We reach here with device state STOP or STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP, &local_err);
    if (ret) {
        error_report_err(local_err);
        return ret;
    }

    /* Keep saving blocks until one comes back empty */
    for (;;) {
        saved = vfio_save_block(f, vbasedev->migration);
        if (saved < 0) {
            return saved;
        }
        if (saved == 0) {
            break;
        }
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}
672  
/*
 * save_state handler: write the device config section to @f.
 *
 * The handler has no return value, so a failure is recorded on the stream
 * itself together with a prefixed Error object.
 */
static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    Error *local_err = NULL;
    int ret = vfio_save_device_config_state(f, opaque, &local_err);

    if (!ret) {
        return;
    }

    error_prepend(&local_err,
                  "vfio: Failed to save device config space of %s - ",
                  vbasedev->name);
    qemu_file_set_error_obj(f, ret, local_err);
}
687  
vfio_load_setup(QEMUFile * f,void * opaque,Error ** errp)688  static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
689  {
690      VFIODevice *vbasedev = opaque;
691  
692      return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
693                                      vbasedev->migration->device_state, errp);
694  }
695  
vfio_load_cleanup(void * opaque)696  static int vfio_load_cleanup(void *opaque)
697  {
698      VFIODevice *vbasedev = opaque;
699  
700      vfio_migration_cleanup(vbasedev);
701      trace_vfio_load_cleanup(vbasedev->name);
702  
703      return 0;
704  }
705  
/*
 * load_state handler: parse the tagged records of one VFIO section from @f.
 *
 * Config, setup and init-data-sent records each finish the call. Data
 * records are written to the device and parsing continues with the next tag
 * until _END_OF_STATE is read.
 *
 * Returns 0 on success, -EINVAL on an unknown tag, a missing end-of-state
 * after a setup tag or an unexpected init-data-sent tag, or the error from
 * the failing helper or stream otherwise. @version_id is not used.
 */
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {
        uint64_t data_size;

        trace_vfio_load_state(vbasedev->name, data);

        if (data == VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
            return vfio_load_device_config_state(f, opaque);
        }

        if (data == VFIO_MIG_FLAG_DEV_SETUP_STATE) {
            /* The setup record has no payload, only its terminator */
            data = qemu_get_be64(f);
            if (data != VFIO_MIG_FLAG_END_OF_STATE) {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }

            return ret;
        }

        if (data == VFIO_MIG_FLAG_DEV_INIT_DATA_SENT) {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }

        if (data != VFIO_MIG_FLAG_DEV_DATA_STATE) {
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        /* Data record: a size followed by that many bytes of device state */
        data_size = qemu_get_be64(f);
        if (data_size) {
            ret = vfio_load_buffer(f, vbasedev, data_size);
            if (ret < 0) {
                return ret;
            }
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }

    return ret;
}
777  
/*
 * switchover_ack_needed handler: an acknowledgement is requested exactly
 * when the device supports pre-copy. @opaque is the VFIODevice.
 */
static bool vfio_switchover_ack_needed(void *opaque)
{
    return vfio_precopy_supported(opaque);
}
784  
785  static const SaveVMHandlers savevm_vfio_handlers = {
786      .save_prepare = vfio_save_prepare,
787      .save_setup = vfio_save_setup,
788      .save_cleanup = vfio_save_cleanup,
789      .state_pending_estimate = vfio_state_pending_estimate,
790      .state_pending_exact = vfio_state_pending_exact,
791      .is_active_iterate = vfio_is_active_iterate,
792      .save_live_iterate = vfio_save_iterate,
793      .save_live_complete_precopy = vfio_save_complete_precopy,
794      .save_state = vfio_save_state,
795      .load_setup = vfio_load_setup,
796      .load_cleanup = vfio_load_cleanup,
797      .load_state = vfio_load_state,
798      .switchover_ack_needed = vfio_switchover_ack_needed,
799  };
800  
801  /* ---------------------------------------------------------------------- */
802  
/*
 * VM state change "prepare" callback: move the device to the P2P variant of
 * its state, i.e. PRE_COPY_P2P when it is in PRE_COPY and RUNNING_P2P
 * otherwise.
 *
 * A failed transition resets the device and is recorded through
 * migration_file_set_error(). @running and @state are used for tracing only.
 */
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state p2p_state;
    Error *local_err = NULL;
    int ret;

    if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
        p2p_state = VFIO_DEVICE_STATE_PRE_COPY_P2P;
    } else {
        p2p_state = VFIO_DEVICE_STATE_RUNNING_P2P;
    }

    ret = vfio_migration_set_state_or_reset(vbasedev, p2p_state, &local_err);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret, local_err);
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(p2p_state));
}
829  
/*
 * VM state change callback: bring the device state in line with the VM.
 *
 * The target is RUNNING when the VM runs. When it does not, the target is
 * STOP_COPY if the device is in a pre-copy state and the run state is
 * FINISH_MIGRATE or PAUSED, and STOP in every other case.
 *
 * A failed transition resets the device and is recorded through
 * migration_file_set_error().
 */
static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
    VFIODevice *vbasedev = opaque;
    enum vfio_device_mig_state target;
    Error *local_err = NULL;
    int ret;

    if (running) {
        target = VFIO_DEVICE_STATE_RUNNING;
    } else if (vfio_device_state_is_precopy(vbasedev) &&
               (state == RUN_STATE_FINISH_MIGRATE ||
                state == RUN_STATE_PAUSED)) {
        target = VFIO_DEVICE_STATE_STOP_COPY;
    } else {
        target = VFIO_DEVICE_STATE_STOP;
    }

    ret = vfio_migration_set_state_or_reset(vbasedev, target, &local_err);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret, local_err);
    }

    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
                              mig_state_to_str(target));
}
859  
/*
 * Migration event notifier.
 *
 * Traces every event.  On MIG_EVENT_PRECOPY_FAILED it additionally tries
 * to put the device back into VFIO_DEVICE_STATE_RUNNING.
 *
 * @notifier: embedded in VFIOMigration as ->migration_state
 * @e:        the migration event
 * @errp:     intentionally unused, see below
 *
 * Always returns 0.
 */
static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
                                         MigrationEvent *e, Error **errp)
{
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;
    Error *local_err = NULL;

    trace_vfio_migration_state_notifier(vbasedev->name, e->type);

    if (e->type != MIG_EVENT_PRECOPY_FAILED) {
        return 0;
    }

    /*
     * MigrationNotifyFunc may not return an error code and an Error
     * object for MIG_EVENT_PRECOPY_FAILED. Hence, report the error
     * locally and ignore the errp argument.
     */
    if (vfio_migration_set_state_or_reset(vbasedev,
                                          VFIO_DEVICE_STATE_RUNNING,
                                          &local_err)) {
        error_report_err(local_err);
    }

    return 0;
}
886  
/*
 * Release the VFIOMigration object attached to @vbasedev and reset the
 * pointer to NULL.
 */
static void vfio_migration_free(VFIODevice *vbasedev)
{
    g_clear_pointer(&vbasedev->migration, g_free);
}
892  
/*
 * Query the device's migration feature flags through the
 * VFIO_DEVICE_FEATURE ioctl (GET | VFIO_DEVICE_FEATURE_MIGRATION).
 *
 * @vbasedev:  device whose ->fd is queried
 * @mig_flags: receives the reported flags on success; left untouched
 *             on failure
 *
 * Returns 0 on success, -errno if the ioctl fails.
 */
static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    /* Header immediately followed by the migration payload, 8-byte units. */
    uint64_t storage[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                                      sizeof(struct vfio_device_feature_migration),
                                  sizeof(uint64_t))] = {};
    struct vfio_device_feature *hdr = (struct vfio_device_feature *)storage;
    struct vfio_device_feature_migration *payload =
        (struct vfio_device_feature_migration *)hdr->data;

    hdr->argsz = sizeof(storage);
    hdr->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, hdr) != 0) {
        return -errno;
    }

    *mig_flags = payload->flags;
    return 0;
}
912  
/*
 * Probe whether the device accepts VFIO_DEVICE_FEATURE_DMA_LOGGING_START.
 *
 * Issues a VFIO_DEVICE_FEATURE ioctl with the PROBE flag on @vbasedev->fd
 * and returns true iff the ioctl returns 0.
 */
static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    /* A bare feature header is enough for a probe. */
    uint64_t storage[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                                  sizeof(uint64_t))] = {};
    struct vfio_device_feature *probe = (struct vfio_device_feature *)storage;

    probe->argsz = sizeof(storage);
    probe->flags = VFIO_DEVICE_FEATURE_PROBE |
                   VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, probe) == 0;
}
925  
/*
 * Set up migration support for @vbasedev.
 *
 * Steps, in order:
 *  1. resolve the owning Object through ops->vfio_get_object;
 *  2. query the migration feature flags and require
 *     VFIO_MIGRATION_STOP_COPY;
 *  3. allocate and initialise vbasedev->migration;
 *  4. record whether device DMA logging is available;
 *  5. register the savevm handlers under "<vmstate id>/vfio" (or plain
 *     "vfio" when the object has no vmstate id);
 *  6. install the VM change-state handler (with a prepare callback only
 *     if VFIO_MIGRATION_P2P is advertised) and the migration notifier.
 *
 * Returns 0 on success.  Returns -EINVAL when the Object cannot be
 * resolved, the error from vfio_migration_query_flags(), or -EOPNOTSUPP
 * when STOP_COPY is not supported.  Nothing is allocated or registered
 * on any of the failure paths.
 */
static int vfio_migration_init(VFIODevice *vbasedev)
{
    g_autofree char *path = NULL, *oid = NULL;
    VMChangeStateHandler *prepare_cb = NULL;
    VFIOMigration *migration;
    uint64_t mig_flags = 0;
    char id[256] = "";
    Object *obj;
    int ret;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    migration = g_new0(VFIOMigration, 1);
    vbasedev->migration = migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;    /* no data fd yet */
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    /* Build the savevm section id, padded with NULs to the full buffer. */
    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    path = oid ? g_strdup_printf("%s/vfio", oid) : g_strdup("vfio");
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    /* The prepare step is only needed for P2P-capable devices. */
    if (migration->mig_flags & VFIO_MIGRATION_P2P) {
        prepare_cb = vfio_vmstate_change_prepare;
    }
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}
985  
/*
 * Undo vfio_migration_init(): drop the migration notifier, the VM
 * change-state handler and the savevm registration, free the
 * VFIOMigration object, then call
 * vfio_unblock_multiple_devices_migration().
 *
 * @vbasedev->migration must be non-NULL on entry.
 */
static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *mig = vbasedev->migration;

    /* Unhook the callbacks first; they reference the migration object. */
    migration_remove_notifier(&mig->migration_state);
    qemu_del_vm_change_state_handler(mig->vm_state);

    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);

    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}
996  
vfio_block_migration(VFIODevice * vbasedev,Error * err,Error ** errp)997  static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
998  {
999      if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
1000          error_propagate(errp, err);
1001          return -EINVAL;
1002      }
1003  
1004      vbasedev->migration_blocker = error_copy(err);
1005      error_free(err);
1006  
1007      return migrate_add_blocker_normal(&vbasedev->migration_blocker, errp);
1008  }
1009  
1010  /* ---------------------------------------------------------------------- */
1011  
vfio_mig_bytes_transferred(void)1012  int64_t vfio_mig_bytes_transferred(void)
1013  {
1014      return bytes_transferred;
1015  }
1016  
vfio_reset_bytes_transferred(void)1017  void vfio_reset_bytes_transferred(void)
1018  {
1019      bytes_transferred = 0;
1020  }
1021  
1022  /*
1023   * Return true when either migration initialized or blocker registered.
1024   * Currently only return false when adding blocker fails which will
1025   * de-register vfio device.
1026   */
vfio_migration_realize(VFIODevice * vbasedev,Error ** errp)1027  bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
1028  {
1029      Error *err = NULL;
1030      int ret;
1031  
1032      if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
1033          error_setg(&err, "%s: Migration is disabled for VFIO device",
1034                     vbasedev->name);
1035          return !vfio_block_migration(vbasedev, err, errp);
1036      }
1037  
1038      ret = vfio_migration_init(vbasedev);
1039      if (ret) {
1040          if (ret == -ENOTTY) {
1041              error_setg(&err, "%s: VFIO migration is not supported in kernel",
1042                         vbasedev->name);
1043          } else {
1044              error_setg(&err,
1045                         "%s: Migration couldn't be initialized for VFIO device, "
1046                         "err: %d (%s)",
1047                         vbasedev->name, ret, strerror(-ret));
1048          }
1049  
1050          return !vfio_block_migration(vbasedev, err, errp);
1051      }
1052  
1053      if ((!vbasedev->dirty_pages_supported ||
1054           vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
1055          !vbasedev->iommu_dirty_tracking) {
1056          if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
1057              error_setg(&err,
1058                         "%s: VFIO device doesn't support device and "
1059                         "IOMMU dirty tracking", vbasedev->name);
1060              goto add_blocker;
1061          }
1062  
1063          warn_report("%s: VFIO device doesn't support device and "
1064                      "IOMMU dirty tracking", vbasedev->name);
1065      }
1066  
1067      ret = vfio_block_multiple_devices_migration(vbasedev, errp);
1068      if (ret) {
1069          goto out_deinit;
1070      }
1071  
1072      if (vfio_viommu_preset(vbasedev)) {
1073          error_setg(&err, "%s: Migration is currently not supported "
1074                     "with vIOMMU enabled", vbasedev->name);
1075          goto add_blocker;
1076      }
1077  
1078      trace_vfio_migration_realize(vbasedev->name);
1079      return true;
1080  
1081  add_blocker:
1082      ret = vfio_block_migration(vbasedev, err, errp);
1083  out_deinit:
1084      if (ret) {
1085          vfio_migration_deinit(vbasedev);
1086      }
1087      return !ret;
1088  }
1089  
/*
 * Tear down everything vfio_migration_realize() may have set up for
 * @vbasedev: the migration state, if one was initialized, and the
 * device's migration blocker.
 */
void vfio_migration_exit(VFIODevice *vbasedev)
{
    /* Migration may never have been initialized (blocker-only case). */
    if (vbasedev->migration != NULL) {
        vfio_migration_deinit(vbasedev);
    }

    migrate_del_blocker(&vbasedev->migration_blocker);
}
1098