xref: /openbmc/qemu/hw/vfio/migration-multifd.c (revision f79afdf7dafd5fc9551c002de0f4139af4e9f5aa)
1 /*
2  * Multifd VFIO migration
3  *
4  * Copyright (C) 2024,2025 Oracle and/or its affiliates.
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  */
11 
12 #include "qemu/osdep.h"
13 #include "hw/vfio/vfio-device.h"
14 #include "migration/misc.h"
15 #include "qapi/error.h"
16 #include "qemu/error-report.h"
17 #include "qemu/lockable.h"
18 #include "qemu/main-loop.h"
19 #include "qemu/thread.h"
20 #include "io/channel-buffer.h"
21 #include "migration/qemu-file.h"
22 #include "migration-multifd.h"
23 #include "vfio-migration-internal.h"
24 #include "trace.h"
25 #include "vfio-helpers.h"
26 
27 #define VFIO_DEVICE_STATE_CONFIG_STATE (1)
28 
29 #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)
30 
31 typedef struct VFIODeviceStatePacket {
32     uint32_t version;
33     uint32_t idx;
34     uint32_t flags;
35     uint8_t data[0];
36 } QEMU_PACKED VFIODeviceStatePacket;
37 
vfio_load_config_after_iter(VFIODevice * vbasedev)38 bool vfio_load_config_after_iter(VFIODevice *vbasedev)
39 {
40     if (vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_ON) {
41         return true;
42     } else if (vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_OFF) {
43         return false;
44     }
45 
46     assert(vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_AUTO);
47     return vfio_arch_wants_loading_config_after_iter();
48 }
49 
/* type safety */
typedef struct VFIOStateBuffers {
    GArray *array;  /* array of VFIOStateBuffer slots, indexed by packet idx */
} VFIOStateBuffers;

typedef struct VFIOStateBuffer {
    bool is_present;    /* set once this slot has received its packet data */
    char *data;         /* private copy of the packet payload, owned here */
    size_t len;         /* payload length in bytes */
} VFIOStateBuffer;
60 
typedef struct VFIOMultifd {
    bool load_bufs_thread_running;   /* set while vfio_load_bufs_thread() runs */
    bool load_bufs_thread_want_exit; /* asks the load thread to terminate */

    /* set once DEV_CONFIG_LOAD_READY was received on the main channel */
    bool load_bufs_iter_done;
    QemuCond load_bufs_iter_done_cond;

    VFIOStateBuffers load_bufs;              /* buffers queued for loading */
    QemuCond load_bufs_buffer_ready_cond;    /* signalled on buffer arrival */
    QemuCond load_bufs_thread_finished_cond; /* signalled on thread exit */
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    uint32_t load_buf_idx;      /* next buffer index to load into the device */
    uint32_t load_buf_idx_last; /* index of the final (config state) buffer */
    /* total size of data queued but not yet written to the device */
    size_t load_buf_queued_pending_buffers_size;
} VFIOMultifd;
76 
/* GArray clear func: releases a slot's payload if it currently holds one. */
static void vfio_state_buffer_clear(gpointer data)
{
    VFIOStateBuffer *sb = data;

    if (sb->is_present) {
        g_free(sb->data);
        sb->data = NULL;
        sb->is_present = false;
    }
}
88 
/* Allocate the zero-initializing buffer array and hook up its clear func. */
static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
{
    GArray *array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));

    g_array_set_clear_func(array, vfio_state_buffer_clear);
    bufs->array = array;
}
94 
/* Release the buffer array (and all contained payloads), if allocated. */
static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
{
    if (bufs->array) {
        g_array_unref(bufs->array);
        bufs->array = NULL;
    }
}
99 
/* Sanity check that vfio_state_buffers_init() was called on @bufs. */
static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
    assert(bufs->array != NULL);
}
104 
/* Current number of slots in the array (not all of them need be present). */
static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
    return bufs->array->len;
}
109 
/* Resize the array to @size slots; newly added slots are zero-filled. */
static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    g_array_set_size(bufs->array, size);
}
115 
/* Pointer to slot @idx; the caller must ensure @idx is within the array. */
static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}
121 
/* called with load_bufs_mutex locked */
static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
                                          VFIODeviceStatePacket *packet,
                                          size_t packet_total_size,
                                          Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    size_t data_size = packet_total_size - sizeof(*packet);

    vfio_state_buffers_assert_init(&multifd->load_bufs);
    /* Grow the array on demand so packet->idx refers to a valid slot */
    if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
        vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
    if (lb->is_present) {
        error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
                   vbasedev->name, packet->idx);
        return false;
    }

    /*
     * Already-loaded slots keep is_present set, so any idx below
     * load_buf_idx would have been rejected as a duplicate above.
     */
    assert(packet->idx >= multifd->load_buf_idx);

    /*
     * Enforce the per-device cap on total queued-but-unloaded data.
     * Exceeding it is fatal to the load, so leaving the counter
     * incremented on the error path is harmless.
     */
    multifd->load_buf_queued_pending_buffers_size += data_size;
    if (multifd->load_buf_queued_pending_buffers_size >
        vbasedev->migration_max_queued_buffers_size) {
        error_setg(errp,
                   "%s: queuing state buffer %" PRIu32
                   " would exceed the size max of %" PRIu64,
                   vbasedev->name, packet->idx,
                   vbasedev->migration_max_queued_buffers_size);
        return false;
    }

    /* Take a private copy of the payload; the packet buffer isn't ours */
    lb->data = g_memdup2(&packet->data, data_size);
    lb->len = data_size;
    lb->is_present = true;

    return true;
}
164 
/*
 * Receives one device state packet from a multifd channel
 * (.load_state_buffer SaveVMHandler).
 *
 * Validates the packet header (byteswapping it in place), queues a copy of
 * the payload for the load thread and wakes the thread up.
 *
 * @opaque: the VFIODevice
 * @data / @data_size: the raw packet as received
 *
 * Returns true on success, false with @errp set otherwise.
 */
bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_setg(errp,
                   "%s: got device state packet but not doing multifd transfer",
                   vbasedev->name);
        return false;
    }

    assert(multifd);

    /* The packet must at least contain a complete header */
    if (data_size < sizeof(*packet)) {
        error_setg(errp, "%s: packet too short at %zu (min is %zu)",
                   vbasedev->name, data_size, sizeof(*packet));
        return false;
    }

    /* Header fields arrive big-endian; convert them in place */
    packet->version = be32_to_cpu(packet->version);
    if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
        error_setg(errp, "%s: packet has unknown version %" PRIu32,
                   vbasedev->name, packet->version);
        return false;
    }

    packet->idx = be32_to_cpu(packet->idx);
    packet->flags = be32_to_cpu(packet->flags);

    /* UINT32_MAX is reserved as the initial load_buf_idx_last value */
    if (packet->idx == UINT32_MAX) {
        error_setg(errp, "%s: packet index is invalid", vbasedev->name);
        return false;
    }

    trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);

    /*
     * Holding BQL here would violate the lock order and can cause
     * a deadlock once we attempt to lock load_bufs_mutex below.
     */
    assert(!bql_locked());

    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* config state packet should be the last one in the stream */
        if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
            multifd->load_buf_idx_last = packet->idx;
        }

        if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
                                           errp)) {
            return false;
        }

        /* Wake the load thread in case it is waiting for this buffer */
        qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
    }

    return true;
}
227 
/*
 * Loads the device config state contained in the final buffer.
 *
 * The buffer holds a self-contained migration stream fragment starting with
 * VFIO_MIG_FLAG_DEV_CONFIG_STATE, so it is replayed through an in-memory
 * QEMUFile. Called by the load thread with load_bufs_mutex held.
 *
 * Returns true on success, false with @errp set otherwise.
 */
static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
    uint64_t mig_header;
    int ret;

    /* Only reached once all preceding data buffers have been loaded */
    assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
    lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
    assert(lb->is_present);

    /* Copy the buffer contents into an in-memory channel */
    bioc = qio_channel_buffer_new(lb->len);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");

    f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
    qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);

    ret = qemu_fflush(f_out);
    if (ret) {
        error_setg(errp, "%s: load config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* Rewind the channel and re-open it for reading */
    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
    f_in = qemu_file_new_input(QIO_CHANNEL(bioc));

    mig_header = qemu_get_be64(f_in);
    if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
        error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
                   vbasedev->name, mig_header);
        return false;
    }

    /* Loading config state needs the BQL (lock order: load_bufs_mutex -> BQL) */
    bql_lock();
    ret = vfio_load_device_config_state(f_in, vbasedev);
    bql_unlock();

    if (ret < 0) {
        error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    return true;
}
278 
vfio_load_state_buffer_get(VFIOMultifd * multifd)279 static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
280 {
281     VFIOStateBuffer *lb;
282     unsigned int bufs_len;
283 
284     bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
285     if (multifd->load_buf_idx >= bufs_len) {
286         assert(multifd->load_buf_idx == bufs_len);
287         return NULL;
288     }
289 
290     lb = vfio_state_buffers_at(&multifd->load_bufs,
291                                multifd->load_buf_idx);
292     if (!lb->is_present) {
293         return NULL;
294     }
295 
296     return lb;
297 }
298 
/*
 * Writes buffer @lb's payload into the device via its migration data_fd.
 *
 * Called with load_bufs_mutex held; the lock is dropped around each write()
 * since loading data into the device can take a while.
 *
 * Returns true on success, false with @errp set otherwise.
 */
static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
                                         VFIOStateBuffer *lb,
                                         Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    g_autofree char *buf = NULL;
    char *buf_cur;
    size_t buf_len;

    /* Nothing to write for an empty buffer */
    if (!lb->len) {
        return true;
    }

    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
                                                   multifd->load_buf_idx);

    /* lb might become re-allocated when we drop the lock */
    buf = g_steal_pointer(&lb->data);
    buf_cur = buf;
    buf_len = lb->len;
    while (buf_len > 0) {
        ssize_t wr_ret;
        int errno_save;

        /*
         * Loading data to the device takes a while,
         * drop the lock during this process.
         */
        qemu_mutex_unlock(&multifd->load_bufs_mutex);
        wr_ret = write(migration->data_fd, buf_cur, buf_len);
        /* Capture errno before re-locking can clobber it */
        errno_save = errno;
        qemu_mutex_lock(&multifd->load_bufs_mutex);

        if (wr_ret < 0) {
            error_setg(errp,
                       "%s: writing state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, multifd->load_buf_idx, errno_save);
            return false;
        }

        /* Handle partial writes by advancing through the buffer */
        assert(wr_ret <= buf_len);
        buf_len -= wr_ret;
        buf_cur += wr_ret;

        /* Written data no longer counts against the queued-size cap */
        assert(multifd->load_buf_queued_pending_buffers_size >= wr_ret);
        multifd->load_buf_queued_pending_buffers_size -= wr_ret;
    }

    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
                                                 multifd->load_buf_idx);

    return true;
}
353 
/* True if either VFIO cleanup or the migration core asked the thread to quit. */
static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
                                            bool *should_quit)
{
    if (multifd->load_bufs_thread_want_exit) {
        return true;
    }

    return qatomic_read(should_quit);
}
359 
360 /*
361  * This thread is spawned by vfio_multifd_switchover_start() which gets
362  * called upon encountering the switchover point marker in main migration
363  * stream.
364  *
365  * It exits after either:
366  * * completing loading the remaining device state and device config, OR:
367  * * encountering some error while doing the above, OR:
368  * * being forcefully aborted by the migration core by it setting should_quit
369  *   or by vfio_load_cleanup_load_bufs_thread() setting
370  *   multifd->load_bufs_thread_want_exit.
371  */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    /* Held for the whole loop; dropped only inside buffer write / cond waits */
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            /* The current buffer hasn't arrived yet - wait for it */
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        /* The last buffer (config state) is handled after the loop */
        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    /* Optionally wait for DEV_CONFIG_LOAD_READY from the main channel */
    if (vfio_load_config_after_iter(vbasedev)) {
        while (!multifd->load_bufs_iter_done) {
            qemu_cond_wait(&multifd->load_bufs_iter_done_cond,
                           &multifd->load_bufs_mutex);

            /*
             * Need to re-check cancellation immediately after wait in case
             * cond was signalled by vfio_load_cleanup_load_bufs_thread().
             */
            if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
                error_setg(errp, "operation cancelled");
                goto thread_exit;
            }
        }
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}
462 
/*
 * Handler for the DEV_CONFIG_LOAD_READY marker on the main migration
 * channel: unblocks the load thread that may be waiting to load config
 * state after all iterables.
 *
 * Called with the BQL held; returns 0 on success, -EINVAL on a spurious
 * or duplicate marker.
 */
int vfio_load_state_config_load_ready(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    int ret = 0;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_report("%s: got DEV_CONFIG_LOAD_READY outside multifd transfer",
                     vbasedev->name);
        return -EINVAL;
    }

    if (!vfio_load_config_after_iter(vbasedev)) {
        error_report("%s: got DEV_CONFIG_LOAD_READY but was disabled",
                     vbasedev->name);
        return -EINVAL;
    }

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        if (multifd->load_bufs_iter_done) {
            /* Can't print error here as we're outside BQL */
            ret = -EINVAL;
            /* break leaves the guard scope, releasing the mutex */
            break;
        }

        multifd->load_bufs_iter_done = true;
        qemu_cond_signal(&multifd->load_bufs_iter_done_cond);
    }
    bql_lock();

    if (ret) {
        error_report("%s: duplicate DEV_CONFIG_LOAD_READY",
                     vbasedev->name);
    }

    return ret;
}
504 
vfio_multifd_new(void)505 static VFIOMultifd *vfio_multifd_new(void)
506 {
507     VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
508 
509     vfio_state_buffers_init(&multifd->load_bufs);
510 
511     qemu_mutex_init(&multifd->load_bufs_mutex);
512 
513     multifd->load_buf_idx = 0;
514     multifd->load_buf_idx_last = UINT32_MAX;
515     multifd->load_buf_queued_pending_buffers_size = 0;
516     qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
517 
518     multifd->load_bufs_iter_done = false;
519     qemu_cond_init(&multifd->load_bufs_iter_done_cond);
520 
521     multifd->load_bufs_thread_running = false;
522     multifd->load_bufs_thread_want_exit = false;
523     qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
524 
525     return multifd;
526 }
527 
528 /*
529  * Terminates vfio_load_bufs_thread by setting
530  * multifd->load_bufs_thread_want_exit and signalling all the conditions
531  * the thread could be blocked on.
532  *
533  * Waits for the thread to signal that it had finished.
534  */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            /* Wake the thread from whichever cond it may be blocked on */
            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            qemu_cond_signal(&multifd->load_bufs_iter_done_cond);
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}
551 
/* Tear down @multifd; stops the load thread first if it is still running. */
static void vfio_multifd_free(VFIOMultifd *multifd)
{
    vfio_load_cleanup_load_bufs_thread(multifd);

    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
    qemu_cond_destroy(&multifd->load_bufs_iter_done_cond);
    vfio_state_buffers_destroy(&multifd->load_bufs);
    qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
    qemu_mutex_destroy(&multifd->load_bufs_mutex);

    g_free(multifd);
}
564 
/* Free the device's multifd state, if any, and clear the pointer. */
void vfio_multifd_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    if (migration->multifd) {
        vfio_multifd_free(migration->multifd);
        migration->multifd = NULL;
    }
}
571 
vfio_multifd_transfer_supported(void)572 bool vfio_multifd_transfer_supported(void)
573 {
574     return multifd_device_state_supported() &&
575         migrate_send_switchover_start();
576 }
577 
vfio_multifd_transfer_enabled(VFIODevice * vbasedev)578 bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
579 {
580     VFIOMigration *migration = vbasedev->migration;
581 
582     return migration->multifd_transfer;
583 }
584 
vfio_multifd_setup(VFIODevice * vbasedev,bool alloc_multifd,Error ** errp)585 bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
586 {
587     VFIOMigration *migration = vbasedev->migration;
588 
589     /*
590      * Make a copy of this setting at the start in case it is changed
591      * mid-migration.
592      */
593     if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
594         migration->multifd_transfer = vfio_multifd_transfer_supported();
595     } else {
596         migration->multifd_transfer =
597             vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
598     }
599 
600     if (!vfio_multifd_transfer_enabled(vbasedev)) {
601         /* Nothing further to check or do */
602         return true;
603     }
604 
605     if (!vfio_multifd_transfer_supported()) {
606         error_setg(errp,
607                    "%s: Multifd device transfer requested but unsupported in the current config",
608                    vbasedev->name);
609         return false;
610     }
611 
612     if (alloc_multifd) {
613         assert(!migration->multifd);
614         migration->multifd = vfio_multifd_new();
615     }
616 
617     return true;
618 }
619 
/*
 * Writes an END_OF_STATE marker to the main migration stream @f for devices
 * whose real state travels via multifd channels.
 */
void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
{
    assert(vfio_multifd_transfer_enabled(vbasedev));

    /*
     * Emit dummy NOP data on the main migration channel since the actual
     * device state transfer is done via multifd channels.
     */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}
630 
/*
 * Saves the device config state as the final multifd packet at index @idx.
 *
 * The config state is serialized into an in-memory QEMUFile first, then
 * wrapped into a packet flagged VFIO_DEVICE_STATE_CONFIG_STATE and queued
 * for multifd transmission.
 *
 * Returns true on success, false with @errp set otherwise.
 */
static bool
vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
                                               char *idstr,
                                               uint32_t instance_id,
                                               uint32_t idx,
                                               Error **errp)
{
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f = NULL;
    int ret;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    size_t packet_len;

    bioc = qio_channel_buffer_new(0);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");

    f = qemu_file_new_output(QIO_CHANNEL(bioc));

    if (vfio_save_device_config_state(f, vbasedev, errp)) {
        return false;
    }

    ret = qemu_fflush(f);
    if (ret) {
        error_setg(errp, "%s: save config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* Wrap the serialized config state into a flagged device state packet */
    packet_len = sizeof(*packet) + bioc->usage;
    packet = g_malloc0(packet_len);
    packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
    packet->idx = cpu_to_be32(idx);
    packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE);
    memcpy(&packet->data, bioc->data, bioc->usage);

    if (!multifd_queue_device_state(idstr, instance_id,
                                    (char *)packet, packet_len)) {
        error_setg(errp, "%s: multifd config data queuing failed",
                   vbasedev->name);
        return false;
    }

    vfio_migration_add_bytes_transferred(packet_len);

    return true;
}
678 
679 /*
680  * This thread is spawned by the migration core directly via
681  * .save_complete_precopy_thread SaveVMHandler.
682  *
683  * It exits after either:
684  * * completing saving the remaining device state and device config, OR:
685  * * encountering some error while doing the above, OR:
686  * * being forcefully aborted by the migration core by
687  *   multifd_device_state_save_thread_should_exit() returning true.
688  */
689 bool
vfio_multifd_save_complete_precopy_thread(SaveCompletePrecopyThreadData * d,Error ** errp)690 vfio_multifd_save_complete_precopy_thread(SaveCompletePrecopyThreadData *d,
691                                           Error **errp)
692 {
693     VFIODevice *vbasedev = d->handler_opaque;
694     VFIOMigration *migration = vbasedev->migration;
695     bool ret = false;
696     g_autofree VFIODeviceStatePacket *packet = NULL;
697     uint32_t idx;
698 
699     if (!vfio_multifd_transfer_enabled(vbasedev)) {
700         /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
701         return true;
702     }
703 
704     trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
705                                                   d->idstr, d->instance_id);
706 
707     /* We reach here with device state STOP or STOP_COPY only */
708     if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
709                                  VFIO_DEVICE_STATE_STOP, errp)) {
710         goto thread_exit;
711     }
712 
713     packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
714     packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
715 
716     for (idx = 0; ; idx++) {
717         ssize_t data_size;
718         size_t packet_size;
719 
720         if (multifd_device_state_save_thread_should_exit()) {
721             error_setg(errp, "operation cancelled");
722             goto thread_exit;
723         }
724 
725         data_size = read(migration->data_fd, &packet->data,
726                          migration->data_buffer_size);
727         if (data_size < 0) {
728             error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
729                        vbasedev->name, idx, errno);
730             goto thread_exit;
731         } else if (data_size == 0) {
732             break;
733         }
734 
735         packet->idx = cpu_to_be32(idx);
736         packet_size = sizeof(*packet) + data_size;
737 
738         if (!multifd_queue_device_state(d->idstr, d->instance_id,
739                                         (char *)packet, packet_size)) {
740             error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
741             goto thread_exit;
742         }
743 
744         vfio_migration_add_bytes_transferred(packet_size);
745     }
746 
747     if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
748                                                         d->idstr,
749                                                         d->instance_id,
750                                                         idx, errp)) {
751         goto thread_exit;
752    }
753 
754     ret = true;
755 
756 thread_exit:
757     trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);
758 
759     return ret;
760 }
761 
/*
 * Called at the switchover point of the main migration stream: marks the
 * load thread as running and spawns it via the migration core.
 *
 * Returns 0 (cannot currently fail).
 */
int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}
781