xref: /openbmc/qemu/migration/ram.c (revision 6b8f9c6e)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/pmem.h"
37 #include "xbzrle.h"
38 #include "ram.h"
39 #include "migration.h"
40 #include "socket.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "postcopy-ram.h"
45 #include "page_cache.h"
46 #include "qemu/error-report.h"
47 #include "qapi/error.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/sysemu.h"
57 #include "qemu/uuid.h"
58 #include "savevm.h"
59 #include "qemu/iov.h"
60 
61 /***********************************************************/
62 /* ram save/restore */
63 
64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
65  * worked for pages that where filled with the same char.  We switched
66  * it to only search for the zero value.  And to avoid confusion with
67  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
68  */
69 
70 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
71 #define RAM_SAVE_FLAG_ZERO     0x02
72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
73 #define RAM_SAVE_FLAG_PAGE     0x08
74 #define RAM_SAVE_FLAG_EOS      0x10
75 #define RAM_SAVE_FLAG_CONTINUE 0x20
76 #define RAM_SAVE_FLAG_XBZRLE   0x40
77 /* 0x80 is reserved in migration.h start with 0x100 next */
78 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
79 
80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 {
82     return buffer_is_zero(p, size);
83 }
84 
85 XBZRLECacheStats xbzrle_counters;
86 
87 /* struct contains XBZRLE cache and a static page
88    used by the compression */
89 static struct {
90     /* buffer used for XBZRLE encoding */
91     uint8_t *encoded_buf;
92     /* buffer for storing page content */
93     uint8_t *current_buf;
94     /* Cache for XBZRLE, Protected by lock. */
95     PageCache *cache;
96     QemuMutex lock;
97     /* it will store a page full of zeros */
98     uint8_t *zero_target_page;
99     /* buffer used for XBZRLE decoding */
100     uint8_t *decoded_buf;
101 } XBZRLE;
102 
103 static void XBZRLE_cache_lock(void)
104 {
105     if (migrate_use_xbzrle())
106         qemu_mutex_lock(&XBZRLE.lock);
107 }
108 
109 static void XBZRLE_cache_unlock(void)
110 {
111     if (migrate_use_xbzrle())
112         qemu_mutex_unlock(&XBZRLE.lock);
113 }
114 
115 /**
116  * xbzrle_cache_resize: resize the xbzrle cache
117  *
118  * This function is called from qmp_migrate_set_cache_size in main
119  * thread, possibly while a migration is in progress.  A running
120  * migration may be using the cache and might finish during this call,
121  * hence changes to the cache are protected by XBZRLE.lock().
122  *
123  * Returns 0 for success or -1 for error
124  *
125  * @new_size: new cache size
126  * @errp: set *errp if the check failed, with reason
127  */
128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 {
130     PageCache *new_cache;
131     int64_t ret = 0;
132 
133     /* Check for truncation */
134     if (new_size != (size_t)new_size) {
135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136                    "exceeding address space");
137         return -1;
138     }
139 
140     if (new_size == migrate_xbzrle_cache_size()) {
141         /* nothing to do */
142         return 0;
143     }
144 
145     XBZRLE_cache_lock();
146 
147     if (XBZRLE.cache != NULL) {
148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149         if (!new_cache) {
150             ret = -1;
151             goto out;
152         }
153 
154         cache_fini(XBZRLE.cache);
155         XBZRLE.cache = new_cache;
156     }
157 out:
158     XBZRLE_cache_unlock();
159     return ret;
160 }
161 
162 static bool ramblock_is_ignored(RAMBlock *block)
163 {
164     return !qemu_ram_is_migratable(block) ||
165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
166 }
167 
168 /* Should be holding either ram_list.mutex, or the RCU lock. */
169 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
170     INTERNAL_RAMBLOCK_FOREACH(block)                   \
171         if (ramblock_is_ignored(block)) {} else
172 
173 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
174     INTERNAL_RAMBLOCK_FOREACH(block)                   \
175         if (!qemu_ram_is_migratable(block)) {} else
176 
177 #undef RAMBLOCK_FOREACH
178 
179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180 {
181     RAMBlock *block;
182     int ret = 0;
183 
184     rcu_read_lock();
185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186         ret = func(block, opaque);
187         if (ret) {
188             break;
189         }
190     }
191     rcu_read_unlock();
192     return ret;
193 }
194 
195 static void ramblock_recv_map_init(void)
196 {
197     RAMBlock *rb;
198 
199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
200         assert(!rb->receivedmap);
201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202     }
203 }
204 
205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206 {
207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208                     rb->receivedmap);
209 }
210 
211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212 {
213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214 }
215 
216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217 {
218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219 }
220 
221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222                                     size_t nr)
223 {
224     bitmap_set_atomic(rb->receivedmap,
225                       ramblock_recv_bitmap_offset(host_addr, rb),
226                       nr);
227 }
228 
229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
230 
231 /*
232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233  *
234  * Returns >0 if success with sent bytes, or <0 if error.
235  */
236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237                                   const char *block_name)
238 {
239     RAMBlock *block = qemu_ram_block_by_name(block_name);
240     unsigned long *le_bitmap, nbits;
241     uint64_t size;
242 
243     if (!block) {
244         error_report("%s: invalid block name: %s", __func__, block_name);
245         return -1;
246     }
247 
248     nbits = block->used_length >> TARGET_PAGE_BITS;
249 
250     /*
251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
252      * machines we may need 4 more bytes for padding (see below
253      * comment). So extend it a bit before hand.
254      */
255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256 
257     /*
258      * Always use little endian when sending the bitmap. This is
259      * required that when source and destination VMs are not using the
260      * same endianess. (Note: big endian won't work.)
261      */
262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263 
264     /* Size of the bitmap, in bytes */
265     size = DIV_ROUND_UP(nbits, 8);
266 
267     /*
268      * size is always aligned to 8 bytes for 64bit machines, but it
269      * may not be true for 32bit machines. We need this padding to
270      * make sure the migration can survive even between 32bit and
271      * 64bit machines.
272      */
273     size = ROUND_UP(size, 8);
274 
275     qemu_put_be64(file, size);
276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277     /*
278      * Mark as an end, in case the middle part is screwed up due to
279      * some "misterious" reason.
280      */
281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282     qemu_fflush(file);
283 
284     g_free(le_bitmap);
285 
286     if (qemu_file_get_error(file)) {
287         return qemu_file_get_error(file);
288     }
289 
290     return size + sizeof(size);
291 }
292 
293 /*
294  * An outstanding page request, on the source, having been received
295  * and queued
296  */
297 struct RAMSrcPageRequest {
298     RAMBlock *rb;
299     hwaddr    offset;
300     hwaddr    len;
301 
302     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303 };
304 
305 /* State of RAM for migration */
306 struct RAMState {
307     /* QEMUFile used for this migration */
308     QEMUFile *f;
309     /* Last block that we have visited searching for dirty pages */
310     RAMBlock *last_seen_block;
311     /* Last block from where we have sent data */
312     RAMBlock *last_sent_block;
313     /* Last dirty target page we have sent */
314     ram_addr_t last_page;
315     /* last ram version we have seen */
316     uint32_t last_version;
317     /* We are in the first round */
318     bool ram_bulk_stage;
319     /* The free page optimization is enabled */
320     bool fpo_enabled;
321     /* How many times we have dirty too many pages */
322     int dirty_rate_high_cnt;
323     /* these variables are used for bitmap sync */
324     /* last time we did a full bitmap_sync */
325     int64_t time_last_bitmap_sync;
326     /* bytes transferred at start_time */
327     uint64_t bytes_xfer_prev;
328     /* number of dirty pages since start_time */
329     uint64_t num_dirty_pages_period;
330     /* xbzrle misses since the beginning of the period */
331     uint64_t xbzrle_cache_miss_prev;
332 
333     /* compression statistics since the beginning of the period */
334     /* amount of count that no free thread to compress data */
335     uint64_t compress_thread_busy_prev;
336     /* amount bytes after compression */
337     uint64_t compressed_size_prev;
338     /* amount of compressed pages */
339     uint64_t compress_pages_prev;
340 
341     /* total handled target pages at the beginning of period */
342     uint64_t target_page_count_prev;
343     /* total handled target pages since start */
344     uint64_t target_page_count;
345     /* number of dirty bits in the bitmap */
346     uint64_t migration_dirty_pages;
347     /* Protects modification of the bitmap and migration dirty pages */
348     QemuMutex bitmap_mutex;
349     /* The RAMBlock used in the last src_page_requests */
350     RAMBlock *last_req_rb;
351     /* Queue of outstanding page requests from the destination */
352     QemuMutex src_page_req_mutex;
353     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 };
355 typedef struct RAMState RAMState;
356 
357 static RAMState *ram_state;
358 
359 static NotifierWithReturnList precopy_notifier_list;
360 
361 void precopy_infrastructure_init(void)
362 {
363     notifier_with_return_list_init(&precopy_notifier_list);
364 }
365 
366 void precopy_add_notifier(NotifierWithReturn *n)
367 {
368     notifier_with_return_list_add(&precopy_notifier_list, n);
369 }
370 
371 void precopy_remove_notifier(NotifierWithReturn *n)
372 {
373     notifier_with_return_remove(n);
374 }
375 
376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 {
378     PrecopyNotifyData pnd;
379     pnd.reason = reason;
380     pnd.errp = errp;
381 
382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383 }
384 
385 void precopy_enable_free_page_optimization(void)
386 {
387     if (!ram_state) {
388         return;
389     }
390 
391     ram_state->fpo_enabled = true;
392 }
393 
394 uint64_t ram_bytes_remaining(void)
395 {
396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397                        0;
398 }
399 
400 MigrationStats ram_counters;
401 
402 /* used by the search for pages to send */
403 struct PageSearchStatus {
404     /* Current block being searched */
405     RAMBlock    *block;
406     /* Current page to search from */
407     unsigned long page;
408     /* Set once we wrap around */
409     bool         complete_round;
410 };
411 typedef struct PageSearchStatus PageSearchStatus;
412 
413 CompressionStats compression_counters;
414 
415 struct CompressParam {
416     bool done;
417     bool quit;
418     bool zero_page;
419     QEMUFile *file;
420     QemuMutex mutex;
421     QemuCond cond;
422     RAMBlock *block;
423     ram_addr_t offset;
424 
425     /* internally used fields */
426     z_stream stream;
427     uint8_t *originbuf;
428 };
429 typedef struct CompressParam CompressParam;
430 
431 struct DecompressParam {
432     bool done;
433     bool quit;
434     QemuMutex mutex;
435     QemuCond cond;
436     void *des;
437     uint8_t *compbuf;
438     int len;
439     z_stream stream;
440 };
441 typedef struct DecompressParam DecompressParam;
442 
443 static CompressParam *comp_param;
444 static QemuThread *compress_threads;
445 /* comp_done_cond is used to wake up the migration thread when
446  * one of the compression threads has finished the compression.
447  * comp_done_lock is used to co-work with comp_done_cond.
448  */
449 static QemuMutex comp_done_lock;
450 static QemuCond comp_done_cond;
451 /* The empty QEMUFileOps will be used by file in CompressParam */
452 static const QEMUFileOps empty_ops = { };
453 
454 static QEMUFile *decomp_file;
455 static DecompressParam *decomp_param;
456 static QemuThread *decompress_threads;
457 static QemuMutex decomp_done_lock;
458 static QemuCond decomp_done_cond;
459 
460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
461                                  ram_addr_t offset, uint8_t *source_buf);
462 
463 static void *do_data_compress(void *opaque)
464 {
465     CompressParam *param = opaque;
466     RAMBlock *block;
467     ram_addr_t offset;
468     bool zero_page;
469 
470     qemu_mutex_lock(&param->mutex);
471     while (!param->quit) {
472         if (param->block) {
473             block = param->block;
474             offset = param->offset;
475             param->block = NULL;
476             qemu_mutex_unlock(&param->mutex);
477 
478             zero_page = do_compress_ram_page(param->file, &param->stream,
479                                              block, offset, param->originbuf);
480 
481             qemu_mutex_lock(&comp_done_lock);
482             param->done = true;
483             param->zero_page = zero_page;
484             qemu_cond_signal(&comp_done_cond);
485             qemu_mutex_unlock(&comp_done_lock);
486 
487             qemu_mutex_lock(&param->mutex);
488         } else {
489             qemu_cond_wait(&param->cond, &param->mutex);
490         }
491     }
492     qemu_mutex_unlock(&param->mutex);
493 
494     return NULL;
495 }
496 
497 static void compress_threads_save_cleanup(void)
498 {
499     int i, thread_count;
500 
501     if (!migrate_use_compression() || !comp_param) {
502         return;
503     }
504 
505     thread_count = migrate_compress_threads();
506     for (i = 0; i < thread_count; i++) {
507         /*
508          * we use it as a indicator which shows if the thread is
509          * properly init'd or not
510          */
511         if (!comp_param[i].file) {
512             break;
513         }
514 
515         qemu_mutex_lock(&comp_param[i].mutex);
516         comp_param[i].quit = true;
517         qemu_cond_signal(&comp_param[i].cond);
518         qemu_mutex_unlock(&comp_param[i].mutex);
519 
520         qemu_thread_join(compress_threads + i);
521         qemu_mutex_destroy(&comp_param[i].mutex);
522         qemu_cond_destroy(&comp_param[i].cond);
523         deflateEnd(&comp_param[i].stream);
524         g_free(comp_param[i].originbuf);
525         qemu_fclose(comp_param[i].file);
526         comp_param[i].file = NULL;
527     }
528     qemu_mutex_destroy(&comp_done_lock);
529     qemu_cond_destroy(&comp_done_cond);
530     g_free(compress_threads);
531     g_free(comp_param);
532     compress_threads = NULL;
533     comp_param = NULL;
534 }
535 
536 static int compress_threads_save_setup(void)
537 {
538     int i, thread_count;
539 
540     if (!migrate_use_compression()) {
541         return 0;
542     }
543     thread_count = migrate_compress_threads();
544     compress_threads = g_new0(QemuThread, thread_count);
545     comp_param = g_new0(CompressParam, thread_count);
546     qemu_cond_init(&comp_done_cond);
547     qemu_mutex_init(&comp_done_lock);
548     for (i = 0; i < thread_count; i++) {
549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550         if (!comp_param[i].originbuf) {
551             goto exit;
552         }
553 
554         if (deflateInit(&comp_param[i].stream,
555                         migrate_compress_level()) != Z_OK) {
556             g_free(comp_param[i].originbuf);
557             goto exit;
558         }
559 
560         /* comp_param[i].file is just used as a dummy buffer to save data,
561          * set its ops to empty.
562          */
563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564         comp_param[i].done = true;
565         comp_param[i].quit = false;
566         qemu_mutex_init(&comp_param[i].mutex);
567         qemu_cond_init(&comp_param[i].cond);
568         qemu_thread_create(compress_threads + i, "compress",
569                            do_data_compress, comp_param + i,
570                            QEMU_THREAD_JOINABLE);
571     }
572     return 0;
573 
574 exit:
575     compress_threads_save_cleanup();
576     return -1;
577 }
578 
579 /* Multiple fd's */
580 
581 #define MULTIFD_MAGIC 0x11223344U
582 #define MULTIFD_VERSION 1
583 
584 #define MULTIFD_FLAG_SYNC (1 << 0)
585 
586 /* This value needs to be a multiple of qemu_target_page_size() */
587 #define MULTIFD_PACKET_SIZE (512 * 1024)
588 
589 typedef struct {
590     uint32_t magic;
591     uint32_t version;
592     unsigned char uuid[16]; /* QemuUUID */
593     uint8_t id;
594     uint8_t unused1[7];     /* Reserved for future use */
595     uint64_t unused2[4];    /* Reserved for future use */
596 } __attribute__((packed)) MultiFDInit_t;
597 
598 typedef struct {
599     uint32_t magic;
600     uint32_t version;
601     uint32_t flags;
602     /* maximum number of allocated pages */
603     uint32_t pages_alloc;
604     uint32_t pages_used;
605     /* size of the next packet that contains pages */
606     uint32_t next_packet_size;
607     uint64_t packet_num;
608     uint64_t unused[4];    /* Reserved for future use */
609     char ramblock[256];
610     uint64_t offset[];
611 } __attribute__((packed)) MultiFDPacket_t;
612 
613 typedef struct {
614     /* number of used pages */
615     uint32_t used;
616     /* number of allocated pages */
617     uint32_t allocated;
618     /* global number of generated multifd packets */
619     uint64_t packet_num;
620     /* offset of each page */
621     ram_addr_t *offset;
622     /* pointer to each page */
623     struct iovec *iov;
624     RAMBlock *block;
625 } MultiFDPages_t;
626 
627 typedef struct {
628     /* this fields are not changed once the thread is created */
629     /* channel number */
630     uint8_t id;
631     /* channel thread name */
632     char *name;
633     /* channel thread id */
634     QemuThread thread;
635     /* communication channel */
636     QIOChannel *c;
637     /* sem where to wait for more work */
638     QemuSemaphore sem;
639     /* this mutex protects the following parameters */
640     QemuMutex mutex;
641     /* is this channel thread running */
642     bool running;
643     /* should this thread finish */
644     bool quit;
645     /* thread has work to do */
646     int pending_job;
647     /* array of pages to sent */
648     MultiFDPages_t *pages;
649     /* packet allocated len */
650     uint32_t packet_len;
651     /* pointer to the packet */
652     MultiFDPacket_t *packet;
653     /* multifd flags for each packet */
654     uint32_t flags;
655     /* size of the next packet that contains pages */
656     uint32_t next_packet_size;
657     /* global number of generated multifd packets */
658     uint64_t packet_num;
659     /* thread local variables */
660     /* packets sent through this channel */
661     uint64_t num_packets;
662     /* pages sent through this channel */
663     uint64_t num_pages;
664     /* syncs main thread and channels */
665     QemuSemaphore sem_sync;
666 }  MultiFDSendParams;
667 
668 typedef struct {
669     /* this fields are not changed once the thread is created */
670     /* channel number */
671     uint8_t id;
672     /* channel thread name */
673     char *name;
674     /* channel thread id */
675     QemuThread thread;
676     /* communication channel */
677     QIOChannel *c;
678     /* this mutex protects the following parameters */
679     QemuMutex mutex;
680     /* is this channel thread running */
681     bool running;
682     /* array of pages to receive */
683     MultiFDPages_t *pages;
684     /* packet allocated len */
685     uint32_t packet_len;
686     /* pointer to the packet */
687     MultiFDPacket_t *packet;
688     /* multifd flags for each packet */
689     uint32_t flags;
690     /* global number of generated multifd packets */
691     uint64_t packet_num;
692     /* thread local variables */
693     /* size of the next packet that contains pages */
694     uint32_t next_packet_size;
695     /* packets sent through this channel */
696     uint64_t num_packets;
697     /* pages sent through this channel */
698     uint64_t num_pages;
699     /* syncs main thread and channels */
700     QemuSemaphore sem_sync;
701 } MultiFDRecvParams;
702 
703 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
704 {
705     MultiFDInit_t msg;
706     int ret;
707 
708     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
709     msg.version = cpu_to_be32(MULTIFD_VERSION);
710     msg.id = p->id;
711     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
712 
713     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
714     if (ret != 0) {
715         return -1;
716     }
717     return 0;
718 }
719 
720 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
721 {
722     MultiFDInit_t msg;
723     int ret;
724 
725     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
726     if (ret != 0) {
727         return -1;
728     }
729 
730     msg.magic = be32_to_cpu(msg.magic);
731     msg.version = be32_to_cpu(msg.version);
732 
733     if (msg.magic != MULTIFD_MAGIC) {
734         error_setg(errp, "multifd: received packet magic %x "
735                    "expected %x", msg.magic, MULTIFD_MAGIC);
736         return -1;
737     }
738 
739     if (msg.version != MULTIFD_VERSION) {
740         error_setg(errp, "multifd: received packet version %d "
741                    "expected %d", msg.version, MULTIFD_VERSION);
742         return -1;
743     }
744 
745     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
746         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
747         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
748 
749         error_setg(errp, "multifd: received uuid '%s' and expected "
750                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
751         g_free(uuid);
752         g_free(msg_uuid);
753         return -1;
754     }
755 
756     if (msg.id > migrate_multifd_channels()) {
757         error_setg(errp, "multifd: received channel version %d "
758                    "expected %d", msg.version, MULTIFD_VERSION);
759         return -1;
760     }
761 
762     return msg.id;
763 }
764 
765 static MultiFDPages_t *multifd_pages_init(size_t size)
766 {
767     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
768 
769     pages->allocated = size;
770     pages->iov = g_new0(struct iovec, size);
771     pages->offset = g_new0(ram_addr_t, size);
772 
773     return pages;
774 }
775 
776 static void multifd_pages_clear(MultiFDPages_t *pages)
777 {
778     pages->used = 0;
779     pages->allocated = 0;
780     pages->packet_num = 0;
781     pages->block = NULL;
782     g_free(pages->iov);
783     pages->iov = NULL;
784     g_free(pages->offset);
785     pages->offset = NULL;
786     g_free(pages);
787 }
788 
789 static void multifd_send_fill_packet(MultiFDSendParams *p)
790 {
791     MultiFDPacket_t *packet = p->packet;
792     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
793     int i;
794 
795     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
796     packet->version = cpu_to_be32(MULTIFD_VERSION);
797     packet->flags = cpu_to_be32(p->flags);
798     packet->pages_alloc = cpu_to_be32(page_max);
799     packet->pages_used = cpu_to_be32(p->pages->used);
800     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
801     packet->packet_num = cpu_to_be64(p->packet_num);
802 
803     if (p->pages->block) {
804         strncpy(packet->ramblock, p->pages->block->idstr, 256);
805     }
806 
807     for (i = 0; i < p->pages->used; i++) {
808         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
809     }
810 }
811 
812 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
813 {
814     MultiFDPacket_t *packet = p->packet;
815     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
816     RAMBlock *block;
817     int i;
818 
819     packet->magic = be32_to_cpu(packet->magic);
820     if (packet->magic != MULTIFD_MAGIC) {
821         error_setg(errp, "multifd: received packet "
822                    "magic %x and expected magic %x",
823                    packet->magic, MULTIFD_MAGIC);
824         return -1;
825     }
826 
827     packet->version = be32_to_cpu(packet->version);
828     if (packet->version != MULTIFD_VERSION) {
829         error_setg(errp, "multifd: received packet "
830                    "version %d and expected version %d",
831                    packet->version, MULTIFD_VERSION);
832         return -1;
833     }
834 
835     p->flags = be32_to_cpu(packet->flags);
836 
837     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
838     /*
839      * If we recevied a packet that is 100 times bigger than expected
840      * just stop migration.  It is a magic number.
841      */
842     if (packet->pages_alloc > pages_max * 100) {
843         error_setg(errp, "multifd: received packet "
844                    "with size %d and expected a maximum size of %d",
845                    packet->pages_alloc, pages_max * 100) ;
846         return -1;
847     }
848     /*
849      * We received a packet that is bigger than expected but inside
850      * reasonable limits (see previous comment).  Just reallocate.
851      */
852     if (packet->pages_alloc > p->pages->allocated) {
853         multifd_pages_clear(p->pages);
854         p->pages = multifd_pages_init(packet->pages_alloc);
855     }
856 
857     p->pages->used = be32_to_cpu(packet->pages_used);
858     if (p->pages->used > packet->pages_alloc) {
859         error_setg(errp, "multifd: received packet "
860                    "with %d pages and expected maximum pages are %d",
861                    p->pages->used, packet->pages_alloc) ;
862         return -1;
863     }
864 
865     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
866     p->packet_num = be64_to_cpu(packet->packet_num);
867 
868     if (p->pages->used) {
869         /* make sure that ramblock is 0 terminated */
870         packet->ramblock[255] = 0;
871         block = qemu_ram_block_by_name(packet->ramblock);
872         if (!block) {
873             error_setg(errp, "multifd: unknown ram block %s",
874                        packet->ramblock);
875             return -1;
876         }
877     }
878 
879     for (i = 0; i < p->pages->used; i++) {
880         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
881 
882         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
883             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
884                        " (max " RAM_ADDR_FMT ")",
885                        offset, block->max_length);
886             return -1;
887         }
888         p->pages->iov[i].iov_base = block->host + offset;
889         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
890     }
891 
892     return 0;
893 }
894 
895 struct {
896     MultiFDSendParams *params;
897     /* number of created threads */
898     int count;
899     /* array of pages to sent */
900     MultiFDPages_t *pages;
901     /* syncs main thread and channels */
902     QemuSemaphore sem_sync;
903     /* global number of generated multifd packets */
904     uint64_t packet_num;
905     /* send channels ready */
906     QemuSemaphore channels_ready;
907 } *multifd_send_state;
908 
909 /*
910  * How we use multifd_send_state->pages and channel->pages?
911  *
912  * We create a pages for each channel, and a main one.  Each time that
913  * we need to send a batch of pages we interchange the ones between
914  * multifd_send_state and the channel that is sending it.  There are
915  * two reasons for that:
916  *    - to not have to do so many mallocs during migration
917  *    - to make easier to know what to free at the end of migration
918  *
919  * This way we always know who is the owner of each "pages" struct,
920  * and we don't need any locking.  It belongs to the migration thread
921  * or to the channel thread.  Switching is safe because the migration
922  * thread is using the channel mutex when changing it, and the channel
923  * have to had finish with its own, otherwise pending_job can't be
924  * false.
925  */
926 
927 static void multifd_send_pages(void)
928 {
929     int i;
930     static int next_channel;
931     MultiFDSendParams *p = NULL; /* make happy gcc */
932     MultiFDPages_t *pages = multifd_send_state->pages;
933     uint64_t transferred;
934 
935     qemu_sem_wait(&multifd_send_state->channels_ready);
936     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
937         p = &multifd_send_state->params[i];
938 
939         qemu_mutex_lock(&p->mutex);
940         if (!p->pending_job) {
941             p->pending_job++;
942             next_channel = (i + 1) % migrate_multifd_channels();
943             break;
944         }
945         qemu_mutex_unlock(&p->mutex);
946     }
947     p->pages->used = 0;
948 
949     p->packet_num = multifd_send_state->packet_num++;
950     p->pages->block = NULL;
951     multifd_send_state->pages = p->pages;
952     p->pages = pages;
953     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
954     ram_counters.multifd_bytes += transferred;
955     ram_counters.transferred += transferred;;
956     qemu_mutex_unlock(&p->mutex);
957     qemu_sem_post(&p->sem);
958 }
959 
960 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
961 {
962     MultiFDPages_t *pages = multifd_send_state->pages;
963 
964     if (!pages->block) {
965         pages->block = block;
966     }
967 
968     if (pages->block == block) {
969         pages->offset[pages->used] = offset;
970         pages->iov[pages->used].iov_base = block->host + offset;
971         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
972         pages->used++;
973 
974         if (pages->used < pages->allocated) {
975             return;
976         }
977     }
978 
979     multifd_send_pages();
980 
981     if (pages->block != block) {
982         multifd_queue_page(block, offset);
983     }
984 }
985 
986 static void multifd_send_terminate_threads(Error *err)
987 {
988     int i;
989 
990     if (err) {
991         MigrationState *s = migrate_get_current();
992         migrate_set_error(s, err);
993         if (s->state == MIGRATION_STATUS_SETUP ||
994             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
995             s->state == MIGRATION_STATUS_DEVICE ||
996             s->state == MIGRATION_STATUS_ACTIVE) {
997             migrate_set_state(&s->state, s->state,
998                               MIGRATION_STATUS_FAILED);
999         }
1000     }
1001 
1002     for (i = 0; i < migrate_multifd_channels(); i++) {
1003         MultiFDSendParams *p = &multifd_send_state->params[i];
1004 
1005         qemu_mutex_lock(&p->mutex);
1006         p->quit = true;
1007         qemu_sem_post(&p->sem);
1008         qemu_mutex_unlock(&p->mutex);
1009     }
1010 }
1011 
1012 void multifd_save_cleanup(void)
1013 {
1014     int i;
1015 
1016     if (!migrate_use_multifd()) {
1017         return;
1018     }
1019     multifd_send_terminate_threads(NULL);
1020     for (i = 0; i < migrate_multifd_channels(); i++) {
1021         MultiFDSendParams *p = &multifd_send_state->params[i];
1022 
1023         if (p->running) {
1024             qemu_thread_join(&p->thread);
1025         }
1026         socket_send_channel_destroy(p->c);
1027         p->c = NULL;
1028         qemu_mutex_destroy(&p->mutex);
1029         qemu_sem_destroy(&p->sem);
1030         qemu_sem_destroy(&p->sem_sync);
1031         g_free(p->name);
1032         p->name = NULL;
1033         multifd_pages_clear(p->pages);
1034         p->pages = NULL;
1035         p->packet_len = 0;
1036         g_free(p->packet);
1037         p->packet = NULL;
1038     }
1039     qemu_sem_destroy(&multifd_send_state->channels_ready);
1040     qemu_sem_destroy(&multifd_send_state->sem_sync);
1041     g_free(multifd_send_state->params);
1042     multifd_send_state->params = NULL;
1043     multifd_pages_clear(multifd_send_state->pages);
1044     multifd_send_state->pages = NULL;
1045     g_free(multifd_send_state);
1046     multifd_send_state = NULL;
1047 }
1048 
1049 static void multifd_send_sync_main(void)
1050 {
1051     int i;
1052 
1053     if (!migrate_use_multifd()) {
1054         return;
1055     }
1056     if (multifd_send_state->pages->used) {
1057         multifd_send_pages();
1058     }
1059     for (i = 0; i < migrate_multifd_channels(); i++) {
1060         MultiFDSendParams *p = &multifd_send_state->params[i];
1061 
1062         trace_multifd_send_sync_main_signal(p->id);
1063 
1064         qemu_mutex_lock(&p->mutex);
1065 
1066         p->packet_num = multifd_send_state->packet_num++;
1067         p->flags |= MULTIFD_FLAG_SYNC;
1068         p->pending_job++;
1069         qemu_mutex_unlock(&p->mutex);
1070         qemu_sem_post(&p->sem);
1071     }
1072     for (i = 0; i < migrate_multifd_channels(); i++) {
1073         MultiFDSendParams *p = &multifd_send_state->params[i];
1074 
1075         trace_multifd_send_sync_main_wait(p->id);
1076         qemu_sem_wait(&multifd_send_state->sem_sync);
1077     }
1078     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1079 }
1080 
1081 static void *multifd_send_thread(void *opaque)
1082 {
1083     MultiFDSendParams *p = opaque;
1084     Error *local_err = NULL;
1085     int ret;
1086 
1087     trace_multifd_send_thread_start(p->id);
1088     rcu_register_thread();
1089 
1090     if (multifd_send_initial_packet(p, &local_err) < 0) {
1091         goto out;
1092     }
1093     /* initial packet */
1094     p->num_packets = 1;
1095 
1096     while (true) {
1097         qemu_sem_wait(&p->sem);
1098         qemu_mutex_lock(&p->mutex);
1099 
1100         if (p->pending_job) {
1101             uint32_t used = p->pages->used;
1102             uint64_t packet_num = p->packet_num;
1103             uint32_t flags = p->flags;
1104 
1105             p->next_packet_size = used * qemu_target_page_size();
1106             multifd_send_fill_packet(p);
1107             p->flags = 0;
1108             p->num_packets++;
1109             p->num_pages += used;
1110             p->pages->used = 0;
1111             qemu_mutex_unlock(&p->mutex);
1112 
1113             trace_multifd_send(p->id, packet_num, used, flags,
1114                                p->next_packet_size);
1115 
1116             ret = qio_channel_write_all(p->c, (void *)p->packet,
1117                                         p->packet_len, &local_err);
1118             if (ret != 0) {
1119                 break;
1120             }
1121 
1122             if (used) {
1123                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1124                                              used, &local_err);
1125                 if (ret != 0) {
1126                     break;
1127                 }
1128             }
1129 
1130             qemu_mutex_lock(&p->mutex);
1131             p->pending_job--;
1132             qemu_mutex_unlock(&p->mutex);
1133 
1134             if (flags & MULTIFD_FLAG_SYNC) {
1135                 qemu_sem_post(&multifd_send_state->sem_sync);
1136             }
1137             qemu_sem_post(&multifd_send_state->channels_ready);
1138         } else if (p->quit) {
1139             qemu_mutex_unlock(&p->mutex);
1140             break;
1141         } else {
1142             qemu_mutex_unlock(&p->mutex);
1143             /* sometimes there are spurious wakeups */
1144         }
1145     }
1146 
1147 out:
1148     if (local_err) {
1149         multifd_send_terminate_threads(local_err);
1150     }
1151 
1152     qemu_mutex_lock(&p->mutex);
1153     p->running = false;
1154     qemu_mutex_unlock(&p->mutex);
1155 
1156     rcu_unregister_thread();
1157     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1158 
1159     return NULL;
1160 }
1161 
1162 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1163 {
1164     MultiFDSendParams *p = opaque;
1165     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1166     Error *local_err = NULL;
1167 
1168     if (qio_task_propagate_error(task, &local_err)) {
1169         migrate_set_error(migrate_get_current(), local_err);
1170         multifd_save_cleanup();
1171     } else {
1172         p->c = QIO_CHANNEL(sioc);
1173         qio_channel_set_delay(p->c, false);
1174         p->running = true;
1175         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1176                            QEMU_THREAD_JOINABLE);
1177 
1178         atomic_inc(&multifd_send_state->count);
1179     }
1180 }
1181 
1182 int multifd_save_setup(void)
1183 {
1184     int thread_count;
1185     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1186     uint8_t i;
1187 
1188     if (!migrate_use_multifd()) {
1189         return 0;
1190     }
1191     thread_count = migrate_multifd_channels();
1192     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1193     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1194     atomic_set(&multifd_send_state->count, 0);
1195     multifd_send_state->pages = multifd_pages_init(page_count);
1196     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1197     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1198 
1199     for (i = 0; i < thread_count; i++) {
1200         MultiFDSendParams *p = &multifd_send_state->params[i];
1201 
1202         qemu_mutex_init(&p->mutex);
1203         qemu_sem_init(&p->sem, 0);
1204         qemu_sem_init(&p->sem_sync, 0);
1205         p->quit = false;
1206         p->pending_job = 0;
1207         p->id = i;
1208         p->pages = multifd_pages_init(page_count);
1209         p->packet_len = sizeof(MultiFDPacket_t)
1210                       + sizeof(ram_addr_t) * page_count;
1211         p->packet = g_malloc0(p->packet_len);
1212         p->name = g_strdup_printf("multifdsend_%d", i);
1213         socket_send_channel_create(multifd_new_send_channel_async, p);
1214     }
1215     return 0;
1216 }
1217 
1218 struct {
1219     MultiFDRecvParams *params;
1220     /* number of created threads */
1221     int count;
1222     /* syncs main thread and channels */
1223     QemuSemaphore sem_sync;
1224     /* global number of generated multifd packets */
1225     uint64_t packet_num;
1226 } *multifd_recv_state;
1227 
1228 static void multifd_recv_terminate_threads(Error *err)
1229 {
1230     int i;
1231 
1232     if (err) {
1233         MigrationState *s = migrate_get_current();
1234         migrate_set_error(s, err);
1235         if (s->state == MIGRATION_STATUS_SETUP ||
1236             s->state == MIGRATION_STATUS_ACTIVE) {
1237             migrate_set_state(&s->state, s->state,
1238                               MIGRATION_STATUS_FAILED);
1239         }
1240     }
1241 
1242     for (i = 0; i < migrate_multifd_channels(); i++) {
1243         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1244 
1245         qemu_mutex_lock(&p->mutex);
1246         /* We could arrive here for two reasons:
1247            - normal quit, i.e. everything went fine, just finished
1248            - error quit: We close the channels so the channel threads
1249              finish the qio_channel_read_all_eof() */
1250         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1251         qemu_mutex_unlock(&p->mutex);
1252     }
1253 }
1254 
1255 int multifd_load_cleanup(Error **errp)
1256 {
1257     int i;
1258     int ret = 0;
1259 
1260     if (!migrate_use_multifd()) {
1261         return 0;
1262     }
1263     multifd_recv_terminate_threads(NULL);
1264     for (i = 0; i < migrate_multifd_channels(); i++) {
1265         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1266 
1267         if (p->running) {
1268             qemu_thread_join(&p->thread);
1269         }
1270         object_unref(OBJECT(p->c));
1271         p->c = NULL;
1272         qemu_mutex_destroy(&p->mutex);
1273         qemu_sem_destroy(&p->sem_sync);
1274         g_free(p->name);
1275         p->name = NULL;
1276         multifd_pages_clear(p->pages);
1277         p->pages = NULL;
1278         p->packet_len = 0;
1279         g_free(p->packet);
1280         p->packet = NULL;
1281     }
1282     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1283     g_free(multifd_recv_state->params);
1284     multifd_recv_state->params = NULL;
1285     g_free(multifd_recv_state);
1286     multifd_recv_state = NULL;
1287 
1288     return ret;
1289 }
1290 
1291 static void multifd_recv_sync_main(void)
1292 {
1293     int i;
1294 
1295     if (!migrate_use_multifd()) {
1296         return;
1297     }
1298     for (i = 0; i < migrate_multifd_channels(); i++) {
1299         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1300 
1301         trace_multifd_recv_sync_main_wait(p->id);
1302         qemu_sem_wait(&multifd_recv_state->sem_sync);
1303         qemu_mutex_lock(&p->mutex);
1304         if (multifd_recv_state->packet_num < p->packet_num) {
1305             multifd_recv_state->packet_num = p->packet_num;
1306         }
1307         qemu_mutex_unlock(&p->mutex);
1308     }
1309     for (i = 0; i < migrate_multifd_channels(); i++) {
1310         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1311 
1312         trace_multifd_recv_sync_main_signal(p->id);
1313         qemu_sem_post(&p->sem_sync);
1314     }
1315     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1316 }
1317 
1318 static void *multifd_recv_thread(void *opaque)
1319 {
1320     MultiFDRecvParams *p = opaque;
1321     Error *local_err = NULL;
1322     int ret;
1323 
1324     trace_multifd_recv_thread_start(p->id);
1325     rcu_register_thread();
1326 
1327     while (true) {
1328         uint32_t used;
1329         uint32_t flags;
1330 
1331         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1332                                        p->packet_len, &local_err);
1333         if (ret == 0) {   /* EOF */
1334             break;
1335         }
1336         if (ret == -1) {   /* Error */
1337             break;
1338         }
1339 
1340         qemu_mutex_lock(&p->mutex);
1341         ret = multifd_recv_unfill_packet(p, &local_err);
1342         if (ret) {
1343             qemu_mutex_unlock(&p->mutex);
1344             break;
1345         }
1346 
1347         used = p->pages->used;
1348         flags = p->flags;
1349         trace_multifd_recv(p->id, p->packet_num, used, flags,
1350                            p->next_packet_size);
1351         p->num_packets++;
1352         p->num_pages += used;
1353         qemu_mutex_unlock(&p->mutex);
1354 
1355         if (used) {
1356             ret = qio_channel_readv_all(p->c, p->pages->iov,
1357                                         used, &local_err);
1358             if (ret != 0) {
1359                 break;
1360             }
1361         }
1362 
1363         if (flags & MULTIFD_FLAG_SYNC) {
1364             qemu_sem_post(&multifd_recv_state->sem_sync);
1365             qemu_sem_wait(&p->sem_sync);
1366         }
1367     }
1368 
1369     if (local_err) {
1370         multifd_recv_terminate_threads(local_err);
1371     }
1372     qemu_mutex_lock(&p->mutex);
1373     p->running = false;
1374     qemu_mutex_unlock(&p->mutex);
1375 
1376     rcu_unregister_thread();
1377     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1378 
1379     return NULL;
1380 }
1381 
1382 int multifd_load_setup(void)
1383 {
1384     int thread_count;
1385     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1386     uint8_t i;
1387 
1388     if (!migrate_use_multifd()) {
1389         return 0;
1390     }
1391     thread_count = migrate_multifd_channels();
1392     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1393     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1394     atomic_set(&multifd_recv_state->count, 0);
1395     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1396 
1397     for (i = 0; i < thread_count; i++) {
1398         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1399 
1400         qemu_mutex_init(&p->mutex);
1401         qemu_sem_init(&p->sem_sync, 0);
1402         p->id = i;
1403         p->pages = multifd_pages_init(page_count);
1404         p->packet_len = sizeof(MultiFDPacket_t)
1405                       + sizeof(ram_addr_t) * page_count;
1406         p->packet = g_malloc0(p->packet_len);
1407         p->name = g_strdup_printf("multifdrecv_%d", i);
1408     }
1409     return 0;
1410 }
1411 
1412 bool multifd_recv_all_channels_created(void)
1413 {
1414     int thread_count = migrate_multifd_channels();
1415 
1416     if (!migrate_use_multifd()) {
1417         return true;
1418     }
1419 
1420     return thread_count == atomic_read(&multifd_recv_state->count);
1421 }
1422 
1423 /*
1424  * Try to receive all multifd channels to get ready for the migration.
1425  * - Return true and do not set @errp when correctly receving all channels;
1426  * - Return false and do not set @errp when correctly receiving the current one;
1427  * - Return false and set @errp when failing to receive the current channel.
1428  */
1429 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1430 {
1431     MultiFDRecvParams *p;
1432     Error *local_err = NULL;
1433     int id;
1434 
1435     id = multifd_recv_initial_packet(ioc, &local_err);
1436     if (id < 0) {
1437         multifd_recv_terminate_threads(local_err);
1438         error_propagate_prepend(errp, local_err,
1439                                 "failed to receive packet"
1440                                 " via multifd channel %d: ",
1441                                 atomic_read(&multifd_recv_state->count));
1442         return false;
1443     }
1444 
1445     p = &multifd_recv_state->params[id];
1446     if (p->c != NULL) {
1447         error_setg(&local_err, "multifd: received id '%d' already setup'",
1448                    id);
1449         multifd_recv_terminate_threads(local_err);
1450         error_propagate(errp, local_err);
1451         return false;
1452     }
1453     p->c = ioc;
1454     object_ref(OBJECT(ioc));
1455     /* initial packet */
1456     p->num_packets = 1;
1457 
1458     p->running = true;
1459     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1460                        QEMU_THREAD_JOINABLE);
1461     atomic_inc(&multifd_recv_state->count);
1462     return atomic_read(&multifd_recv_state->count) ==
1463            migrate_multifd_channels();
1464 }
1465 
1466 /**
1467  * save_page_header: write page header to wire
1468  *
1469  * If this is the 1st block, it also writes the block identification
1470  *
1471  * Returns the number of bytes written
1472  *
1473  * @f: QEMUFile where to send the data
1474  * @block: block that contains the page we want to send
1475  * @offset: offset inside the block for the page
1476  *          in the lower bits, it contains flags
1477  */
1478 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1479                                ram_addr_t offset)
1480 {
1481     size_t size, len;
1482 
1483     if (block == rs->last_sent_block) {
1484         offset |= RAM_SAVE_FLAG_CONTINUE;
1485     }
1486     qemu_put_be64(f, offset);
1487     size = 8;
1488 
1489     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1490         len = strlen(block->idstr);
1491         qemu_put_byte(f, len);
1492         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1493         size += 1 + len;
1494         rs->last_sent_block = block;
1495     }
1496     return size;
1497 }
1498 
1499 /**
1500  * mig_throttle_guest_down: throotle down the guest
1501  *
1502  * Reduce amount of guest cpu execution to hopefully slow down memory
1503  * writes. If guest dirty memory rate is reduced below the rate at
1504  * which we can transfer pages to the destination then we should be
1505  * able to complete migration. Some workloads dirty memory way too
1506  * fast and will not effectively converge, even with auto-converge.
1507  */
1508 static void mig_throttle_guest_down(void)
1509 {
1510     MigrationState *s = migrate_get_current();
1511     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1512     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1513     int pct_max = s->parameters.max_cpu_throttle;
1514 
1515     /* We have not started throttling yet. Let's start it. */
1516     if (!cpu_throttle_active()) {
1517         cpu_throttle_set(pct_initial);
1518     } else {
1519         /* Throttling already on, just increase the rate */
1520         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1521                          pct_max));
1522     }
1523 }
1524 
1525 /**
1526  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1527  *
1528  * @rs: current RAM state
1529  * @current_addr: address for the zero page
1530  *
1531  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1532  * The important thing is that a stale (not-yet-0'd) page be replaced
1533  * by the new data.
1534  * As a bonus, if the page wasn't in the cache it gets added so that
1535  * when a small write is made into the 0'd page it gets XBZRLE sent.
1536  */
1537 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1538 {
1539     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1540         return;
1541     }
1542 
1543     /* We don't care if this fails to allocate a new cache page
1544      * as long as it updated an old one */
1545     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1546                  ram_counters.dirty_sync_count);
1547 }
1548 
1549 #define ENCODING_FLAG_XBZRLE 0x1
1550 
1551 /**
1552  * save_xbzrle_page: compress and send current page
1553  *
1554  * Returns: 1 means that we wrote the page
1555  *          0 means that page is identical to the one already sent
1556  *          -1 means that xbzrle would be longer than normal
1557  *
1558  * @rs: current RAM state
1559  * @current_data: pointer to the address of the page contents
1560  * @current_addr: addr of the page
1561  * @block: block that contains the page we want to send
1562  * @offset: offset inside the block for the page
1563  * @last_stage: if we are at the completion stage
1564  */
1565 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1566                             ram_addr_t current_addr, RAMBlock *block,
1567                             ram_addr_t offset, bool last_stage)
1568 {
1569     int encoded_len = 0, bytes_xbzrle;
1570     uint8_t *prev_cached_page;
1571 
1572     if (!cache_is_cached(XBZRLE.cache, current_addr,
1573                          ram_counters.dirty_sync_count)) {
1574         xbzrle_counters.cache_miss++;
1575         if (!last_stage) {
1576             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1577                              ram_counters.dirty_sync_count) == -1) {
1578                 return -1;
1579             } else {
1580                 /* update *current_data when the page has been
1581                    inserted into cache */
1582                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1583             }
1584         }
1585         return -1;
1586     }
1587 
1588     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1589 
1590     /* save current buffer into memory */
1591     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1592 
1593     /* XBZRLE encoding (if there is no overflow) */
1594     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1595                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1596                                        TARGET_PAGE_SIZE);
1597     if (encoded_len == 0) {
1598         trace_save_xbzrle_page_skipping();
1599         return 0;
1600     } else if (encoded_len == -1) {
1601         trace_save_xbzrle_page_overflow();
1602         xbzrle_counters.overflow++;
1603         /* update data in the cache */
1604         if (!last_stage) {
1605             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1606             *current_data = prev_cached_page;
1607         }
1608         return -1;
1609     }
1610 
1611     /* we need to update the data in the cache, in order to get the same data */
1612     if (!last_stage) {
1613         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1614     }
1615 
1616     /* Send XBZRLE based compressed page */
1617     bytes_xbzrle = save_page_header(rs, rs->f, block,
1618                                     offset | RAM_SAVE_FLAG_XBZRLE);
1619     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1620     qemu_put_be16(rs->f, encoded_len);
1621     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1622     bytes_xbzrle += encoded_len + 1 + 2;
1623     xbzrle_counters.pages++;
1624     xbzrle_counters.bytes += bytes_xbzrle;
1625     ram_counters.transferred += bytes_xbzrle;
1626 
1627     return 1;
1628 }
1629 
1630 /**
1631  * migration_bitmap_find_dirty: find the next dirty page from start
1632  *
1633  * Returns the page offset within memory region of the start of a dirty page
1634  *
1635  * @rs: current RAM state
1636  * @rb: RAMBlock where to search for dirty pages
1637  * @start: page where we start the search
1638  */
1639 static inline
1640 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1641                                           unsigned long start)
1642 {
1643     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1644     unsigned long *bitmap = rb->bmap;
1645     unsigned long next;
1646 
1647     if (ramblock_is_ignored(rb)) {
1648         return size;
1649     }
1650 
1651     /*
1652      * When the free page optimization is enabled, we need to check the bitmap
1653      * to send the non-free pages rather than all the pages in the bulk stage.
1654      */
1655     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1656         next = start + 1;
1657     } else {
1658         next = find_next_bit(bitmap, size, start);
1659     }
1660 
1661     return next;
1662 }
1663 
1664 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1665                                                 RAMBlock *rb,
1666                                                 unsigned long page)
1667 {
1668     bool ret;
1669 
1670     qemu_mutex_lock(&rs->bitmap_mutex);
1671     ret = test_and_clear_bit(page, rb->bmap);
1672 
1673     if (ret) {
1674         rs->migration_dirty_pages--;
1675     }
1676     qemu_mutex_unlock(&rs->bitmap_mutex);
1677 
1678     return ret;
1679 }
1680 
1681 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1682                                         ram_addr_t length)
1683 {
1684     rs->migration_dirty_pages +=
1685         cpu_physical_memory_sync_dirty_bitmap(rb, 0, length,
1686                                               &rs->num_dirty_pages_period);
1687 }
1688 
1689 /**
1690  * ram_pagesize_summary: calculate all the pagesizes of a VM
1691  *
1692  * Returns a summary bitmap of the page sizes of all RAMBlocks
1693  *
1694  * For VMs with just normal pages this is equivalent to the host page
1695  * size. If it's got some huge pages then it's the OR of all the
1696  * different page sizes.
1697  */
1698 uint64_t ram_pagesize_summary(void)
1699 {
1700     RAMBlock *block;
1701     uint64_t summary = 0;
1702 
1703     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1704         summary |= block->page_size;
1705     }
1706 
1707     return summary;
1708 }
1709 
1710 uint64_t ram_get_total_transferred_pages(void)
1711 {
1712     return  ram_counters.normal + ram_counters.duplicate +
1713                 compression_counters.pages + xbzrle_counters.pages;
1714 }
1715 
1716 static void migration_update_rates(RAMState *rs, int64_t end_time)
1717 {
1718     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1719     double compressed_size;
1720 
1721     /* calculate period counters */
1722     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1723                 / (end_time - rs->time_last_bitmap_sync);
1724 
1725     if (!page_count) {
1726         return;
1727     }
1728 
1729     if (migrate_use_xbzrle()) {
1730         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1731             rs->xbzrle_cache_miss_prev) / page_count;
1732         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1733     }
1734 
1735     if (migrate_use_compression()) {
1736         compression_counters.busy_rate = (double)(compression_counters.busy -
1737             rs->compress_thread_busy_prev) / page_count;
1738         rs->compress_thread_busy_prev = compression_counters.busy;
1739 
1740         compressed_size = compression_counters.compressed_size -
1741                           rs->compressed_size_prev;
1742         if (compressed_size) {
1743             double uncompressed_size = (compression_counters.pages -
1744                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1745 
1746             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1747             compression_counters.compression_rate =
1748                                         uncompressed_size / compressed_size;
1749 
1750             rs->compress_pages_prev = compression_counters.pages;
1751             rs->compressed_size_prev = compression_counters.compressed_size;
1752         }
1753     }
1754 }
1755 
1756 static void migration_bitmap_sync(RAMState *rs)
1757 {
1758     RAMBlock *block;
1759     int64_t end_time;
1760     uint64_t bytes_xfer_now;
1761 
1762     ram_counters.dirty_sync_count++;
1763 
1764     if (!rs->time_last_bitmap_sync) {
1765         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1766     }
1767 
1768     trace_migration_bitmap_sync_start();
1769     memory_global_dirty_log_sync();
1770 
1771     qemu_mutex_lock(&rs->bitmap_mutex);
1772     rcu_read_lock();
1773     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1774         migration_bitmap_sync_range(rs, block, block->used_length);
1775     }
1776     ram_counters.remaining = ram_bytes_remaining();
1777     rcu_read_unlock();
1778     qemu_mutex_unlock(&rs->bitmap_mutex);
1779 
1780     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1781 
1782     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1783 
1784     /* more than 1 second = 1000 millisecons */
1785     if (end_time > rs->time_last_bitmap_sync + 1000) {
1786         bytes_xfer_now = ram_counters.transferred;
1787 
1788         /* During block migration the auto-converge logic incorrectly detects
1789          * that ram migration makes no progress. Avoid this by disabling the
1790          * throttling logic during the bulk phase of block migration. */
1791         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1792             /* The following detection logic can be refined later. For now:
1793                Check to see if the dirtied bytes is 50% more than the approx.
1794                amount of bytes that just got transferred since the last time we
1795                were in this routine. If that happens twice, start or increase
1796                throttling */
1797 
1798             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1799                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1800                 (++rs->dirty_rate_high_cnt >= 2)) {
1801                     trace_migration_throttle();
1802                     rs->dirty_rate_high_cnt = 0;
1803                     mig_throttle_guest_down();
1804             }
1805         }
1806 
1807         migration_update_rates(rs, end_time);
1808 
1809         rs->target_page_count_prev = rs->target_page_count;
1810 
1811         /* reset period counters */
1812         rs->time_last_bitmap_sync = end_time;
1813         rs->num_dirty_pages_period = 0;
1814         rs->bytes_xfer_prev = bytes_xfer_now;
1815     }
1816     if (migrate_use_events()) {
1817         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1818     }
1819 }
1820 
1821 static void migration_bitmap_sync_precopy(RAMState *rs)
1822 {
1823     Error *local_err = NULL;
1824 
1825     /*
1826      * The current notifier usage is just an optimization to migration, so we
1827      * don't stop the normal migration process in the error case.
1828      */
1829     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1830         error_report_err(local_err);
1831     }
1832 
1833     migration_bitmap_sync(rs);
1834 
1835     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1836         error_report_err(local_err);
1837     }
1838 }
1839 
1840 /**
1841  * save_zero_page_to_file: send the zero page to the file
1842  *
1843  * Returns the size of data written to the file, 0 means the page is not
1844  * a zero page
1845  *
1846  * @rs: current RAM state
1847  * @file: the file where the data is saved
1848  * @block: block that contains the page we want to send
1849  * @offset: offset inside the block for the page
1850  */
1851 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1852                                   RAMBlock *block, ram_addr_t offset)
1853 {
1854     uint8_t *p = block->host + offset;
1855     int len = 0;
1856 
1857     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1858         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1859         qemu_put_byte(file, 0);
1860         len += 1;
1861     }
1862     return len;
1863 }
1864 
1865 /**
1866  * save_zero_page: send the zero page to the stream
1867  *
1868  * Returns the number of pages written.
1869  *
1870  * @rs: current RAM state
1871  * @block: block that contains the page we want to send
1872  * @offset: offset inside the block for the page
1873  */
1874 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1875 {
1876     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1877 
1878     if (len) {
1879         ram_counters.duplicate++;
1880         ram_counters.transferred += len;
1881         return 1;
1882     }
1883     return -1;
1884 }
1885 
1886 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1887 {
1888     if (!migrate_release_ram() || !migration_in_postcopy()) {
1889         return;
1890     }
1891 
1892     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1893 }
1894 
1895 /*
1896  * @pages: the number of pages written by the control path,
1897  *        < 0 - error
1898  *        > 0 - number of pages written
1899  *
1900  * Return true if the pages has been saved, otherwise false is returned.
1901  */
1902 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1903                               int *pages)
1904 {
1905     uint64_t bytes_xmit = 0;
1906     int ret;
1907 
1908     *pages = -1;
1909     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1910                                 &bytes_xmit);
1911     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1912         return false;
1913     }
1914 
1915     if (bytes_xmit) {
1916         ram_counters.transferred += bytes_xmit;
1917         *pages = 1;
1918     }
1919 
1920     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1921         return true;
1922     }
1923 
1924     if (bytes_xmit > 0) {
1925         ram_counters.normal++;
1926     } else if (bytes_xmit == 0) {
1927         ram_counters.duplicate++;
1928     }
1929 
1930     return true;
1931 }
1932 
1933 /*
1934  * directly send the page to the stream
1935  *
1936  * Returns the number of pages written.
1937  *
1938  * @rs: current RAM state
1939  * @block: block that contains the page we want to send
1940  * @offset: offset inside the block for the page
1941  * @buf: the page to be sent
1942  * @async: send to page asyncly
1943  */
1944 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1945                             uint8_t *buf, bool async)
1946 {
1947     ram_counters.transferred += save_page_header(rs, rs->f, block,
1948                                                  offset | RAM_SAVE_FLAG_PAGE);
1949     if (async) {
1950         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1951                               migrate_release_ram() &
1952                               migration_in_postcopy());
1953     } else {
1954         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1955     }
1956     ram_counters.transferred += TARGET_PAGE_SIZE;
1957     ram_counters.normal++;
1958     return 1;
1959 }
1960 
1961 /**
1962  * ram_save_page: send the given page to the stream
1963  *
1964  * Returns the number of pages written.
1965  *          < 0 - error
1966  *          >=0 - Number of pages written - this might legally be 0
1967  *                if xbzrle noticed the page was the same.
1968  *
1969  * @rs: current RAM state
1970  * @block: block that contains the page we want to send
1971  * @offset: offset inside the block for the page
1972  * @last_stage: if we are at the completion stage
1973  */
1974 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1975 {
1976     int pages = -1;
1977     uint8_t *p;
1978     bool send_async = true;
1979     RAMBlock *block = pss->block;
1980     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1981     ram_addr_t current_addr = block->offset + offset;
1982 
1983     p = block->host + offset;
1984     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1985 
1986     XBZRLE_cache_lock();
1987     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1988         migrate_use_xbzrle()) {
1989         pages = save_xbzrle_page(rs, &p, current_addr, block,
1990                                  offset, last_stage);
1991         if (!last_stage) {
1992             /* Can't send this cached data async, since the cache page
1993              * might get updated before it gets to the wire
1994              */
1995             send_async = false;
1996         }
1997     }
1998 
1999     /* XBZRLE overflow or normal page */
2000     if (pages == -1) {
2001         pages = save_normal_page(rs, block, offset, p, send_async);
2002     }
2003 
2004     XBZRLE_cache_unlock();
2005 
2006     return pages;
2007 }
2008 
2009 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2010                                  ram_addr_t offset)
2011 {
2012     multifd_queue_page(block, offset);
2013     ram_counters.normal++;
2014 
2015     return 1;
2016 }
2017 
2018 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2019                                  ram_addr_t offset, uint8_t *source_buf)
2020 {
2021     RAMState *rs = ram_state;
2022     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2023     bool zero_page = false;
2024     int ret;
2025 
2026     if (save_zero_page_to_file(rs, f, block, offset)) {
2027         zero_page = true;
2028         goto exit;
2029     }
2030 
2031     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2032 
2033     /*
2034      * copy it to a internal buffer to avoid it being modified by VM
2035      * so that we can catch up the error during compression and
2036      * decompression
2037      */
2038     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2039     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2040     if (ret < 0) {
2041         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2042         error_report("compressed data failed!");
2043         return false;
2044     }
2045 
2046 exit:
2047     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2048     return zero_page;
2049 }
2050 
2051 static void
2052 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2053 {
2054     ram_counters.transferred += bytes_xmit;
2055 
2056     if (param->zero_page) {
2057         ram_counters.duplicate++;
2058         return;
2059     }
2060 
2061     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2062     compression_counters.compressed_size += bytes_xmit - 8;
2063     compression_counters.pages++;
2064 }
2065 
2066 static bool save_page_use_compression(RAMState *rs);
2067 
2068 static void flush_compressed_data(RAMState *rs)
2069 {
2070     int idx, len, thread_count;
2071 
2072     if (!save_page_use_compression(rs)) {
2073         return;
2074     }
2075     thread_count = migrate_compress_threads();
2076 
2077     qemu_mutex_lock(&comp_done_lock);
2078     for (idx = 0; idx < thread_count; idx++) {
2079         while (!comp_param[idx].done) {
2080             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2081         }
2082     }
2083     qemu_mutex_unlock(&comp_done_lock);
2084 
2085     for (idx = 0; idx < thread_count; idx++) {
2086         qemu_mutex_lock(&comp_param[idx].mutex);
2087         if (!comp_param[idx].quit) {
2088             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2089             /*
2090              * it's safe to fetch zero_page without holding comp_done_lock
2091              * as there is no further request submitted to the thread,
2092              * i.e, the thread should be waiting for a request at this point.
2093              */
2094             update_compress_thread_counts(&comp_param[idx], len);
2095         }
2096         qemu_mutex_unlock(&comp_param[idx].mutex);
2097     }
2098 }
2099 
2100 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2101                                        ram_addr_t offset)
2102 {
2103     param->block = block;
2104     param->offset = offset;
2105 }
2106 
2107 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2108                                            ram_addr_t offset)
2109 {
2110     int idx, thread_count, bytes_xmit = -1, pages = -1;
2111     bool wait = migrate_compress_wait_thread();
2112 
2113     thread_count = migrate_compress_threads();
2114     qemu_mutex_lock(&comp_done_lock);
2115 retry:
2116     for (idx = 0; idx < thread_count; idx++) {
2117         if (comp_param[idx].done) {
2118             comp_param[idx].done = false;
2119             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2120             qemu_mutex_lock(&comp_param[idx].mutex);
2121             set_compress_params(&comp_param[idx], block, offset);
2122             qemu_cond_signal(&comp_param[idx].cond);
2123             qemu_mutex_unlock(&comp_param[idx].mutex);
2124             pages = 1;
2125             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2126             break;
2127         }
2128     }
2129 
2130     /*
2131      * wait for the free thread if the user specifies 'compress-wait-thread',
2132      * otherwise we will post the page out in the main thread as normal page.
2133      */
2134     if (pages < 0 && wait) {
2135         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2136         goto retry;
2137     }
2138     qemu_mutex_unlock(&comp_done_lock);
2139 
2140     return pages;
2141 }
2142 
2143 /**
2144  * find_dirty_block: find the next dirty page and update any state
2145  * associated with the search process.
2146  *
2147  * Returns true if a page is found
2148  *
2149  * @rs: current RAM state
2150  * @pss: data about the state of the current dirty page scan
2151  * @again: set to false if the search has scanned the whole of RAM
2152  */
2153 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2154 {
2155     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2156     if (pss->complete_round && pss->block == rs->last_seen_block &&
2157         pss->page >= rs->last_page) {
2158         /*
2159          * We've been once around the RAM and haven't found anything.
2160          * Give up.
2161          */
2162         *again = false;
2163         return false;
2164     }
2165     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2166         /* Didn't find anything in this RAM Block */
2167         pss->page = 0;
2168         pss->block = QLIST_NEXT_RCU(pss->block, next);
2169         if (!pss->block) {
2170             /*
2171              * If memory migration starts over, we will meet a dirtied page
2172              * which may still exists in compression threads's ring, so we
2173              * should flush the compressed data to make sure the new page
2174              * is not overwritten by the old one in the destination.
2175              *
2176              * Also If xbzrle is on, stop using the data compression at this
2177              * point. In theory, xbzrle can do better than compression.
2178              */
2179             flush_compressed_data(rs);
2180 
2181             /* Hit the end of the list */
2182             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2183             /* Flag that we've looped */
2184             pss->complete_round = true;
2185             rs->ram_bulk_stage = false;
2186         }
2187         /* Didn't find anything this time, but try again on the new block */
2188         *again = true;
2189         return false;
2190     } else {
2191         /* Can go around again, but... */
2192         *again = true;
2193         /* We've found something so probably don't need to */
2194         return true;
2195     }
2196 }
2197 
2198 /**
2199  * unqueue_page: gets a page of the queue
2200  *
2201  * Helper for 'get_queued_page' - gets a page off the queue
2202  *
2203  * Returns the block of the page (or NULL if none available)
2204  *
2205  * @rs: current RAM state
2206  * @offset: used to return the offset within the RAMBlock
2207  */
2208 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2209 {
2210     RAMBlock *block = NULL;
2211 
2212     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2213         return NULL;
2214     }
2215 
2216     qemu_mutex_lock(&rs->src_page_req_mutex);
2217     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2218         struct RAMSrcPageRequest *entry =
2219                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2220         block = entry->rb;
2221         *offset = entry->offset;
2222 
2223         if (entry->len > TARGET_PAGE_SIZE) {
2224             entry->len -= TARGET_PAGE_SIZE;
2225             entry->offset += TARGET_PAGE_SIZE;
2226         } else {
2227             memory_region_unref(block->mr);
2228             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2229             g_free(entry);
2230             migration_consume_urgent_request();
2231         }
2232     }
2233     qemu_mutex_unlock(&rs->src_page_req_mutex);
2234 
2235     return block;
2236 }
2237 
2238 /**
2239  * get_queued_page: unqueue a page from the postocpy requests
2240  *
2241  * Skips pages that are already sent (!dirty)
2242  *
2243  * Returns true if a queued page is found
2244  *
2245  * @rs: current RAM state
2246  * @pss: data about the state of the current dirty page scan
2247  */
2248 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2249 {
2250     RAMBlock  *block;
2251     ram_addr_t offset;
2252     bool dirty;
2253 
2254     do {
2255         block = unqueue_page(rs, &offset);
2256         /*
2257          * We're sending this page, and since it's postcopy nothing else
2258          * will dirty it, and we must make sure it doesn't get sent again
2259          * even if this queue request was received after the background
2260          * search already sent it.
2261          */
2262         if (block) {
2263             unsigned long page;
2264 
2265             page = offset >> TARGET_PAGE_BITS;
2266             dirty = test_bit(page, block->bmap);
2267             if (!dirty) {
2268                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2269                        page, test_bit(page, block->unsentmap));
2270             } else {
2271                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2272             }
2273         }
2274 
2275     } while (block && !dirty);
2276 
2277     if (block) {
2278         /*
2279          * As soon as we start servicing pages out of order, then we have
2280          * to kill the bulk stage, since the bulk stage assumes
2281          * in (migration_bitmap_find_and_reset_dirty) that every page is
2282          * dirty, that's no longer true.
2283          */
2284         rs->ram_bulk_stage = false;
2285 
2286         /*
2287          * We want the background search to continue from the queued page
2288          * since the guest is likely to want other pages near to the page
2289          * it just requested.
2290          */
2291         pss->block = block;
2292         pss->page = offset >> TARGET_PAGE_BITS;
2293     }
2294 
2295     return !!block;
2296 }
2297 
2298 /**
2299  * migration_page_queue_free: drop any remaining pages in the ram
2300  * request queue
2301  *
2302  * It should be empty at the end anyway, but in error cases there may
2303  * be some left.  in case that there is any page left, we drop it.
2304  *
2305  */
2306 static void migration_page_queue_free(RAMState *rs)
2307 {
2308     struct RAMSrcPageRequest *mspr, *next_mspr;
2309     /* This queue generally should be empty - but in the case of a failed
2310      * migration might have some droppings in.
2311      */
2312     rcu_read_lock();
2313     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2314         memory_region_unref(mspr->rb->mr);
2315         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2316         g_free(mspr);
2317     }
2318     rcu_read_unlock();
2319 }
2320 
2321 /**
2322  * ram_save_queue_pages: queue the page for transmission
2323  *
2324  * A request from postcopy destination for example.
2325  *
2326  * Returns zero on success or negative on error
2327  *
2328  * @rbname: Name of the RAMBLock of the request. NULL means the
2329  *          same that last one.
2330  * @start: starting address from the start of the RAMBlock
2331  * @len: length (in bytes) to send
2332  */
2333 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2334 {
2335     RAMBlock *ramblock;
2336     RAMState *rs = ram_state;
2337 
2338     ram_counters.postcopy_requests++;
2339     rcu_read_lock();
2340     if (!rbname) {
2341         /* Reuse last RAMBlock */
2342         ramblock = rs->last_req_rb;
2343 
2344         if (!ramblock) {
2345             /*
2346              * Shouldn't happen, we can't reuse the last RAMBlock if
2347              * it's the 1st request.
2348              */
2349             error_report("ram_save_queue_pages no previous block");
2350             goto err;
2351         }
2352     } else {
2353         ramblock = qemu_ram_block_by_name(rbname);
2354 
2355         if (!ramblock) {
2356             /* We shouldn't be asked for a non-existent RAMBlock */
2357             error_report("ram_save_queue_pages no block '%s'", rbname);
2358             goto err;
2359         }
2360         rs->last_req_rb = ramblock;
2361     }
2362     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2363     if (start+len > ramblock->used_length) {
2364         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2365                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2366                      __func__, start, len, ramblock->used_length);
2367         goto err;
2368     }
2369 
2370     struct RAMSrcPageRequest *new_entry =
2371         g_malloc0(sizeof(struct RAMSrcPageRequest));
2372     new_entry->rb = ramblock;
2373     new_entry->offset = start;
2374     new_entry->len = len;
2375 
2376     memory_region_ref(ramblock->mr);
2377     qemu_mutex_lock(&rs->src_page_req_mutex);
2378     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2379     migration_make_urgent_request();
2380     qemu_mutex_unlock(&rs->src_page_req_mutex);
2381     rcu_read_unlock();
2382 
2383     return 0;
2384 
2385 err:
2386     rcu_read_unlock();
2387     return -1;
2388 }
2389 
2390 static bool save_page_use_compression(RAMState *rs)
2391 {
2392     if (!migrate_use_compression()) {
2393         return false;
2394     }
2395 
2396     /*
2397      * If xbzrle is on, stop using the data compression after first
2398      * round of migration even if compression is enabled. In theory,
2399      * xbzrle can do better than compression.
2400      */
2401     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2402         return true;
2403     }
2404 
2405     return false;
2406 }
2407 
2408 /*
2409  * try to compress the page before posting it out, return true if the page
2410  * has been properly handled by compression, otherwise needs other
2411  * paths to handle it
2412  */
2413 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2414 {
2415     if (!save_page_use_compression(rs)) {
2416         return false;
2417     }
2418 
2419     /*
2420      * When starting the process of a new block, the first page of
2421      * the block should be sent out before other pages in the same
2422      * block, and all the pages in last block should have been sent
2423      * out, keeping this order is important, because the 'cont' flag
2424      * is used to avoid resending the block name.
2425      *
2426      * We post the fist page as normal page as compression will take
2427      * much CPU resource.
2428      */
2429     if (block != rs->last_sent_block) {
2430         flush_compressed_data(rs);
2431         return false;
2432     }
2433 
2434     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2435         return true;
2436     }
2437 
2438     compression_counters.busy++;
2439     return false;
2440 }
2441 
2442 /**
2443  * ram_save_target_page: save one target page
2444  *
2445  * Returns the number of pages written
2446  *
2447  * @rs: current RAM state
2448  * @pss: data about the page we want to send
2449  * @last_stage: if we are at the completion stage
2450  */
2451 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2452                                 bool last_stage)
2453 {
2454     RAMBlock *block = pss->block;
2455     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2456     int res;
2457 
2458     if (control_save_page(rs, block, offset, &res)) {
2459         return res;
2460     }
2461 
2462     if (save_compress_page(rs, block, offset)) {
2463         return 1;
2464     }
2465 
2466     res = save_zero_page(rs, block, offset);
2467     if (res > 0) {
2468         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2469          * page would be stale
2470          */
2471         if (!save_page_use_compression(rs)) {
2472             XBZRLE_cache_lock();
2473             xbzrle_cache_zero_page(rs, block->offset + offset);
2474             XBZRLE_cache_unlock();
2475         }
2476         ram_release_pages(block->idstr, offset, res);
2477         return res;
2478     }
2479 
2480     /*
2481      * do not use multifd for compression as the first page in the new
2482      * block should be posted out before sending the compressed page
2483      */
2484     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2485         return ram_save_multifd_page(rs, block, offset);
2486     }
2487 
2488     return ram_save_page(rs, pss, last_stage);
2489 }
2490 
2491 /**
2492  * ram_save_host_page: save a whole host page
2493  *
2494  * Starting at *offset send pages up to the end of the current host
2495  * page. It's valid for the initial offset to point into the middle of
2496  * a host page in which case the remainder of the hostpage is sent.
2497  * Only dirty target pages are sent. Note that the host page size may
2498  * be a huge page for this block.
2499  * The saving stops at the boundary of the used_length of the block
2500  * if the RAMBlock isn't a multiple of the host page size.
2501  *
2502  * Returns the number of pages written or negative on error
2503  *
2504  * @rs: current RAM state
2505  * @ms: current migration state
2506  * @pss: data about the page we want to send
2507  * @last_stage: if we are at the completion stage
2508  */
2509 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2510                               bool last_stage)
2511 {
2512     int tmppages, pages = 0;
2513     size_t pagesize_bits =
2514         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2515 
2516     if (ramblock_is_ignored(pss->block)) {
2517         error_report("block %s should not be migrated !", pss->block->idstr);
2518         return 0;
2519     }
2520 
2521     do {
2522         /* Check the pages is dirty and if it is send it */
2523         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2524             pss->page++;
2525             continue;
2526         }
2527 
2528         tmppages = ram_save_target_page(rs, pss, last_stage);
2529         if (tmppages < 0) {
2530             return tmppages;
2531         }
2532 
2533         pages += tmppages;
2534         if (pss->block->unsentmap) {
2535             clear_bit(pss->page, pss->block->unsentmap);
2536         }
2537 
2538         pss->page++;
2539     } while ((pss->page & (pagesize_bits - 1)) &&
2540              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2541 
2542     /* The offset we leave with is the last one we looked at */
2543     pss->page--;
2544     return pages;
2545 }
2546 
2547 /**
2548  * ram_find_and_save_block: finds a dirty page and sends it to f
2549  *
2550  * Called within an RCU critical section.
2551  *
2552  * Returns the number of pages written where zero means no dirty pages,
2553  * or negative on error
2554  *
2555  * @rs: current RAM state
2556  * @last_stage: if we are at the completion stage
2557  *
2558  * On systems where host-page-size > target-page-size it will send all the
2559  * pages in a host page that are dirty.
2560  */
2561 
2562 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2563 {
2564     PageSearchStatus pss;
2565     int pages = 0;
2566     bool again, found;
2567 
2568     /* No dirty page as there is zero RAM */
2569     if (!ram_bytes_total()) {
2570         return pages;
2571     }
2572 
2573     pss.block = rs->last_seen_block;
2574     pss.page = rs->last_page;
2575     pss.complete_round = false;
2576 
2577     if (!pss.block) {
2578         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2579     }
2580 
2581     do {
2582         again = true;
2583         found = get_queued_page(rs, &pss);
2584 
2585         if (!found) {
2586             /* priority queue empty, so just search for something dirty */
2587             found = find_dirty_block(rs, &pss, &again);
2588         }
2589 
2590         if (found) {
2591             pages = ram_save_host_page(rs, &pss, last_stage);
2592         }
2593     } while (!pages && again);
2594 
2595     rs->last_seen_block = pss.block;
2596     rs->last_page = pss.page;
2597 
2598     return pages;
2599 }
2600 
2601 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2602 {
2603     uint64_t pages = size / TARGET_PAGE_SIZE;
2604 
2605     if (zero) {
2606         ram_counters.duplicate += pages;
2607     } else {
2608         ram_counters.normal += pages;
2609         ram_counters.transferred += size;
2610         qemu_update_position(f, size);
2611     }
2612 }
2613 
2614 static uint64_t ram_bytes_total_common(bool count_ignored)
2615 {
2616     RAMBlock *block;
2617     uint64_t total = 0;
2618 
2619     rcu_read_lock();
2620     if (count_ignored) {
2621         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2622             total += block->used_length;
2623         }
2624     } else {
2625         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2626             total += block->used_length;
2627         }
2628     }
2629     rcu_read_unlock();
2630     return total;
2631 }
2632 
2633 uint64_t ram_bytes_total(void)
2634 {
2635     return ram_bytes_total_common(false);
2636 }
2637 
2638 static void xbzrle_load_setup(void)
2639 {
2640     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2641 }
2642 
2643 static void xbzrle_load_cleanup(void)
2644 {
2645     g_free(XBZRLE.decoded_buf);
2646     XBZRLE.decoded_buf = NULL;
2647 }
2648 
2649 static void ram_state_cleanup(RAMState **rsp)
2650 {
2651     if (*rsp) {
2652         migration_page_queue_free(*rsp);
2653         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2654         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2655         g_free(*rsp);
2656         *rsp = NULL;
2657     }
2658 }
2659 
2660 static void xbzrle_cleanup(void)
2661 {
2662     XBZRLE_cache_lock();
2663     if (XBZRLE.cache) {
2664         cache_fini(XBZRLE.cache);
2665         g_free(XBZRLE.encoded_buf);
2666         g_free(XBZRLE.current_buf);
2667         g_free(XBZRLE.zero_target_page);
2668         XBZRLE.cache = NULL;
2669         XBZRLE.encoded_buf = NULL;
2670         XBZRLE.current_buf = NULL;
2671         XBZRLE.zero_target_page = NULL;
2672     }
2673     XBZRLE_cache_unlock();
2674 }
2675 
2676 static void ram_save_cleanup(void *opaque)
2677 {
2678     RAMState **rsp = opaque;
2679     RAMBlock *block;
2680 
2681     /* caller have hold iothread lock or is in a bh, so there is
2682      * no writing race against the migration bitmap
2683      */
2684     memory_global_dirty_log_stop();
2685 
2686     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2687         g_free(block->bmap);
2688         block->bmap = NULL;
2689         g_free(block->unsentmap);
2690         block->unsentmap = NULL;
2691     }
2692 
2693     xbzrle_cleanup();
2694     compress_threads_save_cleanup();
2695     ram_state_cleanup(rsp);
2696 }
2697 
2698 static void ram_state_reset(RAMState *rs)
2699 {
2700     rs->last_seen_block = NULL;
2701     rs->last_sent_block = NULL;
2702     rs->last_page = 0;
2703     rs->last_version = ram_list.version;
2704     rs->ram_bulk_stage = true;
2705     rs->fpo_enabled = false;
2706 }
2707 
2708 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2709 
2710 /*
2711  * 'expected' is the value you expect the bitmap mostly to be full
2712  * of; it won't bother printing lines that are all this value.
2713  * If 'todump' is null the migration bitmap is dumped.
2714  */
2715 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2716                            unsigned long pages)
2717 {
2718     int64_t cur;
2719     int64_t linelen = 128;
2720     char linebuf[129];
2721 
2722     for (cur = 0; cur < pages; cur += linelen) {
2723         int64_t curb;
2724         bool found = false;
2725         /*
2726          * Last line; catch the case where the line length
2727          * is longer than remaining ram
2728          */
2729         if (cur + linelen > pages) {
2730             linelen = pages - cur;
2731         }
2732         for (curb = 0; curb < linelen; curb++) {
2733             bool thisbit = test_bit(cur + curb, todump);
2734             linebuf[curb] = thisbit ? '1' : '.';
2735             found = found || (thisbit != expected);
2736         }
2737         if (found) {
2738             linebuf[curb] = '\0';
2739             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2740         }
2741     }
2742 }
2743 
2744 /* **** functions for postcopy ***** */
2745 
2746 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2747 {
2748     struct RAMBlock *block;
2749 
2750     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2751         unsigned long *bitmap = block->bmap;
2752         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2753         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2754 
2755         while (run_start < range) {
2756             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2757             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2758                               (run_end - run_start) << TARGET_PAGE_BITS);
2759             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2760         }
2761     }
2762 }
2763 
2764 /**
2765  * postcopy_send_discard_bm_ram: discard a RAMBlock
2766  *
2767  * Returns zero on success
2768  *
2769  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2770  * Note: At this point the 'unsentmap' is the processed bitmap combined
2771  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2772  *
2773  * @ms: current migration state
2774  * @pds: state for postcopy
2775  * @start: RAMBlock starting page
2776  * @length: RAMBlock size
2777  */
2778 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2779                                         PostcopyDiscardState *pds,
2780                                         RAMBlock *block)
2781 {
2782     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2783     unsigned long current;
2784     unsigned long *unsentmap = block->unsentmap;
2785 
2786     for (current = 0; current < end; ) {
2787         unsigned long one = find_next_bit(unsentmap, end, current);
2788 
2789         if (one <= end) {
2790             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2791             unsigned long discard_length;
2792 
2793             if (zero >= end) {
2794                 discard_length = end - one;
2795             } else {
2796                 discard_length = zero - one;
2797             }
2798             if (discard_length) {
2799                 postcopy_discard_send_range(ms, pds, one, discard_length);
2800             }
2801             current = one + discard_length;
2802         } else {
2803             current = one;
2804         }
2805     }
2806 
2807     return 0;
2808 }
2809 
2810 /**
2811  * postcopy_each_ram_send_discard: discard all RAMBlocks
2812  *
2813  * Returns 0 for success or negative for error
2814  *
2815  * Utility for the outgoing postcopy code.
2816  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2817  *   passing it bitmap indexes and name.
2818  * (qemu_ram_foreach_block ends up passing unscaled lengths
2819  *  which would mean postcopy code would have to deal with target page)
2820  *
2821  * @ms: current migration state
2822  */
2823 static int postcopy_each_ram_send_discard(MigrationState *ms)
2824 {
2825     struct RAMBlock *block;
2826     int ret;
2827 
2828     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2829         PostcopyDiscardState *pds =
2830             postcopy_discard_send_init(ms, block->idstr);
2831 
2832         /*
2833          * Postcopy sends chunks of bitmap over the wire, but it
2834          * just needs indexes at this point, avoids it having
2835          * target page specific code.
2836          */
2837         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2838         postcopy_discard_send_finish(ms, pds);
2839         if (ret) {
2840             return ret;
2841         }
2842     }
2843 
2844     return 0;
2845 }
2846 
2847 /**
2848  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2849  *
2850  * Helper for postcopy_chunk_hostpages; it's called twice to
2851  * canonicalize the two bitmaps, that are similar, but one is
2852  * inverted.
2853  *
2854  * Postcopy requires that all target pages in a hostpage are dirty or
2855  * clean, not a mix.  This function canonicalizes the bitmaps.
2856  *
2857  * @ms: current migration state
2858  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2859  *               otherwise we need to canonicalize partially dirty host pages
2860  * @block: block that contains the page we want to canonicalize
2861  * @pds: state for postcopy
2862  */
2863 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2864                                           RAMBlock *block,
2865                                           PostcopyDiscardState *pds)
2866 {
2867     RAMState *rs = ram_state;
2868     unsigned long *bitmap = block->bmap;
2869     unsigned long *unsentmap = block->unsentmap;
2870     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2871     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2872     unsigned long run_start;
2873 
2874     if (block->page_size == TARGET_PAGE_SIZE) {
2875         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2876         return;
2877     }
2878 
2879     if (unsent_pass) {
2880         /* Find a sent page */
2881         run_start = find_next_zero_bit(unsentmap, pages, 0);
2882     } else {
2883         /* Find a dirty page */
2884         run_start = find_next_bit(bitmap, pages, 0);
2885     }
2886 
2887     while (run_start < pages) {
2888         bool do_fixup = false;
2889         unsigned long fixup_start_addr;
2890         unsigned long host_offset;
2891 
2892         /*
2893          * If the start of this run of pages is in the middle of a host
2894          * page, then we need to fixup this host page.
2895          */
2896         host_offset = run_start % host_ratio;
2897         if (host_offset) {
2898             do_fixup = true;
2899             run_start -= host_offset;
2900             fixup_start_addr = run_start;
2901             /* For the next pass */
2902             run_start = run_start + host_ratio;
2903         } else {
2904             /* Find the end of this run */
2905             unsigned long run_end;
2906             if (unsent_pass) {
2907                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2908             } else {
2909                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2910             }
2911             /*
2912              * If the end isn't at the start of a host page, then the
2913              * run doesn't finish at the end of a host page
2914              * and we need to discard.
2915              */
2916             host_offset = run_end % host_ratio;
2917             if (host_offset) {
2918                 do_fixup = true;
2919                 fixup_start_addr = run_end - host_offset;
2920                 /*
2921                  * This host page has gone, the next loop iteration starts
2922                  * from after the fixup
2923                  */
2924                 run_start = fixup_start_addr + host_ratio;
2925             } else {
2926                 /*
2927                  * No discards on this iteration, next loop starts from
2928                  * next sent/dirty page
2929                  */
2930                 run_start = run_end + 1;
2931             }
2932         }
2933 
2934         if (do_fixup) {
2935             unsigned long page;
2936 
2937             /* Tell the destination to discard this page */
2938             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2939                 /* For the unsent_pass we:
2940                  *     discard partially sent pages
2941                  * For the !unsent_pass (dirty) we:
2942                  *     discard partially dirty pages that were sent
2943                  *     (any partially sent pages were already discarded
2944                  *     by the previous unsent_pass)
2945                  */
2946                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2947                                             host_ratio);
2948             }
2949 
2950             /* Clean up the bitmap */
2951             for (page = fixup_start_addr;
2952                  page < fixup_start_addr + host_ratio; page++) {
2953                 /* All pages in this host page are now not sent */
2954                 set_bit(page, unsentmap);
2955 
2956                 /*
2957                  * Remark them as dirty, updating the count for any pages
2958                  * that weren't previously dirty.
2959                  */
2960                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2961             }
2962         }
2963 
2964         if (unsent_pass) {
2965             /* Find the next sent page for the next iteration */
2966             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2967         } else {
2968             /* Find the next dirty page for the next iteration */
2969             run_start = find_next_bit(bitmap, pages, run_start);
2970         }
2971     }
2972 }
2973 
2974 /**
2975  * postcopy_chuck_hostpages: discrad any partially sent host page
2976  *
2977  * Utility for the outgoing postcopy code.
2978  *
2979  * Discard any partially sent host-page size chunks, mark any partially
2980  * dirty host-page size chunks as all dirty.  In this case the host-page
2981  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2982  *
2983  * Returns zero on success
2984  *
2985  * @ms: current migration state
2986  * @block: block we want to work with
2987  */
2988 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2989 {
2990     PostcopyDiscardState *pds =
2991         postcopy_discard_send_init(ms, block->idstr);
2992 
2993     /* First pass: Discard all partially sent host pages */
2994     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2995     /*
2996      * Second pass: Ensure that all partially dirty host pages are made
2997      * fully dirty.
2998      */
2999     postcopy_chunk_hostpages_pass(ms, false, block, pds);
3000 
3001     postcopy_discard_send_finish(ms, pds);
3002     return 0;
3003 }
3004 
3005 /**
3006  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3007  *
3008  * Returns zero on success
3009  *
3010  * Transmit the set of pages to be discarded after precopy to the target
3011  * these are pages that:
3012  *     a) Have been previously transmitted but are now dirty again
3013  *     b) Pages that have never been transmitted, this ensures that
3014  *        any pages on the destination that have been mapped by background
3015  *        tasks get discarded (transparent huge pages is the specific concern)
3016  * Hopefully this is pretty sparse
3017  *
3018  * @ms: current migration state
3019  */
3020 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3021 {
3022     RAMState *rs = ram_state;
3023     RAMBlock *block;
3024     int ret;
3025 
3026     rcu_read_lock();
3027 
3028     /* This should be our last sync, the src is now paused */
3029     migration_bitmap_sync(rs);
3030 
3031     /* Easiest way to make sure we don't resume in the middle of a host-page */
3032     rs->last_seen_block = NULL;
3033     rs->last_sent_block = NULL;
3034     rs->last_page = 0;
3035 
3036     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3037         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3038         unsigned long *bitmap = block->bmap;
3039         unsigned long *unsentmap = block->unsentmap;
3040 
3041         if (!unsentmap) {
3042             /* We don't have a safe way to resize the sentmap, so
3043              * if the bitmap was resized it will be NULL at this
3044              * point.
3045              */
3046             error_report("migration ram resized during precopy phase");
3047             rcu_read_unlock();
3048             return -EINVAL;
3049         }
3050         /* Deal with TPS != HPS and huge pages */
3051         ret = postcopy_chunk_hostpages(ms, block);
3052         if (ret) {
3053             rcu_read_unlock();
3054             return ret;
3055         }
3056 
3057         /*
3058          * Update the unsentmap to be unsentmap = unsentmap | dirty
3059          */
3060         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3061 #ifdef DEBUG_POSTCOPY
3062         ram_debug_dump_bitmap(unsentmap, true, pages);
3063 #endif
3064     }
3065     trace_ram_postcopy_send_discard_bitmap();
3066 
3067     ret = postcopy_each_ram_send_discard(ms);
3068     rcu_read_unlock();
3069 
3070     return ret;
3071 }
3072 
3073 /**
3074  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3075  *
3076  * Returns zero on success
3077  *
3078  * @rbname: name of the RAMBlock of the request. NULL means the
3079  *          same that last one.
3080  * @start: RAMBlock starting page
3081  * @length: RAMBlock size
3082  */
3083 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3084 {
3085     int ret = -1;
3086 
3087     trace_ram_discard_range(rbname, start, length);
3088 
3089     rcu_read_lock();
3090     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3091 
3092     if (!rb) {
3093         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3094         goto err;
3095     }
3096 
3097     /*
3098      * On source VM, we don't need to update the received bitmap since
3099      * we don't even have one.
3100      */
3101     if (rb->receivedmap) {
3102         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3103                      length >> qemu_target_page_bits());
3104     }
3105 
3106     ret = ram_block_discard_range(rb, start, length);
3107 
3108 err:
3109     rcu_read_unlock();
3110 
3111     return ret;
3112 }
3113 
3114 /*
3115  * For every allocation, we will try not to crash the VM if the
3116  * allocation failed.
3117  */
3118 static int xbzrle_init(void)
3119 {
3120     Error *local_err = NULL;
3121 
3122     if (!migrate_use_xbzrle()) {
3123         return 0;
3124     }
3125 
3126     XBZRLE_cache_lock();
3127 
3128     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3129     if (!XBZRLE.zero_target_page) {
3130         error_report("%s: Error allocating zero page", __func__);
3131         goto err_out;
3132     }
3133 
3134     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3135                               TARGET_PAGE_SIZE, &local_err);
3136     if (!XBZRLE.cache) {
3137         error_report_err(local_err);
3138         goto free_zero_page;
3139     }
3140 
3141     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3142     if (!XBZRLE.encoded_buf) {
3143         error_report("%s: Error allocating encoded_buf", __func__);
3144         goto free_cache;
3145     }
3146 
3147     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3148     if (!XBZRLE.current_buf) {
3149         error_report("%s: Error allocating current_buf", __func__);
3150         goto free_encoded_buf;
3151     }
3152 
3153     /* We are all good */
3154     XBZRLE_cache_unlock();
3155     return 0;
3156 
3157 free_encoded_buf:
3158     g_free(XBZRLE.encoded_buf);
3159     XBZRLE.encoded_buf = NULL;
3160 free_cache:
3161     cache_fini(XBZRLE.cache);
3162     XBZRLE.cache = NULL;
3163 free_zero_page:
3164     g_free(XBZRLE.zero_target_page);
3165     XBZRLE.zero_target_page = NULL;
3166 err_out:
3167     XBZRLE_cache_unlock();
3168     return -ENOMEM;
3169 }
3170 
3171 static int ram_state_init(RAMState **rsp)
3172 {
3173     *rsp = g_try_new0(RAMState, 1);
3174 
3175     if (!*rsp) {
3176         error_report("%s: Init ramstate fail", __func__);
3177         return -1;
3178     }
3179 
3180     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3181     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3182     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3183 
3184     /*
3185      * Count the total number of pages used by ram blocks not including any
3186      * gaps due to alignment or unplugs.
3187      */
3188     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3189 
3190     ram_state_reset(*rsp);
3191 
3192     return 0;
3193 }
3194 
3195 static void ram_list_init_bitmaps(void)
3196 {
3197     RAMBlock *block;
3198     unsigned long pages;
3199 
3200     /* Skip setting bitmap if there is no RAM */
3201     if (ram_bytes_total()) {
3202         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3203             pages = block->max_length >> TARGET_PAGE_BITS;
3204             block->bmap = bitmap_new(pages);
3205             bitmap_set(block->bmap, 0, pages);
3206             if (migrate_postcopy_ram()) {
3207                 block->unsentmap = bitmap_new(pages);
3208                 bitmap_set(block->unsentmap, 0, pages);
3209             }
3210         }
3211     }
3212 }
3213 
3214 static void ram_init_bitmaps(RAMState *rs)
3215 {
3216     /* For memory_global_dirty_log_start below.  */
3217     qemu_mutex_lock_iothread();
3218     qemu_mutex_lock_ramlist();
3219     rcu_read_lock();
3220 
3221     ram_list_init_bitmaps();
3222     memory_global_dirty_log_start();
3223     migration_bitmap_sync_precopy(rs);
3224 
3225     rcu_read_unlock();
3226     qemu_mutex_unlock_ramlist();
3227     qemu_mutex_unlock_iothread();
3228 }
3229 
3230 static int ram_init_all(RAMState **rsp)
3231 {
3232     if (ram_state_init(rsp)) {
3233         return -1;
3234     }
3235 
3236     if (xbzrle_init()) {
3237         ram_state_cleanup(rsp);
3238         return -1;
3239     }
3240 
3241     ram_init_bitmaps(*rsp);
3242 
3243     return 0;
3244 }
3245 
3246 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3247 {
3248     RAMBlock *block;
3249     uint64_t pages = 0;
3250 
3251     /*
3252      * Postcopy is not using xbzrle/compression, so no need for that.
3253      * Also, since source are already halted, we don't need to care
3254      * about dirty page logging as well.
3255      */
3256 
3257     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3258         pages += bitmap_count_one(block->bmap,
3259                                   block->used_length >> TARGET_PAGE_BITS);
3260     }
3261 
3262     /* This may not be aligned with current bitmaps. Recalculate. */
3263     rs->migration_dirty_pages = pages;
3264 
3265     rs->last_seen_block = NULL;
3266     rs->last_sent_block = NULL;
3267     rs->last_page = 0;
3268     rs->last_version = ram_list.version;
3269     /*
3270      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3271      * matter what we have sent.
3272      */
3273     rs->ram_bulk_stage = false;
3274 
3275     /* Update RAMState cache of output QEMUFile */
3276     rs->f = out;
3277 
3278     trace_ram_state_resume_prepare(pages);
3279 }
3280 
3281 /*
3282  * This function clears bits of the free pages reported by the caller from the
3283  * migration dirty bitmap. @addr is the host address corresponding to the
3284  * start of the continuous guest free pages, and @len is the total bytes of
3285  * those pages.
3286  */
3287 void qemu_guest_free_page_hint(void *addr, size_t len)
3288 {
3289     RAMBlock *block;
3290     ram_addr_t offset;
3291     size_t used_len, start, npages;
3292     MigrationState *s = migrate_get_current();
3293 
3294     /* This function is currently expected to be used during live migration */
3295     if (!migration_is_setup_or_active(s->state)) {
3296         return;
3297     }
3298 
3299     for (; len > 0; len -= used_len, addr += used_len) {
3300         block = qemu_ram_block_from_host(addr, false, &offset);
3301         if (unlikely(!block || offset >= block->used_length)) {
3302             /*
3303              * The implementation might not support RAMBlock resize during
3304              * live migration, but it could happen in theory with future
3305              * updates. So we add a check here to capture that case.
3306              */
3307             error_report_once("%s unexpected error", __func__);
3308             return;
3309         }
3310 
3311         if (len <= block->used_length - offset) {
3312             used_len = len;
3313         } else {
3314             used_len = block->used_length - offset;
3315         }
3316 
3317         start = offset >> TARGET_PAGE_BITS;
3318         npages = used_len >> TARGET_PAGE_BITS;
3319 
3320         qemu_mutex_lock(&ram_state->bitmap_mutex);
3321         ram_state->migration_dirty_pages -=
3322                       bitmap_count_one_with_offset(block->bmap, start, npages);
3323         bitmap_clear(block->bmap, start, npages);
3324         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3325     }
3326 }
3327 
3328 /*
3329  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3330  * long-running RCU critical section.  When rcu-reclaims in the code
3331  * start to become numerous it will be necessary to reduce the
3332  * granularity of these critical sections.
3333  */
3334 
3335 /**
3336  * ram_save_setup: Setup RAM for migration
3337  *
3338  * Returns zero to indicate success and negative for error
3339  *
3340  * @f: QEMUFile where to send the data
3341  * @opaque: RAMState pointer
3342  */
3343 static int ram_save_setup(QEMUFile *f, void *opaque)
3344 {
3345     RAMState **rsp = opaque;
3346     RAMBlock *block;
3347 
3348     if (compress_threads_save_setup()) {
3349         return -1;
3350     }
3351 
3352     /* migration has already setup the bitmap, reuse it. */
3353     if (!migration_in_colo_state()) {
3354         if (ram_init_all(rsp) != 0) {
3355             compress_threads_save_cleanup();
3356             return -1;
3357         }
3358     }
3359     (*rsp)->f = f;
3360 
3361     rcu_read_lock();
3362 
3363     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3364 
3365     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3366         qemu_put_byte(f, strlen(block->idstr));
3367         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3368         qemu_put_be64(f, block->used_length);
3369         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3370             qemu_put_be64(f, block->page_size);
3371         }
3372         if (migrate_ignore_shared()) {
3373             qemu_put_be64(f, block->mr->addr);
3374             qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
3375         }
3376     }
3377 
3378     rcu_read_unlock();
3379 
3380     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3381     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3382 
3383     multifd_send_sync_main();
3384     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3385     qemu_fflush(f);
3386 
3387     return 0;
3388 }
3389 
3390 /**
3391  * ram_save_iterate: iterative stage for migration
3392  *
3393  * Returns zero to indicate success and negative for error
3394  *
3395  * @f: QEMUFile where to send the data
3396  * @opaque: RAMState pointer
3397  */
3398 static int ram_save_iterate(QEMUFile *f, void *opaque)
3399 {
3400     RAMState **temp = opaque;
3401     RAMState *rs = *temp;
3402     int ret;
3403     int i;
3404     int64_t t0;
3405     int done = 0;
3406 
3407     if (blk_mig_bulk_active()) {
3408         /* Avoid transferring ram during bulk phase of block migration as
3409          * the bulk phase will usually take a long time and transferring
3410          * ram updates during that time is pointless. */
3411         goto out;
3412     }
3413 
3414     rcu_read_lock();
3415     if (ram_list.version != rs->last_version) {
3416         ram_state_reset(rs);
3417     }
3418 
3419     /* Read version before ram_list.blocks */
3420     smp_rmb();
3421 
3422     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3423 
3424     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3425     i = 0;
3426     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3427             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3428         int pages;
3429 
3430         if (qemu_file_get_error(f)) {
3431             break;
3432         }
3433 
3434         pages = ram_find_and_save_block(rs, false);
3435         /* no more pages to sent */
3436         if (pages == 0) {
3437             done = 1;
3438             break;
3439         }
3440 
3441         if (pages < 0) {
3442             qemu_file_set_error(f, pages);
3443             break;
3444         }
3445 
3446         rs->target_page_count += pages;
3447 
3448         /* we want to check in the 1st loop, just in case it was the 1st time
3449            and we had to sync the dirty bitmap.
3450            qemu_clock_get_ns() is a bit expensive, so we only check each some
3451            iterations
3452         */
3453         if ((i & 63) == 0) {
3454             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3455             if (t1 > MAX_WAIT) {
3456                 trace_ram_save_iterate_big_wait(t1, i);
3457                 break;
3458             }
3459         }
3460         i++;
3461     }
3462     rcu_read_unlock();
3463 
3464     /*
3465      * Must occur before EOS (or any QEMUFile operation)
3466      * because of RDMA protocol.
3467      */
3468     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3469 
3470     multifd_send_sync_main();
3471 out:
3472     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3473     qemu_fflush(f);
3474     ram_counters.transferred += 8;
3475 
3476     ret = qemu_file_get_error(f);
3477     if (ret < 0) {
3478         return ret;
3479     }
3480 
3481     return done;
3482 }
3483 
3484 /**
3485  * ram_save_complete: function called to send the remaining amount of ram
3486  *
3487  * Returns zero to indicate success or negative on error
3488  *
3489  * Called with iothread lock
3490  *
3491  * @f: QEMUFile where to send the data
3492  * @opaque: RAMState pointer
3493  */
3494 static int ram_save_complete(QEMUFile *f, void *opaque)
3495 {
3496     RAMState **temp = opaque;
3497     RAMState *rs = *temp;
3498     int ret = 0;
3499 
3500     rcu_read_lock();
3501 
3502     if (!migration_in_postcopy()) {
3503         migration_bitmap_sync_precopy(rs);
3504     }
3505 
3506     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3507 
3508     /* try transferring iterative blocks of memory */
3509 
3510     /* flush all remaining blocks regardless of rate limiting */
3511     while (true) {
3512         int pages;
3513 
3514         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3515         /* no more blocks to sent */
3516         if (pages == 0) {
3517             break;
3518         }
3519         if (pages < 0) {
3520             ret = pages;
3521             break;
3522         }
3523     }
3524 
3525     flush_compressed_data(rs);
3526     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3527 
3528     rcu_read_unlock();
3529 
3530     multifd_send_sync_main();
3531     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3532     qemu_fflush(f);
3533 
3534     return ret;
3535 }
3536 
3537 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3538                              uint64_t *res_precopy_only,
3539                              uint64_t *res_compatible,
3540                              uint64_t *res_postcopy_only)
3541 {
3542     RAMState **temp = opaque;
3543     RAMState *rs = *temp;
3544     uint64_t remaining_size;
3545 
3546     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3547 
3548     if (!migration_in_postcopy() &&
3549         remaining_size < max_size) {
3550         qemu_mutex_lock_iothread();
3551         rcu_read_lock();
3552         migration_bitmap_sync_precopy(rs);
3553         rcu_read_unlock();
3554         qemu_mutex_unlock_iothread();
3555         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3556     }
3557 
3558     if (migrate_postcopy_ram()) {
3559         /* We can do postcopy, and all the data is postcopiable */
3560         *res_compatible += remaining_size;
3561     } else {
3562         *res_precopy_only += remaining_size;
3563     }
3564 }
3565 
3566 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3567 {
3568     unsigned int xh_len;
3569     int xh_flags;
3570     uint8_t *loaded_data;
3571 
3572     /* extract RLE header */
3573     xh_flags = qemu_get_byte(f);
3574     xh_len = qemu_get_be16(f);
3575 
3576     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3577         error_report("Failed to load XBZRLE page - wrong compression!");
3578         return -1;
3579     }
3580 
3581     if (xh_len > TARGET_PAGE_SIZE) {
3582         error_report("Failed to load XBZRLE page - len overflow!");
3583         return -1;
3584     }
3585     loaded_data = XBZRLE.decoded_buf;
3586     /* load data and decode */
3587     /* it can change loaded_data to point to an internal buffer */
3588     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3589 
3590     /* decode RLE */
3591     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3592                              TARGET_PAGE_SIZE) == -1) {
3593         error_report("Failed to load XBZRLE page - decode error!");
3594         return -1;
3595     }
3596 
3597     return 0;
3598 }
3599 
3600 /**
3601  * ram_block_from_stream: read a RAMBlock id from the migration stream
3602  *
3603  * Must be called from within a rcu critical section.
3604  *
3605  * Returns a pointer from within the RCU-protected ram_list.
3606  *
3607  * @f: QEMUFile where to read the data from
3608  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3609  */
3610 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3611 {
3612     static RAMBlock *block = NULL;
3613     char id[256];
3614     uint8_t len;
3615 
3616     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3617         if (!block) {
3618             error_report("Ack, bad migration stream!");
3619             return NULL;
3620         }
3621         return block;
3622     }
3623 
3624     len = qemu_get_byte(f);
3625     qemu_get_buffer(f, (uint8_t *)id, len);
3626     id[len] = 0;
3627 
3628     block = qemu_ram_block_by_name(id);
3629     if (!block) {
3630         error_report("Can't find block %s", id);
3631         return NULL;
3632     }
3633 
3634     if (ramblock_is_ignored(block)) {
3635         error_report("block %s should not be migrated !", id);
3636         return NULL;
3637     }
3638 
3639     return block;
3640 }
3641 
3642 static inline void *host_from_ram_block_offset(RAMBlock *block,
3643                                                ram_addr_t offset)
3644 {
3645     if (!offset_in_ramblock(block, offset)) {
3646         return NULL;
3647     }
3648 
3649     return block->host + offset;
3650 }
3651 
3652 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3653                                                  ram_addr_t offset)
3654 {
3655     if (!offset_in_ramblock(block, offset)) {
3656         return NULL;
3657     }
3658     if (!block->colo_cache) {
3659         error_report("%s: colo_cache is NULL in block :%s",
3660                      __func__, block->idstr);
3661         return NULL;
3662     }
3663 
3664     /*
3665     * During colo checkpoint, we need bitmap of these migrated pages.
3666     * It help us to decide which pages in ram cache should be flushed
3667     * into VM's RAM later.
3668     */
3669     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3670         ram_state->migration_dirty_pages++;
3671     }
3672     return block->colo_cache + offset;
3673 }
3674 
3675 /**
3676  * ram_handle_compressed: handle the zero page case
3677  *
3678  * If a page (or a whole RDMA chunk) has been
3679  * determined to be zero, then zap it.
3680  *
3681  * @host: host address for the zero page
3682  * @ch: what the page is filled from.  We only support zero
3683  * @size: size of the zero page
3684  */
3685 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3686 {
3687     if (ch != 0 || !is_zero_range(host, size)) {
3688         memset(host, ch, size);
3689     }
3690 }
3691 
3692 /* return the size after decompression, or negative value on error */
3693 static int
3694 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3695                      const uint8_t *source, size_t source_len)
3696 {
3697     int err;
3698 
3699     err = inflateReset(stream);
3700     if (err != Z_OK) {
3701         return -1;
3702     }
3703 
3704     stream->avail_in = source_len;
3705     stream->next_in = (uint8_t *)source;
3706     stream->avail_out = dest_len;
3707     stream->next_out = dest;
3708 
3709     err = inflate(stream, Z_NO_FLUSH);
3710     if (err != Z_STREAM_END) {
3711         return -1;
3712     }
3713 
3714     return stream->total_out;
3715 }
3716 
3717 static void *do_data_decompress(void *opaque)
3718 {
3719     DecompressParam *param = opaque;
3720     unsigned long pagesize;
3721     uint8_t *des;
3722     int len, ret;
3723 
3724     qemu_mutex_lock(&param->mutex);
3725     while (!param->quit) {
3726         if (param->des) {
3727             des = param->des;
3728             len = param->len;
3729             param->des = 0;
3730             qemu_mutex_unlock(&param->mutex);
3731 
3732             pagesize = TARGET_PAGE_SIZE;
3733 
3734             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3735                                        param->compbuf, len);
3736             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3737                 error_report("decompress data failed");
3738                 qemu_file_set_error(decomp_file, ret);
3739             }
3740 
3741             qemu_mutex_lock(&decomp_done_lock);
3742             param->done = true;
3743             qemu_cond_signal(&decomp_done_cond);
3744             qemu_mutex_unlock(&decomp_done_lock);
3745 
3746             qemu_mutex_lock(&param->mutex);
3747         } else {
3748             qemu_cond_wait(&param->cond, &param->mutex);
3749         }
3750     }
3751     qemu_mutex_unlock(&param->mutex);
3752 
3753     return NULL;
3754 }
3755 
3756 static int wait_for_decompress_done(void)
3757 {
3758     int idx, thread_count;
3759 
3760     if (!migrate_use_compression()) {
3761         return 0;
3762     }
3763 
3764     thread_count = migrate_decompress_threads();
3765     qemu_mutex_lock(&decomp_done_lock);
3766     for (idx = 0; idx < thread_count; idx++) {
3767         while (!decomp_param[idx].done) {
3768             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3769         }
3770     }
3771     qemu_mutex_unlock(&decomp_done_lock);
3772     return qemu_file_get_error(decomp_file);
3773 }
3774 
3775 static void compress_threads_load_cleanup(void)
3776 {
3777     int i, thread_count;
3778 
3779     if (!migrate_use_compression()) {
3780         return;
3781     }
3782     thread_count = migrate_decompress_threads();
3783     for (i = 0; i < thread_count; i++) {
3784         /*
3785          * we use it as a indicator which shows if the thread is
3786          * properly init'd or not
3787          */
3788         if (!decomp_param[i].compbuf) {
3789             break;
3790         }
3791 
3792         qemu_mutex_lock(&decomp_param[i].mutex);
3793         decomp_param[i].quit = true;
3794         qemu_cond_signal(&decomp_param[i].cond);
3795         qemu_mutex_unlock(&decomp_param[i].mutex);
3796     }
3797     for (i = 0; i < thread_count; i++) {
3798         if (!decomp_param[i].compbuf) {
3799             break;
3800         }
3801 
3802         qemu_thread_join(decompress_threads + i);
3803         qemu_mutex_destroy(&decomp_param[i].mutex);
3804         qemu_cond_destroy(&decomp_param[i].cond);
3805         inflateEnd(&decomp_param[i].stream);
3806         g_free(decomp_param[i].compbuf);
3807         decomp_param[i].compbuf = NULL;
3808     }
3809     g_free(decompress_threads);
3810     g_free(decomp_param);
3811     decompress_threads = NULL;
3812     decomp_param = NULL;
3813     decomp_file = NULL;
3814 }
3815 
3816 static int compress_threads_load_setup(QEMUFile *f)
3817 {
3818     int i, thread_count;
3819 
3820     if (!migrate_use_compression()) {
3821         return 0;
3822     }
3823 
3824     thread_count = migrate_decompress_threads();
3825     decompress_threads = g_new0(QemuThread, thread_count);
3826     decomp_param = g_new0(DecompressParam, thread_count);
3827     qemu_mutex_init(&decomp_done_lock);
3828     qemu_cond_init(&decomp_done_cond);
3829     decomp_file = f;
3830     for (i = 0; i < thread_count; i++) {
3831         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3832             goto exit;
3833         }
3834 
3835         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3836         qemu_mutex_init(&decomp_param[i].mutex);
3837         qemu_cond_init(&decomp_param[i].cond);
3838         decomp_param[i].done = true;
3839         decomp_param[i].quit = false;
3840         qemu_thread_create(decompress_threads + i, "decompress",
3841                            do_data_decompress, decomp_param + i,
3842                            QEMU_THREAD_JOINABLE);
3843     }
3844     return 0;
3845 exit:
3846     compress_threads_load_cleanup();
3847     return -1;
3848 }
3849 
3850 static void decompress_data_with_multi_threads(QEMUFile *f,
3851                                                void *host, int len)
3852 {
3853     int idx, thread_count;
3854 
3855     thread_count = migrate_decompress_threads();
3856     qemu_mutex_lock(&decomp_done_lock);
3857     while (true) {
3858         for (idx = 0; idx < thread_count; idx++) {
3859             if (decomp_param[idx].done) {
3860                 decomp_param[idx].done = false;
3861                 qemu_mutex_lock(&decomp_param[idx].mutex);
3862                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3863                 decomp_param[idx].des = host;
3864                 decomp_param[idx].len = len;
3865                 qemu_cond_signal(&decomp_param[idx].cond);
3866                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3867                 break;
3868             }
3869         }
3870         if (idx < thread_count) {
3871             break;
3872         } else {
3873             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3874         }
3875     }
3876     qemu_mutex_unlock(&decomp_done_lock);
3877 }
3878 
3879 /*
3880  * colo cache: this is for secondary VM, we cache the whole
3881  * memory of the secondary VM, it is need to hold the global lock
3882  * to call this helper.
3883  */
3884 int colo_init_ram_cache(void)
3885 {
3886     RAMBlock *block;
3887 
3888     rcu_read_lock();
3889     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3890         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3891                                                 NULL,
3892                                                 false);
3893         if (!block->colo_cache) {
3894             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3895                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3896                          block->used_length);
3897             goto out_locked;
3898         }
3899         memcpy(block->colo_cache, block->host, block->used_length);
3900     }
3901     rcu_read_unlock();
3902     /*
3903     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3904     * with to decide which page in cache should be flushed into SVM's RAM. Here
3905     * we use the same name 'ram_bitmap' as for migration.
3906     */
3907     if (ram_bytes_total()) {
3908         RAMBlock *block;
3909 
3910         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3911             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3912 
3913             block->bmap = bitmap_new(pages);
3914             bitmap_set(block->bmap, 0, pages);
3915         }
3916     }
3917     ram_state = g_new0(RAMState, 1);
3918     ram_state->migration_dirty_pages = 0;
3919     qemu_mutex_init(&ram_state->bitmap_mutex);
3920     memory_global_dirty_log_start();
3921 
3922     return 0;
3923 
3924 out_locked:
3925 
3926     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3927         if (block->colo_cache) {
3928             qemu_anon_ram_free(block->colo_cache, block->used_length);
3929             block->colo_cache = NULL;
3930         }
3931     }
3932 
3933     rcu_read_unlock();
3934     return -errno;
3935 }
3936 
3937 /* It is need to hold the global lock to call this helper */
3938 void colo_release_ram_cache(void)
3939 {
3940     RAMBlock *block;
3941 
3942     memory_global_dirty_log_stop();
3943     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3944         g_free(block->bmap);
3945         block->bmap = NULL;
3946     }
3947 
3948     rcu_read_lock();
3949 
3950     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3951         if (block->colo_cache) {
3952             qemu_anon_ram_free(block->colo_cache, block->used_length);
3953             block->colo_cache = NULL;
3954         }
3955     }
3956 
3957     rcu_read_unlock();
3958     qemu_mutex_destroy(&ram_state->bitmap_mutex);
3959     g_free(ram_state);
3960     ram_state = NULL;
3961 }
3962 
3963 /**
3964  * ram_load_setup: Setup RAM for migration incoming side
3965  *
3966  * Returns zero to indicate success and negative for error
3967  *
3968  * @f: QEMUFile where to receive the data
3969  * @opaque: RAMState pointer
3970  */
3971 static int ram_load_setup(QEMUFile *f, void *opaque)
3972 {
3973     if (compress_threads_load_setup(f)) {
3974         return -1;
3975     }
3976 
3977     xbzrle_load_setup();
3978     ramblock_recv_map_init();
3979 
3980     return 0;
3981 }
3982 
3983 static int ram_load_cleanup(void *opaque)
3984 {
3985     RAMBlock *rb;
3986 
3987     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3988         if (ramblock_is_pmem(rb)) {
3989             pmem_persist(rb->host, rb->used_length);
3990         }
3991     }
3992 
3993     xbzrle_load_cleanup();
3994     compress_threads_load_cleanup();
3995 
3996     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3997         g_free(rb->receivedmap);
3998         rb->receivedmap = NULL;
3999     }
4000 
4001     return 0;
4002 }
4003 
4004 /**
4005  * ram_postcopy_incoming_init: allocate postcopy data structures
4006  *
4007  * Returns 0 for success and negative if there was one error
4008  *
4009  * @mis: current migration incoming state
4010  *
4011  * Allocate data structures etc needed by incoming migration with
4012  * postcopy-ram. postcopy-ram's similarly names
4013  * postcopy_ram_incoming_init does the work.
4014  */
4015 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4016 {
4017     return postcopy_ram_incoming_init(mis);
4018 }
4019 
4020 /**
4021  * ram_load_postcopy: load a page in postcopy case
4022  *
4023  * Returns 0 for success or -errno in case of error
4024  *
4025  * Called in postcopy mode by ram_load().
4026  * rcu_read_lock is taken prior to this being called.
4027  *
4028  * @f: QEMUFile where to send the data
4029  */
4030 static int ram_load_postcopy(QEMUFile *f)
4031 {
4032     int flags = 0, ret = 0;
4033     bool place_needed = false;
4034     bool matches_target_page_size = false;
4035     MigrationIncomingState *mis = migration_incoming_get_current();
4036     /* Temporary page that is later 'placed' */
4037     void *postcopy_host_page = postcopy_get_tmp_page(mis);
4038     void *last_host = NULL;
4039     bool all_zero = false;
4040 
4041     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4042         ram_addr_t addr;
4043         void *host = NULL;
4044         void *page_buffer = NULL;
4045         void *place_source = NULL;
4046         RAMBlock *block = NULL;
4047         uint8_t ch;
4048 
4049         addr = qemu_get_be64(f);
4050 
4051         /*
4052          * If qemu file error, we should stop here, and then "addr"
4053          * may be invalid
4054          */
4055         ret = qemu_file_get_error(f);
4056         if (ret) {
4057             break;
4058         }
4059 
4060         flags = addr & ~TARGET_PAGE_MASK;
4061         addr &= TARGET_PAGE_MASK;
4062 
4063         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4064         place_needed = false;
4065         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4066             block = ram_block_from_stream(f, flags);
4067 
4068             host = host_from_ram_block_offset(block, addr);
4069             if (!host) {
4070                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4071                 ret = -EINVAL;
4072                 break;
4073             }
4074             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4075             /*
4076              * Postcopy requires that we place whole host pages atomically;
4077              * these may be huge pages for RAMBlocks that are backed by
4078              * hugetlbfs.
4079              * To make it atomic, the data is read into a temporary page
4080              * that's moved into place later.
4081              * The migration protocol uses,  possibly smaller, target-pages
4082              * however the source ensures it always sends all the components
4083              * of a host page in order.
4084              */
4085             page_buffer = postcopy_host_page +
4086                           ((uintptr_t)host & (block->page_size - 1));
4087             /* If all TP are zero then we can optimise the place */
4088             if (!((uintptr_t)host & (block->page_size - 1))) {
4089                 all_zero = true;
4090             } else {
4091                 /* not the 1st TP within the HP */
4092                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4093                     error_report("Non-sequential target page %p/%p",
4094                                   host, last_host);
4095                     ret = -EINVAL;
4096                     break;
4097                 }
4098             }
4099 
4100 
4101             /*
4102              * If it's the last part of a host page then we place the host
4103              * page
4104              */
4105             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4106                                      (block->page_size - 1)) == 0;
4107             place_source = postcopy_host_page;
4108         }
4109         last_host = host;
4110 
4111         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4112         case RAM_SAVE_FLAG_ZERO:
4113             ch = qemu_get_byte(f);
4114             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4115             if (ch) {
4116                 all_zero = false;
4117             }
4118             break;
4119 
4120         case RAM_SAVE_FLAG_PAGE:
4121             all_zero = false;
4122             if (!matches_target_page_size) {
4123                 /* For huge pages, we always use temporary buffer */
4124                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4125             } else {
4126                 /*
4127                  * For small pages that matches target page size, we
4128                  * avoid the qemu_file copy.  Instead we directly use
4129                  * the buffer of QEMUFile to place the page.  Note: we
4130                  * cannot do any QEMUFile operation before using that
4131                  * buffer to make sure the buffer is valid when
4132                  * placing the page.
4133                  */
4134                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4135                                          TARGET_PAGE_SIZE);
4136             }
4137             break;
4138         case RAM_SAVE_FLAG_EOS:
4139             /* normal exit */
4140             multifd_recv_sync_main();
4141             break;
4142         default:
4143             error_report("Unknown combination of migration flags: %#x"
4144                          " (postcopy mode)", flags);
4145             ret = -EINVAL;
4146             break;
4147         }
4148 
4149         /* Detect for any possible file errors */
4150         if (!ret && qemu_file_get_error(f)) {
4151             ret = qemu_file_get_error(f);
4152         }
4153 
4154         if (!ret && place_needed) {
4155             /* This gets called at the last target page in the host page */
4156             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4157 
4158             if (all_zero) {
4159                 ret = postcopy_place_page_zero(mis, place_dest,
4160                                                block);
4161             } else {
4162                 ret = postcopy_place_page(mis, place_dest,
4163                                           place_source, block);
4164             }
4165         }
4166     }
4167 
4168     return ret;
4169 }
4170 
4171 static bool postcopy_is_advised(void)
4172 {
4173     PostcopyState ps = postcopy_state_get();
4174     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4175 }
4176 
4177 static bool postcopy_is_running(void)
4178 {
4179     PostcopyState ps = postcopy_state_get();
4180     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4181 }
4182 
4183 /*
4184  * Flush content of RAM cache into SVM's memory.
4185  * Only flush the pages that be dirtied by PVM or SVM or both.
4186  */
4187 static void colo_flush_ram_cache(void)
4188 {
4189     RAMBlock *block = NULL;
4190     void *dst_host;
4191     void *src_host;
4192     unsigned long offset = 0;
4193 
4194     memory_global_dirty_log_sync();
4195     rcu_read_lock();
4196     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4197         migration_bitmap_sync_range(ram_state, block, block->used_length);
4198     }
4199     rcu_read_unlock();
4200 
4201     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4202     rcu_read_lock();
4203     block = QLIST_FIRST_RCU(&ram_list.blocks);
4204 
4205     while (block) {
4206         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4207 
4208         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4209             offset = 0;
4210             block = QLIST_NEXT_RCU(block, next);
4211         } else {
4212             migration_bitmap_clear_dirty(ram_state, block, offset);
4213             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4214             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4215             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4216         }
4217     }
4218 
4219     rcu_read_unlock();
4220     trace_colo_flush_ram_cache_end();
4221 }
4222 
4223 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4224 {
4225     int flags = 0, ret = 0, invalid_flags = 0;
4226     static uint64_t seq_iter;
4227     int len = 0;
4228     /*
4229      * If system is running in postcopy mode, page inserts to host memory must
4230      * be atomic
4231      */
4232     bool postcopy_running = postcopy_is_running();
4233     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4234     bool postcopy_advised = postcopy_is_advised();
4235 
4236     seq_iter++;
4237 
4238     if (version_id != 4) {
4239         ret = -EINVAL;
4240     }
4241 
4242     if (!migrate_use_compression()) {
4243         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4244     }
4245     /* This RCU critical section can be very long running.
4246      * When RCU reclaims in the code start to become numerous,
4247      * it will be necessary to reduce the granularity of this
4248      * critical section.
4249      */
4250     rcu_read_lock();
4251 
4252     if (postcopy_running) {
4253         ret = ram_load_postcopy(f);
4254     }
4255 
4256     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4257         ram_addr_t addr, total_ram_bytes;
4258         void *host = NULL;
4259         uint8_t ch;
4260 
4261         addr = qemu_get_be64(f);
4262         flags = addr & ~TARGET_PAGE_MASK;
4263         addr &= TARGET_PAGE_MASK;
4264 
4265         if (flags & invalid_flags) {
4266             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4267                 error_report("Received an unexpected compressed page");
4268             }
4269 
4270             ret = -EINVAL;
4271             break;
4272         }
4273 
4274         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4275                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4276             RAMBlock *block = ram_block_from_stream(f, flags);
4277 
4278             /*
4279              * After going into COLO, we should load the Page into colo_cache.
4280              */
4281             if (migration_incoming_in_colo_state()) {
4282                 host = colo_cache_from_block_offset(block, addr);
4283             } else {
4284                 host = host_from_ram_block_offset(block, addr);
4285             }
4286             if (!host) {
4287                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4288                 ret = -EINVAL;
4289                 break;
4290             }
4291 
4292             if (!migration_incoming_in_colo_state()) {
4293                 ramblock_recv_bitmap_set(block, host);
4294             }
4295 
4296             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4297         }
4298 
4299         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4300         case RAM_SAVE_FLAG_MEM_SIZE:
4301             /* Synchronize RAM block list */
4302             total_ram_bytes = addr;
4303             while (!ret && total_ram_bytes) {
4304                 RAMBlock *block;
4305                 char id[256];
4306                 ram_addr_t length;
4307 
4308                 len = qemu_get_byte(f);
4309                 qemu_get_buffer(f, (uint8_t *)id, len);
4310                 id[len] = 0;
4311                 length = qemu_get_be64(f);
4312 
4313                 block = qemu_ram_block_by_name(id);
4314                 if (block && !qemu_ram_is_migratable(block)) {
4315                     error_report("block %s should not be migrated !", id);
4316                     ret = -EINVAL;
4317                 } else if (block) {
4318                     if (length != block->used_length) {
4319                         Error *local_err = NULL;
4320 
4321                         ret = qemu_ram_resize(block, length,
4322                                               &local_err);
4323                         if (local_err) {
4324                             error_report_err(local_err);
4325                         }
4326                     }
4327                     /* For postcopy we need to check hugepage sizes match */
4328                     if (postcopy_advised &&
4329                         block->page_size != qemu_host_page_size) {
4330                         uint64_t remote_page_size = qemu_get_be64(f);
4331                         if (remote_page_size != block->page_size) {
4332                             error_report("Mismatched RAM page size %s "
4333                                          "(local) %zd != %" PRId64,
4334                                          id, block->page_size,
4335                                          remote_page_size);
4336                             ret = -EINVAL;
4337                         }
4338                     }
4339                     if (migrate_ignore_shared()) {
4340                         hwaddr addr = qemu_get_be64(f);
4341                         bool ignored = qemu_get_byte(f);
4342                         if (ignored != ramblock_is_ignored(block)) {
4343                             error_report("RAM block %s should %s be migrated",
4344                                          id, ignored ? "" : "not");
4345                             ret = -EINVAL;
4346                         }
4347                         if (ramblock_is_ignored(block) &&
4348                             block->mr->addr != addr) {
4349                             error_report("Mismatched GPAs for block %s "
4350                                          "%" PRId64 "!= %" PRId64,
4351                                          id, (uint64_t)addr,
4352                                          (uint64_t)block->mr->addr);
4353                             ret = -EINVAL;
4354                         }
4355                     }
4356                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4357                                           block->idstr);
4358                 } else {
4359                     error_report("Unknown ramblock \"%s\", cannot "
4360                                  "accept migration", id);
4361                     ret = -EINVAL;
4362                 }
4363 
4364                 total_ram_bytes -= length;
4365             }
4366             break;
4367 
4368         case RAM_SAVE_FLAG_ZERO:
4369             ch = qemu_get_byte(f);
4370             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4371             break;
4372 
4373         case RAM_SAVE_FLAG_PAGE:
4374             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4375             break;
4376 
4377         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4378             len = qemu_get_be32(f);
4379             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4380                 error_report("Invalid compressed data length: %d", len);
4381                 ret = -EINVAL;
4382                 break;
4383             }
4384             decompress_data_with_multi_threads(f, host, len);
4385             break;
4386 
4387         case RAM_SAVE_FLAG_XBZRLE:
4388             if (load_xbzrle(f, addr, host) < 0) {
4389                 error_report("Failed to decompress XBZRLE page at "
4390                              RAM_ADDR_FMT, addr);
4391                 ret = -EINVAL;
4392                 break;
4393             }
4394             break;
4395         case RAM_SAVE_FLAG_EOS:
4396             /* normal exit */
4397             multifd_recv_sync_main();
4398             break;
4399         default:
4400             if (flags & RAM_SAVE_FLAG_HOOK) {
4401                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4402             } else {
4403                 error_report("Unknown combination of migration flags: %#x",
4404                              flags);
4405                 ret = -EINVAL;
4406             }
4407         }
4408         if (!ret) {
4409             ret = qemu_file_get_error(f);
4410         }
4411     }
4412 
4413     ret |= wait_for_decompress_done();
4414     rcu_read_unlock();
4415     trace_ram_load_complete(ret, seq_iter);
4416 
4417     if (!ret  && migration_incoming_in_colo_state()) {
4418         colo_flush_ram_cache();
4419     }
4420     return ret;
4421 }
4422 
4423 static bool ram_has_postcopy(void *opaque)
4424 {
4425     RAMBlock *rb;
4426     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4427         if (ramblock_is_pmem(rb)) {
4428             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4429                          "is not supported now!", rb->idstr, rb->host);
4430             return false;
4431         }
4432     }
4433 
4434     return migrate_postcopy_ram();
4435 }
4436 
4437 /* Sync all the dirty bitmap with destination VM.  */
4438 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4439 {
4440     RAMBlock *block;
4441     QEMUFile *file = s->to_dst_file;
4442     int ramblock_count = 0;
4443 
4444     trace_ram_dirty_bitmap_sync_start();
4445 
4446     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4447         qemu_savevm_send_recv_bitmap(file, block->idstr);
4448         trace_ram_dirty_bitmap_request(block->idstr);
4449         ramblock_count++;
4450     }
4451 
4452     trace_ram_dirty_bitmap_sync_wait();
4453 
4454     /* Wait until all the ramblocks' dirty bitmap synced */
4455     while (ramblock_count--) {
4456         qemu_sem_wait(&s->rp_state.rp_sem);
4457     }
4458 
4459     trace_ram_dirty_bitmap_sync_complete();
4460 
4461     return 0;
4462 }
4463 
4464 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4465 {
4466     qemu_sem_post(&s->rp_state.rp_sem);
4467 }
4468 
4469 /*
4470  * Read the received bitmap, revert it as the initial dirty bitmap.
4471  * This is only used when the postcopy migration is paused but wants
4472  * to resume from a middle point.
4473  */
4474 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4475 {
4476     int ret = -EINVAL;
4477     QEMUFile *file = s->rp_state.from_dst_file;
4478     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4479     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4480     uint64_t size, end_mark;
4481 
4482     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4483 
4484     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4485         error_report("%s: incorrect state %s", __func__,
4486                      MigrationStatus_str(s->state));
4487         return -EINVAL;
4488     }
4489 
4490     /*
4491      * Note: see comments in ramblock_recv_bitmap_send() on why we
4492      * need the endianess convertion, and the paddings.
4493      */
4494     local_size = ROUND_UP(local_size, 8);
4495 
4496     /* Add paddings */
4497     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4498 
4499     size = qemu_get_be64(file);
4500 
4501     /* The size of the bitmap should match with our ramblock */
4502     if (size != local_size) {
4503         error_report("%s: ramblock '%s' bitmap size mismatch "
4504                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4505                      block->idstr, size, local_size);
4506         ret = -EINVAL;
4507         goto out;
4508     }
4509 
4510     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4511     end_mark = qemu_get_be64(file);
4512 
4513     ret = qemu_file_get_error(file);
4514     if (ret || size != local_size) {
4515         error_report("%s: read bitmap failed for ramblock '%s': %d"
4516                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4517                      __func__, block->idstr, ret, local_size, size);
4518         ret = -EIO;
4519         goto out;
4520     }
4521 
4522     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4523         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4524                      __func__, block->idstr, end_mark);
4525         ret = -EINVAL;
4526         goto out;
4527     }
4528 
4529     /*
4530      * Endianess convertion. We are during postcopy (though paused).
4531      * The dirty bitmap won't change. We can directly modify it.
4532      */
4533     bitmap_from_le(block->bmap, le_bitmap, nbits);
4534 
4535     /*
4536      * What we received is "received bitmap". Revert it as the initial
4537      * dirty bitmap for this ramblock.
4538      */
4539     bitmap_complement(block->bmap, block->bmap, nbits);
4540 
4541     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4542 
4543     /*
4544      * We succeeded to sync bitmap for current ramblock. If this is
4545      * the last one to sync, we need to notify the main send thread.
4546      */
4547     ram_dirty_bitmap_reload_notify(s);
4548 
4549     ret = 0;
4550 out:
4551     g_free(le_bitmap);
4552     return ret;
4553 }
4554 
4555 static int ram_resume_prepare(MigrationState *s, void *opaque)
4556 {
4557     RAMState *rs = *(RAMState **)opaque;
4558     int ret;
4559 
4560     ret = ram_dirty_bitmap_sync_all(s, rs);
4561     if (ret) {
4562         return ret;
4563     }
4564 
4565     ram_state_resume_prepare(rs, s->to_dst_file);
4566 
4567     return 0;
4568 }
4569 
4570 static SaveVMHandlers savevm_ram_handlers = {
4571     .save_setup = ram_save_setup,
4572     .save_live_iterate = ram_save_iterate,
4573     .save_live_complete_postcopy = ram_save_complete,
4574     .save_live_complete_precopy = ram_save_complete,
4575     .has_postcopy = ram_has_postcopy,
4576     .save_live_pending = ram_save_pending,
4577     .load_state = ram_load,
4578     .save_cleanup = ram_save_cleanup,
4579     .load_setup = ram_load_setup,
4580     .load_cleanup = ram_load_cleanup,
4581     .resume_prepare = ram_resume_prepare,
4582 };
4583 
4584 void ram_mig_init(void)
4585 {
4586     qemu_mutex_init(&XBZRLE.lock);
4587     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4588 }
4589