xref: /openbmc/qemu/migration/ram.c (revision 4a12a11a)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/pmem.h"
37 #include "xbzrle.h"
38 #include "ram.h"
39 #include "migration.h"
40 #include "socket.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "postcopy-ram.h"
45 #include "page_cache.h"
46 #include "qemu/error-report.h"
47 #include "qapi/error.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/sysemu.h"
57 #include "qemu/uuid.h"
58 #include "savevm.h"
59 #include "qemu/iov.h"
60 
61 /***********************************************************/
62 /* ram save/restore */
63 
64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
65  * worked for pages that where filled with the same char.  We switched
66  * it to only search for the zero value.  And to avoid confusion with
67  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
68  */
69 
70 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
71 #define RAM_SAVE_FLAG_ZERO     0x02
72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
73 #define RAM_SAVE_FLAG_PAGE     0x08
74 #define RAM_SAVE_FLAG_EOS      0x10
75 #define RAM_SAVE_FLAG_CONTINUE 0x20
76 #define RAM_SAVE_FLAG_XBZRLE   0x40
77 /* 0x80 is reserved in migration.h start with 0x100 next */
78 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
79 
80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 {
82     return buffer_is_zero(p, size);
83 }
84 
85 XBZRLECacheStats xbzrle_counters;
86 
87 /* struct contains XBZRLE cache and a static page
88    used by the compression */
89 static struct {
90     /* buffer used for XBZRLE encoding */
91     uint8_t *encoded_buf;
92     /* buffer for storing page content */
93     uint8_t *current_buf;
94     /* Cache for XBZRLE, Protected by lock. */
95     PageCache *cache;
96     QemuMutex lock;
97     /* it will store a page full of zeros */
98     uint8_t *zero_target_page;
99     /* buffer used for XBZRLE decoding */
100     uint8_t *decoded_buf;
101 } XBZRLE;
102 
103 static void XBZRLE_cache_lock(void)
104 {
105     if (migrate_use_xbzrle())
106         qemu_mutex_lock(&XBZRLE.lock);
107 }
108 
109 static void XBZRLE_cache_unlock(void)
110 {
111     if (migrate_use_xbzrle())
112         qemu_mutex_unlock(&XBZRLE.lock);
113 }
114 
115 /**
116  * xbzrle_cache_resize: resize the xbzrle cache
117  *
118  * This function is called from qmp_migrate_set_cache_size in main
119  * thread, possibly while a migration is in progress.  A running
120  * migration may be using the cache and might finish during this call,
121  * hence changes to the cache are protected by XBZRLE.lock().
122  *
123  * Returns 0 for success or -1 for error
124  *
125  * @new_size: new cache size
126  * @errp: set *errp if the check failed, with reason
127  */
128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 {
130     PageCache *new_cache;
131     int64_t ret = 0;
132 
133     /* Check for truncation */
134     if (new_size != (size_t)new_size) {
135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136                    "exceeding address space");
137         return -1;
138     }
139 
140     if (new_size == migrate_xbzrle_cache_size()) {
141         /* nothing to do */
142         return 0;
143     }
144 
145     XBZRLE_cache_lock();
146 
147     if (XBZRLE.cache != NULL) {
148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149         if (!new_cache) {
150             ret = -1;
151             goto out;
152         }
153 
154         cache_fini(XBZRLE.cache);
155         XBZRLE.cache = new_cache;
156     }
157 out:
158     XBZRLE_cache_unlock();
159     return ret;
160 }
161 
162 static bool ramblock_is_ignored(RAMBlock *block)
163 {
164     return !qemu_ram_is_migratable(block) ||
165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
166 }
167 
168 /* Should be holding either ram_list.mutex, or the RCU lock. */
169 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
170     INTERNAL_RAMBLOCK_FOREACH(block)                   \
171         if (ramblock_is_ignored(block)) {} else
172 
173 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
174     INTERNAL_RAMBLOCK_FOREACH(block)                   \
175         if (!qemu_ram_is_migratable(block)) {} else
176 
177 #undef RAMBLOCK_FOREACH
178 
179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180 {
181     RAMBlock *block;
182     int ret = 0;
183 
184     rcu_read_lock();
185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186         ret = func(block, opaque);
187         if (ret) {
188             break;
189         }
190     }
191     rcu_read_unlock();
192     return ret;
193 }
194 
195 static void ramblock_recv_map_init(void)
196 {
197     RAMBlock *rb;
198 
199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
200         assert(!rb->receivedmap);
201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202     }
203 }
204 
205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206 {
207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208                     rb->receivedmap);
209 }
210 
211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212 {
213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214 }
215 
216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217 {
218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219 }
220 
221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222                                     size_t nr)
223 {
224     bitmap_set_atomic(rb->receivedmap,
225                       ramblock_recv_bitmap_offset(host_addr, rb),
226                       nr);
227 }
228 
229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
230 
231 /*
232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233  *
234  * Returns >0 if success with sent bytes, or <0 if error.
235  */
236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237                                   const char *block_name)
238 {
239     RAMBlock *block = qemu_ram_block_by_name(block_name);
240     unsigned long *le_bitmap, nbits;
241     uint64_t size;
242 
243     if (!block) {
244         error_report("%s: invalid block name: %s", __func__, block_name);
245         return -1;
246     }
247 
248     nbits = block->used_length >> TARGET_PAGE_BITS;
249 
250     /*
251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
252      * machines we may need 4 more bytes for padding (see below
253      * comment). So extend it a bit before hand.
254      */
255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256 
257     /*
258      * Always use little endian when sending the bitmap. This is
259      * required that when source and destination VMs are not using the
260      * same endianess. (Note: big endian won't work.)
261      */
262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263 
264     /* Size of the bitmap, in bytes */
265     size = DIV_ROUND_UP(nbits, 8);
266 
267     /*
268      * size is always aligned to 8 bytes for 64bit machines, but it
269      * may not be true for 32bit machines. We need this padding to
270      * make sure the migration can survive even between 32bit and
271      * 64bit machines.
272      */
273     size = ROUND_UP(size, 8);
274 
275     qemu_put_be64(file, size);
276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277     /*
278      * Mark as an end, in case the middle part is screwed up due to
279      * some "misterious" reason.
280      */
281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282     qemu_fflush(file);
283 
284     g_free(le_bitmap);
285 
286     if (qemu_file_get_error(file)) {
287         return qemu_file_get_error(file);
288     }
289 
290     return size + sizeof(size);
291 }
292 
293 /*
294  * An outstanding page request, on the source, having been received
295  * and queued
296  */
297 struct RAMSrcPageRequest {
298     RAMBlock *rb;
299     hwaddr    offset;
300     hwaddr    len;
301 
302     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303 };
304 
305 /* State of RAM for migration */
306 struct RAMState {
307     /* QEMUFile used for this migration */
308     QEMUFile *f;
309     /* Last block that we have visited searching for dirty pages */
310     RAMBlock *last_seen_block;
311     /* Last block from where we have sent data */
312     RAMBlock *last_sent_block;
313     /* Last dirty target page we have sent */
314     ram_addr_t last_page;
315     /* last ram version we have seen */
316     uint32_t last_version;
317     /* We are in the first round */
318     bool ram_bulk_stage;
319     /* The free page optimization is enabled */
320     bool fpo_enabled;
321     /* How many times we have dirty too many pages */
322     int dirty_rate_high_cnt;
323     /* these variables are used for bitmap sync */
324     /* last time we did a full bitmap_sync */
325     int64_t time_last_bitmap_sync;
326     /* bytes transferred at start_time */
327     uint64_t bytes_xfer_prev;
328     /* number of dirty pages since start_time */
329     uint64_t num_dirty_pages_period;
330     /* xbzrle misses since the beginning of the period */
331     uint64_t xbzrle_cache_miss_prev;
332 
333     /* compression statistics since the beginning of the period */
334     /* amount of count that no free thread to compress data */
335     uint64_t compress_thread_busy_prev;
336     /* amount bytes after compression */
337     uint64_t compressed_size_prev;
338     /* amount of compressed pages */
339     uint64_t compress_pages_prev;
340 
341     /* total handled target pages at the beginning of period */
342     uint64_t target_page_count_prev;
343     /* total handled target pages since start */
344     uint64_t target_page_count;
345     /* number of dirty bits in the bitmap */
346     uint64_t migration_dirty_pages;
347     /* Protects modification of the bitmap and migration dirty pages */
348     QemuMutex bitmap_mutex;
349     /* The RAMBlock used in the last src_page_requests */
350     RAMBlock *last_req_rb;
351     /* Queue of outstanding page requests from the destination */
352     QemuMutex src_page_req_mutex;
353     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 };
355 typedef struct RAMState RAMState;
356 
357 static RAMState *ram_state;
358 
359 static NotifierWithReturnList precopy_notifier_list;
360 
361 void precopy_infrastructure_init(void)
362 {
363     notifier_with_return_list_init(&precopy_notifier_list);
364 }
365 
366 void precopy_add_notifier(NotifierWithReturn *n)
367 {
368     notifier_with_return_list_add(&precopy_notifier_list, n);
369 }
370 
371 void precopy_remove_notifier(NotifierWithReturn *n)
372 {
373     notifier_with_return_remove(n);
374 }
375 
376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 {
378     PrecopyNotifyData pnd;
379     pnd.reason = reason;
380     pnd.errp = errp;
381 
382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383 }
384 
385 void precopy_enable_free_page_optimization(void)
386 {
387     if (!ram_state) {
388         return;
389     }
390 
391     ram_state->fpo_enabled = true;
392 }
393 
394 uint64_t ram_bytes_remaining(void)
395 {
396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397                        0;
398 }
399 
400 MigrationStats ram_counters;
401 
402 /* used by the search for pages to send */
403 struct PageSearchStatus {
404     /* Current block being searched */
405     RAMBlock    *block;
406     /* Current page to search from */
407     unsigned long page;
408     /* Set once we wrap around */
409     bool         complete_round;
410 };
411 typedef struct PageSearchStatus PageSearchStatus;
412 
413 CompressionStats compression_counters;
414 
415 struct CompressParam {
416     bool done;
417     bool quit;
418     bool zero_page;
419     QEMUFile *file;
420     QemuMutex mutex;
421     QemuCond cond;
422     RAMBlock *block;
423     ram_addr_t offset;
424 
425     /* internally used fields */
426     z_stream stream;
427     uint8_t *originbuf;
428 };
429 typedef struct CompressParam CompressParam;
430 
431 struct DecompressParam {
432     bool done;
433     bool quit;
434     QemuMutex mutex;
435     QemuCond cond;
436     void *des;
437     uint8_t *compbuf;
438     int len;
439     z_stream stream;
440 };
441 typedef struct DecompressParam DecompressParam;
442 
443 static CompressParam *comp_param;
444 static QemuThread *compress_threads;
445 /* comp_done_cond is used to wake up the migration thread when
446  * one of the compression threads has finished the compression.
447  * comp_done_lock is used to co-work with comp_done_cond.
448  */
449 static QemuMutex comp_done_lock;
450 static QemuCond comp_done_cond;
451 /* The empty QEMUFileOps will be used by file in CompressParam */
452 static const QEMUFileOps empty_ops = { };
453 
454 static QEMUFile *decomp_file;
455 static DecompressParam *decomp_param;
456 static QemuThread *decompress_threads;
457 static QemuMutex decomp_done_lock;
458 static QemuCond decomp_done_cond;
459 
460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
461                                  ram_addr_t offset, uint8_t *source_buf);
462 
463 static void *do_data_compress(void *opaque)
464 {
465     CompressParam *param = opaque;
466     RAMBlock *block;
467     ram_addr_t offset;
468     bool zero_page;
469 
470     qemu_mutex_lock(&param->mutex);
471     while (!param->quit) {
472         if (param->block) {
473             block = param->block;
474             offset = param->offset;
475             param->block = NULL;
476             qemu_mutex_unlock(&param->mutex);
477 
478             zero_page = do_compress_ram_page(param->file, &param->stream,
479                                              block, offset, param->originbuf);
480 
481             qemu_mutex_lock(&comp_done_lock);
482             param->done = true;
483             param->zero_page = zero_page;
484             qemu_cond_signal(&comp_done_cond);
485             qemu_mutex_unlock(&comp_done_lock);
486 
487             qemu_mutex_lock(&param->mutex);
488         } else {
489             qemu_cond_wait(&param->cond, &param->mutex);
490         }
491     }
492     qemu_mutex_unlock(&param->mutex);
493 
494     return NULL;
495 }
496 
497 static void compress_threads_save_cleanup(void)
498 {
499     int i, thread_count;
500 
501     if (!migrate_use_compression() || !comp_param) {
502         return;
503     }
504 
505     thread_count = migrate_compress_threads();
506     for (i = 0; i < thread_count; i++) {
507         /*
508          * we use it as a indicator which shows if the thread is
509          * properly init'd or not
510          */
511         if (!comp_param[i].file) {
512             break;
513         }
514 
515         qemu_mutex_lock(&comp_param[i].mutex);
516         comp_param[i].quit = true;
517         qemu_cond_signal(&comp_param[i].cond);
518         qemu_mutex_unlock(&comp_param[i].mutex);
519 
520         qemu_thread_join(compress_threads + i);
521         qemu_mutex_destroy(&comp_param[i].mutex);
522         qemu_cond_destroy(&comp_param[i].cond);
523         deflateEnd(&comp_param[i].stream);
524         g_free(comp_param[i].originbuf);
525         qemu_fclose(comp_param[i].file);
526         comp_param[i].file = NULL;
527     }
528     qemu_mutex_destroy(&comp_done_lock);
529     qemu_cond_destroy(&comp_done_cond);
530     g_free(compress_threads);
531     g_free(comp_param);
532     compress_threads = NULL;
533     comp_param = NULL;
534 }
535 
536 static int compress_threads_save_setup(void)
537 {
538     int i, thread_count;
539 
540     if (!migrate_use_compression()) {
541         return 0;
542     }
543     thread_count = migrate_compress_threads();
544     compress_threads = g_new0(QemuThread, thread_count);
545     comp_param = g_new0(CompressParam, thread_count);
546     qemu_cond_init(&comp_done_cond);
547     qemu_mutex_init(&comp_done_lock);
548     for (i = 0; i < thread_count; i++) {
549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550         if (!comp_param[i].originbuf) {
551             goto exit;
552         }
553 
554         if (deflateInit(&comp_param[i].stream,
555                         migrate_compress_level()) != Z_OK) {
556             g_free(comp_param[i].originbuf);
557             goto exit;
558         }
559 
560         /* comp_param[i].file is just used as a dummy buffer to save data,
561          * set its ops to empty.
562          */
563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564         comp_param[i].done = true;
565         comp_param[i].quit = false;
566         qemu_mutex_init(&comp_param[i].mutex);
567         qemu_cond_init(&comp_param[i].cond);
568         qemu_thread_create(compress_threads + i, "compress",
569                            do_data_compress, comp_param + i,
570                            QEMU_THREAD_JOINABLE);
571     }
572     return 0;
573 
574 exit:
575     compress_threads_save_cleanup();
576     return -1;
577 }
578 
579 /* Multiple fd's */
580 
581 #define MULTIFD_MAGIC 0x11223344U
582 #define MULTIFD_VERSION 1
583 
584 #define MULTIFD_FLAG_SYNC (1 << 0)
585 
586 /* This value needs to be a multiple of qemu_target_page_size() */
587 #define MULTIFD_PACKET_SIZE (512 * 1024)
588 
589 typedef struct {
590     uint32_t magic;
591     uint32_t version;
592     unsigned char uuid[16]; /* QemuUUID */
593     uint8_t id;
594     uint8_t unused1[7];     /* Reserved for future use */
595     uint64_t unused2[4];    /* Reserved for future use */
596 } __attribute__((packed)) MultiFDInit_t;
597 
598 typedef struct {
599     uint32_t magic;
600     uint32_t version;
601     uint32_t flags;
602     /* maximum number of allocated pages */
603     uint32_t pages_alloc;
604     uint32_t pages_used;
605     /* size of the next packet that contains pages */
606     uint32_t next_packet_size;
607     uint64_t packet_num;
608     uint64_t unused[4];    /* Reserved for future use */
609     char ramblock[256];
610     uint64_t offset[];
611 } __attribute__((packed)) MultiFDPacket_t;
612 
613 typedef struct {
614     /* number of used pages */
615     uint32_t used;
616     /* number of allocated pages */
617     uint32_t allocated;
618     /* global number of generated multifd packets */
619     uint64_t packet_num;
620     /* offset of each page */
621     ram_addr_t *offset;
622     /* pointer to each page */
623     struct iovec *iov;
624     RAMBlock *block;
625 } MultiFDPages_t;
626 
627 typedef struct {
628     /* this fields are not changed once the thread is created */
629     /* channel number */
630     uint8_t id;
631     /* channel thread name */
632     char *name;
633     /* channel thread id */
634     QemuThread thread;
635     /* communication channel */
636     QIOChannel *c;
637     /* sem where to wait for more work */
638     QemuSemaphore sem;
639     /* this mutex protects the following parameters */
640     QemuMutex mutex;
641     /* is this channel thread running */
642     bool running;
643     /* should this thread finish */
644     bool quit;
645     /* thread has work to do */
646     int pending_job;
647     /* array of pages to sent */
648     MultiFDPages_t *pages;
649     /* packet allocated len */
650     uint32_t packet_len;
651     /* pointer to the packet */
652     MultiFDPacket_t *packet;
653     /* multifd flags for each packet */
654     uint32_t flags;
655     /* size of the next packet that contains pages */
656     uint32_t next_packet_size;
657     /* global number of generated multifd packets */
658     uint64_t packet_num;
659     /* thread local variables */
660     /* packets sent through this channel */
661     uint64_t num_packets;
662     /* pages sent through this channel */
663     uint64_t num_pages;
664 }  MultiFDSendParams;
665 
666 typedef struct {
667     /* this fields are not changed once the thread is created */
668     /* channel number */
669     uint8_t id;
670     /* channel thread name */
671     char *name;
672     /* channel thread id */
673     QemuThread thread;
674     /* communication channel */
675     QIOChannel *c;
676     /* this mutex protects the following parameters */
677     QemuMutex mutex;
678     /* is this channel thread running */
679     bool running;
680     /* array of pages to receive */
681     MultiFDPages_t *pages;
682     /* packet allocated len */
683     uint32_t packet_len;
684     /* pointer to the packet */
685     MultiFDPacket_t *packet;
686     /* multifd flags for each packet */
687     uint32_t flags;
688     /* global number of generated multifd packets */
689     uint64_t packet_num;
690     /* thread local variables */
691     /* size of the next packet that contains pages */
692     uint32_t next_packet_size;
693     /* packets sent through this channel */
694     uint64_t num_packets;
695     /* pages sent through this channel */
696     uint64_t num_pages;
697     /* syncs main thread and channels */
698     QemuSemaphore sem_sync;
699 } MultiFDRecvParams;
700 
701 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
702 {
703     MultiFDInit_t msg;
704     int ret;
705 
706     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
707     msg.version = cpu_to_be32(MULTIFD_VERSION);
708     msg.id = p->id;
709     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
710 
711     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
712     if (ret != 0) {
713         return -1;
714     }
715     return 0;
716 }
717 
718 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
719 {
720     MultiFDInit_t msg;
721     int ret;
722 
723     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
724     if (ret != 0) {
725         return -1;
726     }
727 
728     msg.magic = be32_to_cpu(msg.magic);
729     msg.version = be32_to_cpu(msg.version);
730 
731     if (msg.magic != MULTIFD_MAGIC) {
732         error_setg(errp, "multifd: received packet magic %x "
733                    "expected %x", msg.magic, MULTIFD_MAGIC);
734         return -1;
735     }
736 
737     if (msg.version != MULTIFD_VERSION) {
738         error_setg(errp, "multifd: received packet version %d "
739                    "expected %d", msg.version, MULTIFD_VERSION);
740         return -1;
741     }
742 
743     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
744         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
745         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
746 
747         error_setg(errp, "multifd: received uuid '%s' and expected "
748                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
749         g_free(uuid);
750         g_free(msg_uuid);
751         return -1;
752     }
753 
754     if (msg.id > migrate_multifd_channels()) {
755         error_setg(errp, "multifd: received channel version %d "
756                    "expected %d", msg.version, MULTIFD_VERSION);
757         return -1;
758     }
759 
760     return msg.id;
761 }
762 
763 static MultiFDPages_t *multifd_pages_init(size_t size)
764 {
765     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
766 
767     pages->allocated = size;
768     pages->iov = g_new0(struct iovec, size);
769     pages->offset = g_new0(ram_addr_t, size);
770 
771     return pages;
772 }
773 
774 static void multifd_pages_clear(MultiFDPages_t *pages)
775 {
776     pages->used = 0;
777     pages->allocated = 0;
778     pages->packet_num = 0;
779     pages->block = NULL;
780     g_free(pages->iov);
781     pages->iov = NULL;
782     g_free(pages->offset);
783     pages->offset = NULL;
784     g_free(pages);
785 }
786 
787 static void multifd_send_fill_packet(MultiFDSendParams *p)
788 {
789     MultiFDPacket_t *packet = p->packet;
790     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
791     int i;
792 
793     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
794     packet->version = cpu_to_be32(MULTIFD_VERSION);
795     packet->flags = cpu_to_be32(p->flags);
796     packet->pages_alloc = cpu_to_be32(page_max);
797     packet->pages_used = cpu_to_be32(p->pages->used);
798     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
799     packet->packet_num = cpu_to_be64(p->packet_num);
800 
801     if (p->pages->block) {
802         strncpy(packet->ramblock, p->pages->block->idstr, 256);
803     }
804 
805     for (i = 0; i < p->pages->used; i++) {
806         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
807     }
808 }
809 
810 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
811 {
812     MultiFDPacket_t *packet = p->packet;
813     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
814     RAMBlock *block;
815     int i;
816 
817     packet->magic = be32_to_cpu(packet->magic);
818     if (packet->magic != MULTIFD_MAGIC) {
819         error_setg(errp, "multifd: received packet "
820                    "magic %x and expected magic %x",
821                    packet->magic, MULTIFD_MAGIC);
822         return -1;
823     }
824 
825     packet->version = be32_to_cpu(packet->version);
826     if (packet->version != MULTIFD_VERSION) {
827         error_setg(errp, "multifd: received packet "
828                    "version %d and expected version %d",
829                    packet->version, MULTIFD_VERSION);
830         return -1;
831     }
832 
833     p->flags = be32_to_cpu(packet->flags);
834 
835     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
836     /*
837      * If we recevied a packet that is 100 times bigger than expected
838      * just stop migration.  It is a magic number.
839      */
840     if (packet->pages_alloc > pages_max * 100) {
841         error_setg(errp, "multifd: received packet "
842                    "with size %d and expected a maximum size of %d",
843                    packet->pages_alloc, pages_max * 100) ;
844         return -1;
845     }
846     /*
847      * We received a packet that is bigger than expected but inside
848      * reasonable limits (see previous comment).  Just reallocate.
849      */
850     if (packet->pages_alloc > p->pages->allocated) {
851         multifd_pages_clear(p->pages);
852         p->pages = multifd_pages_init(packet->pages_alloc);
853     }
854 
855     p->pages->used = be32_to_cpu(packet->pages_used);
856     if (p->pages->used > packet->pages_alloc) {
857         error_setg(errp, "multifd: received packet "
858                    "with %d pages and expected maximum pages are %d",
859                    p->pages->used, packet->pages_alloc) ;
860         return -1;
861     }
862 
863     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
864     p->packet_num = be64_to_cpu(packet->packet_num);
865 
866     if (p->pages->used) {
867         /* make sure that ramblock is 0 terminated */
868         packet->ramblock[255] = 0;
869         block = qemu_ram_block_by_name(packet->ramblock);
870         if (!block) {
871             error_setg(errp, "multifd: unknown ram block %s",
872                        packet->ramblock);
873             return -1;
874         }
875     }
876 
877     for (i = 0; i < p->pages->used; i++) {
878         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
879 
880         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
881             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
882                        " (max " RAM_ADDR_FMT ")",
883                        offset, block->max_length);
884             return -1;
885         }
886         p->pages->iov[i].iov_base = block->host + offset;
887         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
888     }
889 
890     return 0;
891 }
892 
893 struct {
894     MultiFDSendParams *params;
895     /* array of pages to sent */
896     MultiFDPages_t *pages;
897     /* syncs main thread and channels */
898     QemuSemaphore sem_sync;
899     /* global number of generated multifd packets */
900     uint64_t packet_num;
901     /* send channels ready */
902     QemuSemaphore channels_ready;
903 } *multifd_send_state;
904 
905 /*
906  * How we use multifd_send_state->pages and channel->pages?
907  *
908  * We create a pages for each channel, and a main one.  Each time that
909  * we need to send a batch of pages we interchange the ones between
910  * multifd_send_state and the channel that is sending it.  There are
911  * two reasons for that:
912  *    - to not have to do so many mallocs during migration
913  *    - to make easier to know what to free at the end of migration
914  *
915  * This way we always know who is the owner of each "pages" struct,
916  * and we don't need any locking.  It belongs to the migration thread
917  * or to the channel thread.  Switching is safe because the migration
918  * thread is using the channel mutex when changing it, and the channel
919  * have to had finish with its own, otherwise pending_job can't be
920  * false.
921  */
922 
923 static void multifd_send_pages(void)
924 {
925     int i;
926     static int next_channel;
927     MultiFDSendParams *p = NULL; /* make happy gcc */
928     MultiFDPages_t *pages = multifd_send_state->pages;
929     uint64_t transferred;
930 
931     qemu_sem_wait(&multifd_send_state->channels_ready);
932     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
933         p = &multifd_send_state->params[i];
934 
935         qemu_mutex_lock(&p->mutex);
936         if (!p->pending_job) {
937             p->pending_job++;
938             next_channel = (i + 1) % migrate_multifd_channels();
939             break;
940         }
941         qemu_mutex_unlock(&p->mutex);
942     }
943     p->pages->used = 0;
944 
945     p->packet_num = multifd_send_state->packet_num++;
946     p->pages->block = NULL;
947     multifd_send_state->pages = p->pages;
948     p->pages = pages;
949     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
950     ram_counters.multifd_bytes += transferred;
951     ram_counters.transferred += transferred;;
952     qemu_mutex_unlock(&p->mutex);
953     qemu_sem_post(&p->sem);
954 }
955 
956 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
957 {
958     MultiFDPages_t *pages = multifd_send_state->pages;
959 
960     if (!pages->block) {
961         pages->block = block;
962     }
963 
964     if (pages->block == block) {
965         pages->offset[pages->used] = offset;
966         pages->iov[pages->used].iov_base = block->host + offset;
967         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
968         pages->used++;
969 
970         if (pages->used < pages->allocated) {
971             return;
972         }
973     }
974 
975     multifd_send_pages();
976 
977     if (pages->block != block) {
978         multifd_queue_page(block, offset);
979     }
980 }
981 
982 static void multifd_send_terminate_threads(Error *err)
983 {
984     int i;
985 
986     if (err) {
987         MigrationState *s = migrate_get_current();
988         migrate_set_error(s, err);
989         if (s->state == MIGRATION_STATUS_SETUP ||
990             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
991             s->state == MIGRATION_STATUS_DEVICE ||
992             s->state == MIGRATION_STATUS_ACTIVE) {
993             migrate_set_state(&s->state, s->state,
994                               MIGRATION_STATUS_FAILED);
995         }
996     }
997 
998     for (i = 0; i < migrate_multifd_channels(); i++) {
999         MultiFDSendParams *p = &multifd_send_state->params[i];
1000 
1001         qemu_mutex_lock(&p->mutex);
1002         p->quit = true;
1003         qemu_sem_post(&p->sem);
1004         qemu_mutex_unlock(&p->mutex);
1005     }
1006 }
1007 
1008 void multifd_save_cleanup(void)
1009 {
1010     int i;
1011 
1012     if (!migrate_use_multifd()) {
1013         return;
1014     }
1015     multifd_send_terminate_threads(NULL);
1016     for (i = 0; i < migrate_multifd_channels(); i++) {
1017         MultiFDSendParams *p = &multifd_send_state->params[i];
1018 
1019         if (p->running) {
1020             qemu_thread_join(&p->thread);
1021         }
1022         socket_send_channel_destroy(p->c);
1023         p->c = NULL;
1024         qemu_mutex_destroy(&p->mutex);
1025         qemu_sem_destroy(&p->sem);
1026         g_free(p->name);
1027         p->name = NULL;
1028         multifd_pages_clear(p->pages);
1029         p->pages = NULL;
1030         p->packet_len = 0;
1031         g_free(p->packet);
1032         p->packet = NULL;
1033     }
1034     qemu_sem_destroy(&multifd_send_state->channels_ready);
1035     qemu_sem_destroy(&multifd_send_state->sem_sync);
1036     g_free(multifd_send_state->params);
1037     multifd_send_state->params = NULL;
1038     multifd_pages_clear(multifd_send_state->pages);
1039     multifd_send_state->pages = NULL;
1040     g_free(multifd_send_state);
1041     multifd_send_state = NULL;
1042 }
1043 
1044 static void multifd_send_sync_main(void)
1045 {
1046     int i;
1047 
1048     if (!migrate_use_multifd()) {
1049         return;
1050     }
1051     if (multifd_send_state->pages->used) {
1052         multifd_send_pages();
1053     }
1054     for (i = 0; i < migrate_multifd_channels(); i++) {
1055         MultiFDSendParams *p = &multifd_send_state->params[i];
1056 
1057         trace_multifd_send_sync_main_signal(p->id);
1058 
1059         qemu_mutex_lock(&p->mutex);
1060 
1061         p->packet_num = multifd_send_state->packet_num++;
1062         p->flags |= MULTIFD_FLAG_SYNC;
1063         p->pending_job++;
1064         qemu_mutex_unlock(&p->mutex);
1065         qemu_sem_post(&p->sem);
1066     }
1067     for (i = 0; i < migrate_multifd_channels(); i++) {
1068         MultiFDSendParams *p = &multifd_send_state->params[i];
1069 
1070         trace_multifd_send_sync_main_wait(p->id);
1071         qemu_sem_wait(&multifd_send_state->sem_sync);
1072     }
1073     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1074 }
1075 
1076 static void *multifd_send_thread(void *opaque)
1077 {
1078     MultiFDSendParams *p = opaque;
1079     Error *local_err = NULL;
1080     int ret;
1081 
1082     trace_multifd_send_thread_start(p->id);
1083     rcu_register_thread();
1084 
1085     if (multifd_send_initial_packet(p, &local_err) < 0) {
1086         goto out;
1087     }
1088     /* initial packet */
1089     p->num_packets = 1;
1090 
1091     while (true) {
1092         qemu_sem_wait(&p->sem);
1093         qemu_mutex_lock(&p->mutex);
1094 
1095         if (p->pending_job) {
1096             uint32_t used = p->pages->used;
1097             uint64_t packet_num = p->packet_num;
1098             uint32_t flags = p->flags;
1099 
1100             p->next_packet_size = used * qemu_target_page_size();
1101             multifd_send_fill_packet(p);
1102             p->flags = 0;
1103             p->num_packets++;
1104             p->num_pages += used;
1105             p->pages->used = 0;
1106             qemu_mutex_unlock(&p->mutex);
1107 
1108             trace_multifd_send(p->id, packet_num, used, flags,
1109                                p->next_packet_size);
1110 
1111             ret = qio_channel_write_all(p->c, (void *)p->packet,
1112                                         p->packet_len, &local_err);
1113             if (ret != 0) {
1114                 break;
1115             }
1116 
1117             if (used) {
1118                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1119                                              used, &local_err);
1120                 if (ret != 0) {
1121                     break;
1122                 }
1123             }
1124 
1125             qemu_mutex_lock(&p->mutex);
1126             p->pending_job--;
1127             qemu_mutex_unlock(&p->mutex);
1128 
1129             if (flags & MULTIFD_FLAG_SYNC) {
1130                 qemu_sem_post(&multifd_send_state->sem_sync);
1131             }
1132             qemu_sem_post(&multifd_send_state->channels_ready);
1133         } else if (p->quit) {
1134             qemu_mutex_unlock(&p->mutex);
1135             break;
1136         } else {
1137             qemu_mutex_unlock(&p->mutex);
1138             /* sometimes there are spurious wakeups */
1139         }
1140     }
1141 
1142 out:
1143     if (local_err) {
1144         multifd_send_terminate_threads(local_err);
1145     }
1146 
1147     qemu_mutex_lock(&p->mutex);
1148     p->running = false;
1149     qemu_mutex_unlock(&p->mutex);
1150 
1151     rcu_unregister_thread();
1152     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1153 
1154     return NULL;
1155 }
1156 
1157 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1158 {
1159     MultiFDSendParams *p = opaque;
1160     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1161     Error *local_err = NULL;
1162 
1163     if (qio_task_propagate_error(task, &local_err)) {
1164         migrate_set_error(migrate_get_current(), local_err);
1165         multifd_save_cleanup();
1166     } else {
1167         p->c = QIO_CHANNEL(sioc);
1168         qio_channel_set_delay(p->c, false);
1169         p->running = true;
1170         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1171                            QEMU_THREAD_JOINABLE);
1172     }
1173 }
1174 
1175 int multifd_save_setup(void)
1176 {
1177     int thread_count;
1178     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1179     uint8_t i;
1180 
1181     if (!migrate_use_multifd()) {
1182         return 0;
1183     }
1184     thread_count = migrate_multifd_channels();
1185     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1186     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1187     multifd_send_state->pages = multifd_pages_init(page_count);
1188     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1189     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1190 
1191     for (i = 0; i < thread_count; i++) {
1192         MultiFDSendParams *p = &multifd_send_state->params[i];
1193 
1194         qemu_mutex_init(&p->mutex);
1195         qemu_sem_init(&p->sem, 0);
1196         p->quit = false;
1197         p->pending_job = 0;
1198         p->id = i;
1199         p->pages = multifd_pages_init(page_count);
1200         p->packet_len = sizeof(MultiFDPacket_t)
1201                       + sizeof(ram_addr_t) * page_count;
1202         p->packet = g_malloc0(p->packet_len);
1203         p->name = g_strdup_printf("multifdsend_%d", i);
1204         socket_send_channel_create(multifd_new_send_channel_async, p);
1205     }
1206     return 0;
1207 }
1208 
1209 struct {
1210     MultiFDRecvParams *params;
1211     /* number of created threads */
1212     int count;
1213     /* syncs main thread and channels */
1214     QemuSemaphore sem_sync;
1215     /* global number of generated multifd packets */
1216     uint64_t packet_num;
1217 } *multifd_recv_state;
1218 
1219 static void multifd_recv_terminate_threads(Error *err)
1220 {
1221     int i;
1222 
1223     if (err) {
1224         MigrationState *s = migrate_get_current();
1225         migrate_set_error(s, err);
1226         if (s->state == MIGRATION_STATUS_SETUP ||
1227             s->state == MIGRATION_STATUS_ACTIVE) {
1228             migrate_set_state(&s->state, s->state,
1229                               MIGRATION_STATUS_FAILED);
1230         }
1231     }
1232 
1233     for (i = 0; i < migrate_multifd_channels(); i++) {
1234         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1235 
1236         qemu_mutex_lock(&p->mutex);
1237         /* We could arrive here for two reasons:
1238            - normal quit, i.e. everything went fine, just finished
1239            - error quit: We close the channels so the channel threads
1240              finish the qio_channel_read_all_eof() */
1241         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1242         qemu_mutex_unlock(&p->mutex);
1243     }
1244 }
1245 
1246 int multifd_load_cleanup(Error **errp)
1247 {
1248     int i;
1249     int ret = 0;
1250 
1251     if (!migrate_use_multifd()) {
1252         return 0;
1253     }
1254     multifd_recv_terminate_threads(NULL);
1255     for (i = 0; i < migrate_multifd_channels(); i++) {
1256         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1257 
1258         if (p->running) {
1259             qemu_thread_join(&p->thread);
1260         }
1261         object_unref(OBJECT(p->c));
1262         p->c = NULL;
1263         qemu_mutex_destroy(&p->mutex);
1264         qemu_sem_destroy(&p->sem_sync);
1265         g_free(p->name);
1266         p->name = NULL;
1267         multifd_pages_clear(p->pages);
1268         p->pages = NULL;
1269         p->packet_len = 0;
1270         g_free(p->packet);
1271         p->packet = NULL;
1272     }
1273     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1274     g_free(multifd_recv_state->params);
1275     multifd_recv_state->params = NULL;
1276     g_free(multifd_recv_state);
1277     multifd_recv_state = NULL;
1278 
1279     return ret;
1280 }
1281 
1282 static void multifd_recv_sync_main(void)
1283 {
1284     int i;
1285 
1286     if (!migrate_use_multifd()) {
1287         return;
1288     }
1289     for (i = 0; i < migrate_multifd_channels(); i++) {
1290         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1291 
1292         trace_multifd_recv_sync_main_wait(p->id);
1293         qemu_sem_wait(&multifd_recv_state->sem_sync);
1294     }
1295     for (i = 0; i < migrate_multifd_channels(); i++) {
1296         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1297 
1298         qemu_mutex_lock(&p->mutex);
1299         if (multifd_recv_state->packet_num < p->packet_num) {
1300             multifd_recv_state->packet_num = p->packet_num;
1301         }
1302         qemu_mutex_unlock(&p->mutex);
1303         trace_multifd_recv_sync_main_signal(p->id);
1304         qemu_sem_post(&p->sem_sync);
1305     }
1306     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1307 }
1308 
1309 static void *multifd_recv_thread(void *opaque)
1310 {
1311     MultiFDRecvParams *p = opaque;
1312     Error *local_err = NULL;
1313     int ret;
1314 
1315     trace_multifd_recv_thread_start(p->id);
1316     rcu_register_thread();
1317 
1318     while (true) {
1319         uint32_t used;
1320         uint32_t flags;
1321 
1322         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1323                                        p->packet_len, &local_err);
1324         if (ret == 0) {   /* EOF */
1325             break;
1326         }
1327         if (ret == -1) {   /* Error */
1328             break;
1329         }
1330 
1331         qemu_mutex_lock(&p->mutex);
1332         ret = multifd_recv_unfill_packet(p, &local_err);
1333         if (ret) {
1334             qemu_mutex_unlock(&p->mutex);
1335             break;
1336         }
1337 
1338         used = p->pages->used;
1339         flags = p->flags;
1340         trace_multifd_recv(p->id, p->packet_num, used, flags,
1341                            p->next_packet_size);
1342         p->num_packets++;
1343         p->num_pages += used;
1344         qemu_mutex_unlock(&p->mutex);
1345 
1346         if (used) {
1347             ret = qio_channel_readv_all(p->c, p->pages->iov,
1348                                         used, &local_err);
1349             if (ret != 0) {
1350                 break;
1351             }
1352         }
1353 
1354         if (flags & MULTIFD_FLAG_SYNC) {
1355             qemu_sem_post(&multifd_recv_state->sem_sync);
1356             qemu_sem_wait(&p->sem_sync);
1357         }
1358     }
1359 
1360     if (local_err) {
1361         multifd_recv_terminate_threads(local_err);
1362     }
1363     qemu_mutex_lock(&p->mutex);
1364     p->running = false;
1365     qemu_mutex_unlock(&p->mutex);
1366 
1367     rcu_unregister_thread();
1368     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1369 
1370     return NULL;
1371 }
1372 
1373 int multifd_load_setup(void)
1374 {
1375     int thread_count;
1376     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1377     uint8_t i;
1378 
1379     if (!migrate_use_multifd()) {
1380         return 0;
1381     }
1382     thread_count = migrate_multifd_channels();
1383     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1384     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1385     atomic_set(&multifd_recv_state->count, 0);
1386     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1387 
1388     for (i = 0; i < thread_count; i++) {
1389         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1390 
1391         qemu_mutex_init(&p->mutex);
1392         qemu_sem_init(&p->sem_sync, 0);
1393         p->id = i;
1394         p->pages = multifd_pages_init(page_count);
1395         p->packet_len = sizeof(MultiFDPacket_t)
1396                       + sizeof(ram_addr_t) * page_count;
1397         p->packet = g_malloc0(p->packet_len);
1398         p->name = g_strdup_printf("multifdrecv_%d", i);
1399     }
1400     return 0;
1401 }
1402 
1403 bool multifd_recv_all_channels_created(void)
1404 {
1405     int thread_count = migrate_multifd_channels();
1406 
1407     if (!migrate_use_multifd()) {
1408         return true;
1409     }
1410 
1411     return thread_count == atomic_read(&multifd_recv_state->count);
1412 }
1413 
1414 /*
1415  * Try to receive all multifd channels to get ready for the migration.
1416  * - Return true and do not set @errp when correctly receving all channels;
1417  * - Return false and do not set @errp when correctly receiving the current one;
1418  * - Return false and set @errp when failing to receive the current channel.
1419  */
1420 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1421 {
1422     MultiFDRecvParams *p;
1423     Error *local_err = NULL;
1424     int id;
1425 
1426     id = multifd_recv_initial_packet(ioc, &local_err);
1427     if (id < 0) {
1428         multifd_recv_terminate_threads(local_err);
1429         error_propagate_prepend(errp, local_err,
1430                                 "failed to receive packet"
1431                                 " via multifd channel %d: ",
1432                                 atomic_read(&multifd_recv_state->count));
1433         return false;
1434     }
1435 
1436     p = &multifd_recv_state->params[id];
1437     if (p->c != NULL) {
1438         error_setg(&local_err, "multifd: received id '%d' already setup'",
1439                    id);
1440         multifd_recv_terminate_threads(local_err);
1441         error_propagate(errp, local_err);
1442         return false;
1443     }
1444     p->c = ioc;
1445     object_ref(OBJECT(ioc));
1446     /* initial packet */
1447     p->num_packets = 1;
1448 
1449     p->running = true;
1450     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1451                        QEMU_THREAD_JOINABLE);
1452     atomic_inc(&multifd_recv_state->count);
1453     return atomic_read(&multifd_recv_state->count) ==
1454            migrate_multifd_channels();
1455 }
1456 
1457 /**
1458  * save_page_header: write page header to wire
1459  *
1460  * If this is the 1st block, it also writes the block identification
1461  *
1462  * Returns the number of bytes written
1463  *
1464  * @f: QEMUFile where to send the data
1465  * @block: block that contains the page we want to send
1466  * @offset: offset inside the block for the page
1467  *          in the lower bits, it contains flags
1468  */
1469 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1470                                ram_addr_t offset)
1471 {
1472     size_t size, len;
1473 
1474     if (block == rs->last_sent_block) {
1475         offset |= RAM_SAVE_FLAG_CONTINUE;
1476     }
1477     qemu_put_be64(f, offset);
1478     size = 8;
1479 
1480     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1481         len = strlen(block->idstr);
1482         qemu_put_byte(f, len);
1483         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1484         size += 1 + len;
1485         rs->last_sent_block = block;
1486     }
1487     return size;
1488 }
1489 
1490 /**
1491  * mig_throttle_guest_down: throotle down the guest
1492  *
1493  * Reduce amount of guest cpu execution to hopefully slow down memory
1494  * writes. If guest dirty memory rate is reduced below the rate at
1495  * which we can transfer pages to the destination then we should be
1496  * able to complete migration. Some workloads dirty memory way too
1497  * fast and will not effectively converge, even with auto-converge.
1498  */
1499 static void mig_throttle_guest_down(void)
1500 {
1501     MigrationState *s = migrate_get_current();
1502     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1503     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1504     int pct_max = s->parameters.max_cpu_throttle;
1505 
1506     /* We have not started throttling yet. Let's start it. */
1507     if (!cpu_throttle_active()) {
1508         cpu_throttle_set(pct_initial);
1509     } else {
1510         /* Throttling already on, just increase the rate */
1511         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1512                          pct_max));
1513     }
1514 }
1515 
1516 /**
1517  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1518  *
1519  * @rs: current RAM state
1520  * @current_addr: address for the zero page
1521  *
1522  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1523  * The important thing is that a stale (not-yet-0'd) page be replaced
1524  * by the new data.
1525  * As a bonus, if the page wasn't in the cache it gets added so that
1526  * when a small write is made into the 0'd page it gets XBZRLE sent.
1527  */
1528 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1529 {
1530     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1531         return;
1532     }
1533 
1534     /* We don't care if this fails to allocate a new cache page
1535      * as long as it updated an old one */
1536     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1537                  ram_counters.dirty_sync_count);
1538 }
1539 
1540 #define ENCODING_FLAG_XBZRLE 0x1
1541 
1542 /**
1543  * save_xbzrle_page: compress and send current page
1544  *
1545  * Returns: 1 means that we wrote the page
1546  *          0 means that page is identical to the one already sent
1547  *          -1 means that xbzrle would be longer than normal
1548  *
1549  * @rs: current RAM state
1550  * @current_data: pointer to the address of the page contents
1551  * @current_addr: addr of the page
1552  * @block: block that contains the page we want to send
1553  * @offset: offset inside the block for the page
1554  * @last_stage: if we are at the completion stage
1555  */
1556 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1557                             ram_addr_t current_addr, RAMBlock *block,
1558                             ram_addr_t offset, bool last_stage)
1559 {
1560     int encoded_len = 0, bytes_xbzrle;
1561     uint8_t *prev_cached_page;
1562 
1563     if (!cache_is_cached(XBZRLE.cache, current_addr,
1564                          ram_counters.dirty_sync_count)) {
1565         xbzrle_counters.cache_miss++;
1566         if (!last_stage) {
1567             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1568                              ram_counters.dirty_sync_count) == -1) {
1569                 return -1;
1570             } else {
1571                 /* update *current_data when the page has been
1572                    inserted into cache */
1573                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1574             }
1575         }
1576         return -1;
1577     }
1578 
1579     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1580 
1581     /* save current buffer into memory */
1582     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1583 
1584     /* XBZRLE encoding (if there is no overflow) */
1585     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1586                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1587                                        TARGET_PAGE_SIZE);
1588 
1589     /*
1590      * Update the cache contents, so that it corresponds to the data
1591      * sent, in all cases except where we skip the page.
1592      */
1593     if (!last_stage && encoded_len != 0) {
1594         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1595         /*
1596          * In the case where we couldn't compress, ensure that the caller
1597          * sends the data from the cache, since the guest might have
1598          * changed the RAM since we copied it.
1599          */
1600         *current_data = prev_cached_page;
1601     }
1602 
1603     if (encoded_len == 0) {
1604         trace_save_xbzrle_page_skipping();
1605         return 0;
1606     } else if (encoded_len == -1) {
1607         trace_save_xbzrle_page_overflow();
1608         xbzrle_counters.overflow++;
1609         return -1;
1610     }
1611 
1612     /* Send XBZRLE based compressed page */
1613     bytes_xbzrle = save_page_header(rs, rs->f, block,
1614                                     offset | RAM_SAVE_FLAG_XBZRLE);
1615     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1616     qemu_put_be16(rs->f, encoded_len);
1617     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1618     bytes_xbzrle += encoded_len + 1 + 2;
1619     xbzrle_counters.pages++;
1620     xbzrle_counters.bytes += bytes_xbzrle;
1621     ram_counters.transferred += bytes_xbzrle;
1622 
1623     return 1;
1624 }
1625 
1626 /**
1627  * migration_bitmap_find_dirty: find the next dirty page from start
1628  *
1629  * Returns the page offset within memory region of the start of a dirty page
1630  *
1631  * @rs: current RAM state
1632  * @rb: RAMBlock where to search for dirty pages
1633  * @start: page where we start the search
1634  */
1635 static inline
1636 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1637                                           unsigned long start)
1638 {
1639     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1640     unsigned long *bitmap = rb->bmap;
1641     unsigned long next;
1642 
1643     if (ramblock_is_ignored(rb)) {
1644         return size;
1645     }
1646 
1647     /*
1648      * When the free page optimization is enabled, we need to check the bitmap
1649      * to send the non-free pages rather than all the pages in the bulk stage.
1650      */
1651     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1652         next = start + 1;
1653     } else {
1654         next = find_next_bit(bitmap, size, start);
1655     }
1656 
1657     return next;
1658 }
1659 
1660 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1661                                                 RAMBlock *rb,
1662                                                 unsigned long page)
1663 {
1664     bool ret;
1665 
1666     qemu_mutex_lock(&rs->bitmap_mutex);
1667     ret = test_and_clear_bit(page, rb->bmap);
1668 
1669     if (ret) {
1670         rs->migration_dirty_pages--;
1671     }
1672     qemu_mutex_unlock(&rs->bitmap_mutex);
1673 
1674     return ret;
1675 }
1676 
1677 /* Called with RCU critical section */
1678 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1679                                         ram_addr_t length)
1680 {
1681     rs->migration_dirty_pages +=
1682         cpu_physical_memory_sync_dirty_bitmap(rb, 0, length,
1683                                               &rs->num_dirty_pages_period);
1684 }
1685 
1686 /**
1687  * ram_pagesize_summary: calculate all the pagesizes of a VM
1688  *
1689  * Returns a summary bitmap of the page sizes of all RAMBlocks
1690  *
1691  * For VMs with just normal pages this is equivalent to the host page
1692  * size. If it's got some huge pages then it's the OR of all the
1693  * different page sizes.
1694  */
1695 uint64_t ram_pagesize_summary(void)
1696 {
1697     RAMBlock *block;
1698     uint64_t summary = 0;
1699 
1700     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1701         summary |= block->page_size;
1702     }
1703 
1704     return summary;
1705 }
1706 
1707 uint64_t ram_get_total_transferred_pages(void)
1708 {
1709     return  ram_counters.normal + ram_counters.duplicate +
1710                 compression_counters.pages + xbzrle_counters.pages;
1711 }
1712 
1713 static void migration_update_rates(RAMState *rs, int64_t end_time)
1714 {
1715     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1716     double compressed_size;
1717 
1718     /* calculate period counters */
1719     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1720                 / (end_time - rs->time_last_bitmap_sync);
1721 
1722     if (!page_count) {
1723         return;
1724     }
1725 
1726     if (migrate_use_xbzrle()) {
1727         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1728             rs->xbzrle_cache_miss_prev) / page_count;
1729         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1730     }
1731 
1732     if (migrate_use_compression()) {
1733         compression_counters.busy_rate = (double)(compression_counters.busy -
1734             rs->compress_thread_busy_prev) / page_count;
1735         rs->compress_thread_busy_prev = compression_counters.busy;
1736 
1737         compressed_size = compression_counters.compressed_size -
1738                           rs->compressed_size_prev;
1739         if (compressed_size) {
1740             double uncompressed_size = (compression_counters.pages -
1741                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1742 
1743             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1744             compression_counters.compression_rate =
1745                                         uncompressed_size / compressed_size;
1746 
1747             rs->compress_pages_prev = compression_counters.pages;
1748             rs->compressed_size_prev = compression_counters.compressed_size;
1749         }
1750     }
1751 }
1752 
1753 static void migration_bitmap_sync(RAMState *rs)
1754 {
1755     RAMBlock *block;
1756     int64_t end_time;
1757     uint64_t bytes_xfer_now;
1758 
1759     ram_counters.dirty_sync_count++;
1760 
1761     if (!rs->time_last_bitmap_sync) {
1762         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1763     }
1764 
1765     trace_migration_bitmap_sync_start();
1766     memory_global_dirty_log_sync();
1767 
1768     qemu_mutex_lock(&rs->bitmap_mutex);
1769     rcu_read_lock();
1770     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1771         migration_bitmap_sync_range(rs, block, block->used_length);
1772     }
1773     ram_counters.remaining = ram_bytes_remaining();
1774     rcu_read_unlock();
1775     qemu_mutex_unlock(&rs->bitmap_mutex);
1776 
1777     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1778 
1779     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1780 
1781     /* more than 1 second = 1000 millisecons */
1782     if (end_time > rs->time_last_bitmap_sync + 1000) {
1783         bytes_xfer_now = ram_counters.transferred;
1784 
1785         /* During block migration the auto-converge logic incorrectly detects
1786          * that ram migration makes no progress. Avoid this by disabling the
1787          * throttling logic during the bulk phase of block migration. */
1788         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1789             /* The following detection logic can be refined later. For now:
1790                Check to see if the dirtied bytes is 50% more than the approx.
1791                amount of bytes that just got transferred since the last time we
1792                were in this routine. If that happens twice, start or increase
1793                throttling */
1794 
1795             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1796                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1797                 (++rs->dirty_rate_high_cnt >= 2)) {
1798                     trace_migration_throttle();
1799                     rs->dirty_rate_high_cnt = 0;
1800                     mig_throttle_guest_down();
1801             }
1802         }
1803 
1804         migration_update_rates(rs, end_time);
1805 
1806         rs->target_page_count_prev = rs->target_page_count;
1807 
1808         /* reset period counters */
1809         rs->time_last_bitmap_sync = end_time;
1810         rs->num_dirty_pages_period = 0;
1811         rs->bytes_xfer_prev = bytes_xfer_now;
1812     }
1813     if (migrate_use_events()) {
1814         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1815     }
1816 }
1817 
1818 static void migration_bitmap_sync_precopy(RAMState *rs)
1819 {
1820     Error *local_err = NULL;
1821 
1822     /*
1823      * The current notifier usage is just an optimization to migration, so we
1824      * don't stop the normal migration process in the error case.
1825      */
1826     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1827         error_report_err(local_err);
1828     }
1829 
1830     migration_bitmap_sync(rs);
1831 
1832     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1833         error_report_err(local_err);
1834     }
1835 }
1836 
1837 /**
1838  * save_zero_page_to_file: send the zero page to the file
1839  *
1840  * Returns the size of data written to the file, 0 means the page is not
1841  * a zero page
1842  *
1843  * @rs: current RAM state
1844  * @file: the file where the data is saved
1845  * @block: block that contains the page we want to send
1846  * @offset: offset inside the block for the page
1847  */
1848 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1849                                   RAMBlock *block, ram_addr_t offset)
1850 {
1851     uint8_t *p = block->host + offset;
1852     int len = 0;
1853 
1854     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1855         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1856         qemu_put_byte(file, 0);
1857         len += 1;
1858     }
1859     return len;
1860 }
1861 
1862 /**
1863  * save_zero_page: send the zero page to the stream
1864  *
1865  * Returns the number of pages written.
1866  *
1867  * @rs: current RAM state
1868  * @block: block that contains the page we want to send
1869  * @offset: offset inside the block for the page
1870  */
1871 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1872 {
1873     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1874 
1875     if (len) {
1876         ram_counters.duplicate++;
1877         ram_counters.transferred += len;
1878         return 1;
1879     }
1880     return -1;
1881 }
1882 
1883 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1884 {
1885     if (!migrate_release_ram() || !migration_in_postcopy()) {
1886         return;
1887     }
1888 
1889     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1890 }
1891 
1892 /*
1893  * @pages: the number of pages written by the control path,
1894  *        < 0 - error
1895  *        > 0 - number of pages written
1896  *
1897  * Return true if the pages has been saved, otherwise false is returned.
1898  */
1899 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1900                               int *pages)
1901 {
1902     uint64_t bytes_xmit = 0;
1903     int ret;
1904 
1905     *pages = -1;
1906     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1907                                 &bytes_xmit);
1908     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1909         return false;
1910     }
1911 
1912     if (bytes_xmit) {
1913         ram_counters.transferred += bytes_xmit;
1914         *pages = 1;
1915     }
1916 
1917     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1918         return true;
1919     }
1920 
1921     if (bytes_xmit > 0) {
1922         ram_counters.normal++;
1923     } else if (bytes_xmit == 0) {
1924         ram_counters.duplicate++;
1925     }
1926 
1927     return true;
1928 }
1929 
1930 /*
1931  * directly send the page to the stream
1932  *
1933  * Returns the number of pages written.
1934  *
1935  * @rs: current RAM state
1936  * @block: block that contains the page we want to send
1937  * @offset: offset inside the block for the page
1938  * @buf: the page to be sent
1939  * @async: send to page asyncly
1940  */
1941 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1942                             uint8_t *buf, bool async)
1943 {
1944     ram_counters.transferred += save_page_header(rs, rs->f, block,
1945                                                  offset | RAM_SAVE_FLAG_PAGE);
1946     if (async) {
1947         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1948                               migrate_release_ram() &
1949                               migration_in_postcopy());
1950     } else {
1951         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1952     }
1953     ram_counters.transferred += TARGET_PAGE_SIZE;
1954     ram_counters.normal++;
1955     return 1;
1956 }
1957 
1958 /**
1959  * ram_save_page: send the given page to the stream
1960  *
1961  * Returns the number of pages written.
1962  *          < 0 - error
1963  *          >=0 - Number of pages written - this might legally be 0
1964  *                if xbzrle noticed the page was the same.
1965  *
1966  * @rs: current RAM state
1967  * @block: block that contains the page we want to send
1968  * @offset: offset inside the block for the page
1969  * @last_stage: if we are at the completion stage
1970  */
1971 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1972 {
1973     int pages = -1;
1974     uint8_t *p;
1975     bool send_async = true;
1976     RAMBlock *block = pss->block;
1977     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1978     ram_addr_t current_addr = block->offset + offset;
1979 
1980     p = block->host + offset;
1981     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1982 
1983     XBZRLE_cache_lock();
1984     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1985         migrate_use_xbzrle()) {
1986         pages = save_xbzrle_page(rs, &p, current_addr, block,
1987                                  offset, last_stage);
1988         if (!last_stage) {
1989             /* Can't send this cached data async, since the cache page
1990              * might get updated before it gets to the wire
1991              */
1992             send_async = false;
1993         }
1994     }
1995 
1996     /* XBZRLE overflow or normal page */
1997     if (pages == -1) {
1998         pages = save_normal_page(rs, block, offset, p, send_async);
1999     }
2000 
2001     XBZRLE_cache_unlock();
2002 
2003     return pages;
2004 }
2005 
2006 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2007                                  ram_addr_t offset)
2008 {
2009     multifd_queue_page(block, offset);
2010     ram_counters.normal++;
2011 
2012     return 1;
2013 }
2014 
2015 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2016                                  ram_addr_t offset, uint8_t *source_buf)
2017 {
2018     RAMState *rs = ram_state;
2019     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2020     bool zero_page = false;
2021     int ret;
2022 
2023     if (save_zero_page_to_file(rs, f, block, offset)) {
2024         zero_page = true;
2025         goto exit;
2026     }
2027 
2028     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2029 
2030     /*
2031      * copy it to a internal buffer to avoid it being modified by VM
2032      * so that we can catch up the error during compression and
2033      * decompression
2034      */
2035     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2036     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2037     if (ret < 0) {
2038         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2039         error_report("compressed data failed!");
2040         return false;
2041     }
2042 
2043 exit:
2044     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2045     return zero_page;
2046 }
2047 
2048 static void
2049 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2050 {
2051     ram_counters.transferred += bytes_xmit;
2052 
2053     if (param->zero_page) {
2054         ram_counters.duplicate++;
2055         return;
2056     }
2057 
2058     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2059     compression_counters.compressed_size += bytes_xmit - 8;
2060     compression_counters.pages++;
2061 }
2062 
2063 static bool save_page_use_compression(RAMState *rs);
2064 
2065 static void flush_compressed_data(RAMState *rs)
2066 {
2067     int idx, len, thread_count;
2068 
2069     if (!save_page_use_compression(rs)) {
2070         return;
2071     }
2072     thread_count = migrate_compress_threads();
2073 
2074     qemu_mutex_lock(&comp_done_lock);
2075     for (idx = 0; idx < thread_count; idx++) {
2076         while (!comp_param[idx].done) {
2077             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2078         }
2079     }
2080     qemu_mutex_unlock(&comp_done_lock);
2081 
2082     for (idx = 0; idx < thread_count; idx++) {
2083         qemu_mutex_lock(&comp_param[idx].mutex);
2084         if (!comp_param[idx].quit) {
2085             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2086             /*
2087              * it's safe to fetch zero_page without holding comp_done_lock
2088              * as there is no further request submitted to the thread,
2089              * i.e, the thread should be waiting for a request at this point.
2090              */
2091             update_compress_thread_counts(&comp_param[idx], len);
2092         }
2093         qemu_mutex_unlock(&comp_param[idx].mutex);
2094     }
2095 }
2096 
2097 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2098                                        ram_addr_t offset)
2099 {
2100     param->block = block;
2101     param->offset = offset;
2102 }
2103 
2104 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2105                                            ram_addr_t offset)
2106 {
2107     int idx, thread_count, bytes_xmit = -1, pages = -1;
2108     bool wait = migrate_compress_wait_thread();
2109 
2110     thread_count = migrate_compress_threads();
2111     qemu_mutex_lock(&comp_done_lock);
2112 retry:
2113     for (idx = 0; idx < thread_count; idx++) {
2114         if (comp_param[idx].done) {
2115             comp_param[idx].done = false;
2116             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2117             qemu_mutex_lock(&comp_param[idx].mutex);
2118             set_compress_params(&comp_param[idx], block, offset);
2119             qemu_cond_signal(&comp_param[idx].cond);
2120             qemu_mutex_unlock(&comp_param[idx].mutex);
2121             pages = 1;
2122             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2123             break;
2124         }
2125     }
2126 
2127     /*
2128      * wait for the free thread if the user specifies 'compress-wait-thread',
2129      * otherwise we will post the page out in the main thread as normal page.
2130      */
2131     if (pages < 0 && wait) {
2132         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2133         goto retry;
2134     }
2135     qemu_mutex_unlock(&comp_done_lock);
2136 
2137     return pages;
2138 }
2139 
2140 /**
2141  * find_dirty_block: find the next dirty page and update any state
2142  * associated with the search process.
2143  *
2144  * Returns true if a page is found
2145  *
2146  * @rs: current RAM state
2147  * @pss: data about the state of the current dirty page scan
2148  * @again: set to false if the search has scanned the whole of RAM
2149  */
2150 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2151 {
2152     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2153     if (pss->complete_round && pss->block == rs->last_seen_block &&
2154         pss->page >= rs->last_page) {
2155         /*
2156          * We've been once around the RAM and haven't found anything.
2157          * Give up.
2158          */
2159         *again = false;
2160         return false;
2161     }
2162     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2163         /* Didn't find anything in this RAM Block */
2164         pss->page = 0;
2165         pss->block = QLIST_NEXT_RCU(pss->block, next);
2166         if (!pss->block) {
2167             /*
2168              * If memory migration starts over, we will meet a dirtied page
2169              * which may still exists in compression threads's ring, so we
2170              * should flush the compressed data to make sure the new page
2171              * is not overwritten by the old one in the destination.
2172              *
2173              * Also If xbzrle is on, stop using the data compression at this
2174              * point. In theory, xbzrle can do better than compression.
2175              */
2176             flush_compressed_data(rs);
2177 
2178             /* Hit the end of the list */
2179             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2180             /* Flag that we've looped */
2181             pss->complete_round = true;
2182             rs->ram_bulk_stage = false;
2183         }
2184         /* Didn't find anything this time, but try again on the new block */
2185         *again = true;
2186         return false;
2187     } else {
2188         /* Can go around again, but... */
2189         *again = true;
2190         /* We've found something so probably don't need to */
2191         return true;
2192     }
2193 }
2194 
2195 /**
2196  * unqueue_page: gets a page of the queue
2197  *
2198  * Helper for 'get_queued_page' - gets a page off the queue
2199  *
2200  * Returns the block of the page (or NULL if none available)
2201  *
2202  * @rs: current RAM state
2203  * @offset: used to return the offset within the RAMBlock
2204  */
2205 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2206 {
2207     RAMBlock *block = NULL;
2208 
2209     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2210         return NULL;
2211     }
2212 
2213     qemu_mutex_lock(&rs->src_page_req_mutex);
2214     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2215         struct RAMSrcPageRequest *entry =
2216                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2217         block = entry->rb;
2218         *offset = entry->offset;
2219 
2220         if (entry->len > TARGET_PAGE_SIZE) {
2221             entry->len -= TARGET_PAGE_SIZE;
2222             entry->offset += TARGET_PAGE_SIZE;
2223         } else {
2224             memory_region_unref(block->mr);
2225             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2226             g_free(entry);
2227             migration_consume_urgent_request();
2228         }
2229     }
2230     qemu_mutex_unlock(&rs->src_page_req_mutex);
2231 
2232     return block;
2233 }
2234 
2235 /**
2236  * get_queued_page: unqueue a page from the postcopy requests
2237  *
2238  * Skips pages that are already sent (!dirty)
2239  *
2240  * Returns true if a queued page is found
2241  *
2242  * @rs: current RAM state
2243  * @pss: data about the state of the current dirty page scan
2244  */
2245 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2246 {
2247     RAMBlock  *block;
2248     ram_addr_t offset;
2249     bool dirty;
2250 
2251     do {
2252         block = unqueue_page(rs, &offset);
2253         /*
2254          * We're sending this page, and since it's postcopy nothing else
2255          * will dirty it, and we must make sure it doesn't get sent again
2256          * even if this queue request was received after the background
2257          * search already sent it.
2258          */
2259         if (block) {
2260             unsigned long page;
2261 
2262             page = offset >> TARGET_PAGE_BITS;
2263             dirty = test_bit(page, block->bmap);
2264             if (!dirty) {
2265                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2266                        page, test_bit(page, block->unsentmap));
2267             } else {
2268                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2269             }
2270         }
2271 
2272     } while (block && !dirty);
2273 
2274     if (block) {
2275         /*
2276          * As soon as we start servicing pages out of order, then we have
2277          * to kill the bulk stage, since the bulk stage assumes
2278          * in (migration_bitmap_find_and_reset_dirty) that every page is
2279          * dirty, that's no longer true.
2280          */
2281         rs->ram_bulk_stage = false;
2282 
2283         /*
2284          * We want the background search to continue from the queued page
2285          * since the guest is likely to want other pages near to the page
2286          * it just requested.
2287          */
2288         pss->block = block;
2289         pss->page = offset >> TARGET_PAGE_BITS;
2290 
2291         /*
2292          * This unqueued page would break the "one round" check, even is
2293          * really rare.
2294          */
2295         pss->complete_round = false;
2296     }
2297 
2298     return !!block;
2299 }
2300 
2301 /**
2302  * migration_page_queue_free: drop any remaining pages in the ram
2303  * request queue
2304  *
2305  * It should be empty at the end anyway, but in error cases there may
2306  * be some left.  in case that there is any page left, we drop it.
2307  *
2308  */
2309 static void migration_page_queue_free(RAMState *rs)
2310 {
2311     struct RAMSrcPageRequest *mspr, *next_mspr;
2312     /* This queue generally should be empty - but in the case of a failed
2313      * migration might have some droppings in.
2314      */
2315     rcu_read_lock();
2316     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2317         memory_region_unref(mspr->rb->mr);
2318         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2319         g_free(mspr);
2320     }
2321     rcu_read_unlock();
2322 }
2323 
2324 /**
2325  * ram_save_queue_pages: queue the page for transmission
2326  *
2327  * A request from postcopy destination for example.
2328  *
2329  * Returns zero on success or negative on error
2330  *
2331  * @rbname: Name of the RAMBLock of the request. NULL means the
2332  *          same that last one.
2333  * @start: starting address from the start of the RAMBlock
2334  * @len: length (in bytes) to send
2335  */
2336 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2337 {
2338     RAMBlock *ramblock;
2339     RAMState *rs = ram_state;
2340 
2341     ram_counters.postcopy_requests++;
2342     rcu_read_lock();
2343     if (!rbname) {
2344         /* Reuse last RAMBlock */
2345         ramblock = rs->last_req_rb;
2346 
2347         if (!ramblock) {
2348             /*
2349              * Shouldn't happen, we can't reuse the last RAMBlock if
2350              * it's the 1st request.
2351              */
2352             error_report("ram_save_queue_pages no previous block");
2353             goto err;
2354         }
2355     } else {
2356         ramblock = qemu_ram_block_by_name(rbname);
2357 
2358         if (!ramblock) {
2359             /* We shouldn't be asked for a non-existent RAMBlock */
2360             error_report("ram_save_queue_pages no block '%s'", rbname);
2361             goto err;
2362         }
2363         rs->last_req_rb = ramblock;
2364     }
2365     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2366     if (start+len > ramblock->used_length) {
2367         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2368                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2369                      __func__, start, len, ramblock->used_length);
2370         goto err;
2371     }
2372 
2373     struct RAMSrcPageRequest *new_entry =
2374         g_malloc0(sizeof(struct RAMSrcPageRequest));
2375     new_entry->rb = ramblock;
2376     new_entry->offset = start;
2377     new_entry->len = len;
2378 
2379     memory_region_ref(ramblock->mr);
2380     qemu_mutex_lock(&rs->src_page_req_mutex);
2381     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2382     migration_make_urgent_request();
2383     qemu_mutex_unlock(&rs->src_page_req_mutex);
2384     rcu_read_unlock();
2385 
2386     return 0;
2387 
2388 err:
2389     rcu_read_unlock();
2390     return -1;
2391 }
2392 
2393 static bool save_page_use_compression(RAMState *rs)
2394 {
2395     if (!migrate_use_compression()) {
2396         return false;
2397     }
2398 
2399     /*
2400      * If xbzrle is on, stop using the data compression after first
2401      * round of migration even if compression is enabled. In theory,
2402      * xbzrle can do better than compression.
2403      */
2404     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2405         return true;
2406     }
2407 
2408     return false;
2409 }
2410 
2411 /*
2412  * try to compress the page before posting it out, return true if the page
2413  * has been properly handled by compression, otherwise needs other
2414  * paths to handle it
2415  */
2416 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2417 {
2418     if (!save_page_use_compression(rs)) {
2419         return false;
2420     }
2421 
2422     /*
2423      * When starting the process of a new block, the first page of
2424      * the block should be sent out before other pages in the same
2425      * block, and all the pages in last block should have been sent
2426      * out, keeping this order is important, because the 'cont' flag
2427      * is used to avoid resending the block name.
2428      *
2429      * We post the fist page as normal page as compression will take
2430      * much CPU resource.
2431      */
2432     if (block != rs->last_sent_block) {
2433         flush_compressed_data(rs);
2434         return false;
2435     }
2436 
2437     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2438         return true;
2439     }
2440 
2441     compression_counters.busy++;
2442     return false;
2443 }
2444 
2445 /**
2446  * ram_save_target_page: save one target page
2447  *
2448  * Returns the number of pages written
2449  *
2450  * @rs: current RAM state
2451  * @pss: data about the page we want to send
2452  * @last_stage: if we are at the completion stage
2453  */
2454 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2455                                 bool last_stage)
2456 {
2457     RAMBlock *block = pss->block;
2458     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2459     int res;
2460 
2461     if (control_save_page(rs, block, offset, &res)) {
2462         return res;
2463     }
2464 
2465     if (save_compress_page(rs, block, offset)) {
2466         return 1;
2467     }
2468 
2469     res = save_zero_page(rs, block, offset);
2470     if (res > 0) {
2471         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2472          * page would be stale
2473          */
2474         if (!save_page_use_compression(rs)) {
2475             XBZRLE_cache_lock();
2476             xbzrle_cache_zero_page(rs, block->offset + offset);
2477             XBZRLE_cache_unlock();
2478         }
2479         ram_release_pages(block->idstr, offset, res);
2480         return res;
2481     }
2482 
2483     /*
2484      * do not use multifd for compression as the first page in the new
2485      * block should be posted out before sending the compressed page
2486      */
2487     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2488         return ram_save_multifd_page(rs, block, offset);
2489     }
2490 
2491     return ram_save_page(rs, pss, last_stage);
2492 }
2493 
2494 /**
2495  * ram_save_host_page: save a whole host page
2496  *
2497  * Starting at *offset send pages up to the end of the current host
2498  * page. It's valid for the initial offset to point into the middle of
2499  * a host page in which case the remainder of the hostpage is sent.
2500  * Only dirty target pages are sent. Note that the host page size may
2501  * be a huge page for this block.
2502  * The saving stops at the boundary of the used_length of the block
2503  * if the RAMBlock isn't a multiple of the host page size.
2504  *
2505  * Returns the number of pages written or negative on error
2506  *
2507  * @rs: current RAM state
2508  * @ms: current migration state
2509  * @pss: data about the page we want to send
2510  * @last_stage: if we are at the completion stage
2511  */
2512 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2513                               bool last_stage)
2514 {
2515     int tmppages, pages = 0;
2516     size_t pagesize_bits =
2517         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2518 
2519     if (ramblock_is_ignored(pss->block)) {
2520         error_report("block %s should not be migrated !", pss->block->idstr);
2521         return 0;
2522     }
2523 
2524     do {
2525         /* Check the pages is dirty and if it is send it */
2526         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2527             pss->page++;
2528             continue;
2529         }
2530 
2531         tmppages = ram_save_target_page(rs, pss, last_stage);
2532         if (tmppages < 0) {
2533             return tmppages;
2534         }
2535 
2536         pages += tmppages;
2537         if (pss->block->unsentmap) {
2538             clear_bit(pss->page, pss->block->unsentmap);
2539         }
2540 
2541         pss->page++;
2542     } while ((pss->page & (pagesize_bits - 1)) &&
2543              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2544 
2545     /* The offset we leave with is the last one we looked at */
2546     pss->page--;
2547     return pages;
2548 }
2549 
2550 /**
2551  * ram_find_and_save_block: finds a dirty page and sends it to f
2552  *
2553  * Called within an RCU critical section.
2554  *
2555  * Returns the number of pages written where zero means no dirty pages,
2556  * or negative on error
2557  *
2558  * @rs: current RAM state
2559  * @last_stage: if we are at the completion stage
2560  *
2561  * On systems where host-page-size > target-page-size it will send all the
2562  * pages in a host page that are dirty.
2563  */
2564 
2565 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2566 {
2567     PageSearchStatus pss;
2568     int pages = 0;
2569     bool again, found;
2570 
2571     /* No dirty page as there is zero RAM */
2572     if (!ram_bytes_total()) {
2573         return pages;
2574     }
2575 
2576     pss.block = rs->last_seen_block;
2577     pss.page = rs->last_page;
2578     pss.complete_round = false;
2579 
2580     if (!pss.block) {
2581         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2582     }
2583 
2584     do {
2585         again = true;
2586         found = get_queued_page(rs, &pss);
2587 
2588         if (!found) {
2589             /* priority queue empty, so just search for something dirty */
2590             found = find_dirty_block(rs, &pss, &again);
2591         }
2592 
2593         if (found) {
2594             pages = ram_save_host_page(rs, &pss, last_stage);
2595         }
2596     } while (!pages && again);
2597 
2598     rs->last_seen_block = pss.block;
2599     rs->last_page = pss.page;
2600 
2601     return pages;
2602 }
2603 
2604 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2605 {
2606     uint64_t pages = size / TARGET_PAGE_SIZE;
2607 
2608     if (zero) {
2609         ram_counters.duplicate += pages;
2610     } else {
2611         ram_counters.normal += pages;
2612         ram_counters.transferred += size;
2613         qemu_update_position(f, size);
2614     }
2615 }
2616 
2617 static uint64_t ram_bytes_total_common(bool count_ignored)
2618 {
2619     RAMBlock *block;
2620     uint64_t total = 0;
2621 
2622     rcu_read_lock();
2623     if (count_ignored) {
2624         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2625             total += block->used_length;
2626         }
2627     } else {
2628         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2629             total += block->used_length;
2630         }
2631     }
2632     rcu_read_unlock();
2633     return total;
2634 }
2635 
2636 uint64_t ram_bytes_total(void)
2637 {
2638     return ram_bytes_total_common(false);
2639 }
2640 
2641 static void xbzrle_load_setup(void)
2642 {
2643     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2644 }
2645 
2646 static void xbzrle_load_cleanup(void)
2647 {
2648     g_free(XBZRLE.decoded_buf);
2649     XBZRLE.decoded_buf = NULL;
2650 }
2651 
2652 static void ram_state_cleanup(RAMState **rsp)
2653 {
2654     if (*rsp) {
2655         migration_page_queue_free(*rsp);
2656         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2657         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2658         g_free(*rsp);
2659         *rsp = NULL;
2660     }
2661 }
2662 
2663 static void xbzrle_cleanup(void)
2664 {
2665     XBZRLE_cache_lock();
2666     if (XBZRLE.cache) {
2667         cache_fini(XBZRLE.cache);
2668         g_free(XBZRLE.encoded_buf);
2669         g_free(XBZRLE.current_buf);
2670         g_free(XBZRLE.zero_target_page);
2671         XBZRLE.cache = NULL;
2672         XBZRLE.encoded_buf = NULL;
2673         XBZRLE.current_buf = NULL;
2674         XBZRLE.zero_target_page = NULL;
2675     }
2676     XBZRLE_cache_unlock();
2677 }
2678 
2679 static void ram_save_cleanup(void *opaque)
2680 {
2681     RAMState **rsp = opaque;
2682     RAMBlock *block;
2683 
2684     /* caller have hold iothread lock or is in a bh, so there is
2685      * no writing race against the migration bitmap
2686      */
2687     memory_global_dirty_log_stop();
2688 
2689     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2690         g_free(block->bmap);
2691         block->bmap = NULL;
2692         g_free(block->unsentmap);
2693         block->unsentmap = NULL;
2694     }
2695 
2696     xbzrle_cleanup();
2697     compress_threads_save_cleanup();
2698     ram_state_cleanup(rsp);
2699 }
2700 
2701 static void ram_state_reset(RAMState *rs)
2702 {
2703     rs->last_seen_block = NULL;
2704     rs->last_sent_block = NULL;
2705     rs->last_page = 0;
2706     rs->last_version = ram_list.version;
2707     rs->ram_bulk_stage = true;
2708     rs->fpo_enabled = false;
2709 }
2710 
2711 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2712 
2713 /*
2714  * 'expected' is the value you expect the bitmap mostly to be full
2715  * of; it won't bother printing lines that are all this value.
2716  * If 'todump' is null the migration bitmap is dumped.
2717  */
2718 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2719                            unsigned long pages)
2720 {
2721     int64_t cur;
2722     int64_t linelen = 128;
2723     char linebuf[129];
2724 
2725     for (cur = 0; cur < pages; cur += linelen) {
2726         int64_t curb;
2727         bool found = false;
2728         /*
2729          * Last line; catch the case where the line length
2730          * is longer than remaining ram
2731          */
2732         if (cur + linelen > pages) {
2733             linelen = pages - cur;
2734         }
2735         for (curb = 0; curb < linelen; curb++) {
2736             bool thisbit = test_bit(cur + curb, todump);
2737             linebuf[curb] = thisbit ? '1' : '.';
2738             found = found || (thisbit != expected);
2739         }
2740         if (found) {
2741             linebuf[curb] = '\0';
2742             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2743         }
2744     }
2745 }
2746 
2747 /* **** functions for postcopy ***** */
2748 
2749 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2750 {
2751     struct RAMBlock *block;
2752 
2753     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2754         unsigned long *bitmap = block->bmap;
2755         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2756         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2757 
2758         while (run_start < range) {
2759             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2760             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2761                               (run_end - run_start) << TARGET_PAGE_BITS);
2762             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2763         }
2764     }
2765 }
2766 
2767 /**
2768  * postcopy_send_discard_bm_ram: discard a RAMBlock
2769  *
2770  * Returns zero on success
2771  *
2772  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2773  * Note: At this point the 'unsentmap' is the processed bitmap combined
2774  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2775  *
2776  * @ms: current migration state
2777  * @pds: state for postcopy
2778  * @start: RAMBlock starting page
2779  * @length: RAMBlock size
2780  */
2781 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2782                                         PostcopyDiscardState *pds,
2783                                         RAMBlock *block)
2784 {
2785     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2786     unsigned long current;
2787     unsigned long *unsentmap = block->unsentmap;
2788 
2789     for (current = 0; current < end; ) {
2790         unsigned long one = find_next_bit(unsentmap, end, current);
2791 
2792         if (one <= end) {
2793             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2794             unsigned long discard_length;
2795 
2796             if (zero >= end) {
2797                 discard_length = end - one;
2798             } else {
2799                 discard_length = zero - one;
2800             }
2801             if (discard_length) {
2802                 postcopy_discard_send_range(ms, pds, one, discard_length);
2803             }
2804             current = one + discard_length;
2805         } else {
2806             current = one;
2807         }
2808     }
2809 
2810     return 0;
2811 }
2812 
2813 /**
2814  * postcopy_each_ram_send_discard: discard all RAMBlocks
2815  *
2816  * Returns 0 for success or negative for error
2817  *
2818  * Utility for the outgoing postcopy code.
2819  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2820  *   passing it bitmap indexes and name.
2821  * (qemu_ram_foreach_block ends up passing unscaled lengths
2822  *  which would mean postcopy code would have to deal with target page)
2823  *
2824  * @ms: current migration state
2825  */
2826 static int postcopy_each_ram_send_discard(MigrationState *ms)
2827 {
2828     struct RAMBlock *block;
2829     int ret;
2830 
2831     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2832         PostcopyDiscardState *pds =
2833             postcopy_discard_send_init(ms, block->idstr);
2834 
2835         /*
2836          * Postcopy sends chunks of bitmap over the wire, but it
2837          * just needs indexes at this point, avoids it having
2838          * target page specific code.
2839          */
2840         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2841         postcopy_discard_send_finish(ms, pds);
2842         if (ret) {
2843             return ret;
2844         }
2845     }
2846 
2847     return 0;
2848 }
2849 
2850 /**
2851  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2852  *
2853  * Helper for postcopy_chunk_hostpages; it's called twice to
2854  * canonicalize the two bitmaps, that are similar, but one is
2855  * inverted.
2856  *
2857  * Postcopy requires that all target pages in a hostpage are dirty or
2858  * clean, not a mix.  This function canonicalizes the bitmaps.
2859  *
2860  * @ms: current migration state
2861  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2862  *               otherwise we need to canonicalize partially dirty host pages
2863  * @block: block that contains the page we want to canonicalize
2864  * @pds: state for postcopy
2865  */
2866 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2867                                           RAMBlock *block,
2868                                           PostcopyDiscardState *pds)
2869 {
2870     RAMState *rs = ram_state;
2871     unsigned long *bitmap = block->bmap;
2872     unsigned long *unsentmap = block->unsentmap;
2873     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2874     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2875     unsigned long run_start;
2876 
2877     if (block->page_size == TARGET_PAGE_SIZE) {
2878         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2879         return;
2880     }
2881 
2882     if (unsent_pass) {
2883         /* Find a sent page */
2884         run_start = find_next_zero_bit(unsentmap, pages, 0);
2885     } else {
2886         /* Find a dirty page */
2887         run_start = find_next_bit(bitmap, pages, 0);
2888     }
2889 
2890     while (run_start < pages) {
2891         bool do_fixup = false;
2892         unsigned long fixup_start_addr;
2893         unsigned long host_offset;
2894 
2895         /*
2896          * If the start of this run of pages is in the middle of a host
2897          * page, then we need to fixup this host page.
2898          */
2899         host_offset = run_start % host_ratio;
2900         if (host_offset) {
2901             do_fixup = true;
2902             run_start -= host_offset;
2903             fixup_start_addr = run_start;
2904             /* For the next pass */
2905             run_start = run_start + host_ratio;
2906         } else {
2907             /* Find the end of this run */
2908             unsigned long run_end;
2909             if (unsent_pass) {
2910                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2911             } else {
2912                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2913             }
2914             /*
2915              * If the end isn't at the start of a host page, then the
2916              * run doesn't finish at the end of a host page
2917              * and we need to discard.
2918              */
2919             host_offset = run_end % host_ratio;
2920             if (host_offset) {
2921                 do_fixup = true;
2922                 fixup_start_addr = run_end - host_offset;
2923                 /*
2924                  * This host page has gone, the next loop iteration starts
2925                  * from after the fixup
2926                  */
2927                 run_start = fixup_start_addr + host_ratio;
2928             } else {
2929                 /*
2930                  * No discards on this iteration, next loop starts from
2931                  * next sent/dirty page
2932                  */
2933                 run_start = run_end + 1;
2934             }
2935         }
2936 
2937         if (do_fixup) {
2938             unsigned long page;
2939 
2940             /* Tell the destination to discard this page */
2941             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2942                 /* For the unsent_pass we:
2943                  *     discard partially sent pages
2944                  * For the !unsent_pass (dirty) we:
2945                  *     discard partially dirty pages that were sent
2946                  *     (any partially sent pages were already discarded
2947                  *     by the previous unsent_pass)
2948                  */
2949                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2950                                             host_ratio);
2951             }
2952 
2953             /* Clean up the bitmap */
2954             for (page = fixup_start_addr;
2955                  page < fixup_start_addr + host_ratio; page++) {
2956                 /* All pages in this host page are now not sent */
2957                 set_bit(page, unsentmap);
2958 
2959                 /*
2960                  * Remark them as dirty, updating the count for any pages
2961                  * that weren't previously dirty.
2962                  */
2963                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2964             }
2965         }
2966 
2967         if (unsent_pass) {
2968             /* Find the next sent page for the next iteration */
2969             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2970         } else {
2971             /* Find the next dirty page for the next iteration */
2972             run_start = find_next_bit(bitmap, pages, run_start);
2973         }
2974     }
2975 }
2976 
2977 /**
2978  * postcopy_chuck_hostpages: discrad any partially sent host page
2979  *
2980  * Utility for the outgoing postcopy code.
2981  *
2982  * Discard any partially sent host-page size chunks, mark any partially
2983  * dirty host-page size chunks as all dirty.  In this case the host-page
2984  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2985  *
2986  * Returns zero on success
2987  *
2988  * @ms: current migration state
2989  * @block: block we want to work with
2990  */
2991 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2992 {
2993     PostcopyDiscardState *pds =
2994         postcopy_discard_send_init(ms, block->idstr);
2995 
2996     /* First pass: Discard all partially sent host pages */
2997     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2998     /*
2999      * Second pass: Ensure that all partially dirty host pages are made
3000      * fully dirty.
3001      */
3002     postcopy_chunk_hostpages_pass(ms, false, block, pds);
3003 
3004     postcopy_discard_send_finish(ms, pds);
3005     return 0;
3006 }
3007 
3008 /**
3009  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3010  *
3011  * Returns zero on success
3012  *
3013  * Transmit the set of pages to be discarded after precopy to the target
3014  * these are pages that:
3015  *     a) Have been previously transmitted but are now dirty again
3016  *     b) Pages that have never been transmitted, this ensures that
3017  *        any pages on the destination that have been mapped by background
3018  *        tasks get discarded (transparent huge pages is the specific concern)
3019  * Hopefully this is pretty sparse
3020  *
3021  * @ms: current migration state
3022  */
3023 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3024 {
3025     RAMState *rs = ram_state;
3026     RAMBlock *block;
3027     int ret;
3028 
3029     rcu_read_lock();
3030 
3031     /* This should be our last sync, the src is now paused */
3032     migration_bitmap_sync(rs);
3033 
3034     /* Easiest way to make sure we don't resume in the middle of a host-page */
3035     rs->last_seen_block = NULL;
3036     rs->last_sent_block = NULL;
3037     rs->last_page = 0;
3038 
3039     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3040         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3041         unsigned long *bitmap = block->bmap;
3042         unsigned long *unsentmap = block->unsentmap;
3043 
3044         if (!unsentmap) {
3045             /* We don't have a safe way to resize the sentmap, so
3046              * if the bitmap was resized it will be NULL at this
3047              * point.
3048              */
3049             error_report("migration ram resized during precopy phase");
3050             rcu_read_unlock();
3051             return -EINVAL;
3052         }
3053         /* Deal with TPS != HPS and huge pages */
3054         ret = postcopy_chunk_hostpages(ms, block);
3055         if (ret) {
3056             rcu_read_unlock();
3057             return ret;
3058         }
3059 
3060         /*
3061          * Update the unsentmap to be unsentmap = unsentmap | dirty
3062          */
3063         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3064 #ifdef DEBUG_POSTCOPY
3065         ram_debug_dump_bitmap(unsentmap, true, pages);
3066 #endif
3067     }
3068     trace_ram_postcopy_send_discard_bitmap();
3069 
3070     ret = postcopy_each_ram_send_discard(ms);
3071     rcu_read_unlock();
3072 
3073     return ret;
3074 }
3075 
3076 /**
3077  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3078  *
3079  * Returns zero on success
3080  *
3081  * @rbname: name of the RAMBlock of the request. NULL means the
3082  *          same that last one.
3083  * @start: RAMBlock starting page
3084  * @length: RAMBlock size
3085  */
3086 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3087 {
3088     int ret = -1;
3089 
3090     trace_ram_discard_range(rbname, start, length);
3091 
3092     rcu_read_lock();
3093     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3094 
3095     if (!rb) {
3096         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3097         goto err;
3098     }
3099 
3100     /*
3101      * On source VM, we don't need to update the received bitmap since
3102      * we don't even have one.
3103      */
3104     if (rb->receivedmap) {
3105         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3106                      length >> qemu_target_page_bits());
3107     }
3108 
3109     ret = ram_block_discard_range(rb, start, length);
3110 
3111 err:
3112     rcu_read_unlock();
3113 
3114     return ret;
3115 }
3116 
3117 /*
3118  * For every allocation, we will try not to crash the VM if the
3119  * allocation failed.
3120  */
3121 static int xbzrle_init(void)
3122 {
3123     Error *local_err = NULL;
3124 
3125     if (!migrate_use_xbzrle()) {
3126         return 0;
3127     }
3128 
3129     XBZRLE_cache_lock();
3130 
3131     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3132     if (!XBZRLE.zero_target_page) {
3133         error_report("%s: Error allocating zero page", __func__);
3134         goto err_out;
3135     }
3136 
3137     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3138                               TARGET_PAGE_SIZE, &local_err);
3139     if (!XBZRLE.cache) {
3140         error_report_err(local_err);
3141         goto free_zero_page;
3142     }
3143 
3144     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3145     if (!XBZRLE.encoded_buf) {
3146         error_report("%s: Error allocating encoded_buf", __func__);
3147         goto free_cache;
3148     }
3149 
3150     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3151     if (!XBZRLE.current_buf) {
3152         error_report("%s: Error allocating current_buf", __func__);
3153         goto free_encoded_buf;
3154     }
3155 
3156     /* We are all good */
3157     XBZRLE_cache_unlock();
3158     return 0;
3159 
3160 free_encoded_buf:
3161     g_free(XBZRLE.encoded_buf);
3162     XBZRLE.encoded_buf = NULL;
3163 free_cache:
3164     cache_fini(XBZRLE.cache);
3165     XBZRLE.cache = NULL;
3166 free_zero_page:
3167     g_free(XBZRLE.zero_target_page);
3168     XBZRLE.zero_target_page = NULL;
3169 err_out:
3170     XBZRLE_cache_unlock();
3171     return -ENOMEM;
3172 }
3173 
3174 static int ram_state_init(RAMState **rsp)
3175 {
3176     *rsp = g_try_new0(RAMState, 1);
3177 
3178     if (!*rsp) {
3179         error_report("%s: Init ramstate fail", __func__);
3180         return -1;
3181     }
3182 
3183     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3184     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3185     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3186 
3187     /*
3188      * This must match with the initial values of dirty bitmap.
3189      * Currently we initialize the dirty bitmap to all zeros so
3190      * here the total dirty page count is zero.
3191      */
3192     (*rsp)->migration_dirty_pages = 0;
3193     ram_state_reset(*rsp);
3194 
3195     return 0;
3196 }
3197 
3198 static void ram_list_init_bitmaps(void)
3199 {
3200     RAMBlock *block;
3201     unsigned long pages;
3202 
3203     /* Skip setting bitmap if there is no RAM */
3204     if (ram_bytes_total()) {
3205         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3206             pages = block->max_length >> TARGET_PAGE_BITS;
3207             /*
3208              * The initial dirty bitmap for migration must be set with all
3209              * ones to make sure we'll migrate every guest RAM page to
3210              * destination.
3211              * Here we didn't set RAMBlock.bmap simply because it is already
3212              * set in ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION] in
3213              * ram_block_add, and that's where we'll sync the dirty bitmaps.
3214              * Here setting RAMBlock.bmap would be fine too but not necessary.
3215              */
3216             block->bmap = bitmap_new(pages);
3217             if (migrate_postcopy_ram()) {
3218                 block->unsentmap = bitmap_new(pages);
3219                 bitmap_set(block->unsentmap, 0, pages);
3220             }
3221         }
3222     }
3223 }
3224 
3225 static void ram_init_bitmaps(RAMState *rs)
3226 {
3227     /* For memory_global_dirty_log_start below.  */
3228     qemu_mutex_lock_iothread();
3229     qemu_mutex_lock_ramlist();
3230     rcu_read_lock();
3231 
3232     ram_list_init_bitmaps();
3233     memory_global_dirty_log_start();
3234     migration_bitmap_sync_precopy(rs);
3235 
3236     rcu_read_unlock();
3237     qemu_mutex_unlock_ramlist();
3238     qemu_mutex_unlock_iothread();
3239 }
3240 
3241 static int ram_init_all(RAMState **rsp)
3242 {
3243     if (ram_state_init(rsp)) {
3244         return -1;
3245     }
3246 
3247     if (xbzrle_init()) {
3248         ram_state_cleanup(rsp);
3249         return -1;
3250     }
3251 
3252     ram_init_bitmaps(*rsp);
3253 
3254     return 0;
3255 }
3256 
3257 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3258 {
3259     RAMBlock *block;
3260     uint64_t pages = 0;
3261 
3262     /*
3263      * Postcopy is not using xbzrle/compression, so no need for that.
3264      * Also, since source are already halted, we don't need to care
3265      * about dirty page logging as well.
3266      */
3267 
3268     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3269         pages += bitmap_count_one(block->bmap,
3270                                   block->used_length >> TARGET_PAGE_BITS);
3271     }
3272 
3273     /* This may not be aligned with current bitmaps. Recalculate. */
3274     rs->migration_dirty_pages = pages;
3275 
3276     rs->last_seen_block = NULL;
3277     rs->last_sent_block = NULL;
3278     rs->last_page = 0;
3279     rs->last_version = ram_list.version;
3280     /*
3281      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3282      * matter what we have sent.
3283      */
3284     rs->ram_bulk_stage = false;
3285 
3286     /* Update RAMState cache of output QEMUFile */
3287     rs->f = out;
3288 
3289     trace_ram_state_resume_prepare(pages);
3290 }
3291 
3292 /*
3293  * This function clears bits of the free pages reported by the caller from the
3294  * migration dirty bitmap. @addr is the host address corresponding to the
3295  * start of the continuous guest free pages, and @len is the total bytes of
3296  * those pages.
3297  */
3298 void qemu_guest_free_page_hint(void *addr, size_t len)
3299 {
3300     RAMBlock *block;
3301     ram_addr_t offset;
3302     size_t used_len, start, npages;
3303     MigrationState *s = migrate_get_current();
3304 
3305     /* This function is currently expected to be used during live migration */
3306     if (!migration_is_setup_or_active(s->state)) {
3307         return;
3308     }
3309 
3310     for (; len > 0; len -= used_len, addr += used_len) {
3311         block = qemu_ram_block_from_host(addr, false, &offset);
3312         if (unlikely(!block || offset >= block->used_length)) {
3313             /*
3314              * The implementation might not support RAMBlock resize during
3315              * live migration, but it could happen in theory with future
3316              * updates. So we add a check here to capture that case.
3317              */
3318             error_report_once("%s unexpected error", __func__);
3319             return;
3320         }
3321 
3322         if (len <= block->used_length - offset) {
3323             used_len = len;
3324         } else {
3325             used_len = block->used_length - offset;
3326         }
3327 
3328         start = offset >> TARGET_PAGE_BITS;
3329         npages = used_len >> TARGET_PAGE_BITS;
3330 
3331         qemu_mutex_lock(&ram_state->bitmap_mutex);
3332         ram_state->migration_dirty_pages -=
3333                       bitmap_count_one_with_offset(block->bmap, start, npages);
3334         bitmap_clear(block->bmap, start, npages);
3335         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3336     }
3337 }
3338 
3339 /*
3340  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3341  * long-running RCU critical section.  When rcu-reclaims in the code
3342  * start to become numerous it will be necessary to reduce the
3343  * granularity of these critical sections.
3344  */
3345 
3346 /**
3347  * ram_save_setup: Setup RAM for migration
3348  *
3349  * Returns zero to indicate success and negative for error
3350  *
3351  * @f: QEMUFile where to send the data
3352  * @opaque: RAMState pointer
3353  */
3354 static int ram_save_setup(QEMUFile *f, void *opaque)
3355 {
3356     RAMState **rsp = opaque;
3357     RAMBlock *block;
3358 
3359     if (compress_threads_save_setup()) {
3360         return -1;
3361     }
3362 
3363     /* migration has already setup the bitmap, reuse it. */
3364     if (!migration_in_colo_state()) {
3365         if (ram_init_all(rsp) != 0) {
3366             compress_threads_save_cleanup();
3367             return -1;
3368         }
3369     }
3370     (*rsp)->f = f;
3371 
3372     rcu_read_lock();
3373 
3374     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3375 
3376     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3377         qemu_put_byte(f, strlen(block->idstr));
3378         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3379         qemu_put_be64(f, block->used_length);
3380         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3381             qemu_put_be64(f, block->page_size);
3382         }
3383         if (migrate_ignore_shared()) {
3384             qemu_put_be64(f, block->mr->addr);
3385             qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
3386         }
3387     }
3388 
3389     rcu_read_unlock();
3390 
3391     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3392     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3393 
3394     multifd_send_sync_main();
3395     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3396     qemu_fflush(f);
3397 
3398     return 0;
3399 }
3400 
3401 /**
3402  * ram_save_iterate: iterative stage for migration
3403  *
3404  * Returns zero to indicate success and negative for error
3405  *
3406  * @f: QEMUFile where to send the data
3407  * @opaque: RAMState pointer
3408  */
3409 static int ram_save_iterate(QEMUFile *f, void *opaque)
3410 {
3411     RAMState **temp = opaque;
3412     RAMState *rs = *temp;
3413     int ret;
3414     int i;
3415     int64_t t0;
3416     int done = 0;
3417 
3418     if (blk_mig_bulk_active()) {
3419         /* Avoid transferring ram during bulk phase of block migration as
3420          * the bulk phase will usually take a long time and transferring
3421          * ram updates during that time is pointless. */
3422         goto out;
3423     }
3424 
3425     rcu_read_lock();
3426     if (ram_list.version != rs->last_version) {
3427         ram_state_reset(rs);
3428     }
3429 
3430     /* Read version before ram_list.blocks */
3431     smp_rmb();
3432 
3433     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3434 
3435     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3436     i = 0;
3437     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3438             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3439         int pages;
3440 
3441         if (qemu_file_get_error(f)) {
3442             break;
3443         }
3444 
3445         pages = ram_find_and_save_block(rs, false);
3446         /* no more pages to sent */
3447         if (pages == 0) {
3448             done = 1;
3449             break;
3450         }
3451 
3452         if (pages < 0) {
3453             qemu_file_set_error(f, pages);
3454             break;
3455         }
3456 
3457         rs->target_page_count += pages;
3458 
3459         /* we want to check in the 1st loop, just in case it was the 1st time
3460            and we had to sync the dirty bitmap.
3461            qemu_clock_get_ns() is a bit expensive, so we only check each some
3462            iterations
3463         */
3464         if ((i & 63) == 0) {
3465             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3466             if (t1 > MAX_WAIT) {
3467                 trace_ram_save_iterate_big_wait(t1, i);
3468                 break;
3469             }
3470         }
3471         i++;
3472     }
3473     rcu_read_unlock();
3474 
3475     /*
3476      * Must occur before EOS (or any QEMUFile operation)
3477      * because of RDMA protocol.
3478      */
3479     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3480 
3481 out:
3482     multifd_send_sync_main();
3483     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3484     qemu_fflush(f);
3485     ram_counters.transferred += 8;
3486 
3487     ret = qemu_file_get_error(f);
3488     if (ret < 0) {
3489         return ret;
3490     }
3491 
3492     return done;
3493 }
3494 
3495 /**
3496  * ram_save_complete: function called to send the remaining amount of ram
3497  *
3498  * Returns zero to indicate success or negative on error
3499  *
3500  * Called with iothread lock
3501  *
3502  * @f: QEMUFile where to send the data
3503  * @opaque: RAMState pointer
3504  */
3505 static int ram_save_complete(QEMUFile *f, void *opaque)
3506 {
3507     RAMState **temp = opaque;
3508     RAMState *rs = *temp;
3509     int ret = 0;
3510 
3511     rcu_read_lock();
3512 
3513     if (!migration_in_postcopy()) {
3514         migration_bitmap_sync_precopy(rs);
3515     }
3516 
3517     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3518 
3519     /* try transferring iterative blocks of memory */
3520 
3521     /* flush all remaining blocks regardless of rate limiting */
3522     while (true) {
3523         int pages;
3524 
3525         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3526         /* no more blocks to sent */
3527         if (pages == 0) {
3528             break;
3529         }
3530         if (pages < 0) {
3531             ret = pages;
3532             break;
3533         }
3534     }
3535 
3536     flush_compressed_data(rs);
3537     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3538 
3539     rcu_read_unlock();
3540 
3541     multifd_send_sync_main();
3542     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3543     qemu_fflush(f);
3544 
3545     return ret;
3546 }
3547 
3548 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3549                              uint64_t *res_precopy_only,
3550                              uint64_t *res_compatible,
3551                              uint64_t *res_postcopy_only)
3552 {
3553     RAMState **temp = opaque;
3554     RAMState *rs = *temp;
3555     uint64_t remaining_size;
3556 
3557     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3558 
3559     if (!migration_in_postcopy() &&
3560         remaining_size < max_size) {
3561         qemu_mutex_lock_iothread();
3562         rcu_read_lock();
3563         migration_bitmap_sync_precopy(rs);
3564         rcu_read_unlock();
3565         qemu_mutex_unlock_iothread();
3566         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3567     }
3568 
3569     if (migrate_postcopy_ram()) {
3570         /* We can do postcopy, and all the data is postcopiable */
3571         *res_compatible += remaining_size;
3572     } else {
3573         *res_precopy_only += remaining_size;
3574     }
3575 }
3576 
3577 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3578 {
3579     unsigned int xh_len;
3580     int xh_flags;
3581     uint8_t *loaded_data;
3582 
3583     /* extract RLE header */
3584     xh_flags = qemu_get_byte(f);
3585     xh_len = qemu_get_be16(f);
3586 
3587     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3588         error_report("Failed to load XBZRLE page - wrong compression!");
3589         return -1;
3590     }
3591 
3592     if (xh_len > TARGET_PAGE_SIZE) {
3593         error_report("Failed to load XBZRLE page - len overflow!");
3594         return -1;
3595     }
3596     loaded_data = XBZRLE.decoded_buf;
3597     /* load data and decode */
3598     /* it can change loaded_data to point to an internal buffer */
3599     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3600 
3601     /* decode RLE */
3602     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3603                              TARGET_PAGE_SIZE) == -1) {
3604         error_report("Failed to load XBZRLE page - decode error!");
3605         return -1;
3606     }
3607 
3608     return 0;
3609 }
3610 
3611 /**
3612  * ram_block_from_stream: read a RAMBlock id from the migration stream
3613  *
3614  * Must be called from within a rcu critical section.
3615  *
3616  * Returns a pointer from within the RCU-protected ram_list.
3617  *
3618  * @f: QEMUFile where to read the data from
3619  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3620  */
3621 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3622 {
3623     static RAMBlock *block = NULL;
3624     char id[256];
3625     uint8_t len;
3626 
3627     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3628         if (!block) {
3629             error_report("Ack, bad migration stream!");
3630             return NULL;
3631         }
3632         return block;
3633     }
3634 
3635     len = qemu_get_byte(f);
3636     qemu_get_buffer(f, (uint8_t *)id, len);
3637     id[len] = 0;
3638 
3639     block = qemu_ram_block_by_name(id);
3640     if (!block) {
3641         error_report("Can't find block %s", id);
3642         return NULL;
3643     }
3644 
3645     if (ramblock_is_ignored(block)) {
3646         error_report("block %s should not be migrated !", id);
3647         return NULL;
3648     }
3649 
3650     return block;
3651 }
3652 
3653 static inline void *host_from_ram_block_offset(RAMBlock *block,
3654                                                ram_addr_t offset)
3655 {
3656     if (!offset_in_ramblock(block, offset)) {
3657         return NULL;
3658     }
3659 
3660     return block->host + offset;
3661 }
3662 
3663 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3664                                                  ram_addr_t offset)
3665 {
3666     if (!offset_in_ramblock(block, offset)) {
3667         return NULL;
3668     }
3669     if (!block->colo_cache) {
3670         error_report("%s: colo_cache is NULL in block :%s",
3671                      __func__, block->idstr);
3672         return NULL;
3673     }
3674 
3675     /*
3676     * During colo checkpoint, we need bitmap of these migrated pages.
3677     * It help us to decide which pages in ram cache should be flushed
3678     * into VM's RAM later.
3679     */
3680     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3681         ram_state->migration_dirty_pages++;
3682     }
3683     return block->colo_cache + offset;
3684 }
3685 
3686 /**
3687  * ram_handle_compressed: handle the zero page case
3688  *
3689  * If a page (or a whole RDMA chunk) has been
3690  * determined to be zero, then zap it.
3691  *
3692  * @host: host address for the zero page
3693  * @ch: what the page is filled from.  We only support zero
3694  * @size: size of the zero page
3695  */
3696 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3697 {
3698     if (ch != 0 || !is_zero_range(host, size)) {
3699         memset(host, ch, size);
3700     }
3701 }
3702 
3703 /* return the size after decompression, or negative value on error */
3704 static int
3705 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3706                      const uint8_t *source, size_t source_len)
3707 {
3708     int err;
3709 
3710     err = inflateReset(stream);
3711     if (err != Z_OK) {
3712         return -1;
3713     }
3714 
3715     stream->avail_in = source_len;
3716     stream->next_in = (uint8_t *)source;
3717     stream->avail_out = dest_len;
3718     stream->next_out = dest;
3719 
3720     err = inflate(stream, Z_NO_FLUSH);
3721     if (err != Z_STREAM_END) {
3722         return -1;
3723     }
3724 
3725     return stream->total_out;
3726 }
3727 
3728 static void *do_data_decompress(void *opaque)
3729 {
3730     DecompressParam *param = opaque;
3731     unsigned long pagesize;
3732     uint8_t *des;
3733     int len, ret;
3734 
3735     qemu_mutex_lock(&param->mutex);
3736     while (!param->quit) {
3737         if (param->des) {
3738             des = param->des;
3739             len = param->len;
3740             param->des = 0;
3741             qemu_mutex_unlock(&param->mutex);
3742 
3743             pagesize = TARGET_PAGE_SIZE;
3744 
3745             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3746                                        param->compbuf, len);
3747             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3748                 error_report("decompress data failed");
3749                 qemu_file_set_error(decomp_file, ret);
3750             }
3751 
3752             qemu_mutex_lock(&decomp_done_lock);
3753             param->done = true;
3754             qemu_cond_signal(&decomp_done_cond);
3755             qemu_mutex_unlock(&decomp_done_lock);
3756 
3757             qemu_mutex_lock(&param->mutex);
3758         } else {
3759             qemu_cond_wait(&param->cond, &param->mutex);
3760         }
3761     }
3762     qemu_mutex_unlock(&param->mutex);
3763 
3764     return NULL;
3765 }
3766 
3767 static int wait_for_decompress_done(void)
3768 {
3769     int idx, thread_count;
3770 
3771     if (!migrate_use_compression()) {
3772         return 0;
3773     }
3774 
3775     thread_count = migrate_decompress_threads();
3776     qemu_mutex_lock(&decomp_done_lock);
3777     for (idx = 0; idx < thread_count; idx++) {
3778         while (!decomp_param[idx].done) {
3779             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3780         }
3781     }
3782     qemu_mutex_unlock(&decomp_done_lock);
3783     return qemu_file_get_error(decomp_file);
3784 }
3785 
3786 static void compress_threads_load_cleanup(void)
3787 {
3788     int i, thread_count;
3789 
3790     if (!migrate_use_compression()) {
3791         return;
3792     }
3793     thread_count = migrate_decompress_threads();
3794     for (i = 0; i < thread_count; i++) {
3795         /*
3796          * we use it as a indicator which shows if the thread is
3797          * properly init'd or not
3798          */
3799         if (!decomp_param[i].compbuf) {
3800             break;
3801         }
3802 
3803         qemu_mutex_lock(&decomp_param[i].mutex);
3804         decomp_param[i].quit = true;
3805         qemu_cond_signal(&decomp_param[i].cond);
3806         qemu_mutex_unlock(&decomp_param[i].mutex);
3807     }
3808     for (i = 0; i < thread_count; i++) {
3809         if (!decomp_param[i].compbuf) {
3810             break;
3811         }
3812 
3813         qemu_thread_join(decompress_threads + i);
3814         qemu_mutex_destroy(&decomp_param[i].mutex);
3815         qemu_cond_destroy(&decomp_param[i].cond);
3816         inflateEnd(&decomp_param[i].stream);
3817         g_free(decomp_param[i].compbuf);
3818         decomp_param[i].compbuf = NULL;
3819     }
3820     g_free(decompress_threads);
3821     g_free(decomp_param);
3822     decompress_threads = NULL;
3823     decomp_param = NULL;
3824     decomp_file = NULL;
3825 }
3826 
3827 static int compress_threads_load_setup(QEMUFile *f)
3828 {
3829     int i, thread_count;
3830 
3831     if (!migrate_use_compression()) {
3832         return 0;
3833     }
3834 
3835     thread_count = migrate_decompress_threads();
3836     decompress_threads = g_new0(QemuThread, thread_count);
3837     decomp_param = g_new0(DecompressParam, thread_count);
3838     qemu_mutex_init(&decomp_done_lock);
3839     qemu_cond_init(&decomp_done_cond);
3840     decomp_file = f;
3841     for (i = 0; i < thread_count; i++) {
3842         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3843             goto exit;
3844         }
3845 
3846         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3847         qemu_mutex_init(&decomp_param[i].mutex);
3848         qemu_cond_init(&decomp_param[i].cond);
3849         decomp_param[i].done = true;
3850         decomp_param[i].quit = false;
3851         qemu_thread_create(decompress_threads + i, "decompress",
3852                            do_data_decompress, decomp_param + i,
3853                            QEMU_THREAD_JOINABLE);
3854     }
3855     return 0;
3856 exit:
3857     compress_threads_load_cleanup();
3858     return -1;
3859 }
3860 
3861 static void decompress_data_with_multi_threads(QEMUFile *f,
3862                                                void *host, int len)
3863 {
3864     int idx, thread_count;
3865 
3866     thread_count = migrate_decompress_threads();
3867     qemu_mutex_lock(&decomp_done_lock);
3868     while (true) {
3869         for (idx = 0; idx < thread_count; idx++) {
3870             if (decomp_param[idx].done) {
3871                 decomp_param[idx].done = false;
3872                 qemu_mutex_lock(&decomp_param[idx].mutex);
3873                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3874                 decomp_param[idx].des = host;
3875                 decomp_param[idx].len = len;
3876                 qemu_cond_signal(&decomp_param[idx].cond);
3877                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3878                 break;
3879             }
3880         }
3881         if (idx < thread_count) {
3882             break;
3883         } else {
3884             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3885         }
3886     }
3887     qemu_mutex_unlock(&decomp_done_lock);
3888 }
3889 
3890 /*
3891  * colo cache: this is for secondary VM, we cache the whole
3892  * memory of the secondary VM, it is need to hold the global lock
3893  * to call this helper.
3894  */
3895 int colo_init_ram_cache(void)
3896 {
3897     RAMBlock *block;
3898 
3899     rcu_read_lock();
3900     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3901         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3902                                                 NULL,
3903                                                 false);
3904         if (!block->colo_cache) {
3905             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3906                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3907                          block->used_length);
3908             goto out_locked;
3909         }
3910         memcpy(block->colo_cache, block->host, block->used_length);
3911     }
3912     rcu_read_unlock();
3913     /*
3914     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3915     * with to decide which page in cache should be flushed into SVM's RAM. Here
3916     * we use the same name 'ram_bitmap' as for migration.
3917     */
3918     if (ram_bytes_total()) {
3919         RAMBlock *block;
3920 
3921         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3922             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3923 
3924             block->bmap = bitmap_new(pages);
3925             bitmap_set(block->bmap, 0, pages);
3926         }
3927     }
3928     ram_state = g_new0(RAMState, 1);
3929     ram_state->migration_dirty_pages = 0;
3930     qemu_mutex_init(&ram_state->bitmap_mutex);
3931     memory_global_dirty_log_start();
3932 
3933     return 0;
3934 
3935 out_locked:
3936 
3937     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3938         if (block->colo_cache) {
3939             qemu_anon_ram_free(block->colo_cache, block->used_length);
3940             block->colo_cache = NULL;
3941         }
3942     }
3943 
3944     rcu_read_unlock();
3945     return -errno;
3946 }
3947 
3948 /* It is need to hold the global lock to call this helper */
3949 void colo_release_ram_cache(void)
3950 {
3951     RAMBlock *block;
3952 
3953     memory_global_dirty_log_stop();
3954     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3955         g_free(block->bmap);
3956         block->bmap = NULL;
3957     }
3958 
3959     rcu_read_lock();
3960 
3961     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3962         if (block->colo_cache) {
3963             qemu_anon_ram_free(block->colo_cache, block->used_length);
3964             block->colo_cache = NULL;
3965         }
3966     }
3967 
3968     rcu_read_unlock();
3969     qemu_mutex_destroy(&ram_state->bitmap_mutex);
3970     g_free(ram_state);
3971     ram_state = NULL;
3972 }
3973 
3974 /**
3975  * ram_load_setup: Setup RAM for migration incoming side
3976  *
3977  * Returns zero to indicate success and negative for error
3978  *
3979  * @f: QEMUFile where to receive the data
3980  * @opaque: RAMState pointer
3981  */
3982 static int ram_load_setup(QEMUFile *f, void *opaque)
3983 {
3984     if (compress_threads_load_setup(f)) {
3985         return -1;
3986     }
3987 
3988     xbzrle_load_setup();
3989     ramblock_recv_map_init();
3990 
3991     return 0;
3992 }
3993 
3994 static int ram_load_cleanup(void *opaque)
3995 {
3996     RAMBlock *rb;
3997 
3998     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3999         if (ramblock_is_pmem(rb)) {
4000             pmem_persist(rb->host, rb->used_length);
4001         }
4002     }
4003 
4004     xbzrle_load_cleanup();
4005     compress_threads_load_cleanup();
4006 
4007     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4008         g_free(rb->receivedmap);
4009         rb->receivedmap = NULL;
4010     }
4011 
4012     return 0;
4013 }
4014 
4015 /**
4016  * ram_postcopy_incoming_init: allocate postcopy data structures
4017  *
4018  * Returns 0 for success and negative if there was one error
4019  *
4020  * @mis: current migration incoming state
4021  *
4022  * Allocate data structures etc needed by incoming migration with
4023  * postcopy-ram. postcopy-ram's similarly names
4024  * postcopy_ram_incoming_init does the work.
4025  */
4026 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4027 {
4028     return postcopy_ram_incoming_init(mis);
4029 }
4030 
4031 /**
4032  * ram_load_postcopy: load a page in postcopy case
4033  *
4034  * Returns 0 for success or -errno in case of error
4035  *
4036  * Called in postcopy mode by ram_load().
4037  * rcu_read_lock is taken prior to this being called.
4038  *
4039  * @f: QEMUFile where to send the data
4040  */
4041 static int ram_load_postcopy(QEMUFile *f)
4042 {
4043     int flags = 0, ret = 0;
4044     bool place_needed = false;
4045     bool matches_target_page_size = false;
4046     MigrationIncomingState *mis = migration_incoming_get_current();
4047     /* Temporary page that is later 'placed' */
4048     void *postcopy_host_page = postcopy_get_tmp_page(mis);
4049     void *last_host = NULL;
4050     bool all_zero = false;
4051 
4052     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4053         ram_addr_t addr;
4054         void *host = NULL;
4055         void *page_buffer = NULL;
4056         void *place_source = NULL;
4057         RAMBlock *block = NULL;
4058         uint8_t ch;
4059 
4060         addr = qemu_get_be64(f);
4061 
4062         /*
4063          * If qemu file error, we should stop here, and then "addr"
4064          * may be invalid
4065          */
4066         ret = qemu_file_get_error(f);
4067         if (ret) {
4068             break;
4069         }
4070 
4071         flags = addr & ~TARGET_PAGE_MASK;
4072         addr &= TARGET_PAGE_MASK;
4073 
4074         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4075         place_needed = false;
4076         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4077             block = ram_block_from_stream(f, flags);
4078 
4079             host = host_from_ram_block_offset(block, addr);
4080             if (!host) {
4081                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4082                 ret = -EINVAL;
4083                 break;
4084             }
4085             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4086             /*
4087              * Postcopy requires that we place whole host pages atomically;
4088              * these may be huge pages for RAMBlocks that are backed by
4089              * hugetlbfs.
4090              * To make it atomic, the data is read into a temporary page
4091              * that's moved into place later.
4092              * The migration protocol uses,  possibly smaller, target-pages
4093              * however the source ensures it always sends all the components
4094              * of a host page in order.
4095              */
4096             page_buffer = postcopy_host_page +
4097                           ((uintptr_t)host & (block->page_size - 1));
4098             /* If all TP are zero then we can optimise the place */
4099             if (!((uintptr_t)host & (block->page_size - 1))) {
4100                 all_zero = true;
4101             } else {
4102                 /* not the 1st TP within the HP */
4103                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4104                     error_report("Non-sequential target page %p/%p",
4105                                   host, last_host);
4106                     ret = -EINVAL;
4107                     break;
4108                 }
4109             }
4110 
4111 
4112             /*
4113              * If it's the last part of a host page then we place the host
4114              * page
4115              */
4116             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4117                                      (block->page_size - 1)) == 0;
4118             place_source = postcopy_host_page;
4119         }
4120         last_host = host;
4121 
4122         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4123         case RAM_SAVE_FLAG_ZERO:
4124             ch = qemu_get_byte(f);
4125             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4126             if (ch) {
4127                 all_zero = false;
4128             }
4129             break;
4130 
4131         case RAM_SAVE_FLAG_PAGE:
4132             all_zero = false;
4133             if (!matches_target_page_size) {
4134                 /* For huge pages, we always use temporary buffer */
4135                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4136             } else {
4137                 /*
4138                  * For small pages that matches target page size, we
4139                  * avoid the qemu_file copy.  Instead we directly use
4140                  * the buffer of QEMUFile to place the page.  Note: we
4141                  * cannot do any QEMUFile operation before using that
4142                  * buffer to make sure the buffer is valid when
4143                  * placing the page.
4144                  */
4145                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4146                                          TARGET_PAGE_SIZE);
4147             }
4148             break;
4149         case RAM_SAVE_FLAG_EOS:
4150             /* normal exit */
4151             multifd_recv_sync_main();
4152             break;
4153         default:
4154             error_report("Unknown combination of migration flags: %#x"
4155                          " (postcopy mode)", flags);
4156             ret = -EINVAL;
4157             break;
4158         }
4159 
4160         /* Detect for any possible file errors */
4161         if (!ret && qemu_file_get_error(f)) {
4162             ret = qemu_file_get_error(f);
4163         }
4164 
4165         if (!ret && place_needed) {
4166             /* This gets called at the last target page in the host page */
4167             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4168 
4169             if (all_zero) {
4170                 ret = postcopy_place_page_zero(mis, place_dest,
4171                                                block);
4172             } else {
4173                 ret = postcopy_place_page(mis, place_dest,
4174                                           place_source, block);
4175             }
4176         }
4177     }
4178 
4179     return ret;
4180 }
4181 
4182 static bool postcopy_is_advised(void)
4183 {
4184     PostcopyState ps = postcopy_state_get();
4185     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4186 }
4187 
4188 static bool postcopy_is_running(void)
4189 {
4190     PostcopyState ps = postcopy_state_get();
4191     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4192 }
4193 
4194 /*
4195  * Flush content of RAM cache into SVM's memory.
4196  * Only flush the pages that be dirtied by PVM or SVM or both.
4197  */
4198 static void colo_flush_ram_cache(void)
4199 {
4200     RAMBlock *block = NULL;
4201     void *dst_host;
4202     void *src_host;
4203     unsigned long offset = 0;
4204 
4205     memory_global_dirty_log_sync();
4206     rcu_read_lock();
4207     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4208         migration_bitmap_sync_range(ram_state, block, block->used_length);
4209     }
4210     rcu_read_unlock();
4211 
4212     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4213     rcu_read_lock();
4214     block = QLIST_FIRST_RCU(&ram_list.blocks);
4215 
4216     while (block) {
4217         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4218 
4219         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4220             offset = 0;
4221             block = QLIST_NEXT_RCU(block, next);
4222         } else {
4223             migration_bitmap_clear_dirty(ram_state, block, offset);
4224             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4225             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4226             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4227         }
4228     }
4229 
4230     rcu_read_unlock();
4231     trace_colo_flush_ram_cache_end();
4232 }
4233 
4234 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4235 {
4236     int flags = 0, ret = 0, invalid_flags = 0;
4237     static uint64_t seq_iter;
4238     int len = 0;
4239     /*
4240      * If system is running in postcopy mode, page inserts to host memory must
4241      * be atomic
4242      */
4243     bool postcopy_running = postcopy_is_running();
4244     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4245     bool postcopy_advised = postcopy_is_advised();
4246 
4247     seq_iter++;
4248 
4249     if (version_id != 4) {
4250         ret = -EINVAL;
4251     }
4252 
4253     if (!migrate_use_compression()) {
4254         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4255     }
4256     /* This RCU critical section can be very long running.
4257      * When RCU reclaims in the code start to become numerous,
4258      * it will be necessary to reduce the granularity of this
4259      * critical section.
4260      */
4261     rcu_read_lock();
4262 
4263     if (postcopy_running) {
4264         ret = ram_load_postcopy(f);
4265     }
4266 
4267     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4268         ram_addr_t addr, total_ram_bytes;
4269         void *host = NULL;
4270         uint8_t ch;
4271 
4272         addr = qemu_get_be64(f);
4273         flags = addr & ~TARGET_PAGE_MASK;
4274         addr &= TARGET_PAGE_MASK;
4275 
4276         if (flags & invalid_flags) {
4277             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4278                 error_report("Received an unexpected compressed page");
4279             }
4280 
4281             ret = -EINVAL;
4282             break;
4283         }
4284 
4285         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4286                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4287             RAMBlock *block = ram_block_from_stream(f, flags);
4288 
4289             /*
4290              * After going into COLO, we should load the Page into colo_cache.
4291              */
4292             if (migration_incoming_in_colo_state()) {
4293                 host = colo_cache_from_block_offset(block, addr);
4294             } else {
4295                 host = host_from_ram_block_offset(block, addr);
4296             }
4297             if (!host) {
4298                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4299                 ret = -EINVAL;
4300                 break;
4301             }
4302 
4303             if (!migration_incoming_in_colo_state()) {
4304                 ramblock_recv_bitmap_set(block, host);
4305             }
4306 
4307             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4308         }
4309 
4310         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4311         case RAM_SAVE_FLAG_MEM_SIZE:
4312             /* Synchronize RAM block list */
4313             total_ram_bytes = addr;
4314             while (!ret && total_ram_bytes) {
4315                 RAMBlock *block;
4316                 char id[256];
4317                 ram_addr_t length;
4318 
4319                 len = qemu_get_byte(f);
4320                 qemu_get_buffer(f, (uint8_t *)id, len);
4321                 id[len] = 0;
4322                 length = qemu_get_be64(f);
4323 
4324                 block = qemu_ram_block_by_name(id);
4325                 if (block && !qemu_ram_is_migratable(block)) {
4326                     error_report("block %s should not be migrated !", id);
4327                     ret = -EINVAL;
4328                 } else if (block) {
4329                     if (length != block->used_length) {
4330                         Error *local_err = NULL;
4331 
4332                         ret = qemu_ram_resize(block, length,
4333                                               &local_err);
4334                         if (local_err) {
4335                             error_report_err(local_err);
4336                         }
4337                     }
4338                     /* For postcopy we need to check hugepage sizes match */
4339                     if (postcopy_advised &&
4340                         block->page_size != qemu_host_page_size) {
4341                         uint64_t remote_page_size = qemu_get_be64(f);
4342                         if (remote_page_size != block->page_size) {
4343                             error_report("Mismatched RAM page size %s "
4344                                          "(local) %zd != %" PRId64,
4345                                          id, block->page_size,
4346                                          remote_page_size);
4347                             ret = -EINVAL;
4348                         }
4349                     }
4350                     if (migrate_ignore_shared()) {
4351                         hwaddr addr = qemu_get_be64(f);
4352                         bool ignored = qemu_get_byte(f);
4353                         if (ignored != ramblock_is_ignored(block)) {
4354                             error_report("RAM block %s should %s be migrated",
4355                                          id, ignored ? "" : "not");
4356                             ret = -EINVAL;
4357                         }
4358                         if (ramblock_is_ignored(block) &&
4359                             block->mr->addr != addr) {
4360                             error_report("Mismatched GPAs for block %s "
4361                                          "%" PRId64 "!= %" PRId64,
4362                                          id, (uint64_t)addr,
4363                                          (uint64_t)block->mr->addr);
4364                             ret = -EINVAL;
4365                         }
4366                     }
4367                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4368                                           block->idstr);
4369                 } else {
4370                     error_report("Unknown ramblock \"%s\", cannot "
4371                                  "accept migration", id);
4372                     ret = -EINVAL;
4373                 }
4374 
4375                 total_ram_bytes -= length;
4376             }
4377             break;
4378 
4379         case RAM_SAVE_FLAG_ZERO:
4380             ch = qemu_get_byte(f);
4381             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4382             break;
4383 
4384         case RAM_SAVE_FLAG_PAGE:
4385             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4386             break;
4387 
4388         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4389             len = qemu_get_be32(f);
4390             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4391                 error_report("Invalid compressed data length: %d", len);
4392                 ret = -EINVAL;
4393                 break;
4394             }
4395             decompress_data_with_multi_threads(f, host, len);
4396             break;
4397 
4398         case RAM_SAVE_FLAG_XBZRLE:
4399             if (load_xbzrle(f, addr, host) < 0) {
4400                 error_report("Failed to decompress XBZRLE page at "
4401                              RAM_ADDR_FMT, addr);
4402                 ret = -EINVAL;
4403                 break;
4404             }
4405             break;
4406         case RAM_SAVE_FLAG_EOS:
4407             /* normal exit */
4408             multifd_recv_sync_main();
4409             break;
4410         default:
4411             if (flags & RAM_SAVE_FLAG_HOOK) {
4412                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4413             } else {
4414                 error_report("Unknown combination of migration flags: %#x",
4415                              flags);
4416                 ret = -EINVAL;
4417             }
4418         }
4419         if (!ret) {
4420             ret = qemu_file_get_error(f);
4421         }
4422     }
4423 
4424     ret |= wait_for_decompress_done();
4425     rcu_read_unlock();
4426     trace_ram_load_complete(ret, seq_iter);
4427 
4428     if (!ret  && migration_incoming_in_colo_state()) {
4429         colo_flush_ram_cache();
4430     }
4431     return ret;
4432 }
4433 
4434 static bool ram_has_postcopy(void *opaque)
4435 {
4436     RAMBlock *rb;
4437     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4438         if (ramblock_is_pmem(rb)) {
4439             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4440                          "is not supported now!", rb->idstr, rb->host);
4441             return false;
4442         }
4443     }
4444 
4445     return migrate_postcopy_ram();
4446 }
4447 
4448 /* Sync all the dirty bitmap with destination VM.  */
4449 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4450 {
4451     RAMBlock *block;
4452     QEMUFile *file = s->to_dst_file;
4453     int ramblock_count = 0;
4454 
4455     trace_ram_dirty_bitmap_sync_start();
4456 
4457     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4458         qemu_savevm_send_recv_bitmap(file, block->idstr);
4459         trace_ram_dirty_bitmap_request(block->idstr);
4460         ramblock_count++;
4461     }
4462 
4463     trace_ram_dirty_bitmap_sync_wait();
4464 
4465     /* Wait until all the ramblocks' dirty bitmap synced */
4466     while (ramblock_count--) {
4467         qemu_sem_wait(&s->rp_state.rp_sem);
4468     }
4469 
4470     trace_ram_dirty_bitmap_sync_complete();
4471 
4472     return 0;
4473 }
4474 
4475 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4476 {
4477     qemu_sem_post(&s->rp_state.rp_sem);
4478 }
4479 
4480 /*
4481  * Read the received bitmap, revert it as the initial dirty bitmap.
4482  * This is only used when the postcopy migration is paused but wants
4483  * to resume from a middle point.
4484  */
4485 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4486 {
4487     int ret = -EINVAL;
4488     QEMUFile *file = s->rp_state.from_dst_file;
4489     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4490     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4491     uint64_t size, end_mark;
4492 
4493     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4494 
4495     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4496         error_report("%s: incorrect state %s", __func__,
4497                      MigrationStatus_str(s->state));
4498         return -EINVAL;
4499     }
4500 
4501     /*
4502      * Note: see comments in ramblock_recv_bitmap_send() on why we
4503      * need the endianess convertion, and the paddings.
4504      */
4505     local_size = ROUND_UP(local_size, 8);
4506 
4507     /* Add paddings */
4508     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4509 
4510     size = qemu_get_be64(file);
4511 
4512     /* The size of the bitmap should match with our ramblock */
4513     if (size != local_size) {
4514         error_report("%s: ramblock '%s' bitmap size mismatch "
4515                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4516                      block->idstr, size, local_size);
4517         ret = -EINVAL;
4518         goto out;
4519     }
4520 
4521     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4522     end_mark = qemu_get_be64(file);
4523 
4524     ret = qemu_file_get_error(file);
4525     if (ret || size != local_size) {
4526         error_report("%s: read bitmap failed for ramblock '%s': %d"
4527                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4528                      __func__, block->idstr, ret, local_size, size);
4529         ret = -EIO;
4530         goto out;
4531     }
4532 
4533     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4534         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4535                      __func__, block->idstr, end_mark);
4536         ret = -EINVAL;
4537         goto out;
4538     }
4539 
4540     /*
4541      * Endianess convertion. We are during postcopy (though paused).
4542      * The dirty bitmap won't change. We can directly modify it.
4543      */
4544     bitmap_from_le(block->bmap, le_bitmap, nbits);
4545 
4546     /*
4547      * What we received is "received bitmap". Revert it as the initial
4548      * dirty bitmap for this ramblock.
4549      */
4550     bitmap_complement(block->bmap, block->bmap, nbits);
4551 
4552     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4553 
4554     /*
4555      * We succeeded to sync bitmap for current ramblock. If this is
4556      * the last one to sync, we need to notify the main send thread.
4557      */
4558     ram_dirty_bitmap_reload_notify(s);
4559 
4560     ret = 0;
4561 out:
4562     g_free(le_bitmap);
4563     return ret;
4564 }
4565 
4566 static int ram_resume_prepare(MigrationState *s, void *opaque)
4567 {
4568     RAMState *rs = *(RAMState **)opaque;
4569     int ret;
4570 
4571     ret = ram_dirty_bitmap_sync_all(s, rs);
4572     if (ret) {
4573         return ret;
4574     }
4575 
4576     ram_state_resume_prepare(rs, s->to_dst_file);
4577 
4578     return 0;
4579 }
4580 
4581 static SaveVMHandlers savevm_ram_handlers = {
4582     .save_setup = ram_save_setup,
4583     .save_live_iterate = ram_save_iterate,
4584     .save_live_complete_postcopy = ram_save_complete,
4585     .save_live_complete_precopy = ram_save_complete,
4586     .has_postcopy = ram_has_postcopy,
4587     .save_live_pending = ram_save_pending,
4588     .load_state = ram_load,
4589     .save_cleanup = ram_save_cleanup,
4590     .load_setup = ram_load_setup,
4591     .load_cleanup = ram_load_cleanup,
4592     .resume_prepare = ram_resume_prepare,
4593 };
4594 
4595 void ram_mig_init(void)
4596 {
4597     qemu_mutex_init(&XBZRLE.lock);
4598     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4599 }
4600