xref: /openbmc/qemu/migration/ram.c (revision 8072aae3)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/pmem.h"
37 #include "xbzrle.h"
38 #include "ram.h"
39 #include "migration.h"
40 #include "socket.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "postcopy-ram.h"
45 #include "page_cache.h"
46 #include "qemu/error-report.h"
47 #include "qapi/error.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/sysemu.h"
57 #include "qemu/uuid.h"
58 #include "savevm.h"
59 #include "qemu/iov.h"
60 
61 /***********************************************************/
62 /* ram save/restore */
63 
64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
65  * worked for pages that were filled with the same char.  We switched
66  * it to only search for the zero value, and renamed it to avoid
67  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
68  */
69 
70 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
71 #define RAM_SAVE_FLAG_ZERO     0x02
72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
73 #define RAM_SAVE_FLAG_PAGE     0x08
74 #define RAM_SAVE_FLAG_EOS      0x10
75 #define RAM_SAVE_FLAG_CONTINUE 0x20
76 #define RAM_SAVE_FLAG_XBZRLE   0x40
77 /* 0x80 is reserved in migration.h start with 0x100 next */
78 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
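
/*
 * These flags are OR'ed into the low bits of the page offset that
 * save_page_header() writes to the stream; the offset itself is always
 * target-page aligned, so the flag bits never clash with it.  For example:
 *
 *     save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 */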
79 
80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 {
82     return buffer_is_zero(p, size);
83 }
84 
85 XBZRLECacheStats xbzrle_counters;
86 
87 /* This struct contains the XBZRLE cache and a static page
88    used by the compression */
89 static struct {
90     /* buffer used for XBZRLE encoding */
91     uint8_t *encoded_buf;
92     /* buffer for storing page content */
93     uint8_t *current_buf;
94     /* Cache for XBZRLE, Protected by lock. */
95     PageCache *cache;
96     QemuMutex lock;
97     /* it will store a page full of zeros */
98     uint8_t *zero_target_page;
99     /* buffer used for XBZRLE decoding */
100     uint8_t *decoded_buf;
101 } XBZRLE;
102 
103 static void XBZRLE_cache_lock(void)
104 {
105     if (migrate_use_xbzrle())
106         qemu_mutex_lock(&XBZRLE.lock);
107 }
108 
109 static void XBZRLE_cache_unlock(void)
110 {
111     if (migrate_use_xbzrle())
112         qemu_mutex_unlock(&XBZRLE.lock);
113 }
114 
115 /**
116  * xbzrle_cache_resize: resize the xbzrle cache
117  *
118  * This function is called from qmp_migrate_set_cache_size in the main
119  * thread, possibly while a migration is in progress.  A running
120  * migration may be using the cache and might finish during this call,
121  * hence changes to the cache are protected by XBZRLE.lock.
122  *
123  * Returns 0 for success or -1 for error
124  *
125  * @new_size: new cache size
126  * @errp: set *errp if the check failed, with reason
127  */
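/*
 * Typically reached via a QMP command along the lines of:
 *
 *     { "execute": "migrate-set-cache-size",
 *       "arguments": { "value": 536870912 } }
 */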
128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 {
130     PageCache *new_cache;
131     int64_t ret = 0;
132 
133     /* Check for truncation */
134     if (new_size != (size_t)new_size) {
135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136                    "exceeding address space");
137         return -1;
138     }
139 
140     if (new_size == migrate_xbzrle_cache_size()) {
141         /* nothing to do */
142         return 0;
143     }
144 
145     XBZRLE_cache_lock();
146 
147     if (XBZRLE.cache != NULL) {
148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149         if (!new_cache) {
150             ret = -1;
151             goto out;
152         }
153 
154         cache_fini(XBZRLE.cache);
155         XBZRLE.cache = new_cache;
156     }
157 out:
158     XBZRLE_cache_unlock();
159     return ret;
160 }
161 
162 static bool ramblock_is_ignored(RAMBlock *block)
163 {
164     return !qemu_ram_is_migratable(block) ||
165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
166 }
167 
168 /* Should be holding either ram_list.mutex, or the RCU lock. */
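/*
 * The "if (...) {} else" construct below makes the statement following the
 * macro become the loop body only for blocks that pass the filter, while
 * the whole expansion still parses as a single statement.
 */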
169 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
170     INTERNAL_RAMBLOCK_FOREACH(block)                   \
171         if (ramblock_is_ignored(block)) {} else
172 
173 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
174     INTERNAL_RAMBLOCK_FOREACH(block)                   \
175         if (!qemu_ram_is_migratable(block)) {} else
176 
177 #undef RAMBLOCK_FOREACH
178 
179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180 {
181     RAMBlock *block;
182     int ret = 0;
183 
184     rcu_read_lock();
185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186         ret = func(block, opaque);
187         if (ret) {
188             break;
189         }
190     }
191     rcu_read_unlock();
192     return ret;
193 }
194 
195 static void ramblock_recv_map_init(void)
196 {
197     RAMBlock *rb;
198 
199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
200         assert(!rb->receivedmap);
201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202     }
203 }
204 
205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206 {
207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208                     rb->receivedmap);
209 }
210 
211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212 {
213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214 }
215 
216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217 {
218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219 }
220 
221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222                                     size_t nr)
223 {
224     bitmap_set_atomic(rb->receivedmap,
225                       ramblock_recv_bitmap_offset(host_addr, rb),
226                       nr);
227 }
228 
229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
230 
231 /*
232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes, padded to a
233  *         multiple of 8) + RAMBLOCK_RECV_BITMAP_ENDING (8 bytes).
234  * Returns >0 (sent bytes) on success, or <0 on error.
235  */
236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237                                   const char *block_name)
238 {
239     RAMBlock *block = qemu_ram_block_by_name(block_name);
240     unsigned long *le_bitmap, nbits;
241     uint64_t size;
242 
243     if (!block) {
244         error_report("%s: invalid block name: %s", __func__, block_name);
245         return -1;
246     }
247 
248     nbits = block->used_length >> TARGET_PAGE_BITS;
249 
250     /*
251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
252      * machines we may need 4 more bytes for padding (see below
253      * comment). So extend it a bit beforehand.
254      */
255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256 
257     /*
258      * Always use little endian when sending the bitmap. This is
259      * required so that it works even when source and destination VMs
260      * do not share the same endianness. (Note: big endian won't work.)
261      */
262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263 
264     /* Size of the bitmap, in bytes */
265     size = DIV_ROUND_UP(nbits, 8);
266 
267     /*
268      * size is always aligned to 8 bytes for 64bit machines, but it
269      * may not be true for 32bit machines. We need this padding to
270      * make sure the migration can survive even between 32bit and
271      * 64bit machines.
272      */
273     size = ROUND_UP(size, 8);
274 
275     qemu_put_be64(file, size);
276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277     /*
278      * Mark as an end, in case the middle part is screwed up due to
279      * some "mysterious" reason.
280      */
281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282     qemu_fflush(file);
283 
284     g_free(le_bitmap);
285 
286     if (qemu_file_get_error(file)) {
287         return qemu_file_get_error(file);
288     }
289 
290     return size + sizeof(size);
291 }
292 
293 /*
294  * An outstanding page request, on the source, having been received
295  * and queued
296  */
297 struct RAMSrcPageRequest {
298     RAMBlock *rb;
299     hwaddr    offset;
300     hwaddr    len;
301 
302     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303 };
304 
305 /* State of RAM for migration */
306 struct RAMState {
307     /* QEMUFile used for this migration */
308     QEMUFile *f;
309     /* Last block that we have visited searching for dirty pages */
310     RAMBlock *last_seen_block;
311     /* Last block from where we have sent data */
312     RAMBlock *last_sent_block;
313     /* Last dirty target page we have sent */
314     ram_addr_t last_page;
315     /* last ram version we have seen */
316     uint32_t last_version;
317     /* We are in the first round */
318     bool ram_bulk_stage;
319     /* The free page optimization is enabled */
320     bool fpo_enabled;
321     /* How many times the dirty page rate has been too high */
322     int dirty_rate_high_cnt;
323     /* these variables are used for bitmap sync */
324     /* last time we did a full bitmap_sync */
325     int64_t time_last_bitmap_sync;
326     /* bytes transferred at start_time */
327     uint64_t bytes_xfer_prev;
328     /* number of dirty pages since start_time */
329     uint64_t num_dirty_pages_period;
330     /* xbzrle misses since the beginning of the period */
331     uint64_t xbzrle_cache_miss_prev;
332 
333     /* compression statistics since the beginning of the period */
334     /* number of times there was no free thread to compress data */
335     uint64_t compress_thread_busy_prev;
336     /* number of bytes produced by compression */
337     uint64_t compressed_size_prev;
338     /* number of compressed pages */
339     uint64_t compress_pages_prev;
340 
341     /* total handled target pages at the beginning of period */
342     uint64_t target_page_count_prev;
343     /* total handled target pages since start */
344     uint64_t target_page_count;
345     /* number of dirty bits in the bitmap */
346     uint64_t migration_dirty_pages;
347     /* Protects modification of the bitmap and migration dirty pages */
348     QemuMutex bitmap_mutex;
349     /* The RAMBlock used in the last src_page_requests */
350     RAMBlock *last_req_rb;
351     /* Queue of outstanding page requests from the destination */
352     QemuMutex src_page_req_mutex;
353     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 };
355 typedef struct RAMState RAMState;
356 
357 static RAMState *ram_state;
358 
359 static NotifierWithReturnList precopy_notifier_list;
360 
361 void precopy_infrastructure_init(void)
362 {
363     notifier_with_return_list_init(&precopy_notifier_list);
364 }
365 
366 void precopy_add_notifier(NotifierWithReturn *n)
367 {
368     notifier_with_return_list_add(&precopy_notifier_list, n);
369 }
370 
371 void precopy_remove_notifier(NotifierWithReturn *n)
372 {
373     notifier_with_return_remove(n);
374 }
375 
376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 {
378     PrecopyNotifyData pnd;
379     pnd.reason = reason;
380     pnd.errp = errp;
381 
382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383 }
384 
385 void precopy_enable_free_page_optimization(void)
386 {
387     if (!ram_state) {
388         return;
389     }
390 
391     ram_state->fpo_enabled = true;
392 }
393 
394 uint64_t ram_bytes_remaining(void)
395 {
396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397                        0;
398 }
399 
400 MigrationStats ram_counters;
401 
402 /* used by the search for pages to send */
403 struct PageSearchStatus {
404     /* Current block being searched */
405     RAMBlock    *block;
406     /* Current page to search from */
407     unsigned long page;
408     /* Set once we wrap around */
409     bool         complete_round;
410 };
411 typedef struct PageSearchStatus PageSearchStatus;
412 
413 CompressionStats compression_counters;
414 
415 struct CompressParam {
416     bool done;
417     bool quit;
418     bool zero_page;
419     QEMUFile *file;
420     QemuMutex mutex;
421     QemuCond cond;
422     RAMBlock *block;
423     ram_addr_t offset;
424 
425     /* internally used fields */
426     z_stream stream;
427     uint8_t *originbuf;
428 };
429 typedef struct CompressParam CompressParam;
430 
431 struct DecompressParam {
432     bool done;
433     bool quit;
434     QemuMutex mutex;
435     QemuCond cond;
436     void *des;
437     uint8_t *compbuf;
438     int len;
439     z_stream stream;
440 };
441 typedef struct DecompressParam DecompressParam;
442 
443 static CompressParam *comp_param;
444 static QemuThread *compress_threads;
445 /* comp_done_cond is used to wake up the migration thread when
446  * one of the compression threads has finished the compression.
447  * comp_done_lock is used together with comp_done_cond.
448  */
449 static QemuMutex comp_done_lock;
450 static QemuCond comp_done_cond;
451 /* The empty QEMUFileOps will be used by file in CompressParam */
452 static const QEMUFileOps empty_ops = { };
453 
454 static QEMUFile *decomp_file;
455 static DecompressParam *decomp_param;
456 static QemuThread *decompress_threads;
457 static QemuMutex decomp_done_lock;
458 static QemuCond decomp_done_cond;
459 
460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
461                                  ram_addr_t offset, uint8_t *source_buf);
462 
463 static void *do_data_compress(void *opaque)
464 {
465     CompressParam *param = opaque;
466     RAMBlock *block;
467     ram_addr_t offset;
468     bool zero_page;
469 
470     qemu_mutex_lock(&param->mutex);
471     while (!param->quit) {
472         if (param->block) {
473             block = param->block;
474             offset = param->offset;
475             param->block = NULL;
476             qemu_mutex_unlock(&param->mutex);
477 
478             zero_page = do_compress_ram_page(param->file, &param->stream,
479                                              block, offset, param->originbuf);
480 
481             qemu_mutex_lock(&comp_done_lock);
482             param->done = true;
483             param->zero_page = zero_page;
484             qemu_cond_signal(&comp_done_cond);
485             qemu_mutex_unlock(&comp_done_lock);
486 
487             qemu_mutex_lock(&param->mutex);
488         } else {
489             qemu_cond_wait(&param->cond, &param->mutex);
490         }
491     }
492     qemu_mutex_unlock(&param->mutex);
493 
494     return NULL;
495 }
496 
497 static void compress_threads_save_cleanup(void)
498 {
499     int i, thread_count;
500 
501     if (!migrate_use_compression() || !comp_param) {
502         return;
503     }
504 
505     thread_count = migrate_compress_threads();
506     for (i = 0; i < thread_count; i++) {
507         /*
508          * we use it as an indicator which shows if the thread is
509          * properly initialized or not
510          */
511         if (!comp_param[i].file) {
512             break;
513         }
514 
515         qemu_mutex_lock(&comp_param[i].mutex);
516         comp_param[i].quit = true;
517         qemu_cond_signal(&comp_param[i].cond);
518         qemu_mutex_unlock(&comp_param[i].mutex);
519 
520         qemu_thread_join(compress_threads + i);
521         qemu_mutex_destroy(&comp_param[i].mutex);
522         qemu_cond_destroy(&comp_param[i].cond);
523         deflateEnd(&comp_param[i].stream);
524         g_free(comp_param[i].originbuf);
525         qemu_fclose(comp_param[i].file);
526         comp_param[i].file = NULL;
527     }
528     qemu_mutex_destroy(&comp_done_lock);
529     qemu_cond_destroy(&comp_done_cond);
530     g_free(compress_threads);
531     g_free(comp_param);
532     compress_threads = NULL;
533     comp_param = NULL;
534 }
535 
536 static int compress_threads_save_setup(void)
537 {
538     int i, thread_count;
539 
540     if (!migrate_use_compression()) {
541         return 0;
542     }
543     thread_count = migrate_compress_threads();
544     compress_threads = g_new0(QemuThread, thread_count);
545     comp_param = g_new0(CompressParam, thread_count);
546     qemu_cond_init(&comp_done_cond);
547     qemu_mutex_init(&comp_done_lock);
548     for (i = 0; i < thread_count; i++) {
549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550         if (!comp_param[i].originbuf) {
551             goto exit;
552         }
553 
554         if (deflateInit(&comp_param[i].stream,
555                         migrate_compress_level()) != Z_OK) {
556             g_free(comp_param[i].originbuf);
557             goto exit;
558         }
559 
560         /* comp_param[i].file is just used as a dummy buffer to save data,
561          * set its ops to empty.
562          */
563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564         comp_param[i].done = true;
565         comp_param[i].quit = false;
566         qemu_mutex_init(&comp_param[i].mutex);
567         qemu_cond_init(&comp_param[i].cond);
568         qemu_thread_create(compress_threads + i, "compress",
569                            do_data_compress, comp_param + i,
570                            QEMU_THREAD_JOINABLE);
571     }
572     return 0;
573 
574 exit:
575     compress_threads_save_cleanup();
576     return -1;
577 }
578 
579 /* Multiple fd's */
580 
581 #define MULTIFD_MAGIC 0x11223344U
582 #define MULTIFD_VERSION 1
583 
584 #define MULTIFD_FLAG_SYNC (1 << 0)
585 
586 /* This value needs to be a multiple of qemu_target_page_size() */
587 #define MULTIFD_PACKET_SIZE (512 * 1024)
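/*
 * e.g. with a 4 KiB target page this allows 512 KiB / 4 KiB = 128 page
 * offsets per packet.
 */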
588 
589 typedef struct {
590     uint32_t magic;
591     uint32_t version;
592     unsigned char uuid[16]; /* QemuUUID */
593     uint8_t id;
594     uint8_t unused1[7];     /* Reserved for future use */
595     uint64_t unused2[4];    /* Reserved for future use */
596 } __attribute__((packed)) MultiFDInit_t;
597 
598 typedef struct {
599     uint32_t magic;
600     uint32_t version;
601     uint32_t flags;
602     /* maximum number of allocated pages */
603     uint32_t pages_alloc;
604     uint32_t pages_used;
605     /* size of the next packet that contains pages */
606     uint32_t next_packet_size;
607     uint64_t packet_num;
608     uint64_t unused[4];    /* Reserved for future use */
609     char ramblock[256];
610     uint64_t offset[];
611 } __attribute__((packed)) MultiFDPacket_t;
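
/*
 * All multi-byte fields of MultiFDPacket_t are big-endian on the wire
 * (see multifd_send_fill_packet()/multifd_recv_unfill_packet()).  The page
 * contents are not part of the packet; they are sent right after it as
 * "pages_used" iovecs of one target page each.
 */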
612 
613 typedef struct {
614     /* number of used pages */
615     uint32_t used;
616     /* number of allocated pages */
617     uint32_t allocated;
618     /* global number of generated multifd packets */
619     uint64_t packet_num;
620     /* offset of each page */
621     ram_addr_t *offset;
622     /* pointer to each page */
623     struct iovec *iov;
624     RAMBlock *block;
625 } MultiFDPages_t;
626 
627 typedef struct {
628     /* these fields are not changed once the thread is created */
629     /* channel number */
630     uint8_t id;
631     /* channel thread name */
632     char *name;
633     /* channel thread id */
634     QemuThread thread;
635     /* communication channel */
636     QIOChannel *c;
637     /* sem where to wait for more work */
638     QemuSemaphore sem;
639     /* this mutex protects the following parameters */
640     QemuMutex mutex;
641     /* is this channel thread running */
642     bool running;
643     /* should this thread finish */
644     bool quit;
645     /* thread has work to do */
646     int pending_job;
647     /* array of pages to send */
648     MultiFDPages_t *pages;
649     /* packet allocated len */
650     uint32_t packet_len;
651     /* pointer to the packet */
652     MultiFDPacket_t *packet;
653     /* multifd flags for each packet */
654     uint32_t flags;
655     /* size of the next packet that contains pages */
656     uint32_t next_packet_size;
657     /* global number of generated multifd packets */
658     uint64_t packet_num;
659     /* thread local variables */
660     /* packets sent through this channel */
661     uint64_t num_packets;
662     /* pages sent through this channel */
663     uint64_t num_pages;
664 }  MultiFDSendParams;
665 
666 typedef struct {
667     /* these fields are not changed once the thread is created */
668     /* channel number */
669     uint8_t id;
670     /* channel thread name */
671     char *name;
672     /* channel thread id */
673     QemuThread thread;
674     /* communication channel */
675     QIOChannel *c;
676     /* this mutex protects the following parameters */
677     QemuMutex mutex;
678     /* is this channel thread running */
679     bool running;
680     /* array of pages to receive */
681     MultiFDPages_t *pages;
682     /* packet allocated len */
683     uint32_t packet_len;
684     /* pointer to the packet */
685     MultiFDPacket_t *packet;
686     /* multifd flags for each packet */
687     uint32_t flags;
688     /* global number of generated multifd packets */
689     uint64_t packet_num;
690     /* thread local variables */
691     /* size of the next packet that contains pages */
692     uint32_t next_packet_size;
693     /* packets received through this channel */
694     uint64_t num_packets;
695     /* pages received through this channel */
696     uint64_t num_pages;
697     /* syncs main thread and channels */
698     QemuSemaphore sem_sync;
699 } MultiFDRecvParams;
700 
701 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
702 {
703     MultiFDInit_t msg;
704     int ret;
705 
706     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
707     msg.version = cpu_to_be32(MULTIFD_VERSION);
708     msg.id = p->id;
709     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
710 
711     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
712     if (ret != 0) {
713         return -1;
714     }
715     return 0;
716 }
717 
718 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
719 {
720     MultiFDInit_t msg;
721     int ret;
722 
723     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
724     if (ret != 0) {
725         return -1;
726     }
727 
728     msg.magic = be32_to_cpu(msg.magic);
729     msg.version = be32_to_cpu(msg.version);
730 
731     if (msg.magic != MULTIFD_MAGIC) {
732         error_setg(errp, "multifd: received packet magic %x "
733                    "expected %x", msg.magic, MULTIFD_MAGIC);
734         return -1;
735     }
736 
737     if (msg.version != MULTIFD_VERSION) {
738         error_setg(errp, "multifd: received packet version %d "
739                    "expected %d", msg.version, MULTIFD_VERSION);
740         return -1;
741     }
742 
743     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
744         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
745         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
746 
747         error_setg(errp, "multifd: received uuid '%s' and expected "
748                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
749         g_free(uuid);
750         g_free(msg_uuid);
751         return -1;
752     }
753 
754     if (msg.id > migrate_multifd_channels()) {
755         error_setg(errp, "multifd: received channel id %d, expected "
756                    "at most %d", msg.id, migrate_multifd_channels());
757         return -1;
758     }
759 
760     return msg.id;
761 }
762 
763 static MultiFDPages_t *multifd_pages_init(size_t size)
764 {
765     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
766 
767     pages->allocated = size;
768     pages->iov = g_new0(struct iovec, size);
769     pages->offset = g_new0(ram_addr_t, size);
770 
771     return pages;
772 }
773 
774 static void multifd_pages_clear(MultiFDPages_t *pages)
775 {
776     pages->used = 0;
777     pages->allocated = 0;
778     pages->packet_num = 0;
779     pages->block = NULL;
780     g_free(pages->iov);
781     pages->iov = NULL;
782     g_free(pages->offset);
783     pages->offset = NULL;
784     g_free(pages);
785 }
786 
787 static void multifd_send_fill_packet(MultiFDSendParams *p)
788 {
789     MultiFDPacket_t *packet = p->packet;
790     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
791     int i;
792 
793     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
794     packet->version = cpu_to_be32(MULTIFD_VERSION);
795     packet->flags = cpu_to_be32(p->flags);
796     packet->pages_alloc = cpu_to_be32(page_max);
797     packet->pages_used = cpu_to_be32(p->pages->used);
798     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
799     packet->packet_num = cpu_to_be64(p->packet_num);
800 
801     if (p->pages->block) {
802         strncpy(packet->ramblock, p->pages->block->idstr, 256);
803     }
804 
805     for (i = 0; i < p->pages->used; i++) {
806         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
807     }
808 }
809 
810 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
811 {
812     MultiFDPacket_t *packet = p->packet;
813     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
814     RAMBlock *block;
815     int i;
816 
817     packet->magic = be32_to_cpu(packet->magic);
818     if (packet->magic != MULTIFD_MAGIC) {
819         error_setg(errp, "multifd: received packet "
820                    "magic %x and expected magic %x",
821                    packet->magic, MULTIFD_MAGIC);
822         return -1;
823     }
824 
825     packet->version = be32_to_cpu(packet->version);
826     if (packet->version != MULTIFD_VERSION) {
827         error_setg(errp, "multifd: received packet "
828                    "version %d and expected version %d",
829                    packet->version, MULTIFD_VERSION);
830         return -1;
831     }
832 
833     p->flags = be32_to_cpu(packet->flags);
834 
835     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
836     /*
837      * If we received a packet that is 100 times bigger than expected
838      * just stop migration.  It is a magic number.
839      */
840     if (packet->pages_alloc > pages_max * 100) {
841         error_setg(errp, "multifd: received packet "
842                    "with size %d and expected a maximum size of %d",
843                    packet->pages_alloc, pages_max * 100);
844         return -1;
845     }
846     /*
847      * We received a packet that is bigger than expected but inside
848      * reasonable limits (see previous comment).  Just reallocate.
849      */
850     if (packet->pages_alloc > p->pages->allocated) {
851         multifd_pages_clear(p->pages);
852         p->pages = multifd_pages_init(packet->pages_alloc);
853     }
854 
855     p->pages->used = be32_to_cpu(packet->pages_used);
856     if (p->pages->used > packet->pages_alloc) {
857         error_setg(errp, "multifd: received packet "
858                    "with %d pages and expected maximum pages are %d",
859                    p->pages->used, packet->pages_alloc) ;
860         return -1;
861     }
862 
863     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
864     p->packet_num = be64_to_cpu(packet->packet_num);
865 
866     if (p->pages->used) {
867         /* make sure that ramblock is 0 terminated */
868         packet->ramblock[255] = 0;
869         block = qemu_ram_block_by_name(packet->ramblock);
870         if (!block) {
871             error_setg(errp, "multifd: unknown ram block %s",
872                        packet->ramblock);
873             return -1;
874         }
875     }
876 
877     for (i = 0; i < p->pages->used; i++) {
878         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
879 
880         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
881             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
882                        " (max " RAM_ADDR_FMT ")",
883                        offset, block->used_length);
884             return -1;
885         }
886         p->pages->iov[i].iov_base = block->host + offset;
887         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
888     }
889 
890     return 0;
891 }
892 
893 struct {
894     MultiFDSendParams *params;
895     /* array of pages to send */
896     MultiFDPages_t *pages;
897     /* syncs main thread and channels */
898     QemuSemaphore sem_sync;
899     /* global number of generated multifd packets */
900     uint64_t packet_num;
901     /* send channels ready */
902     QemuSemaphore channels_ready;
903 } *multifd_send_state;
904 
905 /*
906  * How do we use multifd_send_state->pages and channel->pages?
907  *
908  * We create a "pages" struct for each channel, plus a main one.  Each
909  * time we need to send a batch of pages we swap the main one with the
910  * one of the channel that is going to send it.  There are two reasons
911  * for that:
912  *    - to avoid doing lots of mallocs during migration
913  *    - to make it easier to know what to free at the end of migration
914  *
915  * This way we always know who owns each "pages" struct, and we don't
916  * need any locking.  It belongs either to the migration thread or to
917  * the channel thread.  Switching is safe because the migration thread
918  * holds the channel mutex while changing it, and the channel thread
919  * must have finished with its own copy, otherwise pending_job could
920  * not be false.
921  */
922 
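/*
 * The exchange multifd_send_pages() performs under p->mutex boils down to:
 *
 *     MultiFDPages_t *filled = multifd_send_state->pages;
 *     multifd_send_state->pages = p->pages;    // channel's empty struct
 *     p->pages = filled;                       // channel now owns the data
 */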
923 static void multifd_send_pages(void)
924 {
925     int i;
926     static int next_channel;
927     MultiFDSendParams *p = NULL; /* make gcc happy */
928     MultiFDPages_t *pages = multifd_send_state->pages;
929     uint64_t transferred;
930 
931     qemu_sem_wait(&multifd_send_state->channels_ready);
932     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
933         p = &multifd_send_state->params[i];
934 
935         qemu_mutex_lock(&p->mutex);
936         if (!p->pending_job) {
937             p->pending_job++;
938             next_channel = (i + 1) % migrate_multifd_channels();
939             break;
940         }
941         qemu_mutex_unlock(&p->mutex);
942     }
943     p->pages->used = 0;
944 
945     p->packet_num = multifd_send_state->packet_num++;
946     p->pages->block = NULL;
947     multifd_send_state->pages = p->pages;
948     p->pages = pages;
949     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
950     ram_counters.multifd_bytes += transferred;
951     ram_counters.transferred += transferred;
952     qemu_mutex_unlock(&p->mutex);
953     qemu_sem_post(&p->sem);
954 }
955 
956 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
957 {
958     MultiFDPages_t *pages = multifd_send_state->pages;
959 
960     if (!pages->block) {
961         pages->block = block;
962     }
963 
964     if (pages->block == block) {
965         pages->offset[pages->used] = offset;
966         pages->iov[pages->used].iov_base = block->host + offset;
967         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
968         pages->used++;
969 
970         if (pages->used < pages->allocated) {
971             return;
972         }
973     }
974 
975     multifd_send_pages();
976 
977     if (pages->block != block) {
978         multifd_queue_page(block, offset);
979     }
980 }
981 
982 static void multifd_send_terminate_threads(Error *err)
983 {
984     int i;
985 
986     if (err) {
987         MigrationState *s = migrate_get_current();
988         migrate_set_error(s, err);
989         if (s->state == MIGRATION_STATUS_SETUP ||
990             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
991             s->state == MIGRATION_STATUS_DEVICE ||
992             s->state == MIGRATION_STATUS_ACTIVE) {
993             migrate_set_state(&s->state, s->state,
994                               MIGRATION_STATUS_FAILED);
995         }
996     }
997 
998     for (i = 0; i < migrate_multifd_channels(); i++) {
999         MultiFDSendParams *p = &multifd_send_state->params[i];
1000 
1001         qemu_mutex_lock(&p->mutex);
1002         p->quit = true;
1003         qemu_sem_post(&p->sem);
1004         qemu_mutex_unlock(&p->mutex);
1005     }
1006 }
1007 
1008 void multifd_save_cleanup(void)
1009 {
1010     int i;
1011 
1012     if (!migrate_use_multifd()) {
1013         return;
1014     }
1015     multifd_send_terminate_threads(NULL);
1016     for (i = 0; i < migrate_multifd_channels(); i++) {
1017         MultiFDSendParams *p = &multifd_send_state->params[i];
1018 
1019         if (p->running) {
1020             qemu_thread_join(&p->thread);
1021         }
1022         socket_send_channel_destroy(p->c);
1023         p->c = NULL;
1024         qemu_mutex_destroy(&p->mutex);
1025         qemu_sem_destroy(&p->sem);
1026         g_free(p->name);
1027         p->name = NULL;
1028         multifd_pages_clear(p->pages);
1029         p->pages = NULL;
1030         p->packet_len = 0;
1031         g_free(p->packet);
1032         p->packet = NULL;
1033     }
1034     qemu_sem_destroy(&multifd_send_state->channels_ready);
1035     qemu_sem_destroy(&multifd_send_state->sem_sync);
1036     g_free(multifd_send_state->params);
1037     multifd_send_state->params = NULL;
1038     multifd_pages_clear(multifd_send_state->pages);
1039     multifd_send_state->pages = NULL;
1040     g_free(multifd_send_state);
1041     multifd_send_state = NULL;
1042 }
1043 
1044 static void multifd_send_sync_main(void)
1045 {
1046     int i;
1047 
1048     if (!migrate_use_multifd()) {
1049         return;
1050     }
1051     if (multifd_send_state->pages->used) {
1052         multifd_send_pages();
1053     }
1054     for (i = 0; i < migrate_multifd_channels(); i++) {
1055         MultiFDSendParams *p = &multifd_send_state->params[i];
1056 
1057         trace_multifd_send_sync_main_signal(p->id);
1058 
1059         qemu_mutex_lock(&p->mutex);
1060 
1061         p->packet_num = multifd_send_state->packet_num++;
1062         p->flags |= MULTIFD_FLAG_SYNC;
1063         p->pending_job++;
1064         qemu_mutex_unlock(&p->mutex);
1065         qemu_sem_post(&p->sem);
1066     }
1067     for (i = 0; i < migrate_multifd_channels(); i++) {
1068         MultiFDSendParams *p = &multifd_send_state->params[i];
1069 
1070         trace_multifd_send_sync_main_wait(p->id);
1071         qemu_sem_wait(&multifd_send_state->sem_sync);
1072     }
1073     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1074 }
1075 
1076 static void *multifd_send_thread(void *opaque)
1077 {
1078     MultiFDSendParams *p = opaque;
1079     Error *local_err = NULL;
1080     int ret;
1081 
1082     trace_multifd_send_thread_start(p->id);
1083     rcu_register_thread();
1084 
1085     if (multifd_send_initial_packet(p, &local_err) < 0) {
1086         goto out;
1087     }
1088     /* initial packet */
1089     p->num_packets = 1;
1090 
1091     while (true) {
1092         qemu_sem_wait(&p->sem);
1093         qemu_mutex_lock(&p->mutex);
1094 
1095         if (p->pending_job) {
1096             uint32_t used = p->pages->used;
1097             uint64_t packet_num = p->packet_num;
1098             uint32_t flags = p->flags;
1099 
1100             p->next_packet_size = used * qemu_target_page_size();
1101             multifd_send_fill_packet(p);
1102             p->flags = 0;
1103             p->num_packets++;
1104             p->num_pages += used;
1105             p->pages->used = 0;
1106             qemu_mutex_unlock(&p->mutex);
1107 
1108             trace_multifd_send(p->id, packet_num, used, flags,
1109                                p->next_packet_size);
1110 
1111             ret = qio_channel_write_all(p->c, (void *)p->packet,
1112                                         p->packet_len, &local_err);
1113             if (ret != 0) {
1114                 break;
1115             }
1116 
1117             if (used) {
1118                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1119                                              used, &local_err);
1120                 if (ret != 0) {
1121                     break;
1122                 }
1123             }
1124 
1125             qemu_mutex_lock(&p->mutex);
1126             p->pending_job--;
1127             qemu_mutex_unlock(&p->mutex);
1128 
1129             if (flags & MULTIFD_FLAG_SYNC) {
1130                 qemu_sem_post(&multifd_send_state->sem_sync);
1131             }
1132             qemu_sem_post(&multifd_send_state->channels_ready);
1133         } else if (p->quit) {
1134             qemu_mutex_unlock(&p->mutex);
1135             break;
1136         } else {
1137             qemu_mutex_unlock(&p->mutex);
1138             /* sometimes there are spurious wakeups */
1139         }
1140     }
1141 
1142 out:
1143     if (local_err) {
1144         multifd_send_terminate_threads(local_err);
1145     }
1146 
1147     qemu_mutex_lock(&p->mutex);
1148     p->running = false;
1149     qemu_mutex_unlock(&p->mutex);
1150 
1151     rcu_unregister_thread();
1152     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1153 
1154     return NULL;
1155 }
1156 
1157 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1158 {
1159     MultiFDSendParams *p = opaque;
1160     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1161     Error *local_err = NULL;
1162 
1163     if (qio_task_propagate_error(task, &local_err)) {
1164         migrate_set_error(migrate_get_current(), local_err);
1165         multifd_save_cleanup();
1166     } else {
1167         p->c = QIO_CHANNEL(sioc);
1168         qio_channel_set_delay(p->c, false);
1169         p->running = true;
1170         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1171                            QEMU_THREAD_JOINABLE);
1172     }
1173 }
1174 
1175 int multifd_save_setup(void)
1176 {
1177     int thread_count;
1178     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1179     uint8_t i;
1180 
1181     if (!migrate_use_multifd()) {
1182         return 0;
1183     }
1184     thread_count = migrate_multifd_channels();
1185     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1186     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1187     multifd_send_state->pages = multifd_pages_init(page_count);
1188     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1189     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1190 
1191     for (i = 0; i < thread_count; i++) {
1192         MultiFDSendParams *p = &multifd_send_state->params[i];
1193 
1194         qemu_mutex_init(&p->mutex);
1195         qemu_sem_init(&p->sem, 0);
1196         p->quit = false;
1197         p->pending_job = 0;
1198         p->id = i;
1199         p->pages = multifd_pages_init(page_count);
1200         p->packet_len = sizeof(MultiFDPacket_t)
1201                       + sizeof(ram_addr_t) * page_count;
1202         p->packet = g_malloc0(p->packet_len);
1203         p->name = g_strdup_printf("multifdsend_%d", i);
1204         socket_send_channel_create(multifd_new_send_channel_async, p);
1205     }
1206     return 0;
1207 }
1208 
1209 struct {
1210     MultiFDRecvParams *params;
1211     /* number of created threads */
1212     int count;
1213     /* syncs main thread and channels */
1214     QemuSemaphore sem_sync;
1215     /* global number of generated multifd packets */
1216     uint64_t packet_num;
1217 } *multifd_recv_state;
1218 
1219 static void multifd_recv_terminate_threads(Error *err)
1220 {
1221     int i;
1222 
1223     if (err) {
1224         MigrationState *s = migrate_get_current();
1225         migrate_set_error(s, err);
1226         if (s->state == MIGRATION_STATUS_SETUP ||
1227             s->state == MIGRATION_STATUS_ACTIVE) {
1228             migrate_set_state(&s->state, s->state,
1229                               MIGRATION_STATUS_FAILED);
1230         }
1231     }
1232 
1233     for (i = 0; i < migrate_multifd_channels(); i++) {
1234         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1235 
1236         qemu_mutex_lock(&p->mutex);
1237         /* We could arrive here for two reasons:
1238            - normal quit, i.e. everything went fine, just finished
1239            - error quit: We close the channels so the channel threads
1240              finish the qio_channel_read_all_eof() */
1241         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1242         qemu_mutex_unlock(&p->mutex);
1243     }
1244 }
1245 
1246 int multifd_load_cleanup(Error **errp)
1247 {
1248     int i;
1249     int ret = 0;
1250 
1251     if (!migrate_use_multifd()) {
1252         return 0;
1253     }
1254     multifd_recv_terminate_threads(NULL);
1255     for (i = 0; i < migrate_multifd_channels(); i++) {
1256         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1257 
1258         if (p->running) {
1259             qemu_thread_join(&p->thread);
1260         }
1261         object_unref(OBJECT(p->c));
1262         p->c = NULL;
1263         qemu_mutex_destroy(&p->mutex);
1264         qemu_sem_destroy(&p->sem_sync);
1265         g_free(p->name);
1266         p->name = NULL;
1267         multifd_pages_clear(p->pages);
1268         p->pages = NULL;
1269         p->packet_len = 0;
1270         g_free(p->packet);
1271         p->packet = NULL;
1272     }
1273     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1274     g_free(multifd_recv_state->params);
1275     multifd_recv_state->params = NULL;
1276     g_free(multifd_recv_state);
1277     multifd_recv_state = NULL;
1278 
1279     return ret;
1280 }
1281 
1282 static void multifd_recv_sync_main(void)
1283 {
1284     int i;
1285 
1286     if (!migrate_use_multifd()) {
1287         return;
1288     }
1289     for (i = 0; i < migrate_multifd_channels(); i++) {
1290         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1291 
1292         trace_multifd_recv_sync_main_wait(p->id);
1293         qemu_sem_wait(&multifd_recv_state->sem_sync);
1294     }
1295     for (i = 0; i < migrate_multifd_channels(); i++) {
1296         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1297 
1298         qemu_mutex_lock(&p->mutex);
1299         if (multifd_recv_state->packet_num < p->packet_num) {
1300             multifd_recv_state->packet_num = p->packet_num;
1301         }
1302         qemu_mutex_unlock(&p->mutex);
1303         trace_multifd_recv_sync_main_signal(p->id);
1304         qemu_sem_post(&p->sem_sync);
1305     }
1306     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1307 }
1308 
1309 static void *multifd_recv_thread(void *opaque)
1310 {
1311     MultiFDRecvParams *p = opaque;
1312     Error *local_err = NULL;
1313     int ret;
1314 
1315     trace_multifd_recv_thread_start(p->id);
1316     rcu_register_thread();
1317 
1318     while (true) {
1319         uint32_t used;
1320         uint32_t flags;
1321 
1322         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1323                                        p->packet_len, &local_err);
1324         if (ret == 0) {   /* EOF */
1325             break;
1326         }
1327         if (ret == -1) {   /* Error */
1328             break;
1329         }
1330 
1331         qemu_mutex_lock(&p->mutex);
1332         ret = multifd_recv_unfill_packet(p, &local_err);
1333         if (ret) {
1334             qemu_mutex_unlock(&p->mutex);
1335             break;
1336         }
1337 
1338         used = p->pages->used;
1339         flags = p->flags;
1340         trace_multifd_recv(p->id, p->packet_num, used, flags,
1341                            p->next_packet_size);
1342         p->num_packets++;
1343         p->num_pages += used;
1344         qemu_mutex_unlock(&p->mutex);
1345 
1346         if (used) {
1347             ret = qio_channel_readv_all(p->c, p->pages->iov,
1348                                         used, &local_err);
1349             if (ret != 0) {
1350                 break;
1351             }
1352         }
1353 
1354         if (flags & MULTIFD_FLAG_SYNC) {
1355             qemu_sem_post(&multifd_recv_state->sem_sync);
1356             qemu_sem_wait(&p->sem_sync);
1357         }
1358     }
1359 
1360     if (local_err) {
1361         multifd_recv_terminate_threads(local_err);
1362     }
1363     qemu_mutex_lock(&p->mutex);
1364     p->running = false;
1365     qemu_mutex_unlock(&p->mutex);
1366 
1367     rcu_unregister_thread();
1368     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1369 
1370     return NULL;
1371 }
1372 
1373 int multifd_load_setup(void)
1374 {
1375     int thread_count;
1376     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1377     uint8_t i;
1378 
1379     if (!migrate_use_multifd()) {
1380         return 0;
1381     }
1382     thread_count = migrate_multifd_channels();
1383     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1384     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1385     atomic_set(&multifd_recv_state->count, 0);
1386     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1387 
1388     for (i = 0; i < thread_count; i++) {
1389         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1390 
1391         qemu_mutex_init(&p->mutex);
1392         qemu_sem_init(&p->sem_sync, 0);
1393         p->id = i;
1394         p->pages = multifd_pages_init(page_count);
1395         p->packet_len = sizeof(MultiFDPacket_t)
1396                       + sizeof(ram_addr_t) * page_count;
1397         p->packet = g_malloc0(p->packet_len);
1398         p->name = g_strdup_printf("multifdrecv_%d", i);
1399     }
1400     return 0;
1401 }
1402 
1403 bool multifd_recv_all_channels_created(void)
1404 {
1405     int thread_count = migrate_multifd_channels();
1406 
1407     if (!migrate_use_multifd()) {
1408         return true;
1409     }
1410 
1411     return thread_count == atomic_read(&multifd_recv_state->count);
1412 }
1413 
1414 /*
1415  * Try to receive all multifd channels to get ready for the migration.
1416  * - Return true and do not set @errp when correctly receiving all channels;
1417  * - Return false and do not set @errp when correctly receiving the current one;
1418  * - Return false and set @errp when failing to receive the current channel.
1419  */
1420 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1421 {
1422     MultiFDRecvParams *p;
1423     Error *local_err = NULL;
1424     int id;
1425 
1426     id = multifd_recv_initial_packet(ioc, &local_err);
1427     if (id < 0) {
1428         multifd_recv_terminate_threads(local_err);
1429         error_propagate_prepend(errp, local_err,
1430                                 "failed to receive packet"
1431                                 " via multifd channel %d: ",
1432                                 atomic_read(&multifd_recv_state->count));
1433         return false;
1434     }
1435 
1436     p = &multifd_recv_state->params[id];
1437     if (p->c != NULL) {
1438         error_setg(&local_err, "multifd: received id '%d' already setup",
1439                    id);
1440         multifd_recv_terminate_threads(local_err);
1441         error_propagate(errp, local_err);
1442         return false;
1443     }
1444     p->c = ioc;
1445     object_ref(OBJECT(ioc));
1446     /* initial packet */
1447     p->num_packets = 1;
1448 
1449     p->running = true;
1450     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1451                        QEMU_THREAD_JOINABLE);
1452     atomic_inc(&multifd_recv_state->count);
1453     return atomic_read(&multifd_recv_state->count) ==
1454            migrate_multifd_channels();
1455 }
1456 
1457 /**
1458  * save_page_header: write page header to wire
1459  *
1460  * If this is the 1st block, it also writes the block identification
1461  *
1462  * Returns the number of bytes written
1463  *
1464  * @rs: current RAM state
1465  * @f: QEMUFile where to send the data
1466  * @block: block that contains the page we want to send
1467  * @offset: offset inside the block for the page; the low bits hold flags
1468  */
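/*
 * Resulting header layout:
 *
 *     be64: offset | flags               (always)
 *     u8:   strlen(block->idstr)         (only if RAM_SAVE_FLAG_CONTINUE is clear)
 *     u8[]: block->idstr                 (only if RAM_SAVE_FLAG_CONTINUE is clear)
 */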
1469 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1470                                ram_addr_t offset)
1471 {
1472     size_t size, len;
1473 
1474     if (block == rs->last_sent_block) {
1475         offset |= RAM_SAVE_FLAG_CONTINUE;
1476     }
1477     qemu_put_be64(f, offset);
1478     size = 8;
1479 
1480     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1481         len = strlen(block->idstr);
1482         qemu_put_byte(f, len);
1483         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1484         size += 1 + len;
1485         rs->last_sent_block = block;
1486     }
1487     return size;
1488 }
1489 
1490 /**
1491  * mig_throttle_guest_down: throttle down the guest
1492  *
1493  * Reduce amount of guest cpu execution to hopefully slow down memory
1494  * writes. If guest dirty memory rate is reduced below the rate at
1495  * which we can transfer pages to the destination then we should be
1496  * able to complete migration. Some workloads dirty memory way too
1497  * fast and will not effectively converge, even with auto-converge.
1498  */
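/*
 * Example, assuming the default parameters (cpu_throttle_initial=20,
 * cpu_throttle_increment=10, max_cpu_throttle=99): successive calls
 * throttle the vCPUs at 20%, then 30%, 40%, ... capped at 99%.
 */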
1499 static void mig_throttle_guest_down(void)
1500 {
1501     MigrationState *s = migrate_get_current();
1502     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1503     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
1504     int pct_max = s->parameters.max_cpu_throttle;
1505 
1506     /* We have not started throttling yet. Let's start it. */
1507     if (!cpu_throttle_active()) {
1508         cpu_throttle_set(pct_initial);
1509     } else {
1510         /* Throttling already on, just increase the rate */
1511         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
1512                          pct_max));
1513     }
1514 }
1515 
1516 /**
1517  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1518  *
1519  * @rs: current RAM state
1520  * @current_addr: address for the zero page
1521  *
1522  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1523  * The important thing is that a stale (not-yet-0'd) page be replaced
1524  * by the new data.
1525  * As a bonus, if the page wasn't in the cache it gets added so that
1526  * when a small write is made into the 0'd page it gets XBZRLE sent.
1527  */
1528 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1529 {
1530     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1531         return;
1532     }
1533 
1534     /* We don't care if this fails to allocate a new cache page
1535      * as long as it updates an old one */
1536     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1537                  ram_counters.dirty_sync_count);
1538 }
1539 
1540 #define ENCODING_FLAG_XBZRLE 0x1
1541 
1542 /**
1543  * save_xbzrle_page: compress and send current page
1544  *
1545  * Returns: 1 means that we wrote the page
1546  *          0 means that page is identical to the one already sent
1547  *          -1 means that xbzrle would be longer than normal
1548  *
1549  * @rs: current RAM state
1550  * @current_data: pointer to the address of the page contents
1551  * @current_addr: addr of the page
1552  * @block: block that contains the page we want to send
1553  * @offset: offset inside the block for the page
1554  * @last_stage: if we are at the completion stage
1555  */
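/*
 * On the wire, an XBZRLE page is:
 *
 *     save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *     u8:   ENCODING_FLAG_XBZRLE
 *     be16: encoded_len
 *     u8[]: encoded_buf                  (encoded_len bytes)
 */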
1556 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1557                             ram_addr_t current_addr, RAMBlock *block,
1558                             ram_addr_t offset, bool last_stage)
1559 {
1560     int encoded_len = 0, bytes_xbzrle;
1561     uint8_t *prev_cached_page;
1562 
1563     if (!cache_is_cached(XBZRLE.cache, current_addr,
1564                          ram_counters.dirty_sync_count)) {
1565         xbzrle_counters.cache_miss++;
1566         if (!last_stage) {
1567             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1568                              ram_counters.dirty_sync_count) == -1) {
1569                 return -1;
1570             } else {
1571                 /* update *current_data when the page has been
1572                    inserted into cache */
1573                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1574             }
1575         }
1576         return -1;
1577     }
1578 
1579     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1580 
1581     /* save current buffer into memory */
1582     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1583 
1584     /* XBZRLE encoding (if there is no overflow) */
1585     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1586                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1587                                        TARGET_PAGE_SIZE);
1588 
1589     /*
1590      * Update the cache contents, so that it corresponds to the data
1591      * sent, in all cases except where we skip the page.
1592      */
1593     if (!last_stage && encoded_len != 0) {
1594         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1595         /*
1596          * In the case where we couldn't compress, ensure that the caller
1597          * sends the data from the cache, since the guest might have
1598          * changed the RAM since we copied it.
1599          */
1600         *current_data = prev_cached_page;
1601     }
1602 
1603     if (encoded_len == 0) {
1604         trace_save_xbzrle_page_skipping();
1605         return 0;
1606     } else if (encoded_len == -1) {
1607         trace_save_xbzrle_page_overflow();
1608         xbzrle_counters.overflow++;
1609         return -1;
1610     }
1611 
1612     /* Send XBZRLE based compressed page */
1613     bytes_xbzrle = save_page_header(rs, rs->f, block,
1614                                     offset | RAM_SAVE_FLAG_XBZRLE);
1615     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1616     qemu_put_be16(rs->f, encoded_len);
1617     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1618     bytes_xbzrle += encoded_len + 1 + 2;
1619     xbzrle_counters.pages++;
1620     xbzrle_counters.bytes += bytes_xbzrle;
1621     ram_counters.transferred += bytes_xbzrle;
1622 
1623     return 1;
1624 }
1625 
1626 /**
1627  * migration_bitmap_find_dirty: find the next dirty page from start
1628  *
1629  * Returns the page offset within memory region of the start of a dirty page
1630  *
1631  * @rs: current RAM state
1632  * @rb: RAMBlock where to search for dirty pages
1633  * @start: page where we start the search
1634  */
1635 static inline
1636 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1637                                           unsigned long start)
1638 {
1639     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1640     unsigned long *bitmap = rb->bmap;
1641     unsigned long next;
1642 
1643     if (ramblock_is_ignored(rb)) {
1644         return size;
1645     }
1646 
1647     /*
1648      * When the free page optimization is enabled, we need to check the bitmap
1649      * to send the non-free pages rather than all the pages in the bulk stage.
1650      */
1651     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1652         next = start + 1;
1653     } else {
1654         next = find_next_bit(bitmap, size, start);
1655     }
1656 
1657     return next;
1658 }
1659 
1660 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1661                                                 RAMBlock *rb,
1662                                                 unsigned long page)
1663 {
1664     bool ret;
1665 
1666     qemu_mutex_lock(&rs->bitmap_mutex);
1667 
1668     /*
1669      * Clear the dirty bitmap if needed.  This _must_ be called before we
1670      * send any of the pages in the chunk, because we need to make sure we
1671      * can capture further page content changes when we sync the dirty log
1672      * next time.  So as long as we are going to send any of the pages in
1673      * the chunk, we clear the remote dirty bitmap for all of them.
1674      * Clearing it earlier won't be a problem, but clearing it too late will.
1675      */
1676     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1677         uint8_t shift = rb->clear_bmap_shift;
1678         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1679         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1680 
1681         /*
1682          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It can
1683          * make things easier sometimes, since the start address of each
1684          * small chunk will then always be aligned to 64 pages, so the
1685          * bitmap will always be aligned to an unsigned long.  We should
1686          * even be able to remove this restriction, but it is simply kept
1687          * here for now.
1688          */
1689         assert(shift >= 6);
1690         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1691         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1692     }
1693 
1694     ret = test_and_clear_bit(page, rb->bmap);
1695 
1696     if (ret) {
1697         rs->migration_dirty_pages--;
1698     }
1699     qemu_mutex_unlock(&rs->bitmap_mutex);
1700 
1701     return ret;
1702 }
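
/*
 * Rough arithmetic for the chunked clearing above, assuming a 4KiB
 * target page (TARGET_PAGE_BITS == 12): with the minimum shift of 6
 * one clear_bmap bit covers 1 << (12 + 6) = 256KiB of guest memory,
 * while a shift of, say, 18 covers 1 << (12 + 18) = 1GiB.  A larger
 * shift therefore means fewer but larger dirty-log clear operations.
 */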
1703 
1704 /* Called with RCU critical section */
1705 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1706                                         ram_addr_t length)
1707 {
1708     rs->migration_dirty_pages +=
1709         cpu_physical_memory_sync_dirty_bitmap(rb, 0, length,
1710                                               &rs->num_dirty_pages_period);
1711 }
1712 
1713 /**
1714  * ram_pagesize_summary: calculate all the pagesizes of a VM
1715  *
1716  * Returns a summary bitmap of the page sizes of all RAMBlocks
1717  *
1718  * For VMs with just normal pages this is equivalent to the host page
1719  * size. If it has some huge pages then it's the OR of all the
1720  * different page sizes.
1721  */
1722 uint64_t ram_pagesize_summary(void)
1723 {
1724     RAMBlock *block;
1725     uint64_t summary = 0;
1726 
1727     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1728         summary |= block->page_size;
1729     }
1730 
1731     return summary;
1732 }
1733 
1734 uint64_t ram_get_total_transferred_pages(void)
1735 {
1736     return  ram_counters.normal + ram_counters.duplicate +
1737                 compression_counters.pages + xbzrle_counters.pages;
1738 }
1739 
1740 static void migration_update_rates(RAMState *rs, int64_t end_time)
1741 {
1742     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1743     double compressed_size;
1744 
1745     /* calculate period counters */
1746     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1747                 / (end_time - rs->time_last_bitmap_sync);
1748 
1749     if (!page_count) {
1750         return;
1751     }
1752 
1753     if (migrate_use_xbzrle()) {
1754         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1755             rs->xbzrle_cache_miss_prev) / page_count;
1756         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1757     }
1758 
1759     if (migrate_use_compression()) {
1760         compression_counters.busy_rate = (double)(compression_counters.busy -
1761             rs->compress_thread_busy_prev) / page_count;
1762         rs->compress_thread_busy_prev = compression_counters.busy;
1763 
1764         compressed_size = compression_counters.compressed_size -
1765                           rs->compressed_size_prev;
1766         if (compressed_size) {
1767             double uncompressed_size = (compression_counters.pages -
1768                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1769 
1770             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1771             compression_counters.compression_rate =
1772                                         uncompressed_size / compressed_size;
1773 
1774             rs->compress_pages_prev = compression_counters.pages;
1775             rs->compressed_size_prev = compression_counters.compressed_size;
1776         }
1777     }
1778 }
1779 
1780 static void migration_bitmap_sync(RAMState *rs)
1781 {
1782     RAMBlock *block;
1783     int64_t end_time;
1784     uint64_t bytes_xfer_now;
1785 
1786     ram_counters.dirty_sync_count++;
1787 
1788     if (!rs->time_last_bitmap_sync) {
1789         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1790     }
1791 
1792     trace_migration_bitmap_sync_start();
1793     memory_global_dirty_log_sync();
1794 
1795     qemu_mutex_lock(&rs->bitmap_mutex);
1796     rcu_read_lock();
1797     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1798         migration_bitmap_sync_range(rs, block, block->used_length);
1799     }
1800     ram_counters.remaining = ram_bytes_remaining();
1801     rcu_read_unlock();
1802     qemu_mutex_unlock(&rs->bitmap_mutex);
1803 
1804     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1805 
1806     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1807 
1808     /* more than 1 second = 1000 milliseconds */
1809     if (end_time > rs->time_last_bitmap_sync + 1000) {
1810         bytes_xfer_now = ram_counters.transferred;
1811 
1812         /* During block migration the auto-converge logic incorrectly detects
1813          * that ram migration makes no progress. Avoid this by disabling the
1814          * throttling logic during the bulk phase of block migration. */
1815         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1816             /* The following detection logic can be refined later. For now:
1817                Check whether the bytes dirtied in this period exceed half of
1818                the approximate number of bytes transferred since the last time
1819                we were in this routine. If that happens twice in a row, start or increase
1820                throttling */
1821 
1822             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1823                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1824                 (++rs->dirty_rate_high_cnt >= 2)) {
1825                     trace_migration_throttle();
1826                     rs->dirty_rate_high_cnt = 0;
1827                     mig_throttle_guest_down();
1828             }
1829         }
1830 
1831         migration_update_rates(rs, end_time);
1832 
1833         rs->target_page_count_prev = rs->target_page_count;
1834 
1835         /* reset period counters */
1836         rs->time_last_bitmap_sync = end_time;
1837         rs->num_dirty_pages_period = 0;
1838         rs->bytes_xfer_prev = bytes_xfer_now;
1839     }
1840     if (migrate_use_events()) {
1841         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1842     }
1843 }
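
/*
 * Illustrative numbers for the auto-converge check above: if roughly
 * 100MB were transferred during the last sync period but the guest
 * dirtied more than 50MB (i.e. more than half of what was sent) in
 * the same period, the detection fires; on the second consecutive hit
 * mig_throttle_guest_down() is called to slow the guest down.
 */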
1844 
1845 static void migration_bitmap_sync_precopy(RAMState *rs)
1846 {
1847     Error *local_err = NULL;
1848 
1849     /*
1850      * The current notifier usage is just an optimization for migration, so we
1851      * don't stop the normal migration process in the error case.
1852      */
1853     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1854         error_report_err(local_err);
1855     }
1856 
1857     migration_bitmap_sync(rs);
1858 
1859     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1860         error_report_err(local_err);
1861     }
1862 }
1863 
1864 /**
1865  * save_zero_page_to_file: send the zero page to the file
1866  *
1867  * Returns the size of the data written to the file; 0 means the page is
1868  * not a zero page
1869  *
1870  * @rs: current RAM state
1871  * @file: the file where the data is saved
1872  * @block: block that contains the page we want to send
1873  * @offset: offset inside the block for the page
1874  */
1875 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1876                                   RAMBlock *block, ram_addr_t offset)
1877 {
1878     uint8_t *p = block->host + offset;
1879     int len = 0;
1880 
1881     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1882         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1883         qemu_put_byte(file, 0);
1884         len += 1;
1885     }
1886     return len;
1887 }
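
/*
 * On the wire a zero page is therefore just the page header written by
 * save_page_header() (a be64 of offset | RAM_SAVE_FLAG_ZERO, plus the
 * block idstr unless RAM_SAVE_FLAG_CONTINUE is set) followed by a
 * single 0 byte; the page contents themselves are never transmitted.
 */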
1888 
1889 /**
1890  * save_zero_page: send the zero page to the stream
1891  *
1892  * Returns the number of pages written.
1893  *
1894  * @rs: current RAM state
1895  * @block: block that contains the page we want to send
1896  * @offset: offset inside the block for the page
1897  */
1898 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1899 {
1900     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1901 
1902     if (len) {
1903         ram_counters.duplicate++;
1904         ram_counters.transferred += len;
1905         return 1;
1906     }
1907     return -1;
1908 }
1909 
1910 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1911 {
1912     if (!migrate_release_ram() || !migration_in_postcopy()) {
1913         return;
1914     }
1915 
1916     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1917 }
1918 
1919 /*
1920  * @pages: the number of pages written by the control path,
1921  *        < 0 - error
1922  *        > 0 - number of pages written
1923  *
1924  * Return true if the page has been saved, otherwise false is returned.
1925  */
1926 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1927                               int *pages)
1928 {
1929     uint64_t bytes_xmit = 0;
1930     int ret;
1931 
1932     *pages = -1;
1933     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1934                                 &bytes_xmit);
1935     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1936         return false;
1937     }
1938 
1939     if (bytes_xmit) {
1940         ram_counters.transferred += bytes_xmit;
1941         *pages = 1;
1942     }
1943 
1944     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1945         return true;
1946     }
1947 
1948     if (bytes_xmit > 0) {
1949         ram_counters.normal++;
1950     } else if (bytes_xmit == 0) {
1951         ram_counters.duplicate++;
1952     }
1953 
1954     return true;
1955 }
1956 
1957 /*
1958  * directly send the page to the stream
1959  *
1960  * Returns the number of pages written.
1961  *
1962  * @rs: current RAM state
1963  * @block: block that contains the page we want to send
1964  * @offset: offset inside the block for the page
1965  * @buf: the page to be sent
1966  * @async: send the page asynchronously
1967  */
1968 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1969                             uint8_t *buf, bool async)
1970 {
1971     ram_counters.transferred += save_page_header(rs, rs->f, block,
1972                                                  offset | RAM_SAVE_FLAG_PAGE);
1973     if (async) {
1974         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1975                               migrate_release_ram() &&
1976                               migration_in_postcopy());
1977     } else {
1978         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1979     }
1980     ram_counters.transferred += TARGET_PAGE_SIZE;
1981     ram_counters.normal++;
1982     return 1;
1983 }
1984 
1985 /**
1986  * ram_save_page: send the given page to the stream
1987  *
1988  * Returns the number of pages written.
1989  *          < 0 - error
1990  *          >=0 - Number of pages written - this might legally be 0
1991  *                if xbzrle noticed the page was the same.
1992  *
1993  * @rs: current RAM state
1994  * @block: block that contains the page we want to send
1995  * @offset: offset inside the block for the page
1996  * @last_stage: if we are at the completion stage
1997  */
1998 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1999 {
2000     int pages = -1;
2001     uint8_t *p;
2002     bool send_async = true;
2003     RAMBlock *block = pss->block;
2004     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2005     ram_addr_t current_addr = block->offset + offset;
2006 
2007     p = block->host + offset;
2008     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2009 
2010     XBZRLE_cache_lock();
2011     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2012         migrate_use_xbzrle()) {
2013         pages = save_xbzrle_page(rs, &p, current_addr, block,
2014                                  offset, last_stage);
2015         if (!last_stage) {
2016             /* Can't send this cached data async, since the cache page
2017              * might get updated before it gets to the wire
2018              */
2019             send_async = false;
2020         }
2021     }
2022 
2023     /* XBZRLE overflow or normal page */
2024     if (pages == -1) {
2025         pages = save_normal_page(rs, block, offset, p, send_async);
2026     }
2027 
2028     XBZRLE_cache_unlock();
2029 
2030     return pages;
2031 }
2032 
2033 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2034                                  ram_addr_t offset)
2035 {
2036     multifd_queue_page(block, offset);
2037     ram_counters.normal++;
2038 
2039     return 1;
2040 }
2041 
2042 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2043                                  ram_addr_t offset, uint8_t *source_buf)
2044 {
2045     RAMState *rs = ram_state;
2046     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2047     bool zero_page = false;
2048     int ret;
2049 
2050     if (save_zero_page_to_file(rs, f, block, offset)) {
2051         zero_page = true;
2052         goto exit;
2053     }
2054 
2055     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2056 
2057     /*
2058      * Copy it to an internal buffer to avoid it being modified by the VM,
2059      * so that we can catch any error during compression and
2060      * decompression.
2061      */
2062     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2063     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2064     if (ret < 0) {
2065         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2066         error_report("compressed data failed!");
2067         return false;
2068     }
2069 
2070 exit:
2071     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2072     return zero_page;
2073 }
2074 
2075 static void
2076 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2077 {
2078     ram_counters.transferred += bytes_xmit;
2079 
2080     if (param->zero_page) {
2081         ram_counters.duplicate++;
2082         return;
2083     }
2084 
2085     /* 8 is the size of the page header when RAM_SAVE_FLAG_CONTINUE is set. */
2086     compression_counters.compressed_size += bytes_xmit - 8;
2087     compression_counters.pages++;
2088 }
2089 
2090 static bool save_page_use_compression(RAMState *rs);
2091 
2092 static void flush_compressed_data(RAMState *rs)
2093 {
2094     int idx, len, thread_count;
2095 
2096     if (!save_page_use_compression(rs)) {
2097         return;
2098     }
2099     thread_count = migrate_compress_threads();
2100 
2101     qemu_mutex_lock(&comp_done_lock);
2102     for (idx = 0; idx < thread_count; idx++) {
2103         while (!comp_param[idx].done) {
2104             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2105         }
2106     }
2107     qemu_mutex_unlock(&comp_done_lock);
2108 
2109     for (idx = 0; idx < thread_count; idx++) {
2110         qemu_mutex_lock(&comp_param[idx].mutex);
2111         if (!comp_param[idx].quit) {
2112             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2113             /*
2114              * it's safe to fetch zero_page without holding comp_done_lock
2115              * as there is no further request submitted to the thread,
2116              * i.e., the thread should be waiting for a request at this point.
2117              */
2118             update_compress_thread_counts(&comp_param[idx], len);
2119         }
2120         qemu_mutex_unlock(&comp_param[idx].mutex);
2121     }
2122 }
2123 
2124 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2125                                        ram_addr_t offset)
2126 {
2127     param->block = block;
2128     param->offset = offset;
2129 }
2130 
2131 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2132                                            ram_addr_t offset)
2133 {
2134     int idx, thread_count, bytes_xmit = -1, pages = -1;
2135     bool wait = migrate_compress_wait_thread();
2136 
2137     thread_count = migrate_compress_threads();
2138     qemu_mutex_lock(&comp_done_lock);
2139 retry:
2140     for (idx = 0; idx < thread_count; idx++) {
2141         if (comp_param[idx].done) {
2142             comp_param[idx].done = false;
2143             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2144             qemu_mutex_lock(&comp_param[idx].mutex);
2145             set_compress_params(&comp_param[idx], block, offset);
2146             qemu_cond_signal(&comp_param[idx].cond);
2147             qemu_mutex_unlock(&comp_param[idx].mutex);
2148             pages = 1;
2149             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2150             break;
2151         }
2152     }
2153 
2154     /*
2155      * wait for the free thread if the user specifies 'compress-wait-thread',
2156      * otherwise we will post the page out in the main thread as a normal page.
2157      */
2158     if (pages < 0 && wait) {
2159         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2160         goto retry;
2161     }
2162     qemu_mutex_unlock(&comp_done_lock);
2163 
2164     return pages;
2165 }
2166 
2167 /**
2168  * find_dirty_block: find the next dirty page and update any state
2169  * associated with the search process.
2170  *
2171  * Returns true if a page is found
2172  *
2173  * @rs: current RAM state
2174  * @pss: data about the state of the current dirty page scan
2175  * @again: set to false if the search has scanned the whole of RAM
2176  */
2177 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2178 {
2179     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2180     if (pss->complete_round && pss->block == rs->last_seen_block &&
2181         pss->page >= rs->last_page) {
2182         /*
2183          * We've been once around the RAM and haven't found anything.
2184          * Give up.
2185          */
2186         *again = false;
2187         return false;
2188     }
2189     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2190         /* Didn't find anything in this RAM Block */
2191         pss->page = 0;
2192         pss->block = QLIST_NEXT_RCU(pss->block, next);
2193         if (!pss->block) {
2194             /*
2195              * If memory migration starts over, we will meet a dirtied page
2196              * which may still exist in the compression threads' ring, so we
2197              * should flush the compressed data to make sure the new page
2198              * is not overwritten by the old one in the destination.
2199              *
2200              * Also, if xbzrle is on, stop using the data compression at this
2201              * point. In theory, xbzrle can do better than compression.
2202              */
2203             flush_compressed_data(rs);
2204 
2205             /* Hit the end of the list */
2206             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2207             /* Flag that we've looped */
2208             pss->complete_round = true;
2209             rs->ram_bulk_stage = false;
2210         }
2211         /* Didn't find anything this time, but try again on the new block */
2212         *again = true;
2213         return false;
2214     } else {
2215         /* Can go around again, but... */
2216         *again = true;
2217         /* We've found something so probably don't need to */
2218         return true;
2219     }
2220 }
2221 
2222 /**
2223  * unqueue_page: gets a page off the queue
2224  *
2225  * Helper for 'get_queued_page' - gets a page off the queue
2226  *
2227  * Returns the block of the page (or NULL if none available)
2228  *
2229  * @rs: current RAM state
2230  * @offset: used to return the offset within the RAMBlock
2231  */
2232 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2233 {
2234     RAMBlock *block = NULL;
2235 
2236     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2237         return NULL;
2238     }
2239 
2240     qemu_mutex_lock(&rs->src_page_req_mutex);
2241     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2242         struct RAMSrcPageRequest *entry =
2243                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2244         block = entry->rb;
2245         *offset = entry->offset;
2246 
2247         if (entry->len > TARGET_PAGE_SIZE) {
2248             entry->len -= TARGET_PAGE_SIZE;
2249             entry->offset += TARGET_PAGE_SIZE;
2250         } else {
2251             memory_region_unref(block->mr);
2252             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2253             g_free(entry);
2254             migration_consume_urgent_request();
2255         }
2256     }
2257     qemu_mutex_unlock(&rs->src_page_req_mutex);
2258 
2259     return block;
2260 }
2261 
2262 /**
2263  * get_queued_page: unqueue a page from the postcopy requests
2264  *
2265  * Skips pages that are already sent (!dirty)
2266  *
2267  * Returns true if a queued page is found
2268  *
2269  * @rs: current RAM state
2270  * @pss: data about the state of the current dirty page scan
2271  */
2272 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2273 {
2274     RAMBlock  *block;
2275     ram_addr_t offset;
2276     bool dirty;
2277 
2278     do {
2279         block = unqueue_page(rs, &offset);
2280         /*
2281          * We're sending this page, and since it's postcopy nothing else
2282          * will dirty it, and we must make sure it doesn't get sent again
2283          * even if this queue request was received after the background
2284          * search already sent it.
2285          */
2286         if (block) {
2287             unsigned long page;
2288 
2289             page = offset >> TARGET_PAGE_BITS;
2290             dirty = test_bit(page, block->bmap);
2291             if (!dirty) {
2292                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2293                        page, test_bit(page, block->unsentmap));
2294             } else {
2295                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2296             }
2297         }
2298 
2299     } while (block && !dirty);
2300 
2301     if (block) {
2302         /*
2303          * As soon as we start servicing pages out of order, then we have
2304          * to kill the bulk stage, since the bulk stage assumes
2305          * (in migration_bitmap_find_dirty) that every page is
2306          * dirty, which is no longer true.
2307          */
2308         rs->ram_bulk_stage = false;
2309 
2310         /*
2311          * We want the background search to continue from the queued page
2312          * since the guest is likely to want other pages near to the page
2313          * it just requested.
2314          */
2315         pss->block = block;
2316         pss->page = offset >> TARGET_PAGE_BITS;
2317 
2318         /*
2319          * This unqueued page would break the "one round" check, even if it is
2320          * really rare.
2321          */
2322         pss->complete_round = false;
2323     }
2324 
2325     return !!block;
2326 }
2327 
2328 /**
2329  * migration_page_queue_free: drop any remaining pages in the ram
2330  * request queue
2331  *
2332  * It should be empty at the end anyway, but in error cases there may
2333  * be some left.  In case any page is left, we drop it.
2334  *
2335  */
2336 static void migration_page_queue_free(RAMState *rs)
2337 {
2338     struct RAMSrcPageRequest *mspr, *next_mspr;
2339     /* This queue generally should be empty - but in the case of a failed
2340      * migration it might have some droppings in it.
2341      */
2342     rcu_read_lock();
2343     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2344         memory_region_unref(mspr->rb->mr);
2345         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2346         g_free(mspr);
2347     }
2348     rcu_read_unlock();
2349 }
2350 
2351 /**
2352  * ram_save_queue_pages: queue the page for transmission
2353  *
2354  * A request from postcopy destination for example.
2355  *
2356  * Returns zero on success or negative on error
2357  *
2358  * @rbname: Name of the RAMBlock of the request. NULL means the
2359  *          same as the last one.
2360  * @start: starting address from the start of the RAMBlock
2361  * @len: length (in bytes) to send
2362  */
2363 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2364 {
2365     RAMBlock *ramblock;
2366     RAMState *rs = ram_state;
2367 
2368     ram_counters.postcopy_requests++;
2369     rcu_read_lock();
2370     if (!rbname) {
2371         /* Reuse last RAMBlock */
2372         ramblock = rs->last_req_rb;
2373 
2374         if (!ramblock) {
2375             /*
2376              * Shouldn't happen, we can't reuse the last RAMBlock if
2377              * it's the 1st request.
2378              */
2379             error_report("ram_save_queue_pages no previous block");
2380             goto err;
2381         }
2382     } else {
2383         ramblock = qemu_ram_block_by_name(rbname);
2384 
2385         if (!ramblock) {
2386             /* We shouldn't be asked for a non-existent RAMBlock */
2387             error_report("ram_save_queue_pages no block '%s'", rbname);
2388             goto err;
2389         }
2390         rs->last_req_rb = ramblock;
2391     }
2392     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2393     if (start + len > ramblock->used_length) {
2394         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2395                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2396                      __func__, start, len, ramblock->used_length);
2397         goto err;
2398     }
2399 
2400     struct RAMSrcPageRequest *new_entry =
2401         g_malloc0(sizeof(struct RAMSrcPageRequest));
2402     new_entry->rb = ramblock;
2403     new_entry->offset = start;
2404     new_entry->len = len;
2405 
2406     memory_region_ref(ramblock->mr);
2407     qemu_mutex_lock(&rs->src_page_req_mutex);
2408     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2409     migration_make_urgent_request();
2410     qemu_mutex_unlock(&rs->src_page_req_mutex);
2411     rcu_read_unlock();
2412 
2413     return 0;
2414 
2415 err:
2416     rcu_read_unlock();
2417     return -1;
2418 }
2419 
2420 static bool save_page_use_compression(RAMState *rs)
2421 {
2422     if (!migrate_use_compression()) {
2423         return false;
2424     }
2425 
2426     /*
2427      * If xbzrle is on, stop using the data compression after first
2428      * round of migration even if compression is enabled. In theory,
2429      * xbzrle can do better than compression.
2430      */
2431     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2432         return true;
2433     }
2434 
2435     return false;
2436 }
2437 
2438 /*
2439  * try to compress the page before posting it out, return true if the page
2440  * has been properly handled by compression, otherwise needs other
2441  * paths to handle it
2442  */
2443 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2444 {
2445     if (!save_page_use_compression(rs)) {
2446         return false;
2447     }
2448 
2449     /*
2450      * When starting the process of a new block, the first page of
2451      * the block should be sent out before other pages in the same
2452      * block, and all the pages in the last block should have been sent
2453      * out; keeping this order is important, because the 'cont' flag
2454      * is used to avoid resending the block name.
2455      *
2456      * We post the first page as a normal page because compression takes
2457      * a lot of CPU.
2458      */
2459     if (block != rs->last_sent_block) {
2460         flush_compressed_data(rs);
2461         return false;
2462     }
2463 
2464     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2465         return true;
2466     }
2467 
2468     compression_counters.busy++;
2469     return false;
2470 }
2471 
2472 /**
2473  * ram_save_target_page: save one target page
2474  *
2475  * Returns the number of pages written
2476  *
2477  * @rs: current RAM state
2478  * @pss: data about the page we want to send
2479  * @last_stage: if we are at the completion stage
2480  */
2481 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2482                                 bool last_stage)
2483 {
2484     RAMBlock *block = pss->block;
2485     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2486     int res;
2487 
2488     if (control_save_page(rs, block, offset, &res)) {
2489         return res;
2490     }
2491 
2492     if (save_compress_page(rs, block, offset)) {
2493         return 1;
2494     }
2495 
2496     res = save_zero_page(rs, block, offset);
2497     if (res > 0) {
2498         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2499          * page would be stale
2500          */
2501         if (!save_page_use_compression(rs)) {
2502             XBZRLE_cache_lock();
2503             xbzrle_cache_zero_page(rs, block->offset + offset);
2504             XBZRLE_cache_unlock();
2505         }
2506         ram_release_pages(block->idstr, offset, res);
2507         return res;
2508     }
2509 
2510     /*
2511      * do not use multifd for compression as the first page in the new
2512      * block should be posted out before sending the compressed page
2513      */
2514     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2515         return ram_save_multifd_page(rs, block, offset);
2516     }
2517 
2518     return ram_save_page(rs, pss, last_stage);
2519 }
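
/*
 * Summary of the dispatch order in ram_save_target_page() above:
 *   1. control_save_page()  - let a control transport (e.g. RDMA) take it
 *   2. save_compress_page() - hand it to a compression thread if enabled
 *   3. save_zero_page()     - send all-zero pages cheaply
 *   4. multifd or ram_save_page() - normal (possibly XBZRLE) transmission
 * Each earlier stage short-circuits the later ones.
 */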
2520 
2521 /**
2522  * ram_save_host_page: save a whole host page
2523  *
2524  * Starting at the page given by @pss, send pages up to the end of the
2525  * current host page. It's valid for the initial page to point into the
2526  * middle of a host page, in which case the rest of the host page is sent.
2527  * Only dirty target pages are sent. Note that the host page size may
2528  * be a huge page for this block.
2529  * The saving stops at the boundary of the used_length of the block
2530  * if the RAMBlock isn't a multiple of the host page size.
2531  *
2532  * Returns the number of pages written or negative on error
2533  *
2534  * @rs: current RAM state
2536  * @pss: data about the page we want to send
2537  * @last_stage: if we are at the completion stage
2538  */
2539 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2540                               bool last_stage)
2541 {
2542     int tmppages, pages = 0;
2543     size_t pagesize_bits =
2544         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2545 
2546     if (ramblock_is_ignored(pss->block)) {
2547         error_report("block %s should not be migrated !", pss->block->idstr);
2548         return 0;
2549     }
2550 
2551     do {
2552         /* Check if the page is dirty and, if it is, send it */
2553         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2554             pss->page++;
2555             continue;
2556         }
2557 
2558         tmppages = ram_save_target_page(rs, pss, last_stage);
2559         if (tmppages < 0) {
2560             return tmppages;
2561         }
2562 
2563         pages += tmppages;
2564         if (pss->block->unsentmap) {
2565             clear_bit(pss->page, pss->block->unsentmap);
2566         }
2567 
2568         pss->page++;
2569     } while ((pss->page & (pagesize_bits - 1)) &&
2570              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2571 
2572     /* The offset we leave with is the last one we looked at */
2573     pss->page--;
2574     return pages;
2575 }
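
/*
 * Worked example for the loop above, assuming a 2MiB hugepage-backed
 * RAMBlock and a 4KiB target page: pagesize_bits is 2MiB / 4KiB = 512,
 * so a single call walks up to 512 consecutive target pages, sending
 * only the ones still marked dirty, and stops at the next 512-page
 * boundary (or at the block's used_length).
 */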
2576 
2577 /**
2578  * ram_find_and_save_block: finds a dirty page and sends it to f
2579  *
2580  * Called within an RCU critical section.
2581  *
2582  * Returns the number of pages written where zero means no dirty pages,
2583  * or negative on error
2584  *
2585  * @rs: current RAM state
2586  * @last_stage: if we are at the completion stage
2587  *
2588  * On systems where host-page-size > target-page-size it will send all the
2589  * pages in a host page that are dirty.
2590  */
2591 
2592 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2593 {
2594     PageSearchStatus pss;
2595     int pages = 0;
2596     bool again, found;
2597 
2598     /* No dirty page as there is zero RAM */
2599     if (!ram_bytes_total()) {
2600         return pages;
2601     }
2602 
2603     pss.block = rs->last_seen_block;
2604     pss.page = rs->last_page;
2605     pss.complete_round = false;
2606 
2607     if (!pss.block) {
2608         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2609     }
2610 
2611     do {
2612         again = true;
2613         found = get_queued_page(rs, &pss);
2614 
2615         if (!found) {
2616             /* priority queue empty, so just search for something dirty */
2617             found = find_dirty_block(rs, &pss, &again);
2618         }
2619 
2620         if (found) {
2621             pages = ram_save_host_page(rs, &pss, last_stage);
2622         }
2623     } while (!pages && again);
2624 
2625     rs->last_seen_block = pss.block;
2626     rs->last_page = pss.page;
2627 
2628     return pages;
2629 }
2630 
2631 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2632 {
2633     uint64_t pages = size / TARGET_PAGE_SIZE;
2634 
2635     if (zero) {
2636         ram_counters.duplicate += pages;
2637     } else {
2638         ram_counters.normal += pages;
2639         ram_counters.transferred += size;
2640         qemu_update_position(f, size);
2641     }
2642 }
2643 
2644 static uint64_t ram_bytes_total_common(bool count_ignored)
2645 {
2646     RAMBlock *block;
2647     uint64_t total = 0;
2648 
2649     rcu_read_lock();
2650     if (count_ignored) {
2651         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2652             total += block->used_length;
2653         }
2654     } else {
2655         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2656             total += block->used_length;
2657         }
2658     }
2659     rcu_read_unlock();
2660     return total;
2661 }
2662 
2663 uint64_t ram_bytes_total(void)
2664 {
2665     return ram_bytes_total_common(false);
2666 }
2667 
2668 static void xbzrle_load_setup(void)
2669 {
2670     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2671 }
2672 
2673 static void xbzrle_load_cleanup(void)
2674 {
2675     g_free(XBZRLE.decoded_buf);
2676     XBZRLE.decoded_buf = NULL;
2677 }
2678 
2679 static void ram_state_cleanup(RAMState **rsp)
2680 {
2681     if (*rsp) {
2682         migration_page_queue_free(*rsp);
2683         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2684         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2685         g_free(*rsp);
2686         *rsp = NULL;
2687     }
2688 }
2689 
2690 static void xbzrle_cleanup(void)
2691 {
2692     XBZRLE_cache_lock();
2693     if (XBZRLE.cache) {
2694         cache_fini(XBZRLE.cache);
2695         g_free(XBZRLE.encoded_buf);
2696         g_free(XBZRLE.current_buf);
2697         g_free(XBZRLE.zero_target_page);
2698         XBZRLE.cache = NULL;
2699         XBZRLE.encoded_buf = NULL;
2700         XBZRLE.current_buf = NULL;
2701         XBZRLE.zero_target_page = NULL;
2702     }
2703     XBZRLE_cache_unlock();
2704 }
2705 
2706 static void ram_save_cleanup(void *opaque)
2707 {
2708     RAMState **rsp = opaque;
2709     RAMBlock *block;
2710 
2711     /* The caller must hold the iothread lock or be in a BH, so there is
2712      * no writing race against the migration bitmap.
2713      */
2714     memory_global_dirty_log_stop();
2715 
2716     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2717         g_free(block->clear_bmap);
2718         block->clear_bmap = NULL;
2719         g_free(block->bmap);
2720         block->bmap = NULL;
2721         g_free(block->unsentmap);
2722         block->unsentmap = NULL;
2723     }
2724 
2725     xbzrle_cleanup();
2726     compress_threads_save_cleanup();
2727     ram_state_cleanup(rsp);
2728 }
2729 
2730 static void ram_state_reset(RAMState *rs)
2731 {
2732     rs->last_seen_block = NULL;
2733     rs->last_sent_block = NULL;
2734     rs->last_page = 0;
2735     rs->last_version = ram_list.version;
2736     rs->ram_bulk_stage = true;
2737     rs->fpo_enabled = false;
2738 }
2739 
2740 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2741 
2742 /*
2743  * 'expected' is the value you expect the bitmap mostly to be full
2744  * of; it won't bother printing lines that are all this value.
2745  * 'todump' is the bitmap to dump; 'pages' is its length in pages.
2746  */
2747 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2748                            unsigned long pages)
2749 {
2750     int64_t cur;
2751     int64_t linelen = 128;
2752     char linebuf[129];
2753 
2754     for (cur = 0; cur < pages; cur += linelen) {
2755         int64_t curb;
2756         bool found = false;
2757         /*
2758          * Last line; catch the case where the line length
2759          * is longer than remaining ram
2760          */
2761         if (cur + linelen > pages) {
2762             linelen = pages - cur;
2763         }
2764         for (curb = 0; curb < linelen; curb++) {
2765             bool thisbit = test_bit(cur + curb, todump);
2766             linebuf[curb] = thisbit ? '1' : '.';
2767             found = found || (thisbit != expected);
2768         }
2769         if (found) {
2770             linebuf[curb] = '\0';
2771             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2772         }
2773     }
2774 }
2775 
2776 /* **** functions for postcopy ***** */
2777 
2778 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2779 {
2780     struct RAMBlock *block;
2781 
2782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2783         unsigned long *bitmap = block->bmap;
2784         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2785         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2786 
2787         while (run_start < range) {
2788             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2789             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2790                               (run_end - run_start) << TARGET_PAGE_BITS);
2791             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2792         }
2793     }
2794 }
2795 
2796 /**
2797  * postcopy_send_discard_bm_ram: discard a RAMBlock
2798  *
2799  * Returns zero on success
2800  *
2801  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2802  * Note: At this point the 'unsentmap' is the processed bitmap combined
2803  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2804  *
2805  * @ms: current migration state
2806  * @pds: state for postcopy
2807  * @block: RAMBlock to discard
2808  */
2809 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2810                                         PostcopyDiscardState *pds,
2811                                         RAMBlock *block)
2812 {
2813     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2814     unsigned long current;
2815     unsigned long *unsentmap = block->unsentmap;
2816 
2817     for (current = 0; current < end; ) {
2818         unsigned long one = find_next_bit(unsentmap, end, current);
2819 
2820         if (one <= end) {
2821             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2822             unsigned long discard_length;
2823 
2824             if (zero >= end) {
2825                 discard_length = end - one;
2826             } else {
2827                 discard_length = zero - one;
2828             }
2829             if (discard_length) {
2830                 postcopy_discard_send_range(ms, pds, one, discard_length);
2831             }
2832             current = one + discard_length;
2833         } else {
2834             current = one;
2835         }
2836     }
2837 
2838     return 0;
2839 }
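
/*
 * In effect the loop above treats the unsentmap as run-length data:
 * each maximal run of 1 bits (pages that are dirty or were never sent)
 * is reported to the destination as a single (start, length) discard
 * range via postcopy_discard_send_range().
 */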
2840 
2841 /**
2842  * postcopy_each_ram_send_discard: discard all RAMBlocks
2843  *
2844  * Returns 0 for success or negative for error
2845  *
2846  * Utility for the outgoing postcopy code.
2847  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2848  *   passing it bitmap indexes and name.
2849  * (qemu_ram_foreach_block ends up passing unscaled lengths
2850  *  which would mean postcopy code would have to deal with target page)
2851  *
2852  * @ms: current migration state
2853  */
2854 static int postcopy_each_ram_send_discard(MigrationState *ms)
2855 {
2856     struct RAMBlock *block;
2857     int ret;
2858 
2859     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2860         PostcopyDiscardState *pds =
2861             postcopy_discard_send_init(ms, block->idstr);
2862 
2863         /*
2864          * Postcopy sends chunks of bitmap over the wire, but it
2865          * just needs indexes at this point, which avoids it having
2866          * target page specific code.
2867          */
2868         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2869         postcopy_discard_send_finish(ms, pds);
2870         if (ret) {
2871             return ret;
2872         }
2873     }
2874 
2875     return 0;
2876 }
2877 
2878 /**
2879  * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
2880  *
2881  * Helper for postcopy_chunk_hostpages; it's called twice to
2882  * canonicalize the two bitmaps, which are similar, but one is
2883  * inverted.
2884  *
2885  * Postcopy requires that all target pages in a hostpage are dirty or
2886  * clean, not a mix.  This function canonicalizes the bitmaps.
2887  *
2888  * @ms: current migration state
2889  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2890  *               otherwise we need to canonicalize partially dirty host pages
2891  * @block: block that contains the page we want to canonicalize
2892  * @pds: state for postcopy
2893  */
2894 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2895                                           RAMBlock *block,
2896                                           PostcopyDiscardState *pds)
2897 {
2898     RAMState *rs = ram_state;
2899     unsigned long *bitmap = block->bmap;
2900     unsigned long *unsentmap = block->unsentmap;
2901     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2902     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2903     unsigned long run_start;
2904 
2905     if (block->page_size == TARGET_PAGE_SIZE) {
2906         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2907         return;
2908     }
2909 
2910     if (unsent_pass) {
2911         /* Find a sent page */
2912         run_start = find_next_zero_bit(unsentmap, pages, 0);
2913     } else {
2914         /* Find a dirty page */
2915         run_start = find_next_bit(bitmap, pages, 0);
2916     }
2917 
2918     while (run_start < pages) {
2919         bool do_fixup = false;
2920         unsigned long fixup_start_addr;
2921         unsigned long host_offset;
2922 
2923         /*
2924          * If the start of this run of pages is in the middle of a host
2925          * page, then we need to fixup this host page.
2926          */
2927         host_offset = run_start % host_ratio;
2928         if (host_offset) {
2929             do_fixup = true;
2930             run_start -= host_offset;
2931             fixup_start_addr = run_start;
2932             /* For the next pass */
2933             run_start = run_start + host_ratio;
2934         } else {
2935             /* Find the end of this run */
2936             unsigned long run_end;
2937             if (unsent_pass) {
2938                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2939             } else {
2940                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2941             }
2942             /*
2943              * If the end isn't at the start of a host page, then the
2944              * run doesn't finish at the end of a host page
2945              * and we need to discard.
2946              */
2947             host_offset = run_end % host_ratio;
2948             if (host_offset) {
2949                 do_fixup = true;
2950                 fixup_start_addr = run_end - host_offset;
2951                 /*
2952                  * This host page has gone, the next loop iteration starts
2953                  * from after the fixup
2954                  */
2955                 run_start = fixup_start_addr + host_ratio;
2956             } else {
2957                 /*
2958                  * No discards on this iteration, next loop starts from
2959                  * next sent/dirty page
2960                  */
2961                 run_start = run_end + 1;
2962             }
2963         }
2964 
2965         if (do_fixup) {
2966             unsigned long page;
2967 
2968             /* Tell the destination to discard this page */
2969             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2970                 /* For the unsent_pass we:
2971                  *     discard partially sent pages
2972                  * For the !unsent_pass (dirty) we:
2973                  *     discard partially dirty pages that were sent
2974                  *     (any partially sent pages were already discarded
2975                  *     by the previous unsent_pass)
2976                  */
2977                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2978                                             host_ratio);
2979             }
2980 
2981             /* Clean up the bitmap */
2982             for (page = fixup_start_addr;
2983                  page < fixup_start_addr + host_ratio; page++) {
2984                 /* All pages in this host page are now not sent */
2985                 set_bit(page, unsentmap);
2986 
2987                 /*
2988                  * Remark them as dirty, updating the count for any pages
2989                  * that weren't previously dirty.
2990                  */
2991                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2992             }
2993         }
2994 
2995         if (unsent_pass) {
2996             /* Find the next sent page for the next iteration */
2997             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2998         } else {
2999             /* Find the next dirty page for the next iteration */
3000             run_start = find_next_bit(bitmap, pages, run_start);
3001         }
3002     }
3003 }
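
/*
 * Example of the fixup above, assuming 2MiB host pages and 4KiB target
 * pages (host_ratio == 512): if a run starts at target-page index 700,
 * host_offset is 700 % 512 = 188, so the run is pulled back to index
 * 512 and the whole 512-page host page starting there is discarded
 * (where needed) and re-marked dirty, keeping every host page either
 * fully dirty or fully clean.
 */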
3004 
3005 /**
3006  * postcopy_chunk_hostpages: discard any partially sent host page
3007  *
3008  * Utility for the outgoing postcopy code.
3009  *
3010  * Discard any partially sent host-page size chunks, mark any partially
3011  * dirty host-page size chunks as all dirty.  In this case the host-page
3012  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
3013  *
3014  * Returns zero on success
3015  *
3016  * @ms: current migration state
3017  * @block: block we want to work with
3018  */
3019 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
3020 {
3021     PostcopyDiscardState *pds =
3022         postcopy_discard_send_init(ms, block->idstr);
3023 
3024     /* First pass: Discard all partially sent host pages */
3025     postcopy_chunk_hostpages_pass(ms, true, block, pds);
3026     /*
3027      * Second pass: Ensure that all partially dirty host pages are made
3028      * fully dirty.
3029      */
3030     postcopy_chunk_hostpages_pass(ms, false, block, pds);
3031 
3032     postcopy_discard_send_finish(ms, pds);
3033     return 0;
3034 }
3035 
3036 /**
3037  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3038  *
3039  * Returns zero on success
3040  *
3041  * Transmit the set of pages to be discarded after precopy to the target;
3042  * these are pages that:
3043  *     a) Have been previously transmitted but are now dirty again
3044  *     b) Pages that have never been transmitted, this ensures that
3045  *        any pages on the destination that have been mapped by background
3046  *        tasks get discarded (transparent huge pages is the specific concern)
3047  * Hopefully this is pretty sparse
3048  *
3049  * @ms: current migration state
3050  */
3051 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3052 {
3053     RAMState *rs = ram_state;
3054     RAMBlock *block;
3055     int ret;
3056 
3057     rcu_read_lock();
3058 
3059     /* This should be our last sync, the src is now paused */
3060     migration_bitmap_sync(rs);
3061 
3062     /* Easiest way to make sure we don't resume in the middle of a host-page */
3063     rs->last_seen_block = NULL;
3064     rs->last_sent_block = NULL;
3065     rs->last_page = 0;
3066 
3067     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3068         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3069         unsigned long *bitmap = block->bmap;
3070         unsigned long *unsentmap = block->unsentmap;
3071 
3072         if (!unsentmap) {
3073             /* We don't have a safe way to resize the unsentmap, so
3074              * if the bitmap was resized it will be NULL at this
3075              * point.
3076              */
3077             error_report("migration ram resized during precopy phase");
3078             rcu_read_unlock();
3079             return -EINVAL;
3080         }
3081         /* Deal with TPS != HPS and huge pages */
3082         ret = postcopy_chunk_hostpages(ms, block);
3083         if (ret) {
3084             rcu_read_unlock();
3085             return ret;
3086         }
3087 
3088         /*
3089          * Update the unsentmap to be unsentmap = unsentmap | dirty
3090          */
3091         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3092 #ifdef DEBUG_POSTCOPY
3093         ram_debug_dump_bitmap(unsentmap, true, pages);
3094 #endif
3095     }
3096     trace_ram_postcopy_send_discard_bitmap();
3097 
3098     ret = postcopy_each_ram_send_discard(ms);
3099     rcu_read_unlock();
3100 
3101     return ret;
3102 }
3103 
3104 /**
3105  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3106  *
3107  * Returns zero on success
3108  *
3109  * @rbname: name of the RAMBlock of the request. NULL means the
3110  *          same as the last one.
3111  * @start: byte offset within the RAMBlock at which to start discarding
3112  * @length: number of bytes to discard
3113  */
3114 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3115 {
3116     int ret = -1;
3117 
3118     trace_ram_discard_range(rbname, start, length);
3119 
3120     rcu_read_lock();
3121     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3122 
3123     if (!rb) {
3124         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3125         goto err;
3126     }
3127 
3128     /*
3129      * On source VM, we don't need to update the received bitmap since
3130      * we don't even have one.
3131      */
3132     if (rb->receivedmap) {
3133         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3134                      length >> qemu_target_page_bits());
3135     }
3136 
3137     ret = ram_block_discard_range(rb, start, length);
3138 
3139 err:
3140     rcu_read_unlock();
3141 
3142     return ret;
3143 }
3144 
3145 /*
3146  * For every allocation, we will try not to crash the VM if the
3147  * allocation fails.
3148  */
3149 static int xbzrle_init(void)
3150 {
3151     Error *local_err = NULL;
3152 
3153     if (!migrate_use_xbzrle()) {
3154         return 0;
3155     }
3156 
3157     XBZRLE_cache_lock();
3158 
3159     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3160     if (!XBZRLE.zero_target_page) {
3161         error_report("%s: Error allocating zero page", __func__);
3162         goto err_out;
3163     }
3164 
3165     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3166                               TARGET_PAGE_SIZE, &local_err);
3167     if (!XBZRLE.cache) {
3168         error_report_err(local_err);
3169         goto free_zero_page;
3170     }
3171 
3172     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3173     if (!XBZRLE.encoded_buf) {
3174         error_report("%s: Error allocating encoded_buf", __func__);
3175         goto free_cache;
3176     }
3177 
3178     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3179     if (!XBZRLE.current_buf) {
3180         error_report("%s: Error allocating current_buf", __func__);
3181         goto free_encoded_buf;
3182     }
3183 
3184     /* We are all good */
3185     XBZRLE_cache_unlock();
3186     return 0;
3187 
3188 free_encoded_buf:
3189     g_free(XBZRLE.encoded_buf);
3190     XBZRLE.encoded_buf = NULL;
3191 free_cache:
3192     cache_fini(XBZRLE.cache);
3193     XBZRLE.cache = NULL;
3194 free_zero_page:
3195     g_free(XBZRLE.zero_target_page);
3196     XBZRLE.zero_target_page = NULL;
3197 err_out:
3198     XBZRLE_cache_unlock();
3199     return -ENOMEM;
3200 }
3201 
3202 static int ram_state_init(RAMState **rsp)
3203 {
3204     *rsp = g_try_new0(RAMState, 1);
3205 
3206     if (!*rsp) {
3207         error_report("%s: Init ramstate fail", __func__);
3208         return -1;
3209     }
3210 
3211     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3212     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3213     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3214 
3215     /*
3216      * Count the total number of pages used by ram blocks not including any
3217      * gaps due to alignment or unplugs.
3218      * This must match the initial contents of the dirty bitmaps.
3219      */
3220     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3221     ram_state_reset(*rsp);
3222 
3223     return 0;
3224 }
3225 
3226 static void ram_list_init_bitmaps(void)
3227 {
3228     MigrationState *ms = migrate_get_current();
3229     RAMBlock *block;
3230     unsigned long pages;
3231     uint8_t shift;
3232 
3233     /* Skip setting bitmap if there is no RAM */
3234     if (ram_bytes_total()) {
3235         shift = ms->clear_bitmap_shift;
3236         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3237             error_report("clear_bitmap_shift (%u) too big, using "
3238                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3239             shift = CLEAR_BITMAP_SHIFT_MAX;
3240         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3241             error_report("clear_bitmap_shift (%u) too small, using "
3242                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3243             shift = CLEAR_BITMAP_SHIFT_MIN;
3244         }
3245 
3246         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3247             pages = block->max_length >> TARGET_PAGE_BITS;
3248             /*
3249              * The initial dirty bitmap for migration must be set with all
3250              * ones to make sure we'll migrate every guest RAM page to the
3251              * destination.
3252              * Here we set RAMBlock.bmap all to 1 because when we restart a
3253              * new migration after a failed one,
3254              * ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover
3255              * the whole guest memory.
3256              */
3257             block->bmap = bitmap_new(pages);
3258             bitmap_set(block->bmap, 0, pages);
3259             block->clear_bmap_shift = shift;
3260             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3261             if (migrate_postcopy_ram()) {
3262                 block->unsentmap = bitmap_new(pages);
3263                 bitmap_set(block->unsentmap, 0, pages);
3264             }
3265         }
3266     }
3267 }
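
/*
 * Illustrative sizing sketch for the clear_bmap allocated above (assuming
 * clear_bmap_size() rounds 'pages' up to whole chunks of (1 << shift)
 * target pages, i.e. one clear_bmap bit per chunk): with 4 KiB target pages
 * and a shift of 18, one bit stands for 2^18 * 4 KiB = 1 GiB of guest RAM,
 * so a 16 GiB guest needs
 *
 *     pages = 16 GiB / 4 KiB = 4194304
 *     bits  = DIV_ROUND_UP(4194304, 1 << 18) = 16
 *
 * clear_bmap bits, versus 4194304 bits in the per-page dirty bitmap.
 */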
3268 
3269 static void ram_init_bitmaps(RAMState *rs)
3270 {
3271     /* For memory_global_dirty_log_start below.  */
3272     qemu_mutex_lock_iothread();
3273     qemu_mutex_lock_ramlist();
3274     rcu_read_lock();
3275 
3276     ram_list_init_bitmaps();
3277     memory_global_dirty_log_start();
3278     migration_bitmap_sync_precopy(rs);
3279 
3280     rcu_read_unlock();
3281     qemu_mutex_unlock_ramlist();
3282     qemu_mutex_unlock_iothread();
3283 }
3284 
3285 static int ram_init_all(RAMState **rsp)
3286 {
3287     if (ram_state_init(rsp)) {
3288         return -1;
3289     }
3290 
3291     if (xbzrle_init()) {
3292         ram_state_cleanup(rsp);
3293         return -1;
3294     }
3295 
3296     ram_init_bitmaps(*rsp);
3297 
3298     return 0;
3299 }
3300 
3301 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3302 {
3303     RAMBlock *block;
3304     uint64_t pages = 0;
3305 
3306     /*
3307      * Postcopy is not using xbzrle/compression, so no need for that.
3308      * Also, since the source is already halted, we don't need to care
3309      * about dirty page logging either.
3310      */
3311 
3312     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3313         pages += bitmap_count_one(block->bmap,
3314                                   block->used_length >> TARGET_PAGE_BITS);
3315     }
3316 
3317     /* This may not be aligned with current bitmaps. Recalculate. */
3318     rs->migration_dirty_pages = pages;
3319 
3320     rs->last_seen_block = NULL;
3321     rs->last_sent_block = NULL;
3322     rs->last_page = 0;
3323     rs->last_version = ram_list.version;
3324     /*
3325      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3326      * matter what we have sent.
3327      */
3328     rs->ram_bulk_stage = false;
3329 
3330     /* Update RAMState cache of output QEMUFile */
3331     rs->f = out;
3332 
3333     trace_ram_state_resume_prepare(pages);
3334 }
3335 
3336 /*
3337  * This function clears bits of the free pages reported by the caller from the
3338  * migration dirty bitmap. @addr is the host address corresponding to the
3339  * start of the continuous guest free pages, and @len is the total bytes of
3340  * start of the contiguous guest free pages, and @len is the total bytes of
3341  */
3342 void qemu_guest_free_page_hint(void *addr, size_t len)
3343 {
3344     RAMBlock *block;
3345     ram_addr_t offset;
3346     size_t used_len, start, npages;
3347     MigrationState *s = migrate_get_current();
3348 
3349     /* This function is currently expected to be used during live migration */
3350     if (!migration_is_setup_or_active(s->state)) {
3351         return;
3352     }
3353 
3354     for (; len > 0; len -= used_len, addr += used_len) {
3355         block = qemu_ram_block_from_host(addr, false, &offset);
3356         if (unlikely(!block || offset >= block->used_length)) {
3357             /*
3358              * The implementation might not support RAMBlock resize during
3359              * live migration, but it could happen in theory with future
3360              * updates. So we add a check here to capture that case.
3361              */
3362             error_report_once("%s unexpected error", __func__);
3363             return;
3364         }
3365 
3366         if (len <= block->used_length - offset) {
3367             used_len = len;
3368         } else {
3369             used_len = block->used_length - offset;
3370         }
3371 
3372         start = offset >> TARGET_PAGE_BITS;
3373         npages = used_len >> TARGET_PAGE_BITS;
3374 
3375         qemu_mutex_lock(&ram_state->bitmap_mutex);
3376         ram_state->migration_dirty_pages -=
3377                       bitmap_count_one_with_offset(block->bmap, start, npages);
3378         bitmap_clear(block->bmap, start, npages);
3379         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3380     }
3381 }
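
/*
 * Worked example of the hint arithmetic above, assuming 4 KiB target pages
 * (TARGET_PAGE_BITS == 12): a hint whose address resolves to offset 0x200000
 * inside a block with used_length 0x240000, with len 0x100000, is truncated
 * to used_len = 0x40000, so start = 0x200 and npages = 0x40; those 64 bits
 * are cleared from block->bmap and migration_dirty_pages is decreased by
 * however many of them were still set.
 */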
3382 
3383 /*
3384  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3385  * a long-running RCU critical section.  When RCU reclaims in the code
3386  * start to become numerous it will be necessary to reduce the
3387  * granularity of these critical sections.
3388  */
3389 
3390 /**
3391  * ram_save_setup: Setup RAM for migration
3392  *
3393  * Returns zero to indicate success and negative for error
3394  *
3395  * @f: QEMUFile where to send the data
3396  * @opaque: RAMState pointer
3397  */
3398 static int ram_save_setup(QEMUFile *f, void *opaque)
3399 {
3400     RAMState **rsp = opaque;
3401     RAMBlock *block;
3402 
3403     if (compress_threads_save_setup()) {
3404         return -1;
3405     }
3406 
3407     /* migration has already set up the bitmap, reuse it. */
3408     if (!migration_in_colo_state()) {
3409         if (ram_init_all(rsp) != 0) {
3410             compress_threads_save_cleanup();
3411             return -1;
3412         }
3413     }
3414     (*rsp)->f = f;
3415 
3416     rcu_read_lock();
3417 
3418     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3419 
3420     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3421         qemu_put_byte(f, strlen(block->idstr));
3422         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3423         qemu_put_be64(f, block->used_length);
3424         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3425             qemu_put_be64(f, block->page_size);
3426         }
3427         if (migrate_ignore_shared()) {
3428             qemu_put_be64(f, block->mr->addr);
3429         }
3430     }
3431 
3432     rcu_read_unlock();
3433 
3434     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3435     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3436 
3437     multifd_send_sync_main();
3438     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3439     qemu_fflush(f);
3440 
3441     return 0;
3442 }
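
/*
 * Rough sketch of what the setup stage above puts on the wire (as emitted by
 * the qemu_put_* calls; the code, not this comment, is authoritative):
 *
 *   be64  total RAM bytes | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable block:
 *     u8    strlen(idstr)
 *     bytes idstr
 *     be64  used_length
 *     [be64 page_size]    only with postcopy-ram and a non-host page size
 *     [be64 mr->addr]     only with ignore-shared
 *   be64  RAM_SAVE_FLAG_EOS
 */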
3443 
3444 /**
3445  * ram_save_iterate: iterative stage for migration
3446  *
3447  * Returns zero to indicate success and negative for error
3448  *
3449  * @f: QEMUFile where to send the data
3450  * @opaque: RAMState pointer
3451  */
3452 static int ram_save_iterate(QEMUFile *f, void *opaque)
3453 {
3454     RAMState **temp = opaque;
3455     RAMState *rs = *temp;
3456     int ret;
3457     int i;
3458     int64_t t0;
3459     int done = 0;
3460 
3461     if (blk_mig_bulk_active()) {
3462         /* Avoid transferring RAM during the bulk phase of block migration, as
3463          * the bulk phase will usually take a long time and transferring
3464          * RAM updates during that time is pointless. */
3465         goto out;
3466     }
3467 
3468     rcu_read_lock();
3469     if (ram_list.version != rs->last_version) {
3470         ram_state_reset(rs);
3471     }
3472 
3473     /* Read version before ram_list.blocks */
3474     smp_rmb();
3475 
3476     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3477 
3478     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3479     i = 0;
3480     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3481             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3482         int pages;
3483 
3484         if (qemu_file_get_error(f)) {
3485             break;
3486         }
3487 
3488         pages = ram_find_and_save_block(rs, false);
3489         /* no more pages to send */
3490         if (pages == 0) {
3491             done = 1;
3492             break;
3493         }
3494 
3495         if (pages < 0) {
3496             qemu_file_set_error(f, pages);
3497             break;
3498         }
3499 
3500         rs->target_page_count += pages;
3501 
3502         /* we want to check in the 1st loop, just in case it was the 1st time
3503            and we had to sync the dirty bitmap.
3504            qemu_clock_get_ns() is a bit expensive, so we only check once
3505            every few iterations
3506         */
3507         if ((i & 63) == 0) {
3508             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3509             if (t1 > MAX_WAIT) {
3510                 trace_ram_save_iterate_big_wait(t1, i);
3511                 break;
3512             }
3513         }
3514         i++;
3515     }
3516     rcu_read_unlock();
3517 
3518     /*
3519      * Must occur before EOS (or any QEMUFile operation)
3520      * because of RDMA protocol.
3521      */
3522     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3523 
3524 out:
3525     multifd_send_sync_main();
3526     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3527     qemu_fflush(f);
3528     ram_counters.transferred += 8;
3529 
3530     ret = qemu_file_get_error(f);
3531     if (ret < 0) {
3532         return ret;
3533     }
3534 
3535     return done;
3536 }
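
/*
 * Timing note for the loop above: qemu_clock_get_ns() is only sampled on
 * every 64th iteration ((i & 63) == 0); the elapsed time since t0 is
 * converted to milliseconds and compared against MAX_WAIT, so the iterate
 * stage voluntarily yields back to the migration thread even when the rate
 * limiter would still allow more pages to be sent.
 */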
3537 
3538 /**
3539  * ram_save_complete: function called to send the remaining amount of ram
3540  *
3541  * Returns zero to indicate success or negative on error
3542  *
3543  * Called with iothread lock
3544  *
3545  * @f: QEMUFile where to send the data
3546  * @opaque: RAMState pointer
3547  */
3548 static int ram_save_complete(QEMUFile *f, void *opaque)
3549 {
3550     RAMState **temp = opaque;
3551     RAMState *rs = *temp;
3552     int ret = 0;
3553 
3554     rcu_read_lock();
3555 
3556     if (!migration_in_postcopy()) {
3557         migration_bitmap_sync_precopy(rs);
3558     }
3559 
3560     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3561 
3562     /* try transferring iterative blocks of memory */
3563 
3564     /* flush all remaining blocks regardless of rate limiting */
3565     while (true) {
3566         int pages;
3567 
3568         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3569         /* no more blocks to send */
3570         if (pages == 0) {
3571             break;
3572         }
3573         if (pages < 0) {
3574             ret = pages;
3575             break;
3576         }
3577     }
3578 
3579     flush_compressed_data(rs);
3580     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3581 
3582     rcu_read_unlock();
3583 
3584     multifd_send_sync_main();
3585     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3586     qemu_fflush(f);
3587 
3588     return ret;
3589 }
3590 
3591 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3592                              uint64_t *res_precopy_only,
3593                              uint64_t *res_compatible,
3594                              uint64_t *res_postcopy_only)
3595 {
3596     RAMState **temp = opaque;
3597     RAMState *rs = *temp;
3598     uint64_t remaining_size;
3599 
3600     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3601 
3602     if (!migration_in_postcopy() &&
3603         remaining_size < max_size) {
3604         qemu_mutex_lock_iothread();
3605         rcu_read_lock();
3606         migration_bitmap_sync_precopy(rs);
3607         rcu_read_unlock();
3608         qemu_mutex_unlock_iothread();
3609         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3610     }
3611 
3612     if (migrate_postcopy_ram()) {
3613         /* We can do postcopy, and all the data is postcopiable */
3614         *res_compatible += remaining_size;
3615     } else {
3616         *res_precopy_only += remaining_size;
3617     }
3618 }
3619 
3620 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3621 {
3622     unsigned int xh_len;
3623     int xh_flags;
3624     uint8_t *loaded_data;
3625 
3626     /* extract RLE header */
3627     xh_flags = qemu_get_byte(f);
3628     xh_len = qemu_get_be16(f);
3629 
3630     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3631         error_report("Failed to load XBZRLE page - wrong compression!");
3632         return -1;
3633     }
3634 
3635     if (xh_len > TARGET_PAGE_SIZE) {
3636         error_report("Failed to load XBZRLE page - len overflow!");
3637         return -1;
3638     }
3639     loaded_data = XBZRLE.decoded_buf;
3640     /* load data and decode */
3641     /* it can change loaded_data to point to an internal buffer */
3642     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3643 
3644     /* decode RLE */
3645     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3646                              TARGET_PAGE_SIZE) == -1) {
3647         error_report("Failed to load XBZRLE page - decode error!");
3648         return -1;
3649     }
3650 
3651     return 0;
3652 }
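
/*
 * On-wire layout consumed by load_xbzrle() above (a reading of the
 * qemu_get_* calls, not an independent specification):
 *
 *   u8    xh_flags   must equal ENCODING_FLAG_XBZRLE
 *   be16  xh_len     encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of XBZRLE delta, decoded against the current
 *         contents of the destination page
 */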
3653 
3654 /**
3655  * ram_block_from_stream: read a RAMBlock id from the migration stream
3656  *
3657  * Must be called from within a rcu critical section.
3658  *
3659  * Returns a pointer from within the RCU-protected ram_list.
3660  *
3661  * @f: QEMUFile where to read the data from
3662  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3663  */
3664 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3665 {
3666     static RAMBlock *block = NULL;
3667     char id[256];
3668     uint8_t len;
3669 
3670     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3671         if (!block) {
3672             error_report("Ack, bad migration stream!");
3673             return NULL;
3674         }
3675         return block;
3676     }
3677 
3678     len = qemu_get_byte(f);
3679     qemu_get_buffer(f, (uint8_t *)id, len);
3680     id[len] = 0;
3681 
3682     block = qemu_ram_block_by_name(id);
3683     if (!block) {
3684         error_report("Can't find block %s", id);
3685         return NULL;
3686     }
3687 
3688     if (ramblock_is_ignored(block)) {
3689         error_report("block %s should not be migrated !", id);
3690         return NULL;
3691     }
3692 
3693     return block;
3694 }
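
/*
 * Block-id framing as parsed above: unless RAM_SAVE_FLAG_CONTINUE is set,
 * each page record carries a one-byte length followed by that many bytes of
 * the block idstr; with CONTINUE the previously named block (cached in the
 * static 'block' variable) is reused, which is why a CONTINUE record seen
 * before any named block is treated as a broken stream.
 */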
3695 
3696 static inline void *host_from_ram_block_offset(RAMBlock *block,
3697                                                ram_addr_t offset)
3698 {
3699     if (!offset_in_ramblock(block, offset)) {
3700         return NULL;
3701     }
3702 
3703     return block->host + offset;
3704 }
3705 
3706 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3707                                                  ram_addr_t offset)
3708 {
3709     if (!offset_in_ramblock(block, offset)) {
3710         return NULL;
3711     }
3712     if (!block->colo_cache) {
3713         error_report("%s: colo_cache is NULL in block :%s",
3714             error_report("%s: colo_cache is NULL in block: %s",
3715         return NULL;
3716     }
3717 
3718     /*
3719     * During a COLO checkpoint, we need the bitmap of these migrated pages.
3720     * It helps us decide which pages in the RAM cache should be flushed
3721     * into the VM's RAM later.
3722     */
3723     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3724         ram_state->migration_dirty_pages++;
3725     }
3726     return block->colo_cache + offset;
3727 }
3728 
3729 /**
3730  * ram_handle_compressed: handle the zero page case
3731  *
3732  * If a page (or a whole RDMA chunk) has been
3733  * determined to be zero, then zap it.
3734  *
3735  * @host: host address for the zero page
3736  * @ch: what the page is filled from.  We only support zero
3737  * @size: size of the zero page
3738  */
3739 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3740 {
3741     if (ch != 0 || !is_zero_range(host, size)) {
3742         memset(host, ch, size);
3743     }
3744 }
3745 
3746 /* return the size after decompression, or negative value on error */
3747 static int
3748 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3749                      const uint8_t *source, size_t source_len)
3750 {
3751     int err;
3752 
3753     err = inflateReset(stream);
3754     if (err != Z_OK) {
3755         return -1;
3756     }
3757 
3758     stream->avail_in = source_len;
3759     stream->next_in = (uint8_t *)source;
3760     stream->avail_out = dest_len;
3761     stream->next_out = dest;
3762 
3763     err = inflate(stream, Z_NO_FLUSH);
3764     if (err != Z_STREAM_END) {
3765         return -1;
3766     }
3767 
3768     return stream->total_out;
3769 }
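
/*
 * The z_stream passed in here is the long-lived per-thread stream set up
 * with inflateInit() in compress_threads_load_setup(); inflateReset() only
 * rewinds it for the next page, avoiding a fresh zlib allocation per page.
 * Requiring Z_STREAM_END matches a save side that flushes each page as a
 * complete, self-contained zlib stream.
 */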
3770 
3771 static void *do_data_decompress(void *opaque)
3772 {
3773     DecompressParam *param = opaque;
3774     unsigned long pagesize;
3775     uint8_t *des;
3776     int len, ret;
3777 
3778     qemu_mutex_lock(&param->mutex);
3779     while (!param->quit) {
3780         if (param->des) {
3781             des = param->des;
3782             len = param->len;
3783             param->des = 0;
3784             qemu_mutex_unlock(&param->mutex);
3785 
3786             pagesize = TARGET_PAGE_SIZE;
3787 
3788             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3789                                        param->compbuf, len);
3790             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3791                 error_report("decompress data failed");
3792                 qemu_file_set_error(decomp_file, ret);
3793             }
3794 
3795             qemu_mutex_lock(&decomp_done_lock);
3796             param->done = true;
3797             qemu_cond_signal(&decomp_done_cond);
3798             qemu_mutex_unlock(&decomp_done_lock);
3799 
3800             qemu_mutex_lock(&param->mutex);
3801         } else {
3802             qemu_cond_wait(&param->cond, &param->mutex);
3803         }
3804     }
3805     qemu_mutex_unlock(&param->mutex);
3806 
3807     return NULL;
3808 }
3809 
3810 static int wait_for_decompress_done(void)
3811 {
3812     int idx, thread_count;
3813 
3814     if (!migrate_use_compression()) {
3815         return 0;
3816     }
3817 
3818     thread_count = migrate_decompress_threads();
3819     qemu_mutex_lock(&decomp_done_lock);
3820     for (idx = 0; idx < thread_count; idx++) {
3821         while (!decomp_param[idx].done) {
3822             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3823         }
3824     }
3825     qemu_mutex_unlock(&decomp_done_lock);
3826     return qemu_file_get_error(decomp_file);
3827 }
3828 
3829 static void compress_threads_load_cleanup(void)
3830 {
3831     int i, thread_count;
3832 
3833     if (!migrate_use_compression()) {
3834         return;
3835     }
3836     thread_count = migrate_decompress_threads();
3837     for (i = 0; i < thread_count; i++) {
3838         /*
3839          * we use it as an indicator of whether the thread is
3840          * properly initialized or not
3841          */
3842         if (!decomp_param[i].compbuf) {
3843             break;
3844         }
3845 
3846         qemu_mutex_lock(&decomp_param[i].mutex);
3847         decomp_param[i].quit = true;
3848         qemu_cond_signal(&decomp_param[i].cond);
3849         qemu_mutex_unlock(&decomp_param[i].mutex);
3850     }
3851     for (i = 0; i < thread_count; i++) {
3852         if (!decomp_param[i].compbuf) {
3853             break;
3854         }
3855 
3856         qemu_thread_join(decompress_threads + i);
3857         qemu_mutex_destroy(&decomp_param[i].mutex);
3858         qemu_cond_destroy(&decomp_param[i].cond);
3859         inflateEnd(&decomp_param[i].stream);
3860         g_free(decomp_param[i].compbuf);
3861         decomp_param[i].compbuf = NULL;
3862     }
3863     g_free(decompress_threads);
3864     g_free(decomp_param);
3865     decompress_threads = NULL;
3866     decomp_param = NULL;
3867     decomp_file = NULL;
3868 }
3869 
3870 static int compress_threads_load_setup(QEMUFile *f)
3871 {
3872     int i, thread_count;
3873 
3874     if (!migrate_use_compression()) {
3875         return 0;
3876     }
3877 
3878     thread_count = migrate_decompress_threads();
3879     decompress_threads = g_new0(QemuThread, thread_count);
3880     decomp_param = g_new0(DecompressParam, thread_count);
3881     qemu_mutex_init(&decomp_done_lock);
3882     qemu_cond_init(&decomp_done_cond);
3883     decomp_file = f;
3884     for (i = 0; i < thread_count; i++) {
3885         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3886             goto exit;
3887         }
3888 
3889         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3890         qemu_mutex_init(&decomp_param[i].mutex);
3891         qemu_cond_init(&decomp_param[i].cond);
3892         decomp_param[i].done = true;
3893         decomp_param[i].quit = false;
3894         qemu_thread_create(decompress_threads + i, "decompress",
3895                            do_data_decompress, decomp_param + i,
3896                            QEMU_THREAD_JOINABLE);
3897     }
3898     return 0;
3899 exit:
3900     compress_threads_load_cleanup();
3901     return -1;
3902 }
3903 
3904 static void decompress_data_with_multi_threads(QEMUFile *f,
3905                                                void *host, int len)
3906 {
3907     int idx, thread_count;
3908 
3909     thread_count = migrate_decompress_threads();
3910     qemu_mutex_lock(&decomp_done_lock);
3911     while (true) {
3912         for (idx = 0; idx < thread_count; idx++) {
3913             if (decomp_param[idx].done) {
3914                 decomp_param[idx].done = false;
3915                 qemu_mutex_lock(&decomp_param[idx].mutex);
3916                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3917                 decomp_param[idx].des = host;
3918                 decomp_param[idx].len = len;
3919                 qemu_cond_signal(&decomp_param[idx].cond);
3920                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3921                 break;
3922             }
3923         }
3924         if (idx < thread_count) {
3925             break;
3926         } else {
3927             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3928         }
3929     }
3930     qemu_mutex_unlock(&decomp_done_lock);
3931 }
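
/*
 * Hand-off protocol between the receive path and the decompress workers, as
 * implemented above: the caller waits for some decomp_param[idx].done to be
 * true, clears it, copies the compressed payload into compbuf and signals
 * that worker's cond; do_data_decompress() inflates into the destination
 * page and re-raises 'done' under decomp_done_lock, which is also what
 * wait_for_decompress_done() uses to drain all workers.
 */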
3932 
3933 /*
3934  * colo cache: this is for the secondary VM. We cache the whole
3935  * memory of the secondary VM; the global lock must be held
3936  * to call this helper.
3937  */
3938 int colo_init_ram_cache(void)
3939 {
3940     RAMBlock *block;
3941 
3942     rcu_read_lock();
3943     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3944         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3945                                                 NULL,
3946                                                 false);
3947         if (!block->colo_cache) {
3948             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3949                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3950                          " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3951             goto out_locked;
3952         }
3953         memcpy(block->colo_cache, block->host, block->used_length);
3954     }
3955     rcu_read_unlock();
3956     /*
3957     * Record the dirty pages that were sent by the PVM; we use this dirty
3958     * bitmap to decide which pages in the cache should be flushed into the
3959     * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
3960     */
3961     if (ram_bytes_total()) {
3962         RAMBlock *block;
3963 
3964         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3965             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3966 
3967             block->bmap = bitmap_new(pages);
3968             bitmap_set(block->bmap, 0, pages);
3969         }
3970     }
3971     ram_state = g_new0(RAMState, 1);
3972     ram_state->migration_dirty_pages = 0;
3973     qemu_mutex_init(&ram_state->bitmap_mutex);
3974     memory_global_dirty_log_start();
3975 
3976     return 0;
3977 
3978 out_locked:
3979 
3980     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3981         if (block->colo_cache) {
3982             qemu_anon_ram_free(block->colo_cache, block->used_length);
3983             block->colo_cache = NULL;
3984         }
3985     }
3986 
3987     rcu_read_unlock();
3988     return -errno;
3989 }
3990 
3991 /* The global lock must be held to call this helper */
3992 void colo_release_ram_cache(void)
3993 {
3994     RAMBlock *block;
3995 
3996     memory_global_dirty_log_stop();
3997     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3998         g_free(block->bmap);
3999         block->bmap = NULL;
4000     }
4001 
4002     rcu_read_lock();
4003 
4004     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4005         if (block->colo_cache) {
4006             qemu_anon_ram_free(block->colo_cache, block->used_length);
4007             block->colo_cache = NULL;
4008         }
4009     }
4010 
4011     rcu_read_unlock();
4012     qemu_mutex_destroy(&ram_state->bitmap_mutex);
4013     g_free(ram_state);
4014     ram_state = NULL;
4015 }
4016 
4017 /**
4018  * ram_load_setup: Setup RAM for migration incoming side
4019  *
4020  * Returns zero to indicate success and negative for error
4021  *
4022  * @f: QEMUFile where to receive the data
4023  * @opaque: RAMState pointer
4024  */
4025 static int ram_load_setup(QEMUFile *f, void *opaque)
4026 {
4027     if (compress_threads_load_setup(f)) {
4028         return -1;
4029     }
4030 
4031     xbzrle_load_setup();
4032     ramblock_recv_map_init();
4033 
4034     return 0;
4035 }
4036 
4037 static int ram_load_cleanup(void *opaque)
4038 {
4039     RAMBlock *rb;
4040 
4041     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4042         if (ramblock_is_pmem(rb)) {
4043             pmem_persist(rb->host, rb->used_length);
4044         }
4045     }
4046 
4047     xbzrle_load_cleanup();
4048     compress_threads_load_cleanup();
4049 
4050     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4051         g_free(rb->receivedmap);
4052         rb->receivedmap = NULL;
4053     }
4054 
4055     return 0;
4056 }
4057 
4058 /**
4059  * ram_postcopy_incoming_init: allocate postcopy data structures
4060  *
4061  * Returns 0 for success and negative if there was one error
4062  * Returns 0 for success and negative if there was an error
4063  * @mis: current migration incoming state
4064  *
4065  * Allocate data structures etc needed by incoming migration with
4066  * postcopy-ram. postcopy-ram's similarly named
4067  * postcopy_ram_incoming_init does the work.
4068  */
4069 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4070 {
4071     return postcopy_ram_incoming_init(mis);
4072 }
4073 
4074 /**
4075  * ram_load_postcopy: load a page in postcopy case
4076  *
4077  * Returns 0 for success or -errno in case of error
4078  *
4079  * Called in postcopy mode by ram_load().
4080  * rcu_read_lock is taken prior to this being called.
4081  *
4082  * @f: QEMUFile where to send the data
4083  */
4084 static int ram_load_postcopy(QEMUFile *f)
4085 {
4086     int flags = 0, ret = 0;
4087     bool place_needed = false;
4088     bool matches_target_page_size = false;
4089     MigrationIncomingState *mis = migration_incoming_get_current();
4090     /* Temporary page that is later 'placed' */
4091     void *postcopy_host_page = postcopy_get_tmp_page(mis);
4092     void *last_host = NULL;
4093     bool all_zero = false;
4094 
4095     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4096         ram_addr_t addr;
4097         void *host = NULL;
4098         void *page_buffer = NULL;
4099         void *place_source = NULL;
4100         RAMBlock *block = NULL;
4101         uint8_t ch;
4102 
4103         addr = qemu_get_be64(f);
4104 
4105         /*
4106          * If there is a qemu file error, we should stop here, since "addr"
4107          * may be invalid
4108          */
4109         ret = qemu_file_get_error(f);
4110         if (ret) {
4111             break;
4112         }
4113 
4114         flags = addr & ~TARGET_PAGE_MASK;
4115         addr &= TARGET_PAGE_MASK;
4116 
4117         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4118         place_needed = false;
4119         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4120             block = ram_block_from_stream(f, flags);
4121 
4122             host = host_from_ram_block_offset(block, addr);
4123             if (!host) {
4124                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4125                 ret = -EINVAL;
4126                 break;
4127             }
4128             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4129             /*
4130              * Postcopy requires that we place whole host pages atomically;
4131              * these may be huge pages for RAMBlocks that are backed by
4132              * hugetlbfs.
4133              * To make it atomic, the data is read into a temporary page
4134              * that's moved into place later.
4135              * The migration protocol uses possibly smaller target pages;
4136              * however, the source ensures it always sends all the components
4137              * of a host page in order.
4138              */
4139             page_buffer = postcopy_host_page +
4140                           ((uintptr_t)host & (block->page_size - 1));
4141             /* If all target pages are zero then we can optimise the placement */
4142             if (!((uintptr_t)host & (block->page_size - 1))) {
4143                 all_zero = true;
4144             } else {
4145                 /* not the 1st target page within the host page */
4146                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4147                     error_report("Non-sequential target page %p/%p",
4148                                   host, last_host);
4149                     ret = -EINVAL;
4150                     break;
4151                 }
4152             }
4153 
4154 
4155             /*
4156              * If it's the last part of a host page then we place the host
4157              * page
4158              */
4159             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4160                                      (block->page_size - 1)) == 0;
4161             place_source = postcopy_host_page;
4162         }
4163         last_host = host;
4164 
4165         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4166         case RAM_SAVE_FLAG_ZERO:
4167             ch = qemu_get_byte(f);
4168             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4169             if (ch) {
4170                 all_zero = false;
4171             }
4172             break;
4173 
4174         case RAM_SAVE_FLAG_PAGE:
4175             all_zero = false;
4176             if (!matches_target_page_size) {
4177                 /* For huge pages, we always use a temporary buffer */
4178                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4179             } else {
4180                 /*
4181                  * For small pages that matches target page size, we
4182                  * avoid the qemu_file copy.  Instead we directly use
4183                  * the buffer of QEMUFile to place the page.  Note: we
4184                  * cannot do any QEMUFile operation before using that
4185                  * buffer to make sure the buffer is valid when
4186                  * placing the page.
4187                  */
4188                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4189                                          TARGET_PAGE_SIZE);
4190             }
4191             break;
4192         case RAM_SAVE_FLAG_EOS:
4193             /* normal exit */
4194             multifd_recv_sync_main();
4195             break;
4196         default:
4197             error_report("Unknown combination of migration flags: %#x"
4198                          " (postcopy mode)", flags);
4199             ret = -EINVAL;
4200             break;
4201         }
4202 
4203         /* Detect any possible file errors */
4204         if (!ret && qemu_file_get_error(f)) {
4205             ret = qemu_file_get_error(f);
4206         }
4207 
4208         if (!ret && place_needed) {
4209             /* This gets called at the last target page in the host page */
4210             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4211 
4212             if (all_zero) {
4213                 ret = postcopy_place_page_zero(mis, place_dest,
4214                                                block);
4215             } else {
4216                 ret = postcopy_place_page(mis, place_dest,
4217                                           place_source, block);
4218             }
4219         }
4220     }
4221 
4222     return ret;
4223 }
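
/*
 * Worked example for the placement logic above, assuming 4 KiB target pages
 * and a hugetlbfs-backed block with page_size = 2 MiB: each host page is
 * assembled from 512 consecutive target-page records into the temporary
 * postcopy_host_page; place_needed only becomes true on the 512th record,
 * where place_dest = host + 4 KiB - 2 MiB points back at the start of the
 * host page, and the whole 2 MiB is then placed atomically (or zero-placed
 * if every component page was zero).
 */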
4224 
4225 static bool postcopy_is_advised(void)
4226 {
4227     PostcopyState ps = postcopy_state_get();
4228     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4229 }
4230 
4231 static bool postcopy_is_running(void)
4232 {
4233     PostcopyState ps = postcopy_state_get();
4234     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4235 }
4236 
4237 /*
4238  * Flush content of RAM cache into SVM's memory.
4239  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4240  */
4241 static void colo_flush_ram_cache(void)
4242 {
4243     RAMBlock *block = NULL;
4244     void *dst_host;
4245     void *src_host;
4246     unsigned long offset = 0;
4247 
4248     memory_global_dirty_log_sync();
4249     rcu_read_lock();
4250     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4251         migration_bitmap_sync_range(ram_state, block, block->used_length);
4252     }
4253     rcu_read_unlock();
4254 
4255     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4256     rcu_read_lock();
4257     block = QLIST_FIRST_RCU(&ram_list.blocks);
4258 
4259     while (block) {
4260         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4261 
4262         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4263             offset = 0;
4264             block = QLIST_NEXT_RCU(block, next);
4265         } else {
4266             migration_bitmap_clear_dirty(ram_state, block, offset);
4267             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4268             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4269             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4270         }
4271     }
4272 
4273     rcu_read_unlock();
4274     trace_colo_flush_ram_cache_end();
4275 }
4276 
4277 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4278 {
4279     int flags = 0, ret = 0, invalid_flags = 0;
4280     static uint64_t seq_iter;
4281     int len = 0;
4282     /*
4283      * If the system is running in postcopy mode, page inserts into host
4284      * memory must be atomic
4285      */
4286     bool postcopy_running = postcopy_is_running();
4287     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4288     /* ADVISE comes earlier; it shows the source has postcopy capability on */
4289 
4290     seq_iter++;
4291 
4292     if (version_id != 4) {
4293         ret = -EINVAL;
4294     }
4295 
4296     if (!migrate_use_compression()) {
4297         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4298     }
4299     /* This RCU critical section can be very long running.
4300      * When RCU reclaims in the code start to become numerous,
4301      * it will be necessary to reduce the granularity of this
4302      * critical section.
4303      */
4304     rcu_read_lock();
4305 
4306     if (postcopy_running) {
4307         ret = ram_load_postcopy(f);
4308     }
4309 
4310     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4311         ram_addr_t addr, total_ram_bytes;
4312         void *host = NULL;
4313         uint8_t ch;
4314 
4315         addr = qemu_get_be64(f);
4316         flags = addr & ~TARGET_PAGE_MASK;
4317         addr &= TARGET_PAGE_MASK;
4318 
4319         if (flags & invalid_flags) {
4320             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4321                 error_report("Received an unexpected compressed page");
4322             }
4323 
4324             ret = -EINVAL;
4325             break;
4326         }
4327 
4328         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4329                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4330             RAMBlock *block = ram_block_from_stream(f, flags);
4331 
4332             /*
4333              * After going into COLO, we should load the page into colo_cache.
4334              */
4335             if (migration_incoming_in_colo_state()) {
4336                 host = colo_cache_from_block_offset(block, addr);
4337             } else {
4338                 host = host_from_ram_block_offset(block, addr);
4339             }
4340             if (!host) {
4341                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4342                 ret = -EINVAL;
4343                 break;
4344             }
4345 
4346             if (!migration_incoming_in_colo_state()) {
4347                 ramblock_recv_bitmap_set(block, host);
4348             }
4349 
4350             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4351         }
4352 
4353         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4354         case RAM_SAVE_FLAG_MEM_SIZE:
4355             /* Synchronize RAM block list */
4356             total_ram_bytes = addr;
4357             while (!ret && total_ram_bytes) {
4358                 RAMBlock *block;
4359                 char id[256];
4360                 ram_addr_t length;
4361 
4362                 len = qemu_get_byte(f);
4363                 qemu_get_buffer(f, (uint8_t *)id, len);
4364                 id[len] = 0;
4365                 length = qemu_get_be64(f);
4366 
4367                 block = qemu_ram_block_by_name(id);
4368                 if (block && !qemu_ram_is_migratable(block)) {
4369                     error_report("block %s should not be migrated !", id);
4370                     ret = -EINVAL;
4371                 } else if (block) {
4372                     if (length != block->used_length) {
4373                         Error *local_err = NULL;
4374 
4375                         ret = qemu_ram_resize(block, length,
4376                                               &local_err);
4377                         if (local_err) {
4378                             error_report_err(local_err);
4379                         }
4380                     }
4381                     /* For postcopy we need to check hugepage sizes match */
4382                     if (postcopy_advised &&
4383                         block->page_size != qemu_host_page_size) {
4384                         uint64_t remote_page_size = qemu_get_be64(f);
4385                         if (remote_page_size != block->page_size) {
4386                             error_report("Mismatched RAM page size %s "
4387                                          "(local) %zd != %" PRId64,
4388                                          id, block->page_size,
4389                                          remote_page_size);
4390                             ret = -EINVAL;
4391                         }
4392                     }
4393                     if (migrate_ignore_shared()) {
4394                         hwaddr addr = qemu_get_be64(f);
4395                         if (ramblock_is_ignored(block) &&
4396                             block->mr->addr != addr) {
4397                             error_report("Mismatched GPAs for block %s "
4398                                          "%" PRId64 " != %" PRId64,
4399                                          id, (uint64_t)addr,
4400                                          (uint64_t)block->mr->addr);
4401                             ret = -EINVAL;
4402                         }
4403                     }
4404                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4405                                           block->idstr);
4406                 } else {
4407                     error_report("Unknown ramblock \"%s\", cannot "
4408                                  "accept migration", id);
4409                     ret = -EINVAL;
4410                 }
4411 
4412                 total_ram_bytes -= length;
4413             }
4414             break;
4415 
4416         case RAM_SAVE_FLAG_ZERO:
4417             ch = qemu_get_byte(f);
4418             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4419             break;
4420 
4421         case RAM_SAVE_FLAG_PAGE:
4422             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4423             break;
4424 
4425         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4426             len = qemu_get_be32(f);
4427             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4428                 error_report("Invalid compressed data length: %d", len);
4429                 ret = -EINVAL;
4430                 break;
4431             }
4432             decompress_data_with_multi_threads(f, host, len);
4433             break;
4434 
4435         case RAM_SAVE_FLAG_XBZRLE:
4436             if (load_xbzrle(f, addr, host) < 0) {
4437                 error_report("Failed to decompress XBZRLE page at "
4438                              RAM_ADDR_FMT, addr);
4439                 ret = -EINVAL;
4440                 break;
4441             }
4442             break;
4443         case RAM_SAVE_FLAG_EOS:
4444             /* normal exit */
4445             multifd_recv_sync_main();
4446             break;
4447         default:
4448             if (flags & RAM_SAVE_FLAG_HOOK) {
4449                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4450             } else {
4451                 error_report("Unknown combination of migration flags: %#x",
4452                              flags);
4453                 ret = -EINVAL;
4454             }
4455         }
4456         if (!ret) {
4457             ret = qemu_file_get_error(f);
4458         }
4459     }
4460 
4461     ret |= wait_for_decompress_done();
4462     rcu_read_unlock();
4463     trace_ram_load_complete(ret, seq_iter);
4464 
4465     if (!ret  && migration_incoming_in_colo_state()) {
4466         colo_flush_ram_cache();
4467     }
4468     return ret;
4469 }
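
/*
 * Shape of the precopy stream consumed by ram_load() above, as read by the
 * qemu_get_* calls (flag semantics are defined by the save side):
 *
 *   be64 header = page offset | flags (flags occupy the low, sub-page bits)
 *   RAM_SAVE_FLAG_MEM_SIZE       block list: u8 len, idstr, be64 length, ...
 *   RAM_SAVE_FLAG_ZERO           u8 fill byte (expected to be zero)
 *   RAM_SAVE_FLAG_PAGE           TARGET_PAGE_SIZE raw bytes
 *   RAM_SAVE_FLAG_COMPRESS_PAGE  be32 len, then len compressed bytes
 *   RAM_SAVE_FLAG_XBZRLE         see load_xbzrle() above
 *   RAM_SAVE_FLAG_EOS            end of this section's data
 */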
4470 
4471 static bool ram_has_postcopy(void *opaque)
4472 {
4473     RAMBlock *rb;
4474     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4475         if (ramblock_is_pmem(rb)) {
4476             info_report("Block: %s, host: %p is an nvdimm memory, postcopy"
4477                          " is not supported now!", rb->idstr, rb->host);
4478             return false;
4479         }
4480     }
4481 
4482     return migrate_postcopy_ram();
4483 }
4484 
4485 /* Sync all the dirty bitmaps with the destination VM.  */
4486 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4487 {
4488     RAMBlock *block;
4489     QEMUFile *file = s->to_dst_file;
4490     int ramblock_count = 0;
4491 
4492     trace_ram_dirty_bitmap_sync_start();
4493 
4494     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4495         qemu_savevm_send_recv_bitmap(file, block->idstr);
4496         trace_ram_dirty_bitmap_request(block->idstr);
4497         ramblock_count++;
4498     }
4499 
4500     trace_ram_dirty_bitmap_sync_wait();
4501 
4502     /* Wait until all the ramblocks' dirty bitmaps are synced */
4503     while (ramblock_count--) {
4504         qemu_sem_wait(&s->rp_state.rp_sem);
4505     }
4506 
4507     trace_ram_dirty_bitmap_sync_complete();
4508 
4509     return 0;
4510 }
4511 
4512 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4513 {
4514     qemu_sem_post(&s->rp_state.rp_sem);
4515 }
4516 
4517 /*
4518  * Read the received bitmap, revert it as the initial dirty bitmap.
4519  * This is only used when the postcopy migration is paused but wants
4520  * to resume from a middle point.
4521  */
4522 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4523 {
4524     int ret = -EINVAL;
4525     QEMUFile *file = s->rp_state.from_dst_file;
4526     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4527     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4528     uint64_t size, end_mark;
4529 
4530     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4531 
4532     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4533         error_report("%s: incorrect state %s", __func__,
4534                      MigrationStatus_str(s->state));
4535         return -EINVAL;
4536     }
4537 
4538     /*
4539      * Note: see comments in ramblock_recv_bitmap_send() on why we
4540      * need the endianness conversion and the padding.
4541      */
4542     local_size = ROUND_UP(local_size, 8);
4543 
4544     /* Add paddings */
4545     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4546 
4547     size = qemu_get_be64(file);
4548 
4549     /* The size of the bitmap should match our ramblock */
4550     if (size != local_size) {
4551         error_report("%s: ramblock '%s' bitmap size mismatch "
4552                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4553                      block->idstr, size, local_size);
4554         ret = -EINVAL;
4555         goto out;
4556     }
4557 
4558     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4559     end_mark = qemu_get_be64(file);
4560 
4561     ret = qemu_file_get_error(file);
4562     if (ret || size != local_size) {
4563         error_report("%s: read bitmap failed for ramblock '%s': %d"
4564                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4565                      __func__, block->idstr, ret, local_size, size);
4566         ret = -EIO;
4567         goto out;
4568     }
4569 
4570     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4571         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4572                      __func__, block->idstr, end_mark);
4573         ret = -EINVAL;
4574         goto out;
4575     }
4576 
4577     /*
4578      * Endianness conversion. We are in postcopy (though paused).
4579      * The dirty bitmap won't change. We can directly modify it.
4580      */
4581     bitmap_from_le(block->bmap, le_bitmap, nbits);
4582 
4583     /*
4584      * What we received is "received bitmap". Revert it as the initial
4585      * dirty bitmap for this ramblock.
4586      */
4587     bitmap_complement(block->bmap, block->bmap, nbits);
4588 
4589     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4590 
4591     /*
4592      * We succeeded in syncing the bitmap for the current ramblock. If this is
4593      * the last one to sync, we need to notify the main send thread.
4594      */
4595     ram_dirty_bitmap_reload_notify(s);
4596 
4597     ret = 0;
4598 out:
4599     g_free(le_bitmap);
4600     return ret;
4601 }
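
/*
 * Recv-bitmap message layout expected by ram_dirty_bitmap_reload() above:
 *
 *   be64  size      must equal ROUND_UP(DIV_ROUND_UP(nbits, 8), 8)
 *   bytes size bytes of little-endian bitmap (a set bit means the
 *         destination already received that page)
 *   be64  end mark  must equal RAMBLOCK_RECV_BITMAP_ENDING
 *
 * The received map is then inverted, so the resumed source re-sends exactly
 * the pages the destination has not yet seen.
 */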
4602 
4603 static int ram_resume_prepare(MigrationState *s, void *opaque)
4604 {
4605     RAMState *rs = *(RAMState **)opaque;
4606     int ret;
4607 
4608     ret = ram_dirty_bitmap_sync_all(s, rs);
4609     if (ret) {
4610         return ret;
4611     }
4612 
4613     ram_state_resume_prepare(rs, s->to_dst_file);
4614 
4615     return 0;
4616 }
4617 
4618 static SaveVMHandlers savevm_ram_handlers = {
4619     .save_setup = ram_save_setup,
4620     .save_live_iterate = ram_save_iterate,
4621     .save_live_complete_postcopy = ram_save_complete,
4622     .save_live_complete_precopy = ram_save_complete,
4623     .has_postcopy = ram_has_postcopy,
4624     .save_live_pending = ram_save_pending,
4625     .load_state = ram_load,
4626     .save_cleanup = ram_save_cleanup,
4627     .load_setup = ram_load_setup,
4628     .load_cleanup = ram_load_cleanup,
4629     .resume_prepare = ram_resume_prepare,
4630 };
4631 
4632 void ram_mig_init(void)
4633 {
4634     qemu_mutex_init(&XBZRLE.lock);
4635     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4636 }
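
/*
 * Note on the registration above: the "ram" section is registered with
 * version 4, which matches the only version_id that ram_load() accepts; the
 * two values need to stay in sync or incoming migration fails with -EINVAL.
 */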
4637