xref: /openbmc/qemu/migration/ram.c (revision 00d1d29b768d920342c5e33cc56a9e0be596b2b4)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58 
59 /***********************************************************/
60 /* ram save/restore */
61 
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63  * worked for pages that were filled with the same char.  We switched
64  * it to only search for the zero value, and renamed it to avoid
65  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
66  */
67 
68 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO     0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE     0x08
72 #define RAM_SAVE_FLAG_EOS      0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE   0x40
75 /* 0x80 is reserved in migration.h; start the next flag at 0x100 */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
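/*
 * Illustrative sketch, not the normative stream definition: each page record
 * on the wire begins with a 64-bit word that carries the page-aligned offset
 * in its upper bits and the flags above in its low bits, roughly
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);
 *
 * and the load side can recover the two halves with something like
 *
 *     addr  = header & TARGET_PAGE_MASK;
 *     flags = header & ~TARGET_PAGE_MASK;
 *
 * See save_page_header() below for the real encoding, including the optional
 * block idstr.
 */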
77 
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 {
80     return buffer_is_zero(p, size);
81 }
82 
83 XBZRLECacheStats xbzrle_counters;
84 
85 /* This struct contains the XBZRLE cache and a static page
86    used by the compression */
87 static struct {
88     /* buffer used for XBZRLE encoding */
89     uint8_t *encoded_buf;
90     /* buffer for storing page content */
91     uint8_t *current_buf;
92     /* Cache for XBZRLE, protected by lock. */
93     PageCache *cache;
94     QemuMutex lock;
95     /* it will store a page full of zeros */
96     uint8_t *zero_target_page;
97     /* buffer used for XBZRLE decoding */
98     uint8_t *decoded_buf;
99 } XBZRLE;
100 
101 static void XBZRLE_cache_lock(void)
102 {
103     if (migrate_use_xbzrle())
104         qemu_mutex_lock(&XBZRLE.lock);
105 }
106 
107 static void XBZRLE_cache_unlock(void)
108 {
109     if (migrate_use_xbzrle())
110         qemu_mutex_unlock(&XBZRLE.lock);
111 }
112 
113 /**
114  * xbzrle_cache_resize: resize the xbzrle cache
115  *
116  * This function is called from qmp_migrate_set_cache_size in the main
117  * thread, possibly while a migration is in progress.  A running
118  * migration may be using the cache and might finish during this call,
119  * hence changes to the cache are protected by XBZRLE.lock.
120  *
121  * Returns 0 for success or -1 for error
122  *
123  * @new_size: new cache size
124  * @errp: set *errp if the check failed, with reason
125  */
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 {
128     PageCache *new_cache;
129     int64_t ret = 0;
130 
131     /* Check for truncation */
132     if (new_size != (size_t)new_size) {
133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134                    "exceeding address space");
135         return -1;
136     }
137 
138     if (new_size == migrate_xbzrle_cache_size()) {
139         /* nothing to do */
140         return 0;
141     }
142 
143     XBZRLE_cache_lock();
144 
145     if (XBZRLE.cache != NULL) {
146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147         if (!new_cache) {
148             ret = -1;
149             goto out;
150         }
151 
152         cache_fini(XBZRLE.cache);
153         XBZRLE.cache = new_cache;
154     }
155 out:
156     XBZRLE_cache_unlock();
157     return ret;
158 }
159 
160 static bool ramblock_is_ignored(RAMBlock *block)
161 {
162     return !qemu_ram_is_migratable(block) ||
163            (migrate_ignore_shared() && qemu_ram_is_shared(block));
164 }
165 
166 /* Should be holding either ram_list.mutex, or the RCU lock. */
167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
168     INTERNAL_RAMBLOCK_FOREACH(block)                   \
169         if (ramblock_is_ignored(block)) {} else
170 
171 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
172     INTERNAL_RAMBLOCK_FOREACH(block)                   \
173         if (!qemu_ram_is_migratable(block)) {} else
174 
175 #undef RAMBLOCK_FOREACH
176 
177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178 {
179     RAMBlock *block;
180     int ret = 0;
181 
182     RCU_READ_LOCK_GUARD();
183 
184     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185         ret = func(block, opaque);
186         if (ret) {
187             break;
188         }
189     }
190     return ret;
191 }
192 
193 static void ramblock_recv_map_init(void)
194 {
195     RAMBlock *rb;
196 
197     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
198         assert(!rb->receivedmap);
199         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200     }
201 }
202 
203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 {
205     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206                     rb->receivedmap);
207 }
208 
209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 {
211     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212 }
213 
214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 {
216     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217 }
218 
219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220                                     size_t nr)
221 {
222     bitmap_set_atomic(rb->receivedmap,
223                       ramblock_recv_bitmap_offset(host_addr, rb),
224                       nr);
225 }
226 
227 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
228 
229 /*
230  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231  *
232  * Returns >0 if success with sent bytes, or <0 if error.
233  */
234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235                                   const char *block_name)
236 {
237     RAMBlock *block = qemu_ram_block_by_name(block_name);
238     unsigned long *le_bitmap, nbits;
239     uint64_t size;
240 
241     if (!block) {
242         error_report("%s: invalid block name: %s", __func__, block_name);
243         return -1;
244     }
245 
246     nbits = block->used_length >> TARGET_PAGE_BITS;
247 
248     /*
249      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
250      * machines we may need 4 more bytes for padding (see the
251      * comment below).  So extend it a bit beforehand.
252      */
253     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254 
255     /*
256      * Always use little endian when sending the bitmap. This is
257      * required when the source and destination VMs are not using the
258      * same endianness. (Note: big endian won't work.)
259      */
260     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261 
262     /* Size of the bitmap, in bytes */
263     size = DIV_ROUND_UP(nbits, 8);
264 
265     /*
266      * size is always aligned to 8 bytes for 64bit machines, but it
267      * may not be true for 32bit machines. We need this padding to
268      * make sure the migration can survive even between 32bit and
269      * 64bit machines.
270      */
271     size = ROUND_UP(size, 8);
272 
273     qemu_put_be64(file, size);
274     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275     /*
276      * Mark the end, in case the middle part is corrupted due to
277      * some mysterious reason.
278      */
279     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280     qemu_fflush(file);
281 
282     g_free(le_bitmap);
283 
284     if (qemu_file_get_error(file)) {
285         return qemu_file_get_error(file);
286     }
287 
288     return size + sizeof(size);
289 }
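/*
 * Worked example (illustrative, assuming a 6-page RAMBlock on a 64-bit
 * host): the bitmap needs 1 byte, which ROUND_UP() pads to 8, so the stream
 * carries
 *
 *     8 bytes - bitmap size (here 8)
 *     8 bytes - little-endian bitmap, bits 0..5 holding the receivedmap
 *     8 bytes - RAMBLOCK_RECV_BITMAP_ENDING marker
 *
 * and ramblock_recv_bitmap_send() returns size + sizeof(size) = 16.
 */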
290 
291 /*
292  * An outstanding page request, on the source, having been received
293  * and queued
294  */
295 struct RAMSrcPageRequest {
296     RAMBlock *rb;
297     hwaddr    offset;
298     hwaddr    len;
299 
300     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301 };
302 
303 /* State of RAM for migration */
304 struct RAMState {
305     /* QEMUFile used for this migration */
306     QEMUFile *f;
307     /* Last block that we have visited searching for dirty pages */
308     RAMBlock *last_seen_block;
309     /* Last block from where we have sent data */
310     RAMBlock *last_sent_block;
311     /* Last dirty target page we have sent */
312     ram_addr_t last_page;
313     /* last ram version we have seen */
314     uint32_t last_version;
315     /* We are in the first round */
316     bool ram_bulk_stage;
317     /* The free page optimization is enabled */
318     bool fpo_enabled;
319     /* How many times in a row the dirty page rate has been too high */
320     int dirty_rate_high_cnt;
321     /* these variables are used for bitmap sync */
322     /* last time we did a full bitmap_sync */
323     int64_t time_last_bitmap_sync;
324     /* bytes transferred at the start of the period */
325     uint64_t bytes_xfer_prev;
326     /* number of dirty pages since the start of the period */
327     uint64_t num_dirty_pages_period;
328     /* xbzrle misses since the beginning of the period */
329     uint64_t xbzrle_cache_miss_prev;
330 
331     /* compression statistics since the beginning of the period */
332     /* number of times there was no free thread to compress data */
333     uint64_t compress_thread_busy_prev;
334     /* amount of bytes after compression */
335     uint64_t compressed_size_prev;
336     /* number of compressed pages */
337     uint64_t compress_pages_prev;
338 
339     /* total handled target pages at the beginning of period */
340     uint64_t target_page_count_prev;
341     /* total handled target pages since start */
342     uint64_t target_page_count;
343     /* number of dirty bits in the bitmap */
344     uint64_t migration_dirty_pages;
345     /* Protects modification of the bitmap and migration dirty pages */
346     QemuMutex bitmap_mutex;
347     /* The RAMBlock used in the last src_page_requests */
348     RAMBlock *last_req_rb;
349     /* Queue of outstanding page requests from the destination */
350     QemuMutex src_page_req_mutex;
351     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 };
353 typedef struct RAMState RAMState;
354 
355 static RAMState *ram_state;
356 
357 static NotifierWithReturnList precopy_notifier_list;
358 
359 void precopy_infrastructure_init(void)
360 {
361     notifier_with_return_list_init(&precopy_notifier_list);
362 }
363 
364 void precopy_add_notifier(NotifierWithReturn *n)
365 {
366     notifier_with_return_list_add(&precopy_notifier_list, n);
367 }
368 
369 void precopy_remove_notifier(NotifierWithReturn *n)
370 {
371     notifier_with_return_remove(n);
372 }
373 
374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 {
376     PrecopyNotifyData pnd;
377     pnd.reason = reason;
378     pnd.errp = errp;
379 
380     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 }
382 
383 void precopy_enable_free_page_optimization(void)
384 {
385     if (!ram_state) {
386         return;
387     }
388 
389     ram_state->fpo_enabled = true;
390 }
391 
392 uint64_t ram_bytes_remaining(void)
393 {
394     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
395                        0;
396 }
397 
398 MigrationStats ram_counters;
399 
400 /* used by the search for pages to send */
401 struct PageSearchStatus {
402     /* Current block being searched */
403     RAMBlock    *block;
404     /* Current page to search from */
405     unsigned long page;
406     /* Set once we wrap around */
407     bool         complete_round;
408 };
409 typedef struct PageSearchStatus PageSearchStatus;
410 
411 CompressionStats compression_counters;
412 
413 struct CompressParam {
414     bool done;
415     bool quit;
416     bool zero_page;
417     QEMUFile *file;
418     QemuMutex mutex;
419     QemuCond cond;
420     RAMBlock *block;
421     ram_addr_t offset;
422 
423     /* internally used fields */
424     z_stream stream;
425     uint8_t *originbuf;
426 };
427 typedef struct CompressParam CompressParam;
428 
429 struct DecompressParam {
430     bool done;
431     bool quit;
432     QemuMutex mutex;
433     QemuCond cond;
434     void *des;
435     uint8_t *compbuf;
436     int len;
437     z_stream stream;
438 };
439 typedef struct DecompressParam DecompressParam;
440 
441 static CompressParam *comp_param;
442 static QemuThread *compress_threads;
443 /* comp_done_cond is used to wake up the migration thread when
444  * one of the compression threads has finished the compression.
445  * comp_done_lock is used to co-work with comp_done_cond.
446  */
447 static QemuMutex comp_done_lock;
448 static QemuCond comp_done_cond;
449 /* The empty QEMUFileOps will be used by file in CompressParam */
450 static const QEMUFileOps empty_ops = { };
451 
452 static QEMUFile *decomp_file;
453 static DecompressParam *decomp_param;
454 static QemuThread *decompress_threads;
455 static QemuMutex decomp_done_lock;
456 static QemuCond decomp_done_cond;
457 
458 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
459                                  ram_addr_t offset, uint8_t *source_buf);
460 
461 static void *do_data_compress(void *opaque)
462 {
463     CompressParam *param = opaque;
464     RAMBlock *block;
465     ram_addr_t offset;
466     bool zero_page;
467 
468     qemu_mutex_lock(&param->mutex);
469     while (!param->quit) {
470         if (param->block) {
471             block = param->block;
472             offset = param->offset;
473             param->block = NULL;
474             qemu_mutex_unlock(&param->mutex);
475 
476             zero_page = do_compress_ram_page(param->file, &param->stream,
477                                              block, offset, param->originbuf);
478 
479             qemu_mutex_lock(&comp_done_lock);
480             param->done = true;
481             param->zero_page = zero_page;
482             qemu_cond_signal(&comp_done_cond);
483             qemu_mutex_unlock(&comp_done_lock);
484 
485             qemu_mutex_lock(&param->mutex);
486         } else {
487             qemu_cond_wait(&param->cond, &param->mutex);
488         }
489     }
490     qemu_mutex_unlock(&param->mutex);
491 
492     return NULL;
493 }
494 
495 static void compress_threads_save_cleanup(void)
496 {
497     int i, thread_count;
498 
499     if (!migrate_use_compression() || !comp_param) {
500         return;
501     }
502 
503     thread_count = migrate_compress_threads();
504     for (i = 0; i < thread_count; i++) {
505         /*
506          * we use it as an indicator of whether the thread is
507          * properly initialized or not
508          */
509         if (!comp_param[i].file) {
510             break;
511         }
512 
513         qemu_mutex_lock(&comp_param[i].mutex);
514         comp_param[i].quit = true;
515         qemu_cond_signal(&comp_param[i].cond);
516         qemu_mutex_unlock(&comp_param[i].mutex);
517 
518         qemu_thread_join(compress_threads + i);
519         qemu_mutex_destroy(&comp_param[i].mutex);
520         qemu_cond_destroy(&comp_param[i].cond);
521         deflateEnd(&comp_param[i].stream);
522         g_free(comp_param[i].originbuf);
523         qemu_fclose(comp_param[i].file);
524         comp_param[i].file = NULL;
525     }
526     qemu_mutex_destroy(&comp_done_lock);
527     qemu_cond_destroy(&comp_done_cond);
528     g_free(compress_threads);
529     g_free(comp_param);
530     compress_threads = NULL;
531     comp_param = NULL;
532 }
533 
534 static int compress_threads_save_setup(void)
535 {
536     int i, thread_count;
537 
538     if (!migrate_use_compression()) {
539         return 0;
540     }
541     thread_count = migrate_compress_threads();
542     compress_threads = g_new0(QemuThread, thread_count);
543     comp_param = g_new0(CompressParam, thread_count);
544     qemu_cond_init(&comp_done_cond);
545     qemu_mutex_init(&comp_done_lock);
546     for (i = 0; i < thread_count; i++) {
547         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
548         if (!comp_param[i].originbuf) {
549             goto exit;
550         }
551 
552         if (deflateInit(&comp_param[i].stream,
553                         migrate_compress_level()) != Z_OK) {
554             g_free(comp_param[i].originbuf);
555             goto exit;
556         }
557 
558         /* comp_param[i].file is just used as a dummy buffer to save data,
559          * set its ops to empty.
560          */
561         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
562         comp_param[i].done = true;
563         comp_param[i].quit = false;
564         qemu_mutex_init(&comp_param[i].mutex);
565         qemu_cond_init(&comp_param[i].cond);
566         qemu_thread_create(compress_threads + i, "compress",
567                            do_data_compress, comp_param + i,
568                            QEMU_THREAD_JOINABLE);
569     }
570     return 0;
571 
572 exit:
573     compress_threads_save_cleanup();
574     return -1;
575 }
576 
577 /**
578  * save_page_header: write page header to wire
579  *
580  * If the page is from a new block, it also writes the block identification
581  *
582  * Returns the number of bytes written
583  *
584  * @f: QEMUFile where to send the data
585  * @block: block that contains the page we want to send
586  * @offset: offset inside the block for the page
587  *          in the lower bits, it contains flags
588  */
589 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
590                                ram_addr_t offset)
591 {
592     size_t size, len;
593 
594     if (block == rs->last_sent_block) {
595         offset |= RAM_SAVE_FLAG_CONTINUE;
596     }
597     qemu_put_be64(f, offset);
598     size = 8;
599 
600     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
601         len = strlen(block->idstr);
602         qemu_put_byte(f, len);
603         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
604         size += 1 + len;
605         rs->last_sent_block = block;
606     }
607     return size;
608 }
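/*
 * Worked example for save_page_header() (illustrative; "pc.ram" is just a
 * typical block name): the first page sent from that block costs
 * 8 (offset/flags) + 1 (idstr length) + 6 (idstr bytes) = 15 bytes, while
 * every later page from the same block sets RAM_SAVE_FLAG_CONTINUE and
 * costs only the 8-byte header.
 */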
609 
610 /**
611  * mig_throttle_guest_down: throttle down the guest
612  *
613  * Reduce the amount of guest CPU execution to hopefully slow down memory
614  * writes. If the guest dirty memory rate is reduced below the rate at
615  * which we can transfer pages to the destination then we should be
616  * able to complete migration. Some workloads dirty memory way too
617  * fast and will not effectively converge, even with auto-converge.
618  */
619 static void mig_throttle_guest_down(void)
620 {
621     MigrationState *s = migrate_get_current();
622     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
623     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
624     int pct_max = s->parameters.max_cpu_throttle;
625 
626     /* We have not started throttling yet. Let's start it. */
627     if (!cpu_throttle_active()) {
628         cpu_throttle_set(pct_initial);
629     } else {
630         /* Throttling already on, just increase the rate */
631         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
632                          pct_max));
633     }
634 }
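/*
 * Illustrative ramp, assuming the defaults for this revision
 * (cpu-throttle-initial 20, cpu-throttle-increment 10, max-cpu-throttle 99):
 * repeated calls to mig_throttle_guest_down() move the throttle through
 * 20% -> 30% -> 40% -> ... with MIN() capping it at 99%.
 */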
635 
636 /**
637  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
638  *
639  * @rs: current RAM state
640  * @current_addr: address for the zero page
641  *
642  * Update the xbzrle cache to reflect a page that's been sent as all 0.
643  * The important thing is that a stale (not-yet-0'd) page be replaced
644  * by the new data.
645  * As a bonus, if the page wasn't in the cache it gets added so that
646  * when a small write is made into the 0'd page it gets XBZRLE sent.
647  */
648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
649 {
650     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
651         return;
652     }
653 
654     /* We don't care if this fails to allocate a new cache page
655      * as long as it updated an old one */
656     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
657                  ram_counters.dirty_sync_count);
658 }
659 
660 #define ENCODING_FLAG_XBZRLE 0x1
661 
662 /**
663  * save_xbzrle_page: compress and send current page
664  *
665  * Returns: 1 means that we wrote the page
666  *          0 means that page is identical to the one already sent
667  *          -1 means that xbzrle would be longer than normal
668  *
669  * @rs: current RAM state
670  * @current_data: pointer to the address of the page contents
671  * @current_addr: addr of the page
672  * @block: block that contains the page we want to send
673  * @offset: offset inside the block for the page
674  * @last_stage: if we are at the completion stage
675  */
676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
677                             ram_addr_t current_addr, RAMBlock *block,
678                             ram_addr_t offset, bool last_stage)
679 {
680     int encoded_len = 0, bytes_xbzrle;
681     uint8_t *prev_cached_page;
682 
683     if (!cache_is_cached(XBZRLE.cache, current_addr,
684                          ram_counters.dirty_sync_count)) {
685         xbzrle_counters.cache_miss++;
686         if (!last_stage) {
687             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
688                              ram_counters.dirty_sync_count) == -1) {
689                 return -1;
690             } else {
691                 /* update *current_data when the page has been
692                    inserted into cache */
693                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
694             }
695         }
696         return -1;
697     }
698 
699     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
700 
701     /* save current buffer into memory */
702     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
703 
704     /* XBZRLE encoding (if there is no overflow) */
705     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
706                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
707                                        TARGET_PAGE_SIZE);
708 
709     /*
710      * Update the cache contents, so that it corresponds to the data
711      * sent, in all cases except where we skip the page.
712      */
713     if (!last_stage && encoded_len != 0) {
714         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
715         /*
716          * In the case where we couldn't compress, ensure that the caller
717          * sends the data from the cache, since the guest might have
718          * changed the RAM since we copied it.
719          */
720         *current_data = prev_cached_page;
721     }
722 
723     if (encoded_len == 0) {
724         trace_save_xbzrle_page_skipping();
725         return 0;
726     } else if (encoded_len == -1) {
727         trace_save_xbzrle_page_overflow();
728         xbzrle_counters.overflow++;
729         return -1;
730     }
731 
732     /* Send XBZRLE based compressed page */
733     bytes_xbzrle = save_page_header(rs, rs->f, block,
734                                     offset | RAM_SAVE_FLAG_XBZRLE);
735     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
736     qemu_put_be16(rs->f, encoded_len);
737     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
738     bytes_xbzrle += encoded_len + 1 + 2;
739     xbzrle_counters.pages++;
740     xbzrle_counters.bytes += bytes_xbzrle;
741     ram_counters.transferred += bytes_xbzrle;
742 
743     return 1;
744 }
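/*
 * Wire layout of an XBZRLE page as emitted above (illustrative summary):
 *
 *     save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)    8+ bytes
 *     ENCODING_FLAG_XBZRLE                                    1 byte
 *     encoded_len, big endian                                 2 bytes
 *     encoded data                                            encoded_len bytes
 *
 * which matches the bytes_xbzrle accounting of header + encoded_len + 1 + 2.
 */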
745 
746 /**
747  * migration_bitmap_find_dirty: find the next dirty page from start
748  *
749  * Returns the page offset within memory region of the start of a dirty page
750  *
751  * @rs: current RAM state
752  * @rb: RAMBlock where to search for dirty pages
753  * @start: page where we start the search
754  */
755 static inline
756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
757                                           unsigned long start)
758 {
759     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
760     unsigned long *bitmap = rb->bmap;
761     unsigned long next;
762 
763     if (ramblock_is_ignored(rb)) {
764         return size;
765     }
766 
767     /*
768      * When the free page optimization is enabled, we need to check the bitmap
769      * to send the non-free pages rather than all the pages in the bulk stage.
770      */
771     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
772         next = start + 1;
773     } else {
774         next = find_next_bit(bitmap, size, start);
775     }
776 
777     return next;
778 }
779 
780 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
781                                                 RAMBlock *rb,
782                                                 unsigned long page)
783 {
784     bool ret;
785 
786     qemu_mutex_lock(&rs->bitmap_mutex);
787 
788     /*
789      * Clear dirty bitmap if needed.  This _must_ be called before we
790      * send any of the pages in the chunk, because we need to make sure
791      * we can capture further page content changes when we sync the dirty
792      * log the next time.  So as long as we are going to send any of
793      * the pages in the chunk we clear the remote dirty bitmap for all.
794      * Clearing it earlier won't be a problem, but too late will.
795      */
796     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
797         uint8_t shift = rb->clear_bmap_shift;
798         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
799         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
800 
801         /*
802          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
803          * can make things easier sometimes since the start address
804          * of the small chunk will always be aligned to 64 pages so the
805          * bitmap will always be aligned to unsigned long.  We should
806          * even be able to remove this restriction but I'm simply
807          * keeping it.
808          */
809         assert(shift >= 6);
810         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
811         memory_region_clear_dirty_bitmap(rb->mr, start, size);
812     }
813 
814     ret = test_and_clear_bit(page, rb->bmap);
815 
816     if (ret) {
817         rs->migration_dirty_pages--;
818     }
819     qemu_mutex_unlock(&rs->bitmap_mutex);
820 
821     return ret;
822 }
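/*
 * Illustrative sizing of the clear chunk above: with 4KiB target pages and
 * clear_bmap_shift at its minimum of 6, one clear_bmap bit covers
 * 1 << (12 + 6) bytes = 256KiB, i.e. 64 target pages, so the remote dirty
 * bitmap is cleared lazily in 64-page chunks rather than page by page.
 */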
823 
824 /* Called with RCU critical section */
825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
826 {
827     rs->migration_dirty_pages +=
828         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
829                                               &rs->num_dirty_pages_period);
830 }
831 
832 /**
833  * ram_pagesize_summary: calculate all the pagesizes of a VM
834  *
835  * Returns a summary bitmap of the page sizes of all RAMBlocks
836  *
837  * For VMs with just normal pages this is equivalent to the host page
838  * size. If it's got some huge pages then it's the OR of all the
839  * different page sizes.
840  */
841 uint64_t ram_pagesize_summary(void)
842 {
843     RAMBlock *block;
844     uint64_t summary = 0;
845 
846     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
847         summary |= block->page_size;
848     }
849 
850     return summary;
851 }
852 
853 uint64_t ram_get_total_transferred_pages(void)
854 {
855     return  ram_counters.normal + ram_counters.duplicate +
856                 compression_counters.pages + xbzrle_counters.pages;
857 }
858 
859 static void migration_update_rates(RAMState *rs, int64_t end_time)
860 {
861     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
862     double compressed_size;
863 
864     /* calculate period counters */
865     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866                 / (end_time - rs->time_last_bitmap_sync);
867 
868     if (!page_count) {
869         return;
870     }
871 
872     if (migrate_use_xbzrle()) {
873         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
874             rs->xbzrle_cache_miss_prev) / page_count;
875         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
876     }
877 
878     if (migrate_use_compression()) {
879         compression_counters.busy_rate = (double)(compression_counters.busy -
880             rs->compress_thread_busy_prev) / page_count;
881         rs->compress_thread_busy_prev = compression_counters.busy;
882 
883         compressed_size = compression_counters.compressed_size -
884                           rs->compressed_size_prev;
885         if (compressed_size) {
886             double uncompressed_size = (compression_counters.pages -
887                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
888 
889             /* Compression-Ratio = Uncompressed-size / Compressed-size */
890             compression_counters.compression_rate =
891                                         uncompressed_size / compressed_size;
892 
893             rs->compress_pages_prev = compression_counters.pages;
894             rs->compressed_size_prev = compression_counters.compressed_size;
895         }
896     }
897 }
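/*
 * Worked example for the period rates above (illustrative numbers): if 1000
 * target pages (4000KiB with 4KiB pages) went through the compression
 * threads and produced 1000KiB of output, compression_rate is
 * 4000 / 1000 = 4.0; if 200 of 1000 pages handled in the period missed the
 * XBZRLE cache, cache_miss_rate is 0.2.
 */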
898 
899 static void migration_trigger_throttle(RAMState *rs)
900 {
901     MigrationState *s = migrate_get_current();
902     uint64_t threshold = s->parameters.throttle_trigger_threshold;
903 
904     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
905     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
906     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
907 
908     /* During block migration the auto-converge logic incorrectly detects
909      * that ram migration makes no progress. Avoid this by disabling the
910      * throttling logic during the bulk phase of block migration. */
911     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
912         /* The following detection logic can be refined later. For now:
913            Check to see if the ratio between dirtied bytes and the approx.
914            amount of bytes that just got transferred since the last time
915            we were in this routine reaches the threshold. If that happens
916            twice, start or increase throttling. */
917 
918         if ((bytes_dirty_period > bytes_dirty_threshold) &&
919             (++rs->dirty_rate_high_cnt >= 2)) {
920             trace_migration_throttle();
921             rs->dirty_rate_high_cnt = 0;
922             mig_throttle_guest_down();
923         }
924     }
925 }
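/*
 * Illustrative trigger, assuming throttle-trigger-threshold keeps its
 * default of 50: if about 100MB were transferred since the last sync but
 * the guest dirtied more than 50MB of RAM in the same period,
 * dirty_rate_high_cnt is bumped; two such periods in a row start (or
 * increase) the CPU throttle via mig_throttle_guest_down().
 */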
926 
927 static void migration_bitmap_sync(RAMState *rs)
928 {
929     RAMBlock *block;
930     int64_t end_time;
931 
932     ram_counters.dirty_sync_count++;
933 
934     if (!rs->time_last_bitmap_sync) {
935         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
936     }
937 
938     trace_migration_bitmap_sync_start();
939     memory_global_dirty_log_sync();
940 
941     qemu_mutex_lock(&rs->bitmap_mutex);
942     WITH_RCU_READ_LOCK_GUARD() {
943         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
944             ramblock_sync_dirty_bitmap(rs, block);
945         }
946         ram_counters.remaining = ram_bytes_remaining();
947     }
948     qemu_mutex_unlock(&rs->bitmap_mutex);
949 
950     memory_global_after_dirty_log_sync();
951     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
952 
953     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
954 
955     /* more than 1 second = 1000 milliseconds */
956     if (end_time > rs->time_last_bitmap_sync + 1000) {
957         migration_trigger_throttle(rs);
958 
959         migration_update_rates(rs, end_time);
960 
961         rs->target_page_count_prev = rs->target_page_count;
962 
963         /* reset period counters */
964         rs->time_last_bitmap_sync = end_time;
965         rs->num_dirty_pages_period = 0;
966         rs->bytes_xfer_prev = ram_counters.transferred;
967     }
968     if (migrate_use_events()) {
969         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
970     }
971 }
972 
973 static void migration_bitmap_sync_precopy(RAMState *rs)
974 {
975     Error *local_err = NULL;
976 
977     /*
978      * The current notifier usage is just an optimization to migration, so we
979      * don't stop the normal migration process in the error case.
980      */
981     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
982         error_report_err(local_err);
983         local_err = NULL;
984     }
985 
986     migration_bitmap_sync(rs);
987 
988     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
989         error_report_err(local_err);
990     }
991 }
992 
993 /**
994  * save_zero_page_to_file: send the zero page to the file
995  *
996  * Returns the size of data written to the file, 0 means the page is not
997  * a zero page
998  *
999  * @rs: current RAM state
1000  * @file: the file where the data is saved
1001  * @block: block that contains the page we want to send
1002  * @offset: offset inside the block for the page
1003  */
1004 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1005                                   RAMBlock *block, ram_addr_t offset)
1006 {
1007     uint8_t *p = block->host + offset;
1008     int len = 0;
1009 
1010     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1011         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1012         qemu_put_byte(file, 0);
1013         len += 1;
1014     }
1015     return len;
1016 }
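/*
 * A zero page therefore costs at most a header plus one byte on the wire
 * (illustrative): 8 bytes of offset | RAM_SAVE_FLAG_ZERO (plus the block
 * idstr for the first page of a block) followed by a single 0 byte,
 * instead of a full TARGET_PAGE_SIZE of page data.
 */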
1017 
1018 /**
1019  * save_zero_page: send the zero page to the stream
1020  *
1021  * Returns the number of pages written.
1022  *
1023  * @rs: current RAM state
1024  * @block: block that contains the page we want to send
1025  * @offset: offset inside the block for the page
1026  */
1027 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1028 {
1029     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1030 
1031     if (len) {
1032         ram_counters.duplicate++;
1033         ram_counters.transferred += len;
1034         return 1;
1035     }
1036     return -1;
1037 }
1038 
1039 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1040 {
1041     if (!migrate_release_ram() || !migration_in_postcopy()) {
1042         return;
1043     }
1044 
1045     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1046 }
1047 
1048 /*
1049  * @pages: the number of pages written by the control path,
1050  *        < 0 - error
1051  *        > 0 - number of pages written
1052  *
1053  * Return true if the page has been saved, otherwise false is returned.
1054  */
1055 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1056                               int *pages)
1057 {
1058     uint64_t bytes_xmit = 0;
1059     int ret;
1060 
1061     *pages = -1;
1062     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1063                                 &bytes_xmit);
1064     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1065         return false;
1066     }
1067 
1068     if (bytes_xmit) {
1069         ram_counters.transferred += bytes_xmit;
1070         *pages = 1;
1071     }
1072 
1073     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1074         return true;
1075     }
1076 
1077     if (bytes_xmit > 0) {
1078         ram_counters.normal++;
1079     } else if (bytes_xmit == 0) {
1080         ram_counters.duplicate++;
1081     }
1082 
1083     return true;
1084 }
1085 
1086 /*
1087  * directly send the page to the stream
1088  *
1089  * Returns the number of pages written.
1090  *
1091  * @rs: current RAM state
1092  * @block: block that contains the page we want to send
1093  * @offset: offset inside the block for the page
1094  * @buf: the page to be sent
1095  * @async: send the page asynchronously
1096  */
1097 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1098                             uint8_t *buf, bool async)
1099 {
1100     ram_counters.transferred += save_page_header(rs, rs->f, block,
1101                                                  offset | RAM_SAVE_FLAG_PAGE);
1102     if (async) {
1103         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1104                               migrate_release_ram() &
1105                               migration_in_postcopy());
1106     } else {
1107         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1108     }
1109     ram_counters.transferred += TARGET_PAGE_SIZE;
1110     ram_counters.normal++;
1111     return 1;
1112 }
1113 
1114 /**
1115  * ram_save_page: send the given page to the stream
1116  *
1117  * Returns the number of pages written.
1118  *          < 0 - error
1119  *          >=0 - Number of pages written - this might legally be 0
1120  *                if xbzrle noticed the page was the same.
1121  *
1122  * @rs: current RAM state
1123  * @block: block that contains the page we want to send
1124  * @offset: offset inside the block for the page
1125  * @last_stage: if we are at the completion stage
1126  */
1127 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1128 {
1129     int pages = -1;
1130     uint8_t *p;
1131     bool send_async = true;
1132     RAMBlock *block = pss->block;
1133     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1134     ram_addr_t current_addr = block->offset + offset;
1135 
1136     p = block->host + offset;
1137     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1138 
1139     XBZRLE_cache_lock();
1140     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1141         migrate_use_xbzrle()) {
1142         pages = save_xbzrle_page(rs, &p, current_addr, block,
1143                                  offset, last_stage);
1144         if (!last_stage) {
1145             /* Can't send this cached data async, since the cache page
1146              * might get updated before it gets to the wire
1147              */
1148             send_async = false;
1149         }
1150     }
1151 
1152     /* XBZRLE overflow or normal page */
1153     if (pages == -1) {
1154         pages = save_normal_page(rs, block, offset, p, send_async);
1155     }
1156 
1157     XBZRLE_cache_unlock();
1158 
1159     return pages;
1160 }
1161 
1162 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1163                                  ram_addr_t offset)
1164 {
1165     if (multifd_queue_page(rs->f, block, offset) < 0) {
1166         return -1;
1167     }
1168     ram_counters.normal++;
1169 
1170     return 1;
1171 }
1172 
1173 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1174                                  ram_addr_t offset, uint8_t *source_buf)
1175 {
1176     RAMState *rs = ram_state;
1177     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1178     bool zero_page = false;
1179     int ret;
1180 
1181     if (save_zero_page_to_file(rs, f, block, offset)) {
1182         zero_page = true;
1183         goto exit;
1184     }
1185 
1186     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1187 
1188     /*
1189      * copy it to an internal buffer to avoid it being modified by the
1190      * VM, so that we can catch any error during compression and
1191      * decompression
1192      */
1193     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1194     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1195     if (ret < 0) {
1196         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1197         error_report("compressed data failed!");
1198         return false;
1199     }
1200 
1201 exit:
1202     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1203     return zero_page;
1204 }
1205 
1206 static void
1207 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1208 {
1209     ram_counters.transferred += bytes_xmit;
1210 
1211     if (param->zero_page) {
1212         ram_counters.duplicate++;
1213         return;
1214     }
1215 
1216     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1217     compression_counters.compressed_size += bytes_xmit - 8;
1218     compression_counters.pages++;
1219 }
1220 
1221 static bool save_page_use_compression(RAMState *rs);
1222 
1223 static void flush_compressed_data(RAMState *rs)
1224 {
1225     int idx, len, thread_count;
1226 
1227     if (!save_page_use_compression(rs)) {
1228         return;
1229     }
1230     thread_count = migrate_compress_threads();
1231 
1232     qemu_mutex_lock(&comp_done_lock);
1233     for (idx = 0; idx < thread_count; idx++) {
1234         while (!comp_param[idx].done) {
1235             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1236         }
1237     }
1238     qemu_mutex_unlock(&comp_done_lock);
1239 
1240     for (idx = 0; idx < thread_count; idx++) {
1241         qemu_mutex_lock(&comp_param[idx].mutex);
1242         if (!comp_param[idx].quit) {
1243             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1244             /*
1245              * it's safe to fetch zero_page without holding comp_done_lock
1246              * as there is no further request submitted to the thread,
1247              * i.e., the thread should be waiting for a request at this point.
1248              */
1249             update_compress_thread_counts(&comp_param[idx], len);
1250         }
1251         qemu_mutex_unlock(&comp_param[idx].mutex);
1252     }
1253 }
1254 
1255 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1256                                        ram_addr_t offset)
1257 {
1258     param->block = block;
1259     param->offset = offset;
1260 }
1261 
1262 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1263                                            ram_addr_t offset)
1264 {
1265     int idx, thread_count, bytes_xmit = -1, pages = -1;
1266     bool wait = migrate_compress_wait_thread();
1267 
1268     thread_count = migrate_compress_threads();
1269     qemu_mutex_lock(&comp_done_lock);
1270 retry:
1271     for (idx = 0; idx < thread_count; idx++) {
1272         if (comp_param[idx].done) {
1273             comp_param[idx].done = false;
1274             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1275             qemu_mutex_lock(&comp_param[idx].mutex);
1276             set_compress_params(&comp_param[idx], block, offset);
1277             qemu_cond_signal(&comp_param[idx].cond);
1278             qemu_mutex_unlock(&comp_param[idx].mutex);
1279             pages = 1;
1280             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1281             break;
1282         }
1283     }
1284 
1285     /*
1286      * wait for the free thread if the user specifies 'compress-wait-thread',
1287      * otherwise we will post the page out in the main thread as a normal page.
1288      */
1289     if (pages < 0 && wait) {
1290         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1291         goto retry;
1292     }
1293     qemu_mutex_unlock(&comp_done_lock);
1294 
1295     return pages;
1296 }
1297 
1298 /**
1299  * find_dirty_block: find the next dirty page and update any state
1300  * associated with the search process.
1301  *
1302  * Returns true if a page is found
1303  *
1304  * @rs: current RAM state
1305  * @pss: data about the state of the current dirty page scan
1306  * @again: set to false if the search has scanned the whole of RAM
1307  */
1308 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1309 {
1310     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1311     if (pss->complete_round && pss->block == rs->last_seen_block &&
1312         pss->page >= rs->last_page) {
1313         /*
1314          * We've been once around the RAM and haven't found anything.
1315          * Give up.
1316          */
1317         *again = false;
1318         return false;
1319     }
1320     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1321         >= pss->block->used_length) {
1322         /* Didn't find anything in this RAM Block */
1323         pss->page = 0;
1324         pss->block = QLIST_NEXT_RCU(pss->block, next);
1325         if (!pss->block) {
1326             /*
1327              * If memory migration starts over, we will meet a dirtied page
1328              * which may still exist in the compression threads' ring, so we
1329              * should flush the compressed data to make sure the new page
1330              * is not overwritten by the old one in the destination.
1331              *
1332              * Also, if xbzrle is on, stop using the data compression at this
1333              * point. In theory, xbzrle can do better than compression.
1334              */
1335             flush_compressed_data(rs);
1336 
1337             /* Hit the end of the list */
1338             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1339             /* Flag that we've looped */
1340             pss->complete_round = true;
1341             rs->ram_bulk_stage = false;
1342         }
1343         /* Didn't find anything this time, but try again on the new block */
1344         *again = true;
1345         return false;
1346     } else {
1347         /* Can go around again, but... */
1348         *again = true;
1349         /* We've found something so probably don't need to */
1350         return true;
1351     }
1352 }
1353 
1354 /**
1355  * unqueue_page: gets a page off the queue
1356  *
1357  * Helper for 'get_queued_page' - gets a page off the queue
1358  *
1359  * Returns the block of the page (or NULL if none available)
1360  *
1361  * @rs: current RAM state
1362  * @offset: used to return the offset within the RAMBlock
1363  */
1364 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1365 {
1366     RAMBlock *block = NULL;
1367 
1368     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1369         return NULL;
1370     }
1371 
1372     qemu_mutex_lock(&rs->src_page_req_mutex);
1373     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1374         struct RAMSrcPageRequest *entry =
1375                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1376         block = entry->rb;
1377         *offset = entry->offset;
1378 
1379         if (entry->len > TARGET_PAGE_SIZE) {
1380             entry->len -= TARGET_PAGE_SIZE;
1381             entry->offset += TARGET_PAGE_SIZE;
1382         } else {
1383             memory_region_unref(block->mr);
1384             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1385             g_free(entry);
1386             migration_consume_urgent_request();
1387         }
1388     }
1389     qemu_mutex_unlock(&rs->src_page_req_mutex);
1390 
1391     return block;
1392 }
1393 
1394 /**
1395  * get_queued_page: unqueue a page from the postcopy requests
1396  *
1397  * Skips pages that are already sent (!dirty)
1398  *
1399  * Returns true if a queued page is found
1400  *
1401  * @rs: current RAM state
1402  * @pss: data about the state of the current dirty page scan
1403  */
1404 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1405 {
1406     RAMBlock  *block;
1407     ram_addr_t offset;
1408     bool dirty;
1409 
1410     do {
1411         block = unqueue_page(rs, &offset);
1412         /*
1413          * We're sending this page, and since it's postcopy nothing else
1414          * will dirty it, and we must make sure it doesn't get sent again
1415          * even if this queue request was received after the background
1416          * search already sent it.
1417          */
1418         if (block) {
1419             unsigned long page;
1420 
1421             page = offset >> TARGET_PAGE_BITS;
1422             dirty = test_bit(page, block->bmap);
1423             if (!dirty) {
1424                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1425                                                 page);
1426             } else {
1427                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1428             }
1429         }
1430 
1431     } while (block && !dirty);
1432 
1433     if (block) {
1434         /*
1435          * As soon as we start servicing pages out of order, we have
1436          * to kill the bulk stage, since the bulk stage assumes
1437          * (in migration_bitmap_find_dirty) that every page is
1438          * dirty, and that's no longer true.
1439          */
1440         rs->ram_bulk_stage = false;
1441 
1442         /*
1443          * We want the background search to continue from the queued page
1444          * since the guest is likely to want other pages near to the page
1445          * it just requested.
1446          */
1447         pss->block = block;
1448         pss->page = offset >> TARGET_PAGE_BITS;
1449 
1450         /*
1451          * This unqueued page would break the "one round" check, even if
1452          * it is really rare.
1453          */
1454         pss->complete_round = false;
1455     }
1456 
1457     return !!block;
1458 }
1459 
1460 /**
1461  * migration_page_queue_free: drop any remaining pages in the ram
1462  * request queue
1463  *
1464  * It should be empty at the end anyway, but in error cases there may
1465  * be some left.  In case there are any pages left, we drop them.
1466  *
1467  */
1468 static void migration_page_queue_free(RAMState *rs)
1469 {
1470     struct RAMSrcPageRequest *mspr, *next_mspr;
1471     /* This queue generally should be empty - but in the case of a failed
1472      * migration it might have some droppings left in it.
1473      */
1474     RCU_READ_LOCK_GUARD();
1475     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1476         memory_region_unref(mspr->rb->mr);
1477         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1478         g_free(mspr);
1479     }
1480 }
1481 
1482 /**
1483  * ram_save_queue_pages: queue the page for transmission
1484  *
1485  * A request from postcopy destination for example.
1486  *
1487  * Returns zero on success or negative on error
1488  *
1489  * @rbname: Name of the RAMBlock of the request. NULL means the
1490  *          same as the last one.
1491  * @start: starting address from the start of the RAMBlock
1492  * @len: length (in bytes) to send
1493  */
1494 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1495 {
1496     RAMBlock *ramblock;
1497     RAMState *rs = ram_state;
1498 
1499     ram_counters.postcopy_requests++;
1500     RCU_READ_LOCK_GUARD();
1501 
1502     if (!rbname) {
1503         /* Reuse last RAMBlock */
1504         ramblock = rs->last_req_rb;
1505 
1506         if (!ramblock) {
1507             /*
1508              * Shouldn't happen, we can't reuse the last RAMBlock if
1509              * it's the 1st request.
1510              */
1511             error_report("ram_save_queue_pages no previous block");
1512             return -1;
1513         }
1514     } else {
1515         ramblock = qemu_ram_block_by_name(rbname);
1516 
1517         if (!ramblock) {
1518             /* We shouldn't be asked for a non-existent RAMBlock */
1519             error_report("ram_save_queue_pages no block '%s'", rbname);
1520             return -1;
1521         }
1522         rs->last_req_rb = ramblock;
1523     }
1524     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1525     if (start + len > ramblock->used_length) {
1526         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1527                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1528                      __func__, start, len, ramblock->used_length);
1529         return -1;
1530     }
1531 
1532     struct RAMSrcPageRequest *new_entry =
1533         g_malloc0(sizeof(struct RAMSrcPageRequest));
1534     new_entry->rb = ramblock;
1535     new_entry->offset = start;
1536     new_entry->len = len;
1537 
1538     memory_region_ref(ramblock->mr);
1539     qemu_mutex_lock(&rs->src_page_req_mutex);
1540     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1541     migration_make_urgent_request();
1542     qemu_mutex_unlock(&rs->src_page_req_mutex);
1543 
1544     return 0;
1545 }
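/*
 * Hypothetical call for illustration (block name and offset made up): the
 * source's return-path thread, on receiving a postcopy page request from
 * the destination, might queue one target page of "pc.ram" with
 *
 *     ram_save_queue_pages("pc.ram", 0x1234000, TARGET_PAGE_SIZE);
 *
 * and the main migration loop later picks it up via get_queued_page().
 */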
1546 
1547 static bool save_page_use_compression(RAMState *rs)
1548 {
1549     if (!migrate_use_compression()) {
1550         return false;
1551     }
1552 
1553     /*
1554      * If xbzrle is on, stop using the data compression after the first
1555      * round of migration even if compression is enabled. In theory,
1556      * xbzrle can do better than compression.
1557      */
1558     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1559         return true;
1560     }
1561 
1562     return false;
1563 }
1564 
1565 /*
1566  * try to compress the page before posting it out, return true if the page
1567  * has been properly handled by compression, otherwise it needs other
1568  * paths to handle it
1569  */
1570 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1571 {
1572     if (!save_page_use_compression(rs)) {
1573         return false;
1574     }
1575 
1576     /*
1577      * When starting the process of a new block, the first page of
1578      * the block should be sent out before other pages in the same
1579      * block, and all the pages in the last block should have been sent
1580      * out.  Keeping this order is important, because the 'cont' flag
1581      * is used to avoid resending the block name.
1582      *
1583      * We post the first page as a normal page as compression will take
1584      * much CPU resource.
1585      */
1586     if (block != rs->last_sent_block) {
1587         flush_compressed_data(rs);
1588         return false;
1589     }
1590 
1591     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1592         return true;
1593     }
1594 
1595     compression_counters.busy++;
1596     return false;
1597 }
1598 
1599 /**
1600  * ram_save_target_page: save one target page
1601  *
1602  * Returns the number of pages written
1603  *
1604  * @rs: current RAM state
1605  * @pss: data about the page we want to send
1606  * @last_stage: if we are at the completion stage
1607  */
1608 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1609                                 bool last_stage)
1610 {
1611     RAMBlock *block = pss->block;
1612     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1613     int res;
1614 
1615     if (control_save_page(rs, block, offset, &res)) {
1616         return res;
1617     }
1618 
1619     if (save_compress_page(rs, block, offset)) {
1620         return 1;
1621     }
1622 
1623     res = save_zero_page(rs, block, offset);
1624     if (res > 0) {
1625         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1626          * page would be stale
1627          */
1628         if (!save_page_use_compression(rs)) {
1629             XBZRLE_cache_lock();
1630             xbzrle_cache_zero_page(rs, block->offset + offset);
1631             XBZRLE_cache_unlock();
1632         }
1633         ram_release_pages(block->idstr, offset, res);
1634         return res;
1635     }
1636 
1637     /*
1638      * Do not use multifd for:
1639      * 1. Compression as the first page in the new block should be posted out
1640      *    before sending the compressed page
1641      * 2. In postcopy as one whole host page should be placed
1642      */
1643     if (!save_page_use_compression(rs) && migrate_use_multifd()
1644         && !migration_in_postcopy()) {
1645         return ram_save_multifd_page(rs, block, offset);
1646     }
1647 
1648     return ram_save_page(rs, pss, last_stage);
1649 }
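
/*
 * Illustrative sketch (not part of the original code): the order in which
 * ram_save_target_page() above tries the different save paths.  All helpers
 * named here are the ones defined in this file.
 *
 *     control_save_page()        -> RDMA/registered hook handled the page
 *     save_compress_page()       -> handed to a compression thread
 *     save_zero_page() > 0       -> sent as a zero page
 *     multifd, no compression,
 *     not in postcopy            -> ram_save_multifd_page()
 *     otherwise                  -> ram_save_page() (XBZRLE or raw copy)
 */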
1650 
1651 /**
1652  * ram_save_host_page: save a whole host page
1653  *
1654  * Starting at *offset send pages up to the end of the current host
1655  * page. It's valid for the initial offset to point into the middle of
1656  * a host page in which case the remainder of the hostpage is sent.
1657  * Only dirty target pages are sent. Note that the host page size may
1658  * be a huge page for this block.
1659  * The saving stops at the boundary of the used_length of the block
1660  * if the RAMBlock isn't a multiple of the host page size.
1661  *
1662  * Returns the number of pages written or negative on error
1663  *
1664  * @rs: current RAM state
1666  * @pss: data about the page we want to send
1667  * @last_stage: if we are at the completion stage
1668  */
1669 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1670                               bool last_stage)
1671 {
1672     int tmppages, pages = 0;
1673     size_t pagesize_bits =
1674         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1675 
1676     if (ramblock_is_ignored(pss->block)) {
1677         error_report("block %s should not be migrated !", pss->block->idstr);
1678         return 0;
1679     }
1680 
1681     do {
1682         /* Check if the page is dirty and, if so, send it */
1683         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1684             pss->page++;
1685             continue;
1686         }
1687 
1688         tmppages = ram_save_target_page(rs, pss, last_stage);
1689         if (tmppages < 0) {
1690             return tmppages;
1691         }
1692 
1693         pages += tmppages;
1694         pss->page++;
1695         /* Allow rate limiting to happen in the middle of huge pages */
1696         migration_rate_limit();
1697     } while ((pss->page & (pagesize_bits - 1)) &&
1698              offset_in_ramblock(pss->block,
1699                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1700 
1701     /* The offset we leave with is the last one we looked at */
1702     pss->page--;
1703     return pages;
1704 }
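
/*
 * Worked example (illustrative only): for a RAMBlock backed by 2MiB huge
 * pages with a 4KiB target page size, pagesize_bits is 512, so
 * ram_save_host_page() keeps sending dirty target pages until pss->page
 * reaches the next multiple of 512 (the host-page boundary) or runs past
 * used_length.  Entering with pss->page == 515 means target pages 515..1023
 * of that host page are considered, and pss->page is left at 1023 on return.
 */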
1705 
1706 /**
1707  * ram_find_and_save_block: finds a dirty page and sends it to f
1708  *
1709  * Called within an RCU critical section.
1710  *
1711  * Returns the number of pages written where zero means no dirty pages,
1712  * or negative on error
1713  *
1714  * @rs: current RAM state
1715  * @last_stage: if we are at the completion stage
1716  *
1717  * On systems where host-page-size > target-page-size it will send all the
1718  * pages in a host page that are dirty.
1719  */
1720 
1721 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1722 {
1723     PageSearchStatus pss;
1724     int pages = 0;
1725     bool again, found;
1726 
1727     /* No dirty page as there is zero RAM */
1728     if (!ram_bytes_total()) {
1729         return pages;
1730     }
1731 
1732     pss.block = rs->last_seen_block;
1733     pss.page = rs->last_page;
1734     pss.complete_round = false;
1735 
1736     if (!pss.block) {
1737         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1738     }
1739 
1740     do {
1741         again = true;
1742         found = get_queued_page(rs, &pss);
1743 
1744         if (!found) {
1745             /* priority queue empty, so just search for something dirty */
1746             found = find_dirty_block(rs, &pss, &again);
1747         }
1748 
1749         if (found) {
1750             pages = ram_save_host_page(rs, &pss, last_stage);
1751         }
1752     } while (!pages && again);
1753 
1754     rs->last_seen_block = pss.block;
1755     rs->last_page = pss.page;
1756 
1757     return pages;
1758 }
1759 
1760 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1761 {
1762     uint64_t pages = size / TARGET_PAGE_SIZE;
1763 
1764     if (zero) {
1765         ram_counters.duplicate += pages;
1766     } else {
1767         ram_counters.normal += pages;
1768         ram_counters.transferred += size;
1769         qemu_update_position(f, size);
1770     }
1771 }
1772 
1773 static uint64_t ram_bytes_total_common(bool count_ignored)
1774 {
1775     RAMBlock *block;
1776     uint64_t total = 0;
1777 
1778     RCU_READ_LOCK_GUARD();
1779 
1780     if (count_ignored) {
1781         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1782             total += block->used_length;
1783         }
1784     } else {
1785         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1786             total += block->used_length;
1787         }
1788     }
1789     return total;
1790 }
1791 
1792 uint64_t ram_bytes_total(void)
1793 {
1794     return ram_bytes_total_common(false);
1795 }
1796 
1797 static void xbzrle_load_setup(void)
1798 {
1799     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1800 }
1801 
1802 static void xbzrle_load_cleanup(void)
1803 {
1804     g_free(XBZRLE.decoded_buf);
1805     XBZRLE.decoded_buf = NULL;
1806 }
1807 
1808 static void ram_state_cleanup(RAMState **rsp)
1809 {
1810     if (*rsp) {
1811         migration_page_queue_free(*rsp);
1812         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1813         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1814         g_free(*rsp);
1815         *rsp = NULL;
1816     }
1817 }
1818 
1819 static void xbzrle_cleanup(void)
1820 {
1821     XBZRLE_cache_lock();
1822     if (XBZRLE.cache) {
1823         cache_fini(XBZRLE.cache);
1824         g_free(XBZRLE.encoded_buf);
1825         g_free(XBZRLE.current_buf);
1826         g_free(XBZRLE.zero_target_page);
1827         XBZRLE.cache = NULL;
1828         XBZRLE.encoded_buf = NULL;
1829         XBZRLE.current_buf = NULL;
1830         XBZRLE.zero_target_page = NULL;
1831     }
1832     XBZRLE_cache_unlock();
1833 }
1834 
1835 static void ram_save_cleanup(void *opaque)
1836 {
1837     RAMState **rsp = opaque;
1838     RAMBlock *block;
1839 
1840     /* The caller must hold the iothread lock or be in a bottom half, so
1841      * there is no race writing the migration bitmap.
1842      */
1843     memory_global_dirty_log_stop();
1844 
1845     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1846         g_free(block->clear_bmap);
1847         block->clear_bmap = NULL;
1848         g_free(block->bmap);
1849         block->bmap = NULL;
1850     }
1851 
1852     xbzrle_cleanup();
1853     compress_threads_save_cleanup();
1854     ram_state_cleanup(rsp);
1855 }
1856 
1857 static void ram_state_reset(RAMState *rs)
1858 {
1859     rs->last_seen_block = NULL;
1860     rs->last_sent_block = NULL;
1861     rs->last_page = 0;
1862     rs->last_version = ram_list.version;
1863     rs->ram_bulk_stage = true;
1864     rs->fpo_enabled = false;
1865 }
1866 
1867 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1868 
1869 /*
1870  * 'expected' is the value you expect the bitmap mostly to be full
1871  * of; it won't bother printing lines that are all this value.
1873  */
1874 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1875                            unsigned long pages)
1876 {
1877     int64_t cur;
1878     int64_t linelen = 128;
1879     char linebuf[129];
1880 
1881     for (cur = 0; cur < pages; cur += linelen) {
1882         int64_t curb;
1883         bool found = false;
1884         /*
1885          * Last line; catch the case where the line length
1886          * is longer than remaining ram
1887          */
1888         if (cur + linelen > pages) {
1889             linelen = pages - cur;
1890         }
1891         for (curb = 0; curb < linelen; curb++) {
1892             bool thisbit = test_bit(cur + curb, todump);
1893             linebuf[curb] = thisbit ? '1' : '.';
1894             found = found || (thisbit != expected);
1895         }
1896         if (found) {
1897             linebuf[curb] = '\0';
1898             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1899         }
1900     }
1901 }
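
/*
 * Example of the output format with hypothetical bitmap contents: each
 * printed line covers up to 128 pages starting at the hex page index on the
 * left, with '1' for a set bit and '.' for a clear one; lines whose bits all
 * equal 'expected' are suppressed, e.g.
 *
 *     0x00000080 : 11......1111....   (remaining columns elided)
 */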
1902 
1903 /* **** functions for postcopy ***** */
1904 
1905 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1906 {
1907     struct RAMBlock *block;
1908 
1909     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1910         unsigned long *bitmap = block->bmap;
1911         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1912         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1913 
1914         while (run_start < range) {
1915             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1916             ram_discard_range(block->idstr,
1917                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1918                               ((ram_addr_t)(run_end - run_start))
1919                                 << TARGET_PAGE_BITS);
1920             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1921         }
1922     }
1923 }
1924 
1925 /**
1926  * postcopy_send_discard_bm_ram: discard a RAMBlock
1927  *
1928  * Returns zero on success
1929  *
1930  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1931  *
1932  * @ms: current migration state
1933  * @block: RAMBlock to discard
1934  */
1935 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1936 {
1937     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1938     unsigned long current;
1939     unsigned long *bitmap = block->bmap;
1940 
1941     for (current = 0; current < end; ) {
1942         unsigned long one = find_next_bit(bitmap, end, current);
1943         unsigned long zero, discard_length;
1944 
1945         if (one >= end) {
1946             break;
1947         }
1948 
1949         zero = find_next_zero_bit(bitmap, end, one + 1);
1950 
1951         if (zero >= end) {
1952             discard_length = end - one;
1953         } else {
1954             discard_length = zero - one;
1955         }
1956         postcopy_discard_send_range(ms, one, discard_length);
1957         current = one + discard_length;
1958     }
1959 
1960     return 0;
1961 }
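
/*
 * Worked example (illustrative only): for a 10-page block whose dirty
 * bitmap is 0011100110 (bit 0 leftmost), the loop above emits two discard
 * ranges, start=2 length=3 and start=7 length=2, and then stops because the
 * next set-bit search reaches the end of the bitmap.
 */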
1962 
1963 /**
1964  * postcopy_each_ram_send_discard: discard all RAMBlocks
1965  *
1966  * Returns 0 for success or negative for error
1967  *
1968  * Utility for the outgoing postcopy code.
1969  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1970  *   passing it bitmap indexes and name.
1971  * (qemu_ram_foreach_block ends up passing unscaled lengths
1972  *  which would mean postcopy code would have to deal with target page)
1973  *
1974  * @ms: current migration state
1975  */
1976 static int postcopy_each_ram_send_discard(MigrationState *ms)
1977 {
1978     struct RAMBlock *block;
1979     int ret;
1980 
1981     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1982         postcopy_discard_send_init(ms, block->idstr);
1983 
1984         /*
1985          * Postcopy sends chunks of bitmap over the wire, but it
1986          * just needs indexes at this point, avoids it having
1987          * target page specific code.
1988          */
1989         ret = postcopy_send_discard_bm_ram(ms, block);
1990         postcopy_discard_send_finish(ms);
1991         if (ret) {
1992             return ret;
1993         }
1994     }
1995 
1996     return 0;
1997 }
1998 
1999 /**
2000  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2001  *
2002  * Helper for postcopy_chunk_hostpages; it's called twice to
2003  * canonicalize the two bitmaps, that are similar, but one is
2004  * inverted.
2005  *
2006  * Postcopy requires that all target pages in a hostpage are dirty or
2007  * clean, not a mix.  This function canonicalizes the bitmaps.
2008  *
2009  * @ms: current migration state
2010  * @block: block that contains the page we want to canonicalize
2011  */
2012 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2013 {
2014     RAMState *rs = ram_state;
2015     unsigned long *bitmap = block->bmap;
2016     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2017     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2018     unsigned long run_start;
2019 
2020     if (block->page_size == TARGET_PAGE_SIZE) {
2021         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2022         return;
2023     }
2024 
2025     /* Find a dirty page */
2026     run_start = find_next_bit(bitmap, pages, 0);
2027 
2028     while (run_start < pages) {
2029 
2030         /*
2031          * If the start of this run of pages is in the middle of a host
2032          * page, then we need to fixup this host page.
2033          */
2034         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2035             /* Find the end of this run */
2036             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2037             /*
2038              * If the end isn't at the start of a host page, then the
2039              * run doesn't finish at the end of a host page
2040              * and we need to discard.
2041              */
2042         }
2043 
2044         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2045             unsigned long page;
2046             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2047                                                              host_ratio);
2048             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2049 
2050             /* Clean up the bitmap */
2051             for (page = fixup_start_addr;
2052                  page < fixup_start_addr + host_ratio; page++) {
2053                 /*
2054                  * Remark them as dirty, updating the count for any pages
2055                  * that weren't previously dirty.
2056                  */
2057                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2058             }
2059         }
2060 
2061         /* Find the next dirty page for the next iteration */
2062         run_start = find_next_bit(bitmap, pages, run_start);
2063     }
2064 }
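
/*
 * Worked example (illustrative only): with host_ratio == 4 (e.g. 16KiB host
 * pages, 4KiB target pages) and a dirty run covering target pages 5..9, the
 * pass above re-dirties pages 4..7 and then 8..11, so both host pages
 * touched by the run end up fully dirty, incrementing migration_dirty_pages
 * for every bit that was not already set.
 */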
2065 
2066 /**
2067  * postcopy_chunk_hostpages: discard any partially sent host page
2068  *
2069  * Utility for the outgoing postcopy code.
2070  *
2071  * Discard any partially sent host-page size chunks, mark any partially
2072  * dirty host-page size chunks as all dirty.  In this case the host-page
2073  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2074  *
2075  * Returns zero on success
2076  *
2077  * @ms: current migration state
2078  * @block: block we want to work with
2079  */
2080 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2081 {
2082     postcopy_discard_send_init(ms, block->idstr);
2083 
2084     /*
2085      * Ensure that all partially dirty host pages are made fully dirty.
2086      */
2087     postcopy_chunk_hostpages_pass(ms, block);
2088 
2089     postcopy_discard_send_finish(ms);
2090     return 0;
2091 }
2092 
2093 /**
2094  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2095  *
2096  * Returns zero on success
2097  *
2098  * Transmit the set of pages to be discarded after precopy to the target;
2099  * these are pages that:
2100  *     a) have been previously transmitted but are now dirty again
2101  *     b) have never been transmitted; this ensures that any pages on the
2102  *        destination that have been mapped by background tasks get
2103  *        discarded (transparent huge pages are the specific concern)
2104  * Hopefully this is pretty sparse.
2105  *
2106  * @ms: current migration state
2107  */
2108 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2109 {
2110     RAMState *rs = ram_state;
2111     RAMBlock *block;
2112     int ret;
2113 
2114     RCU_READ_LOCK_GUARD();
2115 
2116     /* This should be our last sync, the src is now paused */
2117     migration_bitmap_sync(rs);
2118 
2119     /* Easiest way to make sure we don't resume in the middle of a host-page */
2120     rs->last_seen_block = NULL;
2121     rs->last_sent_block = NULL;
2122     rs->last_page = 0;
2123 
2124     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2125         /* Deal with TPS != HPS and huge pages */
2126         ret = postcopy_chunk_hostpages(ms, block);
2127         if (ret) {
2128             return ret;
2129         }
2130 
2131 #ifdef DEBUG_POSTCOPY
2132         ram_debug_dump_bitmap(block->bmap, true,
2133                               block->used_length >> TARGET_PAGE_BITS);
2134 #endif
2135     }
2136     trace_ram_postcopy_send_discard_bitmap();
2137 
2138     return postcopy_each_ram_send_discard(ms);
2139 }
2140 
2141 /**
2142  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2143  *
2144  * Returns zero on success
2145  *
2146  * @rbname: name of the RAMBlock of the request. NULL means the
2147  *          same as the last one.
2148  * @start: starting offset within the RAMBlock, in bytes
2149  * @length: length of the range to discard, in bytes
2150  */
2151 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2152 {
2153     trace_ram_discard_range(rbname, start, length);
2154 
2155     RCU_READ_LOCK_GUARD();
2156     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2157 
2158     if (!rb) {
2159         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2160         return -1;
2161     }
2162 
2163     /*
2164      * On source VM, we don't need to update the received bitmap since
2165      * we don't even have one.
2166      */
2167     if (rb->receivedmap) {
2168         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2169                      length >> qemu_target_page_bits());
2170     }
2171 
2172     return ram_block_discard_range(rb, start, length);
2173 }
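
/*
 * Usage sketch (illustrative only; the block name is hypothetical):
 *
 *     if (ram_discard_range("example.ram", 0, 2 * 1024 * 1024) < 0) {
 *         error_report("discard failed");
 *     }
 *
 * ram_discard_range() clears any receivedmap bits for the range (when one
 * exists, i.e. on the destination) and then asks ram_block_discard_range()
 * to drop the backing pages.
 */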
2174 
2175 /*
2176  * For every allocation, we will try not to crash the VM if the
2177  * allocation fails.
2178  */
2179 static int xbzrle_init(void)
2180 {
2181     Error *local_err = NULL;
2182 
2183     if (!migrate_use_xbzrle()) {
2184         return 0;
2185     }
2186 
2187     XBZRLE_cache_lock();
2188 
2189     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2190     if (!XBZRLE.zero_target_page) {
2191         error_report("%s: Error allocating zero page", __func__);
2192         goto err_out;
2193     }
2194 
2195     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2196                               TARGET_PAGE_SIZE, &local_err);
2197     if (!XBZRLE.cache) {
2198         error_report_err(local_err);
2199         goto free_zero_page;
2200     }
2201 
2202     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2203     if (!XBZRLE.encoded_buf) {
2204         error_report("%s: Error allocating encoded_buf", __func__);
2205         goto free_cache;
2206     }
2207 
2208     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2209     if (!XBZRLE.current_buf) {
2210         error_report("%s: Error allocating current_buf", __func__);
2211         goto free_encoded_buf;
2212     }
2213 
2214     /* We are all good */
2215     XBZRLE_cache_unlock();
2216     return 0;
2217 
2218 free_encoded_buf:
2219     g_free(XBZRLE.encoded_buf);
2220     XBZRLE.encoded_buf = NULL;
2221 free_cache:
2222     cache_fini(XBZRLE.cache);
2223     XBZRLE.cache = NULL;
2224 free_zero_page:
2225     g_free(XBZRLE.zero_target_page);
2226     XBZRLE.zero_target_page = NULL;
2227 err_out:
2228     XBZRLE_cache_unlock();
2229     return -ENOMEM;
2230 }
2231 
2232 static int ram_state_init(RAMState **rsp)
2233 {
2234     *rsp = g_try_new0(RAMState, 1);
2235 
2236     if (!*rsp) {
2237         error_report("%s: Init ramstate fail", __func__);
2238         return -1;
2239     }
2240 
2241     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2242     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2243     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2244 
2245     /*
2246      * Count the total number of pages used by ram blocks not including any
2247      * gaps due to alignment or unplugs.
2248      * This must match with the initial values of dirty bitmap.
2249      */
2250     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2251     ram_state_reset(*rsp);
2252 
2253     return 0;
2254 }
2255 
2256 static void ram_list_init_bitmaps(void)
2257 {
2258     MigrationState *ms = migrate_get_current();
2259     RAMBlock *block;
2260     unsigned long pages;
2261     uint8_t shift;
2262 
2263     /* Skip setting bitmap if there is no RAM */
2264     if (ram_bytes_total()) {
2265         shift = ms->clear_bitmap_shift;
2266         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2267             error_report("clear_bitmap_shift (%u) too big, using "
2268                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2269             shift = CLEAR_BITMAP_SHIFT_MAX;
2270         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2271             error_report("clear_bitmap_shift (%u) too small, using "
2272                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2273             shift = CLEAR_BITMAP_SHIFT_MIN;
2274         }
2275 
2276         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2277             pages = block->max_length >> TARGET_PAGE_BITS;
2278             /*
2279              * The initial dirty bitmap for migration must be set with all
2280              * ones to make sure we'll migrate every guest RAM page to
2281              * the destination.
2282              * Here we set RAMBlock.bmap all to 1 because when restarting a
2283              * migration after a failed one, ram_list.
2284              * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2285              * guest memory.
2286              */
2287             block->bmap = bitmap_new(pages);
2288             bitmap_set(block->bmap, 0, pages);
2289             block->clear_bmap_shift = shift;
2290             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2291         }
2292     }
2293 }
2294 
2295 static void ram_init_bitmaps(RAMState *rs)
2296 {
2297     /* For memory_global_dirty_log_start below.  */
2298     qemu_mutex_lock_iothread();
2299     qemu_mutex_lock_ramlist();
2300 
2301     WITH_RCU_READ_LOCK_GUARD() {
2302         ram_list_init_bitmaps();
2303         memory_global_dirty_log_start();
2304         migration_bitmap_sync_precopy(rs);
2305     }
2306     qemu_mutex_unlock_ramlist();
2307     qemu_mutex_unlock_iothread();
2308 }
2309 
2310 static int ram_init_all(RAMState **rsp)
2311 {
2312     if (ram_state_init(rsp)) {
2313         return -1;
2314     }
2315 
2316     if (xbzrle_init()) {
2317         ram_state_cleanup(rsp);
2318         return -1;
2319     }
2320 
2321     ram_init_bitmaps(*rsp);
2322 
2323     return 0;
2324 }
2325 
2326 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2327 {
2328     RAMBlock *block;
2329     uint64_t pages = 0;
2330 
2331     /*
2332      * Postcopy is not using xbzrle/compression, so no need for that.
2333      * Also, since the source is already halted, we don't need to care
2334      * about dirty page logging either.
2335      */
2336 
2337     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2338         pages += bitmap_count_one(block->bmap,
2339                                   block->used_length >> TARGET_PAGE_BITS);
2340     }
2341 
2342     /* This may not be aligned with current bitmaps. Recalculate. */
2343     rs->migration_dirty_pages = pages;
2344 
2345     rs->last_seen_block = NULL;
2346     rs->last_sent_block = NULL;
2347     rs->last_page = 0;
2348     rs->last_version = ram_list.version;
2349     /*
2350      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2351      * matter what we have sent.
2352      */
2353     rs->ram_bulk_stage = false;
2354 
2355     /* Update RAMState cache of output QEMUFile */
2356     rs->f = out;
2357 
2358     trace_ram_state_resume_prepare(pages);
2359 }
2360 
2361 /*
2362  * This function clears the bits corresponding to the free pages reported by
2363  * the caller from the migration dirty bitmap. @addr is the host address of
2364  * the start of the contiguous guest free pages, and @len is the total size
2365  * of those pages in bytes.
2366  */
2367 void qemu_guest_free_page_hint(void *addr, size_t len)
2368 {
2369     RAMBlock *block;
2370     ram_addr_t offset;
2371     size_t used_len, start, npages;
2372     MigrationState *s = migrate_get_current();
2373 
2374     /* This function is currently expected to be used during live migration */
2375     if (!migration_is_setup_or_active(s->state)) {
2376         return;
2377     }
2378 
2379     for (; len > 0; len -= used_len, addr += used_len) {
2380         block = qemu_ram_block_from_host(addr, false, &offset);
2381         if (unlikely(!block || offset >= block->used_length)) {
2382             /*
2383              * The implementation might not support RAMBlock resize during
2384              * live migration, but it could happen in theory with future
2385              * updates. So we add a check here to capture that case.
2386              */
2387             error_report_once("%s unexpected error", __func__);
2388             return;
2389         }
2390 
2391         if (len <= block->used_length - offset) {
2392             used_len = len;
2393         } else {
2394             used_len = block->used_length - offset;
2395         }
2396 
2397         start = offset >> TARGET_PAGE_BITS;
2398         npages = used_len >> TARGET_PAGE_BITS;
2399 
2400         qemu_mutex_lock(&ram_state->bitmap_mutex);
2401         ram_state->migration_dirty_pages -=
2402                       bitmap_count_one_with_offset(block->bmap, start, npages);
2403         bitmap_clear(block->bmap, start, npages);
2404         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2405     }
2406 }
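
/*
 * Example of the per-iteration arithmetic above (illustrative only): if a
 * 3MiB hint starts at an offset that leaves only 1MiB in the current block,
 * the first iteration clears used_len = 1MiB worth of bits (npages = 256
 * with 4KiB target pages) under bitmap_mutex, and the remaining 2MiB are
 * resolved against the following block(s) on later iterations.
 */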
2407 
2408 /*
2409  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2410  * long-running RCU critical section.  When rcu-reclaims in the code
2411  * start to become numerous it will be necessary to reduce the
2412  * granularity of these critical sections.
2413  */
2414 
2415 /**
2416  * ram_save_setup: Setup RAM for migration
2417  *
2418  * Returns zero to indicate success and negative for error
2419  *
2420  * @f: QEMUFile where to send the data
2421  * @opaque: RAMState pointer
2422  */
2423 static int ram_save_setup(QEMUFile *f, void *opaque)
2424 {
2425     RAMState **rsp = opaque;
2426     RAMBlock *block;
2427 
2428     if (compress_threads_save_setup()) {
2429         return -1;
2430     }
2431 
2432     /* migration has already set up the bitmap, reuse it. */
2433     if (!migration_in_colo_state()) {
2434         if (ram_init_all(rsp) != 0) {
2435             compress_threads_save_cleanup();
2436             return -1;
2437         }
2438     }
2439     (*rsp)->f = f;
2440 
2441     WITH_RCU_READ_LOCK_GUARD() {
2442         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2443 
2444         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2445             qemu_put_byte(f, strlen(block->idstr));
2446             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2447             qemu_put_be64(f, block->used_length);
2448             if (migrate_postcopy_ram() && block->page_size !=
2449                                           qemu_host_page_size) {
2450                 qemu_put_be64(f, block->page_size);
2451             }
2452             if (migrate_ignore_shared()) {
2453                 qemu_put_be64(f, block->mr->addr);
2454             }
2455         }
2456     }
2457 
2458     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2459     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2460 
2461     multifd_send_sync_main(f);
2462     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2463     qemu_fflush(f);
2464 
2465     return 0;
2466 }
2467 
2468 /**
2469  * ram_save_iterate: iterative stage for migration
2470  *
2471  * Returns zero to indicate success and negative for error
2472  *
2473  * @f: QEMUFile where to send the data
2474  * @opaque: RAMState pointer
2475  */
2476 static int ram_save_iterate(QEMUFile *f, void *opaque)
2477 {
2478     RAMState **temp = opaque;
2479     RAMState *rs = *temp;
2480     int ret = 0;
2481     int i;
2482     int64_t t0;
2483     int done = 0;
2484 
2485     if (blk_mig_bulk_active()) {
2486         /* Avoid transferring ram during bulk phase of block migration as
2487          * the bulk phase will usually take a long time and transferring
2488          * ram updates during that time is pointless. */
2489         goto out;
2490     }
2491 
2492     WITH_RCU_READ_LOCK_GUARD() {
2493         if (ram_list.version != rs->last_version) {
2494             ram_state_reset(rs);
2495         }
2496 
2497         /* Read version before ram_list.blocks */
2498         smp_rmb();
2499 
2500         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2501 
2502         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2503         i = 0;
2504         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2505                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2506             int pages;
2507 
2508             if (qemu_file_get_error(f)) {
2509                 break;
2510             }
2511 
2512             pages = ram_find_and_save_block(rs, false);
2513             /* no more pages to send */
2514             if (pages == 0) {
2515                 done = 1;
2516                 break;
2517             }
2518 
2519             if (pages < 0) {
2520                 qemu_file_set_error(f, pages);
2521                 break;
2522             }
2523 
2524             rs->target_page_count += pages;
2525 
2526             /*
2527              * During postcopy, it is necessary to make sure one whole host
2528              * page is sent in one chunk.
2529              */
2530             if (migrate_postcopy_ram()) {
2531                 flush_compressed_data(rs);
2532             }
2533 
2534             /*
2535              * We want to check in the 1st loop, just in case it was the 1st
2536              * time and we had to sync the dirty bitmap.
2537              * qemu_clock_get_ns() is a bit expensive, so we only check once
2538              * every few iterations.
2539              */
2540             if ((i & 63) == 0) {
2541                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2542                               1000000;
2543                 if (t1 > MAX_WAIT) {
2544                     trace_ram_save_iterate_big_wait(t1, i);
2545                     break;
2546                 }
2547             }
2548             i++;
2549         }
2550     }
2551 
2552     /*
2553      * Must occur before EOS (or any QEMUFile operation)
2554      * because of RDMA protocol.
2555      */
2556     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2557 
2558 out:
2559     if (ret >= 0
2560         && migration_is_setup_or_active(migrate_get_current()->state)) {
2561         multifd_send_sync_main(rs->f);
2562         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2563         qemu_fflush(f);
2564         ram_counters.transferred += 8;
2565 
2566         ret = qemu_file_get_error(f);
2567     }
2568     if (ret < 0) {
2569         return ret;
2570     }
2571 
2572     return done;
2573 }
2574 
2575 /**
2576  * ram_save_complete: function called to send the remaining amount of ram
2577  *
2578  * Returns zero to indicate success or negative on error
2579  *
2580  * Called with iothread lock
2581  *
2582  * @f: QEMUFile where to send the data
2583  * @opaque: RAMState pointer
2584  */
2585 static int ram_save_complete(QEMUFile *f, void *opaque)
2586 {
2587     RAMState **temp = opaque;
2588     RAMState *rs = *temp;
2589     int ret = 0;
2590 
2591     WITH_RCU_READ_LOCK_GUARD() {
2592         if (!migration_in_postcopy()) {
2593             migration_bitmap_sync_precopy(rs);
2594         }
2595 
2596         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2597 
2598         /* try transferring iterative blocks of memory */
2599 
2600         /* flush all remaining blocks regardless of rate limiting */
2601         while (true) {
2602             int pages;
2603 
2604             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2605             /* no more blocks to send */
2606             if (pages == 0) {
2607                 break;
2608             }
2609             if (pages < 0) {
2610                 ret = pages;
2611                 break;
2612             }
2613         }
2614 
2615         flush_compressed_data(rs);
2616         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2617     }
2618 
2619     if (ret >= 0) {
2620         multifd_send_sync_main(rs->f);
2621         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2622         qemu_fflush(f);
2623     }
2624 
2625     return ret;
2626 }
2627 
2628 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2629                              uint64_t *res_precopy_only,
2630                              uint64_t *res_compatible,
2631                              uint64_t *res_postcopy_only)
2632 {
2633     RAMState **temp = opaque;
2634     RAMState *rs = *temp;
2635     uint64_t remaining_size;
2636 
2637     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2638 
2639     if (!migration_in_postcopy() &&
2640         remaining_size < max_size) {
2641         qemu_mutex_lock_iothread();
2642         WITH_RCU_READ_LOCK_GUARD() {
2643             migration_bitmap_sync_precopy(rs);
2644         }
2645         qemu_mutex_unlock_iothread();
2646         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2647     }
2648 
2649     if (migrate_postcopy_ram()) {
2650         /* We can do postcopy, and all the data is postcopiable */
2651         *res_compatible += remaining_size;
2652     } else {
2653         *res_precopy_only += remaining_size;
2654     }
2655 }
2656 
2657 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2658 {
2659     unsigned int xh_len;
2660     int xh_flags;
2661     uint8_t *loaded_data;
2662 
2663     /* extract RLE header */
2664     xh_flags = qemu_get_byte(f);
2665     xh_len = qemu_get_be16(f);
2666 
2667     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2668         error_report("Failed to load XBZRLE page - wrong compression!");
2669         return -1;
2670     }
2671 
2672     if (xh_len > TARGET_PAGE_SIZE) {
2673         error_report("Failed to load XBZRLE page - len overflow!");
2674         return -1;
2675     }
2676     loaded_data = XBZRLE.decoded_buf;
2677     /* load data and decode */
2678     /* it can change loaded_data to point to an internal buffer */
2679     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2680 
2681     /* decode RLE */
2682     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2683                              TARGET_PAGE_SIZE) == -1) {
2684         error_report("Failed to load XBZRLE page - decode error!");
2685         return -1;
2686     }
2687 
2688     return 0;
2689 }
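
/*
 * Sketch of the on-the-wire layout consumed by load_xbzrle() above (derived
 * from the reads it performs, descriptive only):
 *
 *     1 byte   ENCODING_FLAG_XBZRLE
 *     2 bytes  encoded length, big endian (must be <= TARGET_PAGE_SIZE)
 *     N bytes  XBZRLE-encoded delta against the cached copy of the page
 */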
2690 
2691 /**
2692  * ram_block_from_stream: read a RAMBlock id from the migration stream
2693  *
2694  * Must be called from within a rcu critical section.
2695  *
2696  * Returns a pointer from within the RCU-protected ram_list.
2697  *
2698  * @f: QEMUFile where to read the data from
2699  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2700  */
2701 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2702 {
2703     static RAMBlock *block = NULL;
2704     char id[256];
2705     uint8_t len;
2706 
2707     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2708         if (!block) {
2709             error_report("Ack, bad migration stream!");
2710             return NULL;
2711         }
2712         return block;
2713     }
2714 
2715     len = qemu_get_byte(f);
2716     qemu_get_buffer(f, (uint8_t *)id, len);
2717     id[len] = 0;
2718 
2719     block = qemu_ram_block_by_name(id);
2720     if (!block) {
2721         error_report("Can't find block %s", id);
2722         return NULL;
2723     }
2724 
2725     if (ramblock_is_ignored(block)) {
2726         error_report("block %s should not be migrated !", id);
2727         return NULL;
2728     }
2729 
2730     return block;
2731 }
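
/*
 * Sketch of the block-id framing parsed above (descriptive only): when
 * RAM_SAVE_FLAG_CONTINUE is clear the stream carries a one-byte length
 * followed by that many bytes of idstr; when CONTINUE is set, nothing extra
 * is read and the previously looked-up block is reused.
 */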
2732 
2733 static inline void *host_from_ram_block_offset(RAMBlock *block,
2734                                                ram_addr_t offset)
2735 {
2736     if (!offset_in_ramblock(block, offset)) {
2737         return NULL;
2738     }
2739 
2740     return block->host + offset;
2741 }
2742 
2743 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2744                              ram_addr_t offset, bool record_bitmap)
2745 {
2746     if (!offset_in_ramblock(block, offset)) {
2747         return NULL;
2748     }
2749     if (!block->colo_cache) {
2750         error_report("%s: colo_cache is NULL in block :%s",
2751                      __func__, block->idstr);
2752         return NULL;
2753     }
2754 
2755     /*
2756     * During a COLO checkpoint, we need a bitmap of these migrated pages.
2757     * It helps us decide which pages in the RAM cache should be flushed
2758     * into the VM's RAM later.
2759     */
2760     if (record_bitmap &&
2761         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2762         ram_state->migration_dirty_pages++;
2763     }
2764     return block->colo_cache + offset;
2765 }
2766 
2767 /**
2768  * ram_handle_compressed: handle the zero page case
2769  *
2770  * If a page (or a whole RDMA chunk) has been
2771  * determined to be zero, then zap it.
2772  *
2773  * @host: host address for the zero page
2774  * @ch: what the page is filled from.  We only support zero
2775  * @size: size of the zero page
2776  */
2777 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2778 {
2779     if (ch != 0 || !is_zero_range(host, size)) {
2780         memset(host, ch, size);
2781     }
2782 }
2783 
2784 /* return the size after decompression, or negative value on error */
2785 static int
2786 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2787                      const uint8_t *source, size_t source_len)
2788 {
2789     int err;
2790 
2791     err = inflateReset(stream);
2792     if (err != Z_OK) {
2793         return -1;
2794     }
2795 
2796     stream->avail_in = source_len;
2797     stream->next_in = (uint8_t *)source;
2798     stream->avail_out = dest_len;
2799     stream->next_out = dest;
2800 
2801     err = inflate(stream, Z_NO_FLUSH);
2802     if (err != Z_STREAM_END) {
2803         return -1;
2804     }
2805 
2806     return stream->total_out;
2807 }
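
/*
 * Usage sketch (illustrative only; "stream" must already have been
 * initialized with inflateInit(), as compress_threads_load_setup() does
 * below, and "compbuf"/"complen" are hypothetical names):
 *
 *     uint8_t page[4096];    // one 4KiB target page in this example
 *     int n = qemu_uncompress_data(&stream, page, sizeof(page),
 *                                  compbuf, complen);
 *     if (n < 0) {
 *         // truncated or corrupt compressed data
 *     }
 */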
2808 
2809 static void *do_data_decompress(void *opaque)
2810 {
2811     DecompressParam *param = opaque;
2812     unsigned long pagesize;
2813     uint8_t *des;
2814     int len, ret;
2815 
2816     qemu_mutex_lock(&param->mutex);
2817     while (!param->quit) {
2818         if (param->des) {
2819             des = param->des;
2820             len = param->len;
2821             param->des = 0;
2822             qemu_mutex_unlock(&param->mutex);
2823 
2824             pagesize = TARGET_PAGE_SIZE;
2825 
2826             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2827                                        param->compbuf, len);
2828             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2829                 error_report("decompress data failed");
2830                 qemu_file_set_error(decomp_file, ret);
2831             }
2832 
2833             qemu_mutex_lock(&decomp_done_lock);
2834             param->done = true;
2835             qemu_cond_signal(&decomp_done_cond);
2836             qemu_mutex_unlock(&decomp_done_lock);
2837 
2838             qemu_mutex_lock(&param->mutex);
2839         } else {
2840             qemu_cond_wait(&param->cond, &param->mutex);
2841         }
2842     }
2843     qemu_mutex_unlock(&param->mutex);
2844 
2845     return NULL;
2846 }
2847 
2848 static int wait_for_decompress_done(void)
2849 {
2850     int idx, thread_count;
2851 
2852     if (!migrate_use_compression()) {
2853         return 0;
2854     }
2855 
2856     thread_count = migrate_decompress_threads();
2857     qemu_mutex_lock(&decomp_done_lock);
2858     for (idx = 0; idx < thread_count; idx++) {
2859         while (!decomp_param[idx].done) {
2860             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2861         }
2862     }
2863     qemu_mutex_unlock(&decomp_done_lock);
2864     return qemu_file_get_error(decomp_file);
2865 }
2866 
2867 static void compress_threads_load_cleanup(void)
2868 {
2869     int i, thread_count;
2870 
2871     if (!migrate_use_compression()) {
2872         return;
2873     }
2874     thread_count = migrate_decompress_threads();
2875     for (i = 0; i < thread_count; i++) {
2876         /*
2877          * we use it as an indicator of whether the thread is
2878          * properly initialized or not
2879          */
2880         if (!decomp_param[i].compbuf) {
2881             break;
2882         }
2883 
2884         qemu_mutex_lock(&decomp_param[i].mutex);
2885         decomp_param[i].quit = true;
2886         qemu_cond_signal(&decomp_param[i].cond);
2887         qemu_mutex_unlock(&decomp_param[i].mutex);
2888     }
2889     for (i = 0; i < thread_count; i++) {
2890         if (!decomp_param[i].compbuf) {
2891             break;
2892         }
2893 
2894         qemu_thread_join(decompress_threads + i);
2895         qemu_mutex_destroy(&decomp_param[i].mutex);
2896         qemu_cond_destroy(&decomp_param[i].cond);
2897         inflateEnd(&decomp_param[i].stream);
2898         g_free(decomp_param[i].compbuf);
2899         decomp_param[i].compbuf = NULL;
2900     }
2901     g_free(decompress_threads);
2902     g_free(decomp_param);
2903     decompress_threads = NULL;
2904     decomp_param = NULL;
2905     decomp_file = NULL;
2906 }
2907 
2908 static int compress_threads_load_setup(QEMUFile *f)
2909 {
2910     int i, thread_count;
2911 
2912     if (!migrate_use_compression()) {
2913         return 0;
2914     }
2915 
2916     thread_count = migrate_decompress_threads();
2917     decompress_threads = g_new0(QemuThread, thread_count);
2918     decomp_param = g_new0(DecompressParam, thread_count);
2919     qemu_mutex_init(&decomp_done_lock);
2920     qemu_cond_init(&decomp_done_cond);
2921     decomp_file = f;
2922     for (i = 0; i < thread_count; i++) {
2923         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2924             goto exit;
2925         }
2926 
2927         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2928         qemu_mutex_init(&decomp_param[i].mutex);
2929         qemu_cond_init(&decomp_param[i].cond);
2930         decomp_param[i].done = true;
2931         decomp_param[i].quit = false;
2932         qemu_thread_create(decompress_threads + i, "decompress",
2933                            do_data_decompress, decomp_param + i,
2934                            QEMU_THREAD_JOINABLE);
2935     }
2936     return 0;
2937 exit:
2938     compress_threads_load_cleanup();
2939     return -1;
2940 }
2941 
2942 static void decompress_data_with_multi_threads(QEMUFile *f,
2943                                                void *host, int len)
2944 {
2945     int idx, thread_count;
2946 
2947     thread_count = migrate_decompress_threads();
2948     qemu_mutex_lock(&decomp_done_lock);
2949     while (true) {
2950         for (idx = 0; idx < thread_count; idx++) {
2951             if (decomp_param[idx].done) {
2952                 decomp_param[idx].done = false;
2953                 qemu_mutex_lock(&decomp_param[idx].mutex);
2954                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2955                 decomp_param[idx].des = host;
2956                 decomp_param[idx].len = len;
2957                 qemu_cond_signal(&decomp_param[idx].cond);
2958                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2959                 break;
2960             }
2961         }
2962         if (idx < thread_count) {
2963             break;
2964         } else {
2965             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2966         }
2967     }
2968     qemu_mutex_unlock(&decomp_done_lock);
2969 }
2970 
2971 /*
2972  * COLO cache: this is for the secondary VM. We cache the whole
2973  * memory of the secondary VM; the global lock must be held to
2974  * call this helper.
2975  */
2976 int colo_init_ram_cache(void)
2977 {
2978     RAMBlock *block;
2979 
2980     WITH_RCU_READ_LOCK_GUARD() {
2981         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2982             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2983                                                     NULL,
2984                                                     false);
2985             if (!block->colo_cache) {
2986                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2987                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2988                              block->used_length);
2989                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2990                     if (block->colo_cache) {
2991                         qemu_anon_ram_free(block->colo_cache, block->used_length);
2992                         block->colo_cache = NULL;
2993                     }
2994                 }
2995                 return -errno;
2996             }
2997         }
2998     }
2999 
3000     /*
3001     * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3002     * decide which pages in the cache should be flushed into the SVM's RAM.
3003     * Here we use the same name 'ram_bitmap' as for migration.
3004     */
3005     if (ram_bytes_total()) {
3006         RAMBlock *block;
3007 
3008         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3009             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3010             block->bmap = bitmap_new(pages);
3011         }
3012     }
3013 
3014     ram_state_init(&ram_state);
3015     return 0;
3016 }
3017 
3018 /* TODO: duplicated with ram_init_bitmaps */
3019 void colo_incoming_start_dirty_log(void)
3020 {
3021     RAMBlock *block = NULL;
3022     /* For memory_global_dirty_log_start below. */
3023     qemu_mutex_lock_iothread();
3024     qemu_mutex_lock_ramlist();
3025 
3026     memory_global_dirty_log_sync();
3027     WITH_RCU_READ_LOCK_GUARD() {
3028         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3029             ramblock_sync_dirty_bitmap(ram_state, block);
3030             /* Discard this dirty bitmap record */
3031             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3032         }
3033         memory_global_dirty_log_start();
3034     }
3035     ram_state->migration_dirty_pages = 0;
3036     qemu_mutex_unlock_ramlist();
3037     qemu_mutex_unlock_iothread();
3038 }
3039 
3040 /* The global lock must be held to call this helper */
3041 void colo_release_ram_cache(void)
3042 {
3043     RAMBlock *block;
3044 
3045     memory_global_dirty_log_stop();
3046     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3047         g_free(block->bmap);
3048         block->bmap = NULL;
3049     }
3050 
3051     WITH_RCU_READ_LOCK_GUARD() {
3052         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3053             if (block->colo_cache) {
3054                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3055                 block->colo_cache = NULL;
3056             }
3057         }
3058     }
3059     ram_state_cleanup(&ram_state);
3060 }
3061 
3062 /**
3063  * ram_load_setup: Setup RAM for migration incoming side
3064  *
3065  * Returns zero to indicate success and negative for error
3066  *
3067  * @f: QEMUFile where to receive the data
3068  * @opaque: RAMState pointer
3069  */
3070 static int ram_load_setup(QEMUFile *f, void *opaque)
3071 {
3072     if (compress_threads_load_setup(f)) {
3073         return -1;
3074     }
3075 
3076     xbzrle_load_setup();
3077     ramblock_recv_map_init();
3078 
3079     return 0;
3080 }
3081 
3082 static int ram_load_cleanup(void *opaque)
3083 {
3084     RAMBlock *rb;
3085 
3086     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3087         qemu_ram_block_writeback(rb);
3088     }
3089 
3090     xbzrle_load_cleanup();
3091     compress_threads_load_cleanup();
3092 
3093     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3094         g_free(rb->receivedmap);
3095         rb->receivedmap = NULL;
3096     }
3097 
3098     return 0;
3099 }
3100 
3101 /**
3102  * ram_postcopy_incoming_init: allocate postcopy data structures
3103  *
3104  * Returns 0 for success and negative if there was one error
3105  *
3106  * @mis: current migration incoming state
3107  *
3108  * Allocate data structures etc needed by incoming migration with
3109  * postcopy-ram. postcopy-ram's similarly named
3110  * postcopy_ram_incoming_init does the work.
3111  */
3112 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3113 {
3114     return postcopy_ram_incoming_init(mis);
3115 }
3116 
3117 /**
3118  * ram_load_postcopy: load a page in postcopy case
3119  *
3120  * Returns 0 for success or -errno in case of error
3121  *
3122  * Called in postcopy mode by ram_load().
3123  * rcu_read_lock is taken prior to this being called.
3124  *
3125  * @f: QEMUFile where to send the data
3126  */
3127 static int ram_load_postcopy(QEMUFile *f)
3128 {
3129     int flags = 0, ret = 0;
3130     bool place_needed = false;
3131     bool matches_target_page_size = false;
3132     MigrationIncomingState *mis = migration_incoming_get_current();
3133     /* Temporary page that is later 'placed' */
3134     void *postcopy_host_page = mis->postcopy_tmp_page;
3135     void *this_host = NULL;
3136     bool all_zero = false;
3137     int target_pages = 0;
3138 
3139     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3140         ram_addr_t addr;
3141         void *host = NULL;
3142         void *page_buffer = NULL;
3143         void *place_source = NULL;
3144         RAMBlock *block = NULL;
3145         uint8_t ch;
3146         int len;
3147 
3148         addr = qemu_get_be64(f);
3149 
3150         /*
3151          * If there is a QEMU file error, we should stop here; in that
3152          * case "addr" may be invalid.
3153          */
3154         ret = qemu_file_get_error(f);
3155         if (ret) {
3156             break;
3157         }
3158 
3159         flags = addr & ~TARGET_PAGE_MASK;
3160         addr &= TARGET_PAGE_MASK;
3161 
3162         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3163         place_needed = false;
3164         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3165                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3166             block = ram_block_from_stream(f, flags);
3167 
3168             host = host_from_ram_block_offset(block, addr);
3169             if (!host) {
3170                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3171                 ret = -EINVAL;
3172                 break;
3173             }
3174             target_pages++;
3175             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3176             /*
3177              * Postcopy requires that we place whole host pages atomically;
3178              * these may be huge pages for RAMBlocks that are backed by
3179              * hugetlbfs.
3180              * To make it atomic, the data is read into a temporary page
3181              * that's moved into place later.
3182              * The migration protocol uses (possibly smaller) target pages;
3183              * however, the source ensures it always sends all the components
3184              * of a host page in one chunk.
3185              */
3186             page_buffer = postcopy_host_page +
3187                           ((uintptr_t)host & (block->page_size - 1));
3188             /* If all TP are zero then we can optimise the place */
3189             if (target_pages == 1) {
3190                 all_zero = true;
3191                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3192                                                     block->page_size);
3193             } else {
3194                 /* not the 1st TP within the HP */
3195                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3196                     (uintptr_t)this_host) {
3197                     error_report("Non-same host page %p/%p",
3198                                   host, this_host);
3199                     ret = -EINVAL;
3200                     break;
3201                 }
3202             }
3203 
3204             /*
3205              * If it's the last part of a host page then we place the host
3206              * page
3207              */
3208             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3209                 place_needed = true;
3210                 target_pages = 0;
3211             }
3212             place_source = postcopy_host_page;
3213         }
3214 
3215         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3216         case RAM_SAVE_FLAG_ZERO:
3217             ch = qemu_get_byte(f);
3218             /*
3219              * We can skip setting page_buffer when
3220              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3221              */
3222             if (ch || !matches_target_page_size) {
3223                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3224             }
3225             if (ch) {
3226                 all_zero = false;
3227             }
3228             break;
3229 
3230         case RAM_SAVE_FLAG_PAGE:
3231             all_zero = false;
3232             if (!matches_target_page_size) {
3233                 /* For huge pages, we always use temporary buffer */
3234                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3235             } else {
3236                 /*
3237                  * For small pages that matches target page size, we
3238                  * avoid the qemu_file copy.  Instead we directly use
3239                  * the buffer of QEMUFile to place the page.  Note: we
3240                  * cannot do any QEMUFile operation before using that
3241                  * buffer to make sure the buffer is valid when
3242                  * placing the page.
3243                  */
3244                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3245                                          TARGET_PAGE_SIZE);
3246             }
3247             break;
3248         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3249             all_zero = false;
3250             len = qemu_get_be32(f);
3251             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3252                 error_report("Invalid compressed data length: %d", len);
3253                 ret = -EINVAL;
3254                 break;
3255             }
3256             decompress_data_with_multi_threads(f, page_buffer, len);
3257             break;
3258 
3259         case RAM_SAVE_FLAG_EOS:
3260             /* normal exit */
3261             multifd_recv_sync_main();
3262             break;
3263         default:
3264             error_report("Unknown combination of migration flags: %#x"
3265                          " (postcopy mode)", flags);
3266             ret = -EINVAL;
3267             break;
3268         }
3269 
3270         /* Got the whole host page, wait for decompress before placing. */
3271         if (place_needed) {
3272             ret |= wait_for_decompress_done();
3273         }
3274 
3275         /* Check for any possible file errors */
3276         if (!ret && qemu_file_get_error(f)) {
3277             ret = qemu_file_get_error(f);
3278         }
3279 
3280         if (!ret && place_needed) {
3281             /* This gets called at the last target page in the host page */
3282             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3283                                                        block->page_size);
3284 
3285             if (all_zero) {
3286                 ret = postcopy_place_page_zero(mis, place_dest,
3287                                                block);
3288             } else {
3289                 ret = postcopy_place_page(mis, place_dest,
3290                                           place_source, block);
3291             }
3292         }
3293     }
3294 
3295     return ret;
3296 }
3297 
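     /*
      * True once the destination has seen the postcopy ADVISE command and
      * the incoming state has not yet reached POSTCOPY_INCOMING_END.
      */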
3298 static bool postcopy_is_advised(void)
3299 {
3300     PostcopyState ps = postcopy_state_get();
3301     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3302 }
3303 
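     /*
      * True from the LISTENING state until the incoming postcopy finishes,
      * i.e. while incoming pages must be placed atomically.
      */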
3304 static bool postcopy_is_running(void)
3305 {
3306     PostcopyState ps = postcopy_state_get();
3307     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3308 }
3309 
3310 /*
3311  * Flush the contents of the RAM cache into the SVM's memory.
3312  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3313  */
3314 static void colo_flush_ram_cache(void)
3315 {
3316     RAMBlock *block = NULL;
3317     void *dst_host;
3318     void *src_host;
3319     unsigned long offset = 0;
3320 
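         /*
          * Sync the global dirty log and fold it into every block's dirty
          * bitmap before walking the blocks below.
          */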
3321     memory_global_dirty_log_sync();
3322     WITH_RCU_READ_LOCK_GUARD() {
3323         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3324             ramblock_sync_dirty_bitmap(ram_state, block);
3325         }
3326     }
3327 
3328     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3329     WITH_RCU_READ_LOCK_GUARD() {
3330         block = QLIST_FIRST_RCU(&ram_list.blocks);
3331 
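             /*
              * Walk every block: migration_bitmap_find_dirty() returns the
              * next dirty page offset, and an offset past used_length means
              * this block is done, so move on to the next one.
              */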
3332         while (block) {
3333             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3334 
3335             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3336                 >= block->used_length) {
3337                 offset = 0;
3338                 block = QLIST_NEXT_RCU(block, next);
3339             } else {
3340                 migration_bitmap_clear_dirty(ram_state, block, offset);
3341                 dst_host = block->host
3342                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3343                 src_host = block->colo_cache
3344                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3345                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3346             }
3347         }
3348     }
3349     trace_colo_flush_ram_cache_end();
3350 }
3351 
3352 /**
3353  * ram_load_precopy: load pages in precopy case
3354  *
3355  * Returns 0 for success or -errno in case of error
3356  *
3357  * Called in precopy mode by ram_load().
3358  * rcu_read_lock is taken prior to this being called.
3359  *
3360  * @f: QEMUFile to read the data from
3361  */
3362 static int ram_load_precopy(QEMUFile *f)
3363 {
3364     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3365     /* ADVISE is sent earlier; it means the source has postcopy enabled */
3366     bool postcopy_advised = postcopy_is_advised();
3367     if (!migrate_use_compression()) {
3368         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3369     }
3370 
3371     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3372         ram_addr_t addr, total_ram_bytes;
3373         void *host = NULL, *host_bak = NULL;
3374         uint8_t ch;
3375 
3376         /*
3377          * Yield periodically to let the main loop run, but an iteration
3378          * of the main loop is expensive, so only do it every 32768 iterations.
3379          */
3380         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3381             aio_co_schedule(qemu_get_current_aio_context(),
3382                             qemu_coroutine_self());
3383             qemu_coroutine_yield();
3384         }
3385         i++;
3386 
3387         addr = qemu_get_be64(f);
3388         flags = addr & ~TARGET_PAGE_MASK;
3389         addr &= TARGET_PAGE_MASK;
3390 
3391         if (flags & invalid_flags) {
3392             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3393                 error_report("Received an unexpected compressed page");
3394             }
3395 
3396             ret = -EINVAL;
3397             break;
3398         }
3399 
3400         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3401                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3402             RAMBlock *block = ram_block_from_stream(f, flags);
3403 
3404             host = host_from_ram_block_offset(block, addr);
3405             /*
3406              * After entering the COLO stage, we should not load pages into
3407              * the SVM's memory directly; we put them into colo_cache first.
3408              * NOTE: we need to keep a copy of the SVM's RAM in colo_cache.
3409              * Previously, all of this memory was copied during the COLO
3410              * preparation stage, with the VM stopped, which was a
3411              * time-consuming process.  Here we instead back up every page
3412              * during migration while COLO is enabled.  This slows the
3413              * migration down a little, but it clearly reduces the downtime
3414              * of backing up all of the SVM's memory in the preparation stage.
3415              */
3416             if (migration_incoming_colo_enabled()) {
3417                 if (migration_incoming_in_colo_state()) {
3418                     /* In COLO stage, put all pages into cache temporarily */
3419                     host = colo_cache_from_block_offset(block, addr, true);
3420                 } else {
3421                     /*
3422                      * In the migration stage but before the COLO stage,
3423                      * put all pages into both the cache and the SVM's memory.
3424                      */
3425                     host_bak = colo_cache_from_block_offset(block, addr, false);
3426                 }
3427             }
3428             if (!host) {
3429                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3430                 ret = -EINVAL;
3431                 break;
3432             }
3433             if (!migration_incoming_in_colo_state()) {
3434                 ramblock_recv_bitmap_set(block, host);
3435             }
3436 
3437             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3438         }
3439 
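             /*
              * RAM_SAVE_FLAG_CONTINUE only tells ram_block_from_stream()
              * that the page belongs to the same block as the previous one,
              * so mask it out before dispatching on the remaining flags.
              */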
3440         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3441         case RAM_SAVE_FLAG_MEM_SIZE:
3442             /* Synchronize RAM block list */
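                 /*
                  * Each block record in the stream is: a length byte, the
                  * block id string and a be64 used_length, optionally
                  * followed by a be64 page size (when postcopy was advised
                  * and the page size differs from the host page size) and,
                  * with ignore-shared, a be64 block address.
                  */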
3443             total_ram_bytes = addr;
3444             while (!ret && total_ram_bytes) {
3445                 RAMBlock *block;
3446                 char id[256];
3447                 ram_addr_t length;
3448 
3449                 len = qemu_get_byte(f);
3450                 qemu_get_buffer(f, (uint8_t *)id, len);
3451                 id[len] = 0;
3452                 length = qemu_get_be64(f);
3453 
3454                 block = qemu_ram_block_by_name(id);
3455                 if (block && !qemu_ram_is_migratable(block)) {
3456                     error_report("block %s should not be migrated!", id);
3457                     ret = -EINVAL;
3458                 } else if (block) {
3459                     if (length != block->used_length) {
3460                         Error *local_err = NULL;
3461 
3462                         ret = qemu_ram_resize(block, length,
3463                                               &local_err);
3464                         if (local_err) {
3465                             error_report_err(local_err);
3466                         }
3467                     }
3468                     /* For postcopy we need to check hugepage sizes match */
3469                     if (postcopy_advised &&
3470                         block->page_size != qemu_host_page_size) {
3471                         uint64_t remote_page_size = qemu_get_be64(f);
3472                         if (remote_page_size != block->page_size) {
3473                             error_report("Mismatched RAM page size %s "
3474                                          "(local) %zd != %" PRId64,
3475                                          id, block->page_size,
3476                                          remote_page_size);
3477                             ret = -EINVAL;
3478                         }
3479                     }
3480                     if (migrate_ignore_shared()) {
3481                         hwaddr addr = qemu_get_be64(f);
3482                         if (ramblock_is_ignored(block) &&
3483                             block->mr->addr != addr) {
3484                             error_report("Mismatched GPAs for block %s "
3485                                          "%" PRId64 " != %" PRId64,
3486                                          id, (uint64_t)addr,
3487                                          (uint64_t)block->mr->addr);
3488                             ret = -EINVAL;
3489                         }
3490                     }
3491                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3492                                           block->idstr);
3493                 } else {
3494                     error_report("Unknown ramblock \"%s\", cannot "
3495                                  "accept migration", id);
3496                     ret = -EINVAL;
3497                 }
3498 
3499                 total_ram_bytes -= length;
3500             }
3501             break;
3502 
3503         case RAM_SAVE_FLAG_ZERO:
3504             ch = qemu_get_byte(f);
3505             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3506             break;
3507 
3508         case RAM_SAVE_FLAG_PAGE:
3509             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3510             break;
3511 
3512         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3513             len = qemu_get_be32(f);
3514             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3515                 error_report("Invalid compressed data length: %d", len);
3516                 ret = -EINVAL;
3517                 break;
3518             }
3519             decompress_data_with_multi_threads(f, host, len);
3520             break;
3521 
3522         case RAM_SAVE_FLAG_XBZRLE:
3523             if (load_xbzrle(f, addr, host) < 0) {
3524                 error_report("Failed to decompress XBZRLE page at "
3525                              RAM_ADDR_FMT, addr);
3526                 ret = -EINVAL;
3527                 break;
3528             }
3529             break;
3530         case RAM_SAVE_FLAG_EOS:
3531             /* normal exit */
3532             multifd_recv_sync_main();
3533             break;
3534         default:
3535             if (flags & RAM_SAVE_FLAG_HOOK) {
3536                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3537             } else {
3538                 error_report("Unknown combination of migration flags: %#x",
3539                              flags);
3540                 ret = -EINVAL;
3541             }
3542         }
3543         if (!ret) {
3544             ret = qemu_file_get_error(f);
3545         }
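             /*
              * COLO (before the COLO stage): also mirror the page we just
              * loaded into the colo_cache copy that host_bak points to.
              */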
3546         if (!ret && host_bak) {
3547             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3548         }
3549     }
3550 
3551     ret |= wait_for_decompress_done();
3552     return ret;
3553 }
3554 
3555 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3556 {
3557     int ret = 0;
3558     static uint64_t seq_iter;
3559     /*
3560      * If the system is running in postcopy mode, page inserts into host
3561      * memory must be atomic.
3562      */
3563     bool postcopy_running = postcopy_is_running();
3564 
3565     seq_iter++;
3566 
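         /* "ram" is registered with version_id 4 in ram_mig_init() */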
3567     if (version_id != 4) {
3568         return -EINVAL;
3569     }
3570 
3571     /*
3572      * This RCU critical section can be very long running.
3573      * If RCU reclamation in this code path becomes frequent,
3574      * it will be necessary to reduce the granularity of this
3575      * critical section.
3576      */
3577     WITH_RCU_READ_LOCK_GUARD() {
3578         if (postcopy_running) {
3579             ret = ram_load_postcopy(f);
3580         } else {
3581             ret = ram_load_precopy(f);
3582         }
3583     }
3584     trace_ram_load_complete(ret, seq_iter);
3585 
3586     if (!ret && migration_incoming_in_colo_state()) {
3587         colo_flush_ram_cache();
3588     }
3589     return ret;
3590 }
3591 
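     /*
      * Postcopy is refused when any migratable ramblock is backed by pmem;
      * otherwise it follows the postcopy-ram capability.
      */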
3592 static bool ram_has_postcopy(void *opaque)
3593 {
3594     RAMBlock *rb;
3595     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3596         if (ramblock_is_pmem(rb)) {
3597             info_report("Block: %s, host: %p is an nvdimm memory, postcopy"
3598                         " is not supported now!", rb->idstr, rb->host);
3599             return false;
3600         }
3601     }
3602 
3603     return migrate_postcopy_ram();
3604 }
3605 
3606 /* Sync all the dirty bitmaps with the destination VM.  */
3607 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3608 {
3609     RAMBlock *block;
3610     QEMUFile *file = s->to_dst_file;
3611     int ramblock_count = 0;
3612 
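         /*
          * Request the received bitmap of every ramblock from the
          * destination.  Each reply is handled by ram_dirty_bitmap_reload(),
          * which posts rp_sem once per block; wait for all of them below.
          */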
3613     trace_ram_dirty_bitmap_sync_start();
3614 
3615     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3616         qemu_savevm_send_recv_bitmap(file, block->idstr);
3617         trace_ram_dirty_bitmap_request(block->idstr);
3618         ramblock_count++;
3619     }
3620 
3621     trace_ram_dirty_bitmap_sync_wait();
3622 
3623     /* Wait until all the ramblocks' dirty bitmaps are synced */
3624     while (ramblock_count--) {
3625         qemu_sem_wait(&s->rp_state.rp_sem);
3626     }
3627 
3628     trace_ram_dirty_bitmap_sync_complete();
3629 
3630     return 0;
3631 }
3632 
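     /*
      * Called by ram_dirty_bitmap_reload() to wake up one wait in
      * ram_dirty_bitmap_sync_all().
      */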
3633 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3634 {
3635     qemu_sem_post(&s->rp_state.rp_sem);
3636 }
3637 
3638 /*
3639  * Read the received bitmap and invert it to form the initial dirty bitmap.
3640  * This is only used when a paused postcopy migration wants to resume
3641  * from an intermediate point.
3642  */
3643 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3644 {
3645     int ret = -EINVAL;
3646     QEMUFile *file = s->rp_state.from_dst_file;
3647     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3648     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3649     uint64_t size, end_mark;
3650 
3651     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3652 
3653     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3654         error_report("%s: incorrect state %s", __func__,
3655                      MigrationStatus_str(s->state));
3656         return -EINVAL;
3657     }
3658 
3659     /*
3660      * Note: see comments in ramblock_recv_bitmap_send() on why we
3661      * need the endianness conversion, and the padding.
3662      */
3663     local_size = ROUND_UP(local_size, 8);
3664 
3665     /* Add padding */
3666     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3667 
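         /*
          * Wire format: a be64 bitmap size, the little-endian bitmap data
          * itself (local_size bytes), then a be64 end mark
          * (RAMBLOCK_RECV_BITMAP_ENDING).
          */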
3668     size = qemu_get_be64(file);
3669 
3670     /* The size of the bitmap should match our ramblock */
3671     if (size != local_size) {
3672         error_report("%s: ramblock '%s' bitmap size mismatch "
3673                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3674                      block->idstr, size, local_size);
3675         ret = -EINVAL;
3676         goto out;
3677     }
3678 
3679     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3680     end_mark = qemu_get_be64(file);
3681 
3682     ret = qemu_file_get_error(file);
3683     if (ret || size != local_size) {
3684         error_report("%s: read bitmap failed for ramblock '%s': %d"
3685                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3686                      __func__, block->idstr, ret, local_size, size);
3687         ret = -EIO;
3688         goto out;
3689     }
3690 
3691     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3692         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3693                      __func__, block->idstr, end_mark);
3694         ret = -EINVAL;
3695         goto out;
3696     }
3697 
3698     /*
3699      * Endianness conversion.  We are in postcopy (though paused).
3700      * The dirty bitmap won't change. We can directly modify it.
3701      */
3702     bitmap_from_le(block->bmap, le_bitmap, nbits);
3703 
3704     /*
3705      * What we received is the "received bitmap".  Invert it to form the
3706      * initial dirty bitmap for this ramblock.
3707      */
3708     bitmap_complement(block->bmap, block->bmap, nbits);
3709 
3710     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3711 
3712     /*
3713      * We have successfully synced the bitmap for this ramblock.  If this is
3714      * the last one to sync, we need to notify the main send thread.
3715      */
3716     ram_dirty_bitmap_reload_notify(s);
3717 
3718     ret = 0;
3719 out:
3720     g_free(le_bitmap);
3721     return ret;
3722 }
3723 
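     /*
      * Prepare to resume a paused postcopy migration: first pull the
      * received bitmaps back from the destination, then have
      * ram_state_resume_prepare() update the RAMState for the resumed stream.
      */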
3724 static int ram_resume_prepare(MigrationState *s, void *opaque)
3725 {
3726     RAMState *rs = *(RAMState **)opaque;
3727     int ret;
3728 
3729     ret = ram_dirty_bitmap_sync_all(s, rs);
3730     if (ret) {
3731         return ret;
3732     }
3733 
3734     ram_state_resume_prepare(rs, s->to_dst_file);
3735 
3736     return 0;
3737 }
3738 
3739 static SaveVMHandlers savevm_ram_handlers = {
3740     .save_setup = ram_save_setup,
3741     .save_live_iterate = ram_save_iterate,
3742     .save_live_complete_postcopy = ram_save_complete,
3743     .save_live_complete_precopy = ram_save_complete,
3744     .has_postcopy = ram_has_postcopy,
3745     .save_live_pending = ram_save_pending,
3746     .load_state = ram_load,
3747     .save_cleanup = ram_save_cleanup,
3748     .load_setup = ram_load_setup,
3749     .load_cleanup = ram_load_cleanup,
3750     .resume_prepare = ram_resume_prepare,
3751 };
3752 
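     /*
      * Register the "ram" section with the live migration framework; the
      * version_id of 4 passed here must match the check in ram_load().
      */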
3753 void ram_mig_init(void)
3754 {
3755     qemu_mutex_init(&XBZRLE.lock);
3756     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3757 }
3758