xref: /openbmc/qemu/migration/ram.c (revision ed5abf46)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58 
59 /***********************************************************/
60 /* ram save/restore */
61 
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63  * worked for pages that were filled with the same char.  We switched
64  * it to only search for the zero value, and renamed it to avoid
65  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
66  */
67 
68 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO     0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE     0x08
72 #define RAM_SAVE_FLAG_EOS      0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE   0x40
75 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
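/*
 * These flags ride in the low bits of the 64-bit page offset written by
 * save_page_header() below; offsets are target-page aligned, so the low
 * bits are free to carry them, e.g.
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE);
 */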
77 
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 {
80     return buffer_is_zero(p, size);
81 }
82 
83 XBZRLECacheStats xbzrle_counters;
84 
85 /* This struct contains the XBZRLE cache and a static page
86    used by the compression */
87 static struct {
88     /* buffer used for XBZRLE encoding */
89     uint8_t *encoded_buf;
90     /* buffer for storing page content */
91     uint8_t *current_buf;
92     /* Cache for XBZRLE, Protected by lock. */
93     PageCache *cache;
94     QemuMutex lock;
95     /* it will store a page full of zeros */
96     uint8_t *zero_target_page;
97     /* buffer used for XBZRLE decoding */
98     uint8_t *decoded_buf;
99 } XBZRLE;
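/*
 * The cache and buffers above are shared between the migration thread and
 * the QMP cache-resize path; the lock/unlock helpers below only take
 * XBZRLE.lock when the xbzrle capability is actually enabled.
 */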
100 
101 static void XBZRLE_cache_lock(void)
102 {
103     if (migrate_use_xbzrle())
104         qemu_mutex_lock(&XBZRLE.lock);
105 }
106 
107 static void XBZRLE_cache_unlock(void)
108 {
109     if (migrate_use_xbzrle())
110         qemu_mutex_unlock(&XBZRLE.lock);
111 }
112 
113 /**
114  * xbzrle_cache_resize: resize the xbzrle cache
115  *
116  * This function is called from qmp_migrate_set_cache_size in the main
117  * thread, possibly while a migration is in progress.  A running
118  * migration may be using the cache and might finish during this call,
119  * hence changes to the cache are protected by XBZRLE.lock.
120  *
121  * Returns 0 for success or -1 for error
122  *
123  * @new_size: new cache size
124  * @errp: set *errp with the reason if the check failed
125  */
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 {
128     PageCache *new_cache;
129     int64_t ret = 0;
130 
131     /* Check for truncation */
132     if (new_size != (size_t)new_size) {
133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134                    "exceeding address space");
135         return -1;
136     }
137 
138     if (new_size == migrate_xbzrle_cache_size()) {
139         /* nothing to do */
140         return 0;
141     }
142 
143     XBZRLE_cache_lock();
144 
145     if (XBZRLE.cache != NULL) {
146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147         if (!new_cache) {
148             ret = -1;
149             goto out;
150         }
151 
152         cache_fini(XBZRLE.cache);
153         XBZRLE.cache = new_cache;
154     }
155 out:
156     XBZRLE_cache_unlock();
157     return ret;
158 }
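/*
 * Note the swap above only happens while a cache already exists, i.e. during
 * an active xbzrle migration; otherwise only the size checks run here and the
 * cache itself is created later, when the migration starts.
 */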
159 
160 static bool ramblock_is_ignored(RAMBlock *block)
161 {
162     return !qemu_ram_is_migratable(block) ||
163            (migrate_ignore_shared() && qemu_ram_is_shared(block));
164 }
165 
166 /* Should be holding either ram_list.mutex, or the RCU lock. */
167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
168     INTERNAL_RAMBLOCK_FOREACH(block)                   \
169         if (ramblock_is_ignored(block)) {} else
170 
171 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
172     INTERNAL_RAMBLOCK_FOREACH(block)                   \
173         if (!qemu_ram_is_migratable(block)) {} else
174 
175 #undef RAMBLOCK_FOREACH
176 
177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178 {
179     RAMBlock *block;
180     int ret = 0;
181 
182     RCU_READ_LOCK_GUARD();
183 
184     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185         ret = func(block, opaque);
186         if (ret) {
187             break;
188         }
189     }
190     return ret;
191 }
192 
193 static void ramblock_recv_map_init(void)
194 {
195     RAMBlock *rb;
196 
197     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
198         assert(!rb->receivedmap);
199         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200     }
201 }
202 
203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 {
205     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206                     rb->receivedmap);
207 }
208 
209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 {
211     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212 }
213 
214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 {
216     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217 }
218 
219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220                                     size_t nr)
221 {
222     bitmap_set_atomic(rb->receivedmap,
223                       ramblock_recv_bitmap_offset(host_addr, rb),
224                       nr);
225 }
226 
227 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
228 
229 /*
230  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231  *
232  * Returns >0 (the number of bytes sent) on success, or <0 on error.
233  */
234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235                                   const char *block_name)
236 {
237     RAMBlock *block = qemu_ram_block_by_name(block_name);
238     unsigned long *le_bitmap, nbits;
239     uint64_t size;
240 
241     if (!block) {
242         error_report("%s: invalid block name: %s", __func__, block_name);
243         return -1;
244     }
245 
246     nbits = block->used_length >> TARGET_PAGE_BITS;
247 
248     /*
249      * Make sure the tmp bitmap buffer is big enough, e.g., on 32-bit
250      * machines we may need 4 more bytes for padding (see the comment
251      * below), so extend it a bit beforehand.
252      */
253     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254 
255     /*
256      * Always use little endian when sending the bitmap. This is
257      * required when the source and destination VMs are not using the
258      * same endianness. (Note: big endian won't work.)
259      */
260     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261 
262     /* Size of the bitmap, in bytes */
263     size = DIV_ROUND_UP(nbits, 8);
264 
265     /*
266      * size is always aligned to 8 bytes on 64-bit machines, but that
267      * may not be true on 32-bit machines. We need this padding to
268      * make sure the migration can survive even between 32-bit and
269      * 64-bit machines.
270      */
271     size = ROUND_UP(size, 8);
272 
273     qemu_put_be64(file, size);
274     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275     /*
276      * Mark the end, in case the middle part got corrupted for some
277      * "mysterious" reason.
278      */
279     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280     qemu_fflush(file);
281 
282     g_free(le_bitmap);
283 
284     if (qemu_file_get_error(file)) {
285         return qemu_file_get_error(file);
286     }
287 
288     return size + sizeof(size);
289 }
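/*
 * The resulting stream layout for one block is therefore:
 *   be64: bitmap size in bytes (rounded up to a multiple of 8)
 *   that many bytes of little-endian bitmap
 *   be64: RAMBLOCK_RECV_BITMAP_ENDING as a trailing sanity marker
 */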
290 
291 /*
292  * An outstanding page request, on the source, having been received
293  * and queued
294  */
295 struct RAMSrcPageRequest {
296     RAMBlock *rb;
297     hwaddr    offset;
298     hwaddr    len;
299 
300     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301 };
302 
303 /* State of RAM for migration */
304 struct RAMState {
305     /* QEMUFile used for this migration */
306     QEMUFile *f;
307     /* Last block that we have visited searching for dirty pages */
308     RAMBlock *last_seen_block;
309     /* Last block from where we have sent data */
310     RAMBlock *last_sent_block;
311     /* Last dirty target page we have sent */
312     ram_addr_t last_page;
313     /* last ram version we have seen */
314     uint32_t last_version;
315     /* We are in the first round */
316     bool ram_bulk_stage;
317     /* The free page optimization is enabled */
318     bool fpo_enabled;
319     /* How many times we have dirtied too many pages */
320     int dirty_rate_high_cnt;
321     /* these variables are used for bitmap sync */
322     /* last time we did a full bitmap_sync */
323     int64_t time_last_bitmap_sync;
324     /* bytes transferred at start_time */
325     uint64_t bytes_xfer_prev;
326     /* number of dirty pages since start_time */
327     uint64_t num_dirty_pages_period;
328     /* xbzrle misses since the beginning of the period */
329     uint64_t xbzrle_cache_miss_prev;
330 
331     /* compression statistics since the beginning of the period */
332     /* number of times no free thread was available to compress data */
333     uint64_t compress_thread_busy_prev;
334     /* number of bytes after compression */
335     uint64_t compressed_size_prev;
336     /* number of compressed pages */
337     uint64_t compress_pages_prev;
338 
339     /* total handled target pages at the beginning of the period */
340     uint64_t target_page_count_prev;
341     /* total handled target pages since start */
342     uint64_t target_page_count;
343     /* number of dirty bits in the bitmap */
344     uint64_t migration_dirty_pages;
345     /* Protects modification of the bitmap and migration dirty pages */
346     QemuMutex bitmap_mutex;
347     /* The RAMBlock used in the last src_page_requests */
348     RAMBlock *last_req_rb;
349     /* Queue of outstanding page requests from the destination */
350     QemuMutex src_page_req_mutex;
351     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 };
353 typedef struct RAMState RAMState;
354 
355 static RAMState *ram_state;
356 
357 static NotifierWithReturnList precopy_notifier_list;
358 
359 void precopy_infrastructure_init(void)
360 {
361     notifier_with_return_list_init(&precopy_notifier_list);
362 }
363 
364 void precopy_add_notifier(NotifierWithReturn *n)
365 {
366     notifier_with_return_list_add(&precopy_notifier_list, n);
367 }
368 
369 void precopy_remove_notifier(NotifierWithReturn *n)
370 {
371     notifier_with_return_remove(n);
372 }
373 
374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 {
376     PrecopyNotifyData pnd;
377     pnd.reason = reason;
378     pnd.errp = errp;
379 
380     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 }
382 
383 void precopy_enable_free_page_optimization(void)
384 {
385     if (!ram_state) {
386         return;
387     }
388 
389     ram_state->fpo_enabled = true;
390 }
391 
392 uint64_t ram_bytes_remaining(void)
393 {
394     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
395                        0;
396 }
397 
398 MigrationStats ram_counters;
399 
400 /* used by the search for pages to send */
401 struct PageSearchStatus {
402     /* Current block being searched */
403     RAMBlock    *block;
404     /* Current page to search from */
405     unsigned long page;
406     /* Set once we wrap around */
407     bool         complete_round;
408 };
409 typedef struct PageSearchStatus PageSearchStatus;
410 
411 CompressionStats compression_counters;
412 
413 struct CompressParam {
414     bool done;
415     bool quit;
416     bool zero_page;
417     QEMUFile *file;
418     QemuMutex mutex;
419     QemuCond cond;
420     RAMBlock *block;
421     ram_addr_t offset;
422 
423     /* internally used fields */
424     z_stream stream;
425     uint8_t *originbuf;
426 };
427 typedef struct CompressParam CompressParam;
428 
429 struct DecompressParam {
430     bool done;
431     bool quit;
432     QemuMutex mutex;
433     QemuCond cond;
434     void *des;
435     uint8_t *compbuf;
436     int len;
437     z_stream stream;
438 };
439 typedef struct DecompressParam DecompressParam;
440 
441 static CompressParam *comp_param;
442 static QemuThread *compress_threads;
443 /* comp_done_cond is used to wake up the migration thread when
444  * one of the compression threads has finished the compression.
445  * comp_done_lock is used together with comp_done_cond.
446  */
447 static QemuMutex comp_done_lock;
448 static QemuCond comp_done_cond;
449 /* The empty QEMUFileOps will be used by file in CompressParam */
450 static const QEMUFileOps empty_ops = { };
451 
452 static QEMUFile *decomp_file;
453 static DecompressParam *decomp_param;
454 static QemuThread *decompress_threads;
455 static QemuMutex decomp_done_lock;
456 static QemuCond decomp_done_cond;
457 
458 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
459                                  ram_addr_t offset, uint8_t *source_buf);
460 
461 static void *do_data_compress(void *opaque)
462 {
463     CompressParam *param = opaque;
464     RAMBlock *block;
465     ram_addr_t offset;
466     bool zero_page;
467 
468     qemu_mutex_lock(&param->mutex);
469     while (!param->quit) {
470         if (param->block) {
471             block = param->block;
472             offset = param->offset;
473             param->block = NULL;
474             qemu_mutex_unlock(&param->mutex);
475 
476             zero_page = do_compress_ram_page(param->file, &param->stream,
477                                              block, offset, param->originbuf);
478 
479             qemu_mutex_lock(&comp_done_lock);
480             param->done = true;
481             param->zero_page = zero_page;
482             qemu_cond_signal(&comp_done_cond);
483             qemu_mutex_unlock(&comp_done_lock);
484 
485             qemu_mutex_lock(&param->mutex);
486         } else {
487             qemu_cond_wait(&param->cond, &param->mutex);
488         }
489     }
490     qemu_mutex_unlock(&param->mutex);
491 
492     return NULL;
493 }
494 
495 static void compress_threads_save_cleanup(void)
496 {
497     int i, thread_count;
498 
499     if (!migrate_use_compression() || !comp_param) {
500         return;
501     }
502 
503     thread_count = migrate_compress_threads();
504     for (i = 0; i < thread_count; i++) {
505         /*
506          * we use it as an indicator of whether the thread is
507          * properly initialized or not
508          */
509         if (!comp_param[i].file) {
510             break;
511         }
512 
513         qemu_mutex_lock(&comp_param[i].mutex);
514         comp_param[i].quit = true;
515         qemu_cond_signal(&comp_param[i].cond);
516         qemu_mutex_unlock(&comp_param[i].mutex);
517 
518         qemu_thread_join(compress_threads + i);
519         qemu_mutex_destroy(&comp_param[i].mutex);
520         qemu_cond_destroy(&comp_param[i].cond);
521         deflateEnd(&comp_param[i].stream);
522         g_free(comp_param[i].originbuf);
523         qemu_fclose(comp_param[i].file);
524         comp_param[i].file = NULL;
525     }
526     qemu_mutex_destroy(&comp_done_lock);
527     qemu_cond_destroy(&comp_done_cond);
528     g_free(compress_threads);
529     g_free(comp_param);
530     compress_threads = NULL;
531     comp_param = NULL;
532 }
533 
534 static int compress_threads_save_setup(void)
535 {
536     int i, thread_count;
537 
538     if (!migrate_use_compression()) {
539         return 0;
540     }
541     thread_count = migrate_compress_threads();
542     compress_threads = g_new0(QemuThread, thread_count);
543     comp_param = g_new0(CompressParam, thread_count);
544     qemu_cond_init(&comp_done_cond);
545     qemu_mutex_init(&comp_done_lock);
546     for (i = 0; i < thread_count; i++) {
547         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
548         if (!comp_param[i].originbuf) {
549             goto exit;
550         }
551 
552         if (deflateInit(&comp_param[i].stream,
553                         migrate_compress_level()) != Z_OK) {
554             g_free(comp_param[i].originbuf);
555             goto exit;
556         }
557 
558         /* comp_param[i].file is just used as a dummy buffer to save data,
559          * set its ops to empty.
560          */
561         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
562         comp_param[i].done = true;
563         comp_param[i].quit = false;
564         qemu_mutex_init(&comp_param[i].mutex);
565         qemu_cond_init(&comp_param[i].cond);
566         qemu_thread_create(compress_threads + i, "compress",
567                            do_data_compress, comp_param + i,
568                            QEMU_THREAD_JOINABLE);
569     }
570     return 0;
571 
572 exit:
573     compress_threads_save_cleanup();
574     return -1;
575 }
576 
577 /**
578  * save_page_header: write page header to wire
579  *
580  * If the block differs from the last one sent, it also writes the block identification
581  *
582  * Returns the number of bytes written
583  *
584  * @f: QEMUFile where to send the data
585  * @block: block that contains the page we want to send
586  * @offset: offset inside the block for the page
587  *          in the lower bits, it contains flags
588  */
589 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
590                                ram_addr_t offset)
591 {
592     size_t size, len;
593 
594     if (block == rs->last_sent_block) {
595         offset |= RAM_SAVE_FLAG_CONTINUE;
596     }
597     qemu_put_be64(f, offset);
598     size = 8;
599 
600     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
601         len = strlen(block->idstr);
602         qemu_put_byte(f, len);
603         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
604         size += 1 + len;
605         rs->last_sent_block = block;
606     }
607     return size;
608 }
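/*
 * On the wire a page header is therefore a be64 holding the page offset
 * within the block ORed with the RAM_SAVE_FLAG_* bits, optionally followed
 * (when the block changes, i.e. no RAM_SAVE_FLAG_CONTINUE) by a one-byte
 * idstr length and the idstr itself.
 */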
609 
610 /**
611  * mig_throttle_guest_down: throttle down the guest
612  *
613  * Reduce amount of guest cpu execution to hopefully slow down memory
614  * writes. If guest dirty memory rate is reduced below the rate at
615  * which we can transfer pages to the destination then we should be
616  * able to complete migration. Some workloads dirty memory way too
617  * fast and will not effectively converge, even with auto-converge.
618  */
619 static void mig_throttle_guest_down(void)
620 {
621     MigrationState *s = migrate_get_current();
622     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
623     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
624     int pct_max = s->parameters.max_cpu_throttle;
625 
626     /* We have not started throttling yet. Let's start it. */
627     if (!cpu_throttle_active()) {
628         cpu_throttle_set(pct_initial);
629     } else {
630         /* Throttling already on, just increase the rate */
631         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
632                          pct_max));
633     }
634 }
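/*
 * The first call sets the throttle to cpu_throttle_initial; later calls raise
 * it by cpu_throttle_increment percentage points, clamped at max_cpu_throttle.
 */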
635 
636 /**
637  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
638  *
639  * @rs: current RAM state
640  * @current_addr: address for the zero page
641  *
642  * Update the xbzrle cache to reflect a page that's been sent as all 0.
643  * The important thing is that a stale (not-yet-0'd) page be replaced
644  * by the new data.
645  * As a bonus, if the page wasn't in the cache it gets added so that
646  * when a small write is made into the 0'd page it gets XBZRLE sent.
647  */
648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
649 {
650     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
651         return;
652     }
653 
654     /* We don't care if this fails to allocate a new cache page
655      * as long as it updated an old one */
656     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
657                  ram_counters.dirty_sync_count);
658 }
659 
660 #define ENCODING_FLAG_XBZRLE 0x1
661 
662 /**
663  * save_xbzrle_page: compress and send current page
664  *
665  * Returns: 1 means that we wrote the page
666  *          0 means that page is identical to the one already sent
667  *          -1 means that xbzrle would be longer than normal
668  *
669  * @rs: current RAM state
670  * @current_data: pointer to the address of the page contents
671  * @current_addr: addr of the page
672  * @block: block that contains the page we want to send
673  * @offset: offset inside the block for the page
674  * @last_stage: if we are at the completion stage
675  */
676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
677                             ram_addr_t current_addr, RAMBlock *block,
678                             ram_addr_t offset, bool last_stage)
679 {
680     int encoded_len = 0, bytes_xbzrle;
681     uint8_t *prev_cached_page;
682 
683     if (!cache_is_cached(XBZRLE.cache, current_addr,
684                          ram_counters.dirty_sync_count)) {
685         xbzrle_counters.cache_miss++;
686         if (!last_stage) {
687             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
688                              ram_counters.dirty_sync_count) == -1) {
689                 return -1;
690             } else {
691                 /* update *current_data when the page has been
692                    inserted into cache */
693                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
694             }
695         }
696         return -1;
697     }
698 
699     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
700 
701     /* save current buffer into memory */
702     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
703 
704     /* XBZRLE encoding (if there is no overflow) */
705     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
706                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
707                                        TARGET_PAGE_SIZE);
708 
709     /*
710      * Update the cache contents, so that it corresponds to the data
711      * sent, in all cases except where we skip the page.
712      */
713     if (!last_stage && encoded_len != 0) {
714         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
715         /*
716          * In the case where we couldn't compress, ensure that the caller
717          * sends the data from the cache, since the guest might have
718          * changed the RAM since we copied it.
719          */
720         *current_data = prev_cached_page;
721     }
722 
723     if (encoded_len == 0) {
724         trace_save_xbzrle_page_skipping();
725         return 0;
726     } else if (encoded_len == -1) {
727         trace_save_xbzrle_page_overflow();
728         xbzrle_counters.overflow++;
729         return -1;
730     }
731 
732     /* Send XBZRLE based compressed page */
733     bytes_xbzrle = save_page_header(rs, rs->f, block,
734                                     offset | RAM_SAVE_FLAG_XBZRLE);
735     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
736     qemu_put_be16(rs->f, encoded_len);
737     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
738     bytes_xbzrle += encoded_len + 1 + 2;
739     xbzrle_counters.pages++;
740     xbzrle_counters.bytes += bytes_xbzrle;
741     ram_counters.transferred += bytes_xbzrle;
742 
743     return 1;
744 }
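/*
 * An XBZRLE page on the wire is the normal page header followed by one
 * ENCODING_FLAG_XBZRLE byte, a be16 encoded length and the encoded bytes;
 * this is what the "encoded_len + 1 + 2" accounting above reflects.
 */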
745 
746 /**
747  * migration_bitmap_find_dirty: find the next dirty page from start
748  *
749  * Returns the page offset within the memory region of the start of a dirty page
750  *
751  * @rs: current RAM state
752  * @rb: RAMBlock where to search for dirty pages
753  * @start: page where we start the search
754  */
755 static inline
756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
757                                           unsigned long start)
758 {
759     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
760     unsigned long *bitmap = rb->bmap;
761     unsigned long next;
762 
763     if (ramblock_is_ignored(rb)) {
764         return size;
765     }
766 
767     /*
768      * When the free page optimization is enabled, we need to check the bitmap
769      * to send the non-free pages rather than all the pages in the bulk stage.
770      */
771     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
772         next = start + 1;
773     } else {
774         next = find_next_bit(bitmap, size, start);
775     }
776 
777     return next;
778 }
779 
780 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
781                                                 RAMBlock *rb,
782                                                 unsigned long page)
783 {
784     bool ret;
785 
786     qemu_mutex_lock(&rs->bitmap_mutex);
787 
788     /*
789      * Clear dirty bitmap if needed.  This _must_ be called before we
790      * send any of the pages in the chunk, because we need to make sure
791      * we can capture further page content changes when we sync the
792      * dirty log the next time.  So as long as we are going to send any
793      * of the pages in the chunk we clear the remote dirty bitmap for all.
794      * Clearing it earlier won't be a problem, but too late will.
795      */
796     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
797         uint8_t shift = rb->clear_bmap_shift;
798         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
799         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
800 
801         /*
802          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... it
803          * can make things easier sometimes since then the start address
804          * of the small chunk will always be aligned to 64 pages, so the
805          * bitmap will always be aligned to unsigned long.  We should
806          * even be able to remove this restriction but I'm simply
807          * keeping it.
808          */
809         assert(shift >= 6);
810         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
811         memory_region_clear_dirty_bitmap(rb->mr, start, size);
812     }
813 
814     ret = test_and_clear_bit(page, rb->bmap);
815 
816     if (ret) {
817         rs->migration_dirty_pages--;
818     }
819     qemu_mutex_unlock(&rs->bitmap_mutex);
820 
821     return ret;
822 }
823 
824 /* Called with RCU critical section */
825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
826 {
827     rs->migration_dirty_pages +=
828         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
829                                               &rs->num_dirty_pages_period);
830 }
831 
832 /**
833  * ram_pagesize_summary: calculate all the pagesizes of a VM
834  *
835  * Returns a summary bitmap of the page sizes of all RAMBlocks
836  *
837  * For VMs with just normal pages this is equivalent to the host page
838  * size. If it has some huge pages then it's the OR of all the
839  * different page sizes.
840  */
841 uint64_t ram_pagesize_summary(void)
842 {
843     RAMBlock *block;
844     uint64_t summary = 0;
845 
846     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
847         summary |= block->page_size;
848     }
849 
850     return summary;
851 }
852 
853 uint64_t ram_get_total_transferred_pages(void)
854 {
855     return  ram_counters.normal + ram_counters.duplicate +
856                 compression_counters.pages + xbzrle_counters.pages;
857 }
858 
859 static void migration_update_rates(RAMState *rs, int64_t end_time)
860 {
861     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
862     double compressed_size;
863 
864     /* calculate period counters */
865     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866                 / (end_time - rs->time_last_bitmap_sync);
867 
868     if (!page_count) {
869         return;
870     }
871 
872     if (migrate_use_xbzrle()) {
873         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
874             rs->xbzrle_cache_miss_prev) / page_count;
875         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
876     }
877 
878     if (migrate_use_compression()) {
879         compression_counters.busy_rate = (double)(compression_counters.busy -
880             rs->compress_thread_busy_prev) / page_count;
881         rs->compress_thread_busy_prev = compression_counters.busy;
882 
883         compressed_size = compression_counters.compressed_size -
884                           rs->compressed_size_prev;
885         if (compressed_size) {
886             double uncompressed_size = (compression_counters.pages -
887                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
888 
889             /* Compression-Ratio = Uncompressed-size / Compressed-size */
890             compression_counters.compression_rate =
891                                         uncompressed_size / compressed_size;
892 
893             rs->compress_pages_prev = compression_counters.pages;
894             rs->compressed_size_prev = compression_counters.compressed_size;
895         }
896     }
897 }
898 
899 static void migration_trigger_throttle(RAMState *rs)
900 {
901     MigrationState *s = migrate_get_current();
902     uint64_t threshold = s->parameters.throttle_trigger_threshold;
903 
904     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
905     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
906     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
907 
908     /* During block migration the auto-converge logic incorrectly detects
909      * that ram migration makes no progress. Avoid this by disabling the
910      * throttling logic during the bulk phase of block migration. */
911     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
912         /* The following detection logic can be refined later. For now:
913            Check to see if the ratio between dirtied bytes and the approx.
914            amount of bytes that just got transferred since the last time
915            we were in this routine reaches the threshold. If that happens
916            twice, start or increase throttling. */
917 
918         if ((bytes_dirty_period > bytes_dirty_threshold) &&
919             (++rs->dirty_rate_high_cnt >= 2)) {
920             trace_migration_throttle();
921             rs->dirty_rate_high_cnt = 0;
922             mig_throttle_guest_down();
923         }
924     }
925 }
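/*
 * For example, with throttle_trigger_threshold at its usual default of 50,
 * throttling starts (or increases) once the guest re-dirties more than half
 * as many bytes as were transferred in the period, two periods in a row.
 */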
926 
927 static void migration_bitmap_sync(RAMState *rs)
928 {
929     RAMBlock *block;
930     int64_t end_time;
931 
932     ram_counters.dirty_sync_count++;
933 
934     if (!rs->time_last_bitmap_sync) {
935         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
936     }
937 
938     trace_migration_bitmap_sync_start();
939     memory_global_dirty_log_sync();
940 
941     qemu_mutex_lock(&rs->bitmap_mutex);
942     WITH_RCU_READ_LOCK_GUARD() {
943         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
944             ramblock_sync_dirty_bitmap(rs, block);
945         }
946         ram_counters.remaining = ram_bytes_remaining();
947     }
948     qemu_mutex_unlock(&rs->bitmap_mutex);
949 
950     memory_global_after_dirty_log_sync();
951     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
952 
953     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
954 
955     /* more than 1 second = 1000 milliseconds */
956     if (end_time > rs->time_last_bitmap_sync + 1000) {
957         migration_trigger_throttle(rs);
958 
959         migration_update_rates(rs, end_time);
960 
961         rs->target_page_count_prev = rs->target_page_count;
962 
963         /* reset period counters */
964         rs->time_last_bitmap_sync = end_time;
965         rs->num_dirty_pages_period = 0;
966         rs->bytes_xfer_prev = ram_counters.transferred;
967     }
968     if (migrate_use_events()) {
969         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
970     }
971 }
972 
973 static void migration_bitmap_sync_precopy(RAMState *rs)
974 {
975     Error *local_err = NULL;
976 
977     /*
978      * The current notifier usage is just an optimization for migration, so we
979      * don't stop the normal migration process in the error case.
980      */
981     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
982         error_report_err(local_err);
983     }
984 
985     migration_bitmap_sync(rs);
986 
987     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
988         error_report_err(local_err);
989     }
990 }
991 
992 /**
993  * save_zero_page_to_file: send the zero page to the file
994  *
995  * Returns the size of the data written to the file, or 0 if the page is
996  * not a zero page
997  *
998  * @rs: current RAM state
999  * @file: the file where the data is saved
1000  * @block: block that contains the page we want to send
1001  * @offset: offset inside the block for the page
1002  */
1003 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1004                                   RAMBlock *block, ram_addr_t offset)
1005 {
1006     uint8_t *p = block->host + offset;
1007     int len = 0;
1008 
1009     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1010         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1011         qemu_put_byte(file, 0);
1012         len += 1;
1013     }
1014     return len;
1015 }
1016 
1017 /**
1018  * save_zero_page: send the zero page to the stream
1019  *
1020  * Returns the number of pages written.
1021  *
1022  * @rs: current RAM state
1023  * @block: block that contains the page we want to send
1024  * @offset: offset inside the block for the page
1025  */
1026 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1027 {
1028     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1029 
1030     if (len) {
1031         ram_counters.duplicate++;
1032         ram_counters.transferred += len;
1033         return 1;
1034     }
1035     return -1;
1036 }
1037 
1038 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1039 {
1040     if (!migrate_release_ram() || !migration_in_postcopy()) {
1041         return;
1042     }
1043 
1044     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1045 }
1046 
1047 /*
1048  * @pages: the number of pages written by the control path,
1049  *        < 0 - error
1050  *        > 0 - number of pages written
1051  *
1052  * Return true if the page has been saved, otherwise false is returned.
1053  */
1054 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1055                               int *pages)
1056 {
1057     uint64_t bytes_xmit = 0;
1058     int ret;
1059 
1060     *pages = -1;
1061     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1062                                 &bytes_xmit);
1063     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1064         return false;
1065     }
1066 
1067     if (bytes_xmit) {
1068         ram_counters.transferred += bytes_xmit;
1069         *pages = 1;
1070     }
1071 
1072     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1073         return true;
1074     }
1075 
1076     if (bytes_xmit > 0) {
1077         ram_counters.normal++;
1078     } else if (bytes_xmit == 0) {
1079         ram_counters.duplicate++;
1080     }
1081 
1082     return true;
1083 }
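/*
 * ram_control_save_page() is the hook for transports that can transfer the
 * page themselves (notably RDMA); RAM_SAVE_CONTROL_NOT_SUPP just means no
 * such transport is active and the regular save paths are used instead.
 */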
1084 
1085 /*
1086  * directly send the page to the stream
1087  *
1088  * Returns the number of pages written.
1089  *
1090  * @rs: current RAM state
1091  * @block: block that contains the page we want to send
1092  * @offset: offset inside the block for the page
1093  * @buf: the page to be sent
1094  * @async: send to page asyncly
1095  * @async: send the page asynchronously
1096 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1097                             uint8_t *buf, bool async)
1098 {
1099     ram_counters.transferred += save_page_header(rs, rs->f, block,
1100                                                  offset | RAM_SAVE_FLAG_PAGE);
1101     if (async) {
1102         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1103                               migrate_release_ram() &&
1104                               migration_in_postcopy());
1105     } else {
1106         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1107     }
1108     ram_counters.transferred += TARGET_PAGE_SIZE;
1109     ram_counters.normal++;
1110     return 1;
1111 }
1112 
1113 /**
1114  * ram_save_page: send the given page to the stream
1115  *
1116  * Returns the number of pages written.
1117  *          < 0 - error
1118  *          >=0 - Number of pages written - this might legally be 0
1119  *                if xbzrle noticed the page was the same.
1120  *
1121  * @rs: current RAM state
1122  * @block: block that contains the page we want to send
1123  * @offset: offset inside the block for the page
1124  * @last_stage: if we are at the completion stage
1125  */
1126 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1127 {
1128     int pages = -1;
1129     uint8_t *p;
1130     bool send_async = true;
1131     RAMBlock *block = pss->block;
1132     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1133     ram_addr_t current_addr = block->offset + offset;
1134 
1135     p = block->host + offset;
1136     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1137 
1138     XBZRLE_cache_lock();
1139     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1140         migrate_use_xbzrle()) {
1141         pages = save_xbzrle_page(rs, &p, current_addr, block,
1142                                  offset, last_stage);
1143         if (!last_stage) {
1144             /* Can't send this cached data async, since the cache page
1145              * might get updated before it gets to the wire
1146              */
1147             send_async = false;
1148         }
1149     }
1150 
1151     /* XBZRLE overflow or normal page */
1152     if (pages == -1) {
1153         pages = save_normal_page(rs, block, offset, p, send_async);
1154     }
1155 
1156     XBZRLE_cache_unlock();
1157 
1158     return pages;
1159 }
1160 
1161 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1162                                  ram_addr_t offset)
1163 {
1164     if (multifd_queue_page(rs->f, block, offset) < 0) {
1165         return -1;
1166     }
1167     ram_counters.normal++;
1168 
1169     return 1;
1170 }
1171 
1172 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1173                                  ram_addr_t offset, uint8_t *source_buf)
1174 {
1175     RAMState *rs = ram_state;
1176     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1177     bool zero_page = false;
1178     int ret;
1179 
1180     if (save_zero_page_to_file(rs, f, block, offset)) {
1181         zero_page = true;
1182         goto exit;
1183     }
1184 
1185     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1186 
1187     /*
1188      * copy it to an internal buffer to avoid it being modified by the VM
1189      * so that we can catch errors during compression and
1190      * decompression
1191      */
1192     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1193     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1194     if (ret < 0) {
1195         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1196         error_report("compressed data failed!");
1197         return false;
1198     }
1199 
1200 exit:
1201     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1202     return zero_page;
1203 }
1204 
1205 static void
1206 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1207 {
1208     ram_counters.transferred += bytes_xmit;
1209 
1210     if (param->zero_page) {
1211         ram_counters.duplicate++;
1212         return;
1213     }
1214 
1215     /* 8 is the size of a page header with RAM_SAVE_FLAG_CONTINUE. */
1216     compression_counters.compressed_size += bytes_xmit - 8;
1217     compression_counters.pages++;
1218 }
1219 
1220 static bool save_page_use_compression(RAMState *rs);
1221 
1222 static void flush_compressed_data(RAMState *rs)
1223 {
1224     int idx, len, thread_count;
1225 
1226     if (!save_page_use_compression(rs)) {
1227         return;
1228     }
1229     thread_count = migrate_compress_threads();
1230 
1231     qemu_mutex_lock(&comp_done_lock);
1232     for (idx = 0; idx < thread_count; idx++) {
1233         while (!comp_param[idx].done) {
1234             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1235         }
1236     }
1237     qemu_mutex_unlock(&comp_done_lock);
1238 
1239     for (idx = 0; idx < thread_count; idx++) {
1240         qemu_mutex_lock(&comp_param[idx].mutex);
1241         if (!comp_param[idx].quit) {
1242             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1243             /*
1244              * it's safe to fetch zero_page without holding comp_done_lock
1245              * as there is no further request submitted to the thread,
1246              * i.e., the thread should be waiting for a request at this point.
1247              */
1248             update_compress_thread_counts(&comp_param[idx], len);
1249         }
1250         qemu_mutex_unlock(&comp_param[idx].mutex);
1251     }
1252 }
1253 
1254 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1255                                        ram_addr_t offset)
1256 {
1257     param->block = block;
1258     param->offset = offset;
1259 }
1260 
1261 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1262                                            ram_addr_t offset)
1263 {
1264     int idx, thread_count, bytes_xmit = -1, pages = -1;
1265     bool wait = migrate_compress_wait_thread();
1266 
1267     thread_count = migrate_compress_threads();
1268     qemu_mutex_lock(&comp_done_lock);
1269 retry:
1270     for (idx = 0; idx < thread_count; idx++) {
1271         if (comp_param[idx].done) {
1272             comp_param[idx].done = false;
1273             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1274             qemu_mutex_lock(&comp_param[idx].mutex);
1275             set_compress_params(&comp_param[idx], block, offset);
1276             qemu_cond_signal(&comp_param[idx].cond);
1277             qemu_mutex_unlock(&comp_param[idx].mutex);
1278             pages = 1;
1279             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1280             break;
1281         }
1282     }
1283 
1284     /*
1285      * wait for a free thread if the user specifies 'compress-wait-thread',
1286      * otherwise we will post the page out in the main thread as a normal page.
1287      */
1288     if (pages < 0 && wait) {
1289         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1290         goto retry;
1291     }
1292     qemu_mutex_unlock(&comp_done_lock);
1293 
1294     return pages;
1295 }
1296 
1297 /**
1298  * find_dirty_block: find the next dirty page and update any state
1299  * associated with the search process.
1300  *
1301  * Returns true if a page is found
1302  *
1303  * @rs: current RAM state
1304  * @pss: data about the state of the current dirty page scan
1305  * @again: set to false if the search has scanned the whole of RAM
1306  */
1307 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1308 {
1309     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1310     if (pss->complete_round && pss->block == rs->last_seen_block &&
1311         pss->page >= rs->last_page) {
1312         /*
1313          * We've been once around the RAM and haven't found anything.
1314          * Give up.
1315          */
1316         *again = false;
1317         return false;
1318     }
1319     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1320         >= pss->block->used_length) {
1321         /* Didn't find anything in this RAM Block */
1322         pss->page = 0;
1323         pss->block = QLIST_NEXT_RCU(pss->block, next);
1324         if (!pss->block) {
1325             /*
1326              * If memory migration starts over, we will meet a dirtied page
1327              * which may still exist in the compression threads' ring, so we
1328              * should flush the compressed data to make sure the new page
1329              * is not overwritten by the old one in the destination.
1330              *
1331              * Also, if xbzrle is on, stop using the data compression at this
1332              * point. In theory, xbzrle can do better than compression.
1333              */
1334             flush_compressed_data(rs);
1335 
1336             /* Hit the end of the list */
1337             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1338             /* Flag that we've looped */
1339             pss->complete_round = true;
1340             rs->ram_bulk_stage = false;
1341         }
1342         /* Didn't find anything this time, but try again on the new block */
1343         *again = true;
1344         return false;
1345     } else {
1346         /* Can go around again, but... */
1347         *again = true;
1348         /* We've found something so we probably don't need to go around again */
1349         return true;
1350     }
1351 }
1352 
1353 /**
1354  * unqueue_page: gets a page of the queue
1355  * unqueue_page: gets a page off the queue
1356  * Helper for 'get_queued_page' - gets a page off the queue
1357  *
1358  * Returns the block of the page (or NULL if none available)
1359  *
1360  * @rs: current RAM state
1361  * @offset: used to return the offset within the RAMBlock
1362  */
1363 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1364 {
1365     RAMBlock *block = NULL;
1366 
1367     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1368         return NULL;
1369     }
1370 
1371     qemu_mutex_lock(&rs->src_page_req_mutex);
1372     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1373         struct RAMSrcPageRequest *entry =
1374                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1375         block = entry->rb;
1376         *offset = entry->offset;
1377 
1378         if (entry->len > TARGET_PAGE_SIZE) {
1379             entry->len -= TARGET_PAGE_SIZE;
1380             entry->offset += TARGET_PAGE_SIZE;
1381         } else {
1382             memory_region_unref(block->mr);
1383             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1384             g_free(entry);
1385             migration_consume_urgent_request();
1386         }
1387     }
1388     qemu_mutex_unlock(&rs->src_page_req_mutex);
1389 
1390     return block;
1391 }
1392 
1393 /**
1394  * get_queued_page: unqueue a page from the postcopy requests
1395  *
1396  * Skips pages that are already sent (!dirty)
1397  *
1398  * Returns true if a queued page is found
1399  *
1400  * @rs: current RAM state
1401  * @pss: data about the state of the current dirty page scan
1402  */
1403 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1404 {
1405     RAMBlock  *block;
1406     ram_addr_t offset;
1407     bool dirty;
1408 
1409     do {
1410         block = unqueue_page(rs, &offset);
1411         /*
1412          * We're sending this page, and since it's postcopy nothing else
1413          * will dirty it, and we must make sure it doesn't get sent again
1414          * even if this queue request was received after the background
1415          * search already sent it.
1416          */
1417         if (block) {
1418             unsigned long page;
1419 
1420             page = offset >> TARGET_PAGE_BITS;
1421             dirty = test_bit(page, block->bmap);
1422             if (!dirty) {
1423                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1424                                                 page);
1425             } else {
1426                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1427             }
1428         }
1429 
1430     } while (block && !dirty);
1431 
1432     if (block) {
1433         /*
1434          * As soon as we start servicing pages out of order, we have
1435          * to kill the bulk stage, since the bulk stage assumes
1436          * (in migration_bitmap_find_and_reset_dirty) that every page is
1437          * dirty, which is no longer true.
1438          */
1439         rs->ram_bulk_stage = false;
1440 
1441         /*
1442          * We want the background search to continue from the queued page
1443          * since the guest is likely to want other pages near to the page
1444          * it just requested.
1445          */
1446         pss->block = block;
1447         pss->page = offset >> TARGET_PAGE_BITS;
1448 
1449         /*
1450          * This unqueued page would break the "one round" check, even if
1451          * it is really rare.
1452          */
1453         pss->complete_round = false;
1454     }
1455 
1456     return !!block;
1457 }
1458 
1459 /**
1460  * migration_page_queue_free: drop any remaining pages in the ram
1461  * request queue
1462  *
1463  * It should be empty at the end anyway, but in error cases there may
1464  * be some left.  If any pages are left, we drop them.
1465  *
1466  */
1467 static void migration_page_queue_free(RAMState *rs)
1468 {
1469     struct RAMSrcPageRequest *mspr, *next_mspr;
1470     /* This queue generally should be empty - but in the case of a failed
1471      * migration it might have some leftover entries.
1472      */
1473     RCU_READ_LOCK_GUARD();
1474     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1475         memory_region_unref(mspr->rb->mr);
1476         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1477         g_free(mspr);
1478     }
1479 }
1480 
1481 /**
1482  * ram_save_queue_pages: queue the page for transmission
1483  *
1484  * A request from postcopy destination for example.
1485  *
1486  * Returns zero on success or negative on error
1487  *
1488  * @rbname: Name of the RAMBlock of the request. NULL means the
1489  *          same as the last one.
1490  * @start: starting address from the start of the RAMBlock
1491  * @len: length (in bytes) to send
1492  */
1493 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1494 {
1495     RAMBlock *ramblock;
1496     RAMState *rs = ram_state;
1497 
1498     ram_counters.postcopy_requests++;
1499     RCU_READ_LOCK_GUARD();
1500 
1501     if (!rbname) {
1502         /* Reuse last RAMBlock */
1503         ramblock = rs->last_req_rb;
1504 
1505         if (!ramblock) {
1506             /*
1507              * Shouldn't happen, we can't reuse the last RAMBlock if
1508              * it's the 1st request.
1509              */
1510             error_report("ram_save_queue_pages no previous block");
1511             return -1;
1512         }
1513     } else {
1514         ramblock = qemu_ram_block_by_name(rbname);
1515 
1516         if (!ramblock) {
1517             /* We shouldn't be asked for a non-existent RAMBlock */
1518             error_report("ram_save_queue_pages no block '%s'", rbname);
1519             return -1;
1520         }
1521         rs->last_req_rb = ramblock;
1522     }
1523     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1524     if (start + len > ramblock->used_length) {
1525         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1526                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1527                      __func__, start, len, ramblock->used_length);
1528         return -1;
1529     }
1530 
1531     struct RAMSrcPageRequest *new_entry =
1532         g_malloc0(sizeof(struct RAMSrcPageRequest));
1533     new_entry->rb = ramblock;
1534     new_entry->offset = start;
1535     new_entry->len = len;
1536 
1537     memory_region_ref(ramblock->mr);
1538     qemu_mutex_lock(&rs->src_page_req_mutex);
1539     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1540     migration_make_urgent_request();
1541     qemu_mutex_unlock(&rs->src_page_req_mutex);
1542 
1543     return 0;
1544 }
1545 
1546 static bool save_page_use_compression(RAMState *rs)
1547 {
1548     if (!migrate_use_compression()) {
1549         return false;
1550     }
1551 
1552     /*
1553      * If xbzrle is on, stop using the data compression after the first
1554      * round of migration even if compression is enabled. In theory,
1555      * xbzrle can do better than compression.
1556      */
1557     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1558         return true;
1559     }
1560 
1561     return false;
1562 }
1563 
1564 /*
1565  * try to compress the page before posting it out, return true if the page
1566  * has been properly handled by compression, otherwise it needs other
1567  * paths to handle it
1568  */
1569 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1570 {
1571     if (!save_page_use_compression(rs)) {
1572         return false;
1573     }
1574 
1575     /*
1576      * When starting the process of a new block, the first page of
1577      * the block should be sent out before other pages in the same
1578      * block, and all the pages in the last block should have been sent
1579      * out.  Keeping this order is important, because the 'cont' flag
1580      * is used to avoid resending the block name.
1581      *
1582      * We post the first page as a normal page as compression will take
1583      * much CPU resource.
1584      */
1585     if (block != rs->last_sent_block) {
1586         flush_compressed_data(rs);
1587         return false;
1588     }
1589 
1590     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1591         return true;
1592     }
1593 
1594     compression_counters.busy++;
1595     return false;
1596 }
1597 
1598 /**
1599  * ram_save_target_page: save one target page
1600  *
1601  * Returns the number of pages written
1602  *
1603  * @rs: current RAM state
1604  * @pss: data about the page we want to send
1605  * @last_stage: if we are at the completion stage
1606  */
1607 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1608                                 bool last_stage)
1609 {
1610     RAMBlock *block = pss->block;
1611     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1612     int res;
1613 
1614     if (control_save_page(rs, block, offset, &res)) {
1615         return res;
1616     }
1617 
1618     if (save_compress_page(rs, block, offset)) {
1619         return 1;
1620     }
1621 
1622     res = save_zero_page(rs, block, offset);
1623     if (res > 0) {
1624         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1625          * page would be stale
1626          */
1627         if (!save_page_use_compression(rs)) {
1628             XBZRLE_cache_lock();
1629             xbzrle_cache_zero_page(rs, block->offset + offset);
1630             XBZRLE_cache_unlock();
1631         }
1632         ram_release_pages(block->idstr, offset, res);
1633         return res;
1634     }
1635 
1636     /*
1637      * Do not use multifd for:
1638      * 1. Compression, as the first page in a new block should be posted out
1639      *    before sending the compressed page
1640      * 2. Postcopy, as one whole host page should be placed at a time
1641      */
1642     if (!save_page_use_compression(rs) && migrate_use_multifd()
1643         && !migration_in_postcopy()) {
1644         return ram_save_multifd_page(rs, block, offset);
1645     }
1646 
1647     return ram_save_page(rs, pss, last_stage);
1648 }
1649 
1650 /**
1651  * ram_save_host_page: save a whole host page
1652  *
1653  * Starting at *offset send pages up to the end of the current host
1654  * page. It's valid for the initial offset to point into the middle of
1655  * a host page, in which case the remainder of the host page is sent.
1656  * Only dirty target pages are sent. Note that the host page size may
1657  * be a huge page for this block.
1658  * The saving stops at the boundary of the used_length of the block
1659  * if the RAMBlock isn't a multiple of the host page size.
1660  *
1661  * Returns the number of pages written or negative on error
1662  *
1663  * @rs: current RAM state
1665  * @pss: data about the page we want to send
1666  * @last_stage: if we are at the completion stage
1667  */
1668 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1669                               bool last_stage)
1670 {
1671     int tmppages, pages = 0;
1672     size_t pagesize_bits =
1673         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1674 
1675     if (ramblock_is_ignored(pss->block)) {
1676         error_report("block %s should not be migrated !", pss->block->idstr);
1677         return 0;
1678     }
1679 
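    /*
     * Walk the target pages that make up this host page, sending each
     * dirty one; stop once we cross the host page boundary or run past
     * the block's used_length.
     */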
1680     do {
1681         /* Check if the page is dirty and, if it is, send it */
1682         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1683             pss->page++;
1684             continue;
1685         }
1686 
1687         tmppages = ram_save_target_page(rs, pss, last_stage);
1688         if (tmppages < 0) {
1689             return tmppages;
1690         }
1691 
1692         pages += tmppages;
1693         pss->page++;
1694         /* Allow rate limiting to happen in the middle of huge pages */
1695         migration_rate_limit();
1696     } while ((pss->page & (pagesize_bits - 1)) &&
1697              offset_in_ramblock(pss->block,
1698                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1699 
1700     /* The offset we leave with is the last one we looked at */
1701     pss->page--;
1702     return pages;
1703 }
1704 
1705 /**
1706  * ram_find_and_save_block: finds a dirty page and sends it to f
1707  *
1708  * Called within an RCU critical section.
1709  *
1710  * Returns the number of pages written where zero means no dirty pages,
1711  * or negative on error
1712  *
1713  * @rs: current RAM state
1714  * @last_stage: if we are at the completion stage
1715  *
1716  * On systems where host-page-size > target-page-size it will send all the
1717  * pages in a host page that are dirty.
1718  */
1719 
1720 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1721 {
1722     PageSearchStatus pss;
1723     int pages = 0;
1724     bool again, found;
1725 
1726     /* No dirty page as there is zero RAM */
1727     if (!ram_bytes_total()) {
1728         return pages;
1729     }
1730 
1731     pss.block = rs->last_seen_block;
1732     pss.page = rs->last_page;
1733     pss.complete_round = false;
1734 
1735     if (!pss.block) {
1736         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1737     }
1738 
1739     do {
1740         again = true;
1741         found = get_queued_page(rs, &pss);
1742 
1743         if (!found) {
1744             /* priority queue empty, so just search for something dirty */
1745             found = find_dirty_block(rs, &pss, &again);
1746         }
1747 
1748         if (found) {
1749             pages = ram_save_host_page(rs, &pss, last_stage);
1750         }
1751     } while (!pages && again);
1752 
1753     rs->last_seen_block = pss.block;
1754     rs->last_page = pss.page;
1755 
1756     return pages;
1757 }
1758 
1759 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1760 {
1761     uint64_t pages = size / TARGET_PAGE_SIZE;
1762 
1763     if (zero) {
1764         ram_counters.duplicate += pages;
1765     } else {
1766         ram_counters.normal += pages;
1767         ram_counters.transferred += size;
1768         qemu_update_position(f, size);
1769     }
1770 }
1771 
1772 static uint64_t ram_bytes_total_common(bool count_ignored)
1773 {
1774     RAMBlock *block;
1775     uint64_t total = 0;
1776 
1777     RCU_READ_LOCK_GUARD();
1778 
1779     if (count_ignored) {
1780         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1781             total += block->used_length;
1782         }
1783     } else {
1784         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1785             total += block->used_length;
1786         }
1787     }
1788     return total;
1789 }
1790 
1791 uint64_t ram_bytes_total(void)
1792 {
1793     return ram_bytes_total_common(false);
1794 }
1795 
1796 static void xbzrle_load_setup(void)
1797 {
1798     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1799 }
1800 
1801 static void xbzrle_load_cleanup(void)
1802 {
1803     g_free(XBZRLE.decoded_buf);
1804     XBZRLE.decoded_buf = NULL;
1805 }
1806 
1807 static void ram_state_cleanup(RAMState **rsp)
1808 {
1809     if (*rsp) {
1810         migration_page_queue_free(*rsp);
1811         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1812         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1813         g_free(*rsp);
1814         *rsp = NULL;
1815     }
1816 }
1817 
1818 static void xbzrle_cleanup(void)
1819 {
1820     XBZRLE_cache_lock();
1821     if (XBZRLE.cache) {
1822         cache_fini(XBZRLE.cache);
1823         g_free(XBZRLE.encoded_buf);
1824         g_free(XBZRLE.current_buf);
1825         g_free(XBZRLE.zero_target_page);
1826         XBZRLE.cache = NULL;
1827         XBZRLE.encoded_buf = NULL;
1828         XBZRLE.current_buf = NULL;
1829         XBZRLE.zero_target_page = NULL;
1830     }
1831     XBZRLE_cache_unlock();
1832 }
1833 
1834 static void ram_save_cleanup(void *opaque)
1835 {
1836     RAMState **rsp = opaque;
1837     RAMBlock *block;
1838 
1839     /* The caller holds the iothread lock or is in a bottom half, so there
1840      * is no race writing the migration bitmap.
1841      */
1842     memory_global_dirty_log_stop();
1843 
1844     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1845         g_free(block->clear_bmap);
1846         block->clear_bmap = NULL;
1847         g_free(block->bmap);
1848         block->bmap = NULL;
1849     }
1850 
1851     xbzrle_cleanup();
1852     compress_threads_save_cleanup();
1853     ram_state_cleanup(rsp);
1854 }
1855 
1856 static void ram_state_reset(RAMState *rs)
1857 {
1858     rs->last_seen_block = NULL;
1859     rs->last_sent_block = NULL;
1860     rs->last_page = 0;
1861     rs->last_version = ram_list.version;
1862     rs->ram_bulk_stage = true;
1863     rs->fpo_enabled = false;
1864 }
1865 
1866 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1867 
1868 /*
1869  * 'expected' is the value you expect the bitmap mostly to be full
1870  * of; it won't bother printing lines that are all this value.
1872  */
1873 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1874                            unsigned long pages)
1875 {
1876     int64_t cur;
1877     int64_t linelen = 128;
1878     char linebuf[129];
1879 
1880     for (cur = 0; cur < pages; cur += linelen) {
1881         int64_t curb;
1882         bool found = false;
1883         /*
1884          * Last line; catch the case where the line length
1885          * is longer than remaining ram
1886          */
1887         if (cur + linelen > pages) {
1888             linelen = pages - cur;
1889         }
1890         for (curb = 0; curb < linelen; curb++) {
1891             bool thisbit = test_bit(cur + curb, todump);
1892             linebuf[curb] = thisbit ? '1' : '.';
1893             found = found || (thisbit != expected);
1894         }
1895         if (found) {
1896             linebuf[curb] = '\0';
1897             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1898         }
1899     }
1900 }
1901 
1902 /* **** functions for postcopy ***** */
1903 
1904 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1905 {
1906     struct RAMBlock *block;
1907 
1908     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1909         unsigned long *bitmap = block->bmap;
1910         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1911         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1912 
1913         while (run_start < range) {
1914             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1915             ram_discard_range(block->idstr,
1916                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1917                               ((ram_addr_t)(run_end - run_start))
1918                                 << TARGET_PAGE_BITS);
1919             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1920         }
1921     }
1922 }
1923 
1924 /**
1925  * postcopy_send_discard_bm_ram: discard a RAMBlock
1926  *
1927  * Returns zero on success
1928  *
1929  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1930  *
1931  * @ms: current migration state
1932  * @block: RAMBlock to discard
1933  */
1934 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1935 {
1936     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1937     unsigned long current;
1938     unsigned long *bitmap = block->bmap;
1939 
1940     for (current = 0; current < end; ) {
1941         unsigned long one = find_next_bit(bitmap, end, current);
1942         unsigned long zero, discard_length;
1943 
1944         if (one >= end) {
1945             break;
1946         }
1947 
1948         zero = find_next_zero_bit(bitmap, end, one + 1);
1949 
1950         if (zero >= end) {
1951             discard_length = end - one;
1952         } else {
1953             discard_length = zero - one;
1954         }
1955         postcopy_discard_send_range(ms, one, discard_length);
1956         current = one + discard_length;
1957     }
1958 
1959     return 0;
1960 }
1961 
1962 /**
1963  * postcopy_each_ram_send_discard: discard all RAMBlocks
1964  *
1965  * Returns 0 for success or negative for error
1966  *
1967  * Utility for the outgoing postcopy code.
1968  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1969  *   passing it bitmap indexes and name.
1970  * (qemu_ram_foreach_block ends up passing unscaled lengths
1971  *  which would mean postcopy code would have to deal with target page)
1972  *
1973  * @ms: current migration state
1974  */
1975 static int postcopy_each_ram_send_discard(MigrationState *ms)
1976 {
1977     struct RAMBlock *block;
1978     int ret;
1979 
1980     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1981         postcopy_discard_send_init(ms, block->idstr);
1982 
1983         /*
1984          * Postcopy sends chunks of bitmap over the wire, but it
1985          * just needs indexes at this point, which avoids it having
1986          * target-page-specific code.
1987          */
1988         ret = postcopy_send_discard_bm_ram(ms, block);
1989         postcopy_discard_send_finish(ms);
1990         if (ret) {
1991             return ret;
1992         }
1993     }
1994 
1995     return 0;
1996 }
1997 
1998 /**
1999  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2000  *
2001  * Helper for postcopy_chunk_hostpages; it is called once per RAMBlock
2002  * to canonicalize that block's dirty bitmap so that no host page is
2003  * left partially dirty.
2004  *
2005  * Postcopy requires that all target pages in a host page are dirty or
2006  * clean, not a mix.  This function canonicalizes the bitmap accordingly.
2007  *
2008  * @ms: current migration state
2009  * @block: block that contains the page we want to canonicalize
2010  */
2011 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2012 {
2013     RAMState *rs = ram_state;
2014     unsigned long *bitmap = block->bmap;
2015     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2016     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2017     unsigned long run_start;
2018 
2019     if (block->page_size == TARGET_PAGE_SIZE) {
2020         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2021         return;
2022     }
2023 
2024     /* Find a dirty page */
2025     run_start = find_next_bit(bitmap, pages, 0);
2026 
2027     while (run_start < pages) {
2028 
2029         /*
2030          * If the start of this run of pages is in the middle of a host
2031          * page, then we need to fixup this host page.
2032          */
2033         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2034             /* Find the end of this run */
2035             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2036             /*
2037              * If the end isn't at the start of a host page, then the
2038              * run doesn't finish at the end of a host page
2039              * and we need to discard.
2040              */
2041         }
2042 
2043         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2044             unsigned long page;
2045             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2046                                                              host_ratio);
2047             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2048 
2049             /* Clean up the bitmap */
2050             for (page = fixup_start_addr;
2051                  page < fixup_start_addr + host_ratio; page++) {
2052                 /*
2053                  * Remark them as dirty, updating the count for any pages
2054                  * that weren't previously dirty.
2055                  */
2056                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2057             }
2058         }
2059 
2060         /* Find the next dirty page for the next iteration */
2061         run_start = find_next_bit(bitmap, pages, run_start);
2062     }
2063 }
2064 
2065 /**
2066  * postcopy_chunk_hostpages: discard any partially sent host page
2067  *
2068  * Utility for the outgoing postcopy code.
2069  *
2070  * Discard any partially sent host-page size chunks, mark any partially
2071  * dirty host-page size chunks as all dirty.  In this case the host-page
2072  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2073  *
2074  * Returns zero on success
2075  *
2076  * @ms: current migration state
2077  * @block: block we want to work with
2078  */
2079 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2080 {
2081     postcopy_discard_send_init(ms, block->idstr);
2082 
2083     /*
2084      * Ensure that all partially dirty host pages are made fully dirty.
2085      */
2086     postcopy_chunk_hostpages_pass(ms, block);
2087 
2088     postcopy_discard_send_finish(ms);
2089     return 0;
2090 }
2091 
2092 /**
2093  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2094  *
2095  * Returns zero on success
2096  *
2097  * Transmit the set of pages to be discarded after precopy to the target;
2098  * these are pages that:
2099  *     a) have been previously transmitted but are now dirty again
2100  *     b) have never been transmitted; this ensures that
2101  *        any pages on the destination that have been mapped by background
2102  *        tasks get discarded (transparent huge pages are the specific concern)
2103  * Hopefully this set is pretty sparse.
2104  *
2105  * @ms: current migration state
2106  */
2107 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2108 {
2109     RAMState *rs = ram_state;
2110     RAMBlock *block;
2111     int ret;
2112 
2113     RCU_READ_LOCK_GUARD();
2114 
2115     /* This should be our last sync, the src is now paused */
2116     migration_bitmap_sync(rs);
2117 
2118     /* Easiest way to make sure we don't resume in the middle of a host-page */
2119     rs->last_seen_block = NULL;
2120     rs->last_sent_block = NULL;
2121     rs->last_page = 0;
2122 
2123     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2124         /* Deal with TPS != HPS and huge pages */
2125         ret = postcopy_chunk_hostpages(ms, block);
2126         if (ret) {
2127             return ret;
2128         }
2129 
2130 #ifdef DEBUG_POSTCOPY
2131         ram_debug_dump_bitmap(block->bmap, true,
2132                               block->used_length >> TARGET_PAGE_BITS);
2133 #endif
2134     }
2135     trace_ram_postcopy_send_discard_bitmap();
2136 
2137     ret = postcopy_each_ram_send_discard(ms);
2138 
2139     return ret;
2140 }
2141 
2142 /**
2143  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2144  *
2145  * Returns zero on success
2146  *
2147  * @rbname: name of the RAMBlock of the request. NULL means the
2148  *          same as the last one.
2149  * @start: starting offset within the RAMBlock, in bytes
2150  * @length: length of the range to discard, in bytes
2151  */
2152 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2153 {
2154     trace_ram_discard_range(rbname, start, length);
2155 
2156     RCU_READ_LOCK_GUARD();
2157     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2158 
2159     if (!rb) {
2160         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2161         return -1;
2162     }
2163 
2164     /*
2165      * On source VM, we don't need to update the received bitmap since
2166      * we don't even have one.
2167      */
2168     if (rb->receivedmap) {
2169         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2170                      length >> qemu_target_page_bits());
2171     }
2172 
2173     return ram_block_discard_range(rb, start, length);
2174 }
2175 
2176 /*
2177  * For every allocation, we try not to crash the VM if the
2178  * allocation fails.
2179  */
2180 static int xbzrle_init(void)
2181 {
2182     Error *local_err = NULL;
2183 
2184     if (!migrate_use_xbzrle()) {
2185         return 0;
2186     }
2187 
2188     XBZRLE_cache_lock();
2189 
2190     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2191     if (!XBZRLE.zero_target_page) {
2192         error_report("%s: Error allocating zero page", __func__);
2193         goto err_out;
2194     }
2195 
2196     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2197                               TARGET_PAGE_SIZE, &local_err);
2198     if (!XBZRLE.cache) {
2199         error_report_err(local_err);
2200         goto free_zero_page;
2201     }
2202 
2203     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2204     if (!XBZRLE.encoded_buf) {
2205         error_report("%s: Error allocating encoded_buf", __func__);
2206         goto free_cache;
2207     }
2208 
2209     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2210     if (!XBZRLE.current_buf) {
2211         error_report("%s: Error allocating current_buf", __func__);
2212         goto free_encoded_buf;
2213     }
2214 
2215     /* We are all good */
2216     XBZRLE_cache_unlock();
2217     return 0;
2218 
2219 free_encoded_buf:
2220     g_free(XBZRLE.encoded_buf);
2221     XBZRLE.encoded_buf = NULL;
2222 free_cache:
2223     cache_fini(XBZRLE.cache);
2224     XBZRLE.cache = NULL;
2225 free_zero_page:
2226     g_free(XBZRLE.zero_target_page);
2227     XBZRLE.zero_target_page = NULL;
2228 err_out:
2229     XBZRLE_cache_unlock();
2230     return -ENOMEM;
2231 }
2232 
2233 static int ram_state_init(RAMState **rsp)
2234 {
2235     *rsp = g_try_new0(RAMState, 1);
2236 
2237     if (!*rsp) {
2238         error_report("%s: Init ramstate fail", __func__);
2239         return -1;
2240     }
2241 
2242     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2243     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2244     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2245 
2246     /*
2247      * Count the total number of pages used by ram blocks not including any
2248      * gaps due to alignment or unplugs.
2249      * This must match the initial values of the dirty bitmap.
2250      */
2251     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2252     ram_state_reset(*rsp);
2253 
2254     return 0;
2255 }
2256 
2257 static void ram_list_init_bitmaps(void)
2258 {
2259     MigrationState *ms = migrate_get_current();
2260     RAMBlock *block;
2261     unsigned long pages;
2262     uint8_t shift;
2263 
2264     /* Skip setting bitmap if there is no RAM */
2265     if (ram_bytes_total()) {
2266         shift = ms->clear_bitmap_shift;
2267         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2268             error_report("clear_bitmap_shift (%u) too big, using "
2269                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2270             shift = CLEAR_BITMAP_SHIFT_MAX;
2271         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2272             error_report("clear_bitmap_shift (%u) too small, using "
2273                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2274             shift = CLEAR_BITMAP_SHIFT_MIN;
2275         }
2276 
2277         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2278             pages = block->max_length >> TARGET_PAGE_BITS;
2279             /*
2280              * The initial dirty bitmap for migration must be set with all
2281              * ones to make sure we'll migrate every guest RAM page to the
2282              * destination.
2283              * Here we set RAMBlock.bmap all to 1 because when restarting a
2284              * new migration after a failed one, ram_list.
2285              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
2286              * guest memory.
2287              */
2288             block->bmap = bitmap_new(pages);
2289             bitmap_set(block->bmap, 0, pages);
2290             block->clear_bmap_shift = shift;
2291             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2292         }
2293     }
2294 }
2295 
2296 static void ram_init_bitmaps(RAMState *rs)
2297 {
2298     /* For memory_global_dirty_log_start below.  */
2299     qemu_mutex_lock_iothread();
2300     qemu_mutex_lock_ramlist();
2301 
2302     WITH_RCU_READ_LOCK_GUARD() {
2303         ram_list_init_bitmaps();
2304         memory_global_dirty_log_start();
2305         migration_bitmap_sync_precopy(rs);
2306     }
2307     qemu_mutex_unlock_ramlist();
2308     qemu_mutex_unlock_iothread();
2309 }
2310 
2311 static int ram_init_all(RAMState **rsp)
2312 {
2313     if (ram_state_init(rsp)) {
2314         return -1;
2315     }
2316 
2317     if (xbzrle_init()) {
2318         ram_state_cleanup(rsp);
2319         return -1;
2320     }
2321 
2322     ram_init_bitmaps(*rsp);
2323 
2324     return 0;
2325 }
2326 
2327 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2328 {
2329     RAMBlock *block;
2330     uint64_t pages = 0;
2331 
2332     /*
2333      * Postcopy is not using xbzrle/compression, so no need for that.
2334      * Also, since the source is already halted, we don't need to care
2335      * about dirty page logging either.
2336      */
2337 
2338     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2339         pages += bitmap_count_one(block->bmap,
2340                                   block->used_length >> TARGET_PAGE_BITS);
2341     }
2342 
2343     /* This may not be aligned with current bitmaps. Recalculate. */
2344     rs->migration_dirty_pages = pages;
2345 
2346     rs->last_seen_block = NULL;
2347     rs->last_sent_block = NULL;
2348     rs->last_page = 0;
2349     rs->last_version = ram_list.version;
2350     /*
2351      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2352      * matter what we have sent.
2353      */
2354     rs->ram_bulk_stage = false;
2355 
2356     /* Update RAMState cache of output QEMUFile */
2357     rs->f = out;
2358 
2359     trace_ram_state_resume_prepare(pages);
2360 }
2361 
2362 /*
2363  * This function clears bits of the free pages reported by the caller from the
2364  * migration dirty bitmap. @addr is the host address corresponding to the
2365  * start of the contiguous guest free pages, and @len is the total bytes of
2366  * those pages.
2367  */
2368 void qemu_guest_free_page_hint(void *addr, size_t len)
2369 {
2370     RAMBlock *block;
2371     ram_addr_t offset;
2372     size_t used_len, start, npages;
2373     MigrationState *s = migrate_get_current();
2374 
2375     /* This function is currently expected to be used during live migration */
2376     if (!migration_is_setup_or_active(s->state)) {
2377         return;
2378     }
2379 
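    /*
     * Process the hinted range one RAMBlock at a time: each iteration
     * handles the part of [addr, addr + len) that falls within a single
     * block and clears the matching bits of that block's dirty bitmap.
     */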
2380     for (; len > 0; len -= used_len, addr += used_len) {
2381         block = qemu_ram_block_from_host(addr, false, &offset);
2382         if (unlikely(!block || offset >= block->used_length)) {
2383             /*
2384              * The implementation might not support RAMBlock resize during
2385              * live migration, but it could happen in theory with future
2386              * updates. So we add a check here to capture that case.
2387              */
2388             error_report_once("%s unexpected error", __func__);
2389             return;
2390         }
2391 
2392         if (len <= block->used_length - offset) {
2393             used_len = len;
2394         } else {
2395             used_len = block->used_length - offset;
2396         }
2397 
2398         start = offset >> TARGET_PAGE_BITS;
2399         npages = used_len >> TARGET_PAGE_BITS;
2400 
2401         qemu_mutex_lock(&ram_state->bitmap_mutex);
2402         ram_state->migration_dirty_pages -=
2403                       bitmap_count_one_with_offset(block->bmap, start, npages);
2404         bitmap_clear(block->bmap, start, npages);
2405         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2406     }
2407 }
2408 
2409 /*
2410  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2411  * long-running RCU critical section.  When rcu-reclaims in the code
2412  * start to become numerous it will be necessary to reduce the
2413  * granularity of these critical sections.
2414  */
2415 
2416 /**
2417  * ram_save_setup: Setup RAM for migration
2418  *
2419  * Returns zero to indicate success and negative for error
2420  *
2421  * @f: QEMUFile where to send the data
2422  * @opaque: RAMState pointer
2423  */
2424 static int ram_save_setup(QEMUFile *f, void *opaque)
2425 {
2426     RAMState **rsp = opaque;
2427     RAMBlock *block;
2428 
2429     if (compress_threads_save_setup()) {
2430         return -1;
2431     }
2432 
2433     /* migration has already set up the bitmap, reuse it. */
2434     if (!migration_in_colo_state()) {
2435         if (ram_init_all(rsp) != 0) {
2436             compress_threads_save_cleanup();
2437             return -1;
2438         }
2439     }
2440     (*rsp)->f = f;
2441 
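    /*
     * Setup stage stream layout: a be64 carrying the total RAM size
     * (tagged with RAM_SAVE_FLAG_MEM_SIZE), then, for every migratable
     * block, its idstr length, idstr and used_length, optionally its
     * page size (when postcopy is enabled and it differs from the host
     * page size) and optionally its address (when ignore-shared is
     * enabled), followed by RAM_SAVE_FLAG_EOS.
     */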
2442     WITH_RCU_READ_LOCK_GUARD() {
2443         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2444 
2445         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2446             qemu_put_byte(f, strlen(block->idstr));
2447             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2448             qemu_put_be64(f, block->used_length);
2449             if (migrate_postcopy_ram() && block->page_size !=
2450                                           qemu_host_page_size) {
2451                 qemu_put_be64(f, block->page_size);
2452             }
2453             if (migrate_ignore_shared()) {
2454                 qemu_put_be64(f, block->mr->addr);
2455             }
2456         }
2457     }
2458 
2459     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2460     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2461 
2462     multifd_send_sync_main(f);
2463     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2464     qemu_fflush(f);
2465 
2466     return 0;
2467 }
2468 
2469 /**
2470  * ram_save_iterate: iterative stage for migration
2471  *
2472  * Returns zero to indicate success and negative for error
2473  *
2474  * @f: QEMUFile where to send the data
2475  * @opaque: RAMState pointer
2476  */
2477 static int ram_save_iterate(QEMUFile *f, void *opaque)
2478 {
2479     RAMState **temp = opaque;
2480     RAMState *rs = *temp;
2481     int ret = 0;
2482     int i;
2483     int64_t t0;
2484     int done = 0;
2485 
2486     if (blk_mig_bulk_active()) {
2487         /* Avoid transferring ram during bulk phase of block migration as
2488          * the bulk phase will usually take a long time and transferring
2489          * ram updates during that time is pointless. */
2490         goto out;
2491     }
2492 
2493     WITH_RCU_READ_LOCK_GUARD() {
2494         if (ram_list.version != rs->last_version) {
2495             ram_state_reset(rs);
2496         }
2497 
2498         /* Read version before ram_list.blocks */
2499         smp_rmb();
2500 
2501         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2502 
2503         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2504         i = 0;
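        /*
         * Send dirty pages until the rate limit kicks in (with the page
         * request queue empty), there is nothing left to send, an error
         * occurs, or more than MAX_WAIT ms have been spent here.
         */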
2505         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2506                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2507             int pages;
2508 
2509             if (qemu_file_get_error(f)) {
2510                 break;
2511             }
2512 
2513             pages = ram_find_and_save_block(rs, false);
2514             /* no more pages to send */
2515             if (pages == 0) {
2516                 done = 1;
2517                 break;
2518             }
2519 
2520             if (pages < 0) {
2521                 qemu_file_set_error(f, pages);
2522                 break;
2523             }
2524 
2525             rs->target_page_count += pages;
2526 
2527             /*
2528              * During postcopy, it is necessary to make sure one whole host
2529              * page is sent in one chunk.
2530              */
2531             if (migrate_postcopy_ram()) {
2532                 flush_compressed_data(rs);
2533             }
2534 
2535             /*
2536              * We want to check in the 1st loop, just in case it was the 1st
2537              * time and we had to sync the dirty bitmap.
2538              * qemu_clock_get_ns() is a bit expensive, so we only check every
2539              * few iterations.
2540              */
2541             if ((i & 63) == 0) {
2542                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2543                               1000000;
2544                 if (t1 > MAX_WAIT) {
2545                     trace_ram_save_iterate_big_wait(t1, i);
2546                     break;
2547                 }
2548             }
2549             i++;
2550         }
2551     }
2552 
2553     /*
2554      * Must occur before EOS (or any QEMUFile operation)
2555      * because of RDMA protocol.
2556      */
2557     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2558 
2559 out:
2560     if (ret >= 0
2561         && migration_is_setup_or_active(migrate_get_current()->state)) {
2562         multifd_send_sync_main(rs->f);
2563         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2564         qemu_fflush(f);
2565         ram_counters.transferred += 8;
2566 
2567         ret = qemu_file_get_error(f);
2568     }
2569     if (ret < 0) {
2570         return ret;
2571     }
2572 
2573     return done;
2574 }
2575 
2576 /**
2577  * ram_save_complete: function called to send the remaining amount of ram
2578  *
2579  * Returns zero to indicate success or negative on error
2580  *
2581  * Called with iothread lock
2582  *
2583  * @f: QEMUFile where to send the data
2584  * @opaque: RAMState pointer
2585  */
2586 static int ram_save_complete(QEMUFile *f, void *opaque)
2587 {
2588     RAMState **temp = opaque;
2589     RAMState *rs = *temp;
2590     int ret = 0;
2591 
2592     WITH_RCU_READ_LOCK_GUARD() {
2593         if (!migration_in_postcopy()) {
2594             migration_bitmap_sync_precopy(rs);
2595         }
2596 
2597         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2598 
2599         /* try transferring iterative blocks of memory */
2600 
2601         /* flush all remaining blocks regardless of rate limiting */
2602         while (true) {
2603             int pages;
2604 
2605             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2606             /* no more blocks to send */
2607             if (pages == 0) {
2608                 break;
2609             }
2610             if (pages < 0) {
2611                 ret = pages;
2612                 break;
2613             }
2614         }
2615 
2616         flush_compressed_data(rs);
2617         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2618     }
2619 
2620     if (ret >= 0) {
2621         multifd_send_sync_main(rs->f);
2622         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2623         qemu_fflush(f);
2624     }
2625 
2626     return ret;
2627 }
2628 
2629 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2630                              uint64_t *res_precopy_only,
2631                              uint64_t *res_compatible,
2632                              uint64_t *res_postcopy_only)
2633 {
2634     RAMState **temp = opaque;
2635     RAMState *rs = *temp;
2636     uint64_t remaining_size;
2637 
2638     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2639 
2640     if (!migration_in_postcopy() &&
2641         remaining_size < max_size) {
2642         qemu_mutex_lock_iothread();
2643         WITH_RCU_READ_LOCK_GUARD() {
2644             migration_bitmap_sync_precopy(rs);
2645         }
2646         qemu_mutex_unlock_iothread();
2647         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2648     }
2649 
2650     if (migrate_postcopy_ram()) {
2651         /* We can do postcopy, and all the data is postcopiable */
2652         *res_compatible += remaining_size;
2653     } else {
2654         *res_precopy_only += remaining_size;
2655     }
2656 }
2657 
2658 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2659 {
2660     unsigned int xh_len;
2661     int xh_flags;
2662     uint8_t *loaded_data;
2663 
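    /*
     * Wire format: one byte of flags (must be ENCODING_FLAG_XBZRLE),
     * a big-endian 16-bit length, then that many bytes of XBZRLE
     * encoded data which are decoded on top of the existing page.
     */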
2664     /* extract RLE header */
2665     xh_flags = qemu_get_byte(f);
2666     xh_len = qemu_get_be16(f);
2667 
2668     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2669         error_report("Failed to load XBZRLE page - wrong compression!");
2670         return -1;
2671     }
2672 
2673     if (xh_len > TARGET_PAGE_SIZE) {
2674         error_report("Failed to load XBZRLE page - len overflow!");
2675         return -1;
2676     }
2677     loaded_data = XBZRLE.decoded_buf;
2678     /* load data and decode */
2679     /* it can change loaded_data to point to an internal buffer */
2680     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2681 
2682     /* decode RLE */
2683     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2684                              TARGET_PAGE_SIZE) == -1) {
2685         error_report("Failed to load XBZRLE page - decode error!");
2686         return -1;
2687     }
2688 
2689     return 0;
2690 }
2691 
2692 /**
2693  * ram_block_from_stream: read a RAMBlock id from the migration stream
2694  *
2695  * Must be called from within a rcu critical section.
2696  *
2697  * Returns a pointer from within the RCU-protected ram_list.
2698  *
2699  * @f: QEMUFile where to read the data from
2700  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2701  */
2702 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2703 {
2704     static RAMBlock *block = NULL;
2705     char id[256];
2706     uint8_t len;
2707 
2708     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2709         if (!block) {
2710             error_report("Ack, bad migration stream!");
2711             return NULL;
2712         }
2713         return block;
2714     }
2715 
2716     len = qemu_get_byte(f);
2717     qemu_get_buffer(f, (uint8_t *)id, len);
2718     id[len] = 0;
2719 
2720     block = qemu_ram_block_by_name(id);
2721     if (!block) {
2722         error_report("Can't find block %s", id);
2723         return NULL;
2724     }
2725 
2726     if (ramblock_is_ignored(block)) {
2727         error_report("block %s should not be migrated !", id);
2728         return NULL;
2729     }
2730 
2731     return block;
2732 }
2733 
2734 static inline void *host_from_ram_block_offset(RAMBlock *block,
2735                                                ram_addr_t offset)
2736 {
2737     if (!offset_in_ramblock(block, offset)) {
2738         return NULL;
2739     }
2740 
2741     return block->host + offset;
2742 }
2743 
2744 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2745                              ram_addr_t offset, bool record_bitmap)
2746 {
2747     if (!offset_in_ramblock(block, offset)) {
2748         return NULL;
2749     }
2750     if (!block->colo_cache) {
2751         error_report("%s: colo_cache is NULL in block :%s",
2752                      __func__, block->idstr);
2753         return NULL;
2754     }
2755 
2756     /*
2757      * During COLO checkpoint, we need a bitmap of these migrated pages.
2758      * It helps us decide which pages in the ram cache should be flushed
2759      * into the VM's RAM later.
2760      */
2761     if (record_bitmap &&
2762         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2763         ram_state->migration_dirty_pages++;
2764     }
2765     return block->colo_cache + offset;
2766 }
2767 
2768 /**
2769  * ram_handle_compressed: handle the zero page case
2770  *
2771  * If a page (or a whole RDMA chunk) has been
2772  * determined to be zero, then zap it.
2773  *
2774  * @host: host address for the zero page
2775  * @ch: what the page is filled with.  We only support zero
2776  * @size: size of the zero page
2777  */
2778 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2779 {
2780     if (ch != 0 || !is_zero_range(host, size)) {
2781         memset(host, ch, size);
2782     }
2783 }
2784 
2785 /* return the size after decompression, or a negative value on error */
2786 static int
2787 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2788                      const uint8_t *source, size_t source_len)
2789 {
2790     int err;
2791 
2792     err = inflateReset(stream);
2793     if (err != Z_OK) {
2794         return -1;
2795     }
2796 
2797     stream->avail_in = source_len;
2798     stream->next_in = (uint8_t *)source;
2799     stream->avail_out = dest_len;
2800     stream->next_out = dest;
2801 
2802     err = inflate(stream, Z_NO_FLUSH);
2803     if (err != Z_STREAM_END) {
2804         return -1;
2805     }
2806 
2807     return stream->total_out;
2808 }
2809 
2810 static void *do_data_decompress(void *opaque)
2811 {
2812     DecompressParam *param = opaque;
2813     unsigned long pagesize;
2814     uint8_t *des;
2815     int len, ret;
2816 
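    /*
     * Worker loop: sleep on param->cond until the load thread hands us
     * a compressed buffer (param->des is set), decompress it straight
     * into guest memory, then mark ourselves done and signal
     * decomp_done_cond so waiters can reuse this thread.
     */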
2817     qemu_mutex_lock(&param->mutex);
2818     while (!param->quit) {
2819         if (param->des) {
2820             des = param->des;
2821             len = param->len;
2822             param->des = 0;
2823             qemu_mutex_unlock(&param->mutex);
2824 
2825             pagesize = TARGET_PAGE_SIZE;
2826 
2827             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2828                                        param->compbuf, len);
2829             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2830                 error_report("decompress data failed");
2831                 qemu_file_set_error(decomp_file, ret);
2832             }
2833 
2834             qemu_mutex_lock(&decomp_done_lock);
2835             param->done = true;
2836             qemu_cond_signal(&decomp_done_cond);
2837             qemu_mutex_unlock(&decomp_done_lock);
2838 
2839             qemu_mutex_lock(&param->mutex);
2840         } else {
2841             qemu_cond_wait(&param->cond, &param->mutex);
2842         }
2843     }
2844     qemu_mutex_unlock(&param->mutex);
2845 
2846     return NULL;
2847 }
2848 
2849 static int wait_for_decompress_done(void)
2850 {
2851     int idx, thread_count;
2852 
2853     if (!migrate_use_compression()) {
2854         return 0;
2855     }
2856 
2857     thread_count = migrate_decompress_threads();
2858     qemu_mutex_lock(&decomp_done_lock);
2859     for (idx = 0; idx < thread_count; idx++) {
2860         while (!decomp_param[idx].done) {
2861             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2862         }
2863     }
2864     qemu_mutex_unlock(&decomp_done_lock);
2865     return qemu_file_get_error(decomp_file);
2866 }
2867 
2868 static void compress_threads_load_cleanup(void)
2869 {
2870     int i, thread_count;
2871 
2872     if (!migrate_use_compression()) {
2873         return;
2874     }
2875     thread_count = migrate_decompress_threads();
2876     for (i = 0; i < thread_count; i++) {
2877         /*
2878          * We use it as an indicator of whether the thread is
2879          * properly initialized or not.
2880          */
2881         if (!decomp_param[i].compbuf) {
2882             break;
2883         }
2884 
2885         qemu_mutex_lock(&decomp_param[i].mutex);
2886         decomp_param[i].quit = true;
2887         qemu_cond_signal(&decomp_param[i].cond);
2888         qemu_mutex_unlock(&decomp_param[i].mutex);
2889     }
2890     for (i = 0; i < thread_count; i++) {
2891         if (!decomp_param[i].compbuf) {
2892             break;
2893         }
2894 
2895         qemu_thread_join(decompress_threads + i);
2896         qemu_mutex_destroy(&decomp_param[i].mutex);
2897         qemu_cond_destroy(&decomp_param[i].cond);
2898         inflateEnd(&decomp_param[i].stream);
2899         g_free(decomp_param[i].compbuf);
2900         decomp_param[i].compbuf = NULL;
2901     }
2902     g_free(decompress_threads);
2903     g_free(decomp_param);
2904     decompress_threads = NULL;
2905     decomp_param = NULL;
2906     decomp_file = NULL;
2907 }
2908 
2909 static int compress_threads_load_setup(QEMUFile *f)
2910 {
2911     int i, thread_count;
2912 
2913     if (!migrate_use_compression()) {
2914         return 0;
2915     }
2916 
2917     thread_count = migrate_decompress_threads();
2918     decompress_threads = g_new0(QemuThread, thread_count);
2919     decomp_param = g_new0(DecompressParam, thread_count);
2920     qemu_mutex_init(&decomp_done_lock);
2921     qemu_cond_init(&decomp_done_cond);
2922     decomp_file = f;
2923     for (i = 0; i < thread_count; i++) {
2924         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2925             goto exit;
2926         }
2927 
2928         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2929         qemu_mutex_init(&decomp_param[i].mutex);
2930         qemu_cond_init(&decomp_param[i].cond);
2931         decomp_param[i].done = true;
2932         decomp_param[i].quit = false;
2933         qemu_thread_create(decompress_threads + i, "decompress",
2934                            do_data_decompress, decomp_param + i,
2935                            QEMU_THREAD_JOINABLE);
2936     }
2937     return 0;
2938 exit:
2939     compress_threads_load_cleanup();
2940     return -1;
2941 }
2942 
2943 static void decompress_data_with_multi_threads(QEMUFile *f,
2944                                                void *host, int len)
2945 {
2946     int idx, thread_count;
2947 
2948     thread_count = migrate_decompress_threads();
2949     qemu_mutex_lock(&decomp_done_lock);
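    /*
     * Find an idle decompression thread, copy the compressed data into
     * its buffer and kick it; if every thread is busy, wait on
     * decomp_done_cond until one of them finishes.
     */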
2950     while (true) {
2951         for (idx = 0; idx < thread_count; idx++) {
2952             if (decomp_param[idx].done) {
2953                 decomp_param[idx].done = false;
2954                 qemu_mutex_lock(&decomp_param[idx].mutex);
2955                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2956                 decomp_param[idx].des = host;
2957                 decomp_param[idx].len = len;
2958                 qemu_cond_signal(&decomp_param[idx].cond);
2959                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2960                 break;
2961             }
2962         }
2963         if (idx < thread_count) {
2964             break;
2965         } else {
2966             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2967         }
2968     }
2969     qemu_mutex_unlock(&decomp_done_lock);
2970 }
2971 
2972 /*
2973  * colo cache: this is for the secondary VM.  We cache the whole
2974  * memory of the secondary VM; the global lock must be held to
2975  * call this helper.
2976  */
2977 int colo_init_ram_cache(void)
2978 {
2979     RAMBlock *block;
2980 
2981     WITH_RCU_READ_LOCK_GUARD() {
2982         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2983             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2984                                                     NULL,
2985                                                     false);
2986             if (!block->colo_cache) {
2987                 error_report("%s: Can't alloc memory for COLO cache of block %s, "
2988                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2989                              block->used_length);
2990                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2991                     if (block->colo_cache) {
2992                         qemu_anon_ram_free(block->colo_cache, block->used_length);
2993                         block->colo_cache = NULL;
2994                     }
2995                 }
2996                 return -errno;
2997             }
2998         }
2999     }
3000 
3001     /*
3002      * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3003      * decide which pages in the cache should be flushed into the SVM's RAM.
3004      * Here we use the same name 'ram_bitmap' as for migration.
3005      */
3006     if (ram_bytes_total()) {
3007         RAMBlock *block;
3008 
3009         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3010             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3011             block->bmap = bitmap_new(pages);
3012         }
3013     }
3014 
3015     ram_state_init(&ram_state);
3016     return 0;
3017 }
3018 
3019 /* TODO: duplicated with ram_init_bitmaps */
3020 void colo_incoming_start_dirty_log(void)
3021 {
3022     RAMBlock *block = NULL;
3023     /* For memory_global_dirty_log_start below. */
3024     qemu_mutex_lock_iothread();
3025     qemu_mutex_lock_ramlist();
3026 
3027     memory_global_dirty_log_sync();
3028     WITH_RCU_READ_LOCK_GUARD() {
3029         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3030             ramblock_sync_dirty_bitmap(ram_state, block);
3031             /* Discard this dirty bitmap record */
3032             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3033         }
3034         memory_global_dirty_log_start();
3035     }
3036     ram_state->migration_dirty_pages = 0;
3037     qemu_mutex_unlock_ramlist();
3038     qemu_mutex_unlock_iothread();
3039 }
3040 
3041 /* The global lock must be held to call this helper */
3042 void colo_release_ram_cache(void)
3043 {
3044     RAMBlock *block;
3045 
3046     memory_global_dirty_log_stop();
3047     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3048         g_free(block->bmap);
3049         block->bmap = NULL;
3050     }
3051 
3052     WITH_RCU_READ_LOCK_GUARD() {
3053         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3054             if (block->colo_cache) {
3055                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3056                 block->colo_cache = NULL;
3057             }
3058         }
3059     }
3060     ram_state_cleanup(&ram_state);
3061 }
3062 
3063 /**
3064  * ram_load_setup: Setup RAM for migration incoming side
3065  *
3066  * Returns zero to indicate success and negative for error
3067  *
3068  * @f: QEMUFile where to receive the data
3069  * @opaque: RAMState pointer
3070  */
3071 static int ram_load_setup(QEMUFile *f, void *opaque)
3072 {
3073     if (compress_threads_load_setup(f)) {
3074         return -1;
3075     }
3076 
3077     xbzrle_load_setup();
3078     ramblock_recv_map_init();
3079 
3080     return 0;
3081 }
3082 
3083 static int ram_load_cleanup(void *opaque)
3084 {
3085     RAMBlock *rb;
3086 
3087     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3088         qemu_ram_block_writeback(rb);
3089     }
3090 
3091     xbzrle_load_cleanup();
3092     compress_threads_load_cleanup();
3093 
3094     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3095         g_free(rb->receivedmap);
3096         rb->receivedmap = NULL;
3097     }
3098 
3099     return 0;
3100 }
3101 
3102 /**
3103  * ram_postcopy_incoming_init: allocate postcopy data structures
3104  *
3105  * Returns 0 for success and negative if there was one error
3106  *
3107  * @mis: current migration incoming state
3108  *
3109  * Allocate data structures etc needed by incoming migration with
3110  * postcopy-ram. postcopy-ram's similarly named
3111  * postcopy_ram_incoming_init does the work.
3112  */
3113 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3114 {
3115     return postcopy_ram_incoming_init(mis);
3116 }
3117 
3118 /**
3119  * ram_load_postcopy: load a page in postcopy case
3120  *
3121  * Returns 0 for success or -errno in case of error
3122  *
3123  * Called in postcopy mode by ram_load().
3124  * rcu_read_lock is taken prior to this being called.
3125  *
3126  * @f: QEMUFile where to send the data
3127  */
3128 static int ram_load_postcopy(QEMUFile *f)
3129 {
3130     int flags = 0, ret = 0;
3131     bool place_needed = false;
3132     bool matches_target_page_size = false;
3133     MigrationIncomingState *mis = migration_incoming_get_current();
3134     /* Temporary page that is later 'placed' */
3135     void *postcopy_host_page = mis->postcopy_tmp_page;
3136     void *this_host = NULL;
3137     bool all_zero = false;
3138     int target_pages = 0;
3139 
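    /*
     * Incoming target pages are gathered (normally via the temporary
     * postcopy_host_page) until a whole host page has been received;
     * only then is it placed atomically into guest memory with
     * postcopy_place_page() or postcopy_place_page_zero().
     */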
3140     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3141         ram_addr_t addr;
3142         void *host = NULL;
3143         void *page_buffer = NULL;
3144         void *place_source = NULL;
3145         RAMBlock *block = NULL;
3146         uint8_t ch;
3147         int len;
3148 
3149         addr = qemu_get_be64(f);
3150 
3151         /*
3152          * If there is a qemu file error, we should stop here; "addr"
3153          * may be invalid.
3154          */
3155         ret = qemu_file_get_error(f);
3156         if (ret) {
3157             break;
3158         }
3159 
3160         flags = addr & ~TARGET_PAGE_MASK;
3161         addr &= TARGET_PAGE_MASK;
3162 
3163         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3164         place_needed = false;
3165         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3166                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3167             block = ram_block_from_stream(f, flags);
3168 
3169             host = host_from_ram_block_offset(block, addr);
3170             if (!host) {
3171                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3172                 ret = -EINVAL;
3173                 break;
3174             }
3175             target_pages++;
3176             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3177             /*
3178              * Postcopy requires that we place whole host pages atomically;
3179              * these may be huge pages for RAMBlocks that are backed by
3180              * hugetlbfs.
3181              * To make it atomic, the data is read into a temporary page
3182              * that's moved into place later.
3183              * The migration protocol uses, possibly smaller, target pages;
3184              * however, the source ensures it always sends all the components
3185              * of a host page in one chunk.
3186              */
3187             page_buffer = postcopy_host_page +
3188                           ((uintptr_t)host & (block->page_size - 1));
3189             /* If all target pages are zero then we can optimise the place */
3190             if (target_pages == 1) {
3191                 all_zero = true;
3192                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3193                                                     block->page_size);
3194             } else {
3195                 /* not the 1st target page within the host page */
3196                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3197                     (uintptr_t)this_host) {
3198                     error_report("Non-same host page %p/%p",
3199                                   host, this_host);
3200                     ret = -EINVAL;
3201                     break;
3202                 }
3203             }
3204 
3205             /*
3206              * If it's the last part of a host page then we place the host
3207              * page
3208              */
3209             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3210                 place_needed = true;
3211                 target_pages = 0;
3212             }
3213             place_source = postcopy_host_page;
3214         }
3215 
3216         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3217         case RAM_SAVE_FLAG_ZERO:
3218             ch = qemu_get_byte(f);
3219             /*
3220              * We can skip setting page_buffer when
3221              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3222              */
3223             if (ch || !matches_target_page_size) {
3224                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3225             }
3226             if (ch) {
3227                 all_zero = false;
3228             }
3229             break;
3230 
3231         case RAM_SAVE_FLAG_PAGE:
3232             all_zero = false;
3233             if (!matches_target_page_size) {
3234                 /* For huge pages, we always use temporary buffer */
3235                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3236             } else {
3237                 /*
3238                  * For small pages that match the target page size, we
3239                  * avoid the qemu_file copy.  Instead we directly use
3240                  * the buffer of QEMUFile to place the page.  Note: we
3241                  * cannot do any QEMUFile operation before using that
3242                  * buffer to make sure the buffer is valid when
3243                  * placing the page.
3244                  */
3245                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3246                                          TARGET_PAGE_SIZE);
3247             }
3248             break;
3249         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3250             all_zero = false;
3251             len = qemu_get_be32(f);
3252             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3253                 error_report("Invalid compressed data length: %d", len);
3254                 ret = -EINVAL;
3255                 break;
3256             }
3257             decompress_data_with_multi_threads(f, page_buffer, len);
3258             break;
3259 
3260         case RAM_SAVE_FLAG_EOS:
3261             /* normal exit */
3262             multifd_recv_sync_main();
3263             break;
3264         default:
3265             error_report("Unknown combination of migration flags: %#x"
3266                          " (postcopy mode)", flags);
3267             ret = -EINVAL;
3268             break;
3269         }
3270 
3271         /* Got the whole host page, wait for decompress before placing. */
3272         if (place_needed) {
3273             ret |= wait_for_decompress_done();
3274         }
3275 
3276         /* Detect for any possible file errors */
3277         if (!ret && qemu_file_get_error(f)) {
3278             ret = qemu_file_get_error(f);
3279         }
3280 
3281         if (!ret && place_needed) {
3282             /* This gets called at the last target page in the host page */
3283             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3284                                                        block->page_size);
3285 
3286             if (all_zero) {
3287                 ret = postcopy_place_page_zero(mis, place_dest,
3288                                                block);
3289             } else {
3290                 ret = postcopy_place_page(mis, place_dest,
3291                                           place_source, block);
3292             }
3293         }
3294     }
3295 
3296     return ret;
3297 }
3298 
3299 static bool postcopy_is_advised(void)
3300 {
3301     PostcopyState ps = postcopy_state_get();
3302     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3303 }
3304 
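/*
 * True once postcopy is actually in progress: the incoming postcopy state
 * is anywhere between LISTENING (inclusive) and END.
 */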
3305 static bool postcopy_is_running(void)
3306 {
3307     PostcopyState ps = postcopy_state_get();
3308     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3309 }
3310 
3311 /*
3312  * Flush the content of the RAM cache into the SVM's memory.
3313  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3314  */
3315 static void colo_flush_ram_cache(void)
3316 {
3317     RAMBlock *block = NULL;
3318     void *dst_host;
3319     void *src_host;
3320     unsigned long offset = 0;
3321 
3322     memory_global_dirty_log_sync();
3323     WITH_RCU_READ_LOCK_GUARD() {
3324         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3325             ramblock_sync_dirty_bitmap(ram_state, block);
3326         }
3327     }
3328 
3329     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3330     WITH_RCU_READ_LOCK_GUARD() {
3331         block = QLIST_FIRST_RCU(&ram_list.blocks);
3332 
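        /*
         * Walk the dirty bitmap of every block; for each dirty page, copy
         * the backed-up page from colo_cache into the SVM's memory.
         */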
3333         while (block) {
3334             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3335 
3336             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3337                 >= block->used_length) {
3338                 offset = 0;
3339                 block = QLIST_NEXT_RCU(block, next);
3340             } else {
3341                 migration_bitmap_clear_dirty(ram_state, block, offset);
3342                 dst_host = block->host
3343                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3344                 src_host = block->colo_cache
3345                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3346                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3347             }
3348         }
3349     }
3350     trace_colo_flush_ram_cache_end();
3351 }
3352 
3353 /**
3354  * ram_load_precopy: load pages in precopy case
3355  *
3356  * Returns 0 for success or -errno in case of error
3357  *
3358  * Called in precopy mode by ram_load().
3359  * rcu_read_lock is taken prior to this being called.
3360  *
3361  * @f: QEMUFile to read the data from
3362  */
3363 static int ram_load_precopy(QEMUFile *f)
3364 {
3365     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3366     /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3367     bool postcopy_advised = postcopy_is_advised();
3368     if (!migrate_use_compression()) {
3369         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3370     }
3371 
3372     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3373         ram_addr_t addr, total_ram_bytes;
3374         void *host = NULL, *host_bak = NULL;
3375         uint8_t ch;
3376 
3377         /*
3378          * Yield periodically to let the main loop run, but an iteration of
3379          * the main loop is expensive, so only do it every so many iterations.
3380          */
3381         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3382             aio_co_schedule(qemu_get_current_aio_context(),
3383                             qemu_coroutine_self());
3384             qemu_coroutine_yield();
3385         }
3386         i++;
3387 
3388         addr = qemu_get_be64(f);
3389         flags = addr & ~TARGET_PAGE_MASK;
3390         addr &= TARGET_PAGE_MASK;
3391 
3392         if (flags & invalid_flags) {
3393             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3394                 error_report("Received an unexpected compressed page");
3395             }
3396 
3397             ret = -EINVAL;
3398             break;
3399         }
3400 
3401         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3402                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3403             RAMBlock *block = ram_block_from_stream(f, flags);
3404 
3405             host = host_from_ram_block_offset(block, addr);
3406             /*
3407              * After entering the COLO stage, we must not load pages into
3408              * the SVM's memory directly; we put them into colo_cache first.
3409              * NOTE: we need to keep a copy of the SVM's RAM in colo_cache.
3410              * Previously, all of this memory was copied in the COLO
3411              * preparation stage while the VM was stopped, which was
3412              * time-consuming.  Here we optimize it by backing up every page
3413              * during migration while COLO is enabled; this slows the
3414              * migration somewhat, but it clearly reduces the downtime of
3415              * backing up all the SVM's memory in the COLO preparation stage.
3416              */
3417             if (migration_incoming_colo_enabled()) {
3418                 if (migration_incoming_in_colo_state()) {
3419                     /* In COLO stage, put all pages into cache temporarily */
3420                     host = colo_cache_from_block_offset(block, addr, true);
3421                 } else {
3422                    /*
3423                     * In the migration stage but before the COLO stage,
3424                     * put all pages into both the cache and the SVM's memory.
3425                     */
3426                     host_bak = colo_cache_from_block_offset(block, addr, false);
3427                 }
3428             }
3429             if (!host) {
3430                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3431                 ret = -EINVAL;
3432                 break;
3433             }
3434             if (!migration_incoming_in_colo_state()) {
3435                 ramblock_recv_bitmap_set(block, host);
3436             }
3437 
3438             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3439         }
3440 
3441         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3442         case RAM_SAVE_FLAG_MEM_SIZE:
3443             /* Synchronize RAM block list */
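            /*
             * Each block record in the stream is: idstr length (1 byte),
             * the idstr itself, then the block length (be64).  When
             * postcopy was advised and the block's page size differs from
             * the host page size, a be64 page size follows; with
             * ignore-shared enabled, a be64 GPA follows as well.
             */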
3444             total_ram_bytes = addr;
3445             while (!ret && total_ram_bytes) {
3446                 RAMBlock *block;
3447                 char id[256];
3448                 ram_addr_t length;
3449 
3450                 len = qemu_get_byte(f);
3451                 qemu_get_buffer(f, (uint8_t *)id, len);
3452                 id[len] = 0;
3453                 length = qemu_get_be64(f);
3454 
3455                 block = qemu_ram_block_by_name(id);
3456                 if (block && !qemu_ram_is_migratable(block)) {
3457                     error_report("block %s should not be migrated!", id);
3458                     ret = -EINVAL;
3459                 } else if (block) {
3460                     if (length != block->used_length) {
3461                         Error *local_err = NULL;
3462 
3463                         ret = qemu_ram_resize(block, length,
3464                                               &local_err);
3465                         if (local_err) {
3466                             error_report_err(local_err);
3467                         }
3468                     }
3469                     /* For postcopy we need to check that hugepage sizes match */
3470                     if (postcopy_advised &&
3471                         block->page_size != qemu_host_page_size) {
3472                         uint64_t remote_page_size = qemu_get_be64(f);
3473                         if (remote_page_size != block->page_size) {
3474                             error_report("Mismatched RAM page size %s "
3475                                          "(local) %zu != %" PRIu64,
3476                                          id, block->page_size,
3477                                          remote_page_size);
3478                             ret = -EINVAL;
3479                         }
3480                     }
3481                     if (migrate_ignore_shared()) {
3482                         hwaddr addr = qemu_get_be64(f);
3483                         if (ramblock_is_ignored(block) &&
3484                             block->mr->addr != addr) {
3485                             error_report("Mismatched GPAs for block %s "
3486                                          "%" PRIu64 " != %" PRIu64,
3487                                          id, (uint64_t)addr,
3488                                          (uint64_t)block->mr->addr);
3489                             ret = -EINVAL;
3490                         }
3491                     }
3492                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3493                                           block->idstr);
3494                 } else {
3495                     error_report("Unknown ramblock \"%s\", cannot "
3496                                  "accept migration", id);
3497                     ret = -EINVAL;
3498                 }
3499 
3500                 total_ram_bytes -= length;
3501             }
3502             break;
3503 
3504         case RAM_SAVE_FLAG_ZERO:
3505             ch = qemu_get_byte(f);
3506             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3507             break;
3508 
3509         case RAM_SAVE_FLAG_PAGE:
3510             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3511             break;
3512 
3513         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3514             len = qemu_get_be32(f);
3515             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3516                 error_report("Invalid compressed data length: %d", len);
3517                 ret = -EINVAL;
3518                 break;
3519             }
3520             decompress_data_with_multi_threads(f, host, len);
3521             break;
3522 
3523         case RAM_SAVE_FLAG_XBZRLE:
3524             if (load_xbzrle(f, addr, host) < 0) {
3525                 error_report("Failed to decompress XBZRLE page at "
3526                              RAM_ADDR_FMT, addr);
3527                 ret = -EINVAL;
3528                 break;
3529             }
3530             break;
3531         case RAM_SAVE_FLAG_EOS:
3532             /* normal exit */
3533             multifd_recv_sync_main();
3534             break;
3535         default:
3536             if (flags & RAM_SAVE_FLAG_HOOK) {
3537                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3538             } else {
3539                 error_report("Unknown combination of migration flags: %#x",
3540                              flags);
3541                 ret = -EINVAL;
3542             }
3543         }
3544         if (!ret) {
3545             ret = qemu_file_get_error(f);
3546         }
3547         if (!ret && host_bak) {
3548             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3549         }
3550     }
3551 
3552     ret |= wait_for_decompress_done();
3553     return ret;
3554 }
3555 
3556 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3557 {
3558     int ret = 0;
3559     static uint64_t seq_iter;
3560     /*
3561      * If the system is running in postcopy mode, page inserts into host
3562      * memory must be atomic.
3563      */
3564     bool postcopy_running = postcopy_is_running();
3565 
3566     seq_iter++;
3567 
3568     if (version_id != 4) {
3569         return -EINVAL;
3570     }
3571 
3572     /*
3573      * This RCU critical section can run for a very long time.
3574      * When RCU reclaims in the code become numerous,
3575      * it will be necessary to reduce the granularity of this
3576      * critical section.
3577      */
3578     WITH_RCU_READ_LOCK_GUARD() {
3579         if (postcopy_running) {
3580             ret = ram_load_postcopy(f);
3581         } else {
3582             ret = ram_load_precopy(f);
3583         }
3584     }
3585     trace_ram_load_complete(ret, seq_iter);
3586 
3587     if (!ret  && migration_incoming_in_colo_state()) {
3588         colo_flush_ram_cache();
3589     }
3590     return ret;
3591 }
3592 
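/*
 * Postcopy is refused when any RAM block is backed by persistent memory
 * (pmem); otherwise follow the postcopy-ram capability setting.
 */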
3593 static bool ram_has_postcopy(void *opaque)
3594 {
3595     RAMBlock *rb;
3596     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3597         if (ramblock_is_pmem(rb)) {
3598             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3599                         "is not supported now!", rb->idstr, rb->host);
3600             return false;
3601         }
3602     }
3603 
3604     return migrate_postcopy_ram();
3605 }
3606 
3607 /* Sync all the dirty bitmaps with the destination VM.  */
3608 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3609 {
3610     RAMBlock *block;
3611     QEMUFile *file = s->to_dst_file;
3612     int ramblock_count = 0;
3613 
3614     trace_ram_dirty_bitmap_sync_start();
3615 
3616     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3617         qemu_savevm_send_recv_bitmap(file, block->idstr);
3618         trace_ram_dirty_bitmap_request(block->idstr);
3619         ramblock_count++;
3620     }
3621 
3622     trace_ram_dirty_bitmap_sync_wait();
3623 
3624     /* Wait until all the ramblocks' dirty bitmaps are synced */
3625     while (ramblock_count--) {
3626         qemu_sem_wait(&s->rp_state.rp_sem);
3627     }
3628 
3629     trace_ram_dirty_bitmap_sync_complete();
3630 
3631     return 0;
3632 }
3633 
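/*
 * Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem once per
 * ramblock whose bitmap has been requested.
 */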
3634 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3635 {
3636     qemu_sem_post(&s->rp_state.rp_sem);
3637 }
3638 
3639 /*
3640  * Read the received bitmap and invert it to use as the initial dirty bitmap.
3641  * This is only used when a postcopy migration is paused and needs
3642  * to resume from a middle point.
3643  */
3644 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3645 {
3646     int ret = -EINVAL;
3647     QEMUFile *file = s->rp_state.from_dst_file;
3648     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3649     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3650     uint64_t size, end_mark;
3651 
3652     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3653 
3654     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3655         error_report("%s: incorrect state %s", __func__,
3656                      MigrationStatus_str(s->state));
3657         return -EINVAL;
3658     }
3659 
3660     /*
3661      * Note: see comments in ramblock_recv_bitmap_send() on why we
3662      * need the endianness conversion and the padding.
3663      */
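    /*
     * On the wire: the bitmap size in bytes (be64), the little-endian
     * bitmap itself (padded up to a multiple of 8 bytes), then an end
     * mark (be64, RAMBLOCK_RECV_BITMAP_ENDING).
     */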
3664     local_size = ROUND_UP(local_size, 8);
3665 
3666     /* Add paddings */
3667     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3668 
3669     size = qemu_get_be64(file);
3670 
3671     /* The size of the bitmap should match our ramblock */
3672     if (size != local_size) {
3673         error_report("%s: ramblock '%s' bitmap size mismatch "
3674                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3675                      block->idstr, size, local_size);
3676         ret = -EINVAL;
3677         goto out;
3678     }
3679 
3680     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3681     end_mark = qemu_get_be64(file);
3682 
3683     ret = qemu_file_get_error(file);
3684     if (ret || size != local_size) {
3685         error_report("%s: read bitmap failed for ramblock '%s': %d"
3686                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3687                      __func__, block->idstr, ret, local_size, size);
3688         ret = -EIO;
3689         goto out;
3690     }
3691 
3692     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3693         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3694                      __func__, block->idstr, end_mark);
3695         ret = -EINVAL;
3696         goto out;
3697     }
3698 
3699     /*
3700      * Endianness conversion.  We are in postcopy (though paused).
3701      * The dirty bitmap won't change. We can directly modify it.
3702      */
3703     bitmap_from_le(block->bmap, le_bitmap, nbits);
3704 
3705     /*
3706      * What we received is the "received bitmap".  Invert it to use as
3707      * the initial dirty bitmap for this ramblock.
3708      */
3709     bitmap_complement(block->bmap, block->bmap, nbits);
3710 
3711     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3712 
3713     /*
3714      * We have successfully synced the bitmap for the current ramblock.
3715      * If this is the last one to sync, we need to notify the main send thread.
3716      */
3717     ram_dirty_bitmap_reload_notify(s);
3718 
3719     ret = 0;
3720 out:
3721     g_free(le_bitmap);
3722     return ret;
3723 }
3724 
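/*
 * Prepare to resume a paused postcopy migration: pull the received bitmaps
 * back from the destination (ram_dirty_bitmap_sync_all), then rebase the
 * RAMState on them for the resumed iteration (ram_state_resume_prepare).
 */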
3725 static int ram_resume_prepare(MigrationState *s, void *opaque)
3726 {
3727     RAMState *rs = *(RAMState **)opaque;
3728     int ret;
3729 
3730     ret = ram_dirty_bitmap_sync_all(s, rs);
3731     if (ret) {
3732         return ret;
3733     }
3734 
3735     ram_state_resume_prepare(rs, s->to_dst_file);
3736 
3737     return 0;
3738 }
3739 
3740 static SaveVMHandlers savevm_ram_handlers = {
3741     .save_setup = ram_save_setup,
3742     .save_live_iterate = ram_save_iterate,
3743     .save_live_complete_postcopy = ram_save_complete,
3744     .save_live_complete_precopy = ram_save_complete,
3745     .has_postcopy = ram_has_postcopy,
3746     .save_live_pending = ram_save_pending,
3747     .load_state = ram_load,
3748     .save_cleanup = ram_save_cleanup,
3749     .load_setup = ram_load_setup,
3750     .load_cleanup = ram_load_cleanup,
3751     .resume_prepare = ram_resume_prepare,
3752 };
3753 
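/*
 * Register the "ram" savevm section (version 4, matching the check in
 * ram_load()) so that RAM is migrated through the handlers above.
 */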
3754 void ram_mig_init(void)
3755 {
3756     qemu_mutex_init(&XBZRLE.lock);
3757     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3758 }
3759