xref: /openbmc/qemu/migration/ram.c (revision be99a9a0)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58 
59 /***********************************************************/
60 /* ram save/restore */
61 
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63  * worked for pages that were filled with the same char.  We switched
64  * it to only search for the zero value.  And to avoid confusion with
65  * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
66  */
67 
68 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO     0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE     0x08
72 #define RAM_SAVE_FLAG_EOS      0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE   0x40
75 /* 0x80 is reserved in migration.h start with 0x100 next */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
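
/*
 * Rough wire-format sketch (not authoritative, see save_page_header()
 * below for the actual layout): every page record starts with a be64
 * word carrying the page offset within its RAMBlock, with the
 * RAM_SAVE_FLAG_* bits OR'ed into the low bits (the offset is
 * target-page aligned, so those bits are free).  When
 * RAM_SAVE_FLAG_CONTINUE is not set, a one-byte length and the RAMBlock
 * idstr follow; the payload after that depends on the flag (raw page,
 * single zero byte, XBZRLE data, or compressed data).
 */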
77 
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 {
80     return buffer_is_zero(p, size);
81 }
82 
83 XBZRLECacheStats xbzrle_counters;
84 
85 /* This struct contains the XBZRLE cache and a static page
86    used by the compression */
87 static struct {
88     /* buffer used for XBZRLE encoding */
89     uint8_t *encoded_buf;
90     /* buffer for storing page content */
91     uint8_t *current_buf;
92     /* Cache for XBZRLE, Protected by lock. */
93     PageCache *cache;
94     QemuMutex lock;
95     /* it will store a page full of zeros */
96     uint8_t *zero_target_page;
97     /* buffer used for XBZRLE decoding */
98     uint8_t *decoded_buf;
99 } XBZRLE;
100 
101 static void XBZRLE_cache_lock(void)
102 {
103     if (migrate_use_xbzrle())
104         qemu_mutex_lock(&XBZRLE.lock);
105 }
106 
107 static void XBZRLE_cache_unlock(void)
108 {
109     if (migrate_use_xbzrle())
110         qemu_mutex_unlock(&XBZRLE.lock);
111 }
112 
113 /**
114  * xbzrle_cache_resize: resize the xbzrle cache
115  *
116  * This function is called from qmp_migrate_set_cache_size in the main
117  * thread, possibly while a migration is in progress.  A running
118  * migration may be using the cache and might finish during this call,
119  * hence changes to the cache are protected by the XBZRLE.lock mutex.
120  *
121  * Returns 0 for success or -1 for error
122  *
123  * @new_size: new cache size
124  * @errp: set to the failure reason if the check failed
125  */
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 {
128     PageCache *new_cache;
129     int64_t ret = 0;
130 
131     /* Check for truncation */
132     if (new_size != (size_t)new_size) {
133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134                    "exceeding address space");
135         return -1;
136     }
137 
138     if (new_size == migrate_xbzrle_cache_size()) {
139         /* nothing to do */
140         return 0;
141     }
142 
143     XBZRLE_cache_lock();
144 
145     if (XBZRLE.cache != NULL) {
146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147         if (!new_cache) {
148             ret = -1;
149             goto out;
150         }
151 
152         cache_fini(XBZRLE.cache);
153         XBZRLE.cache = new_cache;
154     }
155 out:
156     XBZRLE_cache_unlock();
157     return ret;
158 }
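
/*
 * Non-authoritative context note: the resize above is typically reached
 * from the monitor, e.g. via the QMP command "migrate-set-cache-size" or
 * the "xbzrle-cache-size" migration parameter, both of which end up in
 * xbzrle_cache_resize() on the main thread while the migration thread may
 * still be encoding pages through the same cache.
 */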
159 
160 static bool ramblock_is_ignored(RAMBlock *block)
161 {
162     return !qemu_ram_is_migratable(block) ||
163            (migrate_ignore_shared() && qemu_ram_is_shared(block));
164 }
165 
166 /* Should be holding either ram_list.mutex, or the RCU lock. */
167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
168     INTERNAL_RAMBLOCK_FOREACH(block)                   \
169         if (ramblock_is_ignored(block)) {} else
170 
171 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
172     INTERNAL_RAMBLOCK_FOREACH(block)                   \
173         if (!qemu_ram_is_migratable(block)) {} else
174 
175 #undef RAMBLOCK_FOREACH
176 
177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178 {
179     RAMBlock *block;
180     int ret = 0;
181 
182     RCU_READ_LOCK_GUARD();
183 
184     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185         ret = func(block, opaque);
186         if (ret) {
187             break;
188         }
189     }
190     return ret;
191 }
192 
193 static void ramblock_recv_map_init(void)
194 {
195     RAMBlock *rb;
196 
197     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
198         assert(!rb->receivedmap);
199         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200     }
201 }
202 
203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 {
205     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206                     rb->receivedmap);
207 }
208 
209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 {
211     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212 }
213 
214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 {
216     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217 }
218 
219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220                                     size_t nr)
221 {
222     bitmap_set_atomic(rb->receivedmap,
223                       ramblock_recv_bitmap_offset(host_addr, rb),
224                       nr);
225 }
226 
227 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
228 
229 /*
230  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231  *
232  * Returns >0 if success with sent bytes, or <0 if error.
233  */
234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235                                   const char *block_name)
236 {
237     RAMBlock *block = qemu_ram_block_by_name(block_name);
238     unsigned long *le_bitmap, nbits;
239     uint64_t size;
240 
241     if (!block) {
242         error_report("%s: invalid block name: %s", __func__, block_name);
243         return -1;
244     }
245 
246     nbits = block->used_length >> TARGET_PAGE_BITS;
247 
248     /*
249      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
250      * machines we may need 4 more bytes for padding (see below
251      * comment). So extend it a bit beforehand.
252      */
253     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254 
255     /*
256      * Always use little endian when sending the bitmap. This is
257      * required so that it works even when source and destination VMs
258      * are not using the same endianness. (Note: big endian won't work.)
259      */
260     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261 
262     /* Size of the bitmap, in bytes */
263     size = DIV_ROUND_UP(nbits, 8);
264 
265     /*
266      * size is always aligned to 8 bytes for 64bit machines, but that
267      * may not be the case for 32bit machines. We need this padding to
268      * make sure the migration can survive even between 32bit and
269      * 64bit machines.
270      */
271     size = ROUND_UP(size, 8);
272 
273     qemu_put_be64(file, size);
274     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275     /*
276      * Mark the end, in case the middle part is screwed up due to
277      * some "mysterious" reason.
278      */
279     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280     qemu_fflush(file);
281 
282     g_free(le_bitmap);
283 
284     if (qemu_file_get_error(file)) {
285         return qemu_file_get_error(file);
286     }
287 
288     return size + sizeof(size);
289 }
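
/*
 * Sketch of what the peer reading this stream is expected to do (the
 * receiving side lives elsewhere, so treat this as an assumption rather
 * than the authoritative contract):
 *
 *   size   = qemu_get_be64(file);         // padded bitmap size in bytes
 *   qemu_get_buffer(file, buf, size);     // little-endian bitmap
 *   ending = qemu_get_be64(file);         // trailing marker
 *   assert(ending == RAMBLOCK_RECV_BITMAP_ENDING);
 */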
290 
291 /*
292  * An outstanding page request, on the source, having been received
293  * and queued
294  */
295 struct RAMSrcPageRequest {
296     RAMBlock *rb;
297     hwaddr    offset;
298     hwaddr    len;
299 
300     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301 };
302 
303 /* State of RAM for migration */
304 struct RAMState {
305     /* QEMUFile used for this migration */
306     QEMUFile *f;
307     /* Last block that we have visited searching for dirty pages */
308     RAMBlock *last_seen_block;
309     /* Last block from where we have sent data */
310     RAMBlock *last_sent_block;
311     /* Last dirty target page we have sent */
312     ram_addr_t last_page;
313     /* last ram version we have seen */
314     uint32_t last_version;
315     /* We are in the first round */
316     bool ram_bulk_stage;
317     /* The free page optimization is enabled */
318     bool fpo_enabled;
319     /* Number of times we have dirtied too many pages */
320     int dirty_rate_high_cnt;
321     /* these variables are used for bitmap sync */
322     /* last time we did a full bitmap_sync */
323     int64_t time_last_bitmap_sync;
324     /* bytes transferred as of the last bitmap sync */
325     uint64_t bytes_xfer_prev;
326     /* number of dirty pages since start_time */
327     uint64_t num_dirty_pages_period;
328     /* xbzrle misses since the beginning of the period */
329     uint64_t xbzrle_cache_miss_prev;
330     /* Amount of xbzrle pages since the beginning of the period */
331     uint64_t xbzrle_pages_prev;
332     /* Amount of xbzrle encoded bytes since the beginning of the period */
333     uint64_t xbzrle_bytes_prev;
334 
335     /* compression statistics since the beginning of the period */
336     /* number of times no free thread was available to compress data */
337     uint64_t compress_thread_busy_prev;
338     /* amount of bytes after compression */
339     uint64_t compressed_size_prev;
340     /* amount of compressed pages */
341     uint64_t compress_pages_prev;
342 
343     /* total handled target pages at the beginning of period */
344     uint64_t target_page_count_prev;
345     /* total handled target pages since start */
346     uint64_t target_page_count;
347     /* number of dirty bits in the bitmap */
348     uint64_t migration_dirty_pages;
349     /* Protects modification of the bitmap and migration dirty pages */
350     QemuMutex bitmap_mutex;
351     /* The RAMBlock used in the last src_page_requests */
352     RAMBlock *last_req_rb;
353     /* Queue of outstanding page requests from the destination */
354     QemuMutex src_page_req_mutex;
355     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
356 };
357 typedef struct RAMState RAMState;
358 
359 static RAMState *ram_state;
360 
361 static NotifierWithReturnList precopy_notifier_list;
362 
363 void precopy_infrastructure_init(void)
364 {
365     notifier_with_return_list_init(&precopy_notifier_list);
366 }
367 
368 void precopy_add_notifier(NotifierWithReturn *n)
369 {
370     notifier_with_return_list_add(&precopy_notifier_list, n);
371 }
372 
373 void precopy_remove_notifier(NotifierWithReturn *n)
374 {
375     notifier_with_return_remove(n);
376 }
377 
378 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
379 {
380     PrecopyNotifyData pnd;
381     pnd.reason = reason;
382     pnd.errp = errp;
383 
384     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
385 }
386 
387 void precopy_enable_free_page_optimization(void)
388 {
389     if (!ram_state) {
390         return;
391     }
392 
393     ram_state->fpo_enabled = true;
394 }
395 
396 uint64_t ram_bytes_remaining(void)
397 {
398     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
399                        0;
400 }
401 
402 MigrationStats ram_counters;
403 
404 /* used by the search for pages to send */
405 struct PageSearchStatus {
406     /* Current block being searched */
407     RAMBlock    *block;
408     /* Current page to search from */
409     unsigned long page;
410     /* Set once we wrap around */
411     bool         complete_round;
412 };
413 typedef struct PageSearchStatus PageSearchStatus;
414 
415 CompressionStats compression_counters;
416 
417 struct CompressParam {
418     bool done;
419     bool quit;
420     bool zero_page;
421     QEMUFile *file;
422     QemuMutex mutex;
423     QemuCond cond;
424     RAMBlock *block;
425     ram_addr_t offset;
426 
427     /* internally used fields */
428     z_stream stream;
429     uint8_t *originbuf;
430 };
431 typedef struct CompressParam CompressParam;
432 
433 struct DecompressParam {
434     bool done;
435     bool quit;
436     QemuMutex mutex;
437     QemuCond cond;
438     void *des;
439     uint8_t *compbuf;
440     int len;
441     z_stream stream;
442 };
443 typedef struct DecompressParam DecompressParam;
444 
445 static CompressParam *comp_param;
446 static QemuThread *compress_threads;
447 /* comp_done_cond is used to wake up the migration thread when
448  * one of the compression threads has finished the compression.
449  * comp_done_lock is used together with comp_done_cond.
450  */
451 static QemuMutex comp_done_lock;
452 static QemuCond comp_done_cond;
453 /* The empty QEMUFileOps will be used by file in CompressParam */
454 static const QEMUFileOps empty_ops = { };
455 
456 static QEMUFile *decomp_file;
457 static DecompressParam *decomp_param;
458 static QemuThread *decompress_threads;
459 static QemuMutex decomp_done_lock;
460 static QemuCond decomp_done_cond;
461 
462 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
463                                  ram_addr_t offset, uint8_t *source_buf);
464 
465 static void *do_data_compress(void *opaque)
466 {
467     CompressParam *param = opaque;
468     RAMBlock *block;
469     ram_addr_t offset;
470     bool zero_page;
471 
472     qemu_mutex_lock(&param->mutex);
473     while (!param->quit) {
474         if (param->block) {
475             block = param->block;
476             offset = param->offset;
477             param->block = NULL;
478             qemu_mutex_unlock(&param->mutex);
479 
480             zero_page = do_compress_ram_page(param->file, &param->stream,
481                                              block, offset, param->originbuf);
482 
483             qemu_mutex_lock(&comp_done_lock);
484             param->done = true;
485             param->zero_page = zero_page;
486             qemu_cond_signal(&comp_done_cond);
487             qemu_mutex_unlock(&comp_done_lock);
488 
489             qemu_mutex_lock(&param->mutex);
490         } else {
491             qemu_cond_wait(&param->cond, &param->mutex);
492         }
493     }
494     qemu_mutex_unlock(&param->mutex);
495 
496     return NULL;
497 }
498 
499 static void compress_threads_save_cleanup(void)
500 {
501     int i, thread_count;
502 
503     if (!migrate_use_compression() || !comp_param) {
504         return;
505     }
506 
507     thread_count = migrate_compress_threads();
508     for (i = 0; i < thread_count; i++) {
509         /*
510          * we use it as an indicator which shows whether the thread is
511          * properly initialized or not
512          */
513         if (!comp_param[i].file) {
514             break;
515         }
516 
517         qemu_mutex_lock(&comp_param[i].mutex);
518         comp_param[i].quit = true;
519         qemu_cond_signal(&comp_param[i].cond);
520         qemu_mutex_unlock(&comp_param[i].mutex);
521 
522         qemu_thread_join(compress_threads + i);
523         qemu_mutex_destroy(&comp_param[i].mutex);
524         qemu_cond_destroy(&comp_param[i].cond);
525         deflateEnd(&comp_param[i].stream);
526         g_free(comp_param[i].originbuf);
527         qemu_fclose(comp_param[i].file);
528         comp_param[i].file = NULL;
529     }
530     qemu_mutex_destroy(&comp_done_lock);
531     qemu_cond_destroy(&comp_done_cond);
532     g_free(compress_threads);
533     g_free(comp_param);
534     compress_threads = NULL;
535     comp_param = NULL;
536 }
537 
538 static int compress_threads_save_setup(void)
539 {
540     int i, thread_count;
541 
542     if (!migrate_use_compression()) {
543         return 0;
544     }
545     thread_count = migrate_compress_threads();
546     compress_threads = g_new0(QemuThread, thread_count);
547     comp_param = g_new0(CompressParam, thread_count);
548     qemu_cond_init(&comp_done_cond);
549     qemu_mutex_init(&comp_done_lock);
550     for (i = 0; i < thread_count; i++) {
551         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
552         if (!comp_param[i].originbuf) {
553             goto exit;
554         }
555 
556         if (deflateInit(&comp_param[i].stream,
557                         migrate_compress_level()) != Z_OK) {
558             g_free(comp_param[i].originbuf);
559             goto exit;
560         }
561 
562         /* comp_param[i].file is just used as a dummy buffer to save data,
563          * set its ops to empty.
564          */
565         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
566         comp_param[i].done = true;
567         comp_param[i].quit = false;
568         qemu_mutex_init(&comp_param[i].mutex);
569         qemu_cond_init(&comp_param[i].cond);
570         qemu_thread_create(compress_threads + i, "compress",
571                            do_data_compress, comp_param + i,
572                            QEMU_THREAD_JOINABLE);
573     }
574     return 0;
575 
576 exit:
577     compress_threads_save_cleanup();
578     return -1;
579 }
580 
581 /**
582  * save_page_header: write page header to wire
583  *
584  * If the block differs from the last sent block, it also writes the block identification
585  *
586  * Returns the number of bytes written
587  *
588  * @f: QEMUFile where to send the data
589  * @block: block that contains the page we want to send
590  * @offset: offset inside the block for the page
591  *          in the lower bits, it contains flags
592  */
593 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
594                                ram_addr_t offset)
595 {
596     size_t size, len;
597 
598     if (block == rs->last_sent_block) {
599         offset |= RAM_SAVE_FLAG_CONTINUE;
600     }
601     qemu_put_be64(f, offset);
602     size = 8;
603 
604     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
605         len = strlen(block->idstr);
606         qemu_put_byte(f, len);
607         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
608         size += 1 + len;
609         rs->last_sent_block = block;
610     }
611     return size;
612 }
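
/*
 * Worked example (assuming the 8-byte be64 offset/flags header): the first
 * page sent from a block named "pc.ram" costs 8 + 1 + strlen("pc.ram") = 15
 * header bytes, while every following page from the same block carries the
 * RAM_SAVE_FLAG_CONTINUE bit and costs only the 8-byte header.
 */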
613 
614 /**
615  * mig_throttle_guest_down: throttle down the guest
616  *
617  * Reduce the amount of guest CPU execution to hopefully slow down memory
618  * writes. If guest dirty memory rate is reduced below the rate at
619  * which we can transfer pages to the destination then we should be
620  * able to complete migration. Some workloads dirty memory way too
621  * fast and will not effectively converge, even with auto-converge.
622  */
623 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
624                                     uint64_t bytes_dirty_threshold)
625 {
626     MigrationState *s = migrate_get_current();
627     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
628     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
629     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
630     int pct_max = s->parameters.max_cpu_throttle;
631 
632     uint64_t throttle_now = cpu_throttle_get_percentage();
633     uint64_t cpu_now, cpu_ideal, throttle_inc;
634 
635     /* We have not started throttling yet. Let's start it. */
636     if (!cpu_throttle_active()) {
637         cpu_throttle_set(pct_initial);
638     } else {
639         /* Throttling already on, just increase the rate */
640         if (!pct_tailslow) {
641             throttle_inc = pct_increment;
642         } else {
643             /* Compute the ideal CPU percentage used by Guest, which may
644              * make the dirty rate match the dirty rate threshold. */
645             cpu_now = 100 - throttle_now;
646             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
647                         bytes_dirty_period);
648             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
649         }
650         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
651     }
652 }
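
/*
 * Worked example for the tailslow path above (numbers are illustrative):
 * with a current throttle of 20% the guest gets cpu_now = 80% of the CPU;
 * if the dirty threshold is half of what was dirtied in the period, then
 * cpu_ideal = 80 * 0.5 = 40, so the throttle is raised by
 * MIN(80 - 40, cpu_throttle_increment), capped at max_cpu_throttle.
 */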
653 
654 /**
655  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
656  *
657  * @rs: current RAM state
658  * @current_addr: address for the zero page
659  *
660  * Update the xbzrle cache to reflect a page that's been sent as all 0.
661  * The important thing is that a stale (not-yet-0'd) page be replaced
662  * by the new data.
663  * As a bonus, if the page wasn't in the cache it gets added so that
664  * when a small write is made into the 0'd page it gets XBZRLE sent.
665  */
666 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
667 {
668     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
669         return;
670     }
671 
672     /* We don't care if this fails to allocate a new cache page
673      * as long as it updated an old one */
674     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
675                  ram_counters.dirty_sync_count);
676 }
677 
678 #define ENCODING_FLAG_XBZRLE 0x1
679 
680 /**
681  * save_xbzrle_page: compress and send current page
682  *
683  * Returns: 1 means that we wrote the page
684  *          0 means that page is identical to the one already sent
685  *          -1 means that xbzrle would be longer than normal
686  *
687  * @rs: current RAM state
688  * @current_data: pointer to the address of the page contents
689  * @current_addr: addr of the page
690  * @block: block that contains the page we want to send
691  * @offset: offset inside the block for the page
692  * @last_stage: if we are at the completion stage
693  */
694 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
695                             ram_addr_t current_addr, RAMBlock *block,
696                             ram_addr_t offset, bool last_stage)
697 {
698     int encoded_len = 0, bytes_xbzrle;
699     uint8_t *prev_cached_page;
700 
701     if (!cache_is_cached(XBZRLE.cache, current_addr,
702                          ram_counters.dirty_sync_count)) {
703         xbzrle_counters.cache_miss++;
704         if (!last_stage) {
705             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
706                              ram_counters.dirty_sync_count) == -1) {
707                 return -1;
708             } else {
709                 /* update *current_data when the page has been
710                    inserted into cache */
711                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
712             }
713         }
714         return -1;
715     }
716 
717     /*
718      * Reaching here means the page has hit the xbzrle cache, no matter what
719      * encoding result it is (normal encoding, overflow or skipping the page),
720      * count the page as encoded. This is used to calculate the encoding rate.
721      *
722      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
723      * 2nd page turns out to be skipped (i.e. no new bytes written to the
724      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
725      * skipped page included. In this way, the encoding rate can tell if the
726      * guest page is good for xbzrle encoding.
727      */
728     xbzrle_counters.pages++;
729     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
730 
731     /* save current buffer into memory */
732     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
733 
734     /* XBZRLE encoding (if there is no overflow) */
735     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
736                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
737                                        TARGET_PAGE_SIZE);
738 
739     /*
740      * Update the cache contents, so that it corresponds to the data
741      * sent, in all cases except where we skip the page.
742      */
743     if (!last_stage && encoded_len != 0) {
744         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
745         /*
746          * In the case where we couldn't compress, ensure that the caller
747          * sends the data from the cache, since the guest might have
748          * changed the RAM since we copied it.
749          */
750         *current_data = prev_cached_page;
751     }
752 
753     if (encoded_len == 0) {
754         trace_save_xbzrle_page_skipping();
755         return 0;
756     } else if (encoded_len == -1) {
757         trace_save_xbzrle_page_overflow();
758         xbzrle_counters.overflow++;
759         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
760         return -1;
761     }
762 
763     /* Send XBZRLE based compressed page */
764     bytes_xbzrle = save_page_header(rs, rs->f, block,
765                                     offset | RAM_SAVE_FLAG_XBZRLE);
766     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
767     qemu_put_be16(rs->f, encoded_len);
768     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
769     bytes_xbzrle += encoded_len + 1 + 2;
770     /*
771      * Like compressed_size (please see update_compress_thread_counts),
772      * the xbzrle encoded bytes don't count the 8 byte header with
773      * RAM_SAVE_FLAG_CONTINUE.
774      */
775     xbzrle_counters.bytes += bytes_xbzrle - 8;
776     ram_counters.transferred += bytes_xbzrle;
777 
778     return 1;
779 }
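
/*
 * Byte-accounting note for the XBZRLE path above: bytes_xbzrle is the page
 * header plus 1 byte (ENCODING_FLAG_XBZRLE) + 2 bytes (be16 length) +
 * encoded_len, while xbzrle_counters.bytes deliberately excludes the 8-byte
 * offset/flags header so it only reflects the encoded payload.
 */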
780 
781 /**
782  * migration_bitmap_find_dirty: find the next dirty page from start
783  *
784  * Returns the page offset within memory region of the start of a dirty page
785  *
786  * @rs: current RAM state
787  * @rb: RAMBlock where to search for dirty pages
788  * @start: page where we start the search
789  */
790 static inline
791 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
792                                           unsigned long start)
793 {
794     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
795     unsigned long *bitmap = rb->bmap;
796     unsigned long next;
797 
798     if (ramblock_is_ignored(rb)) {
799         return size;
800     }
801 
802     /*
803      * When the free page optimization is enabled, we need to check the bitmap
804      * to send the non-free pages rather than all the pages in the bulk stage.
805      */
806     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
807         next = start + 1;
808     } else {
809         next = find_next_bit(bitmap, size, start);
810     }
811 
812     return next;
813 }
814 
815 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
816                                                 RAMBlock *rb,
817                                                 unsigned long page)
818 {
819     bool ret;
820 
821     qemu_mutex_lock(&rs->bitmap_mutex);
822 
823     /*
824      * Clear the dirty bitmap if needed.  This _must_ be called before we
825      * send any of the pages in the chunk, because we need to make sure
826      * we can capture further page content changes when we sync the dirty
827      * log the next time.  So as long as we are going to send any of
828      * the pages in the chunk we clear the remote dirty bitmap for all of them.
829      * Clearing it earlier won't be a problem, but clearing it too late will.
830      */
831     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
832         uint8_t shift = rb->clear_bmap_shift;
833         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
834         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
835 
836         /*
837          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... it
838          * can make things easier sometimes since then the start address
839          * of the small chunk will always be 64-page aligned, so the
840          * bitmap will always be aligned to unsigned long.  We should
841          * even be able to remove this restriction but I'm simply
842          * keeping it.
843          */
844         assert(shift >= 6);
845         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
846         memory_region_clear_dirty_bitmap(rb->mr, start, size);
847     }
848 
849     ret = test_and_clear_bit(page, rb->bmap);
850 
851     if (ret) {
852         rs->migration_dirty_pages--;
853     }
854     qemu_mutex_unlock(&rs->bitmap_mutex);
855 
856     return ret;
857 }
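
/*
 * Chunk-size example for the clear_bmap path above (illustrative only):
 * with the minimum shift of 6 and 4KiB target pages, one clear_bmap bit
 * covers 1 << (12 + 6) = 256KiB, i.e. 64 target pages, which is why the
 * chunk start is always aligned to an unsigned long worth of bitmap bits.
 */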
858 
859 /* Called with RCU critical section */
860 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
861 {
862     uint64_t new_dirty_pages =
863         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
864 
865     rs->migration_dirty_pages += new_dirty_pages;
866     rs->num_dirty_pages_period += new_dirty_pages;
867 }
868 
869 /**
870  * ram_pagesize_summary: calculate all the pagesizes of a VM
871  *
872  * Returns a summary bitmap of the page sizes of all RAMBlocks
873  *
874  * For VMs with just normal pages this is equivalent to the host page
875  * size. If it has some huge pages then it's the OR of all the
876  * different page sizes.
877  */
878 uint64_t ram_pagesize_summary(void)
879 {
880     RAMBlock *block;
881     uint64_t summary = 0;
882 
883     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
884         summary |= block->page_size;
885     }
886 
887     return summary;
888 }
889 
890 uint64_t ram_get_total_transferred_pages(void)
891 {
892     return  ram_counters.normal + ram_counters.duplicate +
893                 compression_counters.pages + xbzrle_counters.pages;
894 }
895 
896 static void migration_update_rates(RAMState *rs, int64_t end_time)
897 {
898     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
899     double compressed_size;
900 
901     /* calculate period counters */
902     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
903                 / (end_time - rs->time_last_bitmap_sync);
904 
905     if (!page_count) {
906         return;
907     }
908 
909     if (migrate_use_xbzrle()) {
910         double encoded_size, unencoded_size;
911 
912         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
913             rs->xbzrle_cache_miss_prev) / page_count;
914         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
915         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
916                          TARGET_PAGE_SIZE;
917         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
918         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
919             xbzrle_counters.encoding_rate = 0;
920         } else {
921             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
922         }
923         rs->xbzrle_pages_prev = xbzrle_counters.pages;
924         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
925     }
926 
927     if (migrate_use_compression()) {
928         compression_counters.busy_rate = (double)(compression_counters.busy -
929             rs->compress_thread_busy_prev) / page_count;
930         rs->compress_thread_busy_prev = compression_counters.busy;
931 
932         compressed_size = compression_counters.compressed_size -
933                           rs->compressed_size_prev;
934         if (compressed_size) {
935             double uncompressed_size = (compression_counters.pages -
936                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
937 
938             /* Compression-Ratio = Uncompressed-size / Compressed-size */
939             compression_counters.compression_rate =
940                                         uncompressed_size / compressed_size;
941 
942             rs->compress_pages_prev = compression_counters.pages;
943             rs->compressed_size_prev = compression_counters.compressed_size;
944         }
945     }
946 }
947 
948 static void migration_trigger_throttle(RAMState *rs)
949 {
950     MigrationState *s = migrate_get_current();
951     uint64_t threshold = s->parameters.throttle_trigger_threshold;
952 
953     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
954     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
955     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
956 
957     /* During block migration the auto-converge logic incorrectly detects
958      * that ram migration makes no progress. Avoid this by disabling the
959      * throttling logic during the bulk phase of block migration. */
960     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
961         /* The following detection logic can be refined later. For now:
962            Check to see if the ratio between dirtied bytes and the approx.
963            amount of bytes that just got transferred since the last time
964            we were in this routine reaches the threshold. If that happens
965            twice, start or increase throttling. */
966 
967         if ((bytes_dirty_period > bytes_dirty_threshold) &&
968             (++rs->dirty_rate_high_cnt >= 2)) {
969             trace_migration_throttle();
970             rs->dirty_rate_high_cnt = 0;
971             mig_throttle_guest_down(bytes_dirty_period,
972                                     bytes_dirty_threshold);
973         }
974     }
975 }
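
/*
 * Numeric example for the trigger above (the values are assumptions; the
 * real threshold comes from the throttle-trigger-threshold parameter):
 * with a threshold of 50 and 100MB transferred in the period, the dirty
 * threshold is 50MB; the second time the guest dirties more than that in
 * a period, mig_throttle_guest_down() is invoked.
 */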
976 
977 static void migration_bitmap_sync(RAMState *rs)
978 {
979     RAMBlock *block;
980     int64_t end_time;
981 
982     ram_counters.dirty_sync_count++;
983 
984     if (!rs->time_last_bitmap_sync) {
985         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
986     }
987 
988     trace_migration_bitmap_sync_start();
989     memory_global_dirty_log_sync();
990 
991     qemu_mutex_lock(&rs->bitmap_mutex);
992     WITH_RCU_READ_LOCK_GUARD() {
993         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
994             ramblock_sync_dirty_bitmap(rs, block);
995         }
996         ram_counters.remaining = ram_bytes_remaining();
997     }
998     qemu_mutex_unlock(&rs->bitmap_mutex);
999 
1000     memory_global_after_dirty_log_sync();
1001     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1002 
1003     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1004 
1005     /* more than 1 second = 1000 milliseconds */
1006     if (end_time > rs->time_last_bitmap_sync + 1000) {
1007         migration_trigger_throttle(rs);
1008 
1009         migration_update_rates(rs, end_time);
1010 
1011         rs->target_page_count_prev = rs->target_page_count;
1012 
1013         /* reset period counters */
1014         rs->time_last_bitmap_sync = end_time;
1015         rs->num_dirty_pages_period = 0;
1016         rs->bytes_xfer_prev = ram_counters.transferred;
1017     }
1018     if (migrate_use_events()) {
1019         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1020     }
1021 }
1022 
1023 static void migration_bitmap_sync_precopy(RAMState *rs)
1024 {
1025     Error *local_err = NULL;
1026 
1027     /*
1028      * The current notifier usage is just an optimization to migration, so we
1029      * don't stop the normal migration process in the error case.
1030      */
1031     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1032         error_report_err(local_err);
1033         local_err = NULL;
1034     }
1035 
1036     migration_bitmap_sync(rs);
1037 
1038     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1039         error_report_err(local_err);
1040     }
1041 }
1042 
1043 /**
1044  * save_zero_page_to_file: send the zero page to the file
1045  *
1046  * Returns the size of data written to the file, 0 means the page is not
1047  * a zero page
1048  *
1049  * @rs: current RAM state
1050  * @file: the file where the data is saved
1051  * @block: block that contains the page we want to send
1052  * @offset: offset inside the block for the page
1053  */
1054 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1055                                   RAMBlock *block, ram_addr_t offset)
1056 {
1057     uint8_t *p = block->host + offset;
1058     int len = 0;
1059 
1060     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1061         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1062         qemu_put_byte(file, 0);
1063         len += 1;
1064     }
1065     return len;
1066 }
1067 
1068 /**
1069  * save_zero_page: send the zero page to the stream
1070  *
1071  * Returns the number of pages written.
1072  *
1073  * @rs: current RAM state
1074  * @block: block that contains the page we want to send
1075  * @offset: offset inside the block for the page
1076  */
1077 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1078 {
1079     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1080 
1081     if (len) {
1082         ram_counters.duplicate++;
1083         ram_counters.transferred += len;
1084         return 1;
1085     }
1086     return -1;
1087 }
1088 
1089 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1090 {
1091     if (!migrate_release_ram() || !migration_in_postcopy()) {
1092         return;
1093     }
1094 
1095     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1096 }
1097 
1098 /*
1099  * @pages: the number of pages written by the control path,
1100  *        < 0 - error
1101  *        > 0 - number of pages written
1102  *
1103  * Return true if the page has been saved, otherwise false is returned.
1104  */
1105 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1106                               int *pages)
1107 {
1108     uint64_t bytes_xmit = 0;
1109     int ret;
1110 
1111     *pages = -1;
1112     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1113                                 &bytes_xmit);
1114     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1115         return false;
1116     }
1117 
1118     if (bytes_xmit) {
1119         ram_counters.transferred += bytes_xmit;
1120         *pages = 1;
1121     }
1122 
1123     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1124         return true;
1125     }
1126 
1127     if (bytes_xmit > 0) {
1128         ram_counters.normal++;
1129     } else if (bytes_xmit == 0) {
1130         ram_counters.duplicate++;
1131     }
1132 
1133     return true;
1134 }
1135 
1136 /*
1137  * directly send the page to the stream
1138  *
1139  * Returns the number of pages written.
1140  *
1141  * @rs: current RAM state
1142  * @block: block that contains the page we want to send
1143  * @offset: offset inside the block for the page
1144  * @buf: the page to be sent
1145  * @async: send the page asynchronously
1146  */
1147 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1148                             uint8_t *buf, bool async)
1149 {
1150     ram_counters.transferred += save_page_header(rs, rs->f, block,
1151                                                  offset | RAM_SAVE_FLAG_PAGE);
1152     if (async) {
1153         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1154                               migrate_release_ram() &
1155                               migration_in_postcopy());
1156     } else {
1157         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1158     }
1159     ram_counters.transferred += TARGET_PAGE_SIZE;
1160     ram_counters.normal++;
1161     return 1;
1162 }
1163 
1164 /**
1165  * ram_save_page: send the given page to the stream
1166  *
1167  * Returns the number of pages written.
1168  *          < 0 - error
1169  *          >=0 - Number of pages written - this might legally be 0
1170  *                if xbzrle noticed the page was the same.
1171  *
1172  * @rs: current RAM state
1173  * @block: block that contains the page we want to send
1174  * @offset: offset inside the block for the page
1175  * @last_stage: if we are at the completion stage
1176  */
1177 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1178 {
1179     int pages = -1;
1180     uint8_t *p;
1181     bool send_async = true;
1182     RAMBlock *block = pss->block;
1183     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1184     ram_addr_t current_addr = block->offset + offset;
1185 
1186     p = block->host + offset;
1187     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1188 
1189     XBZRLE_cache_lock();
1190     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1191         migrate_use_xbzrle()) {
1192         pages = save_xbzrle_page(rs, &p, current_addr, block,
1193                                  offset, last_stage);
1194         if (!last_stage) {
1195             /* Can't send this cached data async, since the cache page
1196              * might get updated before it gets to the wire
1197              */
1198             send_async = false;
1199         }
1200     }
1201 
1202     /* XBZRLE overflow or normal page */
1203     if (pages == -1) {
1204         pages = save_normal_page(rs, block, offset, p, send_async);
1205     }
1206 
1207     XBZRLE_cache_unlock();
1208 
1209     return pages;
1210 }
1211 
1212 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1213                                  ram_addr_t offset)
1214 {
1215     if (multifd_queue_page(rs->f, block, offset) < 0) {
1216         return -1;
1217     }
1218     ram_counters.normal++;
1219 
1220     return 1;
1221 }
1222 
1223 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1224                                  ram_addr_t offset, uint8_t *source_buf)
1225 {
1226     RAMState *rs = ram_state;
1227     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1228     bool zero_page = false;
1229     int ret;
1230 
1231     if (save_zero_page_to_file(rs, f, block, offset)) {
1232         zero_page = true;
1233         goto exit;
1234     }
1235 
1236     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1237 
1238     /*
1239      * copy it to an internal buffer to avoid it being modified by the VM,
1240      * so that we can catch errors during compression and
1241      * decompression
1242      */
1243     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1244     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1245     if (ret < 0) {
1246         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1247         error_report("compressed data failed!");
1248         return false;
1249     }
1250 
1251 exit:
1252     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1253     return zero_page;
1254 }
1255 
1256 static void
1257 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1258 {
1259     ram_counters.transferred += bytes_xmit;
1260 
1261     if (param->zero_page) {
1262         ram_counters.duplicate++;
1263         return;
1264     }
1265 
1266     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1267     compression_counters.compressed_size += bytes_xmit - 8;
1268     compression_counters.pages++;
1269 }
1270 
1271 static bool save_page_use_compression(RAMState *rs);
1272 
1273 static void flush_compressed_data(RAMState *rs)
1274 {
1275     int idx, len, thread_count;
1276 
1277     if (!save_page_use_compression(rs)) {
1278         return;
1279     }
1280     thread_count = migrate_compress_threads();
1281 
1282     qemu_mutex_lock(&comp_done_lock);
1283     for (idx = 0; idx < thread_count; idx++) {
1284         while (!comp_param[idx].done) {
1285             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1286         }
1287     }
1288     qemu_mutex_unlock(&comp_done_lock);
1289 
1290     for (idx = 0; idx < thread_count; idx++) {
1291         qemu_mutex_lock(&comp_param[idx].mutex);
1292         if (!comp_param[idx].quit) {
1293             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1294             /*
1295              * it's safe to fetch zero_page without holding comp_done_lock
1296              * as there is no further request submitted to the thread,
1297              * i.e., the thread should be waiting for a request at this point.
1298              */
1299             update_compress_thread_counts(&comp_param[idx], len);
1300         }
1301         qemu_mutex_unlock(&comp_param[idx].mutex);
1302     }
1303 }
1304 
1305 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1306                                        ram_addr_t offset)
1307 {
1308     param->block = block;
1309     param->offset = offset;
1310 }
1311 
1312 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1313                                            ram_addr_t offset)
1314 {
1315     int idx, thread_count, bytes_xmit = -1, pages = -1;
1316     bool wait = migrate_compress_wait_thread();
1317 
1318     thread_count = migrate_compress_threads();
1319     qemu_mutex_lock(&comp_done_lock);
1320 retry:
1321     for (idx = 0; idx < thread_count; idx++) {
1322         if (comp_param[idx].done) {
1323             comp_param[idx].done = false;
1324             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1325             qemu_mutex_lock(&comp_param[idx].mutex);
1326             set_compress_params(&comp_param[idx], block, offset);
1327             qemu_cond_signal(&comp_param[idx].cond);
1328             qemu_mutex_unlock(&comp_param[idx].mutex);
1329             pages = 1;
1330             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1331             break;
1332         }
1333     }
1334 
1335     /*
1336      * wait for the free thread if the user specifies 'compress-wait-thread',
1337      * otherwise we will post the page out in the main thread as a normal page.
1338      */
1339     if (pages < 0 && wait) {
1340         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1341         goto retry;
1342     }
1343     qemu_mutex_unlock(&comp_done_lock);
1344 
1345     return pages;
1346 }
1347 
1348 /**
1349  * find_dirty_block: find the next dirty page and update any state
1350  * associated with the search process.
1351  *
1352  * Returns true if a page is found
1353  *
1354  * @rs: current RAM state
1355  * @pss: data about the state of the current dirty page scan
1356  * @again: set to false if the search has scanned the whole of RAM
1357  */
1358 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1359 {
1360     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1361     if (pss->complete_round && pss->block == rs->last_seen_block &&
1362         pss->page >= rs->last_page) {
1363         /*
1364          * We've been once around the RAM and haven't found anything.
1365          * Give up.
1366          */
1367         *again = false;
1368         return false;
1369     }
1370     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1371         >= pss->block->used_length) {
1372         /* Didn't find anything in this RAM Block */
1373         pss->page = 0;
1374         pss->block = QLIST_NEXT_RCU(pss->block, next);
1375         if (!pss->block) {
1376             /*
1377              * If memory migration starts over, we will meet a dirtied page
1378              * which may still exist in the compression threads' ring, so we
1379              * should flush the compressed data to make sure the new page
1380              * is not overwritten by the old one in the destination.
1381              *
1382              * Also, if xbzrle is on, stop using the data compression at this
1383              * point. In theory, xbzrle can do better than compression.
1384              */
1385             flush_compressed_data(rs);
1386 
1387             /* Hit the end of the list */
1388             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1389             /* Flag that we've looped */
1390             pss->complete_round = true;
1391             rs->ram_bulk_stage = false;
1392         }
1393         /* Didn't find anything this time, but try again on the new block */
1394         *again = true;
1395         return false;
1396     } else {
1397         /* Can go around again, but... */
1398         *again = true;
1399         /* We've found something so probably don't need to */
1400         return true;
1401     }
1402 }
1403 
1404 /**
1405  * unqueue_page: gets a page of the queue
1406  *
1407  * Helper for 'get_queued_page' - gets a page off the queue
1408  *
1409  * Returns the block of the page (or NULL if none available)
1410  *
1411  * @rs: current RAM state
1412  * @offset: used to return the offset within the RAMBlock
1413  */
1414 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1415 {
1416     RAMBlock *block = NULL;
1417 
1418     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1419         return NULL;
1420     }
1421 
1422     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1423     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1424         struct RAMSrcPageRequest *entry =
1425                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1426         block = entry->rb;
1427         *offset = entry->offset;
1428 
1429         if (entry->len > TARGET_PAGE_SIZE) {
1430             entry->len -= TARGET_PAGE_SIZE;
1431             entry->offset += TARGET_PAGE_SIZE;
1432         } else {
1433             memory_region_unref(block->mr);
1434             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435             g_free(entry);
1436             migration_consume_urgent_request();
1437         }
1438     }
1439 
1440     return block;
1441 }
1442 
1443 /**
1444  * get_queued_page: unqueue a page from the postcopy requests
1445  *
1446  * Skips pages that are already sent (!dirty)
1447  *
1448  * Returns true if a queued page is found
1449  *
1450  * @rs: current RAM state
1451  * @pss: data about the state of the current dirty page scan
1452  */
1453 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1454 {
1455     RAMBlock  *block;
1456     ram_addr_t offset;
1457     bool dirty;
1458 
1459     do {
1460         block = unqueue_page(rs, &offset);
1461         /*
1462          * We're sending this page, and since it's postcopy nothing else
1463          * will dirty it, and we must make sure it doesn't get sent again
1464          * even if this queue request was received after the background
1465          * search already sent it.
1466          */
1467         if (block) {
1468             unsigned long page;
1469 
1470             page = offset >> TARGET_PAGE_BITS;
1471             dirty = test_bit(page, block->bmap);
1472             if (!dirty) {
1473                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1474                                                 page);
1475             } else {
1476                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1477             }
1478         }
1479 
1480     } while (block && !dirty);
1481 
1482     if (block) {
1483         /*
1484          * As soon as we start servicing pages out of order, we have
1485          * to kill the bulk stage, since the bulk stage assumes
1486          * (in migration_bitmap_find_and_reset_dirty) that every page is
1487          * dirty; that's no longer true.
1488          */
1489         rs->ram_bulk_stage = false;
1490 
1491         /*
1492          * We want the background search to continue from the queued page
1493          * since the guest is likely to want other pages near to the page
1494          * it just requested.
1495          */
1496         pss->block = block;
1497         pss->page = offset >> TARGET_PAGE_BITS;
1498 
1499         /*
1500          * This unqueued page would break the "one round" check, even if it
1501          * is really rare.
1502          */
1503         pss->complete_round = false;
1504     }
1505 
1506     return !!block;
1507 }
1508 
1509 /**
1510  * migration_page_queue_free: drop any remaining pages in the ram
1511  * request queue
1512  *
1513  * It should be empty at the end anyway, but in error cases there may
1514  * be some left.  In case any page is left, we drop it.
1515  *
1516  */
1517 static void migration_page_queue_free(RAMState *rs)
1518 {
1519     struct RAMSrcPageRequest *mspr, *next_mspr;
1520     /* This queue generally should be empty - but in the case of a failed
1521      * migration there may be some leftovers.
1522      */
1523     RCU_READ_LOCK_GUARD();
1524     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1525         memory_region_unref(mspr->rb->mr);
1526         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1527         g_free(mspr);
1528     }
1529 }
1530 
1531 /**
1532  * ram_save_queue_pages: queue the page for transmission
1533  *
1534  * A request from postcopy destination for example.
1535  *
1536  * Returns zero on success or negative on error
1537  *
1538  * @rbname: Name of the RAMBlock of the request. NULL means the
1539  *          same as the last one.
1540  * @start: starting address from the start of the RAMBlock
1541  * @len: length (in bytes) to send
1542  */
1543 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1544 {
1545     RAMBlock *ramblock;
1546     RAMState *rs = ram_state;
1547 
1548     ram_counters.postcopy_requests++;
1549     RCU_READ_LOCK_GUARD();
1550 
1551     if (!rbname) {
1552         /* Reuse last RAMBlock */
1553         ramblock = rs->last_req_rb;
1554 
1555         if (!ramblock) {
1556             /*
1557              * Shouldn't happen, we can't reuse the last RAMBlock if
1558              * it's the 1st request.
1559              */
1560             error_report("ram_save_queue_pages no previous block");
1561             return -1;
1562         }
1563     } else {
1564         ramblock = qemu_ram_block_by_name(rbname);
1565 
1566         if (!ramblock) {
1567             /* We shouldn't be asked for a non-existent RAMBlock */
1568             error_report("ram_save_queue_pages no block '%s'", rbname);
1569             return -1;
1570         }
1571         rs->last_req_rb = ramblock;
1572     }
1573     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1574     if (start + len > ramblock->used_length) {
1575         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1576                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1577                      __func__, start, len, ramblock->used_length);
1578         return -1;
1579     }
1580 
1581     struct RAMSrcPageRequest *new_entry =
1582         g_malloc0(sizeof(struct RAMSrcPageRequest));
1583     new_entry->rb = ramblock;
1584     new_entry->offset = start;
1585     new_entry->len = len;
1586 
1587     memory_region_ref(ramblock->mr);
1588     qemu_mutex_lock(&rs->src_page_req_mutex);
1589     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1590     migration_make_urgent_request();
1591     qemu_mutex_unlock(&rs->src_page_req_mutex);
1592 
1593     return 0;
1594 }
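
/*
 * Non-authoritative note: these requests typically originate from the
 * postcopy destination, which sends page requests back over the return
 * path when the guest faults on a not-yet-received page; the return-path
 * handling on the source then calls ram_save_queue_pages() per request.
 */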
1595 
1596 static bool save_page_use_compression(RAMState *rs)
1597 {
1598     if (!migrate_use_compression()) {
1599         return false;
1600     }
1601 
1602     /*
1603      * If xbzrle is on, stop using the data compression after the first
1604      * round of migration even if compression is enabled. In theory,
1605      * xbzrle can do better than compression.
1606      */
1607     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1608         return true;
1609     }
1610 
1611     return false;
1612 }
1613 
1614 /*
1615  * Try to compress the page before posting it out; return true if the page
1616  * has been properly handled by compression, otherwise it needs other
1617  * paths to handle it.
1618  */
1619 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1620 {
1621     if (!save_page_use_compression(rs)) {
1622         return false;
1623     }
1624 
1625     /*
1626      * When starting the process of a new block, the first page of
1627      * the block should be sent out before other pages in the same
1628      * block, and all the pages in the last block should have been sent
1629      * out already.  Keeping this order is important, because the 'cont'
1630      * flag is used to avoid resending the block name.
1631      *
1632      * We post the first page as a normal page as compression will take
1633      * much CPU resource.
1634      */
1635     if (block != rs->last_sent_block) {
1636         flush_compressed_data(rs);
1637         return false;
1638     }
1639 
1640     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1641         return true;
1642     }
1643 
1644     compression_counters.busy++;
1645     return false;
1646 }
1647 
1648 /**
1649  * ram_save_target_page: save one target page
1650  *
1651  * Returns the number of pages written
1652  *
1653  * @rs: current RAM state
1654  * @pss: data about the page we want to send
1655  * @last_stage: if we are at the completion stage
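 *
 * The page is offered to each transport in turn: the control (e.g. RDMA)
 * path, multi-threaded compression, zero-page detection, multifd, and
 * finally the plain (possibly XBZRLE) ram_save_page() path.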
1656  */
1657 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1658                                 bool last_stage)
1659 {
1660     RAMBlock *block = pss->block;
1661     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1662     int res;
1663 
1664     if (control_save_page(rs, block, offset, &res)) {
1665         return res;
1666     }
1667 
1668     if (save_compress_page(rs, block, offset)) {
1669         return 1;
1670     }
1671 
1672     res = save_zero_page(rs, block, offset);
1673     if (res > 0) {
1674         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1675          * page would be stale
1676          */
1677         if (!save_page_use_compression(rs)) {
1678             XBZRLE_cache_lock();
1679             xbzrle_cache_zero_page(rs, block->offset + offset);
1680             XBZRLE_cache_unlock();
1681         }
1682         ram_release_pages(block->idstr, offset, res);
1683         return res;
1684     }
1685 
1686     /*
1687      * Do not use multifd for:
1688      * 1. Compression as the first page in the new block should be posted out
1689      *    before sending the compressed page
1690      * 2. In postcopy as one whole host page should be placed
1691      */
1692     if (!save_page_use_compression(rs) && migrate_use_multifd()
1693         && !migration_in_postcopy()) {
1694         return ram_save_multifd_page(rs, block, offset);
1695     }
1696 
1697     return ram_save_page(rs, pss, last_stage);
1698 }
1699 
1700 /**
1701  * ram_save_host_page: save a whole host page
1702  *
1703  * Starting at *offset send pages up to the end of the current host
1704  * page. It's valid for the initial offset to point into the middle of
1705  * a host page in which case the remainder of the hostpage is sent.
1706  * Only dirty target pages are sent. Note that the host page size may
1707  * be a huge page for this block.
1708  * The saving stops at the boundary of the used_length of the block
1709  * if the RAMBlock isn't a multiple of the host page size.
1710  *
1711  * Returns the number of pages written or negative on error
1712  *
1713  * @rs: current RAM state
1715  * @pss: data about the page we want to send
1716  * @last_stage: if we are at the completion stage
1717  */
1718 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1719                               bool last_stage)
1720 {
1721     int tmppages, pages = 0;
1722     size_t pagesize_bits =
1723         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1724 
1725     if (ramblock_is_ignored(pss->block)) {
1726         error_report("block %s should not be migrated !", pss->block->idstr);
1727         return 0;
1728     }
1729 
1730     do {
1731         /* Check if the page is dirty and, if it is, send it */
1732         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1733             pss->page++;
1734             continue;
1735         }
1736 
1737         tmppages = ram_save_target_page(rs, pss, last_stage);
1738         if (tmppages < 0) {
1739             return tmppages;
1740         }
1741 
1742         pages += tmppages;
1743         pss->page++;
1744         /* Allow rate limiting to happen in the middle of huge pages */
1745         migration_rate_limit();
1746     } while ((pss->page & (pagesize_bits - 1)) &&
1747              offset_in_ramblock(pss->block,
1748                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1749 
1750     /* The offset we leave with is the last one we looked at */
1751     pss->page--;
1752     return pages;
1753 }
1754 
1755 /**
1756  * ram_find_and_save_block: finds a dirty page and sends it to f
1757  *
1758  * Called within an RCU critical section.
1759  *
1760  * Returns the number of pages written where zero means no dirty pages,
1761  * or negative on error
1762  *
1763  * @rs: current RAM state
1764  * @last_stage: if we are at the completion stage
1765  *
1766  * On systems where host-page-size > target-page-size it will send all the
1767  * pages in a host page that are dirty.
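 *
 * Pages queued via ram_save_queue_pages() (for example postcopy requests)
 * are serviced before the linear scan over the dirty bitmap resumes from
 * the last visited block and page.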
1768  */
1769 
1770 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1771 {
1772     PageSearchStatus pss;
1773     int pages = 0;
1774     bool again, found;
1775 
1776     /* No dirty page as there is zero RAM */
1777     if (!ram_bytes_total()) {
1778         return pages;
1779     }
1780 
1781     pss.block = rs->last_seen_block;
1782     pss.page = rs->last_page;
1783     pss.complete_round = false;
1784 
1785     if (!pss.block) {
1786         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1787     }
1788 
1789     do {
1790         again = true;
1791         found = get_queued_page(rs, &pss);
1792 
1793         if (!found) {
1794             /* priority queue empty, so just search for something dirty */
1795             found = find_dirty_block(rs, &pss, &again);
1796         }
1797 
1798         if (found) {
1799             pages = ram_save_host_page(rs, &pss, last_stage);
1800         }
1801     } while (!pages && again);
1802 
1803     rs->last_seen_block = pss.block;
1804     rs->last_page = pss.page;
1805 
1806     return pages;
1807 }
1808 
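/*
 * acct_update_position: update accounting for pages written to @f
 *
 * Bumps the duplicate counter for zero pages; otherwise bumps the normal
 * page and transferred byte counters and advances the QEMUFile position.
 *
 * @f: QEMUFile the data was written to
 * @size: number of bytes written
 * @zero: true if the data was a run of zero pages
 */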
1809 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1810 {
1811     uint64_t pages = size / TARGET_PAGE_SIZE;
1812 
1813     if (zero) {
1814         ram_counters.duplicate += pages;
1815     } else {
1816         ram_counters.normal += pages;
1817         ram_counters.transferred += size;
1818         qemu_update_position(f, size);
1819     }
1820 }
1821 
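/*
 * Sum the used_length of every migratable RAMBlock; when @count_ignored is
 * false, blocks flagged to be ignored are left out of the total.
 */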
1822 static uint64_t ram_bytes_total_common(bool count_ignored)
1823 {
1824     RAMBlock *block;
1825     uint64_t total = 0;
1826 
1827     RCU_READ_LOCK_GUARD();
1828 
1829     if (count_ignored) {
1830         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1831             total += block->used_length;
1832         }
1833     } else {
1834         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1835             total += block->used_length;
1836         }
1837     }
1838     return total;
1839 }
1840 
1841 uint64_t ram_bytes_total(void)
1842 {
1843     return ram_bytes_total_common(false);
1844 }
1845 
1846 static void xbzrle_load_setup(void)
1847 {
1848     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1849 }
1850 
1851 static void xbzrle_load_cleanup(void)
1852 {
1853     g_free(XBZRLE.decoded_buf);
1854     XBZRLE.decoded_buf = NULL;
1855 }
1856 
1857 static void ram_state_cleanup(RAMState **rsp)
1858 {
1859     if (*rsp) {
1860         migration_page_queue_free(*rsp);
1861         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1862         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1863         g_free(*rsp);
1864         *rsp = NULL;
1865     }
1866 }
1867 
1868 static void xbzrle_cleanup(void)
1869 {
1870     XBZRLE_cache_lock();
1871     if (XBZRLE.cache) {
1872         cache_fini(XBZRLE.cache);
1873         g_free(XBZRLE.encoded_buf);
1874         g_free(XBZRLE.current_buf);
1875         g_free(XBZRLE.zero_target_page);
1876         XBZRLE.cache = NULL;
1877         XBZRLE.encoded_buf = NULL;
1878         XBZRLE.current_buf = NULL;
1879         XBZRLE.zero_target_page = NULL;
1880     }
1881     XBZRLE_cache_unlock();
1882 }
1883 
1884 static void ram_save_cleanup(void *opaque)
1885 {
1886     RAMState **rsp = opaque;
1887     RAMBlock *block;
1888 
1889     /* The caller must hold the iothread lock or be in a bottom half, so
1890      * there is no write race against the migration bitmap
1891      */
1892     memory_global_dirty_log_stop();
1893 
1894     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1895         g_free(block->clear_bmap);
1896         block->clear_bmap = NULL;
1897         g_free(block->bmap);
1898         block->bmap = NULL;
1899     }
1900 
1901     xbzrle_cleanup();
1902     compress_threads_save_cleanup();
1903     ram_state_cleanup(rsp);
1904 }
1905 
1906 static void ram_state_reset(RAMState *rs)
1907 {
1908     rs->last_seen_block = NULL;
1909     rs->last_sent_block = NULL;
1910     rs->last_page = 0;
1911     rs->last_version = ram_list.version;
1912     rs->ram_bulk_stage = true;
1913     rs->fpo_enabled = false;
1914 }
1915 
1916 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1917 
1918 /*
1919  * 'expected' is the value you expect the bitmap mostly to be full
1920  * of; it won't bother printing lines that are all this value.
1921  * If 'todump' is null the migration bitmap is dumped.
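 * Each output line covers up to 128 pages: set bits print as '1', clear
 * bits as '.', prefixed with the page index of the line in hex.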
1922  */
1923 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1924                            unsigned long pages)
1925 {
1926     int64_t cur;
1927     int64_t linelen = 128;
1928     char linebuf[129];
1929 
1930     for (cur = 0; cur < pages; cur += linelen) {
1931         int64_t curb;
1932         bool found = false;
1933         /*
1934          * Last line; catch the case where the line length
1935          * is longer than remaining ram
1936          */
1937         if (cur + linelen > pages) {
1938             linelen = pages - cur;
1939         }
1940         for (curb = 0; curb < linelen; curb++) {
1941             bool thisbit = test_bit(cur + curb, todump);
1942             linebuf[curb] = thisbit ? '1' : '.';
1943             found = found || (thisbit != expected);
1944         }
1945         if (found) {
1946             linebuf[curb] = '\0';
1947             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1948         }
1949     }
1950 }
1951 
1952 /* **** functions for postcopy ***** */
1953 
1954 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1955 {
1956     struct RAMBlock *block;
1957 
1958     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1959         unsigned long *bitmap = block->bmap;
1960         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1961         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1962 
1963         while (run_start < range) {
1964             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1965             ram_discard_range(block->idstr,
1966                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1967                               ((ram_addr_t)(run_end - run_start))
1968                                 << TARGET_PAGE_BITS);
1969             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1970         }
1971     }
1972 }
1973 
1974 /**
1975  * postcopy_send_discard_bm_ram: discard a RAMBlock
1976  *
1977  * Returns zero on success
1978  *
1979  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1980  *
1981  * @ms: current migration state
1982  * @block: RAMBlock to discard
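 *
 * Walks the block's dirty bitmap and emits one discard request per run of
 * contiguous dirty pages, expressed in target-page indexes.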
1983  */
1984 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1985 {
1986     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1987     unsigned long current;
1988     unsigned long *bitmap = block->bmap;
1989 
1990     for (current = 0; current < end; ) {
1991         unsigned long one = find_next_bit(bitmap, end, current);
1992         unsigned long zero, discard_length;
1993 
1994         if (one >= end) {
1995             break;
1996         }
1997 
1998         zero = find_next_zero_bit(bitmap, end, one + 1);
1999 
2000         if (zero >= end) {
2001             discard_length = end - one;
2002         } else {
2003             discard_length = zero - one;
2004         }
2005         postcopy_discard_send_range(ms, one, discard_length);
2006         current = one + discard_length;
2007     }
2008 
2009     return 0;
2010 }
2011 
2012 /**
2013  * postcopy_each_ram_send_discard: discard all RAMBlocks
2014  *
2015  * Returns 0 for success or negative for error
2016  *
2017  * Utility for the outgoing postcopy code.
2018  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2019  *   passing it bitmap indexes and name.
2020  * (qemu_ram_foreach_block ends up passing unscaled lengths
2021  *  which would mean postcopy code would have to deal with target page)
2022  *
2023  * @ms: current migration state
2024  */
2025 static int postcopy_each_ram_send_discard(MigrationState *ms)
2026 {
2027     struct RAMBlock *block;
2028     int ret;
2029 
2030     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2031         postcopy_discard_send_init(ms, block->idstr);
2032 
2033         /*
2034          * Postcopy sends chunks of bitmap over the wire, but it
2035          * just needs indexes at this point, which avoids it having
2036          * target-page-specific code.
2037          */
2038         ret = postcopy_send_discard_bm_ram(ms, block);
2039         postcopy_discard_send_finish(ms);
2040         if (ret) {
2041             return ret;
2042         }
2043     }
2044 
2045     return 0;
2046 }
2047 
2048 /**
2049  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2050  *
2051  * Helper for postcopy_chunk_hostpages; it's called twice to
2052  * canonicalize the two bitmaps, which are similar, but one is
2053  * inverted.
2054  *
2055  * Postcopy requires that all target pages in a hostpage are dirty or
2056  * clean, not a mix.  This function canonicalizes the bitmaps.
2057  *
2058  * @ms: current migration state
2059  * @block: block that contains the page we want to canonicalize
2060  */
2061 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2062 {
2063     RAMState *rs = ram_state;
2064     unsigned long *bitmap = block->bmap;
2065     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2066     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2067     unsigned long run_start;
2068 
2069     if (block->page_size == TARGET_PAGE_SIZE) {
2070         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2071         return;
2072     }
2073 
2074     /* Find a dirty page */
2075     run_start = find_next_bit(bitmap, pages, 0);
2076 
2077     while (run_start < pages) {
2078 
2079         /*
2080          * If the start of this run of pages is in the middle of a host
2081          * page, then we need to fixup this host page.
2082          */
2083         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2084             /* Find the end of this run */
2085             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2086             /*
2087              * If the end isn't at the start of a host page, then the
2088              * run doesn't finish at the end of a host page
2089              * and we need to discard.
2090              */
2091         }
2092 
2093         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2094             unsigned long page;
2095             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2096                                                              host_ratio);
2097             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2098 
2099             /* Clean up the bitmap */
2100             for (page = fixup_start_addr;
2101                  page < fixup_start_addr + host_ratio; page++) {
2102                 /*
2103                  * Remark them as dirty, updating the count for any pages
2104                  * that weren't previously dirty.
2105                  */
2106                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2107             }
2108         }
2109 
2110         /* Find the next dirty page for the next iteration */
2111         run_start = find_next_bit(bitmap, pages, run_start);
2112     }
2113 }
2114 
2115 /**
2116  * postcopy_chunk_hostpages: discard any partially sent host page
2117  *
2118  * Utility for the outgoing postcopy code.
2119  *
2120  * Discard any partially sent host-page size chunks, mark any partially
2121  * dirty host-page size chunks as all dirty.  In this case the host-page
2122  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2123  *
2124  * Returns zero on success
2125  *
2126  * @ms: current migration state
2127  * @block: block we want to work with
2128  */
2129 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2130 {
2131     postcopy_discard_send_init(ms, block->idstr);
2132 
2133     /*
2134      * Ensure that all partially dirty host pages are made fully dirty.
2135      */
2136     postcopy_chunk_hostpages_pass(ms, block);
2137 
2138     postcopy_discard_send_finish(ms);
2139     return 0;
2140 }
2141 
2142 /**
2143  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2144  *
2145  * Returns zero on success
2146  *
2147  * Transmit the set of pages to be discarded after precopy to the target;
2148  * these are pages that:
2149  *     a) Have been previously transmitted but are now dirty again
2150  *     b) Pages that have never been transmitted, this ensures that
2151  *        any pages on the destination that have been mapped by background
2152  *        tasks get discarded (transparent huge pages is the specific concern)
2153  * Hopefully this is pretty sparse
2154  *
2155  * @ms: current migration state
2156  */
2157 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2158 {
2159     RAMState *rs = ram_state;
2160     RAMBlock *block;
2161     int ret;
2162 
2163     RCU_READ_LOCK_GUARD();
2164 
2165     /* This should be our last sync, the src is now paused */
2166     migration_bitmap_sync(rs);
2167 
2168     /* Easiest way to make sure we don't resume in the middle of a host-page */
2169     rs->last_seen_block = NULL;
2170     rs->last_sent_block = NULL;
2171     rs->last_page = 0;
2172 
2173     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2174         /* Deal with TPS != HPS and huge pages */
2175         ret = postcopy_chunk_hostpages(ms, block);
2176         if (ret) {
2177             return ret;
2178         }
2179 
2180 #ifdef DEBUG_POSTCOPY
2181         ram_debug_dump_bitmap(block->bmap, true,
2182                               block->used_length >> TARGET_PAGE_BITS);
2183 #endif
2184     }
2185     trace_ram_postcopy_send_discard_bitmap();
2186 
2187     return postcopy_each_ram_send_discard(ms);
2188 }
2189 
2190 /**
2191  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2192  *
2193  * Returns zero on success
2194  *
2195  * @rbname: name of the RAMBlock of the request. NULL means the
2196  *          same as the last one.
2197  * @start: byte offset within the RAMBlock
2198  * @length: length (in bytes) to discard
2199  */
2200 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2201 {
2202     trace_ram_discard_range(rbname, start, length);
2203 
2204     RCU_READ_LOCK_GUARD();
2205     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2206 
2207     if (!rb) {
2208         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2209         return -1;
2210     }
2211 
2212     /*
2213      * On source VM, we don't need to update the received bitmap since
2214      * we don't even have one.
2215      */
2216     if (rb->receivedmap) {
2217         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2218                      length >> qemu_target_page_bits());
2219     }
2220 
2221     return ram_block_discard_range(rb, start, length);
2222 }
2223 
2224 /*
2225  * For every allocation, we will try not to crash the VM if the
2226  * allocation fails.
2227  */
2228 static int xbzrle_init(void)
2229 {
2230     Error *local_err = NULL;
2231 
2232     if (!migrate_use_xbzrle()) {
2233         return 0;
2234     }
2235 
2236     XBZRLE_cache_lock();
2237 
2238     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2239     if (!XBZRLE.zero_target_page) {
2240         error_report("%s: Error allocating zero page", __func__);
2241         goto err_out;
2242     }
2243 
2244     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2245                               TARGET_PAGE_SIZE, &local_err);
2246     if (!XBZRLE.cache) {
2247         error_report_err(local_err);
2248         goto free_zero_page;
2249     }
2250 
2251     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2252     if (!XBZRLE.encoded_buf) {
2253         error_report("%s: Error allocating encoded_buf", __func__);
2254         goto free_cache;
2255     }
2256 
2257     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2258     if (!XBZRLE.current_buf) {
2259         error_report("%s: Error allocating current_buf", __func__);
2260         goto free_encoded_buf;
2261     }
2262 
2263     /* We are all good */
2264     XBZRLE_cache_unlock();
2265     return 0;
2266 
2267 free_encoded_buf:
2268     g_free(XBZRLE.encoded_buf);
2269     XBZRLE.encoded_buf = NULL;
2270 free_cache:
2271     cache_fini(XBZRLE.cache);
2272     XBZRLE.cache = NULL;
2273 free_zero_page:
2274     g_free(XBZRLE.zero_target_page);
2275     XBZRLE.zero_target_page = NULL;
2276 err_out:
2277     XBZRLE_cache_unlock();
2278     return -ENOMEM;
2279 }
2280 
2281 static int ram_state_init(RAMState **rsp)
2282 {
2283     *rsp = g_try_new0(RAMState, 1);
2284 
2285     if (!*rsp) {
2286         error_report("%s: Init ramstate fail", __func__);
2287         return -1;
2288     }
2289 
2290     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2291     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2292     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2293 
2294     /*
2295      * Count the total number of pages used by ram blocks not including any
2296      * gaps due to alignment or unplugs.
2297      * This must match the initial value of the dirty bitmap.
2298      */
2299     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2300     ram_state_reset(*rsp);
2301 
2302     return 0;
2303 }
2304 
2305 static void ram_list_init_bitmaps(void)
2306 {
2307     MigrationState *ms = migrate_get_current();
2308     RAMBlock *block;
2309     unsigned long pages;
2310     uint8_t shift;
2311 
2312     /* Skip setting bitmap if there is no RAM */
2313     if (ram_bytes_total()) {
2314         shift = ms->clear_bitmap_shift;
2315         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2316             error_report("clear_bitmap_shift (%u) too big, using "
2317                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2318             shift = CLEAR_BITMAP_SHIFT_MAX;
2319         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2320             error_report("clear_bitmap_shift (%u) too small, using "
2321                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2322             shift = CLEAR_BITMAP_SHIFT_MIN;
2323         }
2324 
2325         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2326             pages = block->max_length >> TARGET_PAGE_BITS;
2327             /*
2328              * The initial dirty bitmap for migration must be set with all
2329              * ones to make sure we'll migrate every guest RAM page to
2330              * the destination.
2331              * Here we set RAMBlock.bmap all to 1 because when we restart a
2332              * new migration after a failed one, ram_list.
2333              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2334              * guest memory.
2335              */
2336             block->bmap = bitmap_new(pages);
2337             bitmap_set(block->bmap, 0, pages);
2338             block->clear_bmap_shift = shift;
2339             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2340         }
2341     }
2342 }
2343 
2344 static void ram_init_bitmaps(RAMState *rs)
2345 {
2346     /* For memory_global_dirty_log_start below.  */
2347     qemu_mutex_lock_iothread();
2348     qemu_mutex_lock_ramlist();
2349 
2350     WITH_RCU_READ_LOCK_GUARD() {
2351         ram_list_init_bitmaps();
2352         memory_global_dirty_log_start();
2353         migration_bitmap_sync_precopy(rs);
2354     }
2355     qemu_mutex_unlock_ramlist();
2356     qemu_mutex_unlock_iothread();
2357 }
2358 
2359 static int ram_init_all(RAMState **rsp)
2360 {
2361     if (ram_state_init(rsp)) {
2362         return -1;
2363     }
2364 
2365     if (xbzrle_init()) {
2366         ram_state_cleanup(rsp);
2367         return -1;
2368     }
2369 
2370     ram_init_bitmaps(*rsp);
2371 
2372     return 0;
2373 }
2374 
2375 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2376 {
2377     RAMBlock *block;
2378     uint64_t pages = 0;
2379 
2380     /*
2381      * Postcopy is not using xbzrle/compression, so no need for that.
2382      * Also, since the source is already halted, we don't need to care
2383      * about dirty page logging either.
2384      */
2385 
2386     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2387         pages += bitmap_count_one(block->bmap,
2388                                   block->used_length >> TARGET_PAGE_BITS);
2389     }
2390 
2391     /* This may not be aligned with current bitmaps. Recalculate. */
2392     rs->migration_dirty_pages = pages;
2393 
2394     rs->last_seen_block = NULL;
2395     rs->last_sent_block = NULL;
2396     rs->last_page = 0;
2397     rs->last_version = ram_list.version;
2398     /*
2399      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2400      * matter what we have sent.
2401      */
2402     rs->ram_bulk_stage = false;
2403 
2404     /* Update RAMState cache of output QEMUFile */
2405     rs->f = out;
2406 
2407     trace_ram_state_resume_prepare(pages);
2408 }
2409 
2410 /*
2411  * This function clears bits of the free pages reported by the caller from the
2412  * migration dirty bitmap. @addr is the host address corresponding to the
2413  * start of the continuous guest free pages, and @len is the total bytes of
2414  * those pages.
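 *
 * The hinted range may span several RAMBlocks; each chunk is clipped to
 * the block's used_length and the bits are cleared (and the dirty page
 * count adjusted) under bitmap_mutex.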
2415  */
2416 void qemu_guest_free_page_hint(void *addr, size_t len)
2417 {
2418     RAMBlock *block;
2419     ram_addr_t offset;
2420     size_t used_len, start, npages;
2421     MigrationState *s = migrate_get_current();
2422 
2423     /* This function is currently expected to be used during live migration */
2424     if (!migration_is_setup_or_active(s->state)) {
2425         return;
2426     }
2427 
2428     for (; len > 0; len -= used_len, addr += used_len) {
2429         block = qemu_ram_block_from_host(addr, false, &offset);
2430         if (unlikely(!block || offset >= block->used_length)) {
2431             /*
2432              * The implementation might not support RAMBlock resize during
2433              * live migration, but it could happen in theory with future
2434              * updates. So we add a check here to capture that case.
2435              */
2436             error_report_once("%s unexpected error", __func__);
2437             return;
2438         }
2439 
2440         if (len <= block->used_length - offset) {
2441             used_len = len;
2442         } else {
2443             used_len = block->used_length - offset;
2444         }
2445 
2446         start = offset >> TARGET_PAGE_BITS;
2447         npages = used_len >> TARGET_PAGE_BITS;
2448 
2449         qemu_mutex_lock(&ram_state->bitmap_mutex);
2450         ram_state->migration_dirty_pages -=
2451                       bitmap_count_one_with_offset(block->bmap, start, npages);
2452         bitmap_clear(block->bmap, start, npages);
2453         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2454     }
2455 }
2456 
2457 /*
2458  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2459  * a long-running RCU critical section.  When RCU reclaims in the code
2460  * start to become numerous it will be necessary to reduce the
2461  * granularity of these critical sections.
2462  */
2463 
2464 /**
2465  * ram_save_setup: Setup RAM for migration
2466  *
2467  * Returns zero to indicate success and negative for error
2468  *
2469  * @f: QEMUFile where to send the data
2470  * @opaque: RAMState pointer
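 *
 * The setup section carries the total RAM size (tagged with
 * RAM_SAVE_FLAG_MEM_SIZE) followed by one record per migratable block:
 * idstr length, idstr and used_length, plus the page size when postcopy is
 * enabled and the block's page size differs from the host page size, and
 * the memory region address when ignore-shared is enabled.  It ends with
 * RAM_SAVE_FLAG_EOS.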
2471  */
2472 static int ram_save_setup(QEMUFile *f, void *opaque)
2473 {
2474     RAMState **rsp = opaque;
2475     RAMBlock *block;
2476 
2477     if (compress_threads_save_setup()) {
2478         return -1;
2479     }
2480 
2481     /* migration has already set up the bitmap, reuse it. */
2482     if (!migration_in_colo_state()) {
2483         if (ram_init_all(rsp) != 0) {
2484             compress_threads_save_cleanup();
2485             return -1;
2486         }
2487     }
2488     (*rsp)->f = f;
2489 
2490     WITH_RCU_READ_LOCK_GUARD() {
2491         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2492 
2493         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2494             qemu_put_byte(f, strlen(block->idstr));
2495             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2496             qemu_put_be64(f, block->used_length);
2497             if (migrate_postcopy_ram() && block->page_size !=
2498                                           qemu_host_page_size) {
2499                 qemu_put_be64(f, block->page_size);
2500             }
2501             if (migrate_ignore_shared()) {
2502                 qemu_put_be64(f, block->mr->addr);
2503             }
2504         }
2505     }
2506 
2507     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2508     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2509 
2510     multifd_send_sync_main(f);
2511     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2512     qemu_fflush(f);
2513 
2514     return 0;
2515 }
2516 
2517 /**
2518  * ram_save_iterate: iterative stage for migration
2519  *
2520  * Returns zero to indicate success and negative for error
2521  *
2522  * @f: QEMUFile where to send the data
2523  * @opaque: RAMState pointer
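 *
 * Pages are sent until there are no more dirty pages, the rate limit is
 * hit with no queued page requests pending, or roughly MAX_WAIT ms have
 * elapsed; the clock is only sampled every 64 pages to keep
 * qemu_clock_get_ns() overhead down.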
2524  */
2525 static int ram_save_iterate(QEMUFile *f, void *opaque)
2526 {
2527     RAMState **temp = opaque;
2528     RAMState *rs = *temp;
2529     int ret = 0;
2530     int i;
2531     int64_t t0;
2532     int done = 0;
2533 
2534     if (blk_mig_bulk_active()) {
2535         /* Avoid transferring RAM during the bulk phase of block migration as
2536          * the bulk phase will usually take a long time and transferring
2537          * RAM updates during that time is pointless. */
2538         goto out;
2539     }
2540 
2541     WITH_RCU_READ_LOCK_GUARD() {
2542         if (ram_list.version != rs->last_version) {
2543             ram_state_reset(rs);
2544         }
2545 
2546         /* Read version before ram_list.blocks */
2547         smp_rmb();
2548 
2549         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2550 
2551         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2552         i = 0;
2553         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2554                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2555             int pages;
2556 
2557             if (qemu_file_get_error(f)) {
2558                 break;
2559             }
2560 
2561             pages = ram_find_and_save_block(rs, false);
2562             /* no more pages to send */
2563             if (pages == 0) {
2564                 done = 1;
2565                 break;
2566             }
2567 
2568             if (pages < 0) {
2569                 qemu_file_set_error(f, pages);
2570                 break;
2571             }
2572 
2573             rs->target_page_count += pages;
2574 
2575             /*
2576              * During postcopy, it is necessary to make sure one whole host
2577              * page is sent in one chunk.
2578              */
2579             if (migrate_postcopy_ram()) {
2580                 flush_compressed_data(rs);
2581             }
2582 
2583             /*
2584              * We want to check in the 1st loop, just in case it was the 1st
2585              * time and we had to sync the dirty bitmap.
2586              * qemu_clock_get_ns() is a bit expensive, so we only check once
2587              * every few iterations.
2588              */
2589             if ((i & 63) == 0) {
2590                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2591                               1000000;
2592                 if (t1 > MAX_WAIT) {
2593                     trace_ram_save_iterate_big_wait(t1, i);
2594                     break;
2595                 }
2596             }
2597             i++;
2598         }
2599     }
2600 
2601     /*
2602      * Must occur before EOS (or any QEMUFile operation)
2603      * because of RDMA protocol.
2604      */
2605     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2606 
2607 out:
2608     if (ret >= 0
2609         && migration_is_setup_or_active(migrate_get_current()->state)) {
2610         multifd_send_sync_main(rs->f);
2611         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2612         qemu_fflush(f);
2613         ram_counters.transferred += 8;
2614 
2615         ret = qemu_file_get_error(f);
2616     }
2617     if (ret < 0) {
2618         return ret;
2619     }
2620 
2621     return done;
2622 }
2623 
2624 /**
2625  * ram_save_complete: function called to send the remaining amount of ram
2626  *
2627  * Returns zero to indicate success or negative on error
2628  *
2629  * Called with iothread lock
2630  *
2631  * @f: QEMUFile where to send the data
2632  * @opaque: RAMState pointer
2633  */
2634 static int ram_save_complete(QEMUFile *f, void *opaque)
2635 {
2636     RAMState **temp = opaque;
2637     RAMState *rs = *temp;
2638     int ret = 0;
2639 
2640     WITH_RCU_READ_LOCK_GUARD() {
2641         if (!migration_in_postcopy()) {
2642             migration_bitmap_sync_precopy(rs);
2643         }
2644 
2645         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2646 
2647         /* try transferring iterative blocks of memory */
2648 
2649         /* flush all remaining blocks regardless of rate limiting */
2650         while (true) {
2651             int pages;
2652 
2653             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2654             /* no more blocks to send */
2655             if (pages == 0) {
2656                 break;
2657             }
2658             if (pages < 0) {
2659                 ret = pages;
2660                 break;
2661             }
2662         }
2663 
2664         flush_compressed_data(rs);
2665         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2666     }
2667 
2668     if (ret >= 0) {
2669         multifd_send_sync_main(rs->f);
2670         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2671         qemu_fflush(f);
2672     }
2673 
2674     return ret;
2675 }
2676 
2677 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2678                              uint64_t *res_precopy_only,
2679                              uint64_t *res_compatible,
2680                              uint64_t *res_postcopy_only)
2681 {
2682     RAMState **temp = opaque;
2683     RAMState *rs = *temp;
2684     uint64_t remaining_size;
2685 
2686     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2687 
2688     if (!migration_in_postcopy() &&
2689         remaining_size < max_size) {
2690         qemu_mutex_lock_iothread();
2691         WITH_RCU_READ_LOCK_GUARD() {
2692             migration_bitmap_sync_precopy(rs);
2693         }
2694         qemu_mutex_unlock_iothread();
2695         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2696     }
2697 
2698     if (migrate_postcopy_ram()) {
2699         /* We can do postcopy, and all the data is postcopiable */
2700         *res_compatible += remaining_size;
2701     } else {
2702         *res_precopy_only += remaining_size;
2703     }
2704 }
2705 
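/*
 * load_xbzrle: decode one XBZRLE-encoded page from the stream into @host
 *
 * Returns 0 on success or -1 if the header flags, the encoded length or
 * the decoding step are invalid.
 */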
2706 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2707 {
2708     unsigned int xh_len;
2709     int xh_flags;
2710     uint8_t *loaded_data;
2711 
2712     /* extract RLE header */
2713     xh_flags = qemu_get_byte(f);
2714     xh_len = qemu_get_be16(f);
2715 
2716     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2717         error_report("Failed to load XBZRLE page - wrong compression!");
2718         return -1;
2719     }
2720 
2721     if (xh_len > TARGET_PAGE_SIZE) {
2722         error_report("Failed to load XBZRLE page - len overflow!");
2723         return -1;
2724     }
2725     loaded_data = XBZRLE.decoded_buf;
2726     /* load data and decode */
2727     /* it can change loaded_data to point to an internal buffer */
2728     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2729 
2730     /* decode RLE */
2731     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2732                              TARGET_PAGE_SIZE) == -1) {
2733         error_report("Failed to load XBZRLE page - decode error!");
2734         return -1;
2735     }
2736 
2737     return 0;
2738 }
2739 
2740 /**
2741  * ram_block_from_stream: read a RAMBlock id from the migration stream
2742  *
2743  * Must be called from within a rcu critical section.
2744  *
2745  * Returns a pointer from within the RCU-protected ram_list.
2746  *
2747  * @f: QEMUFile where to read the data from
2748  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2749  */
2750 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2751 {
2752     static RAMBlock *block = NULL;
2753     char id[256];
2754     uint8_t len;
2755 
2756     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2757         if (!block) {
2758             error_report("Ack, bad migration stream!");
2759             return NULL;
2760         }
2761         return block;
2762     }
2763 
2764     len = qemu_get_byte(f);
2765     qemu_get_buffer(f, (uint8_t *)id, len);
2766     id[len] = 0;
2767 
2768     block = qemu_ram_block_by_name(id);
2769     if (!block) {
2770         error_report("Can't find block %s", id);
2771         return NULL;
2772     }
2773 
2774     if (ramblock_is_ignored(block)) {
2775         error_report("block %s should not be migrated !", id);
2776         return NULL;
2777     }
2778 
2779     return block;
2780 }
2781 
2782 static inline void *host_from_ram_block_offset(RAMBlock *block,
2783                                                ram_addr_t offset)
2784 {
2785     if (!offset_in_ramblock(block, offset)) {
2786         return NULL;
2787     }
2788 
2789     return block->host + offset;
2790 }
2791 
2792 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2793                              ram_addr_t offset, bool record_bitmap)
2794 {
2795     if (!offset_in_ramblock(block, offset)) {
2796         return NULL;
2797     }
2798     if (!block->colo_cache) {
2799         error_report("%s: colo_cache is NULL in block :%s",
2800                      __func__, block->idstr);
2801         return NULL;
2802     }
2803 
2804     /*
2805      * During a COLO checkpoint, we need a bitmap of these migrated pages.
2806      * It helps us decide which pages in the RAM cache should be flushed
2807      * into the VM's RAM later.
2808      */
2809     if (record_bitmap &&
2810         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2811         ram_state->migration_dirty_pages++;
2812     }
2813     return block->colo_cache + offset;
2814 }
2815 
2816 /**
2817  * ram_handle_compressed: handle the zero page case
2818  *
2819  * If a page (or a whole RDMA chunk) has been
2820  * determined to be zero, then zap it.
2821  *
2822  * @host: host address for the zero page
2823  * @ch: what the page is filled from.  We only support zero
2824  * @size: size of the zero page
2825  */
2826 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2827 {
2828     if (ch != 0 || !is_zero_range(host, size)) {
2829         memset(host, ch, size);
2830     }
2831 }
2832 
2833 /* return the size after decompression, or negative value on error */
2834 static int
2835 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2836                      const uint8_t *source, size_t source_len)
2837 {
2838     int err;
2839 
2840     err = inflateReset(stream);
2841     if (err != Z_OK) {
2842         return -1;
2843     }
2844 
2845     stream->avail_in = source_len;
2846     stream->next_in = (uint8_t *)source;
2847     stream->avail_out = dest_len;
2848     stream->next_out = dest;
2849 
2850     err = inflate(stream, Z_NO_FLUSH);
2851     if (err != Z_STREAM_END) {
2852         return -1;
2853     }
2854 
2855     return stream->total_out;
2856 }
2857 
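/*
 * Decompression worker: waits for a compressed buffer to be handed over in
 * its DecompressParam, inflates it into the destination page and signals
 * completion on decomp_done_cond, until asked to quit.
 */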
2858 static void *do_data_decompress(void *opaque)
2859 {
2860     DecompressParam *param = opaque;
2861     unsigned long pagesize;
2862     uint8_t *des;
2863     int len, ret;
2864 
2865     qemu_mutex_lock(&param->mutex);
2866     while (!param->quit) {
2867         if (param->des) {
2868             des = param->des;
2869             len = param->len;
2870             param->des = 0;
2871             qemu_mutex_unlock(&param->mutex);
2872 
2873             pagesize = TARGET_PAGE_SIZE;
2874 
2875             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2876                                        param->compbuf, len);
2877             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2878                 error_report("decompress data failed");
2879                 qemu_file_set_error(decomp_file, ret);
2880             }
2881 
2882             qemu_mutex_lock(&decomp_done_lock);
2883             param->done = true;
2884             qemu_cond_signal(&decomp_done_cond);
2885             qemu_mutex_unlock(&decomp_done_lock);
2886 
2887             qemu_mutex_lock(&param->mutex);
2888         } else {
2889             qemu_cond_wait(&param->cond, &param->mutex);
2890         }
2891     }
2892     qemu_mutex_unlock(&param->mutex);
2893 
2894     return NULL;
2895 }
2896 
2897 static int wait_for_decompress_done(void)
2898 {
2899     int idx, thread_count;
2900 
2901     if (!migrate_use_compression()) {
2902         return 0;
2903     }
2904 
2905     thread_count = migrate_decompress_threads();
2906     qemu_mutex_lock(&decomp_done_lock);
2907     for (idx = 0; idx < thread_count; idx++) {
2908         while (!decomp_param[idx].done) {
2909             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2910         }
2911     }
2912     qemu_mutex_unlock(&decomp_done_lock);
2913     return qemu_file_get_error(decomp_file);
2914 }
2915 
2916 static void compress_threads_load_cleanup(void)
2917 {
2918     int i, thread_count;
2919 
2920     if (!migrate_use_compression()) {
2921         return;
2922     }
2923     thread_count = migrate_decompress_threads();
2924     for (i = 0; i < thread_count; i++) {
2925         /*
2926          * we use it as an indicator of whether the thread has been
2927          * properly initialized or not
2928          */
2929         if (!decomp_param[i].compbuf) {
2930             break;
2931         }
2932 
2933         qemu_mutex_lock(&decomp_param[i].mutex);
2934         decomp_param[i].quit = true;
2935         qemu_cond_signal(&decomp_param[i].cond);
2936         qemu_mutex_unlock(&decomp_param[i].mutex);
2937     }
2938     for (i = 0; i < thread_count; i++) {
2939         if (!decomp_param[i].compbuf) {
2940             break;
2941         }
2942 
2943         qemu_thread_join(decompress_threads + i);
2944         qemu_mutex_destroy(&decomp_param[i].mutex);
2945         qemu_cond_destroy(&decomp_param[i].cond);
2946         inflateEnd(&decomp_param[i].stream);
2947         g_free(decomp_param[i].compbuf);
2948         decomp_param[i].compbuf = NULL;
2949     }
2950     g_free(decompress_threads);
2951     g_free(decomp_param);
2952     decompress_threads = NULL;
2953     decomp_param = NULL;
2954     decomp_file = NULL;
2955 }
2956 
2957 static int compress_threads_load_setup(QEMUFile *f)
2958 {
2959     int i, thread_count;
2960 
2961     if (!migrate_use_compression()) {
2962         return 0;
2963     }
2964 
2965     thread_count = migrate_decompress_threads();
2966     decompress_threads = g_new0(QemuThread, thread_count);
2967     decomp_param = g_new0(DecompressParam, thread_count);
2968     qemu_mutex_init(&decomp_done_lock);
2969     qemu_cond_init(&decomp_done_cond);
2970     decomp_file = f;
2971     for (i = 0; i < thread_count; i++) {
2972         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2973             goto exit;
2974         }
2975 
2976         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2977         qemu_mutex_init(&decomp_param[i].mutex);
2978         qemu_cond_init(&decomp_param[i].cond);
2979         decomp_param[i].done = true;
2980         decomp_param[i].quit = false;
2981         qemu_thread_create(decompress_threads + i, "decompress",
2982                            do_data_decompress, decomp_param + i,
2983                            QEMU_THREAD_JOINABLE);
2984     }
2985     return 0;
2986 exit:
2987     compress_threads_load_cleanup();
2988     return -1;
2989 }
2990 
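/*
 * Hand @len bytes of compressed data destined for @host to an idle
 * decompression thread, sleeping on decomp_done_cond until one is free.
 */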
2991 static void decompress_data_with_multi_threads(QEMUFile *f,
2992                                                void *host, int len)
2993 {
2994     int idx, thread_count;
2995 
2996     thread_count = migrate_decompress_threads();
2997     qemu_mutex_lock(&decomp_done_lock);
2998     while (true) {
2999         for (idx = 0; idx < thread_count; idx++) {
3000             if (decomp_param[idx].done) {
3001                 decomp_param[idx].done = false;
3002                 qemu_mutex_lock(&decomp_param[idx].mutex);
3003                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3004                 decomp_param[idx].des = host;
3005                 decomp_param[idx].len = len;
3006                 qemu_cond_signal(&decomp_param[idx].cond);
3007                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3008                 break;
3009             }
3010         }
3011         if (idx < thread_count) {
3012             break;
3013         } else {
3014             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3015         }
3016     }
3017     qemu_mutex_unlock(&decomp_done_lock);
3018 }
3019 
3020 /*
3021  * colo cache: this is for the secondary VM, we cache the whole
3022  * memory of the secondary VM; it is necessary to hold the global lock
3023  * to call this helper.
3024  */
3025 int colo_init_ram_cache(void)
3026 {
3027     RAMBlock *block;
3028 
3029     WITH_RCU_READ_LOCK_GUARD() {
3030         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3031             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3032                                                     NULL,
3033                                                     false);
3034             if (!block->colo_cache) {
3035                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3036                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3037                              block->used_length);
3038                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3039                     if (block->colo_cache) {
3040                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3041                         block->colo_cache = NULL;
3042                     }
3043                 }
3044                 return -errno;
3045             }
3046         }
3047     }
3048 
3049     /*
3050      * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3051      * decide which pages in the cache should be flushed into the SVM's RAM.
3052      * Here we use the same name 'ram_bitmap' as for migration.
3053      */
3054     if (ram_bytes_total()) {
3055         RAMBlock *block;
3056 
3057         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3058             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3059             block->bmap = bitmap_new(pages);
3060         }
3061     }
3062 
3063     ram_state_init(&ram_state);
3064     return 0;
3065 }
3066 
3067 /* TODO: duplicated with ram_init_bitmaps */
3068 void colo_incoming_start_dirty_log(void)
3069 {
3070     RAMBlock *block = NULL;
3071     /* For memory_global_dirty_log_start below. */
3072     qemu_mutex_lock_iothread();
3073     qemu_mutex_lock_ramlist();
3074 
3075     memory_global_dirty_log_sync();
3076     WITH_RCU_READ_LOCK_GUARD() {
3077         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3078             ramblock_sync_dirty_bitmap(ram_state, block);
3079             /* Discard this dirty bitmap record */
3080             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3081         }
3082         memory_global_dirty_log_start();
3083     }
3084     ram_state->migration_dirty_pages = 0;
3085     qemu_mutex_unlock_ramlist();
3086     qemu_mutex_unlock_iothread();
3087 }
3088 
3089 /* It is necessary to hold the global lock to call this helper */
3090 void colo_release_ram_cache(void)
3091 {
3092     RAMBlock *block;
3093 
3094     memory_global_dirty_log_stop();
3095     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3096         g_free(block->bmap);
3097         block->bmap = NULL;
3098     }
3099 
3100     WITH_RCU_READ_LOCK_GUARD() {
3101         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3102             if (block->colo_cache) {
3103                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3104                 block->colo_cache = NULL;
3105             }
3106         }
3107     }
3108     ram_state_cleanup(&ram_state);
3109 }
3110 
3111 /**
3112  * ram_load_setup: Setup RAM for migration incoming side
3113  *
3114  * Returns zero to indicate success and negative for error
3115  *
3116  * @f: QEMUFile where to receive the data
3117  * @opaque: RAMState pointer
3118  */
3119 static int ram_load_setup(QEMUFile *f, void *opaque)
3120 {
3121     if (compress_threads_load_setup(f)) {
3122         return -1;
3123     }
3124 
3125     xbzrle_load_setup();
3126     ramblock_recv_map_init();
3127 
3128     return 0;
3129 }
3130 
3131 static int ram_load_cleanup(void *opaque)
3132 {
3133     RAMBlock *rb;
3134 
3135     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3136         qemu_ram_block_writeback(rb);
3137     }
3138 
3139     xbzrle_load_cleanup();
3140     compress_threads_load_cleanup();
3141 
3142     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3143         g_free(rb->receivedmap);
3144         rb->receivedmap = NULL;
3145     }
3146 
3147     return 0;
3148 }
3149 
3150 /**
3151  * ram_postcopy_incoming_init: allocate postcopy data structures
3152  *
3153  * Returns 0 for success and negative if there was one error
3154  *
3155  * @mis: current migration incoming state
3156  *
3157  * Allocate data structures etc needed by incoming migration with
3158  * postcopy-ram. postcopy-ram's similarly named
3159  * postcopy_ram_incoming_init does the work.
3160  */
3161 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3162 {
3163     return postcopy_ram_incoming_init(mis);
3164 }
3165 
3166 /**
3167  * ram_load_postcopy: load a page in postcopy case
3168  *
3169  * Returns 0 for success or -errno in case of error
3170  *
3171  * Called in postcopy mode by ram_load().
3172  * rcu_read_lock is taken prior to this being called.
3173  *
3174  * @f: QEMUFile where to send the data
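 *
 * Incoming target pages are staged in mis->postcopy_tmp_page and the whole
 * host page is placed atomically (via postcopy_place_page or
 * postcopy_place_page_zero) once its last target page has arrived.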
3175  */
3176 static int ram_load_postcopy(QEMUFile *f)
3177 {
3178     int flags = 0, ret = 0;
3179     bool place_needed = false;
3180     bool matches_target_page_size = false;
3181     MigrationIncomingState *mis = migration_incoming_get_current();
3182     /* Temporary page that is later 'placed' */
3183     void *postcopy_host_page = mis->postcopy_tmp_page;
3184     void *this_host = NULL;
3185     bool all_zero = true;
3186     int target_pages = 0;
3187 
3188     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3189         ram_addr_t addr;
3190         void *host = NULL;
3191         void *page_buffer = NULL;
3192         void *place_source = NULL;
3193         RAMBlock *block = NULL;
3194         uint8_t ch;
3195         int len;
3196 
3197         addr = qemu_get_be64(f);
3198 
3199         /*
3200          * If there is a QEMUFile error, we should stop here; "addr"
3201          * may be invalid.
3202          */
3203         ret = qemu_file_get_error(f);
3204         if (ret) {
3205             break;
3206         }
3207 
3208         flags = addr & ~TARGET_PAGE_MASK;
3209         addr &= TARGET_PAGE_MASK;
3210 
3211         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3212         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3213                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3214             block = ram_block_from_stream(f, flags);
3215 
3216             host = host_from_ram_block_offset(block, addr);
3217             if (!host) {
3218                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3219                 ret = -EINVAL;
3220                 break;
3221             }
3222             target_pages++;
3223             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3224             /*
3225              * Postcopy requires that we place whole host pages atomically;
3226              * these may be huge pages for RAMBlocks that are backed by
3227              * hugetlbfs.
3228              * To make it atomic, the data is read into a temporary page
3229              * that's moved into place later.
3230              * The migration protocol uses (possibly smaller) target pages;
3231              * however, the source ensures it always sends all the components
3232              * of a host page in one chunk.
3233              */
3234             page_buffer = postcopy_host_page +
3235                           ((uintptr_t)host & (block->page_size - 1));
3236             if (target_pages == 1) {
3237                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3238                                                     block->page_size);
3239             } else {
3240                 /* not the 1st TP within the HP */
3241                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3242                     (uintptr_t)this_host) {
3243                     error_report("Non-same host page %p/%p",
3244                                   host, this_host);
3245                     ret = -EINVAL;
3246                     break;
3247                 }
3248             }
3249 
3250             /*
3251              * If it's the last part of a host page then we place the host
3252              * page
3253              */
3254             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3255                 place_needed = true;
3256             }
3257             place_source = postcopy_host_page;
3258         }
3259 
3260         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3261         case RAM_SAVE_FLAG_ZERO:
3262             ch = qemu_get_byte(f);
3263             /*
3264              * We can skip setting page_buffer when
3265              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3266              */
3267             if (ch || !matches_target_page_size) {
3268                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3269             }
3270             if (ch) {
3271                 all_zero = false;
3272             }
3273             break;
3274 
3275         case RAM_SAVE_FLAG_PAGE:
3276             all_zero = false;
3277             if (!matches_target_page_size) {
3278                 /* For huge pages, we always use temporary buffer */
3279                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3280             } else {
3281                 /*
3282                  * For small pages that match the target page size, we
3283                  * avoid the qemu_file copy.  Instead we directly use
3284                  * the buffer of QEMUFile to place the page.  Note: we
3285                  * cannot do any QEMUFile operation before using that
3286                  * buffer to make sure the buffer is valid when
3287                  * placing the page.
3288                  */
3289                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3290                                          TARGET_PAGE_SIZE);
3291             }
3292             break;
3293         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3294             all_zero = false;
3295             len = qemu_get_be32(f);
3296             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3297                 error_report("Invalid compressed data length: %d", len);
3298                 ret = -EINVAL;
3299                 break;
3300             }
3301             decompress_data_with_multi_threads(f, page_buffer, len);
3302             break;
3303 
3304         case RAM_SAVE_FLAG_EOS:
3305             /* normal exit */
3306             multifd_recv_sync_main();
3307             break;
3308         default:
3309             error_report("Unknown combination of migration flags: %#x"
3310                          " (postcopy mode)", flags);
3311             ret = -EINVAL;
3312             break;
3313         }
3314 
3315         /* Got the whole host page; wait for decompression before placing it. */
3316         if (place_needed) {
3317             ret |= wait_for_decompress_done();
3318         }
3319 
3320         /* Detect any possible file errors */
3321         if (!ret && qemu_file_get_error(f)) {
3322             ret = qemu_file_get_error(f);
3323         }
3324 
3325         if (!ret && place_needed) {
3326             /* This is reached on the last target page of the host page */
3327             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3328                                                        block->page_size);
3329 
3330             if (all_zero) {
3331                 ret = postcopy_place_page_zero(mis, place_dest,
3332                                                block);
3333             } else {
3334                 ret = postcopy_place_page(mis, place_dest,
3335                                           place_source, block);
3336             }
3337             place_needed = false;
3338             target_pages = 0;
3339             /* Assume we have a zero page until we detect something different */
3340             all_zero = true;
3341         }
3342     }
3343 
3344     return ret;
3345 }
3346 
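     /*
      * True from when the incoming side has seen the postcopy advise until
      * it leaves the incoming postcopy states.
      */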
3347 static bool postcopy_is_advised(void)
3348 {
3349     PostcopyState ps = postcopy_state_get();
3350     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3351 }
3352 
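     /*
      * True while the incoming side is in the postcopy listening or running
      * states, i.e. while incoming postcopy is actively in progress.
      */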
3353 static bool postcopy_is_running(void)
3354 {
3355     PostcopyState ps = postcopy_state_get();
3356     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3357 }
3358 
3359 /*
3360  * Flush the content of the RAM cache into the SVM's memory.
3361  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3362  */
3363 void colo_flush_ram_cache(void)
3364 {
3365     RAMBlock *block = NULL;
3366     void *dst_host;
3367     void *src_host;
3368     unsigned long offset = 0;
3369 
3370     memory_global_dirty_log_sync();
3371     WITH_RCU_READ_LOCK_GUARD() {
3372         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3373             ramblock_sync_dirty_bitmap(ram_state, block);
3374         }
3375     }
3376 
3377     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3378     WITH_RCU_READ_LOCK_GUARD() {
3379         block = QLIST_FIRST_RCU(&ram_list.blocks);
3380 
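             /*
              * Walk the dirty pages block by block, clearing each dirty bit
              * and copying the page from colo_cache into the SVM's memory.
              */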
3381         while (block) {
3382             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3383 
3384             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3385                 >= block->used_length) {
3386                 offset = 0;
3387                 block = QLIST_NEXT_RCU(block, next);
3388             } else {
3389                 migration_bitmap_clear_dirty(ram_state, block, offset);
3390                 dst_host = block->host
3391                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3392                 src_host = block->colo_cache
3393                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3394                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3395             }
3396         }
3397     }
3398     trace_colo_flush_ram_cache_end();
3399 }
3400 
3401 /**
3402  * ram_load_precopy: load pages in precopy case
3403  *
3404  * Returns 0 for success or -errno in case of error
3405  *
3406  * Called in precopy mode by ram_load().
3407  * rcu_read_lock is taken prior to this being called.
3408  *
3409  * @f: QEMUFile to read the data from
3410  */
3411 static int ram_load_precopy(QEMUFile *f)
3412 {
3413     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3414     /* ADVISE comes earlier; it shows the source has postcopy capability enabled */
3415     bool postcopy_advised = postcopy_is_advised();
3416     if (!migrate_use_compression()) {
3417         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3418     }
3419 
3420     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3421         ram_addr_t addr, total_ram_bytes;
3422         void *host = NULL, *host_bak = NULL;
3423         uint8_t ch;
3424 
3425         /*
3426          * Yield periodically to let the main loop run, but an iteration of
3427          * the main loop is expensive, so only do it every so many iterations.
3428          */
3429         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3430             aio_co_schedule(qemu_get_current_aio_context(),
3431                             qemu_coroutine_self());
3432             qemu_coroutine_yield();
3433         }
3434         i++;
3435 
3436         addr = qemu_get_be64(f);
3437         flags = addr & ~TARGET_PAGE_MASK;
3438         addr &= TARGET_PAGE_MASK;
3439 
3440         if (flags & invalid_flags) {
3441             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3442                 error_report("Received an unexpected compressed page");
3443             }
3444 
3445             ret = -EINVAL;
3446             break;
3447         }
3448 
3449         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3450                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3451             RAMBlock *block = ram_block_from_stream(f, flags);
3452 
3453             host = host_from_ram_block_offset(block, addr);
3454             /*
3455              * After entering the COLO stage, we should not load pages into
3456              * the SVM's memory directly; we put them into colo_cache first.
3457              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3458              * Previously, we copied all of this memory in the COLO preparing
3459              * stage, during which the VM had to be stopped, which is
3460              * time-consuming.  Here we optimize it by backing up every page
3461              * during the migration process while COLO is enabled.  Although
3462              * this slows down the migration, it clearly reduces the downtime
3463              * of backing up all of the SVM's memory in the COLO preparing stage.
3464              */
3465             if (migration_incoming_colo_enabled()) {
3466                 if (migration_incoming_in_colo_state()) {
3467                     /* In COLO stage, put all pages into cache temporarily */
3468                     host = colo_cache_from_block_offset(block, addr, true);
3469                 } else {
3470                    /*
3471                     * In the migration stage but before the COLO stage,
3472                     * put all pages into both the cache and the SVM's memory.
3473                     */
3474                     host_bak = colo_cache_from_block_offset(block, addr, false);
3475                 }
3476             }
3477             if (!host) {
3478                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3479                 ret = -EINVAL;
3480                 break;
3481             }
3482             if (!migration_incoming_in_colo_state()) {
3483                 ramblock_recv_bitmap_set(block, host);
3484             }
3485 
3486             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3487         }
3488 
3489         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3490         case RAM_SAVE_FLAG_MEM_SIZE:
3491             /* Synchronize RAM block list */
3492             total_ram_bytes = addr;
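                 /*
                  * Each block entry on the wire (as parsed below) is: a 1-byte
                  * idstr length, the idstr bytes, and a be64 used_length; a
                  * be64 page size follows when postcopy was advised and the
                  * block is not backed by host-sized pages, and a be64 GPA
                  * follows when ignore-shared is in use.
                  */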
3493             while (!ret && total_ram_bytes) {
3494                 RAMBlock *block;
3495                 char id[256];
3496                 ram_addr_t length;
3497 
3498                 len = qemu_get_byte(f);
3499                 qemu_get_buffer(f, (uint8_t *)id, len);
3500                 id[len] = 0;
3501                 length = qemu_get_be64(f);
3502 
3503                 block = qemu_ram_block_by_name(id);
3504                 if (block && !qemu_ram_is_migratable(block)) {
3505                     error_report("block %s should not be migrated!", id);
3506                     ret = -EINVAL;
3507                 } else if (block) {
3508                     if (length != block->used_length) {
3509                         Error *local_err = NULL;
3510 
3511                         ret = qemu_ram_resize(block, length,
3512                                               &local_err);
3513                         if (local_err) {
3514                             error_report_err(local_err);
3515                         }
3516                     }
3517                     /* For postcopy we need to check hugepage sizes match */
3518                     if (postcopy_advised &&
3519                         block->page_size != qemu_host_page_size) {
3520                         uint64_t remote_page_size = qemu_get_be64(f);
3521                         if (remote_page_size != block->page_size) {
3522                             error_report("Mismatched RAM page size %s "
3523                                          "(local) %zu != %" PRIu64,
3524                                          id, block->page_size,
3525                                          remote_page_size);
3526                             ret = -EINVAL;
3527                         }
3528                     }
3529                     if (migrate_ignore_shared()) {
3530                         hwaddr addr = qemu_get_be64(f);
3531                         if (ramblock_is_ignored(block) &&
3532                             block->mr->addr != addr) {
3533                             error_report("Mismatched GPAs for block %s "
3534                                          "%" PRIu64 " != %" PRIu64,
3535                                          id, (uint64_t)addr,
3536                                          (uint64_t)block->mr->addr);
3537                             ret = -EINVAL;
3538                         }
3539                     }
3540                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3541                                           block->idstr);
3542                 } else {
3543                     error_report("Unknown ramblock \"%s\", cannot "
3544                                  "accept migration", id);
3545                     ret = -EINVAL;
3546                 }
3547 
3548                 total_ram_bytes -= length;
3549             }
3550             break;
3551 
3552         case RAM_SAVE_FLAG_ZERO:
3553             ch = qemu_get_byte(f);
3554             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3555             break;
3556 
3557         case RAM_SAVE_FLAG_PAGE:
3558             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3559             break;
3560 
3561         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3562             len = qemu_get_be32(f);
3563             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3564                 error_report("Invalid compressed data length: %d", len);
3565                 ret = -EINVAL;
3566                 break;
3567             }
3568             decompress_data_with_multi_threads(f, host, len);
3569             break;
3570 
3571         case RAM_SAVE_FLAG_XBZRLE:
3572             if (load_xbzrle(f, addr, host) < 0) {
3573                 error_report("Failed to decompress XBZRLE page at "
3574                              RAM_ADDR_FMT, addr);
3575                 ret = -EINVAL;
3576                 break;
3577             }
3578             break;
3579         case RAM_SAVE_FLAG_EOS:
3580             /* normal exit */
3581             multifd_recv_sync_main();
3582             break;
3583         default:
3584             if (flags & RAM_SAVE_FLAG_HOOK) {
3585                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3586             } else {
3587                 error_report("Unknown combination of migration flags: %#x",
3588                              flags);
3589                 ret = -EINVAL;
3590             }
3591         }
3592         if (!ret) {
3593             ret = qemu_file_get_error(f);
3594         }
3595         if (!ret && host_bak) {
3596             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3597         }
3598     }
3599 
3600     ret |= wait_for_decompress_done();
3601     return ret;
3602 }
3603 
3604 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3605 {
3606     int ret = 0;
3607     static uint64_t seq_iter;
3608     /*
3609      * If the system is running in postcopy mode, page inserts into host
3610      * memory must be atomic.
3611      */
3612     bool postcopy_running = postcopy_is_running();
3613 
3614     seq_iter++;
3615 
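         /* Only version 4 of the RAM migration stream is supported. */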
3616     if (version_id != 4) {
3617         return -EINVAL;
3618     }
3619 
3620     /*
3621      * This RCU critical section can be very long running.
3622      * When RCU reclaims in this code path start to become numerous,
3623      * it will be necessary to reduce the granularity of this
3624      * critical section.
3625      */
3626     WITH_RCU_READ_LOCK_GUARD() {
3627         if (postcopy_running) {
3628             ret = ram_load_postcopy(f);
3629         } else {
3630             ret = ram_load_precopy(f);
3631         }
3632     }
3633     trace_ram_load_complete(ret, seq_iter);
3634 
3635     return ret;
3636 }
3637 
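     /*
      * Postcopy is refused when any non-ignored RAMBlock is backed by
      * persistent memory; otherwise it follows the postcopy-ram capability.
      */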
3638 static bool ram_has_postcopy(void *opaque)
3639 {
3640     RAMBlock *rb;
3641     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3642         if (ramblock_is_pmem(rb)) {
3643             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3644                         "is not supported now!", rb->idstr, rb->host);
3645             return false;
3646         }
3647     }
3648 
3649     return migrate_postcopy_ram();
3650 }
3651 
3652 /* Sync all the dirty bitmaps with the destination VM.  */
3653 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3654 {
3655     RAMBlock *block;
3656     QEMUFile *file = s->to_dst_file;
3657     int ramblock_count = 0;
3658 
3659     trace_ram_dirty_bitmap_sync_start();
3660 
3661     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3662         qemu_savevm_send_recv_bitmap(file, block->idstr);
3663         trace_ram_dirty_bitmap_request(block->idstr);
3664         ramblock_count++;
3665     }
3666 
3667     trace_ram_dirty_bitmap_sync_wait();
3668 
3669     /* Wait until all the ramblocks' dirty bitmaps are synced */
3670     while (ramblock_count--) {
3671         qemu_sem_wait(&s->rp_state.rp_sem);
3672     }
3673 
3674     trace_ram_dirty_bitmap_sync_complete();
3675 
3676     return 0;
3677 }
3678 
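     /*
      * Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem once per
      * ramblock.
      */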
3679 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3680 {
3681     qemu_sem_post(&s->rp_state.rp_sem);
3682 }
3683 
3684 /*
3685  * Read the received bitmap and invert it to form the initial dirty bitmap.
3686  * This is only used when a postcopy migration has been paused and wants
3687  * to resume from a middle point.
3688  */
3689 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3690 {
3691     int ret = -EINVAL;
3692     QEMUFile *file = s->rp_state.from_dst_file;
3693     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3694     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3695     uint64_t size, end_mark;
3696 
3697     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3698 
3699     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3700         error_report("%s: incorrect state %s", __func__,
3701                      MigrationStatus_str(s->state));
3702         return -EINVAL;
3703     }
3704 
3705     /*
3706      * Note: see comments in ramblock_recv_bitmap_send() on why we
3707      * need the endianness conversion and the padding.
3708      */
3709     local_size = ROUND_UP(local_size, 8);
3710 
3711     /* Add padding */
3712     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3713 
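         /*
          * Wire format (see ramblock_recv_bitmap_send()): a be64 bitmap size
          * in bytes, the little-endian bitmap itself (padded to a multiple of
          * 8 bytes), then a be64 end mark.
          */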
3714     size = qemu_get_be64(file);
3715 
3716     /* The size of the bitmap should match that of our ramblock */
3717     if (size != local_size) {
3718         error_report("%s: ramblock '%s' bitmap size mismatch "
3719                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3720                      block->idstr, size, local_size);
3721         ret = -EINVAL;
3722         goto out;
3723     }
3724 
3725     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3726     end_mark = qemu_get_be64(file);
3727 
3728     ret = qemu_file_get_error(file);
3729     if (ret || size != local_size) {
3730         error_report("%s: read bitmap failed for ramblock '%s': %d"
3731                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3732                      __func__, block->idstr, ret, local_size, size);
3733         ret = -EIO;
3734         goto out;
3735     }
3736 
3737     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3738         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3739                      __func__, block->idstr, end_mark);
3740         ret = -EINVAL;
3741         goto out;
3742     }
3743 
3744     /*
3745      * Endianness conversion.  We are in postcopy (though paused), so the
3746      * dirty bitmap won't change; we can modify it directly.
3747      */
3748     bitmap_from_le(block->bmap, le_bitmap, nbits);
3749 
3750     /*
3751      * What we received is the "received bitmap".  Invert it to form the
3752      * initial dirty bitmap for this ramblock.
3753      */
3754     bitmap_complement(block->bmap, block->bmap, nbits);
3755 
3756     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3757 
3758     /*
3759      * We succeeded in syncing the bitmap for the current ramblock.  If this
3760      * is the last one to sync, we need to notify the main send thread.
3761      */
3762     ram_dirty_bitmap_reload_notify(s);
3763 
3764     ret = 0;
3765 out:
3766     g_free(le_bitmap);
3767     return ret;
3768 }
3769 
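     /*
      * Resume-prepare hook for RAM: re-sync the dirty bitmaps with the
      * destination and prepare the RAM state before a paused postcopy
      * migration resumes.
      */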
3770 static int ram_resume_prepare(MigrationState *s, void *opaque)
3771 {
3772     RAMState *rs = *(RAMState **)opaque;
3773     int ret;
3774 
3775     ret = ram_dirty_bitmap_sync_all(s, rs);
3776     if (ret) {
3777         return ret;
3778     }
3779 
3780     ram_state_resume_prepare(rs, s->to_dst_file);
3781 
3782     return 0;
3783 }
3784 
3785 static SaveVMHandlers savevm_ram_handlers = {
3786     .save_setup = ram_save_setup,
3787     .save_live_iterate = ram_save_iterate,
3788     .save_live_complete_postcopy = ram_save_complete,
3789     .save_live_complete_precopy = ram_save_complete,
3790     .has_postcopy = ram_has_postcopy,
3791     .save_live_pending = ram_save_pending,
3792     .load_state = ram_load,
3793     .save_cleanup = ram_save_cleanup,
3794     .load_setup = ram_load_setup,
3795     .load_cleanup = ram_load_cleanup,
3796     .resume_prepare = ram_resume_prepare,
3797 };
3798 
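     /* Register the live RAM migration handlers (stream version 4). */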
3799 void ram_mig_init(void)
3800 {
3801     qemu_mutex_init(&XBZRLE.lock);
3802     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3803 }
3804