xref: /openbmc/qemu/migration/ram.c (revision 5e437d3c)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
64 
65 /***********************************************************/
66 /* ram save/restore */
67 
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
69  * worked for pages that were filled with the same char.  We switched
70  * it to only search for the zero value, and renamed it to avoid
71  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
72  */
73 
74 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO     0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE     0x08
78 #define RAM_SAVE_FLAG_EOS      0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE   0x40
81 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
83 
84 static inline bool is_zero_range(uint8_t *p, uint64_t size)
85 {
86     return buffer_is_zero(p, size);
87 }
88 
89 XBZRLECacheStats xbzrle_counters;
90 
91 /* This struct contains the XBZRLE cache and a static page
92    used by the compression */
93 static struct {
94     /* buffer used for XBZRLE encoding */
95     uint8_t *encoded_buf;
96     /* buffer for storing page content */
97     uint8_t *current_buf;
98     /* Cache for XBZRLE, Protected by lock. */
99     PageCache *cache;
100     QemuMutex lock;
101     /* it will store a page full of zeros */
102     uint8_t *zero_target_page;
103     /* buffer used for XBZRLE decoding */
104     uint8_t *decoded_buf;
105 } XBZRLE;
106 
107 static void XBZRLE_cache_lock(void)
108 {
109     if (migrate_use_xbzrle()) {
110         qemu_mutex_lock(&XBZRLE.lock);
111     }
112 }
113 
114 static void XBZRLE_cache_unlock(void)
115 {
116     if (migrate_use_xbzrle()) {
117         qemu_mutex_unlock(&XBZRLE.lock);
118     }
119 }
120 
121 /**
122  * xbzrle_cache_resize: resize the xbzrle cache
123  *
124  * This function is called from migrate_params_apply in the main
125  * thread, possibly while a migration is in progress.  A running
126  * migration may be using the cache and might finish during this call,
127  * hence changes to the cache are protected by XBZRLE.lock.
128  *
129  * Returns 0 for success or -1 for error
130  *
131  * @new_size: new cache size
132  * @errp: set to the reason for failure if the resize failed
133  */
134 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
135 {
136     PageCache *new_cache;
137     int64_t ret = 0;
138 
139     /* Check for truncation */
140     if (new_size != (size_t)new_size) {
141         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
142                    "exceeding address space");
143         return -1;
144     }
145 
146     if (new_size == migrate_xbzrle_cache_size()) {
147         /* nothing to do */
148         return 0;
149     }
150 
151     XBZRLE_cache_lock();
152 
153     if (XBZRLE.cache != NULL) {
154         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
155         if (!new_cache) {
156             ret = -1;
157             goto out;
158         }
159 
160         cache_fini(XBZRLE.cache);
161         XBZRLE.cache = new_cache;
162     }
163 out:
164     XBZRLE_cache_unlock();
165     return ret;
166 }
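
/*
 * Illustrative usage sketch (not part of the original file): how a caller
 * such as migrate_params_apply() might drive the resize.  The 512 MiB value
 * and the bare error reporting are hypothetical; on failure the old cache
 * is kept, as the code above shows.
 *
 *     Error *err = NULL;
 *
 *     if (xbzrle_cache_resize(512 * MiB, &err) < 0) {
 *         error_report_err(err);
 *     }
 */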
167 
168 bool ramblock_is_ignored(RAMBlock *block)
169 {
170     return !qemu_ram_is_migratable(block) ||
171            (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 }
173 
174 #undef RAMBLOCK_FOREACH
175 
176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
177 {
178     RAMBlock *block;
179     int ret = 0;
180 
181     RCU_READ_LOCK_GUARD();
182 
183     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
184         ret = func(block, opaque);
185         if (ret) {
186             break;
187         }
188     }
189     return ret;
190 }
191 
192 static void ramblock_recv_map_init(void)
193 {
194     RAMBlock *rb;
195 
196     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
197         assert(!rb->receivedmap);
198         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
199     }
200 }
201 
202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
203 {
204     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
205                     rb->receivedmap);
206 }
207 
208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
209 {
210     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211 }
212 
213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
214 {
215     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216 }
217 
218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
219                                     size_t nr)
220 {
221     bitmap_set_atomic(rb->receivedmap,
222                       ramblock_recv_bitmap_offset(host_addr, rb),
223                       nr);
224 }
225 
226 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
227 
228 /*
229  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
230  *
231  * Returns the number of bytes sent (>0) on success, or <0 on error.
232  */
233 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
234                                   const char *block_name)
235 {
236     RAMBlock *block = qemu_ram_block_by_name(block_name);
237     unsigned long *le_bitmap, nbits;
238     uint64_t size;
239 
240     if (!block) {
241         error_report("%s: invalid block name: %s", __func__, block_name);
242         return -1;
243     }
244 
245     nbits = block->used_length >> TARGET_PAGE_BITS;
246 
247     /*
248      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
249      * machines we may need 4 more bytes for padding (see the
250      * comment below), so extend it a bit beforehand.
251      */
252     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253 
254     /*
255      * Always use little endian when sending the bitmap, so that it
256      * can be parsed even when source and destination VMs do not use
257      * the same endianness. (Note: big endian won't work.)
258      */
259     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
260 
261     /* Size of the bitmap, in bytes */
262     size = DIV_ROUND_UP(nbits, 8);
263 
264     /*
265      * size is always aligned to 8 bytes on 64bit machines, but that
266      * may not be true on 32bit machines. We need this padding to
267      * make sure the migration can survive even between 32bit and
268      * 64bit machines.
269      */
270     size = ROUND_UP(size, 8);
271 
272     qemu_put_be64(file, size);
273     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
274     /*
275      * Mark the end, so that the destination can detect a bitmap that
276      * got corrupted in the middle for some "mysterious" reason.
277      */
278     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
279     qemu_fflush(file);
280 
281     g_free(le_bitmap);
282 
283     if (qemu_file_get_error(file)) {
284         return qemu_file_get_error(file);
285     }
286 
287     return size + sizeof(size);
288 }
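
/*
 * Illustrative sketch (not part of the original file): how the peer might
 * consume the "size + bitmap + ending marker" stream produced above.  A real
 * receiver would also want to sanity-check size before trusting it; that is
 * omitted here.
 *
 *     uint64_t size = qemu_get_be64(file);
 *     unsigned long *le_bitmap = g_malloc0(size);
 *
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         error_report("received bitmap is corrupted");
 *     }
 */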
289 
290 /*
291  * An outstanding page request, on the source, having been received
292  * and queued
293  */
294 struct RAMSrcPageRequest {
295     RAMBlock *rb;
296     hwaddr    offset;
297     hwaddr    len;
298 
299     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300 };
301 
302 /* State of RAM for migration */
303 struct RAMState {
304     /* QEMUFile used for this migration */
305     QEMUFile *f;
306     /* UFFD file descriptor, used in 'write-tracking' migration */
307     int uffdio_fd;
308     /* Last block that we have visited searching for dirty pages */
309     RAMBlock *last_seen_block;
310     /* Last block from where we have sent data */
311     RAMBlock *last_sent_block;
312     /* Last dirty target page we have sent */
313     ram_addr_t last_page;
314     /* last ram version we have seen */
315     uint32_t last_version;
316     /* We are in the first round */
317     bool ram_bulk_stage;
318     /* The free page optimization is enabled */
319     bool fpo_enabled;
320     /* How many times we have dirtied too many pages */
321     int dirty_rate_high_cnt;
322     /* these variables are used for bitmap sync */
323     /* last time we did a full bitmap_sync */
324     int64_t time_last_bitmap_sync;
325     /* bytes transferred at start_time */
326     uint64_t bytes_xfer_prev;
327     /* number of dirty pages since start_time */
328     uint64_t num_dirty_pages_period;
329     /* xbzrle misses since the beginning of the period */
330     uint64_t xbzrle_cache_miss_prev;
331     /* Number of xbzrle pages since the beginning of the period */
332     uint64_t xbzrle_pages_prev;
333     /* Number of xbzrle encoded bytes since the beginning of the period */
334     uint64_t xbzrle_bytes_prev;
335 
336     /* compression statistics since the beginning of the period */
337     /* number of times no free thread was available to compress data */
338     uint64_t compress_thread_busy_prev;
339     /* number of bytes after compression */
340     uint64_t compressed_size_prev;
341     /* number of compressed pages */
342     uint64_t compress_pages_prev;
343 
344     /* total handled target pages at the beginning of period */
345     uint64_t target_page_count_prev;
346     /* total handled target pages since start */
347     uint64_t target_page_count;
348     /* number of dirty bits in the bitmap */
349     uint64_t migration_dirty_pages;
350     /* Protects modification of the bitmap and migration dirty pages */
351     QemuMutex bitmap_mutex;
352     /* The RAMBlock used in the last src_page_requests */
353     RAMBlock *last_req_rb;
354     /* Queue of outstanding page requests from the destination */
355     QemuMutex src_page_req_mutex;
356     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
357 };
358 typedef struct RAMState RAMState;
359 
360 static RAMState *ram_state;
361 
362 static NotifierWithReturnList precopy_notifier_list;
363 
364 void precopy_infrastructure_init(void)
365 {
366     notifier_with_return_list_init(&precopy_notifier_list);
367 }
368 
369 void precopy_add_notifier(NotifierWithReturn *n)
370 {
371     notifier_with_return_list_add(&precopy_notifier_list, n);
372 }
373 
374 void precopy_remove_notifier(NotifierWithReturn *n)
375 {
376     notifier_with_return_remove(n);
377 }
378 
379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
380 {
381     PrecopyNotifyData pnd;
382     pnd.reason = reason;
383     pnd.errp = errp;
384 
385     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
386 }
387 
388 void precopy_enable_free_page_optimization(void)
389 {
390     if (!ram_state) {
391         return;
392     }
393 
394     ram_state->fpo_enabled = true;
395 }
396 
397 uint64_t ram_bytes_remaining(void)
398 {
399     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
400                        0;
401 }
402 
403 MigrationStats ram_counters;
404 
405 /* used by the search for pages to send */
406 struct PageSearchStatus {
407     /* Current block being searched */
408     RAMBlock    *block;
409     /* Current page to search from */
410     unsigned long page;
411     /* Set once we wrap around */
412     bool         complete_round;
413 };
414 typedef struct PageSearchStatus PageSearchStatus;
415 
416 CompressionStats compression_counters;
417 
418 struct CompressParam {
419     bool done;
420     bool quit;
421     bool zero_page;
422     QEMUFile *file;
423     QemuMutex mutex;
424     QemuCond cond;
425     RAMBlock *block;
426     ram_addr_t offset;
427 
428     /* internally used fields */
429     z_stream stream;
430     uint8_t *originbuf;
431 };
432 typedef struct CompressParam CompressParam;
433 
434 struct DecompressParam {
435     bool done;
436     bool quit;
437     QemuMutex mutex;
438     QemuCond cond;
439     void *des;
440     uint8_t *compbuf;
441     int len;
442     z_stream stream;
443 };
444 typedef struct DecompressParam DecompressParam;
445 
446 static CompressParam *comp_param;
447 static QemuThread *compress_threads;
448 /* comp_done_cond is used to wake up the migration thread when
449  * one of the compression threads has finished the compression.
450  * comp_done_lock is used together with comp_done_cond.
451  */
452 static QemuMutex comp_done_lock;
453 static QemuCond comp_done_cond;
454 /* The empty QEMUFileOps will be used by the file in CompressParam */
455 static const QEMUFileOps empty_ops = { };
456 
457 static QEMUFile *decomp_file;
458 static DecompressParam *decomp_param;
459 static QemuThread *decompress_threads;
460 static QemuMutex decomp_done_lock;
461 static QemuCond decomp_done_cond;
462 
463 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
464                                  ram_addr_t offset, uint8_t *source_buf);
465 
466 static void *do_data_compress(void *opaque)
467 {
468     CompressParam *param = opaque;
469     RAMBlock *block;
470     ram_addr_t offset;
471     bool zero_page;
472 
473     qemu_mutex_lock(&param->mutex);
474     while (!param->quit) {
475         if (param->block) {
476             block = param->block;
477             offset = param->offset;
478             param->block = NULL;
479             qemu_mutex_unlock(&param->mutex);
480 
481             zero_page = do_compress_ram_page(param->file, &param->stream,
482                                              block, offset, param->originbuf);
483 
484             qemu_mutex_lock(&comp_done_lock);
485             param->done = true;
486             param->zero_page = zero_page;
487             qemu_cond_signal(&comp_done_cond);
488             qemu_mutex_unlock(&comp_done_lock);
489 
490             qemu_mutex_lock(&param->mutex);
491         } else {
492             qemu_cond_wait(&param->cond, &param->mutex);
493         }
494     }
495     qemu_mutex_unlock(&param->mutex);
496 
497     return NULL;
498 }
499 
500 static void compress_threads_save_cleanup(void)
501 {
502     int i, thread_count;
503 
504     if (!migrate_use_compression() || !comp_param) {
505         return;
506     }
507 
508     thread_count = migrate_compress_threads();
509     for (i = 0; i < thread_count; i++) {
510         /*
511          * we use it as an indicator of whether the thread is
512          * properly initialized or not
513          */
514         if (!comp_param[i].file) {
515             break;
516         }
517 
518         qemu_mutex_lock(&comp_param[i].mutex);
519         comp_param[i].quit = true;
520         qemu_cond_signal(&comp_param[i].cond);
521         qemu_mutex_unlock(&comp_param[i].mutex);
522 
523         qemu_thread_join(compress_threads + i);
524         qemu_mutex_destroy(&comp_param[i].mutex);
525         qemu_cond_destroy(&comp_param[i].cond);
526         deflateEnd(&comp_param[i].stream);
527         g_free(comp_param[i].originbuf);
528         qemu_fclose(comp_param[i].file);
529         comp_param[i].file = NULL;
530     }
531     qemu_mutex_destroy(&comp_done_lock);
532     qemu_cond_destroy(&comp_done_cond);
533     g_free(compress_threads);
534     g_free(comp_param);
535     compress_threads = NULL;
536     comp_param = NULL;
537 }
538 
539 static int compress_threads_save_setup(void)
540 {
541     int i, thread_count;
542 
543     if (!migrate_use_compression()) {
544         return 0;
545     }
546     thread_count = migrate_compress_threads();
547     compress_threads = g_new0(QemuThread, thread_count);
548     comp_param = g_new0(CompressParam, thread_count);
549     qemu_cond_init(&comp_done_cond);
550     qemu_mutex_init(&comp_done_lock);
551     for (i = 0; i < thread_count; i++) {
552         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
553         if (!comp_param[i].originbuf) {
554             goto exit;
555         }
556 
557         if (deflateInit(&comp_param[i].stream,
558                         migrate_compress_level()) != Z_OK) {
559             g_free(comp_param[i].originbuf);
560             goto exit;
561         }
562 
563         /* comp_param[i].file is just used as a dummy buffer to save data,
564          * set its ops to empty.
565          */
566         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
567         comp_param[i].done = true;
568         comp_param[i].quit = false;
569         qemu_mutex_init(&comp_param[i].mutex);
570         qemu_cond_init(&comp_param[i].cond);
571         qemu_thread_create(compress_threads + i, "compress",
572                            do_data_compress, comp_param + i,
573                            QEMU_THREAD_JOINABLE);
574     }
575     return 0;
576 
577 exit:
578     compress_threads_save_cleanup();
579     return -1;
580 }
581 
582 /**
583  * save_page_header: write page header to wire
584  *
585  * If the block is different from the last one sent, it also writes the block identification
586  *
587  * Returns the number of bytes written
588  *
589  * @f: QEMUFile where to send the data
590  * @block: block that contains the page we want to send
591  * @offset: offset inside the block for the page
592  *          in the lower bits, it contains flags
593  */
594 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
595                                ram_addr_t offset)
596 {
597     size_t size, len;
598 
599     if (block == rs->last_sent_block) {
600         offset |= RAM_SAVE_FLAG_CONTINUE;
601     }
602     qemu_put_be64(f, offset);
603     size = 8;
604 
605     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
606         len = strlen(block->idstr);
607         qemu_put_byte(f, len);
608         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
609         size += 1 + len;
610         rs->last_sent_block = block;
611     }
612     return size;
613 }
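
/*
 * For reference, the wire layout written by save_page_header() (derived from
 * the code above, not an addition to the stream format):
 *
 *     8 bytes   be64: page offset within the block, OR'ed with RAM_SAVE_FLAG_*
 *     1 byte    strlen(block->idstr)   \  only when RAM_SAVE_FLAG_CONTINUE is
 *     N bytes   block->idstr (no NUL)  /  clear, i.e. a newly named block
 */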
614 
615 /**
616  * mig_throttle_guest_down: throttle down the guest
617  *
618  * Reduce the amount of guest CPU execution to hopefully slow down memory
619  * writes. If guest dirty memory rate is reduced below the rate at
620  * which we can transfer pages to the destination then we should be
621  * able to complete migration. Some workloads dirty memory way too
622  * fast and will not effectively converge, even with auto-converge.
623  */
624 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
625                                     uint64_t bytes_dirty_threshold)
626 {
627     MigrationState *s = migrate_get_current();
628     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
629     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
630     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
631     int pct_max = s->parameters.max_cpu_throttle;
632 
633     uint64_t throttle_now = cpu_throttle_get_percentage();
634     uint64_t cpu_now, cpu_ideal, throttle_inc;
635 
636     /* We have not started throttling yet. Let's start it. */
637     if (!cpu_throttle_active()) {
638         cpu_throttle_set(pct_initial);
639     } else {
640         /* Throttling already on, just increase the rate */
641         if (!pct_tailslow) {
642             throttle_inc = pct_increment;
643         } else {
644             /* Compute the ideal CPU percentage used by the guest, which
645              * should bring the dirty rate down to the threshold. */
646             cpu_now = 100 - throttle_now;
647             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
648                         bytes_dirty_period);
649             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
650         }
651         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
652     }
653 }
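
/*
 * Worked example for the tailslow branch above (hypothetical numbers, not
 * part of the original file): with cpu_throttle_get_percentage() == 20,
 * cpu_now = 100 - 20 = 80.  If the guest dirtied 200 MB in a period where
 * only 100 MB could be transferred, cpu_ideal = 80 * (100 / 200) = 40 and
 * throttle_inc = MIN(80 - 40, pct_increment); with an increment of 10 the
 * new throttle becomes MIN(20 + 10, pct_max) = 30%.
 */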
654 
655 /**
656  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
657  *
658  * @rs: current RAM state
659  * @current_addr: address for the zero page
660  *
661  * Update the xbzrle cache to reflect a page that's been sent as all 0.
662  * The important thing is that a stale (not-yet-0'd) page be replaced
663  * by the new data.
664  * As a bonus, if the page wasn't in the cache it gets added so that
665  * when a small write is made into the 0'd page it gets XBZRLE sent.
666  */
667 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
668 {
669     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
670         return;
671     }
672 
673     /* We don't care if this fails to allocate a new cache page
674      * as long as it updated an old one */
675     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
676                  ram_counters.dirty_sync_count);
677 }
678 
679 #define ENCODING_FLAG_XBZRLE 0x1
680 
681 /**
682  * save_xbzrle_page: compress and send current page
683  *
684  * Returns: 1 means that we wrote the page
685  *          0 means that page is identical to the one already sent
686  *          -1 means that xbzrle would be longer than normal
687  *
688  * @rs: current RAM state
689  * @current_data: pointer to the address of the page contents
690  * @current_addr: addr of the page
691  * @block: block that contains the page we want to send
692  * @offset: offset inside the block for the page
693  * @last_stage: if we are at the completion stage
694  */
695 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
696                             ram_addr_t current_addr, RAMBlock *block,
697                             ram_addr_t offset, bool last_stage)
698 {
699     int encoded_len = 0, bytes_xbzrle;
700     uint8_t *prev_cached_page;
701 
702     if (!cache_is_cached(XBZRLE.cache, current_addr,
703                          ram_counters.dirty_sync_count)) {
704         xbzrle_counters.cache_miss++;
705         if (!last_stage) {
706             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
707                              ram_counters.dirty_sync_count) == -1) {
708                 return -1;
709             } else {
710                 /* update *current_data when the page has been
711                    inserted into cache */
712                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
713             }
714         }
715         return -1;
716     }
717 
718     /*
719      * Reaching here means the page has hit the xbzrle cache, no matter what
720      * encoding result it is (normal encoding, overflow or skipping the page),
721      * count the page as encoded. This is used to calculate the encoding rate.
722      *
723      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
724      * 2nd page turns out to be skipped (i.e. no new bytes written to the
725      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
726      * skipped page included. In this way, the encoding rate can tell if the
727      * guest page is good for xbzrle encoding.
728      */
729     xbzrle_counters.pages++;
730     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
731 
732     /* save current buffer into memory */
733     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
734 
735     /* XBZRLE encoding (if there is no overflow) */
736     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
737                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
738                                        TARGET_PAGE_SIZE);
739 
740     /*
741      * Update the cache contents, so that it corresponds to the data
742      * sent, in all cases except where we skip the page.
743      */
744     if (!last_stage && encoded_len != 0) {
745         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
746         /*
747          * In the case where we couldn't compress, ensure that the caller
748          * sends the data from the cache, since the guest might have
749          * changed the RAM since we copied it.
750          */
751         *current_data = prev_cached_page;
752     }
753 
754     if (encoded_len == 0) {
755         trace_save_xbzrle_page_skipping();
756         return 0;
757     } else if (encoded_len == -1) {
758         trace_save_xbzrle_page_overflow();
759         xbzrle_counters.overflow++;
760         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
761         return -1;
762     }
763 
764     /* Send XBZRLE based compressed page */
765     bytes_xbzrle = save_page_header(rs, rs->f, block,
766                                     offset | RAM_SAVE_FLAG_XBZRLE);
767     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
768     qemu_put_be16(rs->f, encoded_len);
769     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
770     bytes_xbzrle += encoded_len + 1 + 2;
771     /*
772      * Like compressed_size (please see update_compress_thread_counts),
773      * the xbzrle encoded bytes don't count the 8 byte header with
774      * RAM_SAVE_FLAG_CONTINUE.
775      */
776     xbzrle_counters.bytes += bytes_xbzrle - 8;
777     ram_counters.transferred += bytes_xbzrle;
778 
779     return 1;
780 }
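
/*
 * For reference, the record emitted above looks like this on the wire
 * (derived from the code, not an addition to the stream format):
 *
 *     save_page_header()   offset | RAM_SAVE_FLAG_XBZRLE (+ idstr if needed)
 *     1 byte               ENCODING_FLAG_XBZRLE
 *     2 bytes              be16 encoded_len
 *     encoded_len bytes    XBZRLE delta against the previously cached page
 */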
781 
782 /**
783  * migration_bitmap_find_dirty: find the next dirty page from start
784  *
785  * Returns the page offset within memory region of the start of a dirty page
786  *
787  * @rs: current RAM state
788  * @rb: RAMBlock where to search for dirty pages
789  * @start: page where we start the search
790  */
791 static inline
792 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
793                                           unsigned long start)
794 {
795     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
796     unsigned long *bitmap = rb->bmap;
797     unsigned long next;
798 
799     if (ramblock_is_ignored(rb)) {
800         return size;
801     }
802 
803     /*
804      * When the free page optimization is enabled, we need to check the bitmap
805      * to send the non-free pages rather than all the pages in the bulk stage.
806      */
807     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
808         next = start + 1;
809     } else {
810         next = find_next_bit(bitmap, size, start);
811     }
812 
813     return next;
814 }
815 
816 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
817                                                 RAMBlock *rb,
818                                                 unsigned long page)
819 {
820     bool ret;
821 
822     QEMU_LOCK_GUARD(&rs->bitmap_mutex);
823 
824     /*
825      * Clear dirty bitmap if needed.  This _must_ be called before we
826      * send any of the pages in the chunk, because we need to make sure
827      * we can capture further page content changes when we sync the
828      * dirty log the next time.  So as long as we are going to send any
829      * page in the chunk we clear the remote dirty bitmap for all of them.
830      * Clearing it earlier is not a problem, but clearing it too late is.
831      */
832     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
833         uint8_t shift = rb->clear_bmap_shift;
834         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
835         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
836 
837         /*
838          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It
839          * can make things easier since the start address of the small
840          * chunk will always be aligned to 64 pages, so the bitmap will
841          * always be aligned to unsigned long.  We should even be able
842          * to remove this restriction, but it is simply kept here for
843          * now.
844          */
845         assert(shift >= 6);
846         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
847         memory_region_clear_dirty_bitmap(rb->mr, start, size);
848     }
849 
850     ret = test_and_clear_bit(page, rb->bmap);
851 
852     if (ret) {
853         rs->migration_dirty_pages--;
854     }
855 
856     return ret;
857 }
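
/*
 * Worked example for the clear_bmap chunk handling above (hypothetical
 * values, not part of the original file): with 4 KiB target pages
 * (TARGET_PAGE_BITS = 12) and clear_bmap_shift = 18, size is
 * 1ULL << (12 + 18) = 1 GiB, so one clear_bmap bit covers a 1 GiB aligned
 * chunk and `start` is the page address rounded down to that boundary.  The
 * minimum shift of 6 gives 1ULL << 18 = 256 KiB, i.e. 64 pages, which is why
 * the assertion above keeps the bitmap aligned to unsigned long.
 */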
858 
859 /* Called with RCU critical section */
860 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
861 {
862     uint64_t new_dirty_pages =
863         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
864 
865     rs->migration_dirty_pages += new_dirty_pages;
866     rs->num_dirty_pages_period += new_dirty_pages;
867 }
868 
869 /**
870  * ram_pagesize_summary: calculate all the pagesizes of a VM
871  *
872  * Returns a summary bitmap of the page sizes of all RAMBlocks
873  *
874  * For VMs with just normal pages this is equivalent to the host page
875  * size. If it's got some huge pages then it's the OR of all the
876  * different page sizes.
877  */
878 uint64_t ram_pagesize_summary(void)
879 {
880     RAMBlock *block;
881     uint64_t summary = 0;
882 
883     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
884         summary |= block->page_size;
885     }
886 
887     return summary;
888 }
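
/*
 * Example (hypothetical, not part of the original file): a guest whose
 * RAMBlocks use 4 KiB pages (0x1000) plus one block backed by 2 MiB huge
 * pages (0x200000) yields a summary of 0x1000 | 0x200000 = 0x201000.
 */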
889 
890 uint64_t ram_get_total_transferred_pages(void)
891 {
892     return  ram_counters.normal + ram_counters.duplicate +
893                 compression_counters.pages + xbzrle_counters.pages;
894 }
895 
896 static void migration_update_rates(RAMState *rs, int64_t end_time)
897 {
898     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
899     double compressed_size;
900 
901     /* calculate period counters */
902     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
903                 / (end_time - rs->time_last_bitmap_sync);
904 
905     if (!page_count) {
906         return;
907     }
908 
909     if (migrate_use_xbzrle()) {
910         double encoded_size, unencoded_size;
911 
912         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
913             rs->xbzrle_cache_miss_prev) / page_count;
914         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
915         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
916                          TARGET_PAGE_SIZE;
917         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
918         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
919             xbzrle_counters.encoding_rate = 0;
920         } else {
921             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
922         }
923         rs->xbzrle_pages_prev = xbzrle_counters.pages;
924         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
925     }
926 
927     if (migrate_use_compression()) {
928         compression_counters.busy_rate = (double)(compression_counters.busy -
929             rs->compress_thread_busy_prev) / page_count;
930         rs->compress_thread_busy_prev = compression_counters.busy;
931 
932         compressed_size = compression_counters.compressed_size -
933                           rs->compressed_size_prev;
934         if (compressed_size) {
935             double uncompressed_size = (compression_counters.pages -
936                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
937 
938             /* Compression-Ratio = Uncompressed-size / Compressed-size */
939             compression_counters.compression_rate =
940                                         uncompressed_size / compressed_size;
941 
942             rs->compress_pages_prev = compression_counters.pages;
943             rs->compressed_size_prev = compression_counters.compressed_size;
944         }
945     }
946 }
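
/*
 * Worked example for the period rates above (hypothetical numbers, not part
 * of the original file): if 100 target pages were handled this period, 30 of
 * them missed the XBZRLE cache, and the 70 cache hits produced 70 KiB of
 * encoded bytes for 280 KiB of guest data (4 KiB pages), then
 * cache_miss_rate = 30 / 100 = 0.3 and encoding_rate = 280 / 70 = 4.
 */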
947 
948 static void migration_trigger_throttle(RAMState *rs)
949 {
950     MigrationState *s = migrate_get_current();
951     uint64_t threshold = s->parameters.throttle_trigger_threshold;
952 
953     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
954     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
955     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
956 
957     /* During block migration the auto-converge logic incorrectly detects
958      * that ram migration makes no progress. Avoid this by disabling the
959      * throttling logic during the bulk phase of block migration. */
960     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
961         /* The following detection logic can be refined later. For now:
962            Check to see if the ratio between dirtied bytes and the approx.
963            amount of bytes that just got transferred since the last time
964            we were in this routine reaches the threshold. If that happens
965            twice, start or increase throttling. */
966 
967         if ((bytes_dirty_period > bytes_dirty_threshold) &&
968             (++rs->dirty_rate_high_cnt >= 2)) {
969             trace_migration_throttle();
970             rs->dirty_rate_high_cnt = 0;
971             mig_throttle_guest_down(bytes_dirty_period,
972                                     bytes_dirty_threshold);
973         }
974     }
975 }
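
/*
 * Worked example (hypothetical numbers, not part of the original file): with
 * throttle_trigger_threshold = 50 and 100 MiB transferred during the period,
 * bytes_dirty_threshold = 100 MiB * 50 / 100 = 50 MiB.  If the guest dirtied
 * 60 MiB in the same period the high-dirty-rate counter is bumped, and the
 * second such period triggers mig_throttle_guest_down().
 */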
976 
977 static void migration_bitmap_sync(RAMState *rs)
978 {
979     RAMBlock *block;
980     int64_t end_time;
981 
982     ram_counters.dirty_sync_count++;
983 
984     if (!rs->time_last_bitmap_sync) {
985         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
986     }
987 
988     trace_migration_bitmap_sync_start();
989     memory_global_dirty_log_sync();
990 
991     qemu_mutex_lock(&rs->bitmap_mutex);
992     WITH_RCU_READ_LOCK_GUARD() {
993         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
994             ramblock_sync_dirty_bitmap(rs, block);
995         }
996         ram_counters.remaining = ram_bytes_remaining();
997     }
998     qemu_mutex_unlock(&rs->bitmap_mutex);
999 
1000     memory_global_after_dirty_log_sync();
1001     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1002 
1003     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1004 
1005     /* more than 1 second = 1000 milliseconds */
1006     if (end_time > rs->time_last_bitmap_sync + 1000) {
1007         migration_trigger_throttle(rs);
1008 
1009         migration_update_rates(rs, end_time);
1010 
1011         rs->target_page_count_prev = rs->target_page_count;
1012 
1013         /* reset period counters */
1014         rs->time_last_bitmap_sync = end_time;
1015         rs->num_dirty_pages_period = 0;
1016         rs->bytes_xfer_prev = ram_counters.transferred;
1017     }
1018     if (migrate_use_events()) {
1019         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1020     }
1021 }
1022 
1023 static void migration_bitmap_sync_precopy(RAMState *rs)
1024 {
1025     Error *local_err = NULL;
1026 
1027     /*
1028      * The current notifier usage is just an optimization for migration, so we
1029      * don't stop the normal migration process in the error case.
1030      */
1031     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1032         error_report_err(local_err);
1033         local_err = NULL;
1034     }
1035 
1036     migration_bitmap_sync(rs);
1037 
1038     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1039         error_report_err(local_err);
1040     }
1041 }
1042 
1043 /**
1044  * save_zero_page_to_file: send the zero page to the file
1045  *
1046  * Returns the size of data written to the file, 0 means the page is not
1047  * a zero page
1048  *
1049  * @rs: current RAM state
1050  * @file: the file where the data is saved
1051  * @block: block that contains the page we want to send
1052  * @offset: offset inside the block for the page
1053  */
1054 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1055                                   RAMBlock *block, ram_addr_t offset)
1056 {
1057     uint8_t *p = block->host + offset;
1058     int len = 0;
1059 
1060     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1061         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1062         qemu_put_byte(file, 0);
1063         len += 1;
1064     }
1065     return len;
1066 }
1067 
1068 /**
1069  * save_zero_page: send the zero page to the stream
1070  *
1071  * Returns the number of pages written.
1072  *
1073  * @rs: current RAM state
1074  * @block: block that contains the page we want to send
1075  * @offset: offset inside the block for the page
1076  */
1077 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1078 {
1079     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1080 
1081     if (len) {
1082         ram_counters.duplicate++;
1083         ram_counters.transferred += len;
1084         return 1;
1085     }
1086     return -1;
1087 }
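
/*
 * For reference, a zero page costs only the header plus one byte on the wire
 * (derived from the code above, not an addition to the stream format):
 *
 *     save_page_header()   offset | RAM_SAVE_FLAG_ZERO (+ idstr if needed)
 *     1 byte               0 (the fill byte written by qemu_put_byte above)
 */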
1088 
1089 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1090 {
1091     if (!migrate_release_ram() || !migration_in_postcopy()) {
1092         return;
1093     }
1094 
1095     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1096 }
1097 
1098 /*
1099  * @pages: the number of pages written by the control path,
1100  *        < 0 - error
1101  *        > 0 - number of pages written
1102  *
1103  * Return true if the page has been saved, otherwise return false.
1104  */
1105 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1106                               int *pages)
1107 {
1108     uint64_t bytes_xmit = 0;
1109     int ret;
1110 
1111     *pages = -1;
1112     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1113                                 &bytes_xmit);
1114     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1115         return false;
1116     }
1117 
1118     if (bytes_xmit) {
1119         ram_counters.transferred += bytes_xmit;
1120         *pages = 1;
1121     }
1122 
1123     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1124         return true;
1125     }
1126 
1127     if (bytes_xmit > 0) {
1128         ram_counters.normal++;
1129     } else if (bytes_xmit == 0) {
1130         ram_counters.duplicate++;
1131     }
1132 
1133     return true;
1134 }
1135 
1136 /*
1137  * directly send the page to the stream
1138  *
1139  * Returns the number of pages written.
1140  *
1141  * @rs: current RAM state
1142  * @block: block that contains the page we want to send
1143  * @offset: offset inside the block for the page
1144  * @buf: the page to be sent
1145  * @async: send the page asynchronously
1146  */
1147 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1148                             uint8_t *buf, bool async)
1149 {
1150     ram_counters.transferred += save_page_header(rs, rs->f, block,
1151                                                  offset | RAM_SAVE_FLAG_PAGE);
1152     if (async) {
1153         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1154                               migrate_release_ram() &
1155                               migration_in_postcopy());
1156     } else {
1157         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1158     }
1159     ram_counters.transferred += TARGET_PAGE_SIZE;
1160     ram_counters.normal++;
1161     return 1;
1162 }
1163 
1164 /**
1165  * ram_save_page: send the given page to the stream
1166  *
1167  * Returns the number of pages written.
1168  *          < 0 - error
1169  *          >=0 - Number of pages written - this might legally be 0
1170  *                if xbzrle noticed the page was the same.
1171  *
1172  * @rs: current RAM state
1173  * @pss: data about the state of the current dirty page scan,
1174  *       i.e. the block and page offset we want to send
1175  * @last_stage: if we are at the completion stage
1176  */
1177 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1178 {
1179     int pages = -1;
1180     uint8_t *p;
1181     bool send_async = true;
1182     RAMBlock *block = pss->block;
1183     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1184     ram_addr_t current_addr = block->offset + offset;
1185 
1186     p = block->host + offset;
1187     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1188 
1189     XBZRLE_cache_lock();
1190     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1191         migrate_use_xbzrle()) {
1192         pages = save_xbzrle_page(rs, &p, current_addr, block,
1193                                  offset, last_stage);
1194         if (!last_stage) {
1195             /* Can't send this cached data async, since the cache page
1196              * might get updated before it gets to the wire
1197              */
1198             send_async = false;
1199         }
1200     }
1201 
1202     /* XBZRLE overflow or normal page */
1203     if (pages == -1) {
1204         pages = save_normal_page(rs, block, offset, p, send_async);
1205     }
1206 
1207     XBZRLE_cache_unlock();
1208 
1209     return pages;
1210 }
1211 
1212 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1213                                  ram_addr_t offset)
1214 {
1215     if (multifd_queue_page(rs->f, block, offset) < 0) {
1216         return -1;
1217     }
1218     ram_counters.normal++;
1219 
1220     return 1;
1221 }
1222 
1223 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1224                                  ram_addr_t offset, uint8_t *source_buf)
1225 {
1226     RAMState *rs = ram_state;
1227     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1228     bool zero_page = false;
1229     int ret;
1230 
1231     if (save_zero_page_to_file(rs, f, block, offset)) {
1232         zero_page = true;
1233         goto exit;
1234     }
1235 
1236     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1237 
1238     /*
1239      * copy it to an internal buffer to avoid it being modified by the VM,
1240      * so that we can catch any error during compression and
1241      * decompression
1242      */
1243     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1244     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1245     if (ret < 0) {
1246         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1247         error_report("compressed data failed!");
1248         return false;
1249     }
1250 
1251 exit:
1252     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1253     return zero_page;
1254 }
1255 
1256 static void
1257 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1258 {
1259     ram_counters.transferred += bytes_xmit;
1260 
1261     if (param->zero_page) {
1262         ram_counters.duplicate++;
1263         return;
1264     }
1265 
1266     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1267     compression_counters.compressed_size += bytes_xmit - 8;
1268     compression_counters.pages++;
1269 }
1270 
1271 static bool save_page_use_compression(RAMState *rs);
1272 
1273 static void flush_compressed_data(RAMState *rs)
1274 {
1275     int idx, len, thread_count;
1276 
1277     if (!save_page_use_compression(rs)) {
1278         return;
1279     }
1280     thread_count = migrate_compress_threads();
1281 
1282     qemu_mutex_lock(&comp_done_lock);
1283     for (idx = 0; idx < thread_count; idx++) {
1284         while (!comp_param[idx].done) {
1285             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1286         }
1287     }
1288     qemu_mutex_unlock(&comp_done_lock);
1289 
1290     for (idx = 0; idx < thread_count; idx++) {
1291         qemu_mutex_lock(&comp_param[idx].mutex);
1292         if (!comp_param[idx].quit) {
1293             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1294             /*
1295              * it's safe to fetch zero_page without holding comp_done_lock
1296              * as there is no further request submitted to the thread,
1297              * i.e., the thread should be waiting for a request at this point.
1298              */
1299             update_compress_thread_counts(&comp_param[idx], len);
1300         }
1301         qemu_mutex_unlock(&comp_param[idx].mutex);
1302     }
1303 }
1304 
1305 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1306                                        ram_addr_t offset)
1307 {
1308     param->block = block;
1309     param->offset = offset;
1310 }
1311 
1312 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1313                                            ram_addr_t offset)
1314 {
1315     int idx, thread_count, bytes_xmit = -1, pages = -1;
1316     bool wait = migrate_compress_wait_thread();
1317 
1318     thread_count = migrate_compress_threads();
1319     qemu_mutex_lock(&comp_done_lock);
1320 retry:
1321     for (idx = 0; idx < thread_count; idx++) {
1322         if (comp_param[idx].done) {
1323             comp_param[idx].done = false;
1324             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1325             qemu_mutex_lock(&comp_param[idx].mutex);
1326             set_compress_params(&comp_param[idx], block, offset);
1327             qemu_cond_signal(&comp_param[idx].cond);
1328             qemu_mutex_unlock(&comp_param[idx].mutex);
1329             pages = 1;
1330             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1331             break;
1332         }
1333     }
1334 
1335     /*
1336      * wait for the free thread if the user specifies 'compress-wait-thread',
1337      * otherwise we will post the page out in the main thread as a normal page.
1338      */
1339     if (pages < 0 && wait) {
1340         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1341         goto retry;
1342     }
1343     qemu_mutex_unlock(&comp_done_lock);
1344 
1345     return pages;
1346 }
1347 
1348 /**
1349  * find_dirty_block: find the next dirty page and update any state
1350  * associated with the search process.
1351  *
1352  * Returns true if a page is found
1353  *
1354  * @rs: current RAM state
1355  * @pss: data about the state of the current dirty page scan
1356  * @again: set to false if the search has scanned the whole of RAM
1357  */
1358 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1359 {
1360     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1361     if (pss->complete_round && pss->block == rs->last_seen_block &&
1362         pss->page >= rs->last_page) {
1363         /*
1364          * We've been once around the RAM and haven't found anything.
1365          * Give up.
1366          */
1367         *again = false;
1368         return false;
1369     }
1370     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1371         >= pss->block->used_length) {
1372         /* Didn't find anything in this RAM Block */
1373         pss->page = 0;
1374         pss->block = QLIST_NEXT_RCU(pss->block, next);
1375         if (!pss->block) {
1376             /*
1377              * If memory migration starts over, we will meet a dirtied page
1378              * which may still exist in the compression threads' ring, so we
1379              * should flush the compressed data to make sure the new page
1380              * is not overwritten by the old one in the destination.
1381              *
1382              * Also, if xbzrle is on, stop using the data compression at this
1383              * point. In theory, xbzrle can do better than compression.
1384              */
1385             flush_compressed_data(rs);
1386 
1387             /* Hit the end of the list */
1388             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1389             /* Flag that we've looped */
1390             pss->complete_round = true;
1391             rs->ram_bulk_stage = false;
1392         }
1393         /* Didn't find anything this time, but try again on the new block */
1394         *again = true;
1395         return false;
1396     } else {
1397         /* Can go around again, but... */
1398         *again = true;
1399         /* We've found something so probably don't need to */
1400         return true;
1401     }
1402 }
1403 
1404 /**
1405  * unqueue_page: gets a page of the queue
1406  *
1407  * Helper for 'get_queued_page' - gets a page off the queue
1408  *
1409  * Returns the block of the page (or NULL if none available)
1410  *
1411  * @rs: current RAM state
1412  * @offset: used to return the offset within the RAMBlock
1413  */
1414 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1415 {
1416     RAMBlock *block = NULL;
1417 
1418     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1419         return NULL;
1420     }
1421 
1422     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1423     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1424         struct RAMSrcPageRequest *entry =
1425                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1426         block = entry->rb;
1427         *offset = entry->offset;
1428 
1429         if (entry->len > TARGET_PAGE_SIZE) {
1430             entry->len -= TARGET_PAGE_SIZE;
1431             entry->offset += TARGET_PAGE_SIZE;
1432         } else {
1433             memory_region_unref(block->mr);
1434             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435             g_free(entry);
1436             migration_consume_urgent_request();
1437         }
1438     }
1439 
1440     return block;
1441 }
1442 
1443 #if defined(__linux__)
1444 /**
1445  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1446  *   is found, return RAM block pointer and page offset
1447  *
1448  * Returns pointer to the RAMBlock containing faulting page,
1449  *   NULL if no write faults are pending
1450  *
1451  * @rs: current RAM state
1452  * @offset: page offset from the beginning of the block
1453  */
1454 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1455 {
1456     struct uffd_msg uffd_msg;
1457     void *page_address;
1458     RAMBlock *bs;
1459     int res;
1460 
1461     if (!migrate_background_snapshot()) {
1462         return NULL;
1463     }
1464 
1465     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1466     if (res <= 0) {
1467         return NULL;
1468     }
1469 
1470     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1471     bs = qemu_ram_block_from_host(page_address, false, offset);
1472     assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
1473     return bs;
1474 }
1475 
1476 /**
1477  * ram_save_release_protection: release UFFD write protection after
1478  *   a range of pages has been saved
1479  *
1480  * @rs: current RAM state
1481  * @pss: page-search-status structure
1482  * @start_page: index of the first page in the range relative to pss->block
1483  *
1484  * Returns 0 on success, negative value in case of an error
1485 */
1486 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1487         unsigned long start_page)
1488 {
1489     int res = 0;
1490 
1491     /* Check if page is from UFFD-managed region. */
1492     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1493         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1494         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1495 
1496         /* Flush async buffers before un-protect. */
1497         qemu_fflush(rs->f);
1498         /* Un-protect memory range. */
1499         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1500                 false, false);
1501     }
1502 
1503     return res;
1504 }
1505 
1506 /* ram_write_tracking_available: check if kernel supports required UFFD features
1507  *
1508  * Returns true if the kernel supports them, false otherwise
1509  */
1510 bool ram_write_tracking_available(void)
1511 {
1512     uint64_t uffd_features;
1513     int res;
1514 
1515     res = uffd_query_features(&uffd_features);
1516     return (res == 0 &&
1517             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1518 }
1519 
1520 /* ram_write_tracking_compatible: check if guest configuration is
1521  *   compatible with 'write-tracking'
1522  *
1523  * Returns true if compatible, false otherwise
1524  */
1525 bool ram_write_tracking_compatible(void)
1526 {
1527     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1528     int uffd_fd;
1529     RAMBlock *bs;
1530     bool ret = false;
1531 
1532     /* Open UFFD file descriptor */
1533     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1534     if (uffd_fd < 0) {
1535         return false;
1536     }
1537 
1538     RCU_READ_LOCK_GUARD();
1539 
1540     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1541         uint64_t uffd_ioctls;
1542 
1543         /* Nothing to do for read-only and MMIO-writable (ROM device) regions */
1544         if (bs->mr->readonly || bs->mr->rom_device) {
1545             continue;
1546         }
1547         /* Try to register block memory via UFFD-IO to track writes */
1548         if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
1549                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1550             goto out;
1551         }
1552         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1553             goto out;
1554         }
1555     }
1556     ret = true;
1557 
1558 out:
1559     uffd_close_fd(uffd_fd);
1560     return ret;
1561 }
1562 
1563 /*
1564  * ram_write_tracking_start: start UFFD-WP memory tracking
1565  *
1566  * Returns 0 for success or negative value in case of error
1567  */
1568 int ram_write_tracking_start(void)
1569 {
1570     int uffd_fd;
1571     RAMState *rs = ram_state;
1572     RAMBlock *bs;
1573 
1574     /* Open UFFD file descriptor */
1575     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1576     if (uffd_fd < 0) {
1577         return uffd_fd;
1578     }
1579     rs->uffdio_fd = uffd_fd;
1580 
1581     RCU_READ_LOCK_GUARD();
1582 
1583     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1584         /* Nothing to do for read-only and MMIO-writable (ROM device) regions */
1585         if (bs->mr->readonly || bs->mr->rom_device) {
1586             continue;
1587         }
1588 
1589         /* Register block memory with UFFD to track writes */
1590         if (uffd_register_memory(rs->uffdio_fd, bs->host,
1591                 bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1592             goto fail;
1593         }
1594         /* Apply UFFD write protection to the block memory range */
1595         if (uffd_change_protection(rs->uffdio_fd, bs->host,
1596                 bs->max_length, true, false)) {
1597             goto fail;
1598         }
1599         bs->flags |= RAM_UF_WRITEPROTECT;
1600         memory_region_ref(bs->mr);
1601 
1602         trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
1603                 bs->host, bs->max_length);
1604     }
1605 
1606     return 0;
1607 
1608 fail:
1609     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1610 
1611     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1612         if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1613             continue;
1614         }
1615         /*
1616          * If some memory block failed to be write-protected, remove
1617          * protection from and unregister all RAM blocks that succeeded
1618          */
1619         uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1620         uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1621         /* Cleanup flags and remove reference */
1622         bs->flags &= ~RAM_UF_WRITEPROTECT;
1623         memory_region_unref(bs->mr);
1624     }
1625 
1626     uffd_close_fd(uffd_fd);
1627     rs->uffdio_fd = -1;
1628     return -1;
1629 }
1630 
1631 /**
1632  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1633  */
1634 void ram_write_tracking_stop(void)
1635 {
1636     RAMState *rs = ram_state;
1637     RAMBlock *bs;
1638 
1639     RCU_READ_LOCK_GUARD();
1640 
1641     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1642         if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1643             continue;
1644         }
1645         /* Remove protection and unregister all affected RAM blocks */
1646         uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1647         uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1648 
1649         trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
1650                 bs->host, bs->max_length);
1651 
1652         /* Cleanup flags and remove reference */
1653         bs->flags &= ~RAM_UF_WRITEPROTECT;
1654         memory_region_unref(bs->mr);
1655     }
1656 
1657     /* Finally close UFFD file descriptor */
1658     uffd_close_fd(rs->uffdio_fd);
1659     rs->uffdio_fd = -1;
1660 }
1661 
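
/*
 * Illustrative sketch (added for this revision, not part of the original
 * code): how a caller might drive the UFFD-WP write-tracking API above for
 * a background snapshot.  example_background_snapshot_ram() is a
 * hypothetical name.
 */
#if 0
static int example_background_snapshot_ram(void)
{
    int ret;

    /* The kernel must expose UFFD_FEATURE_PAGEFAULT_FLAG_WP... */
    if (!ram_write_tracking_available()) {
        return -ENOSYS;
    }
    /* ...and every migratable RAMBlock must accept UFFDIO_WRITEPROTECT */
    if (!ram_write_tracking_compatible()) {
        return -EINVAL;
    }

    /* Write-protect all RAMBlocks and start tracking faults */
    ret = ram_write_tracking_start();
    if (ret < 0) {
        return ret;
    }

    /* ... save RAM while writes are intercepted via poll_fault_page() ... */

    /* Drop protection and close the UFFD file descriptor */
    ram_write_tracking_stop();
    return 0;
}
#endif
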
1662 #else
1663 /* No target OS support, stubs just fail or ignore */
1664 
1665 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1666 {
1667     (void) rs;
1668     (void) offset;
1669 
1670     return NULL;
1671 }
1672 
1673 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1674         unsigned long start_page)
1675 {
1676     (void) rs;
1677     (void) pss;
1678     (void) start_page;
1679 
1680     return 0;
1681 }
1682 
1683 bool ram_write_tracking_available(void)
1684 {
1685     return false;
1686 }
1687 
1688 bool ram_write_tracking_compatible(void)
1689 {
1690     assert(0);
1691     return false;
1692 }
1693 
1694 int ram_write_tracking_start(void)
1695 {
1696     assert(0);
1697     return -1;
1698 }
1699 
1700 void ram_write_tracking_stop(void)
1701 {
1702     assert(0);
1703 }
1704 #endif /* defined(__linux__) */
1705 
1706 /**
1707  * get_queued_page: unqueue a page from the postcopy requests
1708  *
1709  * Skips pages that have already been sent (!dirty)
1710  *
1711  * Returns true if a queued page is found
1712  *
1713  * @rs: current RAM state
1714  * @pss: data about the state of the current dirty page scan
1715  */
1716 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1717 {
1718     RAMBlock  *block;
1719     ram_addr_t offset;
1720     bool dirty;
1721 
1722     do {
1723         block = unqueue_page(rs, &offset);
1724         /*
1725          * We're sending this page, and since it's postcopy nothing else
1726          * will dirty it, and we must make sure it doesn't get sent again
1727          * even if this queue request was received after the background
1728          * search already sent it.
1729          */
1730         if (block) {
1731             unsigned long page;
1732 
1733             page = offset >> TARGET_PAGE_BITS;
1734             dirty = test_bit(page, block->bmap);
1735             if (!dirty) {
1736                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1737                                                 page);
1738             } else {
1739                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1740             }
1741         }
1742 
1743     } while (block && !dirty);
1744 
1745     if (!block) {
1746         /*
1747          * Poll write faults too if background snapshot is enabled; that's
1748          * when we may have vCPUs blocked by write-protected pages.
1749          */
1750         block = poll_fault_page(rs, &offset);
1751     }
1752 
1753     if (block) {
1754         /*
1755          * As soon as we start servicing pages out of order, we have
1756          * to kill the bulk stage, since the bulk stage assumes
1757          * (in migration_bitmap_find_and_reset_dirty) that every page is
1758          * dirty, which is no longer true.
1759          */
1760         rs->ram_bulk_stage = false;
1761 
1762         /*
1763          * We want the background search to continue from the queued page
1764          * since the guest is likely to want other pages near to the page
1765          * it just requested.
1766          */
1767         pss->block = block;
1768         pss->page = offset >> TARGET_PAGE_BITS;
1769 
1770         /*
1771          * This unqueued page would break the "one round" check, even if
1772          * it is really rare.
1773          */
1774         pss->complete_round = false;
1775     }
1776 
1777     return !!block;
1778 }
1779 
1780 /**
1781  * migration_page_queue_free: drop any remaining pages in the ram
1782  * request queue
1783  *
1784  * It should be empty at the end anyway, but in error cases there may
1785  * be some left.  If any pages are left, we drop them.
1786  *
1787  */
1788 static void migration_page_queue_free(RAMState *rs)
1789 {
1790     struct RAMSrcPageRequest *mspr, *next_mspr;
1791     /* This queue should generally be empty - but a failed
1792      * migration might leave some entries behind.
1793      */
1794     RCU_READ_LOCK_GUARD();
1795     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1796         memory_region_unref(mspr->rb->mr);
1797         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1798         g_free(mspr);
1799     }
1800 }
1801 
1802 /**
1803  * ram_save_queue_pages: queue the page for transmission
1804  *
1805  * A request from postcopy destination for example.
1806  *
1807  * Returns zero on success or negative on error
1808  *
1809  * @rbname: Name of the RAMBlock of the request. NULL means the
1810  *          same as the last one.
1811  * @start: starting address from the start of the RAMBlock
1812  * @len: length (in bytes) to send
1813  */
1814 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1815 {
1816     RAMBlock *ramblock;
1817     RAMState *rs = ram_state;
1818 
1819     ram_counters.postcopy_requests++;
1820     RCU_READ_LOCK_GUARD();
1821 
1822     if (!rbname) {
1823         /* Reuse last RAMBlock */
1824         ramblock = rs->last_req_rb;
1825 
1826         if (!ramblock) {
1827             /*
1828              * Shouldn't happen, we can't reuse the last RAMBlock if
1829              * it's the 1st request.
1830              */
1831             error_report("ram_save_queue_pages no previous block");
1832             return -1;
1833         }
1834     } else {
1835         ramblock = qemu_ram_block_by_name(rbname);
1836 
1837         if (!ramblock) {
1838             /* We shouldn't be asked for a non-existent RAMBlock */
1839             error_report("ram_save_queue_pages no block '%s'", rbname);
1840             return -1;
1841         }
1842         rs->last_req_rb = ramblock;
1843     }
1844     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1845     if (start + len > ramblock->used_length) {
1846         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1847                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1848                      __func__, start, len, ramblock->used_length);
1849         return -1;
1850     }
1851 
1852     struct RAMSrcPageRequest *new_entry =
1853         g_malloc0(sizeof(struct RAMSrcPageRequest));
1854     new_entry->rb = ramblock;
1855     new_entry->offset = start;
1856     new_entry->len = len;
1857 
1858     memory_region_ref(ramblock->mr);
1859     qemu_mutex_lock(&rs->src_page_req_mutex);
1860     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1861     migration_make_urgent_request();
1862     qemu_mutex_unlock(&rs->src_page_req_mutex);
1863 
1864     return 0;
1865 }
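
/*
 * Illustrative sketch (added for this revision, not part of the original
 * code): a postcopy destination page request being queued.  The block name
 * "pc.ram" and the offsets are made-up values.
 */
#if 0
static void example_queue_urgent_pages(void)
{
    /* Ask for one target page at offset 0x200000 of block "pc.ram" */
    if (ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
        error_report("failed to queue postcopy page request");
    }

    /* A NULL block name reuses rs->last_req_rb, i.e. the same block */
    ram_save_queue_pages(NULL, 0x201000, TARGET_PAGE_SIZE);
}
#endif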
1866 
1867 static bool save_page_use_compression(RAMState *rs)
1868 {
1869     if (!migrate_use_compression()) {
1870         return false;
1871     }
1872 
1873     /*
1874      * If xbzrle is on, stop using the data compression after the first
1875      * round of migration even if compression is enabled. In theory,
1876      * xbzrle can do better than compression.
1877      */
1878     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1879         return true;
1880     }
1881 
1882     return false;
1883 }
1884 
1885 /*
1886  * Try to compress the page before posting it out; return true if the page
1887  * has been properly handled by compression, otherwise it needs other
1888  * paths to handle it
1889  */
1890 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1891 {
1892     if (!save_page_use_compression(rs)) {
1893         return false;
1894     }
1895 
1896     /*
1897      * When starting the process of a new block, the first page of
1898      * the block should be sent out before other pages in the same
1899      * block, and all the pages in the last block should have been sent
1900      * out.  Keeping this order is important, because the 'cont' flag
1901      * is used to avoid resending the block name.
1902      *
1903      * We post the first page as a normal page as compression will take
1904      * a lot of CPU resources.
1905      */
1906     if (block != rs->last_sent_block) {
1907         flush_compressed_data(rs);
1908         return false;
1909     }
1910 
1911     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1912         return true;
1913     }
1914 
1915     compression_counters.busy++;
1916     return false;
1917 }
1918 
1919 /**
1920  * ram_save_target_page: save one target page
1921  *
1922  * Returns the number of pages written
1923  *
1924  * @rs: current RAM state
1925  * @pss: data about the page we want to send
1926  * @last_stage: if we are at the completion stage
1927  */
1928 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1929                                 bool last_stage)
1930 {
1931     RAMBlock *block = pss->block;
1932     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1933     int res;
1934 
1935     if (control_save_page(rs, block, offset, &res)) {
1936         return res;
1937     }
1938 
1939     if (save_compress_page(rs, block, offset)) {
1940         return 1;
1941     }
1942 
1943     res = save_zero_page(rs, block, offset);
1944     if (res > 0) {
1945         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1946          * page would be stale
1947          */
1948         if (!save_page_use_compression(rs)) {
1949             XBZRLE_cache_lock();
1950             xbzrle_cache_zero_page(rs, block->offset + offset);
1951             XBZRLE_cache_unlock();
1952         }
1953         ram_release_pages(block->idstr, offset, res);
1954         return res;
1955     }
1956 
1957     /*
1958      * Do not use multifd for:
1959      * 1. Compression, as the first page in a new block should be posted out
1960      *    before sending the compressed page
1961      * 2. Postcopy, as one whole host page should be placed
1962      */
1963     if (!save_page_use_compression(rs) && migrate_use_multifd()
1964         && !migration_in_postcopy()) {
1965         return ram_save_multifd_page(rs, block, offset);
1966     }
1967 
1968     return ram_save_page(rs, pss, last_stage);
1969 }
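
/*
 * Note added for this revision: the fall-through order above is
 *
 *   control_save_page() -> save_compress_page() -> save_zero_page()
 *   -> ram_save_multifd_page() -> ram_save_page()
 *
 * and each stage returns as soon as it has fully handled the page.
 */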
1970 
1971 /**
1972  * ram_save_host_page: save a whole host page
1973  *
1974  * Starting at pss->page, send pages up to the end of the current host
1975  * page. It's valid for the initial offset to point into the middle of
1976  * a host page, in which case the remainder of the host page is sent.
1977  * Only dirty target pages are sent. Note that the host page size may
1978  * be a huge page for this block.
1979  * The saving stops at the boundary of the used_length of the block
1980  * if the RAMBlock isn't a multiple of the host page size.
1981  *
1982  * Returns the number of pages written or negative on error
1983  *
1984  * @rs: current RAM state
1986  * @pss: data about the page we want to send
1987  * @last_stage: if we are at the completion stage
1988  */
1989 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1990                               bool last_stage)
1991 {
1992     int tmppages, pages = 0;
1993     size_t pagesize_bits =
1994         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1995     unsigned long start_page = pss->page;
1996     int res;
1997 
1998     if (ramblock_is_ignored(pss->block)) {
1999         error_report("block %s should not be migrated !", pss->block->idstr);
2000         return 0;
2001     }
2002 
2003     do {
2004         /* Check if the page is dirty and if so, send it */
2005         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2006             pss->page++;
2007             continue;
2008         }
2009 
2010         tmppages = ram_save_target_page(rs, pss, last_stage);
2011         if (tmppages < 0) {
2012             return tmppages;
2013         }
2014 
2015         pages += tmppages;
2016         pss->page++;
2017         /* Allow rate limiting to happen in the middle of huge pages */
2018         migration_rate_limit();
2019     } while ((pss->page & (pagesize_bits - 1)) &&
2020              offset_in_ramblock(pss->block,
2021                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2022     /* The offset we leave with is the last one we looked at */
2023     pss->page--;
2024 
2025     res = ram_save_release_protection(rs, pss, start_page);
2026     return (res < 0 ? res : pages);
2027 }
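
/*
 * Worked example added for this revision: ram_save_host_page() walks
 * pagesize_bits target pages per host page.  With a 2 MiB hugetlbfs-backed
 * RAMBlock and 4 KiB target pages:
 *
 *   pagesize_bits = qemu_ram_pagesize(block) >> TARGET_PAGE_BITS
 *                 = 2 MiB / 4 KiB = 512
 *
 * so up to 512 dirty target pages are sent before the loop stops, i.e.
 * when (pss->page & (pagesize_bits - 1)) becomes 0 at the host-page
 * boundary.
 */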
2028 
2029 /**
2030  * ram_find_and_save_block: finds a dirty page and sends it to f
2031  *
2032  * Called within an RCU critical section.
2033  *
2034  * Returns the number of pages written where zero means no dirty pages,
2035  * or negative on error
2036  *
2037  * @rs: current RAM state
2038  * @last_stage: if we are at the completion stage
2039  *
2040  * On systems where host-page-size > target-page-size it will send all the
2041  * pages in a host page that are dirty.
2042  */
2043 
2044 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2045 {
2046     PageSearchStatus pss;
2047     int pages = 0;
2048     bool again, found;
2049 
2050     /* No dirty page as there is zero RAM */
2051     if (!ram_bytes_total()) {
2052         return pages;
2053     }
2054 
2055     pss.block = rs->last_seen_block;
2056     pss.page = rs->last_page;
2057     pss.complete_round = false;
2058 
2059     if (!pss.block) {
2060         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2061     }
2062 
2063     do {
2064         again = true;
2065         found = get_queued_page(rs, &pss);
2066 
2067         if (!found) {
2068             /* priority queue empty, so just search for something dirty */
2069             found = find_dirty_block(rs, &pss, &again);
2070         }
2071 
2072         if (found) {
2073             pages = ram_save_host_page(rs, &pss, last_stage);
2074         }
2075     } while (!pages && again);
2076 
2077     rs->last_seen_block = pss.block;
2078     rs->last_page = pss.page;
2079 
2080     return pages;
2081 }
2082 
2083 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2084 {
2085     uint64_t pages = size / TARGET_PAGE_SIZE;
2086 
2087     if (zero) {
2088         ram_counters.duplicate += pages;
2089     } else {
2090         ram_counters.normal += pages;
2091         ram_counters.transferred += size;
2092         qemu_update_position(f, size);
2093     }
2094 }
2095 
2096 static uint64_t ram_bytes_total_common(bool count_ignored)
2097 {
2098     RAMBlock *block;
2099     uint64_t total = 0;
2100 
2101     RCU_READ_LOCK_GUARD();
2102 
2103     if (count_ignored) {
2104         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2105             total += block->used_length;
2106         }
2107     } else {
2108         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2109             total += block->used_length;
2110         }
2111     }
2112     return total;
2113 }
2114 
2115 uint64_t ram_bytes_total(void)
2116 {
2117     return ram_bytes_total_common(false);
2118 }
2119 
2120 static void xbzrle_load_setup(void)
2121 {
2122     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2123 }
2124 
2125 static void xbzrle_load_cleanup(void)
2126 {
2127     g_free(XBZRLE.decoded_buf);
2128     XBZRLE.decoded_buf = NULL;
2129 }
2130 
2131 static void ram_state_cleanup(RAMState **rsp)
2132 {
2133     if (*rsp) {
2134         migration_page_queue_free(*rsp);
2135         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2136         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2137         g_free(*rsp);
2138         *rsp = NULL;
2139     }
2140 }
2141 
2142 static void xbzrle_cleanup(void)
2143 {
2144     XBZRLE_cache_lock();
2145     if (XBZRLE.cache) {
2146         cache_fini(XBZRLE.cache);
2147         g_free(XBZRLE.encoded_buf);
2148         g_free(XBZRLE.current_buf);
2149         g_free(XBZRLE.zero_target_page);
2150         XBZRLE.cache = NULL;
2151         XBZRLE.encoded_buf = NULL;
2152         XBZRLE.current_buf = NULL;
2153         XBZRLE.zero_target_page = NULL;
2154     }
2155     XBZRLE_cache_unlock();
2156 }
2157 
2158 static void ram_save_cleanup(void *opaque)
2159 {
2160     RAMState **rsp = opaque;
2161     RAMBlock *block;
2162 
2163     /* We don't use dirty log with background snapshots */
2164     if (!migrate_background_snapshot()) {
2165         /* The caller holds the iothread lock or is in a BH, so there is
2166          * no write race against the migration bitmap
2167          */
2168         memory_global_dirty_log_stop();
2169     }
2170 
2171     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2172         g_free(block->clear_bmap);
2173         block->clear_bmap = NULL;
2174         g_free(block->bmap);
2175         block->bmap = NULL;
2176     }
2177 
2178     xbzrle_cleanup();
2179     compress_threads_save_cleanup();
2180     ram_state_cleanup(rsp);
2181 }
2182 
2183 static void ram_state_reset(RAMState *rs)
2184 {
2185     rs->last_seen_block = NULL;
2186     rs->last_sent_block = NULL;
2187     rs->last_page = 0;
2188     rs->last_version = ram_list.version;
2189     rs->ram_bulk_stage = true;
2190     rs->fpo_enabled = false;
2191 }
2192 
2193 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2194 
2195 /*
2196  * 'expected' is the value you expect the bitmap mostly to be full
2197  * of; it won't bother printing lines that are all this value.
2198  * If 'todump' is null the migration bitmap is dumped.
2199  */
2200 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2201                            unsigned long pages)
2202 {
2203     int64_t cur;
2204     int64_t linelen = 128;
2205     char linebuf[129];
2206 
2207     for (cur = 0; cur < pages; cur += linelen) {
2208         int64_t curb;
2209         bool found = false;
2210         /*
2211          * Last line; catch the case where the line length
2212          * is longer than remaining ram
2213          */
2214         if (cur + linelen > pages) {
2215             linelen = pages - cur;
2216         }
2217         for (curb = 0; curb < linelen; curb++) {
2218             bool thisbit = test_bit(cur + curb, todump);
2219             linebuf[curb] = thisbit ? '1' : '.';
2220             found = found || (thisbit != expected);
2221         }
2222         if (found) {
2223             linebuf[curb] = '\0';
2224             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2225         }
2226     }
2227 }
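
/*
 * Example output added for this revision (address and pattern are made up):
 * with expected == false, a line containing set bits is printed as
 *
 *   0x00000080 : ..11111111......................1...
 *
 * where '1' marks a set bit and '.' a clear one; lines consisting only of
 * the expected value are suppressed.
 */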
2228 
2229 /* **** functions for postcopy ***** */
2230 
2231 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2232 {
2233     struct RAMBlock *block;
2234 
2235     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2236         unsigned long *bitmap = block->bmap;
2237         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2238         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2239 
2240         while (run_start < range) {
2241             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2242             ram_discard_range(block->idstr,
2243                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2244                               ((ram_addr_t)(run_end - run_start))
2245                                 << TARGET_PAGE_BITS);
2246             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2247         }
2248     }
2249 }
2250 
2251 /**
2252  * postcopy_send_discard_bm_ram: discard a RAMBlock
2253  *
2254  * Returns zero on success
2255  *
2256  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2257  *
2258  * @ms: current migration state
2259  * @block: RAMBlock to discard
2260  */
2261 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2262 {
2263     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2264     unsigned long current;
2265     unsigned long *bitmap = block->bmap;
2266 
2267     for (current = 0; current < end; ) {
2268         unsigned long one = find_next_bit(bitmap, end, current);
2269         unsigned long zero, discard_length;
2270 
2271         if (one >= end) {
2272             break;
2273         }
2274 
2275         zero = find_next_zero_bit(bitmap, end, one + 1);
2276 
2277         if (zero >= end) {
2278             discard_length = end - one;
2279         } else {
2280             discard_length = zero - one;
2281         }
2282         postcopy_discard_send_range(ms, one, discard_length);
2283         current = one + discard_length;
2284     }
2285 
2286     return 0;
2287 }
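
/*
 * Worked example added for this revision (bit pattern made up): for a
 * bitmap whose low byte is 0b00111100, the loop above finds one = 2 and
 * zero = 6, sends a single discard covering 4 pages starting at page 2,
 * and resumes scanning at page 6.
 */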
2288 
2289 /**
2290  * postcopy_each_ram_send_discard: discard all RAMBlocks
2291  *
2292  * Returns 0 for success or negative for error
2293  *
2294  * Utility for the outgoing postcopy code.
2295  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2296  *   passing it bitmap indexes and name.
2297  * (qemu_ram_foreach_block ends up passing unscaled lengths
2298  *  which would mean the postcopy code would have to deal with target page sizes)
2299  *
2300  * @ms: current migration state
2301  */
2302 static int postcopy_each_ram_send_discard(MigrationState *ms)
2303 {
2304     struct RAMBlock *block;
2305     int ret;
2306 
2307     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2308         postcopy_discard_send_init(ms, block->idstr);
2309 
2310         /*
2311          * Postcopy sends chunks of bitmap over the wire, but it
2312          * just needs indexes at this point, which avoids it having
2313          * target page specific code.
2314          */
2315         ret = postcopy_send_discard_bm_ram(ms, block);
2316         postcopy_discard_send_finish(ms);
2317         if (ret) {
2318             return ret;
2319         }
2320     }
2321 
2322     return 0;
2323 }
2324 
2325 /**
2326  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2327  *
2328  * Helper for postcopy_chunk_hostpages; it canonicalizes the block's
2329  * dirty bitmap so that no host page is left partially dirty or
2330  * partially clean.
2331  *
2332  * Postcopy requires that all target pages in a host page are dirty or
2333  * clean, not a mix.  This function canonicalizes the bitmap.
2334  *
2335  * @ms: current migration state
2336  * @block: block that contains the page we want to canonicalize
2337  */
2338 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2339 {
2340     RAMState *rs = ram_state;
2341     unsigned long *bitmap = block->bmap;
2342     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2343     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2344     unsigned long run_start;
2345 
2346     if (block->page_size == TARGET_PAGE_SIZE) {
2347         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2348         return;
2349     }
2350 
2351     /* Find a dirty page */
2352     run_start = find_next_bit(bitmap, pages, 0);
2353 
2354     while (run_start < pages) {
2355 
2356         /*
2357          * If the start of this run of pages is in the middle of a host
2358          * page, then we need to fix up this host page.
2359          */
2360         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2361             /* Find the end of this run */
2362             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2363             /*
2364              * If the end isn't at the start of a host page, then the
2365              * run doesn't finish at the end of a host page
2366              * and we need to discard.
2367              */
2368         }
2369 
2370         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2371             unsigned long page;
2372             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2373                                                              host_ratio);
2374             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2375 
2376             /* Clean up the bitmap */
2377             for (page = fixup_start_addr;
2378                  page < fixup_start_addr + host_ratio; page++) {
2379                 /*
2380                  * Remark them as dirty, updating the count for any pages
2381                  * that weren't previously dirty.
2382                  */
2383                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2384             }
2385         }
2386 
2387         /* Find the next dirty page for the next iteration */
2388         run_start = find_next_bit(bitmap, pages, run_start);
2389     }
2390 }
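
/*
 * Worked example added for this revision (numbers made up): with 4 KiB
 * target pages in a 2 MiB host page, host_ratio = 512.  If a dirty run
 * starts at target page 700 (inside host page 1, which spans pages
 * 512..1023), the pass above re-dirties pages 512..1023 so that the whole
 * host page is discarded on the destination and resent, rather than
 * attempting a partial discard of a huge page.
 */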
2391 
2392 /**
2393  * postcopy_chunk_hostpages: discard any partially sent host page
2394  *
2395  * Utility for the outgoing postcopy code.
2396  *
2397  * Discard any partially sent host-page size chunks, mark any partially
2398  * dirty host-page size chunks as all dirty.  In this case the host-page
2399  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2400  *
2401  * Returns zero on success
2402  *
2403  * @ms: current migration state
2404  * @block: block we want to work with
2405  */
2406 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2407 {
2408     postcopy_discard_send_init(ms, block->idstr);
2409 
2410     /*
2411      * Ensure that all partially dirty host pages are made fully dirty.
2412      */
2413     postcopy_chunk_hostpages_pass(ms, block);
2414 
2415     postcopy_discard_send_finish(ms);
2416     return 0;
2417 }
2418 
2419 /**
2420  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2421  *
2422  * Returns zero on success
2423  *
2424  * Transmit the set of pages to be discarded after precopy to the target;
2425  * these are pages that:
2426  *     a) Have been previously transmitted but are now dirty again
2427  *     b) Have never been transmitted; this ensures that
2428  *        any pages on the destination that have been mapped by background
2429  *        tasks get discarded (transparent huge pages are the specific concern)
2430  * Hopefully this is pretty sparse
2431  *
2432  * @ms: current migration state
2433  */
2434 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2435 {
2436     RAMState *rs = ram_state;
2437     RAMBlock *block;
2438     int ret;
2439 
2440     RCU_READ_LOCK_GUARD();
2441 
2442     /* This should be our last sync, the src is now paused */
2443     migration_bitmap_sync(rs);
2444 
2445     /* Easiest way to make sure we don't resume in the middle of a host-page */
2446     rs->last_seen_block = NULL;
2447     rs->last_sent_block = NULL;
2448     rs->last_page = 0;
2449 
2450     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2451         /* Deal with TPS != HPS and huge pages */
2452         ret = postcopy_chunk_hostpages(ms, block);
2453         if (ret) {
2454             return ret;
2455         }
2456 
2457 #ifdef DEBUG_POSTCOPY
2458         ram_debug_dump_bitmap(block->bmap, true,
2459                               block->used_length >> TARGET_PAGE_BITS);
2460 #endif
2461     }
2462     trace_ram_postcopy_send_discard_bitmap();
2463 
2464     return postcopy_each_ram_send_discard(ms);
2465 }
2466 
2467 /**
2468  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2469  *
2470  * Returns zero on success
2471  *
2472  * @rbname: name of the RAMBlock of the request. NULL means the
2473  *          same as the last one.
2474  * @start: RAMBlock starting page
2475  * @length: RAMBlock size
2476  */
2477 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2478 {
2479     trace_ram_discard_range(rbname, start, length);
2480 
2481     RCU_READ_LOCK_GUARD();
2482     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2483 
2484     if (!rb) {
2485         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2486         return -1;
2487     }
2488 
2489     /*
2490      * On the source VM, we don't need to update the received bitmap since
2491      * we don't even have one.
2492      */
2493     if (rb->receivedmap) {
2494         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2495                      length >> qemu_target_page_bits());
2496     }
2497 
2498     return ram_block_discard_range(rb, start, length);
2499 }
2500 
2501 /*
2502  * For every allocation, we will try not to crash the VM if the
2503  * allocation fails.
2504  */
2505 static int xbzrle_init(void)
2506 {
2507     Error *local_err = NULL;
2508 
2509     if (!migrate_use_xbzrle()) {
2510         return 0;
2511     }
2512 
2513     XBZRLE_cache_lock();
2514 
2515     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2516     if (!XBZRLE.zero_target_page) {
2517         error_report("%s: Error allocating zero page", __func__);
2518         goto err_out;
2519     }
2520 
2521     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2522                               TARGET_PAGE_SIZE, &local_err);
2523     if (!XBZRLE.cache) {
2524         error_report_err(local_err);
2525         goto free_zero_page;
2526     }
2527 
2528     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2529     if (!XBZRLE.encoded_buf) {
2530         error_report("%s: Error allocating encoded_buf", __func__);
2531         goto free_cache;
2532     }
2533 
2534     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2535     if (!XBZRLE.current_buf) {
2536         error_report("%s: Error allocating current_buf", __func__);
2537         goto free_encoded_buf;
2538     }
2539 
2540     /* We are all good */
2541     XBZRLE_cache_unlock();
2542     return 0;
2543 
2544 free_encoded_buf:
2545     g_free(XBZRLE.encoded_buf);
2546     XBZRLE.encoded_buf = NULL;
2547 free_cache:
2548     cache_fini(XBZRLE.cache);
2549     XBZRLE.cache = NULL;
2550 free_zero_page:
2551     g_free(XBZRLE.zero_target_page);
2552     XBZRLE.zero_target_page = NULL;
2553 err_out:
2554     XBZRLE_cache_unlock();
2555     return -ENOMEM;
2556 }
2557 
2558 static int ram_state_init(RAMState **rsp)
2559 {
2560     *rsp = g_try_new0(RAMState, 1);
2561 
2562     if (!*rsp) {
2563         error_report("%s: Init ramstate fail", __func__);
2564         return -1;
2565     }
2566 
2567     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2568     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2569     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2570 
2571     /*
2572      * Count the total number of pages used by ram blocks not including any
2573      * gaps due to alignment or unplugs.
2574      * This must match with the initial values of dirty bitmap.
2575      * This must match the initial values of the dirty bitmap.
2576     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2577     ram_state_reset(*rsp);
2578 
2579     return 0;
2580 }
2581 
2582 static void ram_list_init_bitmaps(void)
2583 {
2584     MigrationState *ms = migrate_get_current();
2585     RAMBlock *block;
2586     unsigned long pages;
2587     uint8_t shift;
2588 
2589     /* Skip setting bitmap if there is no RAM */
2590     if (ram_bytes_total()) {
2591         shift = ms->clear_bitmap_shift;
2592         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2593             error_report("clear_bitmap_shift (%u) too big, using "
2594                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2595             shift = CLEAR_BITMAP_SHIFT_MAX;
2596         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2597             error_report("clear_bitmap_shift (%u) too small, using "
2598                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2599             shift = CLEAR_BITMAP_SHIFT_MIN;
2600         }
2601 
2602         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2603             pages = block->max_length >> TARGET_PAGE_BITS;
2604             /*
2605              * The initial dirty bitmap for migration must be set with all
2606              * ones to make sure we'll migrate every guest RAM page to
2607              * the destination.
2608              * Here we set RAMBlock.bmap all to 1 because when we restart a
2609              * new migration after a failed one, ram_list.
2610              * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2611              * guest memory.
2612              */
2613             block->bmap = bitmap_new(pages);
2614             bitmap_set(block->bmap, 0, pages);
2615             block->clear_bmap_shift = shift;
2616             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2617         }
2618     }
2619 }
2620 
2621 static void ram_init_bitmaps(RAMState *rs)
2622 {
2623     /* For memory_global_dirty_log_start below.  */
2624     qemu_mutex_lock_iothread();
2625     qemu_mutex_lock_ramlist();
2626 
2627     WITH_RCU_READ_LOCK_GUARD() {
2628         ram_list_init_bitmaps();
2629         /* We don't use dirty log with background snapshots */
2630         if (!migrate_background_snapshot()) {
2631             memory_global_dirty_log_start();
2632             migration_bitmap_sync_precopy(rs);
2633         }
2634     }
2635     qemu_mutex_unlock_ramlist();
2636     qemu_mutex_unlock_iothread();
2637 }
2638 
2639 static int ram_init_all(RAMState **rsp)
2640 {
2641     if (ram_state_init(rsp)) {
2642         return -1;
2643     }
2644 
2645     if (xbzrle_init()) {
2646         ram_state_cleanup(rsp);
2647         return -1;
2648     }
2649 
2650     ram_init_bitmaps(*rsp);
2651 
2652     return 0;
2653 }
2654 
2655 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2656 {
2657     RAMBlock *block;
2658     uint64_t pages = 0;
2659 
2660     /*
2661      * Postcopy is not using xbzrle/compression, so no need for that.
2662      * Also, since the source is already halted, we don't need to care
2663      * about dirty page logging either.
2664      */
2665 
2666     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2667         pages += bitmap_count_one(block->bmap,
2668                                   block->used_length >> TARGET_PAGE_BITS);
2669     }
2670 
2671     /* This may not be aligned with current bitmaps. Recalculate. */
2672     rs->migration_dirty_pages = pages;
2673 
2674     rs->last_seen_block = NULL;
2675     rs->last_sent_block = NULL;
2676     rs->last_page = 0;
2677     rs->last_version = ram_list.version;
2678     /*
2679      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2680      * matter what we have sent.
2681      */
2682     rs->ram_bulk_stage = false;
2683 
2684     /* Update RAMState cache of output QEMUFile */
2685     rs->f = out;
2686 
2687     trace_ram_state_resume_prepare(pages);
2688 }
2689 
2690 /*
2691  * This function clears bits of the free pages reported by the caller from the
2692  * migration dirty bitmap. @addr is the host address corresponding to the
2693  * start of the contiguous guest free pages, and @len is the total bytes of
2694  * those pages.
2695  */
2696 void qemu_guest_free_page_hint(void *addr, size_t len)
2697 {
2698     RAMBlock *block;
2699     ram_addr_t offset;
2700     size_t used_len, start, npages;
2701     MigrationState *s = migrate_get_current();
2702 
2703     /* This function is currently expected to be used during live migration */
2704     if (!migration_is_setup_or_active(s->state)) {
2705         return;
2706     }
2707 
2708     for (; len > 0; len -= used_len, addr += used_len) {
2709         block = qemu_ram_block_from_host(addr, false, &offset);
2710         if (unlikely(!block || offset >= block->used_length)) {
2711             /*
2712              * The implementation might not support RAMBlock resize during
2713              * live migration, but it could happen in theory with future
2714              * updates. So we add a check here to capture that case.
2715              */
2716             error_report_once("%s unexpected error", __func__);
2717             return;
2718         }
2719 
2720         if (len <= block->used_length - offset) {
2721             used_len = len;
2722         } else {
2723             used_len = block->used_length - offset;
2724         }
2725 
2726         start = offset >> TARGET_PAGE_BITS;
2727         npages = used_len >> TARGET_PAGE_BITS;
2728 
2729         qemu_mutex_lock(&ram_state->bitmap_mutex);
2730         ram_state->migration_dirty_pages -=
2731                       bitmap_count_one_with_offset(block->bmap, start, npages);
2732         bitmap_clear(block->bmap, start, npages);
2733         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2734     }
2735 }
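
/*
 * Illustrative sketch (added for this revision, not part of the original
 * code): a free-page-hinting device (e.g. virtio-balloon) reporting 16 MiB
 * of guest-free memory whose host mapping starts at 'hva'; those pages are
 * cleared from the dirty bitmap and never sent.  The function name and
 * variable are hypothetical.
 */
#if 0
static void example_report_free_pages(void *hva)
{
    qemu_guest_free_page_hint(hva, 16 * 1024 * 1024);
}
#endif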
2736 
2737 /*
2738  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2739  * a long-running RCU critical section.  When rcu-reclaims in the code
2740  * start to become numerous it will be necessary to reduce the
2741  * granularity of these critical sections.
2742  */
2743 
2744 /**
2745  * ram_save_setup: Setup RAM for migration
2746  *
2747  * Returns zero to indicate success and negative for error
2748  *
2749  * @f: QEMUFile where to send the data
2750  * @opaque: RAMState pointer
2751  */
2752 static int ram_save_setup(QEMUFile *f, void *opaque)
2753 {
2754     RAMState **rsp = opaque;
2755     RAMBlock *block;
2756 
2757     if (compress_threads_save_setup()) {
2758         return -1;
2759     }
2760 
2761     /* migration has already set up the bitmap, reuse it. */
2762     if (!migration_in_colo_state()) {
2763         if (ram_init_all(rsp) != 0) {
2764             compress_threads_save_cleanup();
2765             return -1;
2766         }
2767     }
2768     (*rsp)->f = f;
2769 
2770     WITH_RCU_READ_LOCK_GUARD() {
2771         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2772 
2773         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2774             qemu_put_byte(f, strlen(block->idstr));
2775             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2776             qemu_put_be64(f, block->used_length);
2777             if (migrate_postcopy_ram() && block->page_size !=
2778                                           qemu_host_page_size) {
2779                 qemu_put_be64(f, block->page_size);
2780             }
2781             if (migrate_ignore_shared()) {
2782                 qemu_put_be64(f, block->mr->addr);
2783             }
2784         }
2785     }
2786 
2787     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2788     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2789 
2790     multifd_send_sync_main(f);
2791     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2792     qemu_fflush(f);
2793 
2794     return 0;
2795 }
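
/*
 * Note added for this revision: the setup section emitted above looks
 * roughly like this on the wire:
 *
 *   be64: total RAM size | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable block:
 *     byte : strlen(idstr)
 *     bytes: idstr
 *     be64 : used_length
 *     [be64: page_size   - only with postcopy-ram and a non-host page size]
 *     [be64: mr->addr    - only with ignore-shared]
 *   be64: RAM_SAVE_FLAG_EOS
 */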
2796 
2797 /**
2798  * ram_save_iterate: iterative stage for migration
2799  *
2800  * Returns zero to indicate success and negative for error
2801  *
2802  * @f: QEMUFile where to send the data
2803  * @opaque: RAMState pointer
2804  */
2805 static int ram_save_iterate(QEMUFile *f, void *opaque)
2806 {
2807     RAMState **temp = opaque;
2808     RAMState *rs = *temp;
2809     int ret = 0;
2810     int i;
2811     int64_t t0;
2812     int done = 0;
2813 
2814     if (blk_mig_bulk_active()) {
2815         /* Avoid transferring ram during the bulk phase of block migration as
2816          * the bulk phase will usually take a long time and transferring
2817          * ram updates during that time is pointless. */
2818         goto out;
2819     }
2820 
2821     WITH_RCU_READ_LOCK_GUARD() {
2822         if (ram_list.version != rs->last_version) {
2823             ram_state_reset(rs);
2824         }
2825 
2826         /* Read version before ram_list.blocks */
2827         smp_rmb();
2828 
2829         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2830 
2831         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2832         i = 0;
2833         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2834                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2835             int pages;
2836 
2837             if (qemu_file_get_error(f)) {
2838                 break;
2839             }
2840 
2841             pages = ram_find_and_save_block(rs, false);
2842             /* no more pages to send */
2843             if (pages == 0) {
2844                 done = 1;
2845                 break;
2846             }
2847 
2848             if (pages < 0) {
2849                 qemu_file_set_error(f, pages);
2850                 break;
2851             }
2852 
2853             rs->target_page_count += pages;
2854 
2855             /*
2856              * During postcopy, it is necessary to make sure one whole host
2857              * page is sent in one chunk.
2858              */
2859             if (migrate_postcopy_ram()) {
2860                 flush_compressed_data(rs);
2861             }
2862 
2863             /*
2864              * We want to check in the 1st loop, just in case it was the 1st
2865              * time and we had to sync the dirty bitmap.
2866              * qemu_clock_get_ns() is a bit expensive, so we only check every
2867              * few iterations
2868              */
2869             if ((i & 63) == 0) {
2870                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2871                               1000000;
2872                 if (t1 > MAX_WAIT) {
2873                     trace_ram_save_iterate_big_wait(t1, i);
2874                     break;
2875                 }
2876             }
2877             i++;
2878         }
2879     }
2880 
2881     /*
2882      * Must occur before EOS (or any QEMUFile operation)
2883      * because of RDMA protocol.
2884      */
2885     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2886 
2887 out:
2888     if (ret >= 0
2889         && migration_is_setup_or_active(migrate_get_current()->state)) {
2890         multifd_send_sync_main(rs->f);
2891         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2892         qemu_fflush(f);
2893         ram_counters.transferred += 8;
2894 
2895         ret = qemu_file_get_error(f);
2896     }
2897     if (ret < 0) {
2898         return ret;
2899     }
2900 
2901     return done;
2902 }
2903 
2904 /**
2905  * ram_save_complete: function called to send the remaining amount of ram
2906  *
2907  * Returns zero to indicate success or negative on error
2908  *
2909  * Called with the iothread lock held
2910  *
2911  * @f: QEMUFile where to send the data
2912  * @opaque: RAMState pointer
2913  */
2914 static int ram_save_complete(QEMUFile *f, void *opaque)
2915 {
2916     RAMState **temp = opaque;
2917     RAMState *rs = *temp;
2918     int ret = 0;
2919 
2920     WITH_RCU_READ_LOCK_GUARD() {
2921         if (!migration_in_postcopy()) {
2922             migration_bitmap_sync_precopy(rs);
2923         }
2924 
2925         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2926 
2927         /* try transferring iterative blocks of memory */
2928 
2929         /* flush all remaining blocks regardless of rate limiting */
2930         while (true) {
2931             int pages;
2932 
2933             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2934             /* no more blocks to send */
2935             if (pages == 0) {
2936                 break;
2937             }
2938             if (pages < 0) {
2939                 ret = pages;
2940                 break;
2941             }
2942         }
2943 
2944         flush_compressed_data(rs);
2945         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2946     }
2947 
2948     if (ret >= 0) {
2949         multifd_send_sync_main(rs->f);
2950         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2951         qemu_fflush(f);
2952     }
2953 
2954     return ret;
2955 }
2956 
2957 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2958                              uint64_t *res_precopy_only,
2959                              uint64_t *res_compatible,
2960                              uint64_t *res_postcopy_only)
2961 {
2962     RAMState **temp = opaque;
2963     RAMState *rs = *temp;
2964     uint64_t remaining_size;
2965 
2966     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2967 
2968     if (!migration_in_postcopy() &&
2969         remaining_size < max_size) {
2970         qemu_mutex_lock_iothread();
2971         WITH_RCU_READ_LOCK_GUARD() {
2972             migration_bitmap_sync_precopy(rs);
2973         }
2974         qemu_mutex_unlock_iothread();
2975         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2976     }
2977 
2978     if (migrate_postcopy_ram()) {
2979         /* We can do postcopy, and all the data is postcopiable */
2980         *res_compatible += remaining_size;
2981     } else {
2982         *res_precopy_only += remaining_size;
2983     }
2984 }
2985 
2986 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2987 {
2988     unsigned int xh_len;
2989     int xh_flags;
2990     uint8_t *loaded_data;
2991 
2992     /* extract RLE header */
2993     xh_flags = qemu_get_byte(f);
2994     xh_len = qemu_get_be16(f);
2995 
2996     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2997         error_report("Failed to load XBZRLE page - wrong compression!");
2998         return -1;
2999     }
3000 
3001     if (xh_len > TARGET_PAGE_SIZE) {
3002         error_report("Failed to load XBZRLE page - len overflow!");
3003         return -1;
3004     }
3005     loaded_data = XBZRLE.decoded_buf;
3006     /* load data and decode */
3007     /* it can change loaded_data to point to an internal buffer */
3008     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3009 
3010     /* decode RLE */
3011     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3012                              TARGET_PAGE_SIZE) == -1) {
3013         error_report("Failed to load XBZRLE page - decode error!");
3014         return -1;
3015     }
3016 
3017     return 0;
3018 }
3019 
3020 /**
3021  * ram_block_from_stream: read a RAMBlock id from the migration stream
3022  *
3023  * Must be called from within a rcu critical section.
3024  *
3025  * Returns a pointer from within the RCU-protected ram_list.
3026  *
3027  * @f: QEMUFile where to read the data from
3028  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3029  */
3030 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3031 {
3032     static RAMBlock *block;
3033     char id[256];
3034     uint8_t len;
3035 
3036     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3037         if (!block) {
3038             error_report("Ack, bad migration stream!");
3039             return NULL;
3040         }
3041         return block;
3042     }
3043 
3044     len = qemu_get_byte(f);
3045     qemu_get_buffer(f, (uint8_t *)id, len);
3046     id[len] = 0;
3047 
3048     block = qemu_ram_block_by_name(id);
3049     if (!block) {
3050         error_report("Can't find block %s", id);
3051         return NULL;
3052     }
3053 
3054     if (ramblock_is_ignored(block)) {
3055         error_report("block %s should not be migrated !", id);
3056         return NULL;
3057     }
3058 
3059     return block;
3060 }
3061 
3062 static inline void *host_from_ram_block_offset(RAMBlock *block,
3063                                                ram_addr_t offset)
3064 {
3065     if (!offset_in_ramblock(block, offset)) {
3066         return NULL;
3067     }
3068 
3069     return block->host + offset;
3070 }
3071 
3072 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3073                              ram_addr_t offset, bool record_bitmap)
3074 {
3075     if (!offset_in_ramblock(block, offset)) {
3076         return NULL;
3077     }
3078     if (!block->colo_cache) {
3079         error_report("%s: colo_cache is NULL in block :%s",
3080                      __func__, block->idstr);
3081         return NULL;
3082     }
3083 
3084     /*
3085     * During a COLO checkpoint, we need the bitmap of these migrated pages.
3086     * It helps us decide which pages in the ram cache should be flushed
3087     * into the VM's RAM later.
3088     */
3089     if (record_bitmap &&
3090         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3091         ram_state->migration_dirty_pages++;
3092     }
3093     return block->colo_cache + offset;
3094 }
3095 
3096 /**
3097  * ram_handle_compressed: handle the zero page case
3098  *
3099  * If a page (or a whole RDMA chunk) has been
3100  * determined to be zero, then zap it.
3101  *
3102  * @host: host address for the zero page
3103  * @ch: what the page is filled with.  We only support zero
3104  * @size: size of the zero page
3105  */
3106 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3107 {
3108     if (ch != 0 || !is_zero_range(host, size)) {
3109         memset(host, ch, size);
3110     }
3111 }
3112 
3113 /* return the size after decompression, or negative value on error */
3114 static int
3115 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3116                      const uint8_t *source, size_t source_len)
3117 {
3118     int err;
3119 
3120     err = inflateReset(stream);
3121     if (err != Z_OK) {
3122         return -1;
3123     }
3124 
3125     stream->avail_in = source_len;
3126     stream->next_in = (uint8_t *)source;
3127     stream->avail_out = dest_len;
3128     stream->next_out = dest;
3129 
3130     err = inflate(stream, Z_NO_FLUSH);
3131     if (err != Z_STREAM_END) {
3132         return -1;
3133     }
3134 
3135     return stream->total_out;
3136 }
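
/*
 * Illustrative sketch (added for this revision, not part of the original
 * code): decompressing one page with the helper above.  The z_stream is
 * assumed to have been initialized once with inflateInit(), as done in
 * compress_threads_load_setup().
 */
#if 0
static int example_uncompress_page(z_stream *stream, uint8_t *page,
                                   const uint8_t *compbuf, size_t complen)
{
    int ret = qemu_uncompress_data(stream, page, TARGET_PAGE_SIZE,
                                   compbuf, complen);

    /* ret is the decompressed size on success, or negative on error */
    return ret == TARGET_PAGE_SIZE ? 0 : -1;
}
#endif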
3137 
3138 static void *do_data_decompress(void *opaque)
3139 {
3140     DecompressParam *param = opaque;
3141     unsigned long pagesize;
3142     uint8_t *des;
3143     int len, ret;
3144 
3145     qemu_mutex_lock(&param->mutex);
3146     while (!param->quit) {
3147         if (param->des) {
3148             des = param->des;
3149             len = param->len;
3150             param->des = 0;
3151             qemu_mutex_unlock(&param->mutex);
3152 
3153             pagesize = TARGET_PAGE_SIZE;
3154 
3155             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3156                                        param->compbuf, len);
3157             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3158                 error_report("decompress data failed");
3159                 qemu_file_set_error(decomp_file, ret);
3160             }
3161 
3162             qemu_mutex_lock(&decomp_done_lock);
3163             param->done = true;
3164             qemu_cond_signal(&decomp_done_cond);
3165             qemu_mutex_unlock(&decomp_done_lock);
3166 
3167             qemu_mutex_lock(&param->mutex);
3168         } else {
3169             qemu_cond_wait(&param->cond, &param->mutex);
3170         }
3171     }
3172     qemu_mutex_unlock(&param->mutex);
3173 
3174     return NULL;
3175 }
3176 
3177 static int wait_for_decompress_done(void)
3178 {
3179     int idx, thread_count;
3180 
3181     if (!migrate_use_compression()) {
3182         return 0;
3183     }
3184 
3185     thread_count = migrate_decompress_threads();
3186     qemu_mutex_lock(&decomp_done_lock);
3187     for (idx = 0; idx < thread_count; idx++) {
3188         while (!decomp_param[idx].done) {
3189             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3190         }
3191     }
3192     qemu_mutex_unlock(&decomp_done_lock);
3193     return qemu_file_get_error(decomp_file);
3194 }
3195 
3196 static void compress_threads_load_cleanup(void)
3197 {
3198     int i, thread_count;
3199 
3200     if (!migrate_use_compression()) {
3201         return;
3202     }
3203     thread_count = migrate_decompress_threads();
3204     for (i = 0; i < thread_count; i++) {
3205         /*
3206          * We use it as an indicator of whether the thread is
3207          * properly initialized or not
3208          */
3209         if (!decomp_param[i].compbuf) {
3210             break;
3211         }
3212 
3213         qemu_mutex_lock(&decomp_param[i].mutex);
3214         decomp_param[i].quit = true;
3215         qemu_cond_signal(&decomp_param[i].cond);
3216         qemu_mutex_unlock(&decomp_param[i].mutex);
3217     }
3218     for (i = 0; i < thread_count; i++) {
3219         if (!decomp_param[i].compbuf) {
3220             break;
3221         }
3222 
3223         qemu_thread_join(decompress_threads + i);
3224         qemu_mutex_destroy(&decomp_param[i].mutex);
3225         qemu_cond_destroy(&decomp_param[i].cond);
3226         inflateEnd(&decomp_param[i].stream);
3227         g_free(decomp_param[i].compbuf);
3228         decomp_param[i].compbuf = NULL;
3229     }
3230     g_free(decompress_threads);
3231     g_free(decomp_param);
3232     decompress_threads = NULL;
3233     decomp_param = NULL;
3234     decomp_file = NULL;
3235 }
3236 
3237 static int compress_threads_load_setup(QEMUFile *f)
3238 {
3239     int i, thread_count;
3240 
3241     if (!migrate_use_compression()) {
3242         return 0;
3243     }
3244 
3245     thread_count = migrate_decompress_threads();
3246     decompress_threads = g_new0(QemuThread, thread_count);
3247     decomp_param = g_new0(DecompressParam, thread_count);
3248     qemu_mutex_init(&decomp_done_lock);
3249     qemu_cond_init(&decomp_done_cond);
3250     decomp_file = f;
3251     for (i = 0; i < thread_count; i++) {
3252         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3253             goto exit;
3254         }
3255 
3256         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3257         qemu_mutex_init(&decomp_param[i].mutex);
3258         qemu_cond_init(&decomp_param[i].cond);
3259         decomp_param[i].done = true;
3260         decomp_param[i].quit = false;
3261         qemu_thread_create(decompress_threads + i, "decompress",
3262                            do_data_decompress, decomp_param + i,
3263                            QEMU_THREAD_JOINABLE);
3264     }
3265     return 0;
3266 exit:
3267     compress_threads_load_cleanup();
3268     return -1;
3269 }
3270 
3271 static void decompress_data_with_multi_threads(QEMUFile *f,
3272                                                void *host, int len)
3273 {
3274     int idx, thread_count;
3275 
3276     thread_count = migrate_decompress_threads();
3277     QEMU_LOCK_GUARD(&decomp_done_lock);
3278     while (true) {
3279         for (idx = 0; idx < thread_count; idx++) {
3280             if (decomp_param[idx].done) {
3281                 decomp_param[idx].done = false;
3282                 qemu_mutex_lock(&decomp_param[idx].mutex);
3283                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3284                 decomp_param[idx].des = host;
3285                 decomp_param[idx].len = len;
3286                 qemu_cond_signal(&decomp_param[idx].cond);
3287                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3288                 break;
3289             }
3290         }
3291         if (idx < thread_count) {
3292             break;
3293         } else {
3294             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3295         }
3296     }
3297 }
3298 
3299  /*
3300   * We must set ram_bulk_stage to false, otherwise in
3301   * migration_bitmap_find_dirty the bitmap will be unused and
3302   * all the pages in the ram cache will be flushed to the ram of the
3303   * secondary VM.
3304   */
3305 static void colo_init_ram_state(void)
3306 {
3307     ram_state_init(&ram_state);
3308     ram_state->ram_bulk_stage = false;
3309 }
3310 
3311 /*
3312  * colo cache: this is for the secondary VM; we cache the whole
3313  * memory of the secondary VM. It is necessary to hold the global lock
3314  * to call this helper.
3315  */
3316 int colo_init_ram_cache(void)
3317 {
3318     RAMBlock *block;
3319 
3320     WITH_RCU_READ_LOCK_GUARD() {
3321         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3322             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3323                                                     NULL,
3324                                                     false);
3325             if (!block->colo_cache) {
3326                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3327                              " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3328                              block->used_length);
3329                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3330                     if (block->colo_cache) {
3331                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3332                         block->colo_cache = NULL;
3333                     }
3334                 }
3335                 return -errno;
3336             }
3337         }
3338     }
3339 
3340     /*
3341      * Record the dirty pages sent by the PVM; this dirty bitmap is used to
3342      * decide which pages in the cache should be flushed into the SVM's RAM.
3343      * Here we use the same name 'ram_bitmap' as for migration.
3344      */
3345     if (ram_bytes_total()) {
3346         RAMBlock *block;
3347 
3348         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3349             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3350             block->bmap = bitmap_new(pages);
3351         }
3352     }
3353 
3354     colo_init_ram_state();
3355     return 0;
3356 }
3357 
3358 /* TODO: duplicated with ram_init_bitmaps */
3359 void colo_incoming_start_dirty_log(void)
3360 {
3361     RAMBlock *block = NULL;
3362     /* For memory_global_dirty_log_start below. */
3363     qemu_mutex_lock_iothread();
3364     qemu_mutex_lock_ramlist();
3365 
3366     memory_global_dirty_log_sync();
3367     WITH_RCU_READ_LOCK_GUARD() {
3368         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3369             ramblock_sync_dirty_bitmap(ram_state, block);
3370             /* Discard this dirty bitmap record */
3371             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3372         }
3373         memory_global_dirty_log_start();
3374     }
3375     ram_state->migration_dirty_pages = 0;
3376     qemu_mutex_unlock_ramlist();
3377     qemu_mutex_unlock_iothread();
3378 }
3379 
3380 /* The global lock must be held to call this helper */
3381 void colo_release_ram_cache(void)
3382 {
3383     RAMBlock *block;
3384 
3385     memory_global_dirty_log_stop();
3386     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3387         g_free(block->bmap);
3388         block->bmap = NULL;
3389     }
3390 
3391     WITH_RCU_READ_LOCK_GUARD() {
3392         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3393             if (block->colo_cache) {
3394                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3395                 block->colo_cache = NULL;
3396             }
3397         }
3398     }
3399     ram_state_cleanup(&ram_state);
3400 }
3401 
3402 /**
3403  * ram_load_setup: Setup RAM for migration incoming side
3404  *
3405  * Returns zero to indicate success and negative for error
3406  *
3407  * @f: QEMUFile where to receive the data
3408  * @opaque: RAMState pointer
3409  */
3410 static int ram_load_setup(QEMUFile *f, void *opaque)
3411 {
3412     if (compress_threads_load_setup(f)) {
3413         return -1;
3414     }
3415 
3416     xbzrle_load_setup();
3417     ramblock_recv_map_init();
3418 
3419     return 0;
3420 }
3421 
3422 static int ram_load_cleanup(void *opaque)
3423 {
3424     RAMBlock *rb;
3425 
3426     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3427         qemu_ram_block_writeback(rb);
3428     }
3429 
3430     xbzrle_load_cleanup();
3431     compress_threads_load_cleanup();
3432 
3433     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3434         g_free(rb->receivedmap);
3435         rb->receivedmap = NULL;
3436     }
3437 
3438     return 0;
3439 }
3440 
3441 /**
3442  * ram_postcopy_incoming_init: allocate postcopy data structures
3443  *
3444  * Returns 0 for success and negative on error
3445  *
3446  * @mis: current migration incoming state
3447  *
3448  * Allocate data structures etc needed by incoming migration with
3449  * postcopy-ram. postcopy-ram's similarly named
3450  * postcopy_ram_incoming_init does the work.
3451  */
3452 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3453 {
3454     return postcopy_ram_incoming_init(mis);
3455 }
3456 
3457 /**
3458  * ram_load_postcopy: load a page in postcopy case
3459  *
3460  * Returns 0 for success or -errno in case of error
3461  *
3462  * Called in postcopy mode by ram_load().
3463  * rcu_read_lock is taken prior to this being called.
3464  *
3465  * @f: QEMUFile where to receive the data
3466  */
3467 static int ram_load_postcopy(QEMUFile *f)
3468 {
3469     int flags = 0, ret = 0;
3470     bool place_needed = false;
3471     bool matches_target_page_size = false;
3472     MigrationIncomingState *mis = migration_incoming_get_current();
3473     /* Temporary page that is later 'placed' */
3474     void *postcopy_host_page = mis->postcopy_tmp_page;
3475     void *this_host = NULL;
3476     bool all_zero = true;
3477     int target_pages = 0;
3478 
3479     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3480         ram_addr_t addr;
3481         void *host = NULL;
3482         void *page_buffer = NULL;
3483         void *place_source = NULL;
3484         RAMBlock *block = NULL;
3485         uint8_t ch;
3486         int len;
3487 
3488         addr = qemu_get_be64(f);
3489 
3490         /*
3491          * On a qemu file error we should stop here, since "addr"
3492          * may then be invalid
3493          */
3494         ret = qemu_file_get_error(f);
3495         if (ret) {
3496             break;
3497         }
3498 
3499         flags = addr & ~TARGET_PAGE_MASK;
3500         addr &= TARGET_PAGE_MASK;
3501 
3502         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3503         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3504                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3505             block = ram_block_from_stream(f, flags);
3506 
3507             host = host_from_ram_block_offset(block, addr);
3508             if (!host) {
3509                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3510                 ret = -EINVAL;
3511                 break;
3512             }
3513             target_pages++;
3514             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3515             /*
3516              * Postcopy requires that we place whole host pages atomically;
3517              * these may be huge pages for RAMBlocks that are backed by
3518              * hugetlbfs.
3519              * To make it atomic, the data is read into a temporary page
3520              * that's moved into place later.
3521              * The migration protocol uses possibly smaller target pages;
3522              * however, the source ensures it always sends all the components
3523              * of a host page in one chunk.
3524              */
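            /*
             * Worked example (assuming a 4KiB TARGET_PAGE_SIZE): for a 2MiB
             * hugetlbfs-backed host page, 512 target pages are accumulated
             * in postcopy_host_page before the whole page is placed.
             */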
3525             page_buffer = postcopy_host_page +
3526                           ((uintptr_t)host & (block->page_size - 1));
3527             if (target_pages == 1) {
3528                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3529                                                     block->page_size);
3530             } else {
3531                 /* not the first target page within the host page */
3532                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3533                     (uintptr_t)this_host) {
3534                     error_report("Non-same host page %p/%p",
3535                                   host, this_host);
3536                     ret = -EINVAL;
3537                     break;
3538                 }
3539             }
3540 
3541             /*
3542              * If it's the last part of a host page then we place the host
3543              * page
3544              */
3545             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3546                 place_needed = true;
3547             }
3548             place_source = postcopy_host_page;
3549         }
3550 
3551         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3552         case RAM_SAVE_FLAG_ZERO:
3553             ch = qemu_get_byte(f);
3554             /*
3555              * We can skip setting page_buffer when this is a zero page
3556              * and (block->page_size == TARGET_PAGE_SIZE).
3557              */
3558             if (ch || !matches_target_page_size) {
3559                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3560             }
3561             if (ch) {
3562                 all_zero = false;
3563             }
3564             break;
3565 
3566         case RAM_SAVE_FLAG_PAGE:
3567             all_zero = false;
3568             if (!matches_target_page_size) {
3569                 /* For huge pages, we always use temporary buffer */
3570                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3571             } else {
3572                 /*
3573                  * For small pages that match the target page size, we
3574                  * avoid the qemu_file copy.  Instead we directly use
3575                  * the buffer of QEMUFile to place the page.  Note: we
3576                  * must not perform any other QEMUFile operation before
3577                  * using that buffer, to make sure it is still valid
3578                  * when the page is placed.
3579                  */
3580                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3581                                          TARGET_PAGE_SIZE);
3582             }
3583             break;
3584         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3585             all_zero = false;
3586             len = qemu_get_be32(f);
3587             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3588                 error_report("Invalid compressed data length: %d", len);
3589                 ret = -EINVAL;
3590                 break;
3591             }
3592             decompress_data_with_multi_threads(f, page_buffer, len);
3593             break;
3594 
3595         case RAM_SAVE_FLAG_EOS:
3596             /* normal exit */
3597             multifd_recv_sync_main();
3598             break;
3599         default:
3600             error_report("Unknown combination of migration flags: 0x%x"
3601                          " (postcopy mode)", flags);
3602             ret = -EINVAL;
3603             break;
3604         }
3605 
3606         /* Got the whole host page, wait for decompression before placing it. */
3607         if (place_needed) {
3608             ret |= wait_for_decompress_done();
3609         }
3610 
3611         /* Detect any possible file errors */
3612         if (!ret && qemu_file_get_error(f)) {
3613             ret = qemu_file_get_error(f);
3614         }
3615 
3616         if (!ret && place_needed) {
3617             /* This gets called at the last target page in the host page */
3618             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3619                                                        block->page_size);
3620 
3621             if (all_zero) {
3622                 ret = postcopy_place_page_zero(mis, place_dest,
3623                                                block);
3624             } else {
3625                 ret = postcopy_place_page(mis, place_dest,
3626                                           place_source, block);
3627             }
3628             place_needed = false;
3629             target_pages = 0;
3630             /* Assume we have a zero page until we detect something different */
3631             all_zero = true;
3632         }
3633     }
3634 
3635     return ret;
3636 }
3637 
3638 static bool postcopy_is_advised(void)
3639 {
3640     PostcopyState ps = postcopy_state_get();
3641     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3642 }
3643 
3644 static bool postcopy_is_running(void)
3645 {
3646     PostcopyState ps = postcopy_state_get();
3647     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3648 }
3649 
3650 /*
3651  * Flush content of RAM cache into SVM's memory.
3652  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3653  */
3654 void colo_flush_ram_cache(void)
3655 {
3656     RAMBlock *block = NULL;
3657     void *dst_host;
3658     void *src_host;
3659     unsigned long offset = 0;
3660 
3661     memory_global_dirty_log_sync();
3662     WITH_RCU_READ_LOCK_GUARD() {
3663         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3664             ramblock_sync_dirty_bitmap(ram_state, block);
3665         }
3666     }
3667 
3668     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3669     WITH_RCU_READ_LOCK_GUARD() {
3670         block = QLIST_FIRST_RCU(&ram_list.blocks);
3671 
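        /*
         * Walk each block's dirty bitmap: when migration_bitmap_find_dirty()
         * runs past used_length we move on to the next block; otherwise we
         * clear the dirty bit and copy that page from colo_cache into the
         * SVM's RAM.
         */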
3672         while (block) {
3673             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3674 
3675             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3676                 >= block->used_length) {
3677                 offset = 0;
3678                 block = QLIST_NEXT_RCU(block, next);
3679             } else {
3680                 migration_bitmap_clear_dirty(ram_state, block, offset);
3681                 dst_host = block->host
3682                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3683                 src_host = block->colo_cache
3684                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3685                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3686             }
3687         }
3688     }
3689     trace_colo_flush_ram_cache_end();
3690 }
3691 
3692 /**
3693  * ram_load_precopy: load pages in precopy case
3694  *
3695  * Returns 0 for success or -errno in case of error
3696  *
3697  * Called in precopy mode by ram_load().
3698  * rcu_read_lock is taken prior to this being called.
3699  *
3700  * @f: QEMUFile where to receive the data
3701  */
3702 static int ram_load_precopy(QEMUFile *f)
3703 {
3704     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3705     /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3706     bool postcopy_advised = postcopy_is_advised();
3707     if (!migrate_use_compression()) {
3708         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3709     }
3710 
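    /*
     * The stream is a sequence of be64 records: the low bits carry the
     * RAM_SAVE_FLAG_* bits and the page-aligned part is the offset within
     * the current RAMBlock (or, for RAM_SAVE_FLAG_MEM_SIZE, the total RAM
     * size).  The loop ends on RAM_SAVE_FLAG_EOS or an error.
     */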
3711     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3712         ram_addr_t addr, total_ram_bytes;
3713         void *host = NULL, *host_bak = NULL;
3714         uint8_t ch;
3715 
3716         /*
3717          * Yield periodically to let the main loop run, but an iteration of
3718          * the main loop is expensive, so only do it every 32768 iterations
3719          */
3720         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3721             aio_co_schedule(qemu_get_current_aio_context(),
3722                             qemu_coroutine_self());
3723             qemu_coroutine_yield();
3724         }
3725         i++;
3726 
3727         addr = qemu_get_be64(f);
3728         flags = addr & ~TARGET_PAGE_MASK;
3729         addr &= TARGET_PAGE_MASK;
3730 
3731         if (flags & invalid_flags) {
3732             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3733                 error_report("Received an unexpected compressed page");
3734             }
3735 
3736             ret = -EINVAL;
3737             break;
3738         }
3739 
3740         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3741                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3742             RAMBlock *block = ram_block_from_stream(f, flags);
3743 
3744             host = host_from_ram_block_offset(block, addr);
3745             /*
3746              * After entering the COLO stage, we should not load pages
3747              * into the SVM's memory directly; we put them into colo_cache first.
3748              * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
3749              * Previously, we copied all of this memory in the COLO preparing
3750              * stage, during which the VM had to be stopped, which is time
3751              * consuming.  Here we optimize it by backing up every page during
3752              * the migration process while COLO is enabled.  Although this
3753              * slows down the migration, it clearly reduces the downtime of
3754              * backing up all of the SVM's memory in the COLO preparing stage.
3755              */
3756             if (migration_incoming_colo_enabled()) {
3757                 if (migration_incoming_in_colo_state()) {
3758                     /* In COLO stage, put all pages into cache temporarily */
3759                     host = colo_cache_from_block_offset(block, addr, true);
3760                 } else {
3761                     /*
3762                      * In the migration stage but before the COLO stage,
3763                      * put all pages into both the cache and the SVM's memory.
3764                      */
3765                     host_bak = colo_cache_from_block_offset(block, addr, false);
3766                 }
3767             }
3768             if (!host) {
3769                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3770                 ret = -EINVAL;
3771                 break;
3772             }
3773             if (!migration_incoming_in_colo_state()) {
3774                 ramblock_recv_bitmap_set(block, host);
3775             }
3776 
3777             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3778         }
3779 
3780         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3781         case RAM_SAVE_FLAG_MEM_SIZE:
3782             /* Synchronize RAM block list */
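            /*
             * Per block the stream then carries: an idstr length byte plus
             * the idstr, a be64 block length, optionally a be64 remote page
             * size (postcopy hugepage check) and a be64 GPA when
             * ignore-shared is in use.
             */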
3783             total_ram_bytes = addr;
3784             while (!ret && total_ram_bytes) {
3785                 RAMBlock *block;
3786                 char id[256];
3787                 ram_addr_t length;
3788 
3789                 len = qemu_get_byte(f);
3790                 qemu_get_buffer(f, (uint8_t *)id, len);
3791                 id[len] = 0;
3792                 length = qemu_get_be64(f);
3793 
3794                 block = qemu_ram_block_by_name(id);
3795                 if (block && !qemu_ram_is_migratable(block)) {
3796                     error_report("block %s should not be migrated!", id);
3797                     ret = -EINVAL;
3798                 } else if (block) {
3799                     if (length != block->used_length) {
3800                         Error *local_err = NULL;
3801 
3802                         ret = qemu_ram_resize(block, length,
3803                                               &local_err);
3804                         if (local_err) {
3805                             error_report_err(local_err);
3806                         }
3807                     }
3808                     /* For postcopy we need to check hugepage sizes match */
3809                     if (postcopy_advised && migrate_postcopy_ram() &&
3810                         block->page_size != qemu_host_page_size) {
3811                         uint64_t remote_page_size = qemu_get_be64(f);
3812                         if (remote_page_size != block->page_size) {
3813                             error_report("Mismatched RAM page size %s "
3814                                          "(local) %zd != %" PRId64,
3815                                          id, block->page_size,
3816                                          remote_page_size);
3817                             ret = -EINVAL;
3818                         }
3819                     }
3820                     if (migrate_ignore_shared()) {
3821                         hwaddr addr = qemu_get_be64(f);
3822                         if (ramblock_is_ignored(block) &&
3823                             block->mr->addr != addr) {
3824                             error_report("Mismatched GPAs for block %s "
3825                                          "%" PRId64 " != %" PRId64,
3826                                          id, (uint64_t)addr,
3827                                          (uint64_t)block->mr->addr);
3828                             ret = -EINVAL;
3829                         }
3830                     }
3831                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3832                                           block->idstr);
3833                 } else {
3834                     error_report("Unknown ramblock \"%s\", cannot "
3835                                  "accept migration", id);
3836                     ret = -EINVAL;
3837                 }
3838 
3839                 total_ram_bytes -= length;
3840             }
3841             break;
3842 
3843         case RAM_SAVE_FLAG_ZERO:
3844             ch = qemu_get_byte(f);
3845             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3846             break;
3847 
3848         case RAM_SAVE_FLAG_PAGE:
3849             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3850             break;
3851 
3852         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3853             len = qemu_get_be32(f);
3854             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3855                 error_report("Invalid compressed data length: %d", len);
3856                 ret = -EINVAL;
3857                 break;
3858             }
3859             decompress_data_with_multi_threads(f, host, len);
3860             break;
3861 
3862         case RAM_SAVE_FLAG_XBZRLE:
3863             if (load_xbzrle(f, addr, host) < 0) {
3864                 error_report("Failed to decompress XBZRLE page at "
3865                              RAM_ADDR_FMT, addr);
3866                 ret = -EINVAL;
3867                 break;
3868             }
3869             break;
3870         case RAM_SAVE_FLAG_EOS:
3871             /* normal exit */
3872             multifd_recv_sync_main();
3873             break;
3874         default:
3875             if (flags & RAM_SAVE_FLAG_HOOK) {
3876                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3877             } else {
3878                 error_report("Unknown combination of migration flags: 0x%x",
3879                              flags);
3880                 ret = -EINVAL;
3881             }
3882         }
3883         if (!ret) {
3884             ret = qemu_file_get_error(f);
3885         }
3886         if (!ret && host_bak) {
3887             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3888         }
3889     }
3890 
3891     ret |= wait_for_decompress_done();
3892     return ret;
3893 }
3894 
3895 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3896 {
3897     int ret = 0;
3898     static uint64_t seq_iter;
3899     /*
3900      * If the system is running in postcopy mode, page inserts into host memory must
3901      * be atomic
3902      */
3903     bool postcopy_running = postcopy_is_running();
3904 
3905     seq_iter++;
3906 
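    /*
     * Only version 4 of the RAM section format is accepted, matching the
     * version registered by register_savevm_live() in ram_mig_init().
     */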
3907     if (version_id != 4) {
3908         return -EINVAL;
3909     }
3910 
3911     /*
3912      * This RCU critical section can be very long running.
3913      * When RCU reclaims in the code start to become numerous,
3914      * it will be necessary to reduce the granularity of this
3915      * critical section.
3916      */
3917     WITH_RCU_READ_LOCK_GUARD() {
3918         if (postcopy_running) {
3919             ret = ram_load_postcopy(f);
3920         } else {
3921             ret = ram_load_precopy(f);
3922         }
3923     }
3924     trace_ram_load_complete(ret, seq_iter);
3925 
3926     return ret;
3927 }
3928 
3929 static bool ram_has_postcopy(void *opaque)
3930 {
3931     RAMBlock *rb;
3932     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3933         if (ramblock_is_pmem(rb)) {
3934             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3935                         "is not supported now!", rb->idstr, rb->host);
3936             return false;
3937         }
3938     }
3939 
3940     return migrate_postcopy_ram();
3941 }
3942 
3943 /* Sync all the dirty bitmaps with the destination VM.  */
3944 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3945 {
3946     RAMBlock *block;
3947     QEMUFile *file = s->to_dst_file;
3948     int ramblock_count = 0;
3949 
3950     trace_ram_dirty_bitmap_sync_start();
3951 
3952     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3953         qemu_savevm_send_recv_bitmap(file, block->idstr);
3954         trace_ram_dirty_bitmap_request(block->idstr);
3955         ramblock_count++;
3956     }
3957 
3958     trace_ram_dirty_bitmap_sync_wait();
3959 
3960     /* Wait until all the ramblocks' dirty bitmaps are synced */
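    /*
     * rp_sem is posted once per block by ram_dirty_bitmap_reload_notify()
     * when the corresponding received bitmap has been reloaded from the
     * return path.
     */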
3961     while (ramblock_count--) {
3962         qemu_sem_wait(&s->rp_state.rp_sem);
3963     }
3964 
3965     trace_ram_dirty_bitmap_sync_complete();
3966 
3967     return 0;
3968 }
3969 
3970 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3971 {
3972     qemu_sem_post(&s->rp_state.rp_sem);
3973 }
3974 
3975 /*
3976  * Read the received bitmap and invert it to form the initial dirty bitmap.
3977  * This is only used when a postcopy migration is paused and wants
3978  * to resume from a middle point.
3979  */
3980 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3981 {
3982     int ret = -EINVAL;
3983     QEMUFile *file = s->rp_state.from_dst_file;
3984     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3985     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3986     uint64_t size, end_mark;
3987 
3988     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3989 
3990     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3991         error_report("%s: incorrect state %s", __func__,
3992                      MigrationStatus_str(s->state));
3993         return -EINVAL;
3994     }
3995 
3996     /*
3997      * Note: see comments in ramblock_recv_bitmap_send() on why we
3998      * need the endianness conversion and the padding.
3999      */
4000     local_size = ROUND_UP(local_size, 8);
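    /*
     * For example, assuming a 4KiB target page size, a 128MiB ramblock has
     * nbits = 32768, i.e. local_size = 4KiB of bitmap data on the wire.
     */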
4001 
4002     /* Add padding */
4003     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4004 
4005     size = qemu_get_be64(file);
4006 
4007     /* The size of the bitmap should match that of our ramblock */
4008     if (size != local_size) {
4009         error_report("%s: ramblock '%s' bitmap size mismatch "
4010                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4011                      block->idstr, size, local_size);
4012         ret = -EINVAL;
4013         goto out;
4014     }
4015 
4016     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4017     end_mark = qemu_get_be64(file);
4018 
4019     ret = qemu_file_get_error(file);
4020     if (ret || size != local_size) {
4021         error_report("%s: read bitmap failed for ramblock '%s': %d"
4022                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4023                      __func__, block->idstr, ret, local_size, size);
4024         ret = -EIO;
4025         goto out;
4026     }
4027 
4028     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4029         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4030                      __func__, block->idstr, end_mark);
4031         ret = -EINVAL;
4032         goto out;
4033     }
4034 
4035     /*
4036      * Endianness conversion. We are in postcopy (though paused).
4037      * The dirty bitmap won't change. We can directly modify it.
4038      */
4039     bitmap_from_le(block->bmap, le_bitmap, nbits);
4040 
4041     /*
4042      * What we received is the "received bitmap". Invert it to form the
4043      * initial dirty bitmap for this ramblock.
4044      */
4045     bitmap_complement(block->bmap, block->bmap, nbits);
4046 
4047     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4048 
4049     /*
4050      * We have successfully synced the bitmap for the current ramblock. If
4051      * this is the last one to sync, we need to notify the main send thread.
4052      */
4053     ram_dirty_bitmap_reload_notify(s);
4054 
4055     ret = 0;
4056 out:
4057     g_free(le_bitmap);
4058     return ret;
4059 }
4060 
4061 static int ram_resume_prepare(MigrationState *s, void *opaque)
4062 {
4063     RAMState *rs = *(RAMState **)opaque;
4064     int ret;
4065 
4066     ret = ram_dirty_bitmap_sync_all(s, rs);
4067     if (ret) {
4068         return ret;
4069     }
4070 
4071     ram_state_resume_prepare(rs, s->to_dst_file);
4072 
4073     return 0;
4074 }
4075 
4076 static SaveVMHandlers savevm_ram_handlers = {
4077     .save_setup = ram_save_setup,
4078     .save_live_iterate = ram_save_iterate,
4079     .save_live_complete_postcopy = ram_save_complete,
4080     .save_live_complete_precopy = ram_save_complete,
4081     .has_postcopy = ram_has_postcopy,
4082     .save_live_pending = ram_save_pending,
4083     .load_state = ram_load,
4084     .save_cleanup = ram_save_cleanup,
4085     .load_setup = ram_load_setup,
4086     .load_cleanup = ram_load_cleanup,
4087     .resume_prepare = ram_resume_prepare,
4088 };
4089 
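/*
 * Register the "ram" savevm section (instance 0, version 4, matching the
 * check in ram_load()) with the handlers above; &ram_state is passed
 * through as the opaque pointer for the load/save callbacks.
 */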
4090 void ram_mig_init(void)
4091 {
4092     qemu_mutex_init(&XBZRLE.lock);
4093     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4094 }
4095