xref: /openbmc/qemu/migration/ram.c (revision b890902c9c025b87d02e718eec3090fd3525ab18)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 #include "options.h"
61 
62 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 
64 #if defined(__linux__)
65 #include "qemu/userfaultfd.h"
66 #endif /* defined(__linux__) */
67 
68 /***********************************************************/
69 /* ram save/restore */
70 
71 /*
72  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
73  * was used for pages that were filled with the same byte.  We switched
74  * it to search only for the zero value, and renamed it to avoid
75  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
76  */
77 /*
78  * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now.
79  */
80 #define RAM_SAVE_FLAG_FULL     0x01
81 #define RAM_SAVE_FLAG_ZERO     0x02
82 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
83 #define RAM_SAVE_FLAG_PAGE     0x08
84 #define RAM_SAVE_FLAG_EOS      0x10
85 #define RAM_SAVE_FLAG_CONTINUE 0x20
86 #define RAM_SAVE_FLAG_XBZRLE   0x40
87 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
88 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
89 /* We can't use any flag that is bigger than 0x200 */
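/*
 * Note (added for clarity): these flags travel in the low bits of the
 * 64-bit page offset written by save_page_header(), so a flag may only
 * use a bit that lies below the smallest supported target page size,
 * which is presumably why nothing above 0x200 is usable.
 */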
90 
91 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
92      uint8_t *, int) = xbzrle_encode_buffer;
93 #if defined(CONFIG_AVX512BW_OPT)
94 #include "qemu/cpuid.h"
95 static void __attribute__((constructor)) init_cpu_flag(void)
96 {
97     unsigned max = __get_cpuid_max(0, NULL);
98     int a, b, c, d;
99     if (max >= 1) {
100         __cpuid(1, a, b, c, d);
101          /* We must check that AVX is not just available, but usable.  */
102         if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
103             int bv;
104             __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
105             __cpuid_count(7, 0, a, b, c, d);
106            /* 0xe6:
107             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
108             *                    and ZMM16-ZMM31 state are enabled by OS)
109             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
110             */
111             if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
112                 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
113             }
114         }
115     }
116 }
117 #endif
118 
119 XBZRLECacheStats xbzrle_counters;
120 
121 /* used by the search for pages to send */
122 struct PageSearchStatus {
123     /* The migration channel used for a specific host page */
124     QEMUFile    *pss_channel;
125     /* Last block from where we have sent data */
126     RAMBlock *last_sent_block;
127     /* Current block being searched */
128     RAMBlock    *block;
129     /* Current page to search from */
130     unsigned long page;
131     /* Set once we wrap around */
132     bool         complete_round;
133     /* Whether we're sending a host page */
134     bool          host_page_sending;
135     /* The start/end of current host page.  Invalid if host_page_sending==false */
136     unsigned long host_page_start;
137     unsigned long host_page_end;
138 };
139 typedef struct PageSearchStatus PageSearchStatus;
140 
141 /* This struct contains the XBZRLE cache and a static page
142    used by the compression */
143 static struct {
144     /* buffer used for XBZRLE encoding */
145     uint8_t *encoded_buf;
146     /* buffer for storing page content */
147     uint8_t *current_buf;
148     /* Cache for XBZRLE, Protected by lock. */
149     PageCache *cache;
150     QemuMutex lock;
151     /* it will store a page full of zeros */
152     uint8_t *zero_target_page;
153     /* buffer used for XBZRLE decoding */
154     uint8_t *decoded_buf;
155 } XBZRLE;
156 
157 static void XBZRLE_cache_lock(void)
158 {
159     if (migrate_use_xbzrle()) {
160         qemu_mutex_lock(&XBZRLE.lock);
161     }
162 }
163 
164 static void XBZRLE_cache_unlock(void)
165 {
166     if (migrate_use_xbzrle()) {
167         qemu_mutex_unlock(&XBZRLE.lock);
168     }
169 }
170 
171 /**
172  * xbzrle_cache_resize: resize the xbzrle cache
173  *
174  * This function is called from migrate_params_apply in the main
175  * thread, possibly while a migration is in progress.  A running
176  * migration may be using the cache and might finish during this call,
177  * hence changes to the cache are protected by XBZRLE.lock.
178  *
179  * Returns 0 for success or -1 for error
180  *
181  * @new_size: new cache size
182  * @errp: set to the reason if the check fails
183  */
184 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
185 {
186     PageCache *new_cache;
187     int64_t ret = 0;
188 
189     /* Check for truncation */
190     if (new_size != (size_t)new_size) {
191         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
192                    "exceeding address space");
193         return -1;
194     }
195 
196     if (new_size == migrate_xbzrle_cache_size()) {
197         /* nothing to do */
198         return 0;
199     }
200 
201     XBZRLE_cache_lock();
202 
203     if (XBZRLE.cache != NULL) {
204         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
205         if (!new_cache) {
206             ret = -1;
207             goto out;
208         }
209 
210         cache_fini(XBZRLE.cache);
211         XBZRLE.cache = new_cache;
212     }
213 out:
214     XBZRLE_cache_unlock();
215     return ret;
216 }
217 
218 static bool postcopy_preempt_active(void)
219 {
220     return migrate_postcopy_preempt() && migration_in_postcopy();
221 }
222 
223 bool ramblock_is_ignored(RAMBlock *block)
224 {
225     return !qemu_ram_is_migratable(block) ||
226            (migrate_ignore_shared() && qemu_ram_is_shared(block));
227 }
228 
229 #undef RAMBLOCK_FOREACH
230 
231 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
232 {
233     RAMBlock *block;
234     int ret = 0;
235 
236     RCU_READ_LOCK_GUARD();
237 
238     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
239         ret = func(block, opaque);
240         if (ret) {
241             break;
242         }
243     }
244     return ret;
245 }
246 
247 static void ramblock_recv_map_init(void)
248 {
249     RAMBlock *rb;
250 
251     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
252         assert(!rb->receivedmap);
253         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
254     }
255 }
256 
257 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
258 {
259     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
260                     rb->receivedmap);
261 }
262 
263 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
264 {
265     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
266 }
267 
268 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
269 {
270     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
271 }
272 
273 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
274                                     size_t nr)
275 {
276     bitmap_set_atomic(rb->receivedmap,
277                       ramblock_recv_bitmap_offset(host_addr, rb),
278                       nr);
279 }
280 
281 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
282 
283 /*
284  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
285  *
286  * Returns the number of bytes sent (>0) on success, or <0 on error.
287  */
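/*
 * For reference, the stream produced below looks like this (the two
 * integers are written big endian by qemu_put_be64()):
 *
 *   [8 bytes]   size       bitmap length in bytes, rounded up to 8
 *   [size]      le_bitmap  the receive bitmap, little-endian bit order
 *   [8 bytes]   ending     RAMBLOCK_RECV_BITMAP_ENDING sanity marker
 */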
288 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
289                                   const char *block_name)
290 {
291     RAMBlock *block = qemu_ram_block_by_name(block_name);
292     unsigned long *le_bitmap, nbits;
293     uint64_t size;
294 
295     if (!block) {
296         error_report("%s: invalid block name: %s", __func__, block_name);
297         return -1;
298     }
299 
300     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
301 
302     /*
303      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
304      * machines we may need 4 more bytes for padding (see below
305      * comment).  So extend it a bit beforehand.
306      */
307     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
308 
309     /*
310      * Always use little endian when sending the bitmap. This is
311      * required so that the bitmap is correct even when the source and
312      * destination VMs use different endianness. (Note: big endian won't work.)
313      */
314     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
315 
316     /* Size of the bitmap, in bytes */
317     size = DIV_ROUND_UP(nbits, 8);
318 
319     /*
320      * size is always aligned to 8 bytes for 64bit machines, but that
321      * may not be true for 32bit machines.  We need this padding to
322      * make sure the migration can survive even between 32bit and
323      * 64bit machines.
324      */
325     size = ROUND_UP(size, 8);
326 
327     qemu_put_be64(file, size);
328     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
329     /*
330      * Mark the end, so that corruption in the middle of the stream
331      * can be detected.
332      */
333     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
334     qemu_fflush(file);
335 
336     g_free(le_bitmap);
337 
338     if (qemu_file_get_error(file)) {
339         return qemu_file_get_error(file);
340     }
341 
342     return size + sizeof(size);
343 }
344 
345 /*
346  * An outstanding page request, on the source, having been received
347  * and queued
348  */
349 struct RAMSrcPageRequest {
350     RAMBlock *rb;
351     hwaddr    offset;
352     hwaddr    len;
353 
354     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
355 };
356 
357 /* State of RAM for migration */
358 struct RAMState {
359     /*
360      * PageSearchStatus structures for the channels used when sending pages.
361      * Protected by the bitmap_mutex.
362      */
363     PageSearchStatus pss[RAM_CHANNEL_MAX];
364     /* UFFD file descriptor, used in 'write-tracking' migration */
365     int uffdio_fd;
366     /* total ram size in bytes */
367     uint64_t ram_bytes_total;
368     /* Last block that we have visited searching for dirty pages */
369     RAMBlock *last_seen_block;
370     /* Last dirty target page we have sent */
371     ram_addr_t last_page;
372     /* last ram version we have seen */
373     uint32_t last_version;
374     /* How many times we have dirtied too many pages */
375     int dirty_rate_high_cnt;
376     /* these variables are used for bitmap sync */
377     /* last time we did a full bitmap_sync */
378     int64_t time_last_bitmap_sync;
379     /* bytes transferred at start_time */
380     uint64_t bytes_xfer_prev;
381     /* number of dirty pages since start_time */
382     uint64_t num_dirty_pages_period;
383     /* xbzrle misses since the beginning of the period */
384     uint64_t xbzrle_cache_miss_prev;
385     /* Amount of xbzrle pages since the beginning of the period */
386     uint64_t xbzrle_pages_prev;
387     /* Amount of xbzrle encoded bytes since the beginning of the period */
388     uint64_t xbzrle_bytes_prev;
389     /* Start using XBZRLE (e.g., after the first round). */
390     bool xbzrle_enabled;
391     /* Are we on the last stage of migration */
392     bool last_stage;
393     /* compression statistics since the beginning of the period */
394     /* number of times there was no free thread to compress data */
395     uint64_t compress_thread_busy_prev;
396     /* amount of bytes after compression */
397     uint64_t compressed_size_prev;
398     /* number of compressed pages */
399     uint64_t compress_pages_prev;
400 
401     /* total handled target pages at the beginning of period */
402     uint64_t target_page_count_prev;
403     /* total handled target pages since start */
404     uint64_t target_page_count;
405     /* number of dirty bits in the bitmap */
406     uint64_t migration_dirty_pages;
407     /*
408      * Protects:
409      * - dirty/clear bitmap
410      * - migration_dirty_pages
411      * - pss structures
412      */
413     QemuMutex bitmap_mutex;
414     /* The RAMBlock used in the last src_page_requests */
415     RAMBlock *last_req_rb;
416     /* Queue of outstanding page requests from the destination */
417     QemuMutex src_page_req_mutex;
418     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
419 };
420 typedef struct RAMState RAMState;
421 
422 static RAMState *ram_state;
423 
424 static NotifierWithReturnList precopy_notifier_list;
425 
426 /* Whether postcopy has queued requests */
427 static bool postcopy_has_request(RAMState *rs)
428 {
429     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
430 }
431 
432 void precopy_infrastructure_init(void)
433 {
434     notifier_with_return_list_init(&precopy_notifier_list);
435 }
436 
437 void precopy_add_notifier(NotifierWithReturn *n)
438 {
439     notifier_with_return_list_add(&precopy_notifier_list, n);
440 }
441 
442 void precopy_remove_notifier(NotifierWithReturn *n)
443 {
444     notifier_with_return_remove(n);
445 }
446 
447 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
448 {
449     PrecopyNotifyData pnd;
450     pnd.reason = reason;
451     pnd.errp = errp;
452 
453     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
454 }
455 
456 uint64_t ram_bytes_remaining(void)
457 {
458     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
459                        0;
460 }
461 
462 RAMStats ram_counters;
463 
464 void ram_transferred_add(uint64_t bytes)
465 {
466     if (runstate_is_running()) {
467         stat64_add(&ram_counters.precopy_bytes, bytes);
468     } else if (migration_in_postcopy()) {
469         stat64_add(&ram_counters.postcopy_bytes, bytes);
470     } else {
471         stat64_add(&ram_counters.downtime_bytes, bytes);
472     }
473     stat64_add(&ram_counters.transferred, bytes);
474 }
475 
476 struct MigrationOps {
477     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
478 };
479 typedef struct MigrationOps MigrationOps;
480 
481 MigrationOps *migration_ops;
482 
483 CompressionStats compression_counters;
484 
485 struct CompressParam {
486     bool done;
487     bool quit;
488     bool zero_page;
489     QEMUFile *file;
490     QemuMutex mutex;
491     QemuCond cond;
492     RAMBlock *block;
493     ram_addr_t offset;
494 
495     /* internally used fields */
496     z_stream stream;
497     uint8_t *originbuf;
498 };
499 typedef struct CompressParam CompressParam;
500 
501 struct DecompressParam {
502     bool done;
503     bool quit;
504     QemuMutex mutex;
505     QemuCond cond;
506     void *des;
507     uint8_t *compbuf;
508     int len;
509     z_stream stream;
510 };
511 typedef struct DecompressParam DecompressParam;
512 
513 static CompressParam *comp_param;
514 static QemuThread *compress_threads;
515 /* comp_done_cond is used to wake up the migration thread when
516  * one of the compression threads has finished the compression.
517  * comp_done_lock is used together with comp_done_cond.
518  */
519 static QemuMutex comp_done_lock;
520 static QemuCond comp_done_cond;
521 
522 static QEMUFile *decomp_file;
523 static DecompressParam *decomp_param;
524 static QemuThread *decompress_threads;
525 static QemuMutex decomp_done_lock;
526 static QemuCond decomp_done_cond;
527 
528 static int ram_save_host_page_urgent(PageSearchStatus *pss);
529 
530 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
531                                  ram_addr_t offset, uint8_t *source_buf);
532 
533 /* NOTE: page is the PFN, not a real ram_addr_t. */
534 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
535 {
536     pss->block = rb;
537     pss->page = page;
538     pss->complete_round = false;
539 }
540 
541 /*
542  * Check whether two PSSs are actively sending the same page.  Return true
543  * if they are, false otherwise.
544  */
545 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
546 {
547     return pss1->host_page_sending && pss2->host_page_sending &&
548         (pss1->host_page_start == pss2->host_page_start);
549 }
550 
551 static void *do_data_compress(void *opaque)
552 {
553     CompressParam *param = opaque;
554     RAMBlock *block;
555     ram_addr_t offset;
556     bool zero_page;
557 
558     qemu_mutex_lock(&param->mutex);
559     while (!param->quit) {
560         if (param->block) {
561             block = param->block;
562             offset = param->offset;
563             param->block = NULL;
564             qemu_mutex_unlock(&param->mutex);
565 
566             zero_page = do_compress_ram_page(param->file, &param->stream,
567                                              block, offset, param->originbuf);
568 
569             qemu_mutex_lock(&comp_done_lock);
570             param->done = true;
571             param->zero_page = zero_page;
572             qemu_cond_signal(&comp_done_cond);
573             qemu_mutex_unlock(&comp_done_lock);
574 
575             qemu_mutex_lock(&param->mutex);
576         } else {
577             qemu_cond_wait(&param->cond, &param->mutex);
578         }
579     }
580     qemu_mutex_unlock(&param->mutex);
581 
582     return NULL;
583 }
584 
585 static void compress_threads_save_cleanup(void)
586 {
587     int i, thread_count;
588 
589     if (!migrate_compress() || !comp_param) {
590         return;
591     }
592 
593     thread_count = migrate_compress_threads();
594     for (i = 0; i < thread_count; i++) {
595         /*
596          * we use it as an indicator of whether the thread has been
597          * properly initialized or not
598          */
599         if (!comp_param[i].file) {
600             break;
601         }
602 
603         qemu_mutex_lock(&comp_param[i].mutex);
604         comp_param[i].quit = true;
605         qemu_cond_signal(&comp_param[i].cond);
606         qemu_mutex_unlock(&comp_param[i].mutex);
607 
608         qemu_thread_join(compress_threads + i);
609         qemu_mutex_destroy(&comp_param[i].mutex);
610         qemu_cond_destroy(&comp_param[i].cond);
611         deflateEnd(&comp_param[i].stream);
612         g_free(comp_param[i].originbuf);
613         qemu_fclose(comp_param[i].file);
614         comp_param[i].file = NULL;
615     }
616     qemu_mutex_destroy(&comp_done_lock);
617     qemu_cond_destroy(&comp_done_cond);
618     g_free(compress_threads);
619     g_free(comp_param);
620     compress_threads = NULL;
621     comp_param = NULL;
622 }
623 
624 static int compress_threads_save_setup(void)
625 {
626     int i, thread_count;
627 
628     if (!migrate_compress()) {
629         return 0;
630     }
631     thread_count = migrate_compress_threads();
632     compress_threads = g_new0(QemuThread, thread_count);
633     comp_param = g_new0(CompressParam, thread_count);
634     qemu_cond_init(&comp_done_cond);
635     qemu_mutex_init(&comp_done_lock);
636     for (i = 0; i < thread_count; i++) {
637         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
638         if (!comp_param[i].originbuf) {
639             goto exit;
640         }
641 
642         if (deflateInit(&comp_param[i].stream,
643                         migrate_compress_level()) != Z_OK) {
644             g_free(comp_param[i].originbuf);
645             goto exit;
646         }
647 
648         /* comp_param[i].file is just used as a dummy buffer to save data;
649          * set its ops to empty.
650          */
651         comp_param[i].file = qemu_file_new_output(
652             QIO_CHANNEL(qio_channel_null_new()));
653         comp_param[i].done = true;
654         comp_param[i].quit = false;
655         qemu_mutex_init(&comp_param[i].mutex);
656         qemu_cond_init(&comp_param[i].cond);
657         qemu_thread_create(compress_threads + i, "compress",
658                            do_data_compress, comp_param + i,
659                            QEMU_THREAD_JOINABLE);
660     }
661     return 0;
662 
663 exit:
664     compress_threads_save_cleanup();
665     return -1;
666 }
667 
668 /**
669  * save_page_header: write page header to wire
670  *
671  * If this is the 1st block, it also writes the block identification
672  *
673  * Returns the number of bytes written
674  *
675  * @pss: current PSS channel status
676  * @f: QEMUFile where to send the data
677  * @block: block that contains the page we want to send
678  * @offset: offset inside the block for the page; the lower bits contain flags
679  */
680 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
681                                RAMBlock *block, ram_addr_t offset)
682 {
683     size_t size, len;
684     bool same_block = (block == pss->last_sent_block);
685 
686     if (same_block) {
687         offset |= RAM_SAVE_FLAG_CONTINUE;
688     }
689     qemu_put_be64(f, offset);
690     size = 8;
691 
692     if (!same_block) {
693         len = strlen(block->idstr);
694         qemu_put_byte(f, len);
695         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
696         size += 1 + len;
697         pss->last_sent_block = block;
698     }
699     return size;
700 }
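/*
 * For reference, the header emitted above is:
 *
 *   [8 bytes]  offset | flags         (RAM_SAVE_FLAG_CONTINUE is set when
 *                                      the block repeats)
 *   [1 byte]   strlen(block->idstr)   (only when the block changes)
 *   [len]      block->idstr           (only when the block changes)
 */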
701 
702 /**
703  * mig_throttle_guest_down: throttle down the guest
704  *
705  * Reduce amount of guest cpu execution to hopefully slow down memory
706  * writes. If guest dirty memory rate is reduced below the rate at
707  * which we can transfer pages to the destination then we should be
708  * able to complete migration. Some workloads dirty memory way too
709  * fast and will not effectively converge, even with auto-converge.
710  */
711 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
712                                     uint64_t bytes_dirty_threshold)
713 {
714     MigrationState *s = migrate_get_current();
715     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
716     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
717     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
718     int pct_max = s->parameters.max_cpu_throttle;
719 
720     uint64_t throttle_now = cpu_throttle_get_percentage();
721     uint64_t cpu_now, cpu_ideal, throttle_inc;
722 
723     /* We have not started throttling yet. Let's start it. */
724     if (!cpu_throttle_active()) {
725         cpu_throttle_set(pct_initial);
726     } else {
727         /* Throttling already on, just increase the rate */
728         if (!pct_tailslow) {
729             throttle_inc = pct_increment;
730         } else {
731             /* Compute the ideal CPU percentage used by the guest, which may
732              * make the dirty rate match the dirty rate threshold. */
733             cpu_now = 100 - throttle_now;
734             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
735                         bytes_dirty_period);
736             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
737         }
738         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
739     }
740 }
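/*
 * Worked example for the tailslow path above (numbers are illustrative):
 * with the current throttle at 20%, cpu_now = 80.  If the guest dirtied
 * twice as many bytes as could be transferred in the period
 * (bytes_dirty_threshold / bytes_dirty_period = 0.5), then
 * cpu_ideal = 80 * 0.5 = 40, so the throttle is raised by
 * MIN(80 - 40, cpu_throttle_increment), capped at max_cpu_throttle.
 */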
741 
742 void mig_throttle_counter_reset(void)
743 {
744     RAMState *rs = ram_state;
745 
746     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
747     rs->num_dirty_pages_period = 0;
748     rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
749 }
750 
751 /**
752  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
753  *
754  * @rs: current RAM state
755  * @current_addr: address for the zero page
756  *
757  * Update the xbzrle cache to reflect a page that's been sent as all 0.
758  * The important thing is that a stale (not-yet-0'd) page be replaced
759  * by the new data.
760  * As a bonus, if the page wasn't in the cache it gets added so that
761  * when a small write is made into the 0'd page it gets XBZRLE sent.
762  */
763 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
764 {
765     /* We don't care if this fails to allocate a new cache page
766      * as long as it updated an old one */
767     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
768                  stat64_get(&ram_counters.dirty_sync_count));
769 }
770 
771 #define ENCODING_FLAG_XBZRLE 0x1
772 
773 /**
774  * save_xbzrle_page: compress and send current page
775  *
776  * Returns: 1 means that we wrote the page
777  *          0 means that page is identical to the one already sent
778  *          -1 means that xbzrle would be longer than normal
779  *
780  * @rs: current RAM state
781  * @pss: current PSS channel
782  * @current_data: pointer to the address of the page contents
783  * @current_addr: addr of the page
784  * @block: block that contains the page we want to send
785  * @offset: offset inside the block for the page
786  */
787 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
788                             uint8_t **current_data, ram_addr_t current_addr,
789                             RAMBlock *block, ram_addr_t offset)
790 {
791     int encoded_len = 0, bytes_xbzrle;
792     uint8_t *prev_cached_page;
793     QEMUFile *file = pss->pss_channel;
794     uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
795 
796     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
797         xbzrle_counters.cache_miss++;
798         if (!rs->last_stage) {
799             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
800                              generation) == -1) {
801                 return -1;
802             } else {
803                 /* update *current_data when the page has been
804                    inserted into cache */
805                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
806             }
807         }
808         return -1;
809     }
810 
811     /*
812      * Reaching here means the page has hit the xbzrle cache, no matter what
813      * encoding result it is (normal encoding, overflow or skipping the page),
814      * count the page as encoded. This is used to calculate the encoding rate.
815      *
816      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
817      * 2nd page turns out to be skipped (i.e. no new bytes written to the
818      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
819      * skipped page included. In this way, the encoding rate can tell if the
820      * guest page is good for xbzrle encoding.
821      */
822     xbzrle_counters.pages++;
823     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
824 
825     /* save current buffer into memory */
826     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
827 
828     /* XBZRLE encoding (if there is no overflow) */
829     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
830                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
831                                             TARGET_PAGE_SIZE);
832 
833     /*
834      * Update the cache contents, so that it corresponds to the data
835      * sent, in all cases except where we skip the page.
836      */
837     if (!rs->last_stage && encoded_len != 0) {
838         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
839         /*
840          * In the case where we couldn't compress, ensure that the caller
841          * sends the data from the cache, since the guest might have
842          * changed the RAM since we copied it.
843          */
844         *current_data = prev_cached_page;
845     }
846 
847     if (encoded_len == 0) {
848         trace_save_xbzrle_page_skipping();
849         return 0;
850     } else if (encoded_len == -1) {
851         trace_save_xbzrle_page_overflow();
852         xbzrle_counters.overflow++;
853         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
854         return -1;
855     }
856 
857     /* Send XBZRLE based compressed page */
858     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
859                                     offset | RAM_SAVE_FLAG_XBZRLE);
860     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
861     qemu_put_be16(file, encoded_len);
862     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
863     bytes_xbzrle += encoded_len + 1 + 2;
864     /*
865      * Like compressed_size (please see update_compress_thread_counts),
866      * the xbzrle encoded bytes don't count the 8 byte header with
867      * RAM_SAVE_FLAG_CONTINUE.
868      */
869     xbzrle_counters.bytes += bytes_xbzrle - 8;
870     ram_transferred_add(bytes_xbzrle);
871 
872     return 1;
873 }
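/*
 * For reference, an XBZRLE page as sent above is:
 *
 *   [header]             save_page_header() with RAM_SAVE_FLAG_XBZRLE set
 *   [1 byte]             ENCODING_FLAG_XBZRLE
 *   [2 bytes]            encoded_len (big endian)
 *   [encoded_len bytes]  the XBZRLE-encoded delta against the cached page
 */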
874 
875 /**
876  * pss_find_next_dirty: find the next dirty page of current ramblock
877  *
878  * This function updates pss->page to point to the next dirty page index
879  * within the ramblock to migrate, or the end of ramblock when nothing
880  * found.  Note that when pss->host_page_sending==true it means we're
881  * in the middle of sending a host page, so we won't look for dirty
882  * pages outside the host page boundary.
883  *
884  * @pss: the current page search status
885  */
886 static void pss_find_next_dirty(PageSearchStatus *pss)
887 {
888     RAMBlock *rb = pss->block;
889     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
890     unsigned long *bitmap = rb->bmap;
891 
892     if (ramblock_is_ignored(rb)) {
893         /* Points directly to the end, so we know no dirty page */
894         pss->page = size;
895         return;
896     }
897 
898     /*
899      * If we are sending a host page, only look for dirty pages within
900      * the current host page being sent.
901      */
902     if (pss->host_page_sending) {
903         assert(pss->host_page_end);
904         size = MIN(size, pss->host_page_end);
905     }
906 
907     pss->page = find_next_bit(bitmap, size, pss->page);
908 }
909 
910 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
911                                                        unsigned long page)
912 {
913     uint8_t shift;
914     hwaddr size, start;
915 
916     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
917         return;
918     }
919 
920     shift = rb->clear_bmap_shift;
921     /*
922      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
923      * can make things easier sometimes since the start address
924      * of the small chunk will always be aligned to 64 pages, so the
925      * bitmap will always be aligned to unsigned long.  We should
926      * even be able to remove this restriction, but it is kept
927      * for now.
928      */
929     assert(shift >= 6);
930 
931     size = 1ULL << (TARGET_PAGE_BITS + shift);
932     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
933     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
934     memory_region_clear_dirty_bitmap(rb->mr, start, size);
935 }
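/*
 * Size example (values are illustrative): with clear_bmap_shift == 18 and
 * 4KiB target pages, size = 1ULL << (12 + 18) = 1GiB, i.e. a single
 * memory_region_clear_dirty_bitmap() call above clears the dirty log for
 * one whole 1GiB-aligned chunk.
 */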
936 
937 static void
938 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
939                                                  unsigned long start,
940                                                  unsigned long npages)
941 {
942     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
943     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
944     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
945 
946     /*
947      * Clear pages from start to start + npages - 1, so the end boundary is
948      * exclusive.
949      */
950     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
951         migration_clear_memory_region_dirty_bitmap(rb, i);
952     }
953 }
954 
955 /*
956  * colo_bitmap_find_diry:find contiguous dirty pages from start
957  *
958  * Returns the page offset within memory region of the start of the contiguout
959  * dirty page
960  *
961  * @rs: current RAM state
962  * @rb: RAMBlock where to search for dirty pages
963  * @start: page where we start the search
964  * @num: the number of contiguous dirty pages
965  */
966 static inline
967 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
968                                      unsigned long start, unsigned long *num)
969 {
970     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
971     unsigned long *bitmap = rb->bmap;
972     unsigned long first, next;
973 
974     *num = 0;
975 
976     if (ramblock_is_ignored(rb)) {
977         return size;
978     }
979 
980     first = find_next_bit(bitmap, size, start);
981     if (first >= size) {
982         return first;
983     }
984     next = find_next_zero_bit(bitmap, size, first + 1);
985     assert(next >= first);
986     *num = next - first;
987     return first;
988 }
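/*
 * Example (illustrative): if the block's dirty bitmap is ...00111100b and
 * start == 0, colo_bitmap_find_dirty() returns 2 with *num == 4, i.e. the
 * run of dirty pages [2, 5].
 */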
989 
990 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
991                                                 RAMBlock *rb,
992                                                 unsigned long page)
993 {
994     bool ret;
995 
996     /*
997      * Clear dirty bitmap if needed.  This _must_ be called before we
998      * send any of the pages in the chunk, because we need to make sure
999      * we can capture further page content changes when we sync the dirty
1000      * log the next time.  So as long as we are going to send any of
1001      * the pages in the chunk, we clear the remote dirty bitmap for all of them.
1002      * Clearing it earlier won't be a problem, but clearing it too late will.
1003      */
1004     migration_clear_memory_region_dirty_bitmap(rb, page);
1005 
1006     ret = test_and_clear_bit(page, rb->bmap);
1007     if (ret) {
1008         rs->migration_dirty_pages--;
1009     }
1010 
1011     return ret;
1012 }
1013 
1014 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1015                                        void *opaque)
1016 {
1017     const hwaddr offset = section->offset_within_region;
1018     const hwaddr size = int128_get64(section->size);
1019     const unsigned long start = offset >> TARGET_PAGE_BITS;
1020     const unsigned long npages = size >> TARGET_PAGE_BITS;
1021     RAMBlock *rb = section->mr->ram_block;
1022     uint64_t *cleared_bits = opaque;
1023 
1024     /*
1025      * We don't grab ram_state->bitmap_mutex because we expect to run
1026      * only when starting migration or during postcopy recovery where
1027      * we don't have concurrent access.
1028      */
1029     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1030         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1031     }
1032     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1033     bitmap_clear(rb->bmap, start, npages);
1034 }
1035 
1036 /*
1037  * Exclude all dirty pages from migration that fall into a discarded range as
1038  * managed by a RamDiscardManager responsible for the mapped memory region of
1039  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1040  *
1041  * Discarded pages ("logically unplugged") have undefined content and must
1042  * not get migrated, because even reading these pages for migration might
1043  * result in undesired behavior.
1044  *
1045  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1046  *
1047  * Note: The result is only stable while migrating (precopy/postcopy).
1048  */
1049 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1050 {
1051     uint64_t cleared_bits = 0;
1052 
1053     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1054         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1055         MemoryRegionSection section = {
1056             .mr = rb->mr,
1057             .offset_within_region = 0,
1058             .size = int128_make64(qemu_ram_get_used_length(rb)),
1059         };
1060 
1061         ram_discard_manager_replay_discarded(rdm, &section,
1062                                              dirty_bitmap_clear_section,
1063                                              &cleared_bits);
1064     }
1065     return cleared_bits;
1066 }
1067 
1068 /*
1069  * Check if a host-page aligned page falls into a discarded range as managed by
1070  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1071  *
1072  * Note: The result is only stable while migrating (precopy/postcopy).
1073  */
1074 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1075 {
1076     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1077         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1078         MemoryRegionSection section = {
1079             .mr = rb->mr,
1080             .offset_within_region = start,
1081             .size = int128_make64(qemu_ram_pagesize(rb)),
1082         };
1083 
1084         return !ram_discard_manager_is_populated(rdm, &section);
1085     }
1086     return false;
1087 }
1088 
1089 /* Called with RCU critical section */
1090 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1091 {
1092     uint64_t new_dirty_pages =
1093         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1094 
1095     rs->migration_dirty_pages += new_dirty_pages;
1096     rs->num_dirty_pages_period += new_dirty_pages;
1097 }
1098 
1099 /**
1100  * ram_pagesize_summary: calculate all the pagesizes of a VM
1101  *
1102  * Returns a summary bitmap of the page sizes of all RAMBlocks
1103  *
1104  * For VMs with just normal pages this is equivalent to the host page
1105  * size. If it has some huge pages then it's the OR of all the
1106  * different page sizes.
1107  */
1108 uint64_t ram_pagesize_summary(void)
1109 {
1110     RAMBlock *block;
1111     uint64_t summary = 0;
1112 
1113     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1114         summary |= block->page_size;
1115     }
1116 
1117     return summary;
1118 }
1119 
1120 uint64_t ram_get_total_transferred_pages(void)
1121 {
1122     return stat64_get(&ram_counters.normal_pages) +
1123         stat64_get(&ram_counters.zero_pages) +
1124         compression_counters.pages + xbzrle_counters.pages;
1125 }
1126 
1127 static void migration_update_rates(RAMState *rs, int64_t end_time)
1128 {
1129     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1130     double compressed_size;
1131 
1132     /* calculate period counters */
1133     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1134                 / (end_time - rs->time_last_bitmap_sync);
1135 
1136     if (!page_count) {
1137         return;
1138     }
1139 
1140     if (migrate_use_xbzrle()) {
1141         double encoded_size, unencoded_size;
1142 
1143         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1144             rs->xbzrle_cache_miss_prev) / page_count;
1145         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1146         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1147                          TARGET_PAGE_SIZE;
1148         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1149         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1150             xbzrle_counters.encoding_rate = 0;
1151         } else {
1152             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1153         }
1154         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1155         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1156     }
1157 
1158     if (migrate_compress()) {
1159         compression_counters.busy_rate = (double)(compression_counters.busy -
1160             rs->compress_thread_busy_prev) / page_count;
1161         rs->compress_thread_busy_prev = compression_counters.busy;
1162 
1163         compressed_size = compression_counters.compressed_size -
1164                           rs->compressed_size_prev;
1165         if (compressed_size) {
1166             double uncompressed_size = (compression_counters.pages -
1167                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1168 
1169             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1170             compression_counters.compression_rate =
1171                                         uncompressed_size / compressed_size;
1172 
1173             rs->compress_pages_prev = compression_counters.pages;
1174             rs->compressed_size_prev = compression_counters.compressed_size;
1175         }
1176     }
1177 }
1178 
1179 static void migration_trigger_throttle(RAMState *rs)
1180 {
1181     MigrationState *s = migrate_get_current();
1182     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1183     uint64_t bytes_xfer_period =
1184         stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
1185     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1186     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1187 
1188     /* During block migration the auto-converge logic incorrectly detects
1189      * that ram migration makes no progress. Avoid this by disabling the
1190      * throttling logic during the bulk phase of block migration. */
1191     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1192         /* The following detection logic can be refined later. For now:
1193            Check to see if the ratio between dirtied bytes and the approx.
1194            amount of bytes that just got transferred since the last time
1195            we were in this routine reaches the threshold. If that happens
1196            twice, start or increase throttling. */
1197 
1198         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1199             (++rs->dirty_rate_high_cnt >= 2)) {
1200             trace_migration_throttle();
1201             rs->dirty_rate_high_cnt = 0;
1202             mig_throttle_guest_down(bytes_dirty_period,
1203                                     bytes_dirty_threshold);
1204         }
1205     }
1206 }
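/*
 * Example (the parameter value is illustrative): with
 * throttle-trigger-threshold set to 50, throttling starts or increases
 * once the guest has dirtied more than 50% of the bytes transferred in
 * the period, for two consecutive sync periods.
 */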
1207 
1208 static void migration_bitmap_sync(RAMState *rs)
1209 {
1210     RAMBlock *block;
1211     int64_t end_time;
1212 
1213     stat64_add(&ram_counters.dirty_sync_count, 1);
1214 
1215     if (!rs->time_last_bitmap_sync) {
1216         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1217     }
1218 
1219     trace_migration_bitmap_sync_start();
1220     memory_global_dirty_log_sync();
1221 
1222     qemu_mutex_lock(&rs->bitmap_mutex);
1223     WITH_RCU_READ_LOCK_GUARD() {
1224         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1225             ramblock_sync_dirty_bitmap(rs, block);
1226         }
1227         ram_counters.remaining = ram_bytes_remaining();
1228     }
1229     qemu_mutex_unlock(&rs->bitmap_mutex);
1230 
1231     memory_global_after_dirty_log_sync();
1232     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1233 
1234     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1235 
1236     /* more than 1 second = 1000 milliseconds */
1237     if (end_time > rs->time_last_bitmap_sync + 1000) {
1238         migration_trigger_throttle(rs);
1239 
1240         migration_update_rates(rs, end_time);
1241 
1242         rs->target_page_count_prev = rs->target_page_count;
1243 
1244         /* reset period counters */
1245         rs->time_last_bitmap_sync = end_time;
1246         rs->num_dirty_pages_period = 0;
1247         rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
1248     }
1249     if (migrate_events()) {
1250         uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
1251         qapi_event_send_migration_pass(generation);
1252     }
1253 }
1254 
1255 static void migration_bitmap_sync_precopy(RAMState *rs)
1256 {
1257     Error *local_err = NULL;
1258 
1259     /*
1260      * The current notifier usage is just an optimization for migration, so we
1261      * don't stop the normal migration process in the error case.
1262      */
1263     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1264         error_report_err(local_err);
1265         local_err = NULL;
1266     }
1267 
1268     migration_bitmap_sync(rs);
1269 
1270     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1271         error_report_err(local_err);
1272     }
1273 }
1274 
1275 void ram_release_page(const char *rbname, uint64_t offset)
1276 {
1277     if (!migrate_release_ram() || !migration_in_postcopy()) {
1278         return;
1279     }
1280 
1281     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1282 }
1283 
1284 /**
1285  * save_zero_page_to_file: send the zero page to the file
1286  *
1287  * Returns the size of the data written to the file; 0 means the page is
1288  * not a zero page
1289  *
1290  * @pss: current PSS channel
1291  * @block: block that contains the page we want to send
1292  * @offset: offset inside the block for the page
1293  */
1294 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1295                                   RAMBlock *block, ram_addr_t offset)
1296 {
1297     uint8_t *p = block->host + offset;
1298     int len = 0;
1299 
1300     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1301         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1302         qemu_put_byte(file, 0);
1303         len += 1;
1304         ram_release_page(block->idstr, offset);
1305     }
1306     return len;
1307 }
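/*
 * For reference, a zero page on the wire is just the page header (with
 * RAM_SAVE_FLAG_ZERO set) followed by a single 0x00 byte, as written above.
 */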
1308 
1309 /**
1310  * save_zero_page: send the zero page to the stream
1311  *
1312  * Returns the number of pages written.
1313  *
1314  * @pss: current PSS channel
1315  * @block: block that contains the page we want to send
1316  * @offset: offset inside the block for the page
1317  */
1318 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1319                           ram_addr_t offset)
1320 {
1321     int len = save_zero_page_to_file(pss, f, block, offset);
1322 
1323     if (len) {
1324         stat64_add(&ram_counters.zero_pages, 1);
1325         ram_transferred_add(len);
1326         return 1;
1327     }
1328     return -1;
1329 }
1330 
1331 /*
1332  * @pages: the number of pages written by the control path,
1333  *        < 0 - error
1334  *        > 0 - number of pages written
1335  *
1336  * Return true if the page has been saved, otherwise false is returned.
1337  */
1338 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1339                               ram_addr_t offset, int *pages)
1340 {
1341     uint64_t bytes_xmit = 0;
1342     int ret;
1343 
1344     *pages = -1;
1345     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1346                                 TARGET_PAGE_SIZE, &bytes_xmit);
1347     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1348         return false;
1349     }
1350 
1351     if (bytes_xmit) {
1352         ram_transferred_add(bytes_xmit);
1353         *pages = 1;
1354     }
1355 
1356     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1357         return true;
1358     }
1359 
1360     if (bytes_xmit > 0) {
1361         stat64_add(&ram_counters.normal_pages, 1);
1362     } else if (bytes_xmit == 0) {
1363         stat64_add(&ram_counters.zero_pages, 1);
1364     }
1365 
1366     return true;
1367 }
1368 
1369 /*
1370  * directly send the page to the stream
1371  *
1372  * Returns the number of pages written.
1373  *
1374  * @pss: current PSS channel
1375  * @block: block that contains the page we want to send
1376  * @offset: offset inside the block for the page
1377  * @buf: the page to be sent
1378  * @async: send to page asyncly
1379  */
1380 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1381                             ram_addr_t offset, uint8_t *buf, bool async)
1382 {
1383     QEMUFile *file = pss->pss_channel;
1384 
1385     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1386                                          offset | RAM_SAVE_FLAG_PAGE));
1387     if (async) {
1388         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1389                               migrate_release_ram() &&
1390                               migration_in_postcopy());
1391     } else {
1392         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1393     }
1394     ram_transferred_add(TARGET_PAGE_SIZE);
1395     stat64_add(&ram_counters.normal_pages, 1);
1396     return 1;
1397 }
1398 
1399 /**
1400  * ram_save_page: send the given page to the stream
1401  *
1402  * Returns the number of pages written.
1403  *          < 0 - error
1404  *          >=0 - Number of pages written - this might legally be 0
1405  *                if xbzrle noticed the page was the same.
1406  *
1407  * @rs: current RAM state
1408  * @block: block that contains the page we want to send
1409  * @offset: offset inside the block for the page
1410  */
1411 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1412 {
1413     int pages = -1;
1414     uint8_t *p;
1415     bool send_async = true;
1416     RAMBlock *block = pss->block;
1417     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1418     ram_addr_t current_addr = block->offset + offset;
1419 
1420     p = block->host + offset;
1421     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1422 
1423     XBZRLE_cache_lock();
1424     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1425         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1426                                  block, offset);
1427         if (!rs->last_stage) {
1428             /* Can't send this cached data async, since the cache page
1429              * might get updated before it gets to the wire
1430              */
1431             send_async = false;
1432         }
1433     }
1434 
1435     /* XBZRLE overflow or normal page */
1436     if (pages == -1) {
1437         pages = save_normal_page(pss, block, offset, p, send_async);
1438     }
1439 
1440     XBZRLE_cache_unlock();
1441 
1442     return pages;
1443 }
1444 
1445 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1446                                  ram_addr_t offset)
1447 {
1448     if (multifd_queue_page(file, block, offset) < 0) {
1449         return -1;
1450     }
1451     stat64_add(&ram_counters.normal_pages, 1);
1452 
1453     return 1;
1454 }
1455 
1456 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1457                                  ram_addr_t offset, uint8_t *source_buf)
1458 {
1459     RAMState *rs = ram_state;
1460     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1461     uint8_t *p = block->host + offset;
1462     int ret;
1463 
1464     if (save_zero_page_to_file(pss, f, block, offset)) {
1465         return true;
1466     }
1467 
1468     save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1469 
1470     /*
1471      * copy it to an internal buffer to avoid it being modified by the VM,
1472      * so that we can catch errors during compression and
1473      * decompression
1474      */
1475     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1476     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1477     if (ret < 0) {
1478         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1479         error_report("compressed data failed!");
1480     }
1481     return false;
1482 }
1483 
1484 static void
1485 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1486 {
1487     ram_transferred_add(bytes_xmit);
1488 
1489     if (param->zero_page) {
1490         stat64_add(&ram_counters.zero_pages, 1);
1491         return;
1492     }
1493 
1494     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1495     compression_counters.compressed_size += bytes_xmit - 8;
1496     compression_counters.pages++;
1497 }
1498 
1499 static bool save_page_use_compression(RAMState *rs);
1500 
1501 static void flush_compressed_data(RAMState *rs)
1502 {
1503     MigrationState *ms = migrate_get_current();
1504     int idx, len, thread_count;
1505 
1506     if (!save_page_use_compression(rs)) {
1507         return;
1508     }
1509     thread_count = migrate_compress_threads();
1510 
1511     qemu_mutex_lock(&comp_done_lock);
1512     for (idx = 0; idx < thread_count; idx++) {
1513         while (!comp_param[idx].done) {
1514             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1515         }
1516     }
1517     qemu_mutex_unlock(&comp_done_lock);
1518 
1519     for (idx = 0; idx < thread_count; idx++) {
1520         qemu_mutex_lock(&comp_param[idx].mutex);
1521         if (!comp_param[idx].quit) {
1522             len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1523             /*
1524              * it's safe to fetch zero_page without holding comp_done_lock
1525              * as there is no further request submitted to the thread,
1526              * i.e., the thread should be waiting for a request at this point.
1527              */
1528             update_compress_thread_counts(&comp_param[idx], len);
1529         }
1530         qemu_mutex_unlock(&comp_param[idx].mutex);
1531     }
1532 }
1533 
1534 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1535                                        ram_addr_t offset)
1536 {
1537     param->block = block;
1538     param->offset = offset;
1539 }
1540 
1541 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1542 {
1543     int idx, thread_count, bytes_xmit = -1, pages = -1;
1544     bool wait = migrate_compress_wait_thread();
1545     MigrationState *ms = migrate_get_current();
1546 
1547     thread_count = migrate_compress_threads();
1548     qemu_mutex_lock(&comp_done_lock);
1549 retry:
1550     for (idx = 0; idx < thread_count; idx++) {
1551         if (comp_param[idx].done) {
1552             comp_param[idx].done = false;
1553             bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1554                                             comp_param[idx].file);
1555             qemu_mutex_lock(&comp_param[idx].mutex);
1556             set_compress_params(&comp_param[idx], block, offset);
1557             qemu_cond_signal(&comp_param[idx].cond);
1558             qemu_mutex_unlock(&comp_param[idx].mutex);
1559             pages = 1;
1560             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1561             break;
1562         }
1563     }
1564 
1565     /*
1566      * wait for the free thread if the user specifies 'compress-wait-thread',
1567      * otherwise we will post the page out in the main thread as a normal page.
1568      */
1569     if (pages < 0 && wait) {
1570         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1571         goto retry;
1572     }
1573     qemu_mutex_unlock(&comp_done_lock);
1574 
1575     return pages;
1576 }
1577 
1578 #define PAGE_ALL_CLEAN 0
1579 #define PAGE_TRY_AGAIN 1
1580 #define PAGE_DIRTY_FOUND 2
1581 /**
1582  * find_dirty_block: find the next dirty page and update any state
1583  * associated with the search process.
1584  *
1585  * Returns:
1586  *         PAGE_ALL_CLEAN: no dirty page found, give up
1587  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1588  *         PAGE_DIRTY_FOUND: dirty page found
1589  *
1590  * @rs: current RAM state
1591  * @pss: data about the state of the current dirty page scan
1593  */
1594 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1595 {
1596     /* Update pss->page for the next dirty bit in ramblock */
1597     pss_find_next_dirty(pss);
1598 
1599     if (pss->complete_round && pss->block == rs->last_seen_block &&
1600         pss->page >= rs->last_page) {
1601         /*
1602          * We've been once around the RAM and haven't found anything.
1603          * Give up.
1604          */
1605         return PAGE_ALL_CLEAN;
1606     }
1607     if (!offset_in_ramblock(pss->block,
1608                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1609         /* Didn't find anything in this RAM Block */
1610         pss->page = 0;
1611         pss->block = QLIST_NEXT_RCU(pss->block, next);
1612         if (!pss->block) {
1613             /*
1614              * If memory migration starts over, we may meet a dirtied page
1615              * that still exists in the compression threads' ring, so we
1616              * should flush the compressed data to make sure the new page
1617              * is not overwritten by the old one on the destination.
1618              *
1619              * Also, if xbzrle is on, stop using data compression at this
1620              * point. In theory, xbzrle can do better than compression.
1621              */
1622             flush_compressed_data(rs);
1623 
1624             /* Hit the end of the list */
1625             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1626             /* Flag that we've looped */
1627             pss->complete_round = true;
1628             /* After the first round, enable XBZRLE. */
1629             if (migrate_use_xbzrle()) {
1630                 rs->xbzrle_enabled = true;
1631             }
1632         }
1633         /* Didn't find anything this time, but try again on the new block */
1634         return PAGE_TRY_AGAIN;
1635     } else {
1636         /* We've found something */
1637         return PAGE_DIRTY_FOUND;
1638     }
1639 }
1640 
1641 /**
1642  * unqueue_page: gets a page of the queue
1643  *
1644  * Helper for 'get_queued_page' - gets a page off the queue
1645  *
1646  * Returns the block of the page (or NULL if none available)
1647  *
1648  * @rs: current RAM state
1649  * @offset: used to return the offset within the RAMBlock
1650  */
1651 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1652 {
1653     struct RAMSrcPageRequest *entry;
1654     RAMBlock *block = NULL;
1655 
1656     if (!postcopy_has_request(rs)) {
1657         return NULL;
1658     }
1659 
1660     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1661 
1662     /*
1663      * This should _never_ change even after we take the lock, because no one
1664      * should be taking anything off the request list other than us.
1665      */
1666     assert(postcopy_has_request(rs));
1667 
1668     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1669     block = entry->rb;
1670     *offset = entry->offset;
1671 
1672     if (entry->len > TARGET_PAGE_SIZE) {
1673         entry->len -= TARGET_PAGE_SIZE;
1674         entry->offset += TARGET_PAGE_SIZE;
1675     } else {
1676         memory_region_unref(block->mr);
1677         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1678         g_free(entry);
1679         migration_consume_urgent_request();
1680     }
1681 
1682     return block;
1683 }
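
/*
 * Illustrative example (not in the original source, assuming 4 KiB target
 * pages): a queued request of 16 KiB at offset 0x4000 is consumed one page
 * per call.  The first call returns offset 0x4000 and shrinks the entry to
 * 12 KiB at 0x5000; the fourth call returns 0x7000, removes the entry and
 * drops the memory region reference.
 */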
1684 
1685 #if defined(__linux__)
1686 /**
1687  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1688  *   is found, return RAM block pointer and page offset
1689  *
1690  * Returns pointer to the RAMBlock containing faulting page,
1691  *   NULL if no write faults are pending
1692  *
1693  * @rs: current RAM state
1694  * @offset: page offset from the beginning of the block
1695  */
1696 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1697 {
1698     struct uffd_msg uffd_msg;
1699     void *page_address;
1700     RAMBlock *block;
1701     int res;
1702 
1703     if (!migrate_background_snapshot()) {
1704         return NULL;
1705     }
1706 
1707     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1708     if (res <= 0) {
1709         return NULL;
1710     }
1711 
1712     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1713     block = qemu_ram_block_from_host(page_address, false, offset);
1714     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1715     return block;
1716 }
1717 
1718 /**
1719  * ram_save_release_protection: release UFFD write protection after
1720  *   a range of pages has been saved
1721  *
1722  * @rs: current RAM state
1723  * @pss: page-search-status structure
1724  * @start_page: index of the first page in the range relative to pss->block
1725  *
1726  * Returns 0 on success, negative value in case of an error
1727 */
1728  */
1729         unsigned long start_page)
1730 {
1731     int res = 0;
1732 
1733     /* Check if page is from UFFD-managed region. */
1734     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1735         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1736         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1737 
1738         /* Flush async buffers before un-protect. */
1739         qemu_fflush(pss->pss_channel);
1740         /* Un-protect memory range. */
1741         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1742                 false, false);
1743     }
1744 
1745     return res;
1746 }
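
/*
 * Worked example (illustrative, assuming 4 KiB target pages): with
 * start_page == 0x100 and pss->page == 0x300, the un-protected range starts
 * at block->host + (0x100 << 12) == host + 1 MiB and run_length is
 * (0x300 - 0x100) << 12 == 2 MiB.
 */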
1747 
1748 /* ram_write_tracking_available: check if the kernel supports required UFFD features
1749  *
1750  * Returns true if supported, false otherwise
1751  */
1752 bool ram_write_tracking_available(void)
1753 {
1754     uint64_t uffd_features;
1755     int res;
1756 
1757     res = uffd_query_features(&uffd_features);
1758     return (res == 0 &&
1759             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1760 }
1761 
1762 /* ram_write_tracking_compatible: check if guest configuration is
1763  *   compatible with 'write-tracking'
1764  *
1765  * Returns true if compatible, false otherwise
1766  */
1767 bool ram_write_tracking_compatible(void)
1768 {
1769     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1770     int uffd_fd;
1771     RAMBlock *block;
1772     bool ret = false;
1773 
1774     /* Open UFFD file descriptor */
1775     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1776     if (uffd_fd < 0) {
1777         return false;
1778     }
1779 
1780     RCU_READ_LOCK_GUARD();
1781 
1782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1783         uint64_t uffd_ioctls;
1784 
1785         /* Nothing to do for read-only and MMIO-writable regions */
1786         if (block->mr->readonly || block->mr->rom_device) {
1787             continue;
1788         }
1789         /* Try to register block memory via UFFD-IO to track writes */
1790         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1791                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1792             goto out;
1793         }
1794         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1795             goto out;
1796         }
1797     }
1798     ret = true;
1799 
1800 out:
1801     uffd_close_fd(uffd_fd);
1802     return ret;
1803 }
1804 
1805 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1806                                        ram_addr_t size)
1807 {
1808     const ram_addr_t end = offset + size;
1809 
1810     /*
1811      * We read one byte of each page; this will preallocate page tables if
1812      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1813      * where no page was populated yet. This might require adaptation when
1814      * supporting other mappings, like shmem.
1815      */
1816     for (; offset < end; offset += block->page_size) {
1817         char tmp = *((char *)block->host + offset);
1818 
1819         /* Don't optimize the read out */
1820         asm volatile("" : "+r" (tmp));
1821     }
1822 }
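
/*
 * Editorial note (not in the original source): the empty asm with "+r"(tmp)
 * makes the compiler treat tmp as used, so the one-byte read per page cannot
 * be optimized away; the read itself is what forces the kernel to populate
 * the page (or map the shared zeropage).
 */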
1823 
1824 static inline int populate_read_section(MemoryRegionSection *section,
1825                                         void *opaque)
1826 {
1827     const hwaddr size = int128_get64(section->size);
1828     hwaddr offset = section->offset_within_region;
1829     RAMBlock *block = section->mr->ram_block;
1830 
1831     populate_read_range(block, offset, size);
1832     return 0;
1833 }
1834 
1835 /*
1836  * ram_block_populate_read: preallocate page tables and populate pages in the
1837  *   RAM block by reading a byte of each page.
1838  *
1839  * Since it's solely used for userfault_fd WP feature, here we just
1840  *   hardcode page size to qemu_real_host_page_size.
1841  *
1842  * @rb: RAM block to populate
1843  */
1844 static void ram_block_populate_read(RAMBlock *rb)
1845 {
1846     /*
1847      * Skip populating all pages that fall into a discarded range as managed by
1848      * a RamDiscardManager responsible for the mapped memory region of the
1849      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1850      * must not get populated automatically. We don't have to track
1851      * modifications via userfaultfd WP reliably, because these pages will
1852      * not be part of the migration stream either way -- see
1853      * ramblock_dirty_bitmap_exclude_discarded_pages().
1854      *
1855      * Note: The result is only stable while migrating (precopy/postcopy).
1856      */
1857     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1858         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1859         MemoryRegionSection section = {
1860             .mr = rb->mr,
1861             .offset_within_region = 0,
1862             .size = rb->mr->size,
1863         };
1864 
1865         ram_discard_manager_replay_populated(rdm, &section,
1866                                              populate_read_section, NULL);
1867     } else {
1868         populate_read_range(rb, 0, rb->used_length);
1869     }
1870 }
1871 
1872 /*
1873  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1874  */
1875 void ram_write_tracking_prepare(void)
1876 {
1877     RAMBlock *block;
1878 
1879     RCU_READ_LOCK_GUARD();
1880 
1881     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1882         /* Nothing to do for read-only and MMIO-writable regions */
1883         if (block->mr->readonly || block->mr->rom_device) {
1884             continue;
1885         }
1886 
1887         /*
1888          * Populate pages of the RAM block before enabling userfault_fd
1889          * write protection.
1890          *
1891          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1892          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1893          * pages with pte_none() entries in page table.
1894          */
1895         ram_block_populate_read(block);
1896     }
1897 }
1898 
1899 static inline int uffd_protect_section(MemoryRegionSection *section,
1900                                        void *opaque)
1901 {
1902     const hwaddr size = int128_get64(section->size);
1903     const hwaddr offset = section->offset_within_region;
1904     RAMBlock *rb = section->mr->ram_block;
1905     int uffd_fd = (uintptr_t)opaque;
1906 
1907     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1908                                   false);
1909 }
1910 
1911 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1912 {
1913     assert(rb->flags & RAM_UF_WRITEPROTECT);
1914 
1915     /* See ram_block_populate_read() */
1916     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1917         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1918         MemoryRegionSection section = {
1919             .mr = rb->mr,
1920             .offset_within_region = 0,
1921             .size = rb->mr->size,
1922         };
1923 
1924         return ram_discard_manager_replay_populated(rdm, &section,
1925                                                     uffd_protect_section,
1926                                                     (void *)(uintptr_t)uffd_fd);
1927     }
1928     return uffd_change_protection(uffd_fd, rb->host,
1929                                   rb->used_length, true, false);
1930 }
1931 
1932 /*
1933  * ram_write_tracking_start: start UFFD-WP memory tracking
1934  *
1935  * Returns 0 for success or negative value in case of error
1936  */
1937 int ram_write_tracking_start(void)
1938 {
1939     int uffd_fd;
1940     RAMState *rs = ram_state;
1941     RAMBlock *block;
1942 
1943     /* Open UFFD file descriptor */
1944     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1945     if (uffd_fd < 0) {
1946         return uffd_fd;
1947     }
1948     rs->uffdio_fd = uffd_fd;
1949 
1950     RCU_READ_LOCK_GUARD();
1951 
1952     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1953         /* Nothing to do for read-only and MMIO-writable regions */
1954         if (block->mr->readonly || block->mr->rom_device) {
1955             continue;
1956         }
1957 
1958         /* Register block memory with UFFD to track writes */
1959         if (uffd_register_memory(rs->uffdio_fd, block->host,
1960                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1961             goto fail;
1962         }
1963         block->flags |= RAM_UF_WRITEPROTECT;
1964         memory_region_ref(block->mr);
1965 
1966         /* Apply UFFD write protection to the block memory range */
1967         if (ram_block_uffd_protect(block, uffd_fd)) {
1968             goto fail;
1969         }
1970 
1971         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1972                 block->host, block->max_length);
1973     }
1974 
1975     return 0;
1976 
1977 fail:
1978     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1979 
1980     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1981         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1982             continue;
1983         }
1984         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1985         /* Cleanup flags and remove reference */
1986         block->flags &= ~RAM_UF_WRITEPROTECT;
1987         memory_region_unref(block->mr);
1988     }
1989 
1990     uffd_close_fd(uffd_fd);
1991     rs->uffdio_fd = -1;
1992     return -1;
1993 }
1994 
1995 /**
1996  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1997  */
1998 void ram_write_tracking_stop(void)
1999 {
2000     RAMState *rs = ram_state;
2001     RAMBlock *block;
2002 
2003     RCU_READ_LOCK_GUARD();
2004 
2005     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2006         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2007             continue;
2008         }
2009         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2010 
2011         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2012                 block->host, block->max_length);
2013 
2014         /* Cleanup flags and remove reference */
2015         block->flags &= ~RAM_UF_WRITEPROTECT;
2016         memory_region_unref(block->mr);
2017     }
2018 
2019     /* Finally close UFFD file descriptor */
2020     uffd_close_fd(rs->uffdio_fd);
2021     rs->uffdio_fd = -1;
2022 }
2023 
2024 #else
2025 /* No target OS support, stubs just fail or ignore */
2026 
2027 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2028 {
2029     (void) rs;
2030     (void) offset;
2031 
2032     return NULL;
2033 }
2034 
2035 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2036         unsigned long start_page)
2037 {
2038     (void) rs;
2039     (void) pss;
2040     (void) start_page;
2041 
2042     return 0;
2043 }
2044 
2045 bool ram_write_tracking_available(void)
2046 {
2047     return false;
2048 }
2049 
2050 bool ram_write_tracking_compatible(void)
2051 {
2052     assert(0);
2053     return false;
2054 }
2055 
2056 int ram_write_tracking_start(void)
2057 {
2058     assert(0);
2059     return -1;
2060 }
2061 
2062 void ram_write_tracking_stop(void)
2063 {
2064     assert(0);
2065 }
2066 #endif /* defined(__linux__) */
2067 
2068 /**
2069  * get_queued_page: unqueue a page from the postcopy requests
2070  *
2071  * Skips pages that are already sent (!dirty)
2072  *
2073  * Returns true if a queued page is found
2074  *
2075  * @rs: current RAM state
2076  * @pss: data about the state of the current dirty page scan
2077  */
2078 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2079 {
2080     RAMBlock  *block;
2081     ram_addr_t offset;
2082     bool dirty;
2083 
2084     do {
2085         block = unqueue_page(rs, &offset);
2086         /*
2087          * We're sending this page, and since it's postcopy nothing else
2088          * will dirty it, and we must make sure it doesn't get sent again
2089          * even if this queue request was received after the background
2090          * search already sent it.
2091          */
2092         if (block) {
2093             unsigned long page;
2094 
2095             page = offset >> TARGET_PAGE_BITS;
2096             dirty = test_bit(page, block->bmap);
2097             if (!dirty) {
2098                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2099                                                 page);
2100             } else {
2101                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2102             }
2103         }
2104 
2105     } while (block && !dirty);
2106 
2107     if (!block) {
2108         /*
2109          * Poll write faults too if background snapshot is enabled; that's
2110          * when vCPUs may be blocked by write-protected pages.
2111          */
2112         block = poll_fault_page(rs, &offset);
2113     }
2114 
2115     if (block) {
2116         /*
2117          * We want the background search to continue from the queued page
2118          * since the guest is likely to want other pages near to the page
2119          * it just requested.
2120          */
2121         pss->block = block;
2122         pss->page = offset >> TARGET_PAGE_BITS;
2123 
2124         /*
2125          * This unqueued page would break the "one round" check, even if it
2126          * is really rare.
2127          */
2128         pss->complete_round = false;
2129     }
2130 
2131     return !!block;
2132 }
2133 
2134 /**
2135  * migration_page_queue_free: drop any remaining pages in the ram
2136  * request queue
2137  *
2138  * It should be empty at the end anyway, but in error cases there may
2139  * be some left.  In case any pages are left, we drop them.
2140  *
2141  */
2142 static void migration_page_queue_free(RAMState *rs)
2143 {
2144     struct RAMSrcPageRequest *mspr, *next_mspr;
2145     /* This queue generally should be empty - but in the case of a failed
2146      * migration might have some droppings in.
2147      */
2148     RCU_READ_LOCK_GUARD();
2149     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2150         memory_region_unref(mspr->rb->mr);
2151         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2152         g_free(mspr);
2153     }
2154 }
2155 
2156 /**
2157  * ram_save_queue_pages: queue the page for transmission
2158  *
2159  * A request from the postcopy destination, for example.
2160  *
2161  * Returns zero on success or negative on error
2162  *
2163  * @rbname: Name of the RAMBlock of the request. NULL means the
2164  *          same as the last one.
2165  * @start: starting address from the start of the RAMBlock
2166  * @len: length (in bytes) to send
2167  */
2168 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2169 {
2170     RAMBlock *ramblock;
2171     RAMState *rs = ram_state;
2172 
2173     stat64_add(&ram_counters.postcopy_requests, 1);
2174     RCU_READ_LOCK_GUARD();
2175 
2176     if (!rbname) {
2177         /* Reuse last RAMBlock */
2178         ramblock = rs->last_req_rb;
2179 
2180         if (!ramblock) {
2181             /*
2182              * Shouldn't happen, we can't reuse the last RAMBlock if
2183              * it's the 1st request.
2184              */
2185             error_report("ram_save_queue_pages no previous block");
2186             return -1;
2187         }
2188     } else {
2189         ramblock = qemu_ram_block_by_name(rbname);
2190 
2191         if (!ramblock) {
2192             /* We shouldn't be asked for a non-existent RAMBlock */
2193             error_report("ram_save_queue_pages no block '%s'", rbname);
2194             return -1;
2195         }
2196         rs->last_req_rb = ramblock;
2197     }
2198     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2199     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2200         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2201                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2202                      __func__, start, len, ramblock->used_length);
2203         return -1;
2204     }
2205 
2206     /*
2207      * When with postcopy preempt, we send back the page directly in the
2208      * rp-return thread.
2209      */
2210     if (postcopy_preempt_active()) {
2211         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2212         size_t page_size = qemu_ram_pagesize(ramblock);
2213         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2214         int ret = 0;
2215 
2216         qemu_mutex_lock(&rs->bitmap_mutex);
2217 
2218         pss_init(pss, ramblock, page_start);
2219         /*
2220          * Always use the preempt channel, and make sure it's there.  It's
2221          * safe to access without a lock, because when the rp-thread is
2222          * running we should be the only one who operates on the qemufile.
2223          */
2224         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2225         assert(pss->pss_channel);
2226 
2227         /*
2228          * It must be one host page or a multiple of the host page size.  Just
2229          * assert; if something is wrong we're mostly split-brain anyway.
2230          */
2231         assert(len % page_size == 0);
2232         while (len) {
2233             if (ram_save_host_page_urgent(pss)) {
2234                 error_report("%s: ram_save_host_page_urgent() failed: "
2235                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2236                              __func__, ramblock->idstr, start);
2237                 ret = -1;
2238                 break;
2239             }
2240             /*
2241              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2242              * will automatically be moved and point to the next host page
2243              * we're going to send, so no need to update here.
2244              *
2245              * Normally QEMU never sends >1 host page in a request, so
2246              * logically the loop should only run once, but keep it for
2247              * consistency.
2248              */
2249             len -= page_size;
2250         }
2251         qemu_mutex_unlock(&rs->bitmap_mutex);
2252 
2253         return ret;
2254     }
2255 
2256     struct RAMSrcPageRequest *new_entry =
2257         g_new0(struct RAMSrcPageRequest, 1);
2258     new_entry->rb = ramblock;
2259     new_entry->offset = start;
2260     new_entry->len = len;
2261 
2262     memory_region_ref(ramblock->mr);
2263     qemu_mutex_lock(&rs->src_page_req_mutex);
2264     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2265     migration_make_urgent_request();
2266     qemu_mutex_unlock(&rs->src_page_req_mutex);
2267 
2268     return 0;
2269 }
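
/*
 * Illustrative usage (hypothetical values, not from the original source): a
 * postcopy page request for guest offset 0x200000 in a block named "pc.ram"
 * would arrive here as ram_save_queue_pages("pc.ram", 0x200000, 4096),
 * queueing a single 4 KiB target page for urgent transmission, or sending it
 * directly from the rp-return thread when postcopy preempt is active.
 */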
2270 
2271 static bool save_page_use_compression(RAMState *rs)
2272 {
2273     if (!migrate_compress()) {
2274         return false;
2275     }
2276 
2277     /*
2278      * If xbzrle is enabled (e.g., after first round of migration), stop
2279      * using the data compression. In theory, xbzrle can do better than
2280      * compression.
2281      */
2282     if (rs->xbzrle_enabled) {
2283         return false;
2284     }
2285 
2286     return true;
2287 }
2288 
2289 /*
2290  * Try to compress the page before posting it out; return true if the page
2291  * has been properly handled by compression, otherwise it needs other
2292  * paths to handle it.
2293  */
2294 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2295                                RAMBlock *block, ram_addr_t offset)
2296 {
2297     if (!save_page_use_compression(rs)) {
2298         return false;
2299     }
2300 
2301     /*
2302      * When starting to process a new block, the first page of
2303      * the block should be sent out before other pages in the same
2304      * block, and all the pages in the last block should have been sent
2305      * out.  Keeping this order is important, because the 'cont' flag
2306      * is used to avoid resending the block name.
2307      *
2308      * We post the first page as a normal page as compression will take
2309      * much CPU resource.
2310      */
2311     if (block != pss->last_sent_block) {
2312         flush_compressed_data(rs);
2313         return false;
2314     }
2315 
2316     if (compress_page_with_multi_thread(block, offset) > 0) {
2317         return true;
2318     }
2319 
2320     compression_counters.busy++;
2321     return false;
2322 }
2323 
2324 /**
2325  * ram_save_target_page_legacy: save one target page
2326  *
2327  * Returns the number of pages written
2328  *
2329  * @rs: current RAM state
2330  * @pss: data about the page we want to send
2331  */
2332 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2333 {
2334     RAMBlock *block = pss->block;
2335     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2336     int res;
2337 
2338     if (control_save_page(pss, block, offset, &res)) {
2339         return res;
2340     }
2341 
2342     if (save_compress_page(rs, pss, block, offset)) {
2343         return 1;
2344     }
2345 
2346     res = save_zero_page(pss, pss->pss_channel, block, offset);
2347     if (res > 0) {
2348         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2349          * page would be stale
2350          */
2351         if (rs->xbzrle_enabled) {
2352             XBZRLE_cache_lock();
2353             xbzrle_cache_zero_page(rs, block->offset + offset);
2354             XBZRLE_cache_unlock();
2355         }
2356         return res;
2357     }
2358 
2359     /*
2360      * Do not use multifd in postcopy as one whole host page should be
2361      * placed.  Meanwhile postcopy requires atomic updates of pages, so even
2362      * if host page size == guest page size the destination guest may still
2363      * see partially copied pages at runtime, which is data corruption.
2364      */
2365     if (migrate_use_multifd() && !migration_in_postcopy()) {
2366         return ram_save_multifd_page(pss->pss_channel, block, offset);
2367     }
2368 
2369     return ram_save_page(rs, pss);
2370 }
2371 
2372 /* Should be called before sending a host page */
2373 static void pss_host_page_prepare(PageSearchStatus *pss)
2374 {
2375     /* How many guest pages are there in one host page? */
2376     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2377 
2378     pss->host_page_sending = true;
2379     if (guest_pfns <= 1) {
2380         /*
2381          * This covers both the case of guest psize == host psize and the
2382          * case of the guest having a larger psize than the host (guest_pfns==0).
2383          *
2384          * For the latter, we always send one whole guest page per
2385          * iteration of the host page (example: an Alpha VM on x86 host
2386          * will have guest psize 8K while host psize 4K).
2387          */
2388         pss->host_page_start = pss->page;
2389         pss->host_page_end = pss->page + 1;
2390     } else {
2391         /*
2392          * The host page spans over multiple guest pages, we send them
2393          * within the same host page iteration.
2394          */
2395         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2396         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2397     }
2398 }
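
/*
 * Worked example (illustrative, assuming 4 KiB target pages and a 2 MiB
 * hugepage-backed block): guest_pfns == 512, so for pss->page == 1000 the
 * prepared range is host_page_start == ROUND_DOWN(1000, 512) == 512 and
 * host_page_end == ROUND_UP(1001, 512) == 1024, i.e. the whole host page
 * containing page 1000.
 */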
2399 
2400 /*
2401  * Whether the page pointed to by PSS is within the host page being sent.
2402  * Must be called after a previous pss_host_page_prepare().
2403  */
2404 static bool pss_within_range(PageSearchStatus *pss)
2405 {
2406     ram_addr_t ram_addr;
2407 
2408     assert(pss->host_page_sending);
2409 
2410     /* Over host-page boundary? */
2411     if (pss->page >= pss->host_page_end) {
2412         return false;
2413     }
2414 
2415     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2416 
2417     return offset_in_ramblock(pss->block, ram_addr);
2418 }
2419 
2420 static void pss_host_page_finish(PageSearchStatus *pss)
2421 {
2422     pss->host_page_sending = false;
2423     /* This is not needed, but just to reset it */
2424     pss->host_page_start = pss->host_page_end = 0;
2425 }
2426 
2427 /*
2428  * Send an urgent host page specified by `pss'.  Needs to be called with
2429  * bitmap_mutex held.
2430  *
2431  * Returns 0 if saving the host page succeeded, negative otherwise.
2432  */
2433 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2434 {
2435     bool page_dirty, sent = false;
2436     RAMState *rs = ram_state;
2437     int ret = 0;
2438 
2439     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2440     pss_host_page_prepare(pss);
2441 
2442     /*
2443      * If precopy is sending the same page, let it be done in precopy, or
2444      * we could send the same page in two channels and none of them will
2445      * receive the whole page.
2446      */
2447     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2448         trace_postcopy_preempt_hit(pss->block->idstr,
2449                                    pss->page << TARGET_PAGE_BITS);
2450         return 0;
2451     }
2452 
2453     do {
2454         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2455 
2456         if (page_dirty) {
2457             /* Be strict about the return code; it must be 1 (one page sent) */
2458             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2459                 error_report_once("%s: ram_save_target_page failed", __func__);
2460                 ret = -1;
2461                 goto out;
2462             }
2463             sent = true;
2464         }
2465         pss_find_next_dirty(pss);
2466     } while (pss_within_range(pss));
2467 out:
2468     pss_host_page_finish(pss);
2469     /* For urgent requests, flush immediately if sent */
2470     if (sent) {
2471         qemu_fflush(pss->pss_channel);
2472     }
2473     return ret;
2474 }
2475 
2476 /**
2477  * ram_save_host_page: save a whole host page
2478  *
2479  * Starting at pss->page, send pages up to the end of the current host
2480  * page. It's valid for the starting page to point into the middle of
2481  * a host page, in which case the remainder of the host page is sent.
2482  * Only dirty target pages are sent. Note that the host page size may
2483  * be a huge page for this block.
2484  *
2485  * The saving stops at the boundary of the used_length of the block
2486  * if the RAMBlock isn't a multiple of the host page size.
2487  *
2488  * The caller must hold ram_state.bitmap_mutex to call this
2489  * function.  Note that this function can temporarily release the lock, but
2490  * it will make sure the lock is held again before it returns.
2491  *
2492  * Returns the number of pages written or negative on error
2493  *
2494  * @rs: current RAM state
2495  * @pss: data about the page we want to send
2496  */
2497 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2498 {
2499     bool page_dirty, preempt_active = postcopy_preempt_active();
2500     int tmppages, pages = 0;
2501     size_t pagesize_bits =
2502         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2503     unsigned long start_page = pss->page;
2504     int res;
2505 
2506     if (ramblock_is_ignored(pss->block)) {
2507         error_report("block %s should not be migrated!", pss->block->idstr);
2508         return 0;
2509     }
2510 
2511     /* Update host page boundary information */
2512     pss_host_page_prepare(pss);
2513 
2514     do {
2515         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2516 
2517         /* Check if the page is dirty and if so, send it */
2518         if (page_dirty) {
2519             /*
2520              * Properly yield the lock only in postcopy preempt mode
2521              * because both migration thread and rp-return thread can
2522              * operate on the bitmaps.
2523              */
2524             if (preempt_active) {
2525                 qemu_mutex_unlock(&rs->bitmap_mutex);
2526             }
2527             tmppages = migration_ops->ram_save_target_page(rs, pss);
2528             if (tmppages >= 0) {
2529                 pages += tmppages;
2530                 /*
2531                  * Allow rate limiting to happen in the middle of huge pages if
2532                  * something is sent in the current iteration.
2533                  */
2534                 if (pagesize_bits > 1 && tmppages > 0) {
2535                     migration_rate_limit();
2536                 }
2537             }
2538             if (preempt_active) {
2539                 qemu_mutex_lock(&rs->bitmap_mutex);
2540             }
2541         } else {
2542             tmppages = 0;
2543         }
2544 
2545         if (tmppages < 0) {
2546             pss_host_page_finish(pss);
2547             return tmppages;
2548         }
2549 
2550         pss_find_next_dirty(pss);
2551     } while (pss_within_range(pss));
2552 
2553     pss_host_page_finish(pss);
2554 
2555     res = ram_save_release_protection(rs, pss, start_page);
2556     return (res < 0 ? res : pages);
2557 }
2558 
2559 /**
2560  * ram_find_and_save_block: finds a dirty page and sends it to f
2561  *
2562  * Called within an RCU critical section.
2563  *
2564  * Returns the number of pages written where zero means no dirty pages,
2565  * or negative on error
2566  *
2567  * @rs: current RAM state
2568  *
2569  * On systems where host-page-size > target-page-size it will send all the
2570  * pages in a host page that are dirty.
2571  */
2572 static int ram_find_and_save_block(RAMState *rs)
2573 {
2574     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2575     int pages = 0;
2576 
2577     /* No dirty page as there is zero RAM */
2578     if (!rs->ram_bytes_total) {
2579         return pages;
2580     }
2581 
2582     /*
2583      * Always keep last_seen_block/last_page valid during this procedure,
2584      * because find_dirty_block() relies on these values (e.g., we compare
2585      * last_seen_block with pss.block to see whether we searched all the
2586      * ramblocks) to detect the completion of migration.  Having NULL value
2587      * ramblocks) to detect the completion of migration.  Having a NULL
2588      * last_seen_block can conditionally cause the loop below to run forever.
2589     if (!rs->last_seen_block) {
2590         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2591         rs->last_page = 0;
2592     }
2593 
2594     pss_init(pss, rs->last_seen_block, rs->last_page);
2595 
2596     while (true) {
2597         if (!get_queued_page(rs, pss)) {
2598             /* priority queue empty, so just search for something dirty */
2599             int res = find_dirty_block(rs, pss);
2600             if (res != PAGE_DIRTY_FOUND) {
2601                 if (res == PAGE_ALL_CLEAN) {
2602                     break;
2603                 } else if (res == PAGE_TRY_AGAIN) {
2604                     continue;
2605                 }
2606             }
2607         }
2608         pages = ram_save_host_page(rs, pss);
2609         if (pages) {
2610             break;
2611         }
2612     }
2613 
2614     rs->last_seen_block = pss->block;
2615     rs->last_page = pss->page;
2616 
2617     return pages;
2618 }
2619 
2620 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2621 {
2622     uint64_t pages = size / TARGET_PAGE_SIZE;
2623 
2624     if (zero) {
2625         stat64_add(&ram_counters.zero_pages, pages);
2626     } else {
2627         stat64_add(&ram_counters.normal_pages, pages);
2628         ram_transferred_add(size);
2629         qemu_file_credit_transfer(f, size);
2630     }
2631 }
2632 
2633 static uint64_t ram_bytes_total_with_ignored(void)
2634 {
2635     RAMBlock *block;
2636     uint64_t total = 0;
2637 
2638     RCU_READ_LOCK_GUARD();
2639 
2640     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2641         total += block->used_length;
2642     }
2643     return total;
2644 }
2645 
2646 uint64_t ram_bytes_total(void)
2647 {
2648     RAMBlock *block;
2649     uint64_t total = 0;
2650 
2651     RCU_READ_LOCK_GUARD();
2652 
2653     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2654         total += block->used_length;
2655     }
2656     return total;
2657 }
2658 
2659 static void xbzrle_load_setup(void)
2660 {
2661     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2662 }
2663 
2664 static void xbzrle_load_cleanup(void)
2665 {
2666     g_free(XBZRLE.decoded_buf);
2667     XBZRLE.decoded_buf = NULL;
2668 }
2669 
2670 static void ram_state_cleanup(RAMState **rsp)
2671 {
2672     if (*rsp) {
2673         migration_page_queue_free(*rsp);
2674         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2675         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2676         g_free(*rsp);
2677         *rsp = NULL;
2678     }
2679 }
2680 
2681 static void xbzrle_cleanup(void)
2682 {
2683     XBZRLE_cache_lock();
2684     if (XBZRLE.cache) {
2685         cache_fini(XBZRLE.cache);
2686         g_free(XBZRLE.encoded_buf);
2687         g_free(XBZRLE.current_buf);
2688         g_free(XBZRLE.zero_target_page);
2689         XBZRLE.cache = NULL;
2690         XBZRLE.encoded_buf = NULL;
2691         XBZRLE.current_buf = NULL;
2692         XBZRLE.zero_target_page = NULL;
2693     }
2694     XBZRLE_cache_unlock();
2695 }
2696 
2697 static void ram_save_cleanup(void *opaque)
2698 {
2699     RAMState **rsp = opaque;
2700     RAMBlock *block;
2701 
2702     /* We don't use dirty log with background snapshots */
2703     if (!migrate_background_snapshot()) {
2704         /* The caller holds the iothread lock or is in a bh, so there is
2705          * no writing race against the migration bitmap.
2706          */
2707         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2708             /*
2709              * Do not stop dirty log without starting it, since
2710              * memory_global_dirty_log_stop will assert that
2711              * memory_global_dirty_log_start/stop are used in pairs.
2712              */
2713             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2714         }
2715     }
2716 
2717     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2718         g_free(block->clear_bmap);
2719         block->clear_bmap = NULL;
2720         g_free(block->bmap);
2721         block->bmap = NULL;
2722     }
2723 
2724     xbzrle_cleanup();
2725     compress_threads_save_cleanup();
2726     ram_state_cleanup(rsp);
2727     g_free(migration_ops);
2728     migration_ops = NULL;
2729 }
2730 
2731 static void ram_state_reset(RAMState *rs)
2732 {
2733     int i;
2734 
2735     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2736         rs->pss[i].last_sent_block = NULL;
2737     }
2738 
2739     rs->last_seen_block = NULL;
2740     rs->last_page = 0;
2741     rs->last_version = ram_list.version;
2742     rs->xbzrle_enabled = false;
2743 }
2744 
2745 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2746 
2747 /* **** functions for postcopy ***** */
2748 
2749 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2750 {
2751     struct RAMBlock *block;
2752 
2753     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2754         unsigned long *bitmap = block->bmap;
2755         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2756         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2757 
2758         while (run_start < range) {
2759             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2760             ram_discard_range(block->idstr,
2761                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2762                               ((ram_addr_t)(run_end - run_start))
2763                                 << TARGET_PAGE_BITS);
2764             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2765         }
2766     }
2767 }
2768 
2769 /**
2770  * postcopy_send_discard_bm_ram: discard a RAMBlock
2771  *
2772  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2773  *
2774  * @ms: current migration state
2775  * @block: RAMBlock to discard
2776  */
2777 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2778 {
2779     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2780     unsigned long current;
2781     unsigned long *bitmap = block->bmap;
2782 
2783     for (current = 0; current < end; ) {
2784         unsigned long one = find_next_bit(bitmap, end, current);
2785         unsigned long zero, discard_length;
2786 
2787         if (one >= end) {
2788             break;
2789         }
2790 
2791         zero = find_next_zero_bit(bitmap, end, one + 1);
2792 
2793         if (zero >= end) {
2794             discard_length = end - one;
2795         } else {
2796             discard_length = zero - one;
2797         }
2798         postcopy_discard_send_range(ms, one, discard_length);
2799         current = one + discard_length;
2800     }
2801 }
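
/*
 * Illustrative example (not in the original source): for a block of 8 target
 * pages whose dirty bits are at indexes 2..5, the loop above finds one == 2
 * and zero == 6, sends a single discard of length 4 starting at page 2, then
 * continues the scan from page 6 and terminates.
 */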
2802 
2803 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2804 
2805 /**
2806  * postcopy_each_ram_send_discard: discard all RAMBlocks
2807  *
2808  * Utility for the outgoing postcopy code.
2809  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2810  *   passing it bitmap indexes and name.
2811  * (qemu_ram_foreach_block ends up passing unscaled lengths
2812  *  which would mean postcopy code would have to deal with target page)
2813  *
2814  * @ms: current migration state
2815  */
2816 static void postcopy_each_ram_send_discard(MigrationState *ms)
2817 {
2818     struct RAMBlock *block;
2819 
2820     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2821         postcopy_discard_send_init(ms, block->idstr);
2822 
2823         /*
2824          * Deal with TPS != HPS and huge pages.  It discards any partially sent
2825          * host-page size chunks and marks any partially dirty host-page size
2826          * chunks as all dirty.  In this case the host-page is the host-page
2827          * for the particular RAMBlock, i.e. it might be a huge page.
2828          */
2829         postcopy_chunk_hostpages_pass(ms, block);
2830 
2831         /*
2832          * Postcopy sends chunks of bitmap over the wire, but it
2833          * just needs indexes at this point, which avoids it having
2834          * target-page-specific code.
2835          */
2836         postcopy_send_discard_bm_ram(ms, block);
2837         postcopy_discard_send_finish(ms);
2838     }
2839 }
2840 
2841 /**
2842  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2843  *
2844  * Helper for postcopy_each_ram_send_discard; called once per
2845  * RAMBlock to canonicalize its dirty bitmap.
2847  *
2848  * Postcopy requires that all target pages in a hostpage are dirty or
2849  * clean, not a mix.  This function canonicalizes the bitmaps.
2850  *
2851  * @ms: current migration state
2852  * @block: block that contains the page we want to canonicalize
2853  */
2854 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2855 {
2856     RAMState *rs = ram_state;
2857     unsigned long *bitmap = block->bmap;
2858     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2859     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2860     unsigned long run_start;
2861 
2862     if (block->page_size == TARGET_PAGE_SIZE) {
2863         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2864         return;
2865     }
2866 
2867     /* Find a dirty page */
2868     run_start = find_next_bit(bitmap, pages, 0);
2869 
2870     while (run_start < pages) {
2871 
2872         /*
2873          * If the start of this run of pages is in the middle of a host
2874          * page, then we need to fixup this host page.
2875          */
2876         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2877             /* Find the end of this run */
2878             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2879             /*
2880              * If the end isn't at the start of a host page, then the
2881              * run doesn't finish at the end of a host page
2882              * and we need to discard.
2883              */
2884         }
2885 
2886         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2887             unsigned long page;
2888             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2889                                                              host_ratio);
2890             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2891 
2892             /* Clean up the bitmap */
2893             for (page = fixup_start_addr;
2894                  page < fixup_start_addr + host_ratio; page++) {
2895                 /*
2896                  * Remark them as dirty, updating the count for any pages
2897                  * that weren't previously dirty.
2898                  */
2899                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2900             }
2901         }
2902 
2903         /* Find the next dirty page for the next iteration */
2904         run_start = find_next_bit(bitmap, pages, run_start);
2905     }
2906 }
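
/*
 * Worked example (illustrative): with a 16 KiB host page and 4 KiB target
 * pages, host_ratio == 4.  A dirty run covering only target pages 5..6 is
 * not host-page aligned, so fixup_start_addr == 4 and target pages 4..7 are
 * all re-marked dirty, keeping that host page uniformly dirty for postcopy.
 */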
2907 
2908 /**
2909  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2910  *
2911  * Transmit the set of pages to be discarded after precopy to the target;
2912  * these are pages that:
2913  *     a) Have been previously transmitted but are now dirty again
2914  *     b) Have never been transmitted; this ensures that
2915  *        any pages on the destination that have been mapped by background
2916  *        tasks get discarded (transparent huge pages is the specific concern)
2917  * Hopefully this is pretty sparse.
2918  *
2919  * @ms: current migration state
2920  */
2921 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2922 {
2923     RAMState *rs = ram_state;
2924 
2925     RCU_READ_LOCK_GUARD();
2926 
2927     /* This should be our last sync, the src is now paused */
2928     migration_bitmap_sync(rs);
2929 
2930     /* Easiest way to make sure we don't resume in the middle of a host-page */
2931     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2932     rs->last_seen_block = NULL;
2933     rs->last_page = 0;
2934 
2935     postcopy_each_ram_send_discard(ms);
2936 
2937     trace_ram_postcopy_send_discard_bitmap();
2938 }
2939 
2940 /**
2941  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2942  *
2943  * Returns zero on success
2944  *
2945  * @rbname: name of the RAMBlock of the request. NULL means the
2946  *          same as the last one.
2947  * @start: byte offset within the RAMBlock
2948  * @length: length in bytes to discard
2949  */
2950 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2951 {
2952     trace_ram_discard_range(rbname, start, length);
2953 
2954     RCU_READ_LOCK_GUARD();
2955     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2956 
2957     if (!rb) {
2958         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2959         return -1;
2960     }
2961 
2962     /*
2963      * On source VM, we don't need to update the received bitmap since
2964      * we don't even have one.
2965      */
2966     if (rb->receivedmap) {
2967         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2968                      length >> qemu_target_page_bits());
2969     }
2970 
2971     return ram_block_discard_range(rb, start, length);
2972 }
2973 
2974 /*
2975  * For every allocation, we will try not to crash the VM if the
2976  * allocation fails.
2977  */
2978 static int xbzrle_init(void)
2979 {
2980     Error *local_err = NULL;
2981 
2982     if (!migrate_use_xbzrle()) {
2983         return 0;
2984     }
2985 
2986     XBZRLE_cache_lock();
2987 
2988     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2989     if (!XBZRLE.zero_target_page) {
2990         error_report("%s: Error allocating zero page", __func__);
2991         goto err_out;
2992     }
2993 
2994     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2995                               TARGET_PAGE_SIZE, &local_err);
2996     if (!XBZRLE.cache) {
2997         error_report_err(local_err);
2998         goto free_zero_page;
2999     }
3000 
3001     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3002     if (!XBZRLE.encoded_buf) {
3003         error_report("%s: Error allocating encoded_buf", __func__);
3004         goto free_cache;
3005     }
3006 
3007     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3008     if (!XBZRLE.current_buf) {
3009         error_report("%s: Error allocating current_buf", __func__);
3010         goto free_encoded_buf;
3011     }
3012 
3013     /* We are all good */
3014     XBZRLE_cache_unlock();
3015     return 0;
3016 
3017 free_encoded_buf:
3018     g_free(XBZRLE.encoded_buf);
3019     XBZRLE.encoded_buf = NULL;
3020 free_cache:
3021     cache_fini(XBZRLE.cache);
3022     XBZRLE.cache = NULL;
3023 free_zero_page:
3024     g_free(XBZRLE.zero_target_page);
3025     XBZRLE.zero_target_page = NULL;
3026 err_out:
3027     XBZRLE_cache_unlock();
3028     return -ENOMEM;
3029 }
3030 
3031 static int ram_state_init(RAMState **rsp)
3032 {
3033     *rsp = g_try_new0(RAMState, 1);
3034 
3035     if (!*rsp) {
3036         error_report("%s: Init ramstate fail", __func__);
3037         return -1;
3038     }
3039 
3040     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3041     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3042     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3043     (*rsp)->ram_bytes_total = ram_bytes_total();
3044 
3045     /*
3046      * Count the total number of pages used by ram blocks not including any
3047      * gaps due to alignment or unplugs.
3048      * This must match the initial values of the dirty bitmap.
3049      */
3050     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3051     ram_state_reset(*rsp);
3052 
3053     return 0;
3054 }
3055 
3056 static void ram_list_init_bitmaps(void)
3057 {
3058     MigrationState *ms = migrate_get_current();
3059     RAMBlock *block;
3060     unsigned long pages;
3061     uint8_t shift;
3062 
3063     /* Skip setting bitmap if there is no RAM */
3064     if (ram_bytes_total()) {
3065         shift = ms->clear_bitmap_shift;
3066         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3067             error_report("clear_bitmap_shift (%u) too big, using "
3068                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3069             shift = CLEAR_BITMAP_SHIFT_MAX;
3070         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3071             error_report("clear_bitmap_shift (%u) too small, using "
3072                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3073             shift = CLEAR_BITMAP_SHIFT_MIN;
3074         }
3075 
3076         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3077             pages = block->max_length >> TARGET_PAGE_BITS;
3078             /*
3079              * The initial dirty bitmap for migration must be set with all
3080              * ones to make sure we'll migrate every guest RAM page to
3081              * ones to make sure we'll migrate every guest RAM page to
3082              * the destination.
3083              * Here we set RAMBlock.bmap all to 1 because when restarting a
3084              * new migration after a failed one, ram_list.
3085              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
3086              * guest memory.
3086              */
3087             block->bmap = bitmap_new(pages);
3088             bitmap_set(block->bmap, 0, pages);
3089             block->clear_bmap_shift = shift;
3090             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3091         }
3092     }
3093 }
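
/*
 * Illustrative note (not in the original source): clear_bmap tracks deferred
 * dirty-log clearing at a granularity of (1 << clear_bmap_shift) target pages
 * per bit.  For example, with shift == 18 and 4 KiB target pages, one
 * clear_bmap bit covers 2^18 * 4 KiB == 1 GiB of guest RAM, so
 * clear_bmap_size(pages, 18) needs one bit per 1 GiB chunk of the block.
 */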
3094 
3095 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3096 {
3097     unsigned long pages;
3098     RAMBlock *rb;
3099 
3100     RCU_READ_LOCK_GUARD();
3101 
3102     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3103         pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3104         rs->migration_dirty_pages -= pages;
3105     }
3106 }
3107 
3108 static void ram_init_bitmaps(RAMState *rs)
3109 {
3110     /* For memory_global_dirty_log_start below.  */
3111     qemu_mutex_lock_iothread();
3112     qemu_mutex_lock_ramlist();
3113 
3114     WITH_RCU_READ_LOCK_GUARD() {
3115         ram_list_init_bitmaps();
3116         /* We don't use dirty log with background snapshots */
3117         if (!migrate_background_snapshot()) {
3118             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3119             migration_bitmap_sync_precopy(rs);
3120         }
3121     }
3122     qemu_mutex_unlock_ramlist();
3123     qemu_mutex_unlock_iothread();
3124 
3125     /*
3126      * After an eventual first bitmap sync, fixup the initial bitmap
3127      * containing all 1s to exclude any discarded pages from migration.
3128      */
3129     migration_bitmap_clear_discarded_pages(rs);
3130 }
3131 
3132 static int ram_init_all(RAMState **rsp)
3133 {
3134     if (ram_state_init(rsp)) {
3135         return -1;
3136     }
3137 
3138     if (xbzrle_init()) {
3139         ram_state_cleanup(rsp);
3140         return -1;
3141     }
3142 
3143     ram_init_bitmaps(*rsp);
3144 
3145     return 0;
3146 }
3147 
3148 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3149 {
3150     RAMBlock *block;
3151     uint64_t pages = 0;
3152 
3153     /*
3154      * Postcopy is not using xbzrle/compression, so no need for that.
3155      * Also, since the source is already halted, we don't need to care
3156      * about dirty page logging either.
3157      */
3158 
3159     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3160         pages += bitmap_count_one(block->bmap,
3161                                   block->used_length >> TARGET_PAGE_BITS);
3162     }
3163 
3164     /* This may not be aligned with current bitmaps. Recalculate. */
3165     rs->migration_dirty_pages = pages;
3166 
3167     ram_state_reset(rs);
3168 
3169     /* Update RAMState cache of output QEMUFile */
3170     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3171 
3172     trace_ram_state_resume_prepare(pages);
3173 }
3174 
3175 /*
3176  * This function clears bits of the free pages reported by the caller from the
3177  * migration dirty bitmap. @addr is the host address corresponding to the
3178  * start of the contiguous guest free pages, and @len is the total bytes of
3179  * those pages.
3180  */
3181 void qemu_guest_free_page_hint(void *addr, size_t len)
3182 {
3183     RAMBlock *block;
3184     ram_addr_t offset;
3185     size_t used_len, start, npages;
3186     MigrationState *s = migrate_get_current();
3187 
3188     /* This function is currently expected to be used during live migration */
3189     if (!migration_is_setup_or_active(s->state)) {
3190         return;
3191     }
3192 
3193     for (; len > 0; len -= used_len, addr += used_len) {
3194         block = qemu_ram_block_from_host(addr, false, &offset);
3195         if (unlikely(!block || offset >= block->used_length)) {
3196             /*
3197              * The implementation might not support RAMBlock resize during
3198              * live migration, but it could happen in theory with future
3199              * updates. So we add a check here to capture that case.
3200              */
3201             error_report_once("%s unexpected error", __func__);
3202             return;
3203         }
3204 
3205         if (len <= block->used_length - offset) {
3206             used_len = len;
3207         } else {
3208             used_len = block->used_length - offset;
3209         }
3210 
3211         start = offset >> TARGET_PAGE_BITS;
3212         npages = used_len >> TARGET_PAGE_BITS;
3213 
3214         qemu_mutex_lock(&ram_state->bitmap_mutex);
3215         /*
3216          * The skipped free pages are equivalent to having been sent from clear_bmap's
3217          * perspective, so clear the bits from the memory region bitmap which
3218          * are initially set. Otherwise those skipped pages will be sent in
3219          * the next round after syncing from the memory region bitmap.
3220          */
3221         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3222         ram_state->migration_dirty_pages -=
3223                       bitmap_count_one_with_offset(block->bmap, start, npages);
3224         bitmap_clear(block->bmap, start, npages);
3225         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3226     }
3227 }
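/*
 * For illustration only: a free-page-hinting device (e.g. virtio-balloon)
 * is expected to call the helper above once per reported range, roughly
 *
 *     qemu_guest_free_page_hint(hint_addr, hint_len);
 *
 * where "hint_addr"/"hint_len" stand for the host address and byte length
 * of the guest-reported free range (placeholder names, not actual code).
 */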
3228 
3229 /*
3230  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3231  * long-running RCU critical section.  When RCU reclaims in the code
3232  * start to become numerous, it will be necessary to reduce the
3233  * granularity of these critical sections.
3234  */
3235 
3236 /**
3237  * ram_save_setup: Setup RAM for migration
3238  *
3239  * Returns zero to indicate success and negative for error
3240  *
3241  * @f: QEMUFile where to send the data
3242  * @opaque: RAMState pointer
3243  */
3244 static int ram_save_setup(QEMUFile *f, void *opaque)
3245 {
3246     RAMState **rsp = opaque;
3247     RAMBlock *block;
3248     int ret;
3249 
3250     if (compress_threads_save_setup()) {
3251         return -1;
3252     }
3253 
3254     /* migration has already set up the bitmap, reuse it. */
3255     if (!migration_in_colo_state()) {
3256         if (ram_init_all(rsp) != 0) {
3257             compress_threads_save_cleanup();
3258             return -1;
3259         }
3260     }
3261     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3262 
3263     WITH_RCU_READ_LOCK_GUARD() {
3264         qemu_put_be64(f, ram_bytes_total_with_ignored()
3265                          | RAM_SAVE_FLAG_MEM_SIZE);
3266 
3267         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3268             qemu_put_byte(f, strlen(block->idstr));
3269             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3270             qemu_put_be64(f, block->used_length);
3271             if (migrate_postcopy_ram() && block->page_size !=
3272                                           qemu_host_page_size) {
3273                 qemu_put_be64(f, block->page_size);
3274             }
3275             if (migrate_ignore_shared()) {
3276                 qemu_put_be64(f, block->mr->addr);
3277             }
3278         }
3279     }
3280 
3281     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3282     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3283 
3284     migration_ops = g_malloc0(sizeof(MigrationOps));
3285     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3286     ret = multifd_send_sync_main(f);
3287     if (ret < 0) {
3288         return ret;
3289     }
3290 
3291     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3292     qemu_fflush(f);
3293 
3294     return 0;
3295 }
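/*
 * A rough sketch of what ram_save_setup() above puts on the wire, as
 * derived from the qemu_put_* calls it makes:
 *
 *     be64:  total RAM size | RAM_SAVE_FLAG_MEM_SIZE
 *     per migratable RAMBlock:
 *         byte:  strlen(idstr)
 *         bytes: idstr (not NUL-terminated)
 *         be64:  used_length
 *         be64:  page_size  (only if postcopy-ram and page size != host's)
 *         be64:  mr->addr   (only if the ignore-shared capability is set)
 *     be64:  RAM_SAVE_FLAG_EOS
 */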
3296 
3297 /**
3298  * ram_save_iterate: iterative stage for migration
3299  *
3300  * Returns zero to indicate success and negative for error
3301  *
3302  * @f: QEMUFile where to send the data
3303  * @opaque: RAMState pointer
3304  */
3305 static int ram_save_iterate(QEMUFile *f, void *opaque)
3306 {
3307     RAMState **temp = opaque;
3308     RAMState *rs = *temp;
3309     int ret = 0;
3310     int i;
3311     int64_t t0;
3312     int done = 0;
3313 
3314     if (blk_mig_bulk_active()) {
3315         /* Avoid transferring ram during bulk phase of block migration as
3316          * the bulk phase will usually take a long time and transferring
3317          * ram updates during that time is pointless. */
3318         goto out;
3319     }
3320 
3321     /*
3322      * We'll hold this lock for a little while, but that's okay for two
3323      * reasons.  Firstly, the only other thread that can take it is the one
3324      * calling qemu_guest_free_page_hint(), which should be rare; secondly,
3325      * see MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below,
3326      * which guarantees that we'll at least release it on a regular basis.
3327      */
3328     qemu_mutex_lock(&rs->bitmap_mutex);
3329     WITH_RCU_READ_LOCK_GUARD() {
3330         if (ram_list.version != rs->last_version) {
3331             ram_state_reset(rs);
3332         }
3333 
3334         /* Read version before ram_list.blocks */
3335         smp_rmb();
3336 
3337         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3338 
3339         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3340         i = 0;
3341         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3342                postcopy_has_request(rs)) {
3343             int pages;
3344 
3345             if (qemu_file_get_error(f)) {
3346                 break;
3347             }
3348 
3349             pages = ram_find_and_save_block(rs);
3350             /* no more pages to send */
3351             if (pages == 0) {
3352                 done = 1;
3353                 break;
3354             }
3355 
3356             if (pages < 0) {
3357                 qemu_file_set_error(f, pages);
3358                 break;
3359             }
3360 
3361             rs->target_page_count += pages;
3362 
3363             /*
3364              * During postcopy, it is necessary to make sure one whole host
3365              * page is sent in one chunk.
3366              */
3367             if (migrate_postcopy_ram()) {
3368                 flush_compressed_data(rs);
3369             }
3370 
3371             /*
3372              * We want to check in the 1st loop, just in case it was the 1st
3373              * time and we had to sync the dirty bitmap.
3374              * qemu_clock_get_ns() is a bit expensive, so we only check every
3375              * few iterations.
3376              */
3377             if ((i & 63) == 0) {
3378                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3379                               1000000;
3380                 if (t1 > MAX_WAIT) {
3381                     trace_ram_save_iterate_big_wait(t1, i);
3382                     break;
3383                 }
3384             }
3385             i++;
3386         }
3387     }
3388     qemu_mutex_unlock(&rs->bitmap_mutex);
3389 
3390     /*
3391      * Must occur before EOS (or any QEMUFile operation)
3392      * because of RDMA protocol.
3393      */
3394     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3395 
3396 out:
3397     if (ret >= 0
3398         && migration_is_setup_or_active(migrate_get_current()->state)) {
3399         ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3400         if (ret < 0) {
3401             return ret;
3402         }
3403 
3404         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3405         qemu_fflush(f);
3406         ram_transferred_add(8);
3407 
3408         ret = qemu_file_get_error(f);
3409     }
3410     if (ret < 0) {
3411         return ret;
3412     }
3413 
3414     return done;
3415 }
3416 
3417 /**
3418  * ram_save_complete: function called to send the remaining amount of ram
3419  *
3420  * Returns zero to indicate success or negative on error
3421  *
3422  * Called with iothread lock
3423  *
3424  * @f: QEMUFile where to send the data
3425  * @opaque: RAMState pointer
3426  */
3427 static int ram_save_complete(QEMUFile *f, void *opaque)
3428 {
3429     RAMState **temp = opaque;
3430     RAMState *rs = *temp;
3431     int ret = 0;
3432 
3433     rs->last_stage = !migration_in_colo_state();
3434 
3435     WITH_RCU_READ_LOCK_GUARD() {
3436         if (!migration_in_postcopy()) {
3437             migration_bitmap_sync_precopy(rs);
3438         }
3439 
3440         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3441 
3442         /* try transferring iterative blocks of memory */
3443 
3444         /* flush all remaining blocks regardless of rate limiting */
3445         qemu_mutex_lock(&rs->bitmap_mutex);
3446         while (true) {
3447             int pages;
3448 
3449             pages = ram_find_and_save_block(rs);
3450             /* no more blocks to send */
3451             if (pages == 0) {
3452                 break;
3453             }
3454             if (pages < 0) {
3455                 ret = pages;
3456                 break;
3457             }
3458         }
3459         qemu_mutex_unlock(&rs->bitmap_mutex);
3460 
3461         flush_compressed_data(rs);
3462         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3463     }
3464 
3465     if (ret < 0) {
3466         return ret;
3467     }
3468 
3469     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3470     if (ret < 0) {
3471         return ret;
3472     }
3473 
3474     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3475     qemu_fflush(f);
3476 
3477     return 0;
3478 }
3479 
3480 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3481                                        uint64_t *can_postcopy)
3482 {
3483     RAMState **temp = opaque;
3484     RAMState *rs = *temp;
3485 
3486     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3487 
3488     if (migrate_postcopy_ram()) {
3489         /* We can do postcopy, and all the data is postcopiable */
3490         *can_postcopy += remaining_size;
3491     } else {
3492         *must_precopy += remaining_size;
3493     }
3494 }
3495 
3496 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3497                                     uint64_t *can_postcopy)
3498 {
3499     MigrationState *s = migrate_get_current();
3500     RAMState **temp = opaque;
3501     RAMState *rs = *temp;
3502 
3503     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3504 
3505     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3506         qemu_mutex_lock_iothread();
3507         WITH_RCU_READ_LOCK_GUARD() {
3508             migration_bitmap_sync_precopy(rs);
3509         }
3510         qemu_mutex_unlock_iothread();
3511         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3512     }
3513 
3514     if (migrate_postcopy_ram()) {
3515         /* We can do postcopy, and all the data is postcopiable */
3516         *can_postcopy += remaining_size;
3517     } else {
3518         *must_precopy += remaining_size;
3519     }
3520 }
3521 
3522 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3523 {
3524     unsigned int xh_len;
3525     int xh_flags;
3526     uint8_t *loaded_data;
3527 
3528     /* extract RLE header */
3529     xh_flags = qemu_get_byte(f);
3530     xh_len = qemu_get_be16(f);
3531 
3532     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3533         error_report("Failed to load XBZRLE page - wrong compression!");
3534         return -1;
3535     }
3536 
3537     if (xh_len > TARGET_PAGE_SIZE) {
3538         error_report("Failed to load XBZRLE page - len overflow!");
3539         return -1;
3540     }
3541     loaded_data = XBZRLE.decoded_buf;
3542     /* load data and decode */
3543     /* it can change loaded_data to point to an internal buffer */
3544     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3545 
3546     /* decode RLE */
3547     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3548                              TARGET_PAGE_SIZE) == -1) {
3549         error_report("Failed to load XBZRLE page - decode error!");
3550         return -1;
3551     }
3552 
3553     return 0;
3554 }
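/*
 * For reference, the XBZRLE payload parsed above is laid out as:
 *
 *     byte:  xh_flags  (must be ENCODING_FLAG_XBZRLE)
 *     be16:  xh_len    (encoded length, at most TARGET_PAGE_SIZE)
 *     bytes: xh_len bytes of encoded data, which xbzrle_decode_buffer()
 *            applies as a delta against the current contents of @host
 */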
3555 
3556 /**
3557  * ram_block_from_stream: read a RAMBlock id from the migration stream
3558  *
3559  * Must be called from within an RCU critical section.
3560  *
3561  * Returns a pointer from within the RCU-protected ram_list.
3562  *
3563  * @mis: the migration incoming state pointer
3564  * @f: QEMUFile where to read the data from
3565  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3566  * @channel: the channel we're using
3567  */
3568 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3569                                               QEMUFile *f, int flags,
3570                                               int channel)
3571 {
3572     RAMBlock *block = mis->last_recv_block[channel];
3573     char id[256];
3574     uint8_t len;
3575 
3576     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3577         if (!block) {
3578             error_report("Ack, bad migration stream!");
3579             return NULL;
3580         }
3581         return block;
3582     }
3583 
3584     len = qemu_get_byte(f);
3585     qemu_get_buffer(f, (uint8_t *)id, len);
3586     id[len] = 0;
3587 
3588     block = qemu_ram_block_by_name(id);
3589     if (!block) {
3590         error_report("Can't find block %s", id);
3591         return NULL;
3592     }
3593 
3594     if (ramblock_is_ignored(block)) {
3595         error_report("block %s should not be migrated !", id);
3596         return NULL;
3597     }
3598 
3599     mis->last_recv_block[channel] = block;
3600 
3601     return block;
3602 }
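/*
 * To recap the format handled above: a page header either carries
 * RAM_SAVE_FLAG_CONTINUE, meaning "same RAMBlock as the previous page on
 * this channel", or is followed by a one-byte idstr length plus the
 * (non NUL-terminated) idstr of the RAMBlock the page belongs to.
 */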
3603 
3604 static inline void *host_from_ram_block_offset(RAMBlock *block,
3605                                                ram_addr_t offset)
3606 {
3607     if (!offset_in_ramblock(block, offset)) {
3608         return NULL;
3609     }
3610 
3611     return block->host + offset;
3612 }
3613 
3614 static void *host_page_from_ram_block_offset(RAMBlock *block,
3615                                              ram_addr_t offset)
3616 {
3617     /* Note: Explicitly no check against offset_in_ramblock(). */
3618     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3619                                    block->page_size);
3620 }
3621 
3622 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3623                                                          ram_addr_t offset)
3624 {
3625     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3626 }
3627 
3628 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3629                              ram_addr_t offset, bool record_bitmap)
3630 {
3631     if (!offset_in_ramblock(block, offset)) {
3632         return NULL;
3633     }
3634     if (!block->colo_cache) {
3635         error_report("%s: colo_cache is NULL in block :%s",
3636                      __func__, block->idstr);
3637         return NULL;
3638     }
3639 
3640     /*
3641     * During a COLO checkpoint, we need a bitmap of these migrated pages.
3642     * It helps us decide which pages in the RAM cache should be flushed
3643     * into the VM's RAM later.
3644     */
3645     if (record_bitmap &&
3646         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3647         ram_state->migration_dirty_pages++;
3648     }
3649     return block->colo_cache + offset;
3650 }
3651 
3652 /**
3653  * ram_handle_compressed: handle the zero page case
3654  *
3655  * If a page (or a whole RDMA chunk) has been
3656  * determined to be zero, then zap it.
3657  *
3658  * @host: host address for the zero page
3659  * @ch: what the page is filled from.  We only support zero
3660  * @size: size of the zero page
3661  */
3662 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3663 {
3664     if (ch != 0 || !buffer_is_zero(host, size)) {
3665         memset(host, ch, size);
3666     }
3667 }
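/*
 * Note that thanks to the buffer_is_zero() check above, a page that is
 * already zero on the destination is left untouched; this avoids dirtying
 * (and, for anonymous memory, allocating) pages that could otherwise stay
 * backed by the shared zero page.
 */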
3668 
3669 /* return the size after decompression, or negative value on error */
3670 static int
3671 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3672                      const uint8_t *source, size_t source_len)
3673 {
3674     int err;
3675 
3676     err = inflateReset(stream);
3677     if (err != Z_OK) {
3678         return -1;
3679     }
3680 
3681     stream->avail_in = source_len;
3682     stream->next_in = (uint8_t *)source;
3683     stream->avail_out = dest_len;
3684     stream->next_out = dest;
3685 
3686     err = inflate(stream, Z_NO_FLUSH);
3687     if (err != Z_STREAM_END) {
3688         return -1;
3689     }
3690 
3691     return stream->total_out;
3692 }
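/*
 * The helper above expects a complete raw zlib stream covering exactly one
 * target page: the callers read a be32 length from the migration stream,
 * bound it by compressBound(TARGET_PAGE_SIZE), and hand the compressed
 * bytes to a decompression thread via decompress_data_with_multi_threads().
 */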
3693 
3694 static void *do_data_decompress(void *opaque)
3695 {
3696     DecompressParam *param = opaque;
3697     unsigned long pagesize;
3698     uint8_t *des;
3699     int len, ret;
3700 
3701     qemu_mutex_lock(&param->mutex);
3702     while (!param->quit) {
3703         if (param->des) {
3704             des = param->des;
3705             len = param->len;
3706             param->des = 0;
3707             qemu_mutex_unlock(&param->mutex);
3708 
3709             pagesize = TARGET_PAGE_SIZE;
3710 
3711             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3712                                        param->compbuf, len);
3713             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3714                 error_report("decompress data failed");
3715                 qemu_file_set_error(decomp_file, ret);
3716             }
3717 
3718             qemu_mutex_lock(&decomp_done_lock);
3719             param->done = true;
3720             qemu_cond_signal(&decomp_done_cond);
3721             qemu_mutex_unlock(&decomp_done_lock);
3722 
3723             qemu_mutex_lock(&param->mutex);
3724         } else {
3725             qemu_cond_wait(&param->cond, &param->mutex);
3726         }
3727     }
3728     qemu_mutex_unlock(&param->mutex);
3729 
3730     return NULL;
3731 }
3732 
3733 static int wait_for_decompress_done(void)
3734 {
3735     int idx, thread_count;
3736 
3737     if (!migrate_compress()) {
3738         return 0;
3739     }
3740 
3741     thread_count = migrate_decompress_threads();
3742     qemu_mutex_lock(&decomp_done_lock);
3743     for (idx = 0; idx < thread_count; idx++) {
3744         while (!decomp_param[idx].done) {
3745             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3746         }
3747     }
3748     qemu_mutex_unlock(&decomp_done_lock);
3749     return qemu_file_get_error(decomp_file);
3750 }
3751 
3752 static void compress_threads_load_cleanup(void)
3753 {
3754     int i, thread_count;
3755 
3756     if (!migrate_compress()) {
3757         return;
3758     }
3759     thread_count = migrate_decompress_threads();
3760     for (i = 0; i < thread_count; i++) {
3761         /*
3762          * We use it as an indicator of whether the thread has been
3763          * properly initialized or not.
3764          */
3765         if (!decomp_param[i].compbuf) {
3766             break;
3767         }
3768 
3769         qemu_mutex_lock(&decomp_param[i].mutex);
3770         decomp_param[i].quit = true;
3771         qemu_cond_signal(&decomp_param[i].cond);
3772         qemu_mutex_unlock(&decomp_param[i].mutex);
3773     }
3774     for (i = 0; i < thread_count; i++) {
3775         if (!decomp_param[i].compbuf) {
3776             break;
3777         }
3778 
3779         qemu_thread_join(decompress_threads + i);
3780         qemu_mutex_destroy(&decomp_param[i].mutex);
3781         qemu_cond_destroy(&decomp_param[i].cond);
3782         inflateEnd(&decomp_param[i].stream);
3783         g_free(decomp_param[i].compbuf);
3784         decomp_param[i].compbuf = NULL;
3785     }
3786     g_free(decompress_threads);
3787     g_free(decomp_param);
3788     decompress_threads = NULL;
3789     decomp_param = NULL;
3790     decomp_file = NULL;
3791 }
3792 
3793 static int compress_threads_load_setup(QEMUFile *f)
3794 {
3795     int i, thread_count;
3796 
3797     if (!migrate_compress()) {
3798         return 0;
3799     }
3800 
3801     thread_count = migrate_decompress_threads();
3802     decompress_threads = g_new0(QemuThread, thread_count);
3803     decomp_param = g_new0(DecompressParam, thread_count);
3804     qemu_mutex_init(&decomp_done_lock);
3805     qemu_cond_init(&decomp_done_cond);
3806     decomp_file = f;
3807     for (i = 0; i < thread_count; i++) {
3808         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3809             goto exit;
3810         }
3811 
3812         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3813         qemu_mutex_init(&decomp_param[i].mutex);
3814         qemu_cond_init(&decomp_param[i].cond);
3815         decomp_param[i].done = true;
3816         decomp_param[i].quit = false;
3817         qemu_thread_create(decompress_threads + i, "decompress",
3818                            do_data_decompress, decomp_param + i,
3819                            QEMU_THREAD_JOINABLE);
3820     }
3821     return 0;
3822 exit:
3823     compress_threads_load_cleanup();
3824     return -1;
3825 }
3826 
3827 static void decompress_data_with_multi_threads(QEMUFile *f,
3828                                                void *host, int len)
3829 {
3830     int idx, thread_count;
3831 
3832     thread_count = migrate_decompress_threads();
3833     QEMU_LOCK_GUARD(&decomp_done_lock);
3834     while (true) {
3835         for (idx = 0; idx < thread_count; idx++) {
3836             if (decomp_param[idx].done) {
3837                 decomp_param[idx].done = false;
3838                 qemu_mutex_lock(&decomp_param[idx].mutex);
3839                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3840                 decomp_param[idx].des = host;
3841                 decomp_param[idx].len = len;
3842                 qemu_cond_signal(&decomp_param[idx].cond);
3843                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3844                 break;
3845             }
3846         }
3847         if (idx < thread_count) {
3848             break;
3849         } else {
3850             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3851         }
3852     }
3853 }
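/*
 * In short: the load path above looks for an idle worker (done == true),
 * copies the compressed bytes into that worker's compbuf, records the
 * destination and length, and signals the worker's condition variable.
 * The worker in do_data_decompress() then inflates into the destination
 * page and signals decomp_done_cond, which both this dispatcher and
 * wait_for_decompress_done() wait on.
 */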
3854 
3855 static void colo_init_ram_state(void)
3856 {
3857     ram_state_init(&ram_state);
3858 }
3859 
3860 /*
3861  * COLO cache: this is for the secondary VM.  We cache the whole
3862  * memory of the secondary VM; the global lock must be held when
3863  * calling this helper.
3864  */
3865 int colo_init_ram_cache(void)
3866 {
3867     RAMBlock *block;
3868 
3869     WITH_RCU_READ_LOCK_GUARD() {
3870         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3871             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3872                                                     NULL, false, false);
3873             if (!block->colo_cache) {
3874                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3875                              " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3876                              block->used_length);
3877                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3878                     if (block->colo_cache) {
3879                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3880                         block->colo_cache = NULL;
3881                     }
3882                 }
3883                 return -errno;
3884             }
3885             if (!machine_dump_guest_core(current_machine)) {
3886                 qemu_madvise(block->colo_cache, block->used_length,
3887                              QEMU_MADV_DONTDUMP);
3888             }
3889         }
3890     }
3891 
3892     /*
3893     * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3894     * decide which pages in the cache should be flushed into the SVM's RAM. Here
3895     * we use the same name 'ram_bitmap' as for migration.
3896     */
3897     if (ram_bytes_total()) {
3898         RAMBlock *block;
3899 
3900         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3901             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3902             block->bmap = bitmap_new(pages);
3903         }
3904     }
3905 
3906     colo_init_ram_state();
3907     return 0;
3908 }
3909 
3910 /* TODO: duplicated with ram_init_bitmaps */
3911 void colo_incoming_start_dirty_log(void)
3912 {
3913     RAMBlock *block = NULL;
3914     /* For memory_global_dirty_log_start below. */
3915     qemu_mutex_lock_iothread();
3916     qemu_mutex_lock_ramlist();
3917 
3918     memory_global_dirty_log_sync();
3919     WITH_RCU_READ_LOCK_GUARD() {
3920         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3921             ramblock_sync_dirty_bitmap(ram_state, block);
3922             /* Discard this dirty bitmap record */
3923             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3924         }
3925         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3926     }
3927     ram_state->migration_dirty_pages = 0;
3928     qemu_mutex_unlock_ramlist();
3929     qemu_mutex_unlock_iothread();
3930 }
3931 
3932 /* The global lock must be held when calling this helper */
3933 void colo_release_ram_cache(void)
3934 {
3935     RAMBlock *block;
3936 
3937     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3938     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3939         g_free(block->bmap);
3940         block->bmap = NULL;
3941     }
3942 
3943     WITH_RCU_READ_LOCK_GUARD() {
3944         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3945             if (block->colo_cache) {
3946                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3947                 block->colo_cache = NULL;
3948             }
3949         }
3950     }
3951     ram_state_cleanup(&ram_state);
3952 }
3953 
3954 /**
3955  * ram_load_setup: Setup RAM for migration incoming side
3956  *
3957  * Returns zero to indicate success and negative for error
3958  *
3959  * @f: QEMUFile where to receive the data
3960  * @opaque: RAMState pointer
3961  */
3962 static int ram_load_setup(QEMUFile *f, void *opaque)
3963 {
3964     if (compress_threads_load_setup(f)) {
3965         return -1;
3966     }
3967 
3968     xbzrle_load_setup();
3969     ramblock_recv_map_init();
3970 
3971     return 0;
3972 }
3973 
3974 static int ram_load_cleanup(void *opaque)
3975 {
3976     RAMBlock *rb;
3977 
3978     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3979         qemu_ram_block_writeback(rb);
3980     }
3981 
3982     xbzrle_load_cleanup();
3983     compress_threads_load_cleanup();
3984 
3985     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3986         g_free(rb->receivedmap);
3987         rb->receivedmap = NULL;
3988     }
3989 
3990     return 0;
3991 }
3992 
3993 /**
3994  * ram_postcopy_incoming_init: allocate postcopy data structures
3995  *
3996  * Returns 0 for success and negative if there was one error
3997  *
3998  * @mis: current migration incoming state
3999  *
4000  * Allocate the data structures etc. needed by incoming migration with
4001  * postcopy-ram.  postcopy-ram's similarly named
4002  * postcopy_ram_incoming_init does the work.
4003  */
4004 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4005 {
4006     return postcopy_ram_incoming_init(mis);
4007 }
4008 
4009 /**
4010  * ram_load_postcopy: load a page in postcopy case
4011  *
4012  * Returns 0 for success or -errno in case of error
4013  *
4014  * Called in postcopy mode by ram_load().
4015  * rcu_read_lock is taken prior to this being called.
4016  *
4017  * @f: QEMUFile where to send the data
4018  * @channel: the channel to use for loading
4019  */
4020 int ram_load_postcopy(QEMUFile *f, int channel)
4021 {
4022     int flags = 0, ret = 0;
4023     bool place_needed = false;
4024     bool matches_target_page_size = false;
4025     MigrationIncomingState *mis = migration_incoming_get_current();
4026     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4027 
4028     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4029         ram_addr_t addr;
4030         void *page_buffer = NULL;
4031         void *place_source = NULL;
4032         RAMBlock *block = NULL;
4033         uint8_t ch;
4034         int len;
4035 
4036         addr = qemu_get_be64(f);
4037 
4038         /*
4039          * If there is a QEMUFile error, we should stop here; "addr"
4040          * may then be invalid.
4041          */
4042         ret = qemu_file_get_error(f);
4043         if (ret) {
4044             break;
4045         }
4046 
4047         flags = addr & ~TARGET_PAGE_MASK;
4048         addr &= TARGET_PAGE_MASK;
4049 
4050         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4051         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4052                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4053             block = ram_block_from_stream(mis, f, flags, channel);
4054             if (!block) {
4055                 ret = -EINVAL;
4056                 break;
4057             }
4058 
4059             /*
4060              * Relying on used_length is racy and can result in false positives.
4061              * We might place pages beyond used_length in case RAM was shrunk
4062              * while in postcopy, which is fine - trying to place via
4063              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4064              */
4065             if (!block->host || addr >= block->postcopy_length) {
4066                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4067                 ret = -EINVAL;
4068                 break;
4069             }
4070             tmp_page->target_pages++;
4071             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4072             /*
4073              * Postcopy requires that we place whole host pages atomically;
4074              * these may be huge pages for RAMBlocks that are backed by
4075              * hugetlbfs.
4076              * To make it atomic, the data is read into a temporary page
4077              * that's moved into place later.
4078              * The migration protocol uses (possibly smaller) target pages;
4079              * however, the source ensures it always sends all the components
4080              * of a host page in one chunk.
4081              */
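            /*
             * As a concrete (illustrative) example: with 4 KiB target pages
             * and a 2 MiB hugetlbfs-backed block, block->page_size /
             * TARGET_PAGE_SIZE is 512, so 512 target pages accumulate in
             * tmp_huge_page before the single atomic placement further down.
             */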
4082             page_buffer = tmp_page->tmp_huge_page +
4083                           host_page_offset_from_ram_block_offset(block, addr);
4084             /* If all TP are zero then we can optimise the place */
4085             if (tmp_page->target_pages == 1) {
4086                 tmp_page->host_addr =
4087                     host_page_from_ram_block_offset(block, addr);
4088             } else if (tmp_page->host_addr !=
4089                        host_page_from_ram_block_offset(block, addr)) {
4090                 /* not the 1st TP within the HP */
4091                 error_report("Non-same host page detected on channel %d: "
4092                              "Target host page %p, received host page %p "
4093                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4094                              channel, tmp_page->host_addr,
4095                              host_page_from_ram_block_offset(block, addr),
4096                              block->idstr, addr, tmp_page->target_pages);
4097                 ret = -EINVAL;
4098                 break;
4099             }
4100 
4101             /*
4102              * If it's the last part of a host page then we place the host
4103              * page
4104              */
4105             if (tmp_page->target_pages ==
4106                 (block->page_size / TARGET_PAGE_SIZE)) {
4107                 place_needed = true;
4108             }
4109             place_source = tmp_page->tmp_huge_page;
4110         }
4111 
4112         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4113         case RAM_SAVE_FLAG_ZERO:
4114             ch = qemu_get_byte(f);
4115             /*
4116              * We can skip setting page_buffer when
4117              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4118              */
4119             if (ch || !matches_target_page_size) {
4120                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4121             }
4122             if (ch) {
4123                 tmp_page->all_zero = false;
4124             }
4125             break;
4126 
4127         case RAM_SAVE_FLAG_PAGE:
4128             tmp_page->all_zero = false;
4129             if (!matches_target_page_size) {
4130                 /* For huge pages, we always use a temporary buffer */
4131                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4132             } else {
4133                 /*
4134                  * For small pages that match the target page size, we
4135                  * avoid the qemu_file copy.  Instead we directly use
4136                  * the buffer of QEMUFile to place the page.  Note: we
4137                  * cannot do any QEMUFile operation before using that
4138                  * buffer to make sure the buffer is valid when
4139                  * placing the page.
4140                  */
4141                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4142                                          TARGET_PAGE_SIZE);
4143             }
4144             break;
4145         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4146             tmp_page->all_zero = false;
4147             len = qemu_get_be32(f);
4148             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4149                 error_report("Invalid compressed data length: %d", len);
4150                 ret = -EINVAL;
4151                 break;
4152             }
4153             decompress_data_with_multi_threads(f, page_buffer, len);
4154             break;
4155 
4156         case RAM_SAVE_FLAG_EOS:
4157             /* normal exit */
4158             multifd_recv_sync_main();
4159             break;
4160         default:
4161             error_report("Unknown combination of migration flags: 0x%x"
4162                          " (postcopy mode)", flags);
4163             ret = -EINVAL;
4164             break;
4165         }
4166 
4167         /* Got the whole host page, wait for decompress before placing. */
4168         if (place_needed) {
4169             ret |= wait_for_decompress_done();
4170         }
4171 
4172         /* Detect any possible file errors */
4173         if (!ret && qemu_file_get_error(f)) {
4174             ret = qemu_file_get_error(f);
4175         }
4176 
4177         if (!ret && place_needed) {
4178             if (tmp_page->all_zero) {
4179                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4180             } else {
4181                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4182                                           place_source, block);
4183             }
4184             place_needed = false;
4185             postcopy_temp_page_reset(tmp_page);
4186         }
4187     }
4188 
4189     return ret;
4190 }
4191 
4192 static bool postcopy_is_running(void)
4193 {
4194     PostcopyState ps = postcopy_state_get();
4195     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4196 }
4197 
4198 /*
4199  * Flush content of RAM cache into SVM's memory.
4200  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4201  */
4202 void colo_flush_ram_cache(void)
4203 {
4204     RAMBlock *block = NULL;
4205     void *dst_host;
4206     void *src_host;
4207     unsigned long offset = 0;
4208 
4209     memory_global_dirty_log_sync();
4210     WITH_RCU_READ_LOCK_GUARD() {
4211         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4212             ramblock_sync_dirty_bitmap(ram_state, block);
4213         }
4214     }
4215 
4216     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4217     WITH_RCU_READ_LOCK_GUARD() {
4218         block = QLIST_FIRST_RCU(&ram_list.blocks);
4219 
4220         while (block) {
4221             unsigned long num = 0;
4222 
4223             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4224             if (!offset_in_ramblock(block,
4225                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4226                 offset = 0;
4227                 num = 0;
4228                 block = QLIST_NEXT_RCU(block, next);
4229             } else {
4230                 unsigned long i = 0;
4231 
4232                 for (i = 0; i < num; i++) {
4233                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4234                 }
4235                 dst_host = block->host
4236                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4237                 src_host = block->colo_cache
4238                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4239                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4240                 offset += num;
4241             }
4242         }
4243     }
4244     trace_colo_flush_ram_cache_end();
4245 }
4246 
4247 /**
4248  * ram_load_precopy: load pages in precopy case
4249  *
4250  * Returns 0 for success or -errno in case of error
4251  *
4252  * Called in precopy mode by ram_load().
4253  * rcu_read_lock is taken prior to this being called.
4254  *
4255  * @f: QEMUFile where to send the data
4256  */
4257 static int ram_load_precopy(QEMUFile *f)
4258 {
4259     MigrationIncomingState *mis = migration_incoming_get_current();
4260     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4261     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4262     bool postcopy_advised = migration_incoming_postcopy_advised();
4263     if (!migrate_compress()) {
4264         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4265     }
4266 
4267     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4268         ram_addr_t addr, total_ram_bytes;
4269         void *host = NULL, *host_bak = NULL;
4270         uint8_t ch;
4271 
4272         /*
4273          * Yield periodically to let the main loop run, but an iteration of
4274          * the main loop is expensive, so only do it every so many iterations.
4275          */
4276         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4277             aio_co_schedule(qemu_get_current_aio_context(),
4278                             qemu_coroutine_self());
4279             qemu_coroutine_yield();
4280         }
4281         i++;
4282 
4283         addr = qemu_get_be64(f);
4284         flags = addr & ~TARGET_PAGE_MASK;
4285         addr &= TARGET_PAGE_MASK;
4286 
4287         if (flags & invalid_flags) {
4288             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4289                 error_report("Received an unexpected compressed page");
4290             }
4291 
4292             ret = -EINVAL;
4293             break;
4294         }
4295 
4296         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4297                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4298             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4299                                                     RAM_CHANNEL_PRECOPY);
4300 
4301             host = host_from_ram_block_offset(block, addr);
4302             /*
4303              * After entering the COLO stage, we should not load pages
4304              * into the SVM's memory directly; we put them into colo_cache first.
4305              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4306              * Previously, we copied all of this memory during the COLO preparing
4307              * stage, while the VM had to be stopped, which is a time-consuming
4308              * process.  Here we optimize it with a trick: back up every page
4309              * during migration while COLO is enabled.  Although this affects
4310              * migration speed, it clearly reduces the downtime of backing up
4311              * all of the SVM's memory in the COLO preparing stage.
4312              */
4313             if (migration_incoming_colo_enabled()) {
4314                 if (migration_incoming_in_colo_state()) {
4315                     /* In COLO stage, put all pages into cache temporarily */
4316                     host = colo_cache_from_block_offset(block, addr, true);
4317                 } else {
4318                    /*
4319                     * In migration stage but before COLO stage,
4320                     * Put all pages into both cache and SVM's memory.
4321                     */
4322                     host_bak = colo_cache_from_block_offset(block, addr, false);
4323                 }
4324             }
4325             if (!host) {
4326                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4327                 ret = -EINVAL;
4328                 break;
4329             }
4330             if (!migration_incoming_in_colo_state()) {
4331                 ramblock_recv_bitmap_set(block, host);
4332             }
4333 
4334             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4335         }
4336 
4337         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4338         case RAM_SAVE_FLAG_MEM_SIZE:
4339             /* Synchronize RAM block list */
4340             total_ram_bytes = addr;
4341             while (!ret && total_ram_bytes) {
4342                 RAMBlock *block;
4343                 char id[256];
4344                 ram_addr_t length;
4345 
4346                 len = qemu_get_byte(f);
4347                 qemu_get_buffer(f, (uint8_t *)id, len);
4348                 id[len] = 0;
4349                 length = qemu_get_be64(f);
4350 
4351                 block = qemu_ram_block_by_name(id);
4352                 if (block && !qemu_ram_is_migratable(block)) {
4353                     error_report("block %s should not be migrated !", id);
4354                     ret = -EINVAL;
4355                 } else if (block) {
4356                     if (length != block->used_length) {
4357                         Error *local_err = NULL;
4358 
4359                         ret = qemu_ram_resize(block, length,
4360                                               &local_err);
4361                         if (local_err) {
4362                             error_report_err(local_err);
4363                         }
4364                     }
4365                     /* For postcopy we need to check hugepage sizes match */
4366                     if (postcopy_advised && migrate_postcopy_ram() &&
4367                         block->page_size != qemu_host_page_size) {
4368                         uint64_t remote_page_size = qemu_get_be64(f);
4369                         if (remote_page_size != block->page_size) {
4370                             error_report("Mismatched RAM page size %s "
4371                                          "(local) %zd != %" PRId64,
4372                                          id, block->page_size,
4373                                          remote_page_size);
4374                             ret = -EINVAL;
4375                         }
4376                     }
4377                     if (migrate_ignore_shared()) {
4378                         hwaddr addr = qemu_get_be64(f);
4379                         if (ramblock_is_ignored(block) &&
4380                             block->mr->addr != addr) {
4381                             error_report("Mismatched GPAs for block %s "
4382                                          "%" PRId64 " != %" PRId64,
4383                                          id, (uint64_t)addr,
4384                                          (uint64_t)block->mr->addr);
4385                             ret = -EINVAL;
4386                         }
4387                     }
4388                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4389                                           block->idstr);
4390                 } else {
4391                     error_report("Unknown ramblock \"%s\", cannot "
4392                                  "accept migration", id);
4393                     ret = -EINVAL;
4394                 }
4395 
4396                 total_ram_bytes -= length;
4397             }
4398             break;
4399 
4400         case RAM_SAVE_FLAG_ZERO:
4401             ch = qemu_get_byte(f);
4402             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4403             break;
4404 
4405         case RAM_SAVE_FLAG_PAGE:
4406             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4407             break;
4408 
4409         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4410             len = qemu_get_be32(f);
4411             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4412                 error_report("Invalid compressed data length: %d", len);
4413                 ret = -EINVAL;
4414                 break;
4415             }
4416             decompress_data_with_multi_threads(f, host, len);
4417             break;
4418 
4419         case RAM_SAVE_FLAG_XBZRLE:
4420             if (load_xbzrle(f, addr, host) < 0) {
4421                 error_report("Failed to decompress XBZRLE page at "
4422                              RAM_ADDR_FMT, addr);
4423                 ret = -EINVAL;
4424                 break;
4425             }
4426             break;
4427         case RAM_SAVE_FLAG_EOS:
4428             /* normal exit */
4429             multifd_recv_sync_main();
4430             break;
4431         default:
4432             if (flags & RAM_SAVE_FLAG_HOOK) {
4433                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4434             } else {
4435                 error_report("Unknown combination of migration flags: 0x%x",
4436                              flags);
4437                 ret = -EINVAL;
4438             }
4439         }
4440         if (!ret) {
4441             ret = qemu_file_get_error(f);
4442         }
4443         if (!ret && host_bak) {
4444             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4445         }
4446     }
4447 
4448     ret |= wait_for_decompress_done();
4449     return ret;
4450 }
4451 
4452 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4453 {
4454     int ret = 0;
4455     static uint64_t seq_iter;
4456     /*
4457      * If the system is running in postcopy mode, page inserts to host memory must
4458      * be atomic
4459      */
4460     bool postcopy_running = postcopy_is_running();
4461 
4462     seq_iter++;
4463 
4464     if (version_id != 4) {
4465         return -EINVAL;
4466     }
4467 
4468     /*
4469      * This RCU critical section can be very long running.
4470      * When RCU reclaims in the code start to become numerous,
4471      * it will be necessary to reduce the granularity of this
4472      * critical section.
4473      */
4474     WITH_RCU_READ_LOCK_GUARD() {
4475         if (postcopy_running) {
4476             /*
4477              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4478              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4479              * service fast page faults.
4480              */
4481             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4482         } else {
4483             ret = ram_load_precopy(f);
4484         }
4485     }
4486     trace_ram_load_complete(ret, seq_iter);
4487 
4488     return ret;
4489 }
4490 
4491 static bool ram_has_postcopy(void *opaque)
4492 {
4493     RAMBlock *rb;
4494     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4495         if (ramblock_is_pmem(rb)) {
4496             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4497                          "is not supported now!", rb->idstr, rb->host);
4498             return false;
4499         }
4500     }
4501 
4502     return migrate_postcopy_ram();
4503 }
4504 
4505 /* Sync all the dirty bitmap with destination VM.  */
4506 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4507 {
4508     RAMBlock *block;
4509     QEMUFile *file = s->to_dst_file;
4510     int ramblock_count = 0;
4511 
4512     trace_ram_dirty_bitmap_sync_start();
4513 
4514     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4515         qemu_savevm_send_recv_bitmap(file, block->idstr);
4516         trace_ram_dirty_bitmap_request(block->idstr);
4517         ramblock_count++;
4518     }
4519 
4520     trace_ram_dirty_bitmap_sync_wait();
4521 
4522     /* Wait until all the ramblocks' dirty bitmaps are synced */
4523     while (ramblock_count--) {
4524         qemu_sem_wait(&s->rp_state.rp_sem);
4525     }
4526 
4527     trace_ram_dirty_bitmap_sync_complete();
4528 
4529     return 0;
4530 }
4531 
4532 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4533 {
4534     qemu_sem_post(&s->rp_state.rp_sem);
4535 }
4536 
4537 /*
4538  * Read the received bitmap, revert it as the initial dirty bitmap.
4539  * This is only used when the postcopy migration is paused but wants
4540  * to resume from a middle point.
4541  */
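/*
 * The bitmap arrives in the following layout, as read below:
 *
 *     be64:  bitmap size in bytes (must match our local calculation,
 *            i.e. nbits / 8 rounded up and padded to a multiple of 8 bytes)
 *     bytes: that many bytes of little-endian bitmap data
 *     be64:  RAMBLOCK_RECV_BITMAP_ENDING as an end marker
 */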
4542 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4543 {
4544     int ret = -EINVAL;
4545     /* from_dst_file is always valid because we're within rp_thread */
4546     QEMUFile *file = s->rp_state.from_dst_file;
4547     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4548     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4549     uint64_t size, end_mark;
4550 
4551     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4552 
4553     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4554         error_report("%s: incorrect state %s", __func__,
4555                      MigrationStatus_str(s->state));
4556         return -EINVAL;
4557     }
4558 
4559     /*
4560      * Note: see comments in ramblock_recv_bitmap_send() on why we
4561      * need the endianness conversion and the padding.
4562      */
4563     local_size = ROUND_UP(local_size, 8);
4564 
4565     /* Add paddings */
4566     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4567 
4568     size = qemu_get_be64(file);
4569 
4570     /* The size of the bitmap should match our ramblock */
4571     if (size != local_size) {
4572         error_report("%s: ramblock '%s' bitmap size mismatch "
4573                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4574                      block->idstr, size, local_size);
4575         ret = -EINVAL;
4576         goto out;
4577     }
4578 
4579     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4580     end_mark = qemu_get_be64(file);
4581 
4582     ret = qemu_file_get_error(file);
4583     if (ret || size != local_size) {
4584         error_report("%s: read bitmap failed for ramblock '%s': %d"
4585                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4586                      __func__, block->idstr, ret, local_size, size);
4587         ret = -EIO;
4588         goto out;
4589     }
4590 
4591     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4592         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4593                      __func__, block->idstr, end_mark);
4594         ret = -EINVAL;
4595         goto out;
4596     }
4597 
4598     /*
4599      * Endianness conversion. We are during postcopy (though paused).
4600      * The dirty bitmap won't change. We can directly modify it.
4601      */
4602     bitmap_from_le(block->bmap, le_bitmap, nbits);
4603 
4604     /*
4605      * What we received is "received bitmap". Revert it as the initial
4606      * dirty bitmap for this ramblock.
4607      */
4608     bitmap_complement(block->bmap, block->bmap, nbits);
4609 
4610     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4611     ramblock_dirty_bitmap_clear_discarded_pages(block);
4612 
4613     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4614     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4615 
4616     /*
4617      * We succeeded in syncing the bitmap for the current ramblock. If this is
4618      * the last one to sync, we need to notify the main send thread.
4619      */
4620     ram_dirty_bitmap_reload_notify(s);
4621 
4622     ret = 0;
4623 out:
4624     g_free(le_bitmap);
4625     return ret;
4626 }
4627 
4628 static int ram_resume_prepare(MigrationState *s, void *opaque)
4629 {
4630     RAMState *rs = *(RAMState **)opaque;
4631     int ret;
4632 
4633     ret = ram_dirty_bitmap_sync_all(s, rs);
4634     if (ret) {
4635         return ret;
4636     }
4637 
4638     ram_state_resume_prepare(rs, s->to_dst_file);
4639 
4640     return 0;
4641 }
4642 
4643 void postcopy_preempt_shutdown_file(MigrationState *s)
4644 {
4645     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4646     qemu_fflush(s->postcopy_qemufile_src);
4647 }
4648 
4649 static SaveVMHandlers savevm_ram_handlers = {
4650     .save_setup = ram_save_setup,
4651     .save_live_iterate = ram_save_iterate,
4652     .save_live_complete_postcopy = ram_save_complete,
4653     .save_live_complete_precopy = ram_save_complete,
4654     .has_postcopy = ram_has_postcopy,
4655     .state_pending_exact = ram_state_pending_exact,
4656     .state_pending_estimate = ram_state_pending_estimate,
4657     .load_state = ram_load,
4658     .save_cleanup = ram_save_cleanup,
4659     .load_setup = ram_load_setup,
4660     .load_cleanup = ram_load_cleanup,
4661     .resume_prepare = ram_resume_prepare,
4662 };
4663 
4664 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4665                                       size_t old_size, size_t new_size)
4666 {
4667     PostcopyState ps = postcopy_state_get();
4668     ram_addr_t offset;
4669     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4670     Error *err = NULL;
4671 
4672     if (ramblock_is_ignored(rb)) {
4673         return;
4674     }
4675 
4676     if (!migration_is_idle()) {
4677         /*
4678          * Precopy code on the source cannot deal with the size of RAM blocks
4679          * changing at random points in time - especially after sending the
4680          * RAM block sizes in the migration stream, they must no longer change.
4681          * Abort and indicate a proper reason.
4682          */
4683         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4684         migration_cancel(err);
4685         error_free(err);
4686     }
4687 
4688     switch (ps) {
4689     case POSTCOPY_INCOMING_ADVISE:
4690         /*
4691          * Update what ram_postcopy_incoming_init()->init_range() does at the
4692          * time postcopy was advised. Syncing RAM blocks with the source will
4693          * result in RAM resizes.
4694          */
4695         if (old_size < new_size) {
4696             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4697                 error_report("RAM block '%s' discard of resized RAM failed",
4698                              rb->idstr);
4699             }
4700         }
4701         rb->postcopy_length = new_size;
4702         break;
4703     case POSTCOPY_INCOMING_NONE:
4704     case POSTCOPY_INCOMING_RUNNING:
4705     case POSTCOPY_INCOMING_END:
4706         /*
4707          * Once our guest is running, postcopy no longer cares about
4708          * resizes. When growing, the new memory was not available on the
4709          * source, so no handler is needed.
4710          */
4711         break;
4712     default:
4713         error_report("RAM block '%s' resized during postcopy state: %d",
4714                      rb->idstr, ps);
4715         exit(-1);
4716     }
4717 }
4718 
4719 static RAMBlockNotifier ram_mig_ram_notifier = {
4720     .ram_block_resized = ram_mig_ram_block_resized,
4721 };
4722 
4723 void ram_mig_init(void)
4724 {
4725     qemu_mutex_init(&XBZRLE.lock);
4726     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4727     ram_block_notifier_add(&ram_mig_ram_notifier);
4728 }
4729