xref: /openbmc/qemu/migration/ram.c (revision f9e1ef74)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
62 
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
66 
67 /***********************************************************/
68 /* ram save/restore */
69 
70 /*
71  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
72  * worked for pages that were filled with the same char.  We switched
73  * it to only search for the zero value.  And to avoid confusion with
74  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
75  */
76 /*
77  * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
78  */
79 #define RAM_SAVE_FLAG_FULL     0x01
80 #define RAM_SAVE_FLAG_ZERO     0x02
81 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
82 #define RAM_SAVE_FLAG_PAGE     0x08
83 #define RAM_SAVE_FLAG_EOS      0x10
84 #define RAM_SAVE_FLAG_CONTINUE 0x20
85 #define RAM_SAVE_FLAG_XBZRLE   0x40
86 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
87 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
88 /* We can't use any flag that is bigger than 0x200 */
89 
90 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
91      uint8_t *, int) = xbzrle_encode_buffer;
92 #if defined(CONFIG_AVX512BW_OPT)
93 #include "qemu/cpuid.h"
94 static void __attribute__((constructor)) init_cpu_flag(void)
95 {
96     unsigned max = __get_cpuid_max(0, NULL);
97     int a, b, c, d;
98     if (max >= 1) {
99         __cpuid(1, a, b, c, d);
100          /* We must check that AVX is not just available, but usable.  */
101         if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
102             int bv;
103             __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
104             __cpuid_count(7, 0, a, b, c, d);
105            /* 0xe6:
106             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
107             *                    and ZMM16-ZMM31 state are enabled by OS)
108             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
109             */
110             if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
111                 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
112             }
113         }
114     }
115 }
116 #endif
117 
118 XBZRLECacheStats xbzrle_counters;
119 
120 /* used by the search for pages to send */
121 struct PageSearchStatus {
122     /* The migration channel used for a specific host page */
123     QEMUFile    *pss_channel;
124     /* Last block from where we have sent data */
125     RAMBlock *last_sent_block;
126     /* Current block being searched */
127     RAMBlock    *block;
128     /* Current page to search from */
129     unsigned long page;
130     /* Set once we wrap around */
131     bool         complete_round;
132     /* Whether we're sending a host page */
133     bool          host_page_sending;
134     /* The start/end of current host page.  Invalid if host_page_sending==false */
135     unsigned long host_page_start;
136     unsigned long host_page_end;
137 };
138 typedef struct PageSearchStatus PageSearchStatus;
139 
140 /* struct contains XBZRLE cache and a static page
141    used by the compression */
142 static struct {
143     /* buffer used for XBZRLE encoding */
144     uint8_t *encoded_buf;
145     /* buffer for storing page content */
146     uint8_t *current_buf;
147     /* Cache for XBZRLE, Protected by lock. */
148     PageCache *cache;
149     QemuMutex lock;
150     /* it will store a page full of zeros */
151     uint8_t *zero_target_page;
152     /* buffer used for XBZRLE decoding */
153     uint8_t *decoded_buf;
154 } XBZRLE;
155 
156 static void XBZRLE_cache_lock(void)
157 {
158     if (migrate_use_xbzrle()) {
159         qemu_mutex_lock(&XBZRLE.lock);
160     }
161 }
162 
163 static void XBZRLE_cache_unlock(void)
164 {
165     if (migrate_use_xbzrle()) {
166         qemu_mutex_unlock(&XBZRLE.lock);
167     }
168 }
169 
170 /**
171  * xbzrle_cache_resize: resize the xbzrle cache
172  *
173  * This function is called from migrate_params_apply in main
174  * thread, possibly while a migration is in progress.  A running
175  * migration may be using the cache and might finish during this call,
176  * hence changes to the cache are protected by XBZRLE.lock().
177  *
178  * Returns 0 for success or -1 for error
179  *
180  * @new_size: new cache size
181  * @errp: set *errp if the check failed, with reason
182  */
183 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
184 {
185     PageCache *new_cache;
186     int64_t ret = 0;
187 
188     /* Check for truncation */
189     if (new_size != (size_t)new_size) {
190         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
191                    "exceeding address space");
192         return -1;
193     }
194 
195     if (new_size == migrate_xbzrle_cache_size()) {
196         /* nothing to do */
197         return 0;
198     }
199 
200     XBZRLE_cache_lock();
201 
202     if (XBZRLE.cache != NULL) {
203         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
204         if (!new_cache) {
205             ret = -1;
206             goto out;
207         }
208 
209         cache_fini(XBZRLE.cache);
210         XBZRLE.cache = new_cache;
211     }
212 out:
213     XBZRLE_cache_unlock();
214     return ret;
215 }
216 
217 static bool postcopy_preempt_active(void)
218 {
219     return migrate_postcopy_preempt() && migration_in_postcopy();
220 }
221 
222 bool ramblock_is_ignored(RAMBlock *block)
223 {
224     return !qemu_ram_is_migratable(block) ||
225            (migrate_ignore_shared() && qemu_ram_is_shared(block));
226 }
227 
228 #undef RAMBLOCK_FOREACH
229 
230 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
231 {
232     RAMBlock *block;
233     int ret = 0;
234 
235     RCU_READ_LOCK_GUARD();
236 
237     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
238         ret = func(block, opaque);
239         if (ret) {
240             break;
241         }
242     }
243     return ret;
244 }
245 
246 static void ramblock_recv_map_init(void)
247 {
248     RAMBlock *rb;
249 
250     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
251         assert(!rb->receivedmap);
252         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
253     }
254 }
255 
256 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
257 {
258     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
259                     rb->receivedmap);
260 }
261 
262 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
263 {
264     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
265 }
266 
267 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
268 {
269     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
270 }
271 
272 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
273                                     size_t nr)
274 {
275     bitmap_set_atomic(rb->receivedmap,
276                       ramblock_recv_bitmap_offset(host_addr, rb),
277                       nr);
278 }
279 
280 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
281 
282 /*
283  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
284  *
285  * Returns >0 if success with sent bytes, or <0 if error.
286  */
287 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
288                                   const char *block_name)
289 {
290     RAMBlock *block = qemu_ram_block_by_name(block_name);
291     unsigned long *le_bitmap, nbits;
292     uint64_t size;
293 
294     if (!block) {
295         error_report("%s: invalid block name: %s", __func__, block_name);
296         return -1;
297     }
298 
299     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
300 
301     /*
302      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
303      * machines we may need 4 more bytes for padding (see below
304      * comment). So extend it a bit before hand.
305      */
306     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
307 
308     /*
309      * Always use little endian when sending the bitmap. This is
310      * required that when source and destination VMs are not using the
311      * same endianness. (Note: big endian won't work.)
312      */
313     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
314 
315     /* Size of the bitmap, in bytes */
316     size = DIV_ROUND_UP(nbits, 8);
317 
318     /*
319      * size is always aligned to 8 bytes for 64bit machines, but it
320      * may not be true for 32bit machines. We need this padding to
321      * make sure the migration can survive even between 32bit and
322      * 64bit machines.
323      */
324     size = ROUND_UP(size, 8);
325 
326     qemu_put_be64(file, size);
327     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
328     /*
329      * Mark as an end, in case the middle part is screwed up due to
330      * some "mysterious" reason.
331      */
332     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
333     qemu_fflush(file);
334 
335     g_free(le_bitmap);
336 
337     if (qemu_file_get_error(file)) {
338         return qemu_file_get_error(file);
339     }
340 
341     return size + sizeof(size);
342 }
343 
344 /*
345  * An outstanding page request, on the source, having been received
346  * and queued
347  */
348 struct RAMSrcPageRequest {
349     RAMBlock *rb;
350     hwaddr    offset;
351     hwaddr    len;
352 
353     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
354 };
355 
356 /* State of RAM for migration */
357 struct RAMState {
358     /*
359      * PageSearchStatus structures for the channels when send pages.
360      * Protected by the bitmap_mutex.
361      */
362     PageSearchStatus pss[RAM_CHANNEL_MAX];
363     /* UFFD file descriptor, used in 'write-tracking' migration */
364     int uffdio_fd;
365     /* total ram size in bytes */
366     uint64_t ram_bytes_total;
367     /* Last block that we have visited searching for dirty pages */
368     RAMBlock *last_seen_block;
369     /* Last dirty target page we have sent */
370     ram_addr_t last_page;
371     /* last ram version we have seen */
372     uint32_t last_version;
373     /* How many times we have dirty too many pages */
374     int dirty_rate_high_cnt;
375     /* these variables are used for bitmap sync */
376     /* last time we did a full bitmap_sync */
377     int64_t time_last_bitmap_sync;
378     /* bytes transferred at start_time */
379     uint64_t bytes_xfer_prev;
380     /* number of dirty pages since start_time */
381     uint64_t num_dirty_pages_period;
382     /* xbzrle misses since the beginning of the period */
383     uint64_t xbzrle_cache_miss_prev;
384     /* Amount of xbzrle pages since the beginning of the period */
385     uint64_t xbzrle_pages_prev;
386     /* Amount of xbzrle encoded bytes since the beginning of the period */
387     uint64_t xbzrle_bytes_prev;
388     /* Start using XBZRLE (e.g., after the first round). */
389     bool xbzrle_enabled;
390     /* Are we on the last stage of migration */
391     bool last_stage;
392     /* compression statistics since the beginning of the period */
393     /* amount of count that no free thread to compress data */
394     uint64_t compress_thread_busy_prev;
395     /* amount bytes after compression */
396     uint64_t compressed_size_prev;
397     /* amount of compressed pages */
398     uint64_t compress_pages_prev;
399 
400     /* total handled target pages at the beginning of period */
401     uint64_t target_page_count_prev;
402     /* total handled target pages since start */
403     uint64_t target_page_count;
404     /* number of dirty bits in the bitmap */
405     uint64_t migration_dirty_pages;
406     /*
407      * Protects:
408      * - dirty/clear bitmap
409      * - migration_dirty_pages
410      * - pss structures
411      */
412     QemuMutex bitmap_mutex;
413     /* The RAMBlock used in the last src_page_requests */
414     RAMBlock *last_req_rb;
415     /* Queue of outstanding page requests from the destination */
416     QemuMutex src_page_req_mutex;
417     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
418 };
419 typedef struct RAMState RAMState;
420 
421 static RAMState *ram_state;
422 
423 static NotifierWithReturnList precopy_notifier_list;
424 
425 /* Whether postcopy has queued requests? */
426 static bool postcopy_has_request(RAMState *rs)
427 {
428     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
429 }
430 
431 void precopy_infrastructure_init(void)
432 {
433     notifier_with_return_list_init(&precopy_notifier_list);
434 }
435 
436 void precopy_add_notifier(NotifierWithReturn *n)
437 {
438     notifier_with_return_list_add(&precopy_notifier_list, n);
439 }
440 
441 void precopy_remove_notifier(NotifierWithReturn *n)
442 {
443     notifier_with_return_remove(n);
444 }
445 
446 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
447 {
448     PrecopyNotifyData pnd;
449     pnd.reason = reason;
450     pnd.errp = errp;
451 
452     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
453 }
454 
455 uint64_t ram_bytes_remaining(void)
456 {
457     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
458                        0;
459 }
460 
461 RAMStats ram_counters;
462 
463 void ram_transferred_add(uint64_t bytes)
464 {
465     if (runstate_is_running()) {
466         stat64_add(&ram_counters.precopy_bytes, bytes);
467     } else if (migration_in_postcopy()) {
468         stat64_add(&ram_counters.postcopy_bytes, bytes);
469     } else {
470         stat64_add(&ram_counters.downtime_bytes, bytes);
471     }
472     stat64_add(&ram_counters.transferred, bytes);
473 }
474 
475 struct MigrationOps {
476     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
477 };
478 typedef struct MigrationOps MigrationOps;
479 
480 MigrationOps *migration_ops;
481 
482 CompressionStats compression_counters;
483 
484 struct CompressParam {
485     bool done;
486     bool quit;
487     bool zero_page;
488     QEMUFile *file;
489     QemuMutex mutex;
490     QemuCond cond;
491     RAMBlock *block;
492     ram_addr_t offset;
493 
494     /* internally used fields */
495     z_stream stream;
496     uint8_t *originbuf;
497 };
498 typedef struct CompressParam CompressParam;
499 
500 struct DecompressParam {
501     bool done;
502     bool quit;
503     QemuMutex mutex;
504     QemuCond cond;
505     void *des;
506     uint8_t *compbuf;
507     int len;
508     z_stream stream;
509 };
510 typedef struct DecompressParam DecompressParam;
511 
512 static CompressParam *comp_param;
513 static QemuThread *compress_threads;
514 /* comp_done_cond is used to wake up the migration thread when
515  * one of the compression threads has finished the compression.
516  * comp_done_lock is used to co-work with comp_done_cond.
517  */
518 static QemuMutex comp_done_lock;
519 static QemuCond comp_done_cond;
520 
521 static QEMUFile *decomp_file;
522 static DecompressParam *decomp_param;
523 static QemuThread *decompress_threads;
524 static QemuMutex decomp_done_lock;
525 static QemuCond decomp_done_cond;
526 
527 static int ram_save_host_page_urgent(PageSearchStatus *pss);
528 
529 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
530                                  ram_addr_t offset, uint8_t *source_buf);
531 
532 /* NOTE: page is the PFN not real ram_addr_t. */
533 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
534 {
535     pss->block = rb;
536     pss->page = page;
537     pss->complete_round = false;
538 }
539 
540 /*
541  * Check whether two PSSs are actively sending the same page.  Return true
542  * if it is, false otherwise.
543  */
544 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
545 {
546     return pss1->host_page_sending && pss2->host_page_sending &&
547         (pss1->host_page_start == pss2->host_page_start);
548 }
549 
550 static void *do_data_compress(void *opaque)
551 {
552     CompressParam *param = opaque;
553     RAMBlock *block;
554     ram_addr_t offset;
555     bool zero_page;
556 
557     qemu_mutex_lock(&param->mutex);
558     while (!param->quit) {
559         if (param->block) {
560             block = param->block;
561             offset = param->offset;
562             param->block = NULL;
563             qemu_mutex_unlock(&param->mutex);
564 
565             zero_page = do_compress_ram_page(param->file, &param->stream,
566                                              block, offset, param->originbuf);
567 
568             qemu_mutex_lock(&comp_done_lock);
569             param->done = true;
570             param->zero_page = zero_page;
571             qemu_cond_signal(&comp_done_cond);
572             qemu_mutex_unlock(&comp_done_lock);
573 
574             qemu_mutex_lock(&param->mutex);
575         } else {
576             qemu_cond_wait(&param->cond, &param->mutex);
577         }
578     }
579     qemu_mutex_unlock(&param->mutex);
580 
581     return NULL;
582 }
583 
584 static void compress_threads_save_cleanup(void)
585 {
586     int i, thread_count;
587 
588     if (!migrate_use_compression() || !comp_param) {
589         return;
590     }
591 
592     thread_count = migrate_compress_threads();
593     for (i = 0; i < thread_count; i++) {
594         /*
595          * we use it as a indicator which shows if the thread is
596          * properly init'd or not
597          */
598         if (!comp_param[i].file) {
599             break;
600         }
601 
602         qemu_mutex_lock(&comp_param[i].mutex);
603         comp_param[i].quit = true;
604         qemu_cond_signal(&comp_param[i].cond);
605         qemu_mutex_unlock(&comp_param[i].mutex);
606 
607         qemu_thread_join(compress_threads + i);
608         qemu_mutex_destroy(&comp_param[i].mutex);
609         qemu_cond_destroy(&comp_param[i].cond);
610         deflateEnd(&comp_param[i].stream);
611         g_free(comp_param[i].originbuf);
612         qemu_fclose(comp_param[i].file);
613         comp_param[i].file = NULL;
614     }
615     qemu_mutex_destroy(&comp_done_lock);
616     qemu_cond_destroy(&comp_done_cond);
617     g_free(compress_threads);
618     g_free(comp_param);
619     compress_threads = NULL;
620     comp_param = NULL;
621 }
622 
623 static int compress_threads_save_setup(void)
624 {
625     int i, thread_count;
626 
627     if (!migrate_use_compression()) {
628         return 0;
629     }
630     thread_count = migrate_compress_threads();
631     compress_threads = g_new0(QemuThread, thread_count);
632     comp_param = g_new0(CompressParam, thread_count);
633     qemu_cond_init(&comp_done_cond);
634     qemu_mutex_init(&comp_done_lock);
635     for (i = 0; i < thread_count; i++) {
636         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
637         if (!comp_param[i].originbuf) {
638             goto exit;
639         }
640 
641         if (deflateInit(&comp_param[i].stream,
642                         migrate_compress_level()) != Z_OK) {
643             g_free(comp_param[i].originbuf);
644             goto exit;
645         }
646 
647         /* comp_param[i].file is just used as a dummy buffer to save data,
648          * set its ops to empty.
649          */
650         comp_param[i].file = qemu_file_new_output(
651             QIO_CHANNEL(qio_channel_null_new()));
652         comp_param[i].done = true;
653         comp_param[i].quit = false;
654         qemu_mutex_init(&comp_param[i].mutex);
655         qemu_cond_init(&comp_param[i].cond);
656         qemu_thread_create(compress_threads + i, "compress",
657                            do_data_compress, comp_param + i,
658                            QEMU_THREAD_JOINABLE);
659     }
660     return 0;
661 
662 exit:
663     compress_threads_save_cleanup();
664     return -1;
665 }
666 
667 /**
668  * save_page_header: write page header to wire
669  *
670  * If this is the 1st block, it also writes the block identification
671  *
672  * Returns the number of bytes written
673  *
674  * @pss: current PSS channel status
675  * @block: block that contains the page we want to send
676  * @offset: offset inside the block for the page
677  *          in the lower bits, it contains flags
678  */
679 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
680                                RAMBlock *block, ram_addr_t offset)
681 {
682     size_t size, len;
683     bool same_block = (block == pss->last_sent_block);
684 
685     if (same_block) {
686         offset |= RAM_SAVE_FLAG_CONTINUE;
687     }
688     qemu_put_be64(f, offset);
689     size = 8;
690 
691     if (!same_block) {
692         len = strlen(block->idstr);
693         qemu_put_byte(f, len);
694         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
695         size += 1 + len;
696         pss->last_sent_block = block;
697     }
698     return size;
699 }
700 
701 /**
702  * mig_throttle_guest_down: throttle down the guest
703  *
704  * Reduce amount of guest cpu execution to hopefully slow down memory
705  * writes. If guest dirty memory rate is reduced below the rate at
706  * which we can transfer pages to the destination then we should be
707  * able to complete migration. Some workloads dirty memory way too
708  * fast and will not effectively converge, even with auto-converge.
709  */
710 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
711                                     uint64_t bytes_dirty_threshold)
712 {
713     MigrationState *s = migrate_get_current();
714     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
715     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
716     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
717     int pct_max = s->parameters.max_cpu_throttle;
718 
719     uint64_t throttle_now = cpu_throttle_get_percentage();
720     uint64_t cpu_now, cpu_ideal, throttle_inc;
721 
722     /* We have not started throttling yet. Let's start it. */
723     if (!cpu_throttle_active()) {
724         cpu_throttle_set(pct_initial);
725     } else {
726         /* Throttling already on, just increase the rate */
727         if (!pct_tailslow) {
728             throttle_inc = pct_increment;
729         } else {
730             /* Compute the ideal CPU percentage used by Guest, which may
731              * make the dirty rate match the dirty rate threshold. */
732             cpu_now = 100 - throttle_now;
733             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
734                         bytes_dirty_period);
735             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
736         }
737         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
738     }
739 }
740 
741 void mig_throttle_counter_reset(void)
742 {
743     RAMState *rs = ram_state;
744 
745     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
746     rs->num_dirty_pages_period = 0;
747     rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
748 }
749 
750 /**
751  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
752  *
753  * @rs: current RAM state
754  * @current_addr: address for the zero page
755  *
756  * Update the xbzrle cache to reflect a page that's been sent as all 0.
757  * The important thing is that a stale (not-yet-0'd) page be replaced
758  * by the new data.
759  * As a bonus, if the page wasn't in the cache it gets added so that
760  * when a small write is made into the 0'd page it gets XBZRLE sent.
761  */
762 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
763 {
764     /* We don't care if this fails to allocate a new cache page
765      * as long as it updated an old one */
766     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
767                  stat64_get(&ram_counters.dirty_sync_count));
768 }
769 
770 #define ENCODING_FLAG_XBZRLE 0x1
771 
772 /**
773  * save_xbzrle_page: compress and send current page
774  *
775  * Returns: 1 means that we wrote the page
776  *          0 means that page is identical to the one already sent
777  *          -1 means that xbzrle would be longer than normal
778  *
779  * @rs: current RAM state
780  * @pss: current PSS channel
781  * @current_data: pointer to the address of the page contents
782  * @current_addr: addr of the page
783  * @block: block that contains the page we want to send
784  * @offset: offset inside the block for the page
785  */
786 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
787                             uint8_t **current_data, ram_addr_t current_addr,
788                             RAMBlock *block, ram_addr_t offset)
789 {
790     int encoded_len = 0, bytes_xbzrle;
791     uint8_t *prev_cached_page;
792     QEMUFile *file = pss->pss_channel;
793     uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
794 
795     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
796         xbzrle_counters.cache_miss++;
797         if (!rs->last_stage) {
798             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
799                              generation) == -1) {
800                 return -1;
801             } else {
802                 /* update *current_data when the page has been
803                    inserted into cache */
804                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
805             }
806         }
807         return -1;
808     }
809 
810     /*
811      * Reaching here means the page has hit the xbzrle cache, no matter what
812      * encoding result it is (normal encoding, overflow or skipping the page),
813      * count the page as encoded. This is used to calculate the encoding rate.
814      *
815      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
816      * 2nd page turns out to be skipped (i.e. no new bytes written to the
817      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
818      * skipped page included. In this way, the encoding rate can tell if the
819      * guest page is good for xbzrle encoding.
820      */
821     xbzrle_counters.pages++;
822     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
823 
824     /* save current buffer into memory */
825     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
826 
827     /* XBZRLE encoding (if there is no overflow) */
828     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
829                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
830                                             TARGET_PAGE_SIZE);
831 
832     /*
833      * Update the cache contents, so that it corresponds to the data
834      * sent, in all cases except where we skip the page.
835      */
836     if (!rs->last_stage && encoded_len != 0) {
837         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
838         /*
839          * In the case where we couldn't compress, ensure that the caller
840          * sends the data from the cache, since the guest might have
841          * changed the RAM since we copied it.
842          */
843         *current_data = prev_cached_page;
844     }
845 
846     if (encoded_len == 0) {
847         trace_save_xbzrle_page_skipping();
848         return 0;
849     } else if (encoded_len == -1) {
850         trace_save_xbzrle_page_overflow();
851         xbzrle_counters.overflow++;
852         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
853         return -1;
854     }
855 
856     /* Send XBZRLE based compressed page */
857     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
858                                     offset | RAM_SAVE_FLAG_XBZRLE);
859     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
860     qemu_put_be16(file, encoded_len);
861     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
862     bytes_xbzrle += encoded_len + 1 + 2;
863     /*
864      * Like compressed_size (please see update_compress_thread_counts),
865      * the xbzrle encoded bytes don't count the 8 byte header with
866      * RAM_SAVE_FLAG_CONTINUE.
867      */
868     xbzrle_counters.bytes += bytes_xbzrle - 8;
869     ram_transferred_add(bytes_xbzrle);
870 
871     return 1;
872 }
873 
874 /**
875  * pss_find_next_dirty: find the next dirty page of current ramblock
876  *
877  * This function updates pss->page to point to the next dirty page index
878  * within the ramblock to migrate, or the end of ramblock when nothing
879  * found.  Note that when pss->host_page_sending==true it means we're
880  * during sending a host page, so we won't look for dirty page that is
881  * outside the host page boundary.
882  *
883  * @pss: the current page search status
884  */
885 static void pss_find_next_dirty(PageSearchStatus *pss)
886 {
887     RAMBlock *rb = pss->block;
888     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
889     unsigned long *bitmap = rb->bmap;
890 
891     if (ramblock_is_ignored(rb)) {
892         /* Points directly to the end, so we know no dirty page */
893         pss->page = size;
894         return;
895     }
896 
897     /*
898      * If during sending a host page, only look for dirty pages within the
899      * current host page being send.
900      */
901     if (pss->host_page_sending) {
902         assert(pss->host_page_end);
903         size = MIN(size, pss->host_page_end);
904     }
905 
906     pss->page = find_next_bit(bitmap, size, pss->page);
907 }
908 
909 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
910                                                        unsigned long page)
911 {
912     uint8_t shift;
913     hwaddr size, start;
914 
915     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
916         return;
917     }
918 
919     shift = rb->clear_bmap_shift;
920     /*
921      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
922      * can make things easier sometimes since then start address
923      * of the small chunk will always be 64 pages aligned so the
924      * bitmap will always be aligned to unsigned long. We should
925      * even be able to remove this restriction but I'm simply
926      * keeping it.
927      */
928     assert(shift >= 6);
929 
930     size = 1ULL << (TARGET_PAGE_BITS + shift);
931     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
932     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
933     memory_region_clear_dirty_bitmap(rb->mr, start, size);
934 }
935 
936 static void
937 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
938                                                  unsigned long start,
939                                                  unsigned long npages)
940 {
941     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
942     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
943     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
944 
945     /*
946      * Clear pages from start to start + npages - 1, so the end boundary is
947      * exclusive.
948      */
949     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
950         migration_clear_memory_region_dirty_bitmap(rb, i);
951     }
952 }
953 
954 /*
955  * colo_bitmap_find_diry:find contiguous dirty pages from start
956  *
957  * Returns the page offset within memory region of the start of the contiguout
958  * dirty page
959  *
960  * @rs: current RAM state
961  * @rb: RAMBlock where to search for dirty pages
962  * @start: page where we start the search
963  * @num: the number of contiguous dirty pages
964  */
965 static inline
966 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
967                                      unsigned long start, unsigned long *num)
968 {
969     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
970     unsigned long *bitmap = rb->bmap;
971     unsigned long first, next;
972 
973     *num = 0;
974 
975     if (ramblock_is_ignored(rb)) {
976         return size;
977     }
978 
979     first = find_next_bit(bitmap, size, start);
980     if (first >= size) {
981         return first;
982     }
983     next = find_next_zero_bit(bitmap, size, first + 1);
984     assert(next >= first);
985     *num = next - first;
986     return first;
987 }
988 
989 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
990                                                 RAMBlock *rb,
991                                                 unsigned long page)
992 {
993     bool ret;
994 
995     /*
996      * Clear dirty bitmap if needed.  This _must_ be called before we
997      * send any of the page in the chunk because we need to make sure
998      * we can capture further page content changes when we sync dirty
999      * log the next time.  So as long as we are going to send any of
1000      * the page in the chunk we clear the remote dirty bitmap for all.
1001      * Clearing it earlier won't be a problem, but too late will.
1002      */
1003     migration_clear_memory_region_dirty_bitmap(rb, page);
1004 
1005     ret = test_and_clear_bit(page, rb->bmap);
1006     if (ret) {
1007         rs->migration_dirty_pages--;
1008     }
1009 
1010     return ret;
1011 }
1012 
1013 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1014                                        void *opaque)
1015 {
1016     const hwaddr offset = section->offset_within_region;
1017     const hwaddr size = int128_get64(section->size);
1018     const unsigned long start = offset >> TARGET_PAGE_BITS;
1019     const unsigned long npages = size >> TARGET_PAGE_BITS;
1020     RAMBlock *rb = section->mr->ram_block;
1021     uint64_t *cleared_bits = opaque;
1022 
1023     /*
1024      * We don't grab ram_state->bitmap_mutex because we expect to run
1025      * only when starting migration or during postcopy recovery where
1026      * we don't have concurrent access.
1027      */
1028     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1029         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1030     }
1031     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1032     bitmap_clear(rb->bmap, start, npages);
1033 }
1034 
1035 /*
1036  * Exclude all dirty pages from migration that fall into a discarded range as
1037  * managed by a RamDiscardManager responsible for the mapped memory region of
1038  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1039  *
1040  * Discarded pages ("logically unplugged") have undefined content and must
1041  * not get migrated, because even reading these pages for migration might
1042  * result in undesired behavior.
1043  *
1044  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1045  *
1046  * Note: The result is only stable while migrating (precopy/postcopy).
1047  */
1048 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1049 {
1050     uint64_t cleared_bits = 0;
1051 
1052     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1053         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1054         MemoryRegionSection section = {
1055             .mr = rb->mr,
1056             .offset_within_region = 0,
1057             .size = int128_make64(qemu_ram_get_used_length(rb)),
1058         };
1059 
1060         ram_discard_manager_replay_discarded(rdm, &section,
1061                                              dirty_bitmap_clear_section,
1062                                              &cleared_bits);
1063     }
1064     return cleared_bits;
1065 }
1066 
1067 /*
1068  * Check if a host-page aligned page falls into a discarded range as managed by
1069  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1070  *
1071  * Note: The result is only stable while migrating (precopy/postcopy).
1072  */
1073 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1074 {
1075     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1076         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1077         MemoryRegionSection section = {
1078             .mr = rb->mr,
1079             .offset_within_region = start,
1080             .size = int128_make64(qemu_ram_pagesize(rb)),
1081         };
1082 
1083         return !ram_discard_manager_is_populated(rdm, &section);
1084     }
1085     return false;
1086 }
1087 
1088 /* Called with RCU critical section */
1089 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1090 {
1091     uint64_t new_dirty_pages =
1092         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1093 
1094     rs->migration_dirty_pages += new_dirty_pages;
1095     rs->num_dirty_pages_period += new_dirty_pages;
1096 }
1097 
1098 /**
1099  * ram_pagesize_summary: calculate all the pagesizes of a VM
1100  *
1101  * Returns a summary bitmap of the page sizes of all RAMBlocks
1102  *
1103  * For VMs with just normal pages this is equivalent to the host page
1104  * size. If it's got some huge pages then it's the OR of all the
1105  * different page sizes.
1106  */
1107 uint64_t ram_pagesize_summary(void)
1108 {
1109     RAMBlock *block;
1110     uint64_t summary = 0;
1111 
1112     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1113         summary |= block->page_size;
1114     }
1115 
1116     return summary;
1117 }
1118 
1119 uint64_t ram_get_total_transferred_pages(void)
1120 {
1121     return stat64_get(&ram_counters.normal_pages) +
1122         stat64_get(&ram_counters.zero_pages) +
1123         compression_counters.pages + xbzrle_counters.pages;
1124 }
1125 
1126 static void migration_update_rates(RAMState *rs, int64_t end_time)
1127 {
1128     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1129     double compressed_size;
1130 
1131     /* calculate period counters */
1132     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1133                 / (end_time - rs->time_last_bitmap_sync);
1134 
1135     if (!page_count) {
1136         return;
1137     }
1138 
1139     if (migrate_use_xbzrle()) {
1140         double encoded_size, unencoded_size;
1141 
1142         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1143             rs->xbzrle_cache_miss_prev) / page_count;
1144         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1145         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1146                          TARGET_PAGE_SIZE;
1147         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1148         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1149             xbzrle_counters.encoding_rate = 0;
1150         } else {
1151             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1152         }
1153         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1154         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1155     }
1156 
1157     if (migrate_use_compression()) {
1158         compression_counters.busy_rate = (double)(compression_counters.busy -
1159             rs->compress_thread_busy_prev) / page_count;
1160         rs->compress_thread_busy_prev = compression_counters.busy;
1161 
1162         compressed_size = compression_counters.compressed_size -
1163                           rs->compressed_size_prev;
1164         if (compressed_size) {
1165             double uncompressed_size = (compression_counters.pages -
1166                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1167 
1168             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1169             compression_counters.compression_rate =
1170                                         uncompressed_size / compressed_size;
1171 
1172             rs->compress_pages_prev = compression_counters.pages;
1173             rs->compressed_size_prev = compression_counters.compressed_size;
1174         }
1175     }
1176 }
1177 
1178 static void migration_trigger_throttle(RAMState *rs)
1179 {
1180     MigrationState *s = migrate_get_current();
1181     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1182     uint64_t bytes_xfer_period =
1183         stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
1184     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1185     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1186 
1187     /* During block migration the auto-converge logic incorrectly detects
1188      * that ram migration makes no progress. Avoid this by disabling the
1189      * throttling logic during the bulk phase of block migration. */
1190     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1191         /* The following detection logic can be refined later. For now:
1192            Check to see if the ratio between dirtied bytes and the approx.
1193            amount of bytes that just got transferred since the last time
1194            we were in this routine reaches the threshold. If that happens
1195            twice, start or increase throttling. */
1196 
1197         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1198             (++rs->dirty_rate_high_cnt >= 2)) {
1199             trace_migration_throttle();
1200             rs->dirty_rate_high_cnt = 0;
1201             mig_throttle_guest_down(bytes_dirty_period,
1202                                     bytes_dirty_threshold);
1203         }
1204     }
1205 }
1206 
1207 static void migration_bitmap_sync(RAMState *rs)
1208 {
1209     RAMBlock *block;
1210     int64_t end_time;
1211 
1212     stat64_add(&ram_counters.dirty_sync_count, 1);
1213 
1214     if (!rs->time_last_bitmap_sync) {
1215         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1216     }
1217 
1218     trace_migration_bitmap_sync_start();
1219     memory_global_dirty_log_sync();
1220 
1221     qemu_mutex_lock(&rs->bitmap_mutex);
1222     WITH_RCU_READ_LOCK_GUARD() {
1223         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1224             ramblock_sync_dirty_bitmap(rs, block);
1225         }
1226         ram_counters.remaining = ram_bytes_remaining();
1227     }
1228     qemu_mutex_unlock(&rs->bitmap_mutex);
1229 
1230     memory_global_after_dirty_log_sync();
1231     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1232 
1233     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1234 
1235     /* more than 1 second = 1000 millisecons */
1236     if (end_time > rs->time_last_bitmap_sync + 1000) {
1237         migration_trigger_throttle(rs);
1238 
1239         migration_update_rates(rs, end_time);
1240 
1241         rs->target_page_count_prev = rs->target_page_count;
1242 
1243         /* reset period counters */
1244         rs->time_last_bitmap_sync = end_time;
1245         rs->num_dirty_pages_period = 0;
1246         rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
1247     }
1248     if (migrate_use_events()) {
1249         uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
1250         qapi_event_send_migration_pass(generation);
1251     }
1252 }
1253 
1254 static void migration_bitmap_sync_precopy(RAMState *rs)
1255 {
1256     Error *local_err = NULL;
1257 
1258     /*
1259      * The current notifier usage is just an optimization to migration, so we
1260      * don't stop the normal migration process in the error case.
1261      */
1262     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1263         error_report_err(local_err);
1264         local_err = NULL;
1265     }
1266 
1267     migration_bitmap_sync(rs);
1268 
1269     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1270         error_report_err(local_err);
1271     }
1272 }
1273 
1274 void ram_release_page(const char *rbname, uint64_t offset)
1275 {
1276     if (!migrate_release_ram() || !migration_in_postcopy()) {
1277         return;
1278     }
1279 
1280     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1281 }
1282 
1283 /**
1284  * save_zero_page_to_file: send the zero page to the file
1285  *
1286  * Returns the size of data written to the file, 0 means the page is not
1287  * a zero page
1288  *
1289  * @pss: current PSS channel
1290  * @block: block that contains the page we want to send
1291  * @offset: offset inside the block for the page
1292  */
1293 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1294                                   RAMBlock *block, ram_addr_t offset)
1295 {
1296     uint8_t *p = block->host + offset;
1297     int len = 0;
1298 
1299     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1300         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1301         qemu_put_byte(file, 0);
1302         len += 1;
1303         ram_release_page(block->idstr, offset);
1304     }
1305     return len;
1306 }
1307 
1308 /**
1309  * save_zero_page: send the zero page to the stream
1310  *
1311  * Returns the number of pages written.
1312  *
1313  * @pss: current PSS channel
1314  * @block: block that contains the page we want to send
1315  * @offset: offset inside the block for the page
1316  */
1317 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1318                           ram_addr_t offset)
1319 {
1320     int len = save_zero_page_to_file(pss, f, block, offset);
1321 
1322     if (len) {
1323         stat64_add(&ram_counters.zero_pages, 1);
1324         ram_transferred_add(len);
1325         return 1;
1326     }
1327     return -1;
1328 }
1329 
1330 /*
1331  * @pages: the number of pages written by the control path,
1332  *        < 0 - error
1333  *        > 0 - number of pages written
1334  *
1335  * Return true if the pages has been saved, otherwise false is returned.
1336  */
1337 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1338                               ram_addr_t offset, int *pages)
1339 {
1340     uint64_t bytes_xmit = 0;
1341     int ret;
1342 
1343     *pages = -1;
1344     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1345                                 TARGET_PAGE_SIZE, &bytes_xmit);
1346     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1347         return false;
1348     }
1349 
1350     if (bytes_xmit) {
1351         ram_transferred_add(bytes_xmit);
1352         *pages = 1;
1353     }
1354 
1355     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1356         return true;
1357     }
1358 
1359     if (bytes_xmit > 0) {
1360         stat64_add(&ram_counters.normal_pages, 1);
1361     } else if (bytes_xmit == 0) {
1362         stat64_add(&ram_counters.zero_pages, 1);
1363     }
1364 
1365     return true;
1366 }
1367 
1368 /*
1369  * directly send the page to the stream
1370  *
1371  * Returns the number of pages written.
1372  *
1373  * @pss: current PSS channel
1374  * @block: block that contains the page we want to send
1375  * @offset: offset inside the block for the page
1376  * @buf: the page to be sent
1377  * @async: send to page asyncly
1378  */
1379 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1380                             ram_addr_t offset, uint8_t *buf, bool async)
1381 {
1382     QEMUFile *file = pss->pss_channel;
1383 
1384     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1385                                          offset | RAM_SAVE_FLAG_PAGE));
1386     if (async) {
1387         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1388                               migrate_release_ram() &&
1389                               migration_in_postcopy());
1390     } else {
1391         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1392     }
1393     ram_transferred_add(TARGET_PAGE_SIZE);
1394     stat64_add(&ram_counters.normal_pages, 1);
1395     return 1;
1396 }
1397 
1398 /**
1399  * ram_save_page: send the given page to the stream
1400  *
1401  * Returns the number of pages written.
1402  *          < 0 - error
1403  *          >=0 - Number of pages written - this might legally be 0
1404  *                if xbzrle noticed the page was the same.
1405  *
1406  * @rs: current RAM state
1407  * @block: block that contains the page we want to send
1408  * @offset: offset inside the block for the page
1409  */
1410 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1411 {
1412     int pages = -1;
1413     uint8_t *p;
1414     bool send_async = true;
1415     RAMBlock *block = pss->block;
1416     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1417     ram_addr_t current_addr = block->offset + offset;
1418 
1419     p = block->host + offset;
1420     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1421 
1422     XBZRLE_cache_lock();
1423     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1424         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1425                                  block, offset);
1426         if (!rs->last_stage) {
1427             /* Can't send this cached data async, since the cache page
1428              * might get updated before it gets to the wire
1429              */
1430             send_async = false;
1431         }
1432     }
1433 
1434     /* XBZRLE overflow or normal page */
1435     if (pages == -1) {
1436         pages = save_normal_page(pss, block, offset, p, send_async);
1437     }
1438 
1439     XBZRLE_cache_unlock();
1440 
1441     return pages;
1442 }
1443 
1444 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1445                                  ram_addr_t offset)
1446 {
1447     if (multifd_queue_page(file, block, offset) < 0) {
1448         return -1;
1449     }
1450     stat64_add(&ram_counters.normal_pages, 1);
1451 
1452     return 1;
1453 }
1454 
1455 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1456                                  ram_addr_t offset, uint8_t *source_buf)
1457 {
1458     RAMState *rs = ram_state;
1459     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1460     uint8_t *p = block->host + offset;
1461     int ret;
1462 
1463     if (save_zero_page_to_file(pss, f, block, offset)) {
1464         return true;
1465     }
1466 
1467     save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1468 
1469     /*
1470      * copy it to a internal buffer to avoid it being modified by VM
1471      * so that we can catch up the error during compression and
1472      * decompression
1473      */
1474     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1475     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1476     if (ret < 0) {
1477         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1478         error_report("compressed data failed!");
1479     }
1480     return false;
1481 }
1482 
1483 static void
1484 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1485 {
1486     ram_transferred_add(bytes_xmit);
1487 
1488     if (param->zero_page) {
1489         stat64_add(&ram_counters.zero_pages, 1);
1490         return;
1491     }
1492 
1493     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1494     compression_counters.compressed_size += bytes_xmit - 8;
1495     compression_counters.pages++;
1496 }
1497 
1498 static bool save_page_use_compression(RAMState *rs);
1499 
1500 static void flush_compressed_data(RAMState *rs)
1501 {
1502     MigrationState *ms = migrate_get_current();
1503     int idx, len, thread_count;
1504 
1505     if (!save_page_use_compression(rs)) {
1506         return;
1507     }
1508     thread_count = migrate_compress_threads();
1509 
1510     qemu_mutex_lock(&comp_done_lock);
1511     for (idx = 0; idx < thread_count; idx++) {
1512         while (!comp_param[idx].done) {
1513             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1514         }
1515     }
1516     qemu_mutex_unlock(&comp_done_lock);
1517 
1518     for (idx = 0; idx < thread_count; idx++) {
1519         qemu_mutex_lock(&comp_param[idx].mutex);
1520         if (!comp_param[idx].quit) {
1521             len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1522             /*
1523              * it's safe to fetch zero_page without holding comp_done_lock
1524              * as there is no further request submitted to the thread,
1525              * i.e, the thread should be waiting for a request at this point.
1526              */
1527             update_compress_thread_counts(&comp_param[idx], len);
1528         }
1529         qemu_mutex_unlock(&comp_param[idx].mutex);
1530     }
1531 }
1532 
1533 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1534                                        ram_addr_t offset)
1535 {
1536     param->block = block;
1537     param->offset = offset;
1538 }
1539 
1540 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1541 {
1542     int idx, thread_count, bytes_xmit = -1, pages = -1;
1543     bool wait = migrate_compress_wait_thread();
1544     MigrationState *ms = migrate_get_current();
1545 
1546     thread_count = migrate_compress_threads();
1547     qemu_mutex_lock(&comp_done_lock);
1548 retry:
1549     for (idx = 0; idx < thread_count; idx++) {
1550         if (comp_param[idx].done) {
1551             comp_param[idx].done = false;
1552             bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1553                                             comp_param[idx].file);
1554             qemu_mutex_lock(&comp_param[idx].mutex);
1555             set_compress_params(&comp_param[idx], block, offset);
1556             qemu_cond_signal(&comp_param[idx].cond);
1557             qemu_mutex_unlock(&comp_param[idx].mutex);
1558             pages = 1;
1559             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1560             break;
1561         }
1562     }
1563 
1564     /*
1565      * wait for the free thread if the user specifies 'compress-wait-thread',
1566      * otherwise we will post the page out in the main thread as normal page.
1567      */
1568     if (pages < 0 && wait) {
1569         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1570         goto retry;
1571     }
1572     qemu_mutex_unlock(&comp_done_lock);
1573 
1574     return pages;
1575 }
1576 
1577 #define PAGE_ALL_CLEAN 0
1578 #define PAGE_TRY_AGAIN 1
1579 #define PAGE_DIRTY_FOUND 2
1580 /**
1581  * find_dirty_block: find the next dirty page and update any state
1582  * associated with the search process.
1583  *
1584  * Returns:
1585  *         PAGE_ALL_CLEAN: no dirty page found, give up
1586  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1587  *         PAGE_DIRTY_FOUND: dirty page found
1588  *
1589  * @rs: current RAM state
1590  * @pss: data about the state of the current dirty page scan
1591  * @again: set to false if the search has scanned the whole of RAM
1592  */
1593 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1594 {
1595     /* Update pss->page for the next dirty bit in ramblock */
1596     pss_find_next_dirty(pss);
1597 
1598     if (pss->complete_round && pss->block == rs->last_seen_block &&
1599         pss->page >= rs->last_page) {
1600         /*
1601          * We've been once around the RAM and haven't found anything.
1602          * Give up.
1603          */
1604         return PAGE_ALL_CLEAN;
1605     }
1606     if (!offset_in_ramblock(pss->block,
1607                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1608         /* Didn't find anything in this RAM Block */
1609         pss->page = 0;
1610         pss->block = QLIST_NEXT_RCU(pss->block, next);
1611         if (!pss->block) {
1612             /*
1613              * If memory migration starts over, we will meet a dirtied page
1614              * which may still exists in compression threads's ring, so we
1615              * should flush the compressed data to make sure the new page
1616              * is not overwritten by the old one in the destination.
1617              *
1618              * Also If xbzrle is on, stop using the data compression at this
1619              * point. In theory, xbzrle can do better than compression.
1620              */
1621             flush_compressed_data(rs);
1622 
1623             /* Hit the end of the list */
1624             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1625             /* Flag that we've looped */
1626             pss->complete_round = true;
1627             /* After the first round, enable XBZRLE. */
1628             if (migrate_use_xbzrle()) {
1629                 rs->xbzrle_enabled = true;
1630             }
1631         }
1632         /* Didn't find anything this time, but try again on the new block */
1633         return PAGE_TRY_AGAIN;
1634     } else {
1635         /* We've found something */
1636         return PAGE_DIRTY_FOUND;
1637     }
1638 }
1639 
1640 /**
1641  * unqueue_page: gets a page of the queue
1642  *
1643  * Helper for 'get_queued_page' - gets a page off the queue
1644  *
1645  * Returns the block of the page (or NULL if none available)
1646  *
1647  * @rs: current RAM state
1648  * @offset: used to return the offset within the RAMBlock
1649  */
1650 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1651 {
1652     struct RAMSrcPageRequest *entry;
1653     RAMBlock *block = NULL;
1654 
1655     if (!postcopy_has_request(rs)) {
1656         return NULL;
1657     }
1658 
1659     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1660 
1661     /*
1662      * This should _never_ change even after we take the lock, because no one
1663      * should be taking anything off the request list other than us.
1664      */
1665     assert(postcopy_has_request(rs));
1666 
1667     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1668     block = entry->rb;
1669     *offset = entry->offset;
1670 
1671     if (entry->len > TARGET_PAGE_SIZE) {
1672         entry->len -= TARGET_PAGE_SIZE;
1673         entry->offset += TARGET_PAGE_SIZE;
1674     } else {
1675         memory_region_unref(block->mr);
1676         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1677         g_free(entry);
1678         migration_consume_urgent_request();
1679     }
1680 
1681     return block;
1682 }
1683 
1684 #if defined(__linux__)
1685 /**
1686  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1687  *   is found, return RAM block pointer and page offset
1688  *
1689  * Returns pointer to the RAMBlock containing faulting page,
1690  *   NULL if no write faults are pending
1691  *
1692  * @rs: current RAM state
1693  * @offset: page offset from the beginning of the block
1694  */
1695 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1696 {
1697     struct uffd_msg uffd_msg;
1698     void *page_address;
1699     RAMBlock *block;
1700     int res;
1701 
1702     if (!migrate_background_snapshot()) {
1703         return NULL;
1704     }
1705 
1706     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1707     if (res <= 0) {
1708         return NULL;
1709     }
1710 
1711     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1712     block = qemu_ram_block_from_host(page_address, false, offset);
1713     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1714     return block;
1715 }
1716 
1717 /**
1718  * ram_save_release_protection: release UFFD write protection after
1719  *   a range of pages has been saved
1720  *
1721  * @rs: current RAM state
1722  * @pss: page-search-status structure
1723  * @start_page: index of the first page in the range relative to pss->block
1724  *
1725  * Returns 0 on success, negative value in case of an error
1726 */
1727 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1728         unsigned long start_page)
1729 {
1730     int res = 0;
1731 
1732     /* Check if page is from UFFD-managed region. */
1733     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1734         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1735         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1736 
1737         /* Flush async buffers before un-protect. */
1738         qemu_fflush(pss->pss_channel);
1739         /* Un-protect memory range. */
1740         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1741                 false, false);
1742     }
1743 
1744     return res;
1745 }
1746 
1747 /* ram_write_tracking_available: check if kernel supports required UFFD features
1748  *
1749  * Returns true if supports, false otherwise
1750  */
1751 bool ram_write_tracking_available(void)
1752 {
1753     uint64_t uffd_features;
1754     int res;
1755 
1756     res = uffd_query_features(&uffd_features);
1757     return (res == 0 &&
1758             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1759 }
1760 
1761 /* ram_write_tracking_compatible: check if guest configuration is
1762  *   compatible with 'write-tracking'
1763  *
1764  * Returns true if compatible, false otherwise
1765  */
1766 bool ram_write_tracking_compatible(void)
1767 {
1768     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1769     int uffd_fd;
1770     RAMBlock *block;
1771     bool ret = false;
1772 
1773     /* Open UFFD file descriptor */
1774     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1775     if (uffd_fd < 0) {
1776         return false;
1777     }
1778 
1779     RCU_READ_LOCK_GUARD();
1780 
1781     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1782         uint64_t uffd_ioctls;
1783 
1784         /* Nothing to do with read-only and MMIO-writable regions */
1785         if (block->mr->readonly || block->mr->rom_device) {
1786             continue;
1787         }
1788         /* Try to register block memory via UFFD-IO to track writes */
1789         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1790                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1791             goto out;
1792         }
1793         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1794             goto out;
1795         }
1796     }
1797     ret = true;
1798 
1799 out:
1800     uffd_close_fd(uffd_fd);
1801     return ret;
1802 }
1803 
1804 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1805                                        ram_addr_t size)
1806 {
1807     const ram_addr_t end = offset + size;
1808 
1809     /*
1810      * We read one byte of each page; this will preallocate page tables if
1811      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1812      * where no page was populated yet. This might require adaption when
1813      * supporting other mappings, like shmem.
1814      */
1815     for (; offset < end; offset += block->page_size) {
1816         char tmp = *((char *)block->host + offset);
1817 
1818         /* Don't optimize the read out */
1819         asm volatile("" : "+r" (tmp));
1820     }
1821 }
1822 
1823 static inline int populate_read_section(MemoryRegionSection *section,
1824                                         void *opaque)
1825 {
1826     const hwaddr size = int128_get64(section->size);
1827     hwaddr offset = section->offset_within_region;
1828     RAMBlock *block = section->mr->ram_block;
1829 
1830     populate_read_range(block, offset, size);
1831     return 0;
1832 }
1833 
1834 /*
1835  * ram_block_populate_read: preallocate page tables and populate pages in the
1836  *   RAM block by reading a byte of each page.
1837  *
1838  * Since it's solely used for userfault_fd WP feature, here we just
1839  *   hardcode page size to qemu_real_host_page_size.
1840  *
1841  * @block: RAM block to populate
1842  */
1843 static void ram_block_populate_read(RAMBlock *rb)
1844 {
1845     /*
1846      * Skip populating all pages that fall into a discarded range as managed by
1847      * a RamDiscardManager responsible for the mapped memory region of the
1848      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1849      * must not get populated automatically. We don't have to track
1850      * modifications via userfaultfd WP reliably, because these pages will
1851      * not be part of the migration stream either way -- see
1852      * ramblock_dirty_bitmap_exclude_discarded_pages().
1853      *
1854      * Note: The result is only stable while migrating (precopy/postcopy).
1855      */
1856     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1857         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1858         MemoryRegionSection section = {
1859             .mr = rb->mr,
1860             .offset_within_region = 0,
1861             .size = rb->mr->size,
1862         };
1863 
1864         ram_discard_manager_replay_populated(rdm, &section,
1865                                              populate_read_section, NULL);
1866     } else {
1867         populate_read_range(rb, 0, rb->used_length);
1868     }
1869 }
1870 
1871 /*
1872  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1873  */
1874 void ram_write_tracking_prepare(void)
1875 {
1876     RAMBlock *block;
1877 
1878     RCU_READ_LOCK_GUARD();
1879 
1880     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1881         /* Nothing to do with read-only and MMIO-writable regions */
1882         if (block->mr->readonly || block->mr->rom_device) {
1883             continue;
1884         }
1885 
1886         /*
1887          * Populate pages of the RAM block before enabling userfault_fd
1888          * write protection.
1889          *
1890          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1891          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1892          * pages with pte_none() entries in page table.
1893          */
1894         ram_block_populate_read(block);
1895     }
1896 }
1897 
1898 static inline int uffd_protect_section(MemoryRegionSection *section,
1899                                        void *opaque)
1900 {
1901     const hwaddr size = int128_get64(section->size);
1902     const hwaddr offset = section->offset_within_region;
1903     RAMBlock *rb = section->mr->ram_block;
1904     int uffd_fd = (uintptr_t)opaque;
1905 
1906     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1907                                   false);
1908 }
1909 
1910 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1911 {
1912     assert(rb->flags & RAM_UF_WRITEPROTECT);
1913 
1914     /* See ram_block_populate_read() */
1915     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1916         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1917         MemoryRegionSection section = {
1918             .mr = rb->mr,
1919             .offset_within_region = 0,
1920             .size = rb->mr->size,
1921         };
1922 
1923         return ram_discard_manager_replay_populated(rdm, &section,
1924                                                     uffd_protect_section,
1925                                                     (void *)(uintptr_t)uffd_fd);
1926     }
1927     return uffd_change_protection(uffd_fd, rb->host,
1928                                   rb->used_length, true, false);
1929 }
1930 
1931 /*
1932  * ram_write_tracking_start: start UFFD-WP memory tracking
1933  *
1934  * Returns 0 for success or negative value in case of error
1935  */
1936 int ram_write_tracking_start(void)
1937 {
1938     int uffd_fd;
1939     RAMState *rs = ram_state;
1940     RAMBlock *block;
1941 
1942     /* Open UFFD file descriptor */
1943     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1944     if (uffd_fd < 0) {
1945         return uffd_fd;
1946     }
1947     rs->uffdio_fd = uffd_fd;
1948 
1949     RCU_READ_LOCK_GUARD();
1950 
1951     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1952         /* Nothing to do with read-only and MMIO-writable regions */
1953         if (block->mr->readonly || block->mr->rom_device) {
1954             continue;
1955         }
1956 
1957         /* Register block memory with UFFD to track writes */
1958         if (uffd_register_memory(rs->uffdio_fd, block->host,
1959                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1960             goto fail;
1961         }
1962         block->flags |= RAM_UF_WRITEPROTECT;
1963         memory_region_ref(block->mr);
1964 
1965         /* Apply UFFD write protection to the block memory range */
1966         if (ram_block_uffd_protect(block, uffd_fd)) {
1967             goto fail;
1968         }
1969 
1970         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1971                 block->host, block->max_length);
1972     }
1973 
1974     return 0;
1975 
1976 fail:
1977     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1978 
1979     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1980         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1981             continue;
1982         }
1983         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1984         /* Cleanup flags and remove reference */
1985         block->flags &= ~RAM_UF_WRITEPROTECT;
1986         memory_region_unref(block->mr);
1987     }
1988 
1989     uffd_close_fd(uffd_fd);
1990     rs->uffdio_fd = -1;
1991     return -1;
1992 }
1993 
1994 /**
1995  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1996  */
1997 void ram_write_tracking_stop(void)
1998 {
1999     RAMState *rs = ram_state;
2000     RAMBlock *block;
2001 
2002     RCU_READ_LOCK_GUARD();
2003 
2004     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2005         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2006             continue;
2007         }
2008         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2009 
2010         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2011                 block->host, block->max_length);
2012 
2013         /* Cleanup flags and remove reference */
2014         block->flags &= ~RAM_UF_WRITEPROTECT;
2015         memory_region_unref(block->mr);
2016     }
2017 
2018     /* Finally close UFFD file descriptor */
2019     uffd_close_fd(rs->uffdio_fd);
2020     rs->uffdio_fd = -1;
2021 }
2022 
2023 #else
2024 /* No target OS support, stubs just fail or ignore */
2025 
2026 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2027 {
2028     (void) rs;
2029     (void) offset;
2030 
2031     return NULL;
2032 }
2033 
2034 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2035         unsigned long start_page)
2036 {
2037     (void) rs;
2038     (void) pss;
2039     (void) start_page;
2040 
2041     return 0;
2042 }
2043 
2044 bool ram_write_tracking_available(void)
2045 {
2046     return false;
2047 }
2048 
2049 bool ram_write_tracking_compatible(void)
2050 {
2051     assert(0);
2052     return false;
2053 }
2054 
2055 int ram_write_tracking_start(void)
2056 {
2057     assert(0);
2058     return -1;
2059 }
2060 
2061 void ram_write_tracking_stop(void)
2062 {
2063     assert(0);
2064 }
2065 #endif /* defined(__linux__) */
2066 
2067 /**
2068  * get_queued_page: unqueue a page from the postcopy requests
2069  *
2070  * Skips pages that are already sent (!dirty)
2071  *
2072  * Returns true if a queued page is found
2073  *
2074  * @rs: current RAM state
2075  * @pss: data about the state of the current dirty page scan
2076  */
2077 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2078 {
2079     RAMBlock  *block;
2080     ram_addr_t offset;
2081     bool dirty;
2082 
2083     do {
2084         block = unqueue_page(rs, &offset);
2085         /*
2086          * We're sending this page, and since it's postcopy nothing else
2087          * will dirty it, and we must make sure it doesn't get sent again
2088          * even if this queue request was received after the background
2089          * search already sent it.
2090          */
2091         if (block) {
2092             unsigned long page;
2093 
2094             page = offset >> TARGET_PAGE_BITS;
2095             dirty = test_bit(page, block->bmap);
2096             if (!dirty) {
2097                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2098                                                 page);
2099             } else {
2100                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2101             }
2102         }
2103 
2104     } while (block && !dirty);
2105 
2106     if (!block) {
2107         /*
2108          * Poll write faults too if background snapshot is enabled; that's
2109          * when we have vcpus got blocked by the write protected pages.
2110          */
2111         block = poll_fault_page(rs, &offset);
2112     }
2113 
2114     if (block) {
2115         /*
2116          * We want the background search to continue from the queued page
2117          * since the guest is likely to want other pages near to the page
2118          * it just requested.
2119          */
2120         pss->block = block;
2121         pss->page = offset >> TARGET_PAGE_BITS;
2122 
2123         /*
2124          * This unqueued page would break the "one round" check, even is
2125          * really rare.
2126          */
2127         pss->complete_round = false;
2128     }
2129 
2130     return !!block;
2131 }
2132 
2133 /**
2134  * migration_page_queue_free: drop any remaining pages in the ram
2135  * request queue
2136  *
2137  * It should be empty at the end anyway, but in error cases there may
2138  * be some left.  in case that there is any page left, we drop it.
2139  *
2140  */
2141 static void migration_page_queue_free(RAMState *rs)
2142 {
2143     struct RAMSrcPageRequest *mspr, *next_mspr;
2144     /* This queue generally should be empty - but in the case of a failed
2145      * migration might have some droppings in.
2146      */
2147     RCU_READ_LOCK_GUARD();
2148     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2149         memory_region_unref(mspr->rb->mr);
2150         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2151         g_free(mspr);
2152     }
2153 }
2154 
2155 /**
2156  * ram_save_queue_pages: queue the page for transmission
2157  *
2158  * A request from postcopy destination for example.
2159  *
2160  * Returns zero on success or negative on error
2161  *
2162  * @rbname: Name of the RAMBLock of the request. NULL means the
2163  *          same that last one.
2164  * @start: starting address from the start of the RAMBlock
2165  * @len: length (in bytes) to send
2166  */
2167 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2168 {
2169     RAMBlock *ramblock;
2170     RAMState *rs = ram_state;
2171 
2172     stat64_add(&ram_counters.postcopy_requests, 1);
2173     RCU_READ_LOCK_GUARD();
2174 
2175     if (!rbname) {
2176         /* Reuse last RAMBlock */
2177         ramblock = rs->last_req_rb;
2178 
2179         if (!ramblock) {
2180             /*
2181              * Shouldn't happen, we can't reuse the last RAMBlock if
2182              * it's the 1st request.
2183              */
2184             error_report("ram_save_queue_pages no previous block");
2185             return -1;
2186         }
2187     } else {
2188         ramblock = qemu_ram_block_by_name(rbname);
2189 
2190         if (!ramblock) {
2191             /* We shouldn't be asked for a non-existent RAMBlock */
2192             error_report("ram_save_queue_pages no block '%s'", rbname);
2193             return -1;
2194         }
2195         rs->last_req_rb = ramblock;
2196     }
2197     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2198     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2199         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2200                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2201                      __func__, start, len, ramblock->used_length);
2202         return -1;
2203     }
2204 
2205     /*
2206      * When with postcopy preempt, we send back the page directly in the
2207      * rp-return thread.
2208      */
2209     if (postcopy_preempt_active()) {
2210         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2211         size_t page_size = qemu_ram_pagesize(ramblock);
2212         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2213         int ret = 0;
2214 
2215         qemu_mutex_lock(&rs->bitmap_mutex);
2216 
2217         pss_init(pss, ramblock, page_start);
2218         /*
2219          * Always use the preempt channel, and make sure it's there.  It's
2220          * safe to access without lock, because when rp-thread is running
2221          * we should be the only one who operates on the qemufile
2222          */
2223         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2224         assert(pss->pss_channel);
2225 
2226         /*
2227          * It must be either one or multiple of host page size.  Just
2228          * assert; if something wrong we're mostly split brain anyway.
2229          */
2230         assert(len % page_size == 0);
2231         while (len) {
2232             if (ram_save_host_page_urgent(pss)) {
2233                 error_report("%s: ram_save_host_page_urgent() failed: "
2234                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2235                              __func__, ramblock->idstr, start);
2236                 ret = -1;
2237                 break;
2238             }
2239             /*
2240              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2241              * will automatically be moved and point to the next host page
2242              * we're going to send, so no need to update here.
2243              *
2244              * Normally QEMU never sends >1 host page in requests, so
2245              * logically we don't even need that as the loop should only
2246              * run once, but just to be consistent.
2247              */
2248             len -= page_size;
2249         };
2250         qemu_mutex_unlock(&rs->bitmap_mutex);
2251 
2252         return ret;
2253     }
2254 
2255     struct RAMSrcPageRequest *new_entry =
2256         g_new0(struct RAMSrcPageRequest, 1);
2257     new_entry->rb = ramblock;
2258     new_entry->offset = start;
2259     new_entry->len = len;
2260 
2261     memory_region_ref(ramblock->mr);
2262     qemu_mutex_lock(&rs->src_page_req_mutex);
2263     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2264     migration_make_urgent_request();
2265     qemu_mutex_unlock(&rs->src_page_req_mutex);
2266 
2267     return 0;
2268 }
2269 
2270 static bool save_page_use_compression(RAMState *rs)
2271 {
2272     if (!migrate_use_compression()) {
2273         return false;
2274     }
2275 
2276     /*
2277      * If xbzrle is enabled (e.g., after first round of migration), stop
2278      * using the data compression. In theory, xbzrle can do better than
2279      * compression.
2280      */
2281     if (rs->xbzrle_enabled) {
2282         return false;
2283     }
2284 
2285     return true;
2286 }
2287 
2288 /*
2289  * try to compress the page before posting it out, return true if the page
2290  * has been properly handled by compression, otherwise needs other
2291  * paths to handle it
2292  */
2293 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2294                                RAMBlock *block, ram_addr_t offset)
2295 {
2296     if (!save_page_use_compression(rs)) {
2297         return false;
2298     }
2299 
2300     /*
2301      * When starting the process of a new block, the first page of
2302      * the block should be sent out before other pages in the same
2303      * block, and all the pages in last block should have been sent
2304      * out, keeping this order is important, because the 'cont' flag
2305      * is used to avoid resending the block name.
2306      *
2307      * We post the fist page as normal page as compression will take
2308      * much CPU resource.
2309      */
2310     if (block != pss->last_sent_block) {
2311         flush_compressed_data(rs);
2312         return false;
2313     }
2314 
2315     if (compress_page_with_multi_thread(block, offset) > 0) {
2316         return true;
2317     }
2318 
2319     compression_counters.busy++;
2320     return false;
2321 }
2322 
2323 /**
2324  * ram_save_target_page_legacy: save one target page
2325  *
2326  * Returns the number of pages written
2327  *
2328  * @rs: current RAM state
2329  * @pss: data about the page we want to send
2330  */
2331 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2332 {
2333     RAMBlock *block = pss->block;
2334     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2335     int res;
2336 
2337     if (control_save_page(pss, block, offset, &res)) {
2338         return res;
2339     }
2340 
2341     if (save_compress_page(rs, pss, block, offset)) {
2342         return 1;
2343     }
2344 
2345     res = save_zero_page(pss, pss->pss_channel, block, offset);
2346     if (res > 0) {
2347         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2348          * page would be stale
2349          */
2350         if (rs->xbzrle_enabled) {
2351             XBZRLE_cache_lock();
2352             xbzrle_cache_zero_page(rs, block->offset + offset);
2353             XBZRLE_cache_unlock();
2354         }
2355         return res;
2356     }
2357 
2358     /*
2359      * Do not use multifd in postcopy as one whole host page should be
2360      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2361      * if host page size == guest page size the dest guest during run may
2362      * still see partially copied pages which is data corruption.
2363      */
2364     if (migrate_use_multifd() && !migration_in_postcopy()) {
2365         return ram_save_multifd_page(pss->pss_channel, block, offset);
2366     }
2367 
2368     return ram_save_page(rs, pss);
2369 }
2370 
2371 /* Should be called before sending a host page */
2372 static void pss_host_page_prepare(PageSearchStatus *pss)
2373 {
2374     /* How many guest pages are there in one host page? */
2375     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2376 
2377     pss->host_page_sending = true;
2378     if (guest_pfns <= 1) {
2379         /*
2380          * This covers both when guest psize == host psize, or when guest
2381          * has larger psize than the host (guest_pfns==0).
2382          *
2383          * For the latter, we always send one whole guest page per
2384          * iteration of the host page (example: an Alpha VM on x86 host
2385          * will have guest psize 8K while host psize 4K).
2386          */
2387         pss->host_page_start = pss->page;
2388         pss->host_page_end = pss->page + 1;
2389     } else {
2390         /*
2391          * The host page spans over multiple guest pages, we send them
2392          * within the same host page iteration.
2393          */
2394         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2395         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2396     }
2397 }
2398 
2399 /*
2400  * Whether the page pointed by PSS is within the host page being sent.
2401  * Must be called after a previous pss_host_page_prepare().
2402  */
2403 static bool pss_within_range(PageSearchStatus *pss)
2404 {
2405     ram_addr_t ram_addr;
2406 
2407     assert(pss->host_page_sending);
2408 
2409     /* Over host-page boundary? */
2410     if (pss->page >= pss->host_page_end) {
2411         return false;
2412     }
2413 
2414     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2415 
2416     return offset_in_ramblock(pss->block, ram_addr);
2417 }
2418 
2419 static void pss_host_page_finish(PageSearchStatus *pss)
2420 {
2421     pss->host_page_sending = false;
2422     /* This is not needed, but just to reset it */
2423     pss->host_page_start = pss->host_page_end = 0;
2424 }
2425 
2426 /*
2427  * Send an urgent host page specified by `pss'.  Need to be called with
2428  * bitmap_mutex held.
2429  *
2430  * Returns 0 if save host page succeeded, false otherwise.
2431  */
2432 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2433 {
2434     bool page_dirty, sent = false;
2435     RAMState *rs = ram_state;
2436     int ret = 0;
2437 
2438     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2439     pss_host_page_prepare(pss);
2440 
2441     /*
2442      * If precopy is sending the same page, let it be done in precopy, or
2443      * we could send the same page in two channels and none of them will
2444      * receive the whole page.
2445      */
2446     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2447         trace_postcopy_preempt_hit(pss->block->idstr,
2448                                    pss->page << TARGET_PAGE_BITS);
2449         return 0;
2450     }
2451 
2452     do {
2453         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2454 
2455         if (page_dirty) {
2456             /* Be strict to return code; it must be 1, or what else? */
2457             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2458                 error_report_once("%s: ram_save_target_page failed", __func__);
2459                 ret = -1;
2460                 goto out;
2461             }
2462             sent = true;
2463         }
2464         pss_find_next_dirty(pss);
2465     } while (pss_within_range(pss));
2466 out:
2467     pss_host_page_finish(pss);
2468     /* For urgent requests, flush immediately if sent */
2469     if (sent) {
2470         qemu_fflush(pss->pss_channel);
2471     }
2472     return ret;
2473 }
2474 
2475 /**
2476  * ram_save_host_page: save a whole host page
2477  *
2478  * Starting at *offset send pages up to the end of the current host
2479  * page. It's valid for the initial offset to point into the middle of
2480  * a host page in which case the remainder of the hostpage is sent.
2481  * Only dirty target pages are sent. Note that the host page size may
2482  * be a huge page for this block.
2483  *
2484  * The saving stops at the boundary of the used_length of the block
2485  * if the RAMBlock isn't a multiple of the host page size.
2486  *
2487  * The caller must be with ram_state.bitmap_mutex held to call this
2488  * function.  Note that this function can temporarily release the lock, but
2489  * when the function is returned it'll make sure the lock is still held.
2490  *
2491  * Returns the number of pages written or negative on error
2492  *
2493  * @rs: current RAM state
2494  * @pss: data about the page we want to send
2495  */
2496 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2497 {
2498     bool page_dirty, preempt_active = postcopy_preempt_active();
2499     int tmppages, pages = 0;
2500     size_t pagesize_bits =
2501         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2502     unsigned long start_page = pss->page;
2503     int res;
2504 
2505     if (ramblock_is_ignored(pss->block)) {
2506         error_report("block %s should not be migrated !", pss->block->idstr);
2507         return 0;
2508     }
2509 
2510     /* Update host page boundary information */
2511     pss_host_page_prepare(pss);
2512 
2513     do {
2514         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2515 
2516         /* Check the pages is dirty and if it is send it */
2517         if (page_dirty) {
2518             /*
2519              * Properly yield the lock only in postcopy preempt mode
2520              * because both migration thread and rp-return thread can
2521              * operate on the bitmaps.
2522              */
2523             if (preempt_active) {
2524                 qemu_mutex_unlock(&rs->bitmap_mutex);
2525             }
2526             tmppages = migration_ops->ram_save_target_page(rs, pss);
2527             if (tmppages >= 0) {
2528                 pages += tmppages;
2529                 /*
2530                  * Allow rate limiting to happen in the middle of huge pages if
2531                  * something is sent in the current iteration.
2532                  */
2533                 if (pagesize_bits > 1 && tmppages > 0) {
2534                     migration_rate_limit();
2535                 }
2536             }
2537             if (preempt_active) {
2538                 qemu_mutex_lock(&rs->bitmap_mutex);
2539             }
2540         } else {
2541             tmppages = 0;
2542         }
2543 
2544         if (tmppages < 0) {
2545             pss_host_page_finish(pss);
2546             return tmppages;
2547         }
2548 
2549         pss_find_next_dirty(pss);
2550     } while (pss_within_range(pss));
2551 
2552     pss_host_page_finish(pss);
2553 
2554     res = ram_save_release_protection(rs, pss, start_page);
2555     return (res < 0 ? res : pages);
2556 }
2557 
2558 /**
2559  * ram_find_and_save_block: finds a dirty page and sends it to f
2560  *
2561  * Called within an RCU critical section.
2562  *
2563  * Returns the number of pages written where zero means no dirty pages,
2564  * or negative on error
2565  *
2566  * @rs: current RAM state
2567  *
2568  * On systems where host-page-size > target-page-size it will send all the
2569  * pages in a host page that are dirty.
2570  */
2571 static int ram_find_and_save_block(RAMState *rs)
2572 {
2573     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2574     int pages = 0;
2575 
2576     /* No dirty page as there is zero RAM */
2577     if (!rs->ram_bytes_total) {
2578         return pages;
2579     }
2580 
2581     /*
2582      * Always keep last_seen_block/last_page valid during this procedure,
2583      * because find_dirty_block() relies on these values (e.g., we compare
2584      * last_seen_block with pss.block to see whether we searched all the
2585      * ramblocks) to detect the completion of migration.  Having NULL value
2586      * of last_seen_block can conditionally cause below loop to run forever.
2587      */
2588     if (!rs->last_seen_block) {
2589         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2590         rs->last_page = 0;
2591     }
2592 
2593     pss_init(pss, rs->last_seen_block, rs->last_page);
2594 
2595     while (true){
2596         if (!get_queued_page(rs, pss)) {
2597             /* priority queue empty, so just search for something dirty */
2598             int res = find_dirty_block(rs, pss);
2599             if (res != PAGE_DIRTY_FOUND) {
2600                 if (res == PAGE_ALL_CLEAN) {
2601                     break;
2602                 } else if (res == PAGE_TRY_AGAIN) {
2603                     continue;
2604                 }
2605             }
2606         }
2607         pages = ram_save_host_page(rs, pss);
2608         if (pages) {
2609             break;
2610         }
2611     }
2612 
2613     rs->last_seen_block = pss->block;
2614     rs->last_page = pss->page;
2615 
2616     return pages;
2617 }
2618 
2619 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2620 {
2621     uint64_t pages = size / TARGET_PAGE_SIZE;
2622 
2623     if (zero) {
2624         stat64_add(&ram_counters.zero_pages, pages);
2625     } else {
2626         stat64_add(&ram_counters.normal_pages, pages);
2627         ram_transferred_add(size);
2628         qemu_file_credit_transfer(f, size);
2629     }
2630 }
2631 
2632 static uint64_t ram_bytes_total_with_ignored(void)
2633 {
2634     RAMBlock *block;
2635     uint64_t total = 0;
2636 
2637     RCU_READ_LOCK_GUARD();
2638 
2639     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2640         total += block->used_length;
2641     }
2642     return total;
2643 }
2644 
2645 uint64_t ram_bytes_total(void)
2646 {
2647     RAMBlock *block;
2648     uint64_t total = 0;
2649 
2650     RCU_READ_LOCK_GUARD();
2651 
2652     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2653         total += block->used_length;
2654     }
2655     return total;
2656 }
2657 
2658 static void xbzrle_load_setup(void)
2659 {
2660     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2661 }
2662 
2663 static void xbzrle_load_cleanup(void)
2664 {
2665     g_free(XBZRLE.decoded_buf);
2666     XBZRLE.decoded_buf = NULL;
2667 }
2668 
2669 static void ram_state_cleanup(RAMState **rsp)
2670 {
2671     if (*rsp) {
2672         migration_page_queue_free(*rsp);
2673         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2674         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2675         g_free(*rsp);
2676         *rsp = NULL;
2677     }
2678 }
2679 
2680 static void xbzrle_cleanup(void)
2681 {
2682     XBZRLE_cache_lock();
2683     if (XBZRLE.cache) {
2684         cache_fini(XBZRLE.cache);
2685         g_free(XBZRLE.encoded_buf);
2686         g_free(XBZRLE.current_buf);
2687         g_free(XBZRLE.zero_target_page);
2688         XBZRLE.cache = NULL;
2689         XBZRLE.encoded_buf = NULL;
2690         XBZRLE.current_buf = NULL;
2691         XBZRLE.zero_target_page = NULL;
2692     }
2693     XBZRLE_cache_unlock();
2694 }
2695 
2696 static void ram_save_cleanup(void *opaque)
2697 {
2698     RAMState **rsp = opaque;
2699     RAMBlock *block;
2700 
2701     /* We don't use dirty log with background snapshots */
2702     if (!migrate_background_snapshot()) {
2703         /* caller have hold iothread lock or is in a bh, so there is
2704          * no writing race against the migration bitmap
2705          */
2706         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2707             /*
2708              * do not stop dirty log without starting it, since
2709              * memory_global_dirty_log_stop will assert that
2710              * memory_global_dirty_log_start/stop used in pairs
2711              */
2712             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2713         }
2714     }
2715 
2716     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2717         g_free(block->clear_bmap);
2718         block->clear_bmap = NULL;
2719         g_free(block->bmap);
2720         block->bmap = NULL;
2721     }
2722 
2723     xbzrle_cleanup();
2724     compress_threads_save_cleanup();
2725     ram_state_cleanup(rsp);
2726     g_free(migration_ops);
2727     migration_ops = NULL;
2728 }
2729 
2730 static void ram_state_reset(RAMState *rs)
2731 {
2732     int i;
2733 
2734     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2735         rs->pss[i].last_sent_block = NULL;
2736     }
2737 
2738     rs->last_seen_block = NULL;
2739     rs->last_page = 0;
2740     rs->last_version = ram_list.version;
2741     rs->xbzrle_enabled = false;
2742 }
2743 
2744 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2745 
2746 /* **** functions for postcopy ***** */
2747 
2748 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2749 {
2750     struct RAMBlock *block;
2751 
2752     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2753         unsigned long *bitmap = block->bmap;
2754         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2755         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2756 
2757         while (run_start < range) {
2758             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2759             ram_discard_range(block->idstr,
2760                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2761                               ((ram_addr_t)(run_end - run_start))
2762                                 << TARGET_PAGE_BITS);
2763             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2764         }
2765     }
2766 }
2767 
2768 /**
2769  * postcopy_send_discard_bm_ram: discard a RAMBlock
2770  *
2771  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2772  *
2773  * @ms: current migration state
2774  * @block: RAMBlock to discard
2775  */
2776 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2777 {
2778     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2779     unsigned long current;
2780     unsigned long *bitmap = block->bmap;
2781 
2782     for (current = 0; current < end; ) {
2783         unsigned long one = find_next_bit(bitmap, end, current);
2784         unsigned long zero, discard_length;
2785 
2786         if (one >= end) {
2787             break;
2788         }
2789 
2790         zero = find_next_zero_bit(bitmap, end, one + 1);
2791 
2792         if (zero >= end) {
2793             discard_length = end - one;
2794         } else {
2795             discard_length = zero - one;
2796         }
2797         postcopy_discard_send_range(ms, one, discard_length);
2798         current = one + discard_length;
2799     }
2800 }
2801 
2802 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2803 
2804 /**
2805  * postcopy_each_ram_send_discard: discard all RAMBlocks
2806  *
2807  * Utility for the outgoing postcopy code.
2808  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2809  *   passing it bitmap indexes and name.
2810  * (qemu_ram_foreach_block ends up passing unscaled lengths
2811  *  which would mean postcopy code would have to deal with target page)
2812  *
2813  * @ms: current migration state
2814  */
2815 static void postcopy_each_ram_send_discard(MigrationState *ms)
2816 {
2817     struct RAMBlock *block;
2818 
2819     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2820         postcopy_discard_send_init(ms, block->idstr);
2821 
2822         /*
2823          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2824          * host-page size chunks, mark any partially dirty host-page size
2825          * chunks as all dirty.  In this case the host-page is the host-page
2826          * for the particular RAMBlock, i.e. it might be a huge page.
2827          */
2828         postcopy_chunk_hostpages_pass(ms, block);
2829 
2830         /*
2831          * Postcopy sends chunks of bitmap over the wire, but it
2832          * just needs indexes at this point, avoids it having
2833          * target page specific code.
2834          */
2835         postcopy_send_discard_bm_ram(ms, block);
2836         postcopy_discard_send_finish(ms);
2837     }
2838 }
2839 
2840 /**
2841  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2842  *
2843  * Helper for postcopy_chunk_hostpages; it's called twice to
2844  * canonicalize the two bitmaps, that are similar, but one is
2845  * inverted.
2846  *
2847  * Postcopy requires that all target pages in a hostpage are dirty or
2848  * clean, not a mix.  This function canonicalizes the bitmaps.
2849  *
2850  * @ms: current migration state
2851  * @block: block that contains the page we want to canonicalize
2852  */
2853 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2854 {
2855     RAMState *rs = ram_state;
2856     unsigned long *bitmap = block->bmap;
2857     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2858     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2859     unsigned long run_start;
2860 
2861     if (block->page_size == TARGET_PAGE_SIZE) {
2862         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2863         return;
2864     }
2865 
2866     /* Find a dirty page */
2867     run_start = find_next_bit(bitmap, pages, 0);
2868 
2869     while (run_start < pages) {
2870 
2871         /*
2872          * If the start of this run of pages is in the middle of a host
2873          * page, then we need to fixup this host page.
2874          */
2875         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2876             /* Find the end of this run */
2877             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2878             /*
2879              * If the end isn't at the start of a host page, then the
2880              * run doesn't finish at the end of a host page
2881              * and we need to discard.
2882              */
2883         }
2884 
2885         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2886             unsigned long page;
2887             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2888                                                              host_ratio);
2889             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2890 
2891             /* Clean up the bitmap */
2892             for (page = fixup_start_addr;
2893                  page < fixup_start_addr + host_ratio; page++) {
2894                 /*
2895                  * Remark them as dirty, updating the count for any pages
2896                  * that weren't previously dirty.
2897                  */
2898                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2899             }
2900         }
2901 
2902         /* Find the next dirty page for the next iteration */
2903         run_start = find_next_bit(bitmap, pages, run_start);
2904     }
2905 }
2906 
2907 /**
2908  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2909  *
2910  * Transmit the set of pages to be discarded after precopy to the target
2911  * these are pages that:
2912  *     a) Have been previously transmitted but are now dirty again
2913  *     b) Pages that have never been transmitted, this ensures that
2914  *        any pages on the destination that have been mapped by background
2915  *        tasks get discarded (transparent huge pages is the specific concern)
2916  * Hopefully this is pretty sparse
2917  *
2918  * @ms: current migration state
2919  */
2920 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2921 {
2922     RAMState *rs = ram_state;
2923 
2924     RCU_READ_LOCK_GUARD();
2925 
2926     /* This should be our last sync, the src is now paused */
2927     migration_bitmap_sync(rs);
2928 
2929     /* Easiest way to make sure we don't resume in the middle of a host-page */
2930     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2931     rs->last_seen_block = NULL;
2932     rs->last_page = 0;
2933 
2934     postcopy_each_ram_send_discard(ms);
2935 
2936     trace_ram_postcopy_send_discard_bitmap();
2937 }
2938 
2939 /**
2940  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2941  *
2942  * Returns zero on success
2943  *
2944  * @rbname: name of the RAMBlock of the request. NULL means the
2945  *          same that last one.
2946  * @start: RAMBlock starting page
2947  * @length: RAMBlock size
2948  */
2949 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2950 {
2951     trace_ram_discard_range(rbname, start, length);
2952 
2953     RCU_READ_LOCK_GUARD();
2954     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2955 
2956     if (!rb) {
2957         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2958         return -1;
2959     }
2960 
2961     /*
2962      * On source VM, we don't need to update the received bitmap since
2963      * we don't even have one.
2964      */
2965     if (rb->receivedmap) {
2966         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2967                      length >> qemu_target_page_bits());
2968     }
2969 
2970     return ram_block_discard_range(rb, start, length);
2971 }
2972 
2973 /*
2974  * For every allocation, we will try not to crash the VM if the
2975  * allocation failed.
2976  */
2977 static int xbzrle_init(void)
2978 {
2979     Error *local_err = NULL;
2980 
2981     if (!migrate_use_xbzrle()) {
2982         return 0;
2983     }
2984 
2985     XBZRLE_cache_lock();
2986 
2987     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2988     if (!XBZRLE.zero_target_page) {
2989         error_report("%s: Error allocating zero page", __func__);
2990         goto err_out;
2991     }
2992 
2993     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2994                               TARGET_PAGE_SIZE, &local_err);
2995     if (!XBZRLE.cache) {
2996         error_report_err(local_err);
2997         goto free_zero_page;
2998     }
2999 
3000     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3001     if (!XBZRLE.encoded_buf) {
3002         error_report("%s: Error allocating encoded_buf", __func__);
3003         goto free_cache;
3004     }
3005 
3006     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3007     if (!XBZRLE.current_buf) {
3008         error_report("%s: Error allocating current_buf", __func__);
3009         goto free_encoded_buf;
3010     }
3011 
3012     /* We are all good */
3013     XBZRLE_cache_unlock();
3014     return 0;
3015 
3016 free_encoded_buf:
3017     g_free(XBZRLE.encoded_buf);
3018     XBZRLE.encoded_buf = NULL;
3019 free_cache:
3020     cache_fini(XBZRLE.cache);
3021     XBZRLE.cache = NULL;
3022 free_zero_page:
3023     g_free(XBZRLE.zero_target_page);
3024     XBZRLE.zero_target_page = NULL;
3025 err_out:
3026     XBZRLE_cache_unlock();
3027     return -ENOMEM;
3028 }
3029 
3030 static int ram_state_init(RAMState **rsp)
3031 {
3032     *rsp = g_try_new0(RAMState, 1);
3033 
3034     if (!*rsp) {
3035         error_report("%s: Init ramstate fail", __func__);
3036         return -1;
3037     }
3038 
3039     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3040     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3041     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3042     (*rsp)->ram_bytes_total = ram_bytes_total();
3043 
3044     /*
3045      * Count the total number of pages used by ram blocks not including any
3046      * gaps due to alignment or unplugs.
3047      * This must match with the initial values of dirty bitmap.
3048      */
3049     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3050     ram_state_reset(*rsp);
3051 
3052     return 0;
3053 }
3054 
3055 static void ram_list_init_bitmaps(void)
3056 {
3057     MigrationState *ms = migrate_get_current();
3058     RAMBlock *block;
3059     unsigned long pages;
3060     uint8_t shift;
3061 
3062     /* Skip setting bitmap if there is no RAM */
3063     if (ram_bytes_total()) {
3064         shift = ms->clear_bitmap_shift;
3065         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3066             error_report("clear_bitmap_shift (%u) too big, using "
3067                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3068             shift = CLEAR_BITMAP_SHIFT_MAX;
3069         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3070             error_report("clear_bitmap_shift (%u) too small, using "
3071                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3072             shift = CLEAR_BITMAP_SHIFT_MIN;
3073         }
3074 
3075         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3076             pages = block->max_length >> TARGET_PAGE_BITS;
3077             /*
3078              * The initial dirty bitmap for migration must be set with all
3079              * ones to make sure we'll migrate every guest RAM page to
3080              * destination.
3081              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3082              * new migration after a failed migration, ram_list.
3083              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3084              * guest memory.
3085              */
3086             block->bmap = bitmap_new(pages);
3087             bitmap_set(block->bmap, 0, pages);
3088             block->clear_bmap_shift = shift;
3089             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3090         }
3091     }
3092 }
3093 
3094 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3095 {
3096     unsigned long pages;
3097     RAMBlock *rb;
3098 
3099     RCU_READ_LOCK_GUARD();
3100 
3101     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3102             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3103             rs->migration_dirty_pages -= pages;
3104     }
3105 }
3106 
3107 static void ram_init_bitmaps(RAMState *rs)
3108 {
3109     /* For memory_global_dirty_log_start below.  */
3110     qemu_mutex_lock_iothread();
3111     qemu_mutex_lock_ramlist();
3112 
3113     WITH_RCU_READ_LOCK_GUARD() {
3114         ram_list_init_bitmaps();
3115         /* We don't use dirty log with background snapshots */
3116         if (!migrate_background_snapshot()) {
3117             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3118             migration_bitmap_sync_precopy(rs);
3119         }
3120     }
3121     qemu_mutex_unlock_ramlist();
3122     qemu_mutex_unlock_iothread();
3123 
3124     /*
3125      * After an eventual first bitmap sync, fixup the initial bitmap
3126      * containing all 1s to exclude any discarded pages from migration.
3127      */
3128     migration_bitmap_clear_discarded_pages(rs);
3129 }
3130 
3131 static int ram_init_all(RAMState **rsp)
3132 {
3133     if (ram_state_init(rsp)) {
3134         return -1;
3135     }
3136 
3137     if (xbzrle_init()) {
3138         ram_state_cleanup(rsp);
3139         return -1;
3140     }
3141 
3142     ram_init_bitmaps(*rsp);
3143 
3144     return 0;
3145 }
3146 
3147 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3148 {
3149     RAMBlock *block;
3150     uint64_t pages = 0;
3151 
3152     /*
3153      * Postcopy is not using xbzrle/compression, so no need for that.
3154      * Also, since source are already halted, we don't need to care
3155      * about dirty page logging as well.
3156      */
3157 
3158     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3159         pages += bitmap_count_one(block->bmap,
3160                                   block->used_length >> TARGET_PAGE_BITS);
3161     }
3162 
3163     /* This may not be aligned with current bitmaps. Recalculate. */
3164     rs->migration_dirty_pages = pages;
3165 
3166     ram_state_reset(rs);
3167 
3168     /* Update RAMState cache of output QEMUFile */
3169     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3170 
3171     trace_ram_state_resume_prepare(pages);
3172 }
3173 
3174 /*
3175  * This function clears bits of the free pages reported by the caller from the
3176  * migration dirty bitmap. @addr is the host address corresponding to the
3177  * start of the continuous guest free pages, and @len is the total bytes of
3178  * those pages.
3179  */
3180 void qemu_guest_free_page_hint(void *addr, size_t len)
3181 {
3182     RAMBlock *block;
3183     ram_addr_t offset;
3184     size_t used_len, start, npages;
3185     MigrationState *s = migrate_get_current();
3186 
3187     /* This function is currently expected to be used during live migration */
3188     if (!migration_is_setup_or_active(s->state)) {
3189         return;
3190     }
3191 
3192     for (; len > 0; len -= used_len, addr += used_len) {
3193         block = qemu_ram_block_from_host(addr, false, &offset);
3194         if (unlikely(!block || offset >= block->used_length)) {
3195             /*
3196              * The implementation might not support RAMBlock resize during
3197              * live migration, but it could happen in theory with future
3198              * updates. So we add a check here to capture that case.
3199              */
3200             error_report_once("%s unexpected error", __func__);
3201             return;
3202         }
3203 
3204         if (len <= block->used_length - offset) {
3205             used_len = len;
3206         } else {
3207             used_len = block->used_length - offset;
3208         }
3209 
3210         start = offset >> TARGET_PAGE_BITS;
3211         npages = used_len >> TARGET_PAGE_BITS;
3212 
3213         qemu_mutex_lock(&ram_state->bitmap_mutex);
3214         /*
3215          * The skipped free pages are equavalent to be sent from clear_bmap's
3216          * perspective, so clear the bits from the memory region bitmap which
3217          * are initially set. Otherwise those skipped pages will be sent in
3218          * the next round after syncing from the memory region bitmap.
3219          */
3220         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3221         ram_state->migration_dirty_pages -=
3222                       bitmap_count_one_with_offset(block->bmap, start, npages);
3223         bitmap_clear(block->bmap, start, npages);
3224         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3225     }
3226 }
3227 
3228 /*
3229  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3230  * long-running RCU critical section.  When rcu-reclaims in the code
3231  * start to become numerous it will be necessary to reduce the
3232  * granularity of these critical sections.
3233  */
3234 
3235 /**
3236  * ram_save_setup: Setup RAM for migration
3237  *
3238  * Returns zero to indicate success and negative for error
3239  *
3240  * @f: QEMUFile where to send the data
3241  * @opaque: RAMState pointer
3242  */
3243 static int ram_save_setup(QEMUFile *f, void *opaque)
3244 {
3245     RAMState **rsp = opaque;
3246     RAMBlock *block;
3247     int ret;
3248 
3249     if (compress_threads_save_setup()) {
3250         return -1;
3251     }
3252 
3253     /* migration has already setup the bitmap, reuse it. */
3254     if (!migration_in_colo_state()) {
3255         if (ram_init_all(rsp) != 0) {
3256             compress_threads_save_cleanup();
3257             return -1;
3258         }
3259     }
3260     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3261 
3262     WITH_RCU_READ_LOCK_GUARD() {
3263         qemu_put_be64(f, ram_bytes_total_with_ignored()
3264                          | RAM_SAVE_FLAG_MEM_SIZE);
3265 
3266         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3267             qemu_put_byte(f, strlen(block->idstr));
3268             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3269             qemu_put_be64(f, block->used_length);
3270             if (migrate_postcopy_ram() && block->page_size !=
3271                                           qemu_host_page_size) {
3272                 qemu_put_be64(f, block->page_size);
3273             }
3274             if (migrate_ignore_shared()) {
3275                 qemu_put_be64(f, block->mr->addr);
3276             }
3277         }
3278     }
3279 
3280     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3281     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3282 
3283     migration_ops = g_malloc0(sizeof(MigrationOps));
3284     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3285     ret = multifd_send_sync_main(f);
3286     if (ret < 0) {
3287         return ret;
3288     }
3289 
3290     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3291     qemu_fflush(f);
3292 
3293     return 0;
3294 }
3295 
3296 /**
3297  * ram_save_iterate: iterative stage for migration
3298  *
3299  * Returns zero to indicate success and negative for error
3300  *
3301  * @f: QEMUFile where to send the data
3302  * @opaque: RAMState pointer
3303  */
3304 static int ram_save_iterate(QEMUFile *f, void *opaque)
3305 {
3306     RAMState **temp = opaque;
3307     RAMState *rs = *temp;
3308     int ret = 0;
3309     int i;
3310     int64_t t0;
3311     int done = 0;
3312 
3313     if (blk_mig_bulk_active()) {
3314         /* Avoid transferring ram during bulk phase of block migration as
3315          * the bulk phase will usually take a long time and transferring
3316          * ram updates during that time is pointless. */
3317         goto out;
3318     }
3319 
3320     /*
3321      * We'll take this lock a little bit long, but it's okay for two reasons.
3322      * Firstly, the only possible other thread to take it is who calls
3323      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3324      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3325      * guarantees that we'll at least released it in a regular basis.
3326      */
3327     qemu_mutex_lock(&rs->bitmap_mutex);
3328     WITH_RCU_READ_LOCK_GUARD() {
3329         if (ram_list.version != rs->last_version) {
3330             ram_state_reset(rs);
3331         }
3332 
3333         /* Read version before ram_list.blocks */
3334         smp_rmb();
3335 
3336         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3337 
3338         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3339         i = 0;
3340         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3341                postcopy_has_request(rs)) {
3342             int pages;
3343 
3344             if (qemu_file_get_error(f)) {
3345                 break;
3346             }
3347 
3348             pages = ram_find_and_save_block(rs);
3349             /* no more pages to sent */
3350             if (pages == 0) {
3351                 done = 1;
3352                 break;
3353             }
3354 
3355             if (pages < 0) {
3356                 qemu_file_set_error(f, pages);
3357                 break;
3358             }
3359 
3360             rs->target_page_count += pages;
3361 
3362             /*
3363              * During postcopy, it is necessary to make sure one whole host
3364              * page is sent in one chunk.
3365              */
3366             if (migrate_postcopy_ram()) {
3367                 flush_compressed_data(rs);
3368             }
3369 
3370             /*
3371              * we want to check in the 1st loop, just in case it was the 1st
3372              * time and we had to sync the dirty bitmap.
3373              * qemu_clock_get_ns() is a bit expensive, so we only check each
3374              * some iterations
3375              */
3376             if ((i & 63) == 0) {
3377                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3378                               1000000;
3379                 if (t1 > MAX_WAIT) {
3380                     trace_ram_save_iterate_big_wait(t1, i);
3381                     break;
3382                 }
3383             }
3384             i++;
3385         }
3386     }
3387     qemu_mutex_unlock(&rs->bitmap_mutex);
3388 
3389     /*
3390      * Must occur before EOS (or any QEMUFile operation)
3391      * because of RDMA protocol.
3392      */
3393     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3394 
3395 out:
3396     if (ret >= 0
3397         && migration_is_setup_or_active(migrate_get_current()->state)) {
3398         ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3399         if (ret < 0) {
3400             return ret;
3401         }
3402 
3403         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3404         qemu_fflush(f);
3405         ram_transferred_add(8);
3406 
3407         ret = qemu_file_get_error(f);
3408     }
3409     if (ret < 0) {
3410         return ret;
3411     }
3412 
3413     return done;
3414 }
3415 
3416 /**
3417  * ram_save_complete: function called to send the remaining amount of ram
3418  *
3419  * Returns zero to indicate success or negative on error
3420  *
3421  * Called with iothread lock
3422  *
3423  * @f: QEMUFile where to send the data
3424  * @opaque: RAMState pointer
3425  */
3426 static int ram_save_complete(QEMUFile *f, void *opaque)
3427 {
3428     RAMState **temp = opaque;
3429     RAMState *rs = *temp;
3430     int ret = 0;
3431 
3432     rs->last_stage = !migration_in_colo_state();
3433 
3434     WITH_RCU_READ_LOCK_GUARD() {
3435         if (!migration_in_postcopy()) {
3436             migration_bitmap_sync_precopy(rs);
3437         }
3438 
3439         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3440 
3441         /* try transferring iterative blocks of memory */
3442 
3443         /* flush all remaining blocks regardless of rate limiting */
3444         qemu_mutex_lock(&rs->bitmap_mutex);
3445         while (true) {
3446             int pages;
3447 
3448             pages = ram_find_and_save_block(rs);
3449             /* no more blocks to sent */
3450             if (pages == 0) {
3451                 break;
3452             }
3453             if (pages < 0) {
3454                 ret = pages;
3455                 break;
3456             }
3457         }
3458         qemu_mutex_unlock(&rs->bitmap_mutex);
3459 
3460         flush_compressed_data(rs);
3461         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3462     }
3463 
3464     if (ret < 0) {
3465         return ret;
3466     }
3467 
3468     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3469     if (ret < 0) {
3470         return ret;
3471     }
3472 
3473     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3474     qemu_fflush(f);
3475 
3476     return 0;
3477 }
3478 
3479 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3480                                        uint64_t *can_postcopy)
3481 {
3482     RAMState **temp = opaque;
3483     RAMState *rs = *temp;
3484 
3485     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3486 
3487     if (migrate_postcopy_ram()) {
3488         /* We can do postcopy, and all the data is postcopiable */
3489         *can_postcopy += remaining_size;
3490     } else {
3491         *must_precopy += remaining_size;
3492     }
3493 }
3494 
3495 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3496                                     uint64_t *can_postcopy)
3497 {
3498     MigrationState *s = migrate_get_current();
3499     RAMState **temp = opaque;
3500     RAMState *rs = *temp;
3501 
3502     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3503 
3504     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3505         qemu_mutex_lock_iothread();
3506         WITH_RCU_READ_LOCK_GUARD() {
3507             migration_bitmap_sync_precopy(rs);
3508         }
3509         qemu_mutex_unlock_iothread();
3510         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3511     }
3512 
3513     if (migrate_postcopy_ram()) {
3514         /* We can do postcopy, and all the data is postcopiable */
3515         *can_postcopy += remaining_size;
3516     } else {
3517         *must_precopy += remaining_size;
3518     }
3519 }
3520 
3521 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3522 {
3523     unsigned int xh_len;
3524     int xh_flags;
3525     uint8_t *loaded_data;
3526 
3527     /* extract RLE header */
3528     xh_flags = qemu_get_byte(f);
3529     xh_len = qemu_get_be16(f);
3530 
3531     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3532         error_report("Failed to load XBZRLE page - wrong compression!");
3533         return -1;
3534     }
3535 
3536     if (xh_len > TARGET_PAGE_SIZE) {
3537         error_report("Failed to load XBZRLE page - len overflow!");
3538         return -1;
3539     }
3540     loaded_data = XBZRLE.decoded_buf;
3541     /* load data and decode */
3542     /* it can change loaded_data to point to an internal buffer */
3543     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3544 
3545     /* decode RLE */
3546     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3547                              TARGET_PAGE_SIZE) == -1) {
3548         error_report("Failed to load XBZRLE page - decode error!");
3549         return -1;
3550     }
3551 
3552     return 0;
3553 }
3554 
3555 /**
3556  * ram_block_from_stream: read a RAMBlock id from the migration stream
3557  *
3558  * Must be called from within a rcu critical section.
3559  *
3560  * Returns a pointer from within the RCU-protected ram_list.
3561  *
3562  * @mis: the migration incoming state pointer
3563  * @f: QEMUFile where to read the data from
3564  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3565  * @channel: the channel we're using
3566  */
3567 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3568                                               QEMUFile *f, int flags,
3569                                               int channel)
3570 {
3571     RAMBlock *block = mis->last_recv_block[channel];
3572     char id[256];
3573     uint8_t len;
3574 
3575     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3576         if (!block) {
3577             error_report("Ack, bad migration stream!");
3578             return NULL;
3579         }
3580         return block;
3581     }
3582 
3583     len = qemu_get_byte(f);
3584     qemu_get_buffer(f, (uint8_t *)id, len);
3585     id[len] = 0;
3586 
3587     block = qemu_ram_block_by_name(id);
3588     if (!block) {
3589         error_report("Can't find block %s", id);
3590         return NULL;
3591     }
3592 
3593     if (ramblock_is_ignored(block)) {
3594         error_report("block %s should not be migrated !", id);
3595         return NULL;
3596     }
3597 
3598     mis->last_recv_block[channel] = block;
3599 
3600     return block;
3601 }
3602 
3603 static inline void *host_from_ram_block_offset(RAMBlock *block,
3604                                                ram_addr_t offset)
3605 {
3606     if (!offset_in_ramblock(block, offset)) {
3607         return NULL;
3608     }
3609 
3610     return block->host + offset;
3611 }
3612 
3613 static void *host_page_from_ram_block_offset(RAMBlock *block,
3614                                              ram_addr_t offset)
3615 {
3616     /* Note: Explicitly no check against offset_in_ramblock(). */
3617     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3618                                    block->page_size);
3619 }
3620 
3621 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3622                                                          ram_addr_t offset)
3623 {
3624     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3625 }
3626 
3627 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3628                              ram_addr_t offset, bool record_bitmap)
3629 {
3630     if (!offset_in_ramblock(block, offset)) {
3631         return NULL;
3632     }
3633     if (!block->colo_cache) {
3634         error_report("%s: colo_cache is NULL in block :%s",
3635                      __func__, block->idstr);
3636         return NULL;
3637     }
3638 
3639     /*
3640     * During colo checkpoint, we need bitmap of these migrated pages.
3641     * It help us to decide which pages in ram cache should be flushed
3642     * into VM's RAM later.
3643     */
3644     if (record_bitmap &&
3645         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3646         ram_state->migration_dirty_pages++;
3647     }
3648     return block->colo_cache + offset;
3649 }
3650 
3651 /**
3652  * ram_handle_compressed: handle the zero page case
3653  *
3654  * If a page (or a whole RDMA chunk) has been
3655  * determined to be zero, then zap it.
3656  *
3657  * @host: host address for the zero page
3658  * @ch: what the page is filled from.  We only support zero
3659  * @size: size of the zero page
3660  */
3661 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3662 {
3663     if (ch != 0 || !buffer_is_zero(host, size)) {
3664         memset(host, ch, size);
3665     }
3666 }
3667 
3668 /* return the size after decompression, or negative value on error */
3669 static int
3670 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3671                      const uint8_t *source, size_t source_len)
3672 {
3673     int err;
3674 
3675     err = inflateReset(stream);
3676     if (err != Z_OK) {
3677         return -1;
3678     }
3679 
3680     stream->avail_in = source_len;
3681     stream->next_in = (uint8_t *)source;
3682     stream->avail_out = dest_len;
3683     stream->next_out = dest;
3684 
3685     err = inflate(stream, Z_NO_FLUSH);
3686     if (err != Z_STREAM_END) {
3687         return -1;
3688     }
3689 
3690     return stream->total_out;
3691 }
3692 
3693 static void *do_data_decompress(void *opaque)
3694 {
3695     DecompressParam *param = opaque;
3696     unsigned long pagesize;
3697     uint8_t *des;
3698     int len, ret;
3699 
3700     qemu_mutex_lock(&param->mutex);
3701     while (!param->quit) {
3702         if (param->des) {
3703             des = param->des;
3704             len = param->len;
3705             param->des = 0;
3706             qemu_mutex_unlock(&param->mutex);
3707 
3708             pagesize = TARGET_PAGE_SIZE;
3709 
3710             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3711                                        param->compbuf, len);
3712             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3713                 error_report("decompress data failed");
3714                 qemu_file_set_error(decomp_file, ret);
3715             }
3716 
3717             qemu_mutex_lock(&decomp_done_lock);
3718             param->done = true;
3719             qemu_cond_signal(&decomp_done_cond);
3720             qemu_mutex_unlock(&decomp_done_lock);
3721 
3722             qemu_mutex_lock(&param->mutex);
3723         } else {
3724             qemu_cond_wait(&param->cond, &param->mutex);
3725         }
3726     }
3727     qemu_mutex_unlock(&param->mutex);
3728 
3729     return NULL;
3730 }
3731 
3732 static int wait_for_decompress_done(void)
3733 {
3734     int idx, thread_count;
3735 
3736     if (!migrate_use_compression()) {
3737         return 0;
3738     }
3739 
3740     thread_count = migrate_decompress_threads();
3741     qemu_mutex_lock(&decomp_done_lock);
3742     for (idx = 0; idx < thread_count; idx++) {
3743         while (!decomp_param[idx].done) {
3744             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3745         }
3746     }
3747     qemu_mutex_unlock(&decomp_done_lock);
3748     return qemu_file_get_error(decomp_file);
3749 }
3750 
3751 static void compress_threads_load_cleanup(void)
3752 {
3753     int i, thread_count;
3754 
3755     if (!migrate_use_compression()) {
3756         return;
3757     }
3758     thread_count = migrate_decompress_threads();
3759     for (i = 0; i < thread_count; i++) {
3760         /*
3761          * we use it as a indicator which shows if the thread is
3762          * properly init'd or not
3763          */
3764         if (!decomp_param[i].compbuf) {
3765             break;
3766         }
3767 
3768         qemu_mutex_lock(&decomp_param[i].mutex);
3769         decomp_param[i].quit = true;
3770         qemu_cond_signal(&decomp_param[i].cond);
3771         qemu_mutex_unlock(&decomp_param[i].mutex);
3772     }
3773     for (i = 0; i < thread_count; i++) {
3774         if (!decomp_param[i].compbuf) {
3775             break;
3776         }
3777 
3778         qemu_thread_join(decompress_threads + i);
3779         qemu_mutex_destroy(&decomp_param[i].mutex);
3780         qemu_cond_destroy(&decomp_param[i].cond);
3781         inflateEnd(&decomp_param[i].stream);
3782         g_free(decomp_param[i].compbuf);
3783         decomp_param[i].compbuf = NULL;
3784     }
3785     g_free(decompress_threads);
3786     g_free(decomp_param);
3787     decompress_threads = NULL;
3788     decomp_param = NULL;
3789     decomp_file = NULL;
3790 }
3791 
3792 static int compress_threads_load_setup(QEMUFile *f)
3793 {
3794     int i, thread_count;
3795 
3796     if (!migrate_use_compression()) {
3797         return 0;
3798     }
3799 
3800     thread_count = migrate_decompress_threads();
3801     decompress_threads = g_new0(QemuThread, thread_count);
3802     decomp_param = g_new0(DecompressParam, thread_count);
3803     qemu_mutex_init(&decomp_done_lock);
3804     qemu_cond_init(&decomp_done_cond);
3805     decomp_file = f;
3806     for (i = 0; i < thread_count; i++) {
3807         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3808             goto exit;
3809         }
3810 
3811         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3812         qemu_mutex_init(&decomp_param[i].mutex);
3813         qemu_cond_init(&decomp_param[i].cond);
3814         decomp_param[i].done = true;
3815         decomp_param[i].quit = false;
3816         qemu_thread_create(decompress_threads + i, "decompress",
3817                            do_data_decompress, decomp_param + i,
3818                            QEMU_THREAD_JOINABLE);
3819     }
3820     return 0;
3821 exit:
3822     compress_threads_load_cleanup();
3823     return -1;
3824 }
3825 
3826 static void decompress_data_with_multi_threads(QEMUFile *f,
3827                                                void *host, int len)
3828 {
3829     int idx, thread_count;
3830 
3831     thread_count = migrate_decompress_threads();
3832     QEMU_LOCK_GUARD(&decomp_done_lock);
3833     while (true) {
3834         for (idx = 0; idx < thread_count; idx++) {
3835             if (decomp_param[idx].done) {
3836                 decomp_param[idx].done = false;
3837                 qemu_mutex_lock(&decomp_param[idx].mutex);
3838                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3839                 decomp_param[idx].des = host;
3840                 decomp_param[idx].len = len;
3841                 qemu_cond_signal(&decomp_param[idx].cond);
3842                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3843                 break;
3844             }
3845         }
3846         if (idx < thread_count) {
3847             break;
3848         } else {
3849             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3850         }
3851     }
3852 }
3853 
3854 static void colo_init_ram_state(void)
3855 {
3856     ram_state_init(&ram_state);
3857 }
3858 
3859 /*
3860  * colo cache: this is for secondary VM, we cache the whole
3861  * memory of the secondary VM, it is need to hold the global lock
3862  * to call this helper.
3863  */
3864 int colo_init_ram_cache(void)
3865 {
3866     RAMBlock *block;
3867 
3868     WITH_RCU_READ_LOCK_GUARD() {
3869         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3870             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3871                                                     NULL, false, false);
3872             if (!block->colo_cache) {
3873                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3874                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3875                              block->used_length);
3876                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3877                     if (block->colo_cache) {
3878                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3879                         block->colo_cache = NULL;
3880                     }
3881                 }
3882                 return -errno;
3883             }
3884             if (!machine_dump_guest_core(current_machine)) {
3885                 qemu_madvise(block->colo_cache, block->used_length,
3886                              QEMU_MADV_DONTDUMP);
3887             }
3888         }
3889     }
3890 
3891     /*
3892     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3893     * with to decide which page in cache should be flushed into SVM's RAM. Here
3894     * we use the same name 'ram_bitmap' as for migration.
3895     */
3896     if (ram_bytes_total()) {
3897         RAMBlock *block;
3898 
3899         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3900             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3901             block->bmap = bitmap_new(pages);
3902         }
3903     }
3904 
3905     colo_init_ram_state();
3906     return 0;
3907 }
3908 
3909 /* TODO: duplicated with ram_init_bitmaps */
3910 void colo_incoming_start_dirty_log(void)
3911 {
3912     RAMBlock *block = NULL;
3913     /* For memory_global_dirty_log_start below. */
3914     qemu_mutex_lock_iothread();
3915     qemu_mutex_lock_ramlist();
3916 
3917     memory_global_dirty_log_sync();
3918     WITH_RCU_READ_LOCK_GUARD() {
3919         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3920             ramblock_sync_dirty_bitmap(ram_state, block);
3921             /* Discard this dirty bitmap record */
3922             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3923         }
3924         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3925     }
3926     ram_state->migration_dirty_pages = 0;
3927     qemu_mutex_unlock_ramlist();
3928     qemu_mutex_unlock_iothread();
3929 }
3930 
3931 /* It is need to hold the global lock to call this helper */
3932 void colo_release_ram_cache(void)
3933 {
3934     RAMBlock *block;
3935 
3936     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3937     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3938         g_free(block->bmap);
3939         block->bmap = NULL;
3940     }
3941 
3942     WITH_RCU_READ_LOCK_GUARD() {
3943         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3944             if (block->colo_cache) {
3945                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3946                 block->colo_cache = NULL;
3947             }
3948         }
3949     }
3950     ram_state_cleanup(&ram_state);
3951 }
3952 
3953 /**
3954  * ram_load_setup: Setup RAM for migration incoming side
3955  *
3956  * Returns zero to indicate success and negative for error
3957  *
3958  * @f: QEMUFile where to receive the data
3959  * @opaque: RAMState pointer
3960  */
3961 static int ram_load_setup(QEMUFile *f, void *opaque)
3962 {
3963     if (compress_threads_load_setup(f)) {
3964         return -1;
3965     }
3966 
3967     xbzrle_load_setup();
3968     ramblock_recv_map_init();
3969 
3970     return 0;
3971 }
3972 
3973 static int ram_load_cleanup(void *opaque)
3974 {
3975     RAMBlock *rb;
3976 
3977     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3978         qemu_ram_block_writeback(rb);
3979     }
3980 
3981     xbzrle_load_cleanup();
3982     compress_threads_load_cleanup();
3983 
3984     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3985         g_free(rb->receivedmap);
3986         rb->receivedmap = NULL;
3987     }
3988 
3989     return 0;
3990 }
3991 
3992 /**
3993  * ram_postcopy_incoming_init: allocate postcopy data structures
3994  *
3995  * Returns 0 for success and negative if there was one error
3996  *
3997  * @mis: current migration incoming state
3998  *
3999  * Allocate data structures etc needed by incoming migration with
4000  * postcopy-ram. postcopy-ram's similarly names
4001  * postcopy_ram_incoming_init does the work.
4002  */
4003 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4004 {
4005     return postcopy_ram_incoming_init(mis);
4006 }
4007 
4008 /**
4009  * ram_load_postcopy: load a page in postcopy case
4010  *
4011  * Returns 0 for success or -errno in case of error
4012  *
4013  * Called in postcopy mode by ram_load().
4014  * rcu_read_lock is taken prior to this being called.
4015  *
4016  * @f: QEMUFile where to send the data
4017  * @channel: the channel to use for loading
4018  */
4019 int ram_load_postcopy(QEMUFile *f, int channel)
4020 {
4021     int flags = 0, ret = 0;
4022     bool place_needed = false;
4023     bool matches_target_page_size = false;
4024     MigrationIncomingState *mis = migration_incoming_get_current();
4025     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4026 
4027     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4028         ram_addr_t addr;
4029         void *page_buffer = NULL;
4030         void *place_source = NULL;
4031         RAMBlock *block = NULL;
4032         uint8_t ch;
4033         int len;
4034 
4035         addr = qemu_get_be64(f);
4036 
4037         /*
4038          * If qemu file error, we should stop here, and then "addr"
4039          * may be invalid
4040          */
4041         ret = qemu_file_get_error(f);
4042         if (ret) {
4043             break;
4044         }
4045 
4046         flags = addr & ~TARGET_PAGE_MASK;
4047         addr &= TARGET_PAGE_MASK;
4048 
4049         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4050         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4051                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4052             block = ram_block_from_stream(mis, f, flags, channel);
4053             if (!block) {
4054                 ret = -EINVAL;
4055                 break;
4056             }
4057 
4058             /*
4059              * Relying on used_length is racy and can result in false positives.
4060              * We might place pages beyond used_length in case RAM was shrunk
4061              * while in postcopy, which is fine - trying to place via
4062              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4063              */
4064             if (!block->host || addr >= block->postcopy_length) {
4065                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4066                 ret = -EINVAL;
4067                 break;
4068             }
4069             tmp_page->target_pages++;
4070             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4071             /*
4072              * Postcopy requires that we place whole host pages atomically;
4073              * these may be huge pages for RAMBlocks that are backed by
4074              * hugetlbfs.
4075              * To make it atomic, the data is read into a temporary page
4076              * that's moved into place later.
4077              * The migration protocol uses,  possibly smaller, target-pages
4078              * however the source ensures it always sends all the components
4079              * of a host page in one chunk.
4080              */
4081             page_buffer = tmp_page->tmp_huge_page +
4082                           host_page_offset_from_ram_block_offset(block, addr);
4083             /* If all TP are zero then we can optimise the place */
4084             if (tmp_page->target_pages == 1) {
4085                 tmp_page->host_addr =
4086                     host_page_from_ram_block_offset(block, addr);
4087             } else if (tmp_page->host_addr !=
4088                        host_page_from_ram_block_offset(block, addr)) {
4089                 /* not the 1st TP within the HP */
4090                 error_report("Non-same host page detected on channel %d: "
4091                              "Target host page %p, received host page %p "
4092                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4093                              channel, tmp_page->host_addr,
4094                              host_page_from_ram_block_offset(block, addr),
4095                              block->idstr, addr, tmp_page->target_pages);
4096                 ret = -EINVAL;
4097                 break;
4098             }
4099 
4100             /*
4101              * If it's the last part of a host page then we place the host
4102              * page
4103              */
4104             if (tmp_page->target_pages ==
4105                 (block->page_size / TARGET_PAGE_SIZE)) {
4106                 place_needed = true;
4107             }
4108             place_source = tmp_page->tmp_huge_page;
4109         }
4110 
4111         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4112         case RAM_SAVE_FLAG_ZERO:
4113             ch = qemu_get_byte(f);
4114             /*
4115              * Can skip to set page_buffer when
4116              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4117              */
4118             if (ch || !matches_target_page_size) {
4119                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4120             }
4121             if (ch) {
4122                 tmp_page->all_zero = false;
4123             }
4124             break;
4125 
4126         case RAM_SAVE_FLAG_PAGE:
4127             tmp_page->all_zero = false;
4128             if (!matches_target_page_size) {
4129                 /* For huge pages, we always use temporary buffer */
4130                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4131             } else {
4132                 /*
4133                  * For small pages that matches target page size, we
4134                  * avoid the qemu_file copy.  Instead we directly use
4135                  * the buffer of QEMUFile to place the page.  Note: we
4136                  * cannot do any QEMUFile operation before using that
4137                  * buffer to make sure the buffer is valid when
4138                  * placing the page.
4139                  */
4140                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4141                                          TARGET_PAGE_SIZE);
4142             }
4143             break;
4144         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4145             tmp_page->all_zero = false;
4146             len = qemu_get_be32(f);
4147             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4148                 error_report("Invalid compressed data length: %d", len);
4149                 ret = -EINVAL;
4150                 break;
4151             }
4152             decompress_data_with_multi_threads(f, page_buffer, len);
4153             break;
4154 
4155         case RAM_SAVE_FLAG_EOS:
4156             /* normal exit */
4157             multifd_recv_sync_main();
4158             break;
4159         default:
4160             error_report("Unknown combination of migration flags: 0x%x"
4161                          " (postcopy mode)", flags);
4162             ret = -EINVAL;
4163             break;
4164         }
4165 
4166         /* Got the whole host page, wait for decompress before placing. */
4167         if (place_needed) {
4168             ret |= wait_for_decompress_done();
4169         }
4170 
4171         /* Detect for any possible file errors */
4172         if (!ret && qemu_file_get_error(f)) {
4173             ret = qemu_file_get_error(f);
4174         }
4175 
4176         if (!ret && place_needed) {
4177             if (tmp_page->all_zero) {
4178                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4179             } else {
4180                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4181                                           place_source, block);
4182             }
4183             place_needed = false;
4184             postcopy_temp_page_reset(tmp_page);
4185         }
4186     }
4187 
4188     return ret;
4189 }
4190 
4191 static bool postcopy_is_running(void)
4192 {
4193     PostcopyState ps = postcopy_state_get();
4194     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4195 }
4196 
4197 /*
4198  * Flush content of RAM cache into SVM's memory.
4199  * Only flush the pages that be dirtied by PVM or SVM or both.
4200  */
4201 void colo_flush_ram_cache(void)
4202 {
4203     RAMBlock *block = NULL;
4204     void *dst_host;
4205     void *src_host;
4206     unsigned long offset = 0;
4207 
4208     memory_global_dirty_log_sync();
4209     WITH_RCU_READ_LOCK_GUARD() {
4210         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4211             ramblock_sync_dirty_bitmap(ram_state, block);
4212         }
4213     }
4214 
4215     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4216     WITH_RCU_READ_LOCK_GUARD() {
4217         block = QLIST_FIRST_RCU(&ram_list.blocks);
4218 
4219         while (block) {
4220             unsigned long num = 0;
4221 
4222             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4223             if (!offset_in_ramblock(block,
4224                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4225                 offset = 0;
4226                 num = 0;
4227                 block = QLIST_NEXT_RCU(block, next);
4228             } else {
4229                 unsigned long i = 0;
4230 
4231                 for (i = 0; i < num; i++) {
4232                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4233                 }
4234                 dst_host = block->host
4235                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4236                 src_host = block->colo_cache
4237                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4238                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4239                 offset += num;
4240             }
4241         }
4242     }
4243     trace_colo_flush_ram_cache_end();
4244 }
4245 
4246 /**
4247  * ram_load_precopy: load pages in precopy case
4248  *
4249  * Returns 0 for success or -errno in case of error
4250  *
4251  * Called in precopy mode by ram_load().
4252  * rcu_read_lock is taken prior to this being called.
4253  *
4254  * @f: QEMUFile where to send the data
4255  */
4256 static int ram_load_precopy(QEMUFile *f)
4257 {
4258     MigrationIncomingState *mis = migration_incoming_get_current();
4259     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4260     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4261     bool postcopy_advised = migration_incoming_postcopy_advised();
4262     if (!migrate_use_compression()) {
4263         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4264     }
4265 
4266     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4267         ram_addr_t addr, total_ram_bytes;
4268         void *host = NULL, *host_bak = NULL;
4269         uint8_t ch;
4270 
4271         /*
4272          * Yield periodically to let main loop run, but an iteration of
4273          * the main loop is expensive, so do it each some iterations
4274          */
4275         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4276             aio_co_schedule(qemu_get_current_aio_context(),
4277                             qemu_coroutine_self());
4278             qemu_coroutine_yield();
4279         }
4280         i++;
4281 
4282         addr = qemu_get_be64(f);
4283         flags = addr & ~TARGET_PAGE_MASK;
4284         addr &= TARGET_PAGE_MASK;
4285 
4286         if (flags & invalid_flags) {
4287             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4288                 error_report("Received an unexpected compressed page");
4289             }
4290 
4291             ret = -EINVAL;
4292             break;
4293         }
4294 
4295         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4296                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4297             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4298                                                     RAM_CHANNEL_PRECOPY);
4299 
4300             host = host_from_ram_block_offset(block, addr);
4301             /*
4302              * After going into COLO stage, we should not load the page
4303              * into SVM's memory directly, we put them into colo_cache firstly.
4304              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4305              * Previously, we copied all these memory in preparing stage of COLO
4306              * while we need to stop VM, which is a time-consuming process.
4307              * Here we optimize it by a trick, back-up every page while in
4308              * migration process while COLO is enabled, though it affects the
4309              * speed of the migration, but it obviously reduce the downtime of
4310              * back-up all SVM'S memory in COLO preparing stage.
4311              */
4312             if (migration_incoming_colo_enabled()) {
4313                 if (migration_incoming_in_colo_state()) {
4314                     /* In COLO stage, put all pages into cache temporarily */
4315                     host = colo_cache_from_block_offset(block, addr, true);
4316                 } else {
4317                    /*
4318                     * In migration stage but before COLO stage,
4319                     * Put all pages into both cache and SVM's memory.
4320                     */
4321                     host_bak = colo_cache_from_block_offset(block, addr, false);
4322                 }
4323             }
4324             if (!host) {
4325                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4326                 ret = -EINVAL;
4327                 break;
4328             }
4329             if (!migration_incoming_in_colo_state()) {
4330                 ramblock_recv_bitmap_set(block, host);
4331             }
4332 
4333             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4334         }
4335 
4336         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4337         case RAM_SAVE_FLAG_MEM_SIZE:
4338             /* Synchronize RAM block list */
4339             total_ram_bytes = addr;
4340             while (!ret && total_ram_bytes) {
4341                 RAMBlock *block;
4342                 char id[256];
4343                 ram_addr_t length;
4344 
4345                 len = qemu_get_byte(f);
4346                 qemu_get_buffer(f, (uint8_t *)id, len);
4347                 id[len] = 0;
4348                 length = qemu_get_be64(f);
4349 
4350                 block = qemu_ram_block_by_name(id);
4351                 if (block && !qemu_ram_is_migratable(block)) {
4352                     error_report("block %s should not be migrated !", id);
4353                     ret = -EINVAL;
4354                 } else if (block) {
4355                     if (length != block->used_length) {
4356                         Error *local_err = NULL;
4357 
4358                         ret = qemu_ram_resize(block, length,
4359                                               &local_err);
4360                         if (local_err) {
4361                             error_report_err(local_err);
4362                         }
4363                     }
4364                     /* For postcopy we need to check hugepage sizes match */
4365                     if (postcopy_advised && migrate_postcopy_ram() &&
4366                         block->page_size != qemu_host_page_size) {
4367                         uint64_t remote_page_size = qemu_get_be64(f);
4368                         if (remote_page_size != block->page_size) {
4369                             error_report("Mismatched RAM page size %s "
4370                                          "(local) %zd != %" PRId64,
4371                                          id, block->page_size,
4372                                          remote_page_size);
4373                             ret = -EINVAL;
4374                         }
4375                     }
4376                     if (migrate_ignore_shared()) {
4377                         hwaddr addr = qemu_get_be64(f);
4378                         if (ramblock_is_ignored(block) &&
4379                             block->mr->addr != addr) {
4380                             error_report("Mismatched GPAs for block %s "
4381                                          "%" PRId64 "!= %" PRId64,
4382                                          id, (uint64_t)addr,
4383                                          (uint64_t)block->mr->addr);
4384                             ret = -EINVAL;
4385                         }
4386                     }
4387                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4388                                           block->idstr);
4389                 } else {
4390                     error_report("Unknown ramblock \"%s\", cannot "
4391                                  "accept migration", id);
4392                     ret = -EINVAL;
4393                 }
4394 
4395                 total_ram_bytes -= length;
4396             }
4397             break;
4398 
4399         case RAM_SAVE_FLAG_ZERO:
4400             ch = qemu_get_byte(f);
4401             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4402             break;
4403 
4404         case RAM_SAVE_FLAG_PAGE:
4405             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4406             break;
4407 
4408         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4409             len = qemu_get_be32(f);
4410             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4411                 error_report("Invalid compressed data length: %d", len);
4412                 ret = -EINVAL;
4413                 break;
4414             }
4415             decompress_data_with_multi_threads(f, host, len);
4416             break;
4417 
4418         case RAM_SAVE_FLAG_XBZRLE:
4419             if (load_xbzrle(f, addr, host) < 0) {
4420                 error_report("Failed to decompress XBZRLE page at "
4421                              RAM_ADDR_FMT, addr);
4422                 ret = -EINVAL;
4423                 break;
4424             }
4425             break;
4426         case RAM_SAVE_FLAG_EOS:
4427             /* normal exit */
4428             multifd_recv_sync_main();
4429             break;
4430         default:
4431             if (flags & RAM_SAVE_FLAG_HOOK) {
4432                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4433             } else {
4434                 error_report("Unknown combination of migration flags: 0x%x",
4435                              flags);
4436                 ret = -EINVAL;
4437             }
4438         }
4439         if (!ret) {
4440             ret = qemu_file_get_error(f);
4441         }
4442         if (!ret && host_bak) {
4443             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4444         }
4445     }
4446 
4447     ret |= wait_for_decompress_done();
4448     return ret;
4449 }
4450 
4451 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4452 {
4453     int ret = 0;
4454     static uint64_t seq_iter;
4455     /*
4456      * If system is running in postcopy mode, page inserts to host memory must
4457      * be atomic
4458      */
4459     bool postcopy_running = postcopy_is_running();
4460 
4461     seq_iter++;
4462 
4463     if (version_id != 4) {
4464         return -EINVAL;
4465     }
4466 
4467     /*
4468      * This RCU critical section can be very long running.
4469      * When RCU reclaims in the code start to become numerous,
4470      * it will be necessary to reduce the granularity of this
4471      * critical section.
4472      */
4473     WITH_RCU_READ_LOCK_GUARD() {
4474         if (postcopy_running) {
4475             /*
4476              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4477              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4478              * service fast page faults.
4479              */
4480             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4481         } else {
4482             ret = ram_load_precopy(f);
4483         }
4484     }
4485     trace_ram_load_complete(ret, seq_iter);
4486 
4487     return ret;
4488 }
4489 
4490 static bool ram_has_postcopy(void *opaque)
4491 {
4492     RAMBlock *rb;
4493     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4494         if (ramblock_is_pmem(rb)) {
4495             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4496                          "is not supported now!", rb->idstr, rb->host);
4497             return false;
4498         }
4499     }
4500 
4501     return migrate_postcopy_ram();
4502 }
4503 
4504 /* Sync all the dirty bitmap with destination VM.  */
4505 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4506 {
4507     RAMBlock *block;
4508     QEMUFile *file = s->to_dst_file;
4509     int ramblock_count = 0;
4510 
4511     trace_ram_dirty_bitmap_sync_start();
4512 
4513     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4514         qemu_savevm_send_recv_bitmap(file, block->idstr);
4515         trace_ram_dirty_bitmap_request(block->idstr);
4516         ramblock_count++;
4517     }
4518 
4519     trace_ram_dirty_bitmap_sync_wait();
4520 
4521     /* Wait until all the ramblocks' dirty bitmap synced */
4522     while (ramblock_count--) {
4523         qemu_sem_wait(&s->rp_state.rp_sem);
4524     }
4525 
4526     trace_ram_dirty_bitmap_sync_complete();
4527 
4528     return 0;
4529 }
4530 
4531 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4532 {
4533     qemu_sem_post(&s->rp_state.rp_sem);
4534 }
4535 
4536 /*
4537  * Read the received bitmap, revert it as the initial dirty bitmap.
4538  * This is only used when the postcopy migration is paused but wants
4539  * to resume from a middle point.
4540  */
4541 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4542 {
4543     int ret = -EINVAL;
4544     /* from_dst_file is always valid because we're within rp_thread */
4545     QEMUFile *file = s->rp_state.from_dst_file;
4546     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4547     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4548     uint64_t size, end_mark;
4549 
4550     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4551 
4552     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4553         error_report("%s: incorrect state %s", __func__,
4554                      MigrationStatus_str(s->state));
4555         return -EINVAL;
4556     }
4557 
4558     /*
4559      * Note: see comments in ramblock_recv_bitmap_send() on why we
4560      * need the endianness conversion, and the paddings.
4561      */
4562     local_size = ROUND_UP(local_size, 8);
4563 
4564     /* Add paddings */
4565     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4566 
4567     size = qemu_get_be64(file);
4568 
4569     /* The size of the bitmap should match with our ramblock */
4570     if (size != local_size) {
4571         error_report("%s: ramblock '%s' bitmap size mismatch "
4572                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4573                      block->idstr, size, local_size);
4574         ret = -EINVAL;
4575         goto out;
4576     }
4577 
4578     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4579     end_mark = qemu_get_be64(file);
4580 
4581     ret = qemu_file_get_error(file);
4582     if (ret || size != local_size) {
4583         error_report("%s: read bitmap failed for ramblock '%s': %d"
4584                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4585                      __func__, block->idstr, ret, local_size, size);
4586         ret = -EIO;
4587         goto out;
4588     }
4589 
4590     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4591         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4592                      __func__, block->idstr, end_mark);
4593         ret = -EINVAL;
4594         goto out;
4595     }
4596 
4597     /*
4598      * Endianness conversion. We are during postcopy (though paused).
4599      * The dirty bitmap won't change. We can directly modify it.
4600      */
4601     bitmap_from_le(block->bmap, le_bitmap, nbits);
4602 
4603     /*
4604      * What we received is "received bitmap". Revert it as the initial
4605      * dirty bitmap for this ramblock.
4606      */
4607     bitmap_complement(block->bmap, block->bmap, nbits);
4608 
4609     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4610     ramblock_dirty_bitmap_clear_discarded_pages(block);
4611 
4612     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4613     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4614 
4615     /*
4616      * We succeeded to sync bitmap for current ramblock. If this is
4617      * the last one to sync, we need to notify the main send thread.
4618      */
4619     ram_dirty_bitmap_reload_notify(s);
4620 
4621     ret = 0;
4622 out:
4623     g_free(le_bitmap);
4624     return ret;
4625 }
4626 
4627 static int ram_resume_prepare(MigrationState *s, void *opaque)
4628 {
4629     RAMState *rs = *(RAMState **)opaque;
4630     int ret;
4631 
4632     ret = ram_dirty_bitmap_sync_all(s, rs);
4633     if (ret) {
4634         return ret;
4635     }
4636 
4637     ram_state_resume_prepare(rs, s->to_dst_file);
4638 
4639     return 0;
4640 }
4641 
4642 void postcopy_preempt_shutdown_file(MigrationState *s)
4643 {
4644     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4645     qemu_fflush(s->postcopy_qemufile_src);
4646 }
4647 
4648 static SaveVMHandlers savevm_ram_handlers = {
4649     .save_setup = ram_save_setup,
4650     .save_live_iterate = ram_save_iterate,
4651     .save_live_complete_postcopy = ram_save_complete,
4652     .save_live_complete_precopy = ram_save_complete,
4653     .has_postcopy = ram_has_postcopy,
4654     .state_pending_exact = ram_state_pending_exact,
4655     .state_pending_estimate = ram_state_pending_estimate,
4656     .load_state = ram_load,
4657     .save_cleanup = ram_save_cleanup,
4658     .load_setup = ram_load_setup,
4659     .load_cleanup = ram_load_cleanup,
4660     .resume_prepare = ram_resume_prepare,
4661 };
4662 
4663 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4664                                       size_t old_size, size_t new_size)
4665 {
4666     PostcopyState ps = postcopy_state_get();
4667     ram_addr_t offset;
4668     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4669     Error *err = NULL;
4670 
4671     if (ramblock_is_ignored(rb)) {
4672         return;
4673     }
4674 
4675     if (!migration_is_idle()) {
4676         /*
4677          * Precopy code on the source cannot deal with the size of RAM blocks
4678          * changing at random points in time - especially after sending the
4679          * RAM block sizes in the migration stream, they must no longer change.
4680          * Abort and indicate a proper reason.
4681          */
4682         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4683         migration_cancel(err);
4684         error_free(err);
4685     }
4686 
4687     switch (ps) {
4688     case POSTCOPY_INCOMING_ADVISE:
4689         /*
4690          * Update what ram_postcopy_incoming_init()->init_range() does at the
4691          * time postcopy was advised. Syncing RAM blocks with the source will
4692          * result in RAM resizes.
4693          */
4694         if (old_size < new_size) {
4695             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4696                 error_report("RAM block '%s' discard of resized RAM failed",
4697                              rb->idstr);
4698             }
4699         }
4700         rb->postcopy_length = new_size;
4701         break;
4702     case POSTCOPY_INCOMING_NONE:
4703     case POSTCOPY_INCOMING_RUNNING:
4704     case POSTCOPY_INCOMING_END:
4705         /*
4706          * Once our guest is running, postcopy does no longer care about
4707          * resizes. When growing, the new memory was not available on the
4708          * source, no handler needed.
4709          */
4710         break;
4711     default:
4712         error_report("RAM block '%s' resized during postcopy state: %d",
4713                      rb->idstr, ps);
4714         exit(-1);
4715     }
4716 }
4717 
4718 static RAMBlockNotifier ram_mig_ram_notifier = {
4719     .ram_block_resized = ram_mig_ram_block_resized,
4720 };
4721 
4722 void ram_mig_init(void)
4723 {
4724     qemu_mutex_init(&XBZRLE.lock);
4725     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4726     ram_block_notifier_add(&ram_mig_ram_notifier);
4727 }
4728