xref: /openbmc/qemu/migration/ram.c (revision 2a8ec38082f8098f2693bb3632175453c0c84a51)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 #include "options.h"
61 
62 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 
64 #if defined(__linux__)
65 #include "qemu/userfaultfd.h"
66 #endif /* defined(__linux__) */
67 
68 /***********************************************************/
69 /* ram save/restore */
70 
71 /*
72  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
73  * worked for pages that were filled with the same char.  We switched
74  * it to only search for the zero value, and renamed it to avoid
75  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
76  */
77 /*
78  * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now.
79  */
80 #define RAM_SAVE_FLAG_FULL     0x01
81 #define RAM_SAVE_FLAG_ZERO     0x02
82 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
83 #define RAM_SAVE_FLAG_PAGE     0x08
84 #define RAM_SAVE_FLAG_EOS      0x10
85 #define RAM_SAVE_FLAG_CONTINUE 0x20
86 #define RAM_SAVE_FLAG_XBZRLE   0x40
87 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
88 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
89 /* We can't use any flag that is bigger than 0x200 */
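/*
 * These flags are OR'ed into the low bits of the page-aligned offset before
 * it is written to the stream (see save_page_header() and
 * save_zero_page_to_file() below), which is presumably why no flag may be
 * bigger than 0x200: the flags have to stay below the smallest target page
 * size so that offset and flags can be split apart again on the receive
 * side.  Illustrative example (address assumed): a zero page at block
 * offset 0x42000 goes on the wire as the be64 value
 * 0x42000 | RAM_SAVE_FLAG_ZERO == 0x42002.
 */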
90 
91 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
92      uint8_t *, int) = xbzrle_encode_buffer;
93 #if defined(CONFIG_AVX512BW_OPT)
94 #include "qemu/cpuid.h"
95 static void __attribute__((constructor)) init_cpu_flag(void)
96 {
97     unsigned max = __get_cpuid_max(0, NULL);
98     int a, b, c, d;
99     if (max >= 1) {
100         __cpuid(1, a, b, c, d);
101         /* We must check that AVX is not just available, but usable.  */
102         if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
103             int bv;
104             __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
105             __cpuid_count(7, 0, a, b, c, d);
106            /* 0xe6:
107             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
108             *                    and ZMM16-ZMM31 state are enabled by OS)
109             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
110             */
111             if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
112                 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
113             }
114         }
115     }
116 }
117 #endif
118 
119 XBZRLECacheStats xbzrle_counters;
120 
121 /* used by the search for pages to send */
122 struct PageSearchStatus {
123     /* The migration channel used for a specific host page */
124     QEMUFile    *pss_channel;
125     /* Last block from where we have sent data */
126     RAMBlock *last_sent_block;
127     /* Current block being searched */
128     RAMBlock    *block;
129     /* Current page to search from */
130     unsigned long page;
131     /* Set once we wrap around */
132     bool         complete_round;
133     /* Whether we're sending a host page */
134     bool          host_page_sending;
135     /* The start/end of current host page.  Invalid if host_page_sending==false */
136     unsigned long host_page_start;
137     unsigned long host_page_end;
138 };
139 typedef struct PageSearchStatus PageSearchStatus;
140 
141 /* struct contains XBZRLE cache and a static page
142    used by the compression */
143 static struct {
144     /* buffer used for XBZRLE encoding */
145     uint8_t *encoded_buf;
146     /* buffer for storing page content */
147     uint8_t *current_buf;
148     /* Cache for XBZRLE, Protected by lock. */
149     PageCache *cache;
150     QemuMutex lock;
151     /* it will store a page full of zeros */
152     uint8_t *zero_target_page;
153     /* buffer used for XBZRLE decoding */
154     uint8_t *decoded_buf;
155 } XBZRLE;
156 
157 static void XBZRLE_cache_lock(void)
158 {
159     if (migrate_xbzrle()) {
160         qemu_mutex_lock(&XBZRLE.lock);
161     }
162 }
163 
164 static void XBZRLE_cache_unlock(void)
165 {
166     if (migrate_xbzrle()) {
167         qemu_mutex_unlock(&XBZRLE.lock);
168     }
169 }
170 
171 /**
172  * xbzrle_cache_resize: resize the xbzrle cache
173  *
174  * This function is called from migrate_params_apply in the main
175  * thread, possibly while a migration is in progress.  A running
176  * migration may be using the cache and might finish during this call,
177  * hence changes to the cache are protected by the XBZRLE.lock mutex.
178  *
179  * Returns 0 for success or -1 for error
180  *
181  * @new_size: new cache size
182  * @errp: set *errp with the reason if the check fails
183  */
184 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
185 {
186     PageCache *new_cache;
187     int64_t ret = 0;
188 
189     /* Check for truncation */
190     if (new_size != (size_t)new_size) {
191         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
192                    "exceeding address space");
193         return -1;
194     }
195 
196     if (new_size == migrate_xbzrle_cache_size()) {
197         /* nothing to do */
198         return 0;
199     }
200 
201     XBZRLE_cache_lock();
202 
203     if (XBZRLE.cache != NULL) {
204         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
205         if (!new_cache) {
206             ret = -1;
207             goto out;
208         }
209 
210         cache_fini(XBZRLE.cache);
211         XBZRLE.cache = new_cache;
212     }
213 out:
214     XBZRLE_cache_unlock();
215     return ret;
216 }
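
/*
 * Illustrative example of the truncation check above (sizes assumed): with
 * a 32-bit size_t, a requested cache size of 5 GiB (0x140000000) casts to
 * 0x40000000, so new_size != (size_t)new_size catches the overflow and the
 * resize fails with "exceeding address space" instead of silently shrinking
 * the cache.
 */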
217 
218 static bool postcopy_preempt_active(void)
219 {
220     return migrate_postcopy_preempt() && migration_in_postcopy();
221 }
222 
223 bool ramblock_is_ignored(RAMBlock *block)
224 {
225     return !qemu_ram_is_migratable(block) ||
226            (migrate_ignore_shared() && qemu_ram_is_shared(block));
227 }
228 
229 #undef RAMBLOCK_FOREACH
230 
231 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
232 {
233     RAMBlock *block;
234     int ret = 0;
235 
236     RCU_READ_LOCK_GUARD();
237 
238     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
239         ret = func(block, opaque);
240         if (ret) {
241             break;
242         }
243     }
244     return ret;
245 }
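
/*
 * Hypothetical usage sketch for the iterator above (names invented for
 * illustration): sum the used length of every migratable block.
 *
 *     static int count_block_bytes(RAMBlock *rb, void *opaque)
 *     {
 *         *(uint64_t *)opaque += rb->used_length;
 *         return 0;    // zero keeps iterating, non-zero stops the loop
 *     }
 *
 *     uint64_t total = 0;
 *     foreach_not_ignored_block(count_block_bytes, &total);
 */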
246 
247 static void ramblock_recv_map_init(void)
248 {
249     RAMBlock *rb;
250 
251     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
252         assert(!rb->receivedmap);
253         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
254     }
255 }
256 
257 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
258 {
259     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
260                     rb->receivedmap);
261 }
262 
263 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
264 {
265     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
266 }
267 
268 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
269 {
270     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
271 }
272 
273 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
274                                     size_t nr)
275 {
276     bitmap_set_atomic(rb->receivedmap,
277                       ramblock_recv_bitmap_offset(host_addr, rb),
278                       nr);
279 }
280 
281 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
282 
283 /*
284  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
285  *
286  * Returns the number of bytes sent (>0) on success, or <0 on error.
287  */
288 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
289                                   const char *block_name)
290 {
291     RAMBlock *block = qemu_ram_block_by_name(block_name);
292     unsigned long *le_bitmap, nbits;
293     uint64_t size;
294 
295     if (!block) {
296         error_report("%s: invalid block name: %s", __func__, block_name);
297         return -1;
298     }
299 
300     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
301 
302     /*
303      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
304      * machines we may need 4 more bytes for padding (see below
305      * comment). So extend it a bit beforehand.
306      */
307     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
308 
309     /*
310      * Always use little endian when sending the bitmap. This is
311      * required so that it works even when the source and destination
312      * VMs are not using the same endianness. (Note: big endian won't work.)
313      */
314     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
315 
316     /* Size of the bitmap, in bytes */
317     size = DIV_ROUND_UP(nbits, 8);
318 
319     /*
320      * size is always aligned to 8 bytes for 64bit machines, but it
321      * may not be true for 32bit machines. We need this padding to
322      * make sure the migration can survive even between 32bit and
323      * 64bit machines.
324      */
325     size = ROUND_UP(size, 8);
326 
327     qemu_put_be64(file, size);
328     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
329     /*
330      * Mark as an end, in case the middle part is screwed up due to
331      * some "mysterious" reason.
332      */
333     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
334     qemu_fflush(file);
335 
336     g_free(le_bitmap);
337 
338     if (qemu_file_get_error(file)) {
339         return qemu_file_get_error(file);
340     }
341 
342     return size + sizeof(size);
343 }
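
/*
 * Resulting wire layout (a sketch, taken from the qemu_put_* calls above):
 *
 *   be64   size                    bitmap size in bytes, rounded up to 8
 *   size bytes of le_bitmap        receivedmap in little-endian bit order
 *   be64   RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
 *
 * On success the function reports size + sizeof(size) bytes to the caller.
 */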
344 
345 /*
346  * An outstanding page request, on the source, having been received
347  * and queued
348  */
349 struct RAMSrcPageRequest {
350     RAMBlock *rb;
351     hwaddr    offset;
352     hwaddr    len;
353 
354     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
355 };
356 
357 /* State of RAM for migration */
358 struct RAMState {
359     /*
360      * PageSearchStatus structures for the channels when sending pages.
361      * Protected by the bitmap_mutex.
362      */
363     PageSearchStatus pss[RAM_CHANNEL_MAX];
364     /* UFFD file descriptor, used in 'write-tracking' migration */
365     int uffdio_fd;
366     /* total ram size in bytes */
367     uint64_t ram_bytes_total;
368     /* Last block that we have visited searching for dirty pages */
369     RAMBlock *last_seen_block;
370     /* Last dirty target page we have sent */
371     ram_addr_t last_page;
372     /* last ram version we have seen */
373     uint32_t last_version;
374     /* How many times we have had too many dirty pages */
375     int dirty_rate_high_cnt;
376     /* these variables are used for bitmap sync */
377     /* last time we did a full bitmap_sync */
378     int64_t time_last_bitmap_sync;
379     /* bytes transferred at start_time */
380     uint64_t bytes_xfer_prev;
381     /* number of dirty pages since start_time */
382     uint64_t num_dirty_pages_period;
383     /* xbzrle misses since the beginning of the period */
384     uint64_t xbzrle_cache_miss_prev;
385     /* Amount of xbzrle pages since the beginning of the period */
386     uint64_t xbzrle_pages_prev;
387     /* Amount of xbzrle encoded bytes since the beginning of the period */
388     uint64_t xbzrle_bytes_prev;
389     /* Start using XBZRLE (e.g., after the first round). */
390     bool xbzrle_enabled;
391     /* Are we on the last stage of migration */
392     bool last_stage;
393     /* compression statistics since the beginning of the period */
394     /* number of times there was no free thread to compress data */
395     uint64_t compress_thread_busy_prev;
396     /* amount of bytes after compression */
397     uint64_t compressed_size_prev;
398     /* amount of compressed pages */
399     uint64_t compress_pages_prev;
400 
401     /* total handled target pages at the beginning of period */
402     uint64_t target_page_count_prev;
403     /* total handled target pages since start */
404     uint64_t target_page_count;
405     /* number of dirty bits in the bitmap */
406     uint64_t migration_dirty_pages;
407     /*
408      * Protects:
409      * - dirty/clear bitmap
410      * - migration_dirty_pages
411      * - pss structures
412      */
413     QemuMutex bitmap_mutex;
414     /* The RAMBlock used in the last src_page_requests */
415     RAMBlock *last_req_rb;
416     /* Queue of outstanding page requests from the destination */
417     QemuMutex src_page_req_mutex;
418     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
419 };
420 typedef struct RAMState RAMState;
421 
422 static RAMState *ram_state;
423 
424 static NotifierWithReturnList precopy_notifier_list;
425 
426 /* Whether postcopy has queued requests? */
427 static bool postcopy_has_request(RAMState *rs)
428 {
429     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
430 }
431 
432 void precopy_infrastructure_init(void)
433 {
434     notifier_with_return_list_init(&precopy_notifier_list);
435 }
436 
437 void precopy_add_notifier(NotifierWithReturn *n)
438 {
439     notifier_with_return_list_add(&precopy_notifier_list, n);
440 }
441 
442 void precopy_remove_notifier(NotifierWithReturn *n)
443 {
444     notifier_with_return_remove(n);
445 }
446 
447 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
448 {
449     PrecopyNotifyData pnd;
450     pnd.reason = reason;
451     pnd.errp = errp;
452 
453     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
454 }
455 
456 uint64_t ram_bytes_remaining(void)
457 {
458     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
459                        0;
460 }
461 
462 RAMStats ram_counters;
463 
464 void ram_transferred_add(uint64_t bytes)
465 {
466     if (runstate_is_running()) {
467         stat64_add(&ram_counters.precopy_bytes, bytes);
468     } else if (migration_in_postcopy()) {
469         stat64_add(&ram_counters.postcopy_bytes, bytes);
470     } else {
471         stat64_add(&ram_counters.downtime_bytes, bytes);
472     }
473     stat64_add(&ram_counters.transferred, bytes);
474 }
475 
476 struct MigrationOps {
477     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
478 };
479 typedef struct MigrationOps MigrationOps;
480 
481 MigrationOps *migration_ops;
482 
483 CompressionStats compression_counters;
484 
485 struct CompressParam {
486     bool done;
487     bool quit;
488     bool zero_page;
489     QEMUFile *file;
490     QemuMutex mutex;
491     QemuCond cond;
492     RAMBlock *block;
493     ram_addr_t offset;
494 
495     /* internally used fields */
496     z_stream stream;
497     uint8_t *originbuf;
498 };
499 typedef struct CompressParam CompressParam;
500 
501 struct DecompressParam {
502     bool done;
503     bool quit;
504     QemuMutex mutex;
505     QemuCond cond;
506     void *des;
507     uint8_t *compbuf;
508     int len;
509     z_stream stream;
510 };
511 typedef struct DecompressParam DecompressParam;
512 
513 static CompressParam *comp_param;
514 static QemuThread *compress_threads;
515 /* comp_done_cond is used to wake up the migration thread when
516  * one of the compression threads has finished the compression.
517  * comp_done_lock is used together with comp_done_cond.
518  */
519 static QemuMutex comp_done_lock;
520 static QemuCond comp_done_cond;
521 
522 static QEMUFile *decomp_file;
523 static DecompressParam *decomp_param;
524 static QemuThread *decompress_threads;
525 static QemuMutex decomp_done_lock;
526 static QemuCond decomp_done_cond;
527 
528 static int ram_save_host_page_urgent(PageSearchStatus *pss);
529 
530 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
531                                  ram_addr_t offset, uint8_t *source_buf);
532 
533 /* NOTE: page is the PFN, not a real ram_addr_t. */
534 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
535 {
536     pss->block = rb;
537     pss->page = page;
538     pss->complete_round = false;
539 }
540 
541 /*
542  * Check whether two PSSs are actively sending the same page.  Return true
543  * if it is, false otherwise.
544  */
545 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
546 {
547     return pss1->host_page_sending && pss2->host_page_sending &&
548         (pss1->host_page_start == pss2->host_page_start);
549 }
550 
551 static void *do_data_compress(void *opaque)
552 {
553     CompressParam *param = opaque;
554     RAMBlock *block;
555     ram_addr_t offset;
556     bool zero_page;
557 
558     qemu_mutex_lock(&param->mutex);
559     while (!param->quit) {
560         if (param->block) {
561             block = param->block;
562             offset = param->offset;
563             param->block = NULL;
564             qemu_mutex_unlock(&param->mutex);
565 
566             zero_page = do_compress_ram_page(param->file, &param->stream,
567                                              block, offset, param->originbuf);
568 
569             qemu_mutex_lock(&comp_done_lock);
570             param->done = true;
571             param->zero_page = zero_page;
572             qemu_cond_signal(&comp_done_cond);
573             qemu_mutex_unlock(&comp_done_lock);
574 
575             qemu_mutex_lock(&param->mutex);
576         } else {
577             qemu_cond_wait(&param->cond, &param->mutex);
578         }
579     }
580     qemu_mutex_unlock(&param->mutex);
581 
582     return NULL;
583 }
584 
585 static void compress_threads_save_cleanup(void)
586 {
587     int i, thread_count;
588 
589     if (!migrate_compress() || !comp_param) {
590         return;
591     }
592 
593     thread_count = migrate_compress_threads();
594     for (i = 0; i < thread_count; i++) {
595         /*
596          * we use it as an indicator of whether the thread has been
597          * properly initialized or not
598          */
599         if (!comp_param[i].file) {
600             break;
601         }
602 
603         qemu_mutex_lock(&comp_param[i].mutex);
604         comp_param[i].quit = true;
605         qemu_cond_signal(&comp_param[i].cond);
606         qemu_mutex_unlock(&comp_param[i].mutex);
607 
608         qemu_thread_join(compress_threads + i);
609         qemu_mutex_destroy(&comp_param[i].mutex);
610         qemu_cond_destroy(&comp_param[i].cond);
611         deflateEnd(&comp_param[i].stream);
612         g_free(comp_param[i].originbuf);
613         qemu_fclose(comp_param[i].file);
614         comp_param[i].file = NULL;
615     }
616     qemu_mutex_destroy(&comp_done_lock);
617     qemu_cond_destroy(&comp_done_cond);
618     g_free(compress_threads);
619     g_free(comp_param);
620     compress_threads = NULL;
621     comp_param = NULL;
622 }
623 
624 static int compress_threads_save_setup(void)
625 {
626     int i, thread_count;
627 
628     if (!migrate_compress()) {
629         return 0;
630     }
631     thread_count = migrate_compress_threads();
632     compress_threads = g_new0(QemuThread, thread_count);
633     comp_param = g_new0(CompressParam, thread_count);
634     qemu_cond_init(&comp_done_cond);
635     qemu_mutex_init(&comp_done_lock);
636     for (i = 0; i < thread_count; i++) {
637         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
638         if (!comp_param[i].originbuf) {
639             goto exit;
640         }
641 
642         if (deflateInit(&comp_param[i].stream,
643                         migrate_compress_level()) != Z_OK) {
644             g_free(comp_param[i].originbuf);
645             goto exit;
646         }
647 
648         /* comp_param[i].file is just used as a dummy buffer to save data,
649          * set its ops to empty.
650          */
651         comp_param[i].file = qemu_file_new_output(
652             QIO_CHANNEL(qio_channel_null_new()));
653         comp_param[i].done = true;
654         comp_param[i].quit = false;
655         qemu_mutex_init(&comp_param[i].mutex);
656         qemu_cond_init(&comp_param[i].cond);
657         qemu_thread_create(compress_threads + i, "compress",
658                            do_data_compress, comp_param + i,
659                            QEMU_THREAD_JOINABLE);
660     }
661     return 0;
662 
663 exit:
664     compress_threads_save_cleanup();
665     return -1;
666 }
667 
668 /**
669  * save_page_header: write page header to wire
670  *
671  * If the block differs from the last one sent, it also writes the block identification
672  *
673  * Returns the number of bytes written
674  *
675  * @pss: current PSS channel status
676  * @block: block that contains the page we want to send
677  * @offset: offset inside the block for the page;
678  *          the lower bits contain flags
679  */
680 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
681                                RAMBlock *block, ram_addr_t offset)
682 {
683     size_t size, len;
684     bool same_block = (block == pss->last_sent_block);
685 
686     if (same_block) {
687         offset |= RAM_SAVE_FLAG_CONTINUE;
688     }
689     qemu_put_be64(f, offset);
690     size = 8;
691 
692     if (!same_block) {
693         len = strlen(block->idstr);
694         qemu_put_byte(f, len);
695         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
696         size += 1 + len;
697         pss->last_sent_block = block;
698     }
699     return size;
700 }
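
/*
 * Resulting header layout (a sketch, taken from the qemu_put_* calls above):
 *
 *   be64   offset | flags          RAM_SAVE_FLAG_CONTINUE is set when the
 *                                  block is the same as the last one sent
 *   u8     len                 \   only when the block changed: the length
 *   len bytes of block->idstr  /   of idstr plus the (not NUL-terminated) name
 *
 * So the size returned is either 8 or 8 + 1 + strlen(block->idstr).
 */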
701 
702 /**
703  * mig_throttle_guest_down: throttle down the guest
704  *
705  * Reduce amount of guest cpu execution to hopefully slow down memory
706  * writes. If guest dirty memory rate is reduced below the rate at
707  * which we can transfer pages to the destination then we should be
708  * able to complete migration. Some workloads dirty memory way too
709  * fast and will not effectively converge, even with auto-converge.
710  */
711 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
712                                     uint64_t bytes_dirty_threshold)
713 {
714     MigrationState *s = migrate_get_current();
715     uint64_t pct_initial = migrate_cpu_throttle_initial();
716     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
717     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
718     int pct_max = migrate_max_cpu_throttle();
719 
720     uint64_t throttle_now = cpu_throttle_get_percentage();
721     uint64_t cpu_now, cpu_ideal, throttle_inc;
722 
723     /* We have not started throttling yet. Let's start it. */
724     if (!cpu_throttle_active()) {
725         cpu_throttle_set(pct_initial);
726     } else {
727         /* Throttling already on, just increase the rate */
728         if (!pct_tailslow) {
729             throttle_inc = pct_increment;
730         } else {
731             /* Compute the ideal CPU percentage used by Guest, which may
732              * make the dirty rate match the dirty rate threshold. */
733             cpu_now = 100 - throttle_now;
734             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
735                         bytes_dirty_period);
736             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
737         }
738         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
739     }
740 }
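
/*
 * Worked example of the tailslow path above (numbers assumed): with the
 * throttle currently at 20%, the guest gets cpu_now = 80% of the CPU.  If it
 * dirtied twice as much as we can transfer (bytes_dirty_period ==
 * 2 * bytes_dirty_threshold), the ideal share is cpu_ideal = 80 * 0.5 = 40%,
 * so throttle_inc = MIN(80 - 40, pct_increment).  With an increment of,
 * say, 10 the new throttle becomes MIN(20 + 10, pct_max) = 30%.
 */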
741 
742 void mig_throttle_counter_reset(void)
743 {
744     RAMState *rs = ram_state;
745 
746     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
747     rs->num_dirty_pages_period = 0;
748     rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
749 }
750 
751 /**
752  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
753  *
754  * @rs: current RAM state
755  * @current_addr: address for the zero page
756  *
757  * Update the xbzrle cache to reflect a page that's been sent as all 0.
758  * The important thing is that a stale (not-yet-0'd) page be replaced
759  * by the new data.
760  * As a bonus, if the page wasn't in the cache it gets added so that
761  * when a small write is made into the 0'd page it gets XBZRLE sent.
762  */
763 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
764 {
765     /* We don't care if this fails to allocate a new cache page
766      * as long as it updated an old one */
767     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
768                  stat64_get(&ram_counters.dirty_sync_count));
769 }
770 
771 #define ENCODING_FLAG_XBZRLE 0x1
772 
773 /**
774  * save_xbzrle_page: compress and send current page
775  *
776  * Returns: 1 means that we wrote the page
777  *          0 means that page is identical to the one already sent
778  *          -1 means that xbzrle would be longer than normal
779  *
780  * @rs: current RAM state
781  * @pss: current PSS channel
782  * @current_data: pointer to the address of the page contents
783  * @current_addr: addr of the page
784  * @block: block that contains the page we want to send
785  * @offset: offset inside the block for the page
786  */
787 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
788                             uint8_t **current_data, ram_addr_t current_addr,
789                             RAMBlock *block, ram_addr_t offset)
790 {
791     int encoded_len = 0, bytes_xbzrle;
792     uint8_t *prev_cached_page;
793     QEMUFile *file = pss->pss_channel;
794     uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
795 
796     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
797         xbzrle_counters.cache_miss++;
798         if (!rs->last_stage) {
799             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
800                              generation) == -1) {
801                 return -1;
802             } else {
803                 /* update *current_data when the page has been
804                    inserted into cache */
805                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
806             }
807         }
808         return -1;
809     }
810 
811     /*
812      * Reaching here means the page has hit the xbzrle cache, no matter what
813      * encoding result it is (normal encoding, overflow or skipping the page),
814      * count the page as encoded. This is used to calculate the encoding rate.
815      *
816      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
817      * 2nd page turns out to be skipped (i.e. no new bytes written to the
818      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
819      * skipped page included. In this way, the encoding rate can tell if the
820      * guest page is good for xbzrle encoding.
821      */
822     xbzrle_counters.pages++;
823     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
824 
825     /* save current buffer into memory */
826     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
827 
828     /* XBZRLE encoding (if there is no overflow) */
829     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
830                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
831                                             TARGET_PAGE_SIZE);
832 
833     /*
834      * Update the cache contents, so that it corresponds to the data
835      * sent, in all cases except where we skip the page.
836      */
837     if (!rs->last_stage && encoded_len != 0) {
838         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
839         /*
840          * In the case where we couldn't compress, ensure that the caller
841          * sends the data from the cache, since the guest might have
842          * changed the RAM since we copied it.
843          */
844         *current_data = prev_cached_page;
845     }
846 
847     if (encoded_len == 0) {
848         trace_save_xbzrle_page_skipping();
849         return 0;
850     } else if (encoded_len == -1) {
851         trace_save_xbzrle_page_overflow();
852         xbzrle_counters.overflow++;
853         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
854         return -1;
855     }
856 
857     /* Send XBZRLE based compressed page */
858     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
859                                     offset | RAM_SAVE_FLAG_XBZRLE);
860     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
861     qemu_put_be16(file, encoded_len);
862     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
863     bytes_xbzrle += encoded_len + 1 + 2;
864     /*
865      * Like compressed_size (please see update_compress_thread_counts),
866      * the xbzrle encoded bytes don't count the 8 byte header with
867      * RAM_SAVE_FLAG_CONTINUE.
868      */
869     xbzrle_counters.bytes += bytes_xbzrle - 8;
870     ram_transferred_add(bytes_xbzrle);
871 
872     return 1;
873 }
874 
875 /**
876  * pss_find_next_dirty: find the next dirty page of current ramblock
877  *
878  * This function updates pss->page to point to the next dirty page index
879  * within the ramblock to migrate, or the end of ramblock when nothing
880  * found.  Note that when pss->host_page_sending==true it means we're
881  * in the middle of sending a host page, so we won't look for dirty pages
882  * that are outside the host page boundary.
883  *
884  * @pss: the current page search status
885  */
886 static void pss_find_next_dirty(PageSearchStatus *pss)
887 {
888     RAMBlock *rb = pss->block;
889     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
890     unsigned long *bitmap = rb->bmap;
891 
892     if (ramblock_is_ignored(rb)) {
893         /* Points directly to the end, so we know no dirty page */
894         pss->page = size;
895         return;
896     }
897 
898     /*
899      * If during sending a host page, only look for dirty pages within the
900      * current host page being sent.
901      */
902     if (pss->host_page_sending) {
903         assert(pss->host_page_end);
904         size = MIN(size, pss->host_page_end);
905     }
906 
907     pss->page = find_next_bit(bitmap, size, pss->page);
908 }
909 
910 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
911                                                        unsigned long page)
912 {
913     uint8_t shift;
914     hwaddr size, start;
915 
916     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
917         return;
918     }
919 
920     shift = rb->clear_bmap_shift;
921     /*
922      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
923      * can make things easier sometimes since then the start address
924      * of the small chunk will always be aligned to 64 pages, so the
925      * bitmap will always be aligned to unsigned long. We should
926      * even be able to remove this restriction but I'm simply
927      * keeping it.
928      */
929     assert(shift >= 6);
930 
931     size = 1ULL << (TARGET_PAGE_BITS + shift);
932     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
933     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
934     memory_region_clear_dirty_bitmap(rb->mr, start, size);
935 }
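
/*
 * Size example for the chunk cleared above (page size and shift assumed):
 * with 4 KiB target pages (TARGET_PAGE_BITS == 12) and clear_bmap_shift ==
 * 18, one clear_bmap bit covers 1ULL << (12 + 18) bytes == 1 GiB, so the
 * first page sent from a chunk triggers a dirty-log clear of the whole
 * 1 GiB-aligned chunk that contains it.
 */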
936 
937 static void
938 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
939                                                  unsigned long start,
940                                                  unsigned long npages)
941 {
942     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
943     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
944     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
945 
946     /*
947      * Clear pages from start to start + npages - 1, so the end boundary is
948      * exclusive.
949      */
950     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
951         migration_clear_memory_region_dirty_bitmap(rb, i);
952     }
953 }
954 
955 /*
956  * colo_bitmap_find_dirty: find contiguous dirty pages from start
957  *
958  * Returns the page offset within the memory region of the start of the
959  * contiguous dirty pages
960  *
961  * @rs: current RAM state
962  * @rb: RAMBlock where to search for dirty pages
963  * @start: page where we start the search
964  * @num: used to return the number of contiguous dirty pages
965  */
966 static inline
967 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
968                                      unsigned long start, unsigned long *num)
969 {
970     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
971     unsigned long *bitmap = rb->bmap;
972     unsigned long first, next;
973 
974     *num = 0;
975 
976     if (ramblock_is_ignored(rb)) {
977         return size;
978     }
979 
980     first = find_next_bit(bitmap, size, start);
981     if (first >= size) {
982         return first;
983     }
984     next = find_next_zero_bit(bitmap, size, first + 1);
985     assert(next >= first);
986     *num = next - first;
987     return first;
988 }
989 
990 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
991                                                 RAMBlock *rb,
992                                                 unsigned long page)
993 {
994     bool ret;
995 
996     /*
997      * Clear dirty bitmap if needed.  This _must_ be called before we
998      * send any of the pages in the chunk, because we need to make sure
999      * we can capture further page content changes when we sync the dirty
1000      * log the next time.  So as long as we are going to send any of
1001      * the pages in the chunk, we clear the remote dirty bitmap for all.
1002      * Clearing it earlier won't be a problem, but clearing it too late will.
1003      */
1004     migration_clear_memory_region_dirty_bitmap(rb, page);
1005 
1006     ret = test_and_clear_bit(page, rb->bmap);
1007     if (ret) {
1008         rs->migration_dirty_pages--;
1009     }
1010 
1011     return ret;
1012 }
1013 
1014 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1015                                        void *opaque)
1016 {
1017     const hwaddr offset = section->offset_within_region;
1018     const hwaddr size = int128_get64(section->size);
1019     const unsigned long start = offset >> TARGET_PAGE_BITS;
1020     const unsigned long npages = size >> TARGET_PAGE_BITS;
1021     RAMBlock *rb = section->mr->ram_block;
1022     uint64_t *cleared_bits = opaque;
1023 
1024     /*
1025      * We don't grab ram_state->bitmap_mutex because we expect to run
1026      * only when starting migration or during postcopy recovery where
1027      * we don't have concurrent access.
1028      */
1029     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1030         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1031     }
1032     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1033     bitmap_clear(rb->bmap, start, npages);
1034 }
1035 
1036 /*
1037  * Exclude all dirty pages from migration that fall into a discarded range as
1038  * managed by a RamDiscardManager responsible for the mapped memory region of
1039  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1040  *
1041  * Discarded pages ("logically unplugged") have undefined content and must
1042  * not get migrated, because even reading these pages for migration might
1043  * result in undesired behavior.
1044  *
1045  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1046  *
1047  * Note: The result is only stable while migrating (precopy/postcopy).
1048  */
1049 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1050 {
1051     uint64_t cleared_bits = 0;
1052 
1053     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1054         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1055         MemoryRegionSection section = {
1056             .mr = rb->mr,
1057             .offset_within_region = 0,
1058             .size = int128_make64(qemu_ram_get_used_length(rb)),
1059         };
1060 
1061         ram_discard_manager_replay_discarded(rdm, &section,
1062                                              dirty_bitmap_clear_section,
1063                                              &cleared_bits);
1064     }
1065     return cleared_bits;
1066 }
1067 
1068 /*
1069  * Check if a host-page aligned page falls into a discarded range as managed by
1070  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1071  *
1072  * Note: The result is only stable while migrating (precopy/postcopy).
1073  */
1074 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1075 {
1076     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1077         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1078         MemoryRegionSection section = {
1079             .mr = rb->mr,
1080             .offset_within_region = start,
1081             .size = int128_make64(qemu_ram_pagesize(rb)),
1082         };
1083 
1084         return !ram_discard_manager_is_populated(rdm, &section);
1085     }
1086     return false;
1087 }
1088 
1089 /* Called with RCU critical section */
1090 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1091 {
1092     uint64_t new_dirty_pages =
1093         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1094 
1095     rs->migration_dirty_pages += new_dirty_pages;
1096     rs->num_dirty_pages_period += new_dirty_pages;
1097 }
1098 
1099 /**
1100  * ram_pagesize_summary: calculate all the pagesizes of a VM
1101  *
1102  * Returns a summary bitmap of the page sizes of all RAMBlocks
1103  *
1104  * For VMs with just normal pages this is equivalent to the host page
1105  * size. If it's got some huge pages then it's the OR of all the
1106  * different page sizes.
1107  */
1108 uint64_t ram_pagesize_summary(void)
1109 {
1110     RAMBlock *block;
1111     uint64_t summary = 0;
1112 
1113     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1114         summary |= block->page_size;
1115     }
1116 
1117     return summary;
1118 }
1119 
1120 uint64_t ram_get_total_transferred_pages(void)
1121 {
1122     return stat64_get(&ram_counters.normal_pages) +
1123         stat64_get(&ram_counters.zero_pages) +
1124         compression_counters.pages + xbzrle_counters.pages;
1125 }
1126 
1127 static void migration_update_rates(RAMState *rs, int64_t end_time)
1128 {
1129     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1130     double compressed_size;
1131 
1132     /* calculate period counters */
1133     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1134                 / (end_time - rs->time_last_bitmap_sync);
1135 
1136     if (!page_count) {
1137         return;
1138     }
1139 
1140     if (migrate_xbzrle()) {
1141         double encoded_size, unencoded_size;
1142 
1143         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1144             rs->xbzrle_cache_miss_prev) / page_count;
1145         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1146         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1147                          TARGET_PAGE_SIZE;
1148         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1149         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1150             xbzrle_counters.encoding_rate = 0;
1151         } else {
1152             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1153         }
1154         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1155         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1156     }
1157 
1158     if (migrate_compress()) {
1159         compression_counters.busy_rate = (double)(compression_counters.busy -
1160             rs->compress_thread_busy_prev) / page_count;
1161         rs->compress_thread_busy_prev = compression_counters.busy;
1162 
1163         compressed_size = compression_counters.compressed_size -
1164                           rs->compressed_size_prev;
1165         if (compressed_size) {
1166             double uncompressed_size = (compression_counters.pages -
1167                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1168 
1169             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1170             compression_counters.compression_rate =
1171                                         uncompressed_size / compressed_size;
1172 
1173             rs->compress_pages_prev = compression_counters.pages;
1174             rs->compressed_size_prev = compression_counters.compressed_size;
1175         }
1176     }
1177 }
1178 
1179 static void migration_trigger_throttle(RAMState *rs)
1180 {
1181     uint64_t threshold = migrate_throttle_trigger_threshold();
1182     uint64_t bytes_xfer_period =
1183         stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
1184     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1185     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1186 
1187     /* During block migration the auto-converge logic incorrectly detects
1188      * that ram migration makes no progress. Avoid this by disabling the
1189      * throttling logic during the bulk phase of block migration. */
1190     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1191         /* The following detection logic can be refined later. For now:
1192            Check to see if the ratio between dirtied bytes and the approx.
1193            amount of bytes that just got transferred since the last time
1194            we were in this routine reaches the threshold. If that happens
1195            twice, start or increase throttling. */
1196 
1197         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1198             (++rs->dirty_rate_high_cnt >= 2)) {
1199             trace_migration_throttle();
1200             rs->dirty_rate_high_cnt = 0;
1201             mig_throttle_guest_down(bytes_dirty_period,
1202                                     bytes_dirty_threshold);
1203         }
1204     }
1205 }
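
/*
 * Worked example of the trigger above (numbers assumed): with a threshold of
 * 50 and 1 GiB transferred during the period, bytes_dirty_threshold is
 * 512 MiB.  A guest that dirtied 600 MiB in the same period exceeds it and
 * bumps dirty_rate_high_cnt; a second such period then calls
 * mig_throttle_guest_down() to start or tighten the auto-converge throttle.
 */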
1206 
1207 static void migration_bitmap_sync(RAMState *rs)
1208 {
1209     RAMBlock *block;
1210     int64_t end_time;
1211 
1212     stat64_add(&ram_counters.dirty_sync_count, 1);
1213 
1214     if (!rs->time_last_bitmap_sync) {
1215         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1216     }
1217 
1218     trace_migration_bitmap_sync_start();
1219     memory_global_dirty_log_sync();
1220 
1221     qemu_mutex_lock(&rs->bitmap_mutex);
1222     WITH_RCU_READ_LOCK_GUARD() {
1223         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1224             ramblock_sync_dirty_bitmap(rs, block);
1225         }
1226         ram_counters.remaining = ram_bytes_remaining();
1227     }
1228     qemu_mutex_unlock(&rs->bitmap_mutex);
1229 
1230     memory_global_after_dirty_log_sync();
1231     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1232 
1233     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1234 
1235     /* more than 1 second = 1000 milliseconds */
1236     if (end_time > rs->time_last_bitmap_sync + 1000) {
1237         migration_trigger_throttle(rs);
1238 
1239         migration_update_rates(rs, end_time);
1240 
1241         rs->target_page_count_prev = rs->target_page_count;
1242 
1243         /* reset period counters */
1244         rs->time_last_bitmap_sync = end_time;
1245         rs->num_dirty_pages_period = 0;
1246         rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
1247     }
1248     if (migrate_events()) {
1249         uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
1250         qapi_event_send_migration_pass(generation);
1251     }
1252 }
1253 
1254 static void migration_bitmap_sync_precopy(RAMState *rs)
1255 {
1256     Error *local_err = NULL;
1257 
1258     /*
1259      * The current notifier usage is just an optimization for migration, so we
1260      * don't stop the normal migration process in the error case.
1261      */
1262     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1263         error_report_err(local_err);
1264         local_err = NULL;
1265     }
1266 
1267     migration_bitmap_sync(rs);
1268 
1269     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1270         error_report_err(local_err);
1271     }
1272 }
1273 
1274 void ram_release_page(const char *rbname, uint64_t offset)
1275 {
1276     if (!migrate_release_ram() || !migration_in_postcopy()) {
1277         return;
1278     }
1279 
1280     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1281 }
1282 
1283 /**
1284  * save_zero_page_to_file: send the zero page to the file
1285  *
1286  * Returns the size of the data written to the file, or 0 if the page is not
1287  * a zero page
1288  *
1289  * @pss: current PSS channel
1290  * @block: block that contains the page we want to send
1291  * @offset: offset inside the block for the page
1292  */
1293 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1294                                   RAMBlock *block, ram_addr_t offset)
1295 {
1296     uint8_t *p = block->host + offset;
1297     int len = 0;
1298 
1299     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1300         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1301         qemu_put_byte(file, 0);
1302         len += 1;
1303         ram_release_page(block->idstr, offset);
1304     }
1305     return len;
1306 }
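
/*
 * Resulting wire layout for a zero page (a sketch, taken from the code
 * above): the usual save_page_header() bytes with RAM_SAVE_FLAG_ZERO set,
 * followed by a single 0x00 data byte, so len is the header size plus 1.
 * When the page is not entirely zero, nothing is written and 0 is returned.
 */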
1307 
1308 /**
1309  * save_zero_page: send the zero page to the stream
1310  *
1311  * Returns the number of pages written.
1312  *
1313  * @pss: current PSS channel
1314  * @block: block that contains the page we want to send
1315  * @offset: offset inside the block for the page
1316  */
1317 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1318                           ram_addr_t offset)
1319 {
1320     int len = save_zero_page_to_file(pss, f, block, offset);
1321 
1322     if (len) {
1323         stat64_add(&ram_counters.zero_pages, 1);
1324         ram_transferred_add(len);
1325         return 1;
1326     }
1327     return -1;
1328 }
1329 
1330 /*
1331  * @pages: the number of pages written by the control path,
1332  *        < 0 - error
1333  *        > 0 - number of pages written
1334  *
1335  * Return true if the page has been saved, otherwise false is returned.
1336  */
1337 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1338                               ram_addr_t offset, int *pages)
1339 {
1340     uint64_t bytes_xmit = 0;
1341     int ret;
1342 
1343     *pages = -1;
1344     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1345                                 TARGET_PAGE_SIZE, &bytes_xmit);
1346     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1347         return false;
1348     }
1349 
1350     if (bytes_xmit) {
1351         ram_transferred_add(bytes_xmit);
1352         *pages = 1;
1353     }
1354 
1355     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1356         return true;
1357     }
1358 
1359     if (bytes_xmit > 0) {
1360         stat64_add(&ram_counters.normal_pages, 1);
1361     } else if (bytes_xmit == 0) {
1362         stat64_add(&ram_counters.zero_pages, 1);
1363     }
1364 
1365     return true;
1366 }
1367 
1368 /*
1369  * directly send the page to the stream
1370  *
1371  * Returns the number of pages written.
1372  *
1373  * @pss: current PSS channel
1374  * @block: block that contains the page we want to send
1375  * @offset: offset inside the block for the page
1376  * @buf: the page to be sent
1377  * @async: send the page asynchronously
1378  */
1379 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1380                             ram_addr_t offset, uint8_t *buf, bool async)
1381 {
1382     QEMUFile *file = pss->pss_channel;
1383 
1384     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1385                                          offset | RAM_SAVE_FLAG_PAGE));
1386     if (async) {
1387         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1388                               migrate_release_ram() &&
1389                               migration_in_postcopy());
1390     } else {
1391         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1392     }
1393     ram_transferred_add(TARGET_PAGE_SIZE);
1394     stat64_add(&ram_counters.normal_pages, 1);
1395     return 1;
1396 }
1397 
1398 /**
1399  * ram_save_page: send the given page to the stream
1400  *
1401  * Returns the number of pages written.
1402  *          < 0 - error
1403  *          >=0 - Number of pages written - this might legally be 0
1404  *                if xbzrle noticed the page was the same.
1405  *
1406  * @rs: current RAM state
1407  * @block: block that contains the page we want to send
1408  * @offset: offset inside the block for the page
1409  */
1410 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1411 {
1412     int pages = -1;
1413     uint8_t *p;
1414     bool send_async = true;
1415     RAMBlock *block = pss->block;
1416     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1417     ram_addr_t current_addr = block->offset + offset;
1418 
1419     p = block->host + offset;
1420     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1421 
1422     XBZRLE_cache_lock();
1423     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1424         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1425                                  block, offset);
1426         if (!rs->last_stage) {
1427             /* Can't send this cached data async, since the cache page
1428              * might get updated before it gets to the wire
1429              */
1430             send_async = false;
1431         }
1432     }
1433 
1434     /* XBZRLE overflow or normal page */
1435     if (pages == -1) {
1436         pages = save_normal_page(pss, block, offset, p, send_async);
1437     }
1438 
1439     XBZRLE_cache_unlock();
1440 
1441     return pages;
1442 }
1443 
1444 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1445                                  ram_addr_t offset)
1446 {
1447     if (multifd_queue_page(file, block, offset) < 0) {
1448         return -1;
1449     }
1450     stat64_add(&ram_counters.normal_pages, 1);
1451 
1452     return 1;
1453 }
1454 
1455 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1456                                  ram_addr_t offset, uint8_t *source_buf)
1457 {
1458     RAMState *rs = ram_state;
1459     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1460     uint8_t *p = block->host + offset;
1461     int ret;
1462 
1463     if (save_zero_page_to_file(pss, f, block, offset)) {
1464         return true;
1465     }
1466 
1467     save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1468 
1469     /*
1470      * copy it to an internal buffer to avoid it being modified by the VM,
1471      * so that we can catch errors during compression and
1472      * decompression
1473      */
1474     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1475     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1476     if (ret < 0) {
1477         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1478         error_report("compressed data failed!");
1479     }
1480     return false;
1481 }
1482 
1483 static void
1484 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1485 {
1486     ram_transferred_add(bytes_xmit);
1487 
1488     if (param->zero_page) {
1489         stat64_add(&ram_counters.zero_pages, 1);
1490         return;
1491     }
1492 
1493     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1494     compression_counters.compressed_size += bytes_xmit - 8;
1495     compression_counters.pages++;
1496 }
1497 
1498 static bool save_page_use_compression(RAMState *rs);
1499 
1500 static void flush_compressed_data(RAMState *rs)
1501 {
1502     MigrationState *ms = migrate_get_current();
1503     int idx, len, thread_count;
1504 
1505     if (!save_page_use_compression(rs)) {
1506         return;
1507     }
1508     thread_count = migrate_compress_threads();
1509 
1510     qemu_mutex_lock(&comp_done_lock);
1511     for (idx = 0; idx < thread_count; idx++) {
1512         while (!comp_param[idx].done) {
1513             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1514         }
1515     }
1516     qemu_mutex_unlock(&comp_done_lock);
1517 
1518     for (idx = 0; idx < thread_count; idx++) {
1519         qemu_mutex_lock(&comp_param[idx].mutex);
1520         if (!comp_param[idx].quit) {
1521             len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1522             /*
1523              * it's safe to fetch zero_page without holding comp_done_lock
1524              * as there is no further request submitted to the thread,
1525              * i.e, the thread should be waiting for a request at this point.
1526              */
1527             update_compress_thread_counts(&comp_param[idx], len);
1528         }
1529         qemu_mutex_unlock(&comp_param[idx].mutex);
1530     }
1531 }
1532 
1533 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1534                                        ram_addr_t offset)
1535 {
1536     param->block = block;
1537     param->offset = offset;
1538 }
1539 
1540 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1541 {
1542     int idx, thread_count, bytes_xmit = -1, pages = -1;
1543     bool wait = migrate_compress_wait_thread();
1544     MigrationState *ms = migrate_get_current();
1545 
1546     thread_count = migrate_compress_threads();
1547     qemu_mutex_lock(&comp_done_lock);
1548 retry:
1549     for (idx = 0; idx < thread_count; idx++) {
1550         if (comp_param[idx].done) {
1551             comp_param[idx].done = false;
1552             bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1553                                             comp_param[idx].file);
1554             qemu_mutex_lock(&comp_param[idx].mutex);
1555             set_compress_params(&comp_param[idx], block, offset);
1556             qemu_cond_signal(&comp_param[idx].cond);
1557             qemu_mutex_unlock(&comp_param[idx].mutex);
1558             pages = 1;
1559             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1560             break;
1561         }
1562     }
1563 
1564     /*
1565      * wait for a free thread if the user specifies 'compress-wait-thread',
1566      * otherwise we will post the page out in the main thread as a normal page.
1567      */
1568     if (pages < 0 && wait) {
1569         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1570         goto retry;
1571     }
1572     qemu_mutex_unlock(&comp_done_lock);
1573 
1574     return pages;
1575 }
1576 
1577 #define PAGE_ALL_CLEAN 0
1578 #define PAGE_TRY_AGAIN 1
1579 #define PAGE_DIRTY_FOUND 2
1580 /**
1581  * find_dirty_block: find the next dirty page and update any state
1582  * associated with the search process.
1583  *
1584  * Returns:
1585  *         PAGE_ALL_CLEAN: no dirty page found, give up
1586  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1587  *         PAGE_DIRTY_FOUND: dirty page found
1588  *
1589  * @rs: current RAM state
1590  * @pss: data about the state of the current dirty page scan
1592  */
1593 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1594 {
1595     /* Update pss->page for the next dirty bit in ramblock */
1596     pss_find_next_dirty(pss);
1597 
1598     if (pss->complete_round && pss->block == rs->last_seen_block &&
1599         pss->page >= rs->last_page) {
1600         /*
1601          * We've been once around the RAM and haven't found anything.
1602          * Give up.
1603          */
1604         return PAGE_ALL_CLEAN;
1605     }
1606     if (!offset_in_ramblock(pss->block,
1607                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1608         /* Didn't find anything in this RAM Block */
1609         pss->page = 0;
1610         pss->block = QLIST_NEXT_RCU(pss->block, next);
1611         if (!pss->block) {
1612             /*
1613              * If memory migration starts over, we will meet a dirtied page
1614              * which may still exist in a compression thread's ring, so we
1615              * should flush the compressed data to make sure the new page
1616              * is not overwritten by the old one in the destination.
1617              *
1618              * Also, if xbzrle is on, stop using the data compression at this
1619              * point. In theory, xbzrle can do better than compression.
1620              */
1621             flush_compressed_data(rs);
1622 
1623             /* Hit the end of the list */
1624             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1625             /* Flag that we've looped */
1626             pss->complete_round = true;
1627             /* After the first round, enable XBZRLE. */
1628             if (migrate_xbzrle()) {
1629                 rs->xbzrle_enabled = true;
1630             }
1631         }
1632         /* Didn't find anything this time, but try again on the new block */
1633         return PAGE_TRY_AGAIN;
1634     } else {
1635         /* We've found something */
1636         return PAGE_DIRTY_FOUND;
1637     }
1638 }
1639 
1640 /**
1641  * unqueue_page: gets a page of the queue
1642  *
1643  * Helper for 'get_queued_page' - gets a page off the queue
1644  *
1645  * Returns the block of the page (or NULL if none available)
1646  *
1647  * @rs: current RAM state
1648  * @offset: used to return the offset within the RAMBlock
1649  */
1650 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1651 {
1652     struct RAMSrcPageRequest *entry;
1653     RAMBlock *block = NULL;
1654 
1655     if (!postcopy_has_request(rs)) {
1656         return NULL;
1657     }
1658 
1659     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1660 
1661     /*
1662      * This should _never_ change even after we take the lock, because no one
1663      * should be taking anything off the request list other than us.
1664      */
1665     assert(postcopy_has_request(rs));
1666 
1667     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1668     block = entry->rb;
1669     *offset = entry->offset;
1670 
1671     if (entry->len > TARGET_PAGE_SIZE) {
1672         entry->len -= TARGET_PAGE_SIZE;
1673         entry->offset += TARGET_PAGE_SIZE;
1674     } else {
1675         memory_region_unref(block->mr);
1676         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1677         g_free(entry);
1678         migration_consume_urgent_request();
1679     }
1680 
1681     return block;
1682 }
1683 
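/*
 * Example (illustration only): a queued request of 3 * TARGET_PAGE_SIZE
 * starting at offset X is consumed by three successive unqueue_page()
 * calls returning offsets X, X + TARGET_PAGE_SIZE and X + 2 * TARGET_PAGE_SIZE;
 * only the last call removes the entry from the queue and drops the
 * memory region reference.
 */
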
1684 #if defined(__linux__)
1685 /**
1686  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1687  *   is found, return RAM block pointer and page offset
1688  *
1689  * Returns pointer to the RAMBlock containing faulting page,
1690  *   NULL if no write faults are pending
1691  *
1692  * @rs: current RAM state
1693  * @offset: page offset from the beginning of the block
1694  */
1695 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1696 {
1697     struct uffd_msg uffd_msg;
1698     void *page_address;
1699     RAMBlock *block;
1700     int res;
1701 
1702     if (!migrate_background_snapshot()) {
1703         return NULL;
1704     }
1705 
1706     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1707     if (res <= 0) {
1708         return NULL;
1709     }
1710 
1711     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1712     block = qemu_ram_block_from_host(page_address, false, offset);
1713     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1714     return block;
1715 }
1716 
1717 /**
1718  * ram_save_release_protection: release UFFD write protection after
1719  *   a range of pages has been saved
1720  *
1721  * @rs: current RAM state
1722  * @pss: page-search-status structure
1723  * @start_page: index of the first page in the range relative to pss->block
1724  *
1725  * Returns 0 on success, negative value in case of an error
1726  */
1727 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1728         unsigned long start_page)
1729 {
1730     int res = 0;
1731 
1732     /* Check if page is from UFFD-managed region. */
1733     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1734         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1735         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1736 
1737         /* Flush async buffers before un-protect. */
1738         qemu_fflush(pss->pss_channel);
1739         /* Un-protect memory range. */
1740         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1741                 false, false);
1742     }
1743 
1744     return res;
1745 }
1746 
1747 /* ram_write_tracking_available: check if the kernel supports the required UFFD features
1748  *
1749  * Returns true if supported, false otherwise
1750  */
1751 bool ram_write_tracking_available(void)
1752 {
1753     uint64_t uffd_features;
1754     int res;
1755 
1756     res = uffd_query_features(&uffd_features);
1757     return (res == 0 &&
1758             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1759 }
1760 
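/*
 * Background (illustration only): uffd_query_features() wraps the raw Linux
 * userfaultfd handshake.  A minimal standalone probe for write-protect
 * support, written against the kernel API directly, could look roughly like
 * this (sketch, not part of the build):
 *
 *     #include <fcntl.h>
 *     #include <stdbool.h>
 *     #include <sys/ioctl.h>
 *     #include <sys/syscall.h>
 *     #include <unistd.h>
 *     #include <linux/userfaultfd.h>
 *
 *     static bool probe_uffd_wp(void)
 *     {
 *         struct uffdio_api api = { .api = UFFD_API, .features = 0 };
 *         int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *         bool ok;
 *
 *         if (fd < 0) {
 *             return false;
 *         }
 *         // With features == 0 the kernel reports all supported features.
 *         ok = ioctl(fd, UFFDIO_API, &api) == 0 &&
 *              (api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
 *         close(fd);
 *         return ok;
 *     }
 */
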
1761 /* ram_write_tracking_compatible: check if guest configuration is
1762  *   compatible with 'write-tracking'
1763  *
1764  * Returns true if compatible, false otherwise
1765  */
1766 bool ram_write_tracking_compatible(void)
1767 {
1768     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1769     int uffd_fd;
1770     RAMBlock *block;
1771     bool ret = false;
1772 
1773     /* Open UFFD file descriptor */
1774     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1775     if (uffd_fd < 0) {
1776         return false;
1777     }
1778 
1779     RCU_READ_LOCK_GUARD();
1780 
1781     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1782         uint64_t uffd_ioctls;
1783 
1784         /* Nothing to do with read-only and MMIO-writable regions */
1785         if (block->mr->readonly || block->mr->rom_device) {
1786             continue;
1787         }
1788         /* Try to register block memory via UFFD-IO to track writes */
1789         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1790                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1791             goto out;
1792         }
1793         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1794             goto out;
1795         }
1796     }
1797     ret = true;
1798 
1799 out:
1800     uffd_close_fd(uffd_fd);
1801     return ret;
1802 }
1803 
1804 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1805                                        ram_addr_t size)
1806 {
1807     const ram_addr_t end = offset + size;
1808 
1809     /*
1810      * We read one byte of each page; this will preallocate page tables if
1811      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1812      * where no page was populated yet. This might require adaptation when
1813      * supporting other mappings, like shmem.
1814      */
1815     for (; offset < end; offset += block->page_size) {
1816         char tmp = *((char *)block->host + offset);
1817 
1818         /* Don't optimize the read out */
1819         asm volatile("" : "+r" (tmp));
1820     }
1821 }
1822 
1823 static inline int populate_read_section(MemoryRegionSection *section,
1824                                         void *opaque)
1825 {
1826     const hwaddr size = int128_get64(section->size);
1827     hwaddr offset = section->offset_within_region;
1828     RAMBlock *block = section->mr->ram_block;
1829 
1830     populate_read_range(block, offset, size);
1831     return 0;
1832 }
1833 
1834 /*
1835  * ram_block_populate_read: preallocate page tables and populate pages in the
1836  *   RAM block by reading a byte of each page.
1837  *
1838  * Since it's solely used for the userfault_fd WP feature, pages are
1839  *   touched at the RAMBlock's page size granularity.
1840  *
1841  * @rb: RAM block to populate
1842  */
1843 static void ram_block_populate_read(RAMBlock *rb)
1844 {
1845     /*
1846      * Skip populating all pages that fall into a discarded range as managed by
1847      * a RamDiscardManager responsible for the mapped memory region of the
1848      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1849      * must not get populated automatically. We don't have to track
1850      * modifications via userfaultfd WP reliably, because these pages will
1851      * not be part of the migration stream either way -- see
1852      * ramblock_dirty_bitmap_exclude_discarded_pages().
1853      *
1854      * Note: The result is only stable while migrating (precopy/postcopy).
1855      */
1856     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1857         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1858         MemoryRegionSection section = {
1859             .mr = rb->mr,
1860             .offset_within_region = 0,
1861             .size = rb->mr->size,
1862         };
1863 
1864         ram_discard_manager_replay_populated(rdm, &section,
1865                                              populate_read_section, NULL);
1866     } else {
1867         populate_read_range(rb, 0, rb->used_length);
1868     }
1869 }
1870 
1871 /*
1872  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1873  */
1874 void ram_write_tracking_prepare(void)
1875 {
1876     RAMBlock *block;
1877 
1878     RCU_READ_LOCK_GUARD();
1879 
1880     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1881         /* Nothing to do with read-only and MMIO-writable regions */
1882         if (block->mr->readonly || block->mr->rom_device) {
1883             continue;
1884         }
1885 
1886         /*
1887          * Populate pages of the RAM block before enabling userfault_fd
1888          * write protection.
1889          *
1890          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1891          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1892          * pages with pte_none() entries in the page table.
1893          */
1894         ram_block_populate_read(block);
1895     }
1896 }
1897 
1898 static inline int uffd_protect_section(MemoryRegionSection *section,
1899                                        void *opaque)
1900 {
1901     const hwaddr size = int128_get64(section->size);
1902     const hwaddr offset = section->offset_within_region;
1903     RAMBlock *rb = section->mr->ram_block;
1904     int uffd_fd = (uintptr_t)opaque;
1905 
1906     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1907                                   false);
1908 }
1909 
1910 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1911 {
1912     assert(rb->flags & RAM_UF_WRITEPROTECT);
1913 
1914     /* See ram_block_populate_read() */
1915     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1916         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1917         MemoryRegionSection section = {
1918             .mr = rb->mr,
1919             .offset_within_region = 0,
1920             .size = rb->mr->size,
1921         };
1922 
1923         return ram_discard_manager_replay_populated(rdm, &section,
1924                                                     uffd_protect_section,
1925                                                     (void *)(uintptr_t)uffd_fd);
1926     }
1927     return uffd_change_protection(uffd_fd, rb->host,
1928                                   rb->used_length, true, false);
1929 }
1930 
1931 /*
1932  * ram_write_tracking_start: start UFFD-WP memory tracking
1933  *
1934  * Returns 0 for success or negative value in case of error
1935  */
1936 int ram_write_tracking_start(void)
1937 {
1938     int uffd_fd;
1939     RAMState *rs = ram_state;
1940     RAMBlock *block;
1941 
1942     /* Open UFFD file descriptor */
1943     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1944     if (uffd_fd < 0) {
1945         return uffd_fd;
1946     }
1947     rs->uffdio_fd = uffd_fd;
1948 
1949     RCU_READ_LOCK_GUARD();
1950 
1951     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1952         /* Nothing to do with read-only and MMIO-writable regions */
1953         if (block->mr->readonly || block->mr->rom_device) {
1954             continue;
1955         }
1956 
1957         /* Register block memory with UFFD to track writes */
1958         if (uffd_register_memory(rs->uffdio_fd, block->host,
1959                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1960             goto fail;
1961         }
1962         block->flags |= RAM_UF_WRITEPROTECT;
1963         memory_region_ref(block->mr);
1964 
1965         /* Apply UFFD write protection to the block memory range */
1966         if (ram_block_uffd_protect(block, uffd_fd)) {
1967             goto fail;
1968         }
1969 
1970         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1971                 block->host, block->max_length);
1972     }
1973 
1974     return 0;
1975 
1976 fail:
1977     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1978 
1979     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1980         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1981             continue;
1982         }
1983         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1984         /* Cleanup flags and remove reference */
1985         block->flags &= ~RAM_UF_WRITEPROTECT;
1986         memory_region_unref(block->mr);
1987     }
1988 
1989     uffd_close_fd(uffd_fd);
1990     rs->uffdio_fd = -1;
1991     return -1;
1992 }
1993 
1994 /**
1995  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1996  */
1997 void ram_write_tracking_stop(void)
1998 {
1999     RAMState *rs = ram_state;
2000     RAMBlock *block;
2001 
2002     RCU_READ_LOCK_GUARD();
2003 
2004     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2005         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2006             continue;
2007         }
2008         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2009 
2010         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2011                 block->host, block->max_length);
2012 
2013         /* Cleanup flags and remove reference */
2014         block->flags &= ~RAM_UF_WRITEPROTECT;
2015         memory_region_unref(block->mr);
2016     }
2017 
2018     /* Finally close UFFD file descriptor */
2019     uffd_close_fd(rs->uffdio_fd);
2020     rs->uffdio_fd = -1;
2021 }
2022 
2023 #else
2024 /* No target OS support, stubs just fail or ignore */
2025 
2026 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2027 {
2028     (void) rs;
2029     (void) offset;
2030 
2031     return NULL;
2032 }
2033 
2034 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2035         unsigned long start_page)
2036 {
2037     (void) rs;
2038     (void) pss;
2039     (void) start_page;
2040 
2041     return 0;
2042 }
2043 
2044 bool ram_write_tracking_available(void)
2045 {
2046     return false;
2047 }
2048 
2049 bool ram_write_tracking_compatible(void)
2050 {
2051     assert(0);
2052     return false;
2053 }
2054 
2055 int ram_write_tracking_start(void)
2056 {
2057     assert(0);
2058     return -1;
2059 }
2060 
2061 void ram_write_tracking_stop(void)
2062 {
2063     assert(0);
2064 }
2065 #endif /* defined(__linux__) */
2066 
2067 /**
2068  * get_queued_page: unqueue a page from the postcopy requests
2069  *
2070  * Skips pages that are already sent (!dirty)
2071  *
2072  * Returns true if a queued page is found
2073  *
2074  * @rs: current RAM state
2075  * @pss: data about the state of the current dirty page scan
2076  */
2077 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2078 {
2079     RAMBlock  *block;
2080     ram_addr_t offset;
2081     bool dirty;
2082 
2083     do {
2084         block = unqueue_page(rs, &offset);
2085         /*
2086          * We're sending this page, and since it's postcopy nothing else
2087          * will dirty it, and we must make sure it doesn't get sent again
2088          * even if this queue request was received after the background
2089          * search already sent it.
2090          */
2091         if (block) {
2092             unsigned long page;
2093 
2094             page = offset >> TARGET_PAGE_BITS;
2095             dirty = test_bit(page, block->bmap);
2096             if (!dirty) {
2097                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2098                                                 page);
2099             } else {
2100                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2101             }
2102         }
2103 
2104     } while (block && !dirty);
2105 
2106     if (!block) {
2107         /*
2108          * Poll write faults too if background snapshot is enabled; that's
2109          * when we may have vCPUs blocked by the write-protected pages.
2110          */
2111         block = poll_fault_page(rs, &offset);
2112     }
2113 
2114     if (block) {
2115         /*
2116          * We want the background search to continue from the queued page
2117          * since the guest is likely to want other pages near to the page
2118          * it just requested.
2119          */
2120         pss->block = block;
2121         pss->page = offset >> TARGET_PAGE_BITS;
2122 
2123         /*
2124          * This unqueued page would break the "one round" check, even if it
2125          * is really rare.
2126          */
2127         pss->complete_round = false;
2128     }
2129 
2130     return !!block;
2131 }
2132 
2133 /**
2134  * migration_page_queue_free: drop any remaining pages in the ram
2135  * request queue
2136  *
2137  * It should be empty at the end anyway, but in error cases there may
2138  * be some left.  In case any page is left, we drop it.
2139  *
2140  */
2141 static void migration_page_queue_free(RAMState *rs)
2142 {
2143     struct RAMSrcPageRequest *mspr, *next_mspr;
2144     /* This queue generally should be empty - but in the case of a failed
2145      * migration it might have some droppings in.
2146      */
2147     RCU_READ_LOCK_GUARD();
2148     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2149         memory_region_unref(mspr->rb->mr);
2150         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2151         g_free(mspr);
2152     }
2153 }
2154 
2155 /**
2156  * ram_save_queue_pages: queue the page for transmission
2157  *
2158  * A request from the postcopy destination, for example.
2159  *
2160  * Returns zero on success or negative on error
2161  *
2162  * @rbname: Name of the RAMBlock of the request. NULL means the
2163  *          same as the last one.
2164  * @start: starting address from the start of the RAMBlock
2165  * @len: length (in bytes) to send
2166  */
2167 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2168 {
2169     RAMBlock *ramblock;
2170     RAMState *rs = ram_state;
2171 
2172     stat64_add(&ram_counters.postcopy_requests, 1);
2173     RCU_READ_LOCK_GUARD();
2174 
2175     if (!rbname) {
2176         /* Reuse last RAMBlock */
2177         ramblock = rs->last_req_rb;
2178 
2179         if (!ramblock) {
2180             /*
2181              * Shouldn't happen, we can't reuse the last RAMBlock if
2182              * it's the 1st request.
2183              */
2184             error_report("ram_save_queue_pages no previous block");
2185             return -1;
2186         }
2187     } else {
2188         ramblock = qemu_ram_block_by_name(rbname);
2189 
2190         if (!ramblock) {
2191             /* We shouldn't be asked for a non-existent RAMBlock */
2192             error_report("ram_save_queue_pages no block '%s'", rbname);
2193             return -1;
2194         }
2195         rs->last_req_rb = ramblock;
2196     }
2197     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2198     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2199         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2200                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2201                      __func__, start, len, ramblock->used_length);
2202         return -1;
2203     }
2204 
2205     /*
2206      * With postcopy preempt enabled, we send back the page directly in the
2207      * rp-return thread.
2208      */
2209     if (postcopy_preempt_active()) {
2210         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2211         size_t page_size = qemu_ram_pagesize(ramblock);
2212         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2213         int ret = 0;
2214 
2215         qemu_mutex_lock(&rs->bitmap_mutex);
2216 
2217         pss_init(pss, ramblock, page_start);
2218         /*
2219          * Always use the preempt channel, and make sure it's there.  It's
2220          * safe to access without holding a lock, because while the rp-thread
2221          * is running we should be the only one operating on the qemufile.
2222          */
2223         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2224         assert(pss->pss_channel);
2225 
2226         /*
2227          * It must be one host page or a multiple of the host page size.  Just
2228          * assert; if something is wrong we're mostly split-brain anyway.
2229          */
2230         assert(len % page_size == 0);
2231         while (len) {
2232             if (ram_save_host_page_urgent(pss)) {
2233                 error_report("%s: ram_save_host_page_urgent() failed: "
2234                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2235                              __func__, ramblock->idstr, start);
2236                 ret = -1;
2237                 break;
2238             }
2239             /*
2240              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2241              * will automatically be moved and point to the next host page
2242              * we're going to send, so no need to update here.
2243              *
2244              * Normally QEMU never sends >1 host page in requests, so
2245              * logically we don't even need this, as the loop should only
2246              * run once; keep it anyway for consistency.
2247              */
2248             len -= page_size;
2249         }
2250         qemu_mutex_unlock(&rs->bitmap_mutex);
2251 
2252         return ret;
2253     }
2254 
2255     struct RAMSrcPageRequest *new_entry =
2256         g_new0(struct RAMSrcPageRequest, 1);
2257     new_entry->rb = ramblock;
2258     new_entry->offset = start;
2259     new_entry->len = len;
2260 
2261     memory_region_ref(ramblock->mr);
2262     qemu_mutex_lock(&rs->src_page_req_mutex);
2263     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2264     migration_make_urgent_request();
2265     qemu_mutex_unlock(&rs->src_page_req_mutex);
2266 
2267     return 0;
2268 }
2269 
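/*
 * Worked example (illustration only): for a RAMBlock with used_length
 * 0x200000, a request with start == 0x1ff000 and len == 0x1000 passes the
 * offset_in_ramblock(ramblock, start + len - 1) check because its last byte
 * is 0x1fffff, while len == 0x2000 is rejected as an overrun.
 */
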
2270 static bool save_page_use_compression(RAMState *rs)
2271 {
2272     if (!migrate_compress()) {
2273         return false;
2274     }
2275 
2276     /*
2277      * If xbzrle is enabled (e.g., after first round of migration), stop
2278      * using the data compression. In theory, xbzrle can do better than
2279      * compression.
2280      */
2281     if (rs->xbzrle_enabled) {
2282         return false;
2283     }
2284 
2285     return true;
2286 }
2287 
2288 /*
2289  * Try to compress the page before posting it out; return true if the page
2290  * has been properly handled by compression, otherwise other paths
2291  * need to handle it.
2292  */
2293 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2294                                RAMBlock *block, ram_addr_t offset)
2295 {
2296     if (!save_page_use_compression(rs)) {
2297         return false;
2298     }
2299 
2300     /*
2301      * When starting the process of a new block, the first page of
2302      * the block should be sent out before other pages in the same
2303      * block, and all the pages in the last block should have been sent
2304      * out. Keeping this order is important, because the 'cont' flag
2305      * is used to avoid resending the block name.
2306      *
2307      * We post the first page as a normal page since compression takes
2308      * a lot of CPU resources.
2309      */
2310     if (block != pss->last_sent_block) {
2311         flush_compressed_data(rs);
2312         return false;
2313     }
2314 
2315     if (compress_page_with_multi_thread(block, offset) > 0) {
2316         return true;
2317     }
2318 
2319     compression_counters.busy++;
2320     return false;
2321 }
2322 
2323 /**
2324  * ram_save_target_page_legacy: save one target page
2325  *
2326  * Returns the number of pages written
2327  *
2328  * @rs: current RAM state
2329  * @pss: data about the page we want to send
2330  */
2331 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2332 {
2333     RAMBlock *block = pss->block;
2334     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2335     int res;
2336 
2337     if (control_save_page(pss, block, offset, &res)) {
2338         return res;
2339     }
2340 
2341     if (save_compress_page(rs, pss, block, offset)) {
2342         return 1;
2343     }
2344 
2345     res = save_zero_page(pss, pss->pss_channel, block, offset);
2346     if (res > 0) {
2347         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2348          * page would be stale
2349          */
2350         if (rs->xbzrle_enabled) {
2351             XBZRLE_cache_lock();
2352             xbzrle_cache_zero_page(rs, block->offset + offset);
2353             XBZRLE_cache_unlock();
2354         }
2355         return res;
2356     }
2357 
2358     /*
2359      * Do not use multifd in postcopy as one whole host page should be
2360      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2361      * if host page size == guest page size the dest guest during run may
2362      * still see partially copied pages which is data corruption.
2363      */
2364     if (migrate_multifd() && !migration_in_postcopy()) {
2365         return ram_save_multifd_page(pss->pss_channel, block, offset);
2366     }
2367 
2368     return ram_save_page(rs, pss);
2369 }
2370 
2371 /* Should be called before sending a host page */
2372 static void pss_host_page_prepare(PageSearchStatus *pss)
2373 {
2374     /* How many guest pages are there in one host page? */
2375     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2376 
2377     pss->host_page_sending = true;
2378     if (guest_pfns <= 1) {
2379         /*
2380          * This covers both when guest psize == host psize, or when guest
2381          * has larger psize than the host (guest_pfns==0).
2382          *
2383          * For the latter, we always send one whole guest page per
2384          * iteration of the host page (example: an Alpha VM on x86 host
2385          * will have guest psize 8K while host psize 4K).
2386          */
2387         pss->host_page_start = pss->page;
2388         pss->host_page_end = pss->page + 1;
2389     } else {
2390         /*
2391          * The host page spans over multiple guest pages, we send them
2392          * within the same host page iteration.
2393          */
2394         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2395         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2396     }
2397 }
2398 
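/*
 * Worked example (illustration only): with 4 KiB target pages and a 2 MiB
 * hugepage RAMBlock, guest_pfns == 512; for pss->page == 1000 the host page
 * boundaries become host_page_start == 512 and host_page_end == 1024, so the
 * whole huge page containing page 1000 is covered by one host-page iteration.
 */
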
2399 /*
2400  * Whether the page pointed by PSS is within the host page being sent.
2401  * Must be called after a previous pss_host_page_prepare().
2402  */
2403 static bool pss_within_range(PageSearchStatus *pss)
2404 {
2405     ram_addr_t ram_addr;
2406 
2407     assert(pss->host_page_sending);
2408 
2409     /* Over host-page boundary? */
2410     if (pss->page >= pss->host_page_end) {
2411         return false;
2412     }
2413 
2414     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2415 
2416     return offset_in_ramblock(pss->block, ram_addr);
2417 }
2418 
2419 static void pss_host_page_finish(PageSearchStatus *pss)
2420 {
2421     pss->host_page_sending = false;
2422     /* This is not needed, but just to reset it */
2423     pss->host_page_start = pss->host_page_end = 0;
2424 }
2425 
2426 /*
2427  * Send an urgent host page specified by `pss'.  Needs to be called with
2428  * bitmap_mutex held.
2429  *
2430  * Returns 0 if saving the host page succeeded, a negative value otherwise.
2431  */
2432 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2433 {
2434     bool page_dirty, sent = false;
2435     RAMState *rs = ram_state;
2436     int ret = 0;
2437 
2438     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2439     pss_host_page_prepare(pss);
2440 
2441     /*
2442      * If precopy is sending the same page, let it be done in precopy, or
2443      * we could send the same page in two channels and none of them will
2444      * receive the whole page.
2445      */
2446     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2447         trace_postcopy_preempt_hit(pss->block->idstr,
2448                                    pss->page << TARGET_PAGE_BITS);
2449         return 0;
2450     }
2451 
2452     do {
2453         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2454 
2455         if (page_dirty) {
2456             /* Be strict about the return code; it must be 1 here */
2457             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2458                 error_report_once("%s: ram_save_target_page failed", __func__);
2459                 ret = -1;
2460                 goto out;
2461             }
2462             sent = true;
2463         }
2464         pss_find_next_dirty(pss);
2465     } while (pss_within_range(pss));
2466 out:
2467     pss_host_page_finish(pss);
2468     /* For urgent requests, flush immediately if sent */
2469     if (sent) {
2470         qemu_fflush(pss->pss_channel);
2471     }
2472     return ret;
2473 }
2474 
2475 /**
2476  * ram_save_host_page: save a whole host page
2477  *
2478  * Starting at pss->page, send pages up to the end of the current host
2479  * page. It's valid for the initial offset to point into the middle of
2480  * a host page, in which case the remainder of the host page is sent.
2481  * Only dirty target pages are sent. Note that the host page size may
2482  * be a huge page for this block.
2483  *
2484  * The saving stops at the boundary of the used_length of the block
2485  * if the RAMBlock isn't a multiple of the host page size.
2486  *
2487  * The caller must hold ram_state.bitmap_mutex when calling this
2488  * function.  Note that this function can temporarily release the lock, but
2489  * it will make sure the lock is held again before it returns.
2490  *
2491  * Returns the number of pages written or negative on error
2492  *
2493  * @rs: current RAM state
2494  * @pss: data about the page we want to send
2495  */
2496 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2497 {
2498     bool page_dirty, preempt_active = postcopy_preempt_active();
2499     int tmppages, pages = 0;
2500     size_t pagesize_bits =
2501         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2502     unsigned long start_page = pss->page;
2503     int res;
2504 
2505     if (ramblock_is_ignored(pss->block)) {
2506         error_report("block %s should not be migrated !", pss->block->idstr);
2507         return 0;
2508     }
2509 
2510     /* Update host page boundary information */
2511     pss_host_page_prepare(pss);
2512 
2513     do {
2514         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2515 
2516         /* Check whether the page is dirty and, if it is, send it */
2517         if (page_dirty) {
2518             /*
2519              * Properly yield the lock only in postcopy preempt mode
2520              * because both migration thread and rp-return thread can
2521              * operate on the bitmaps.
2522              */
2523             if (preempt_active) {
2524                 qemu_mutex_unlock(&rs->bitmap_mutex);
2525             }
2526             tmppages = migration_ops->ram_save_target_page(rs, pss);
2527             if (tmppages >= 0) {
2528                 pages += tmppages;
2529                 /*
2530                  * Allow rate limiting to happen in the middle of huge pages if
2531                  * something is sent in the current iteration.
2532                  */
2533                 if (pagesize_bits > 1 && tmppages > 0) {
2534                     migration_rate_limit();
2535                 }
2536             }
2537             if (preempt_active) {
2538                 qemu_mutex_lock(&rs->bitmap_mutex);
2539             }
2540         } else {
2541             tmppages = 0;
2542         }
2543 
2544         if (tmppages < 0) {
2545             pss_host_page_finish(pss);
2546             return tmppages;
2547         }
2548 
2549         pss_find_next_dirty(pss);
2550     } while (pss_within_range(pss));
2551 
2552     pss_host_page_finish(pss);
2553 
2554     res = ram_save_release_protection(rs, pss, start_page);
2555     return (res < 0 ? res : pages);
2556 }
2557 
2558 /**
2559  * ram_find_and_save_block: finds a dirty page and sends it to f
2560  *
2561  * Called within an RCU critical section.
2562  *
2563  * Returns the number of pages written where zero means no dirty pages,
2564  * or negative on error
2565  *
2566  * @rs: current RAM state
2567  *
2568  * On systems where host-page-size > target-page-size it will send all the
2569  * pages in a host page that are dirty.
2570  */
2571 static int ram_find_and_save_block(RAMState *rs)
2572 {
2573     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2574     int pages = 0;
2575 
2576     /* No dirty page as there is zero RAM */
2577     if (!rs->ram_bytes_total) {
2578         return pages;
2579     }
2580 
2581     /*
2582      * Always keep last_seen_block/last_page valid during this procedure,
2583      * because find_dirty_block() relies on these values (e.g., we compare
2584      * last_seen_block with pss.block to see whether we searched all the
2585      * ramblocks) to detect the completion of migration.  Having a NULL
2586      * last_seen_block can, in some cases, cause the loop below to run forever.
2587      */
2588     if (!rs->last_seen_block) {
2589         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2590         rs->last_page = 0;
2591     }
2592 
2593     pss_init(pss, rs->last_seen_block, rs->last_page);
2594 
2595     while (true) {
2596         if (!get_queued_page(rs, pss)) {
2597             /* priority queue empty, so just search for something dirty */
2598             int res = find_dirty_block(rs, pss);
2599             if (res != PAGE_DIRTY_FOUND) {
2600                 if (res == PAGE_ALL_CLEAN) {
2601                     break;
2602                 } else if (res == PAGE_TRY_AGAIN) {
2603                     continue;
2604                 }
2605             }
2606         }
2607         pages = ram_save_host_page(rs, pss);
2608         if (pages) {
2609             break;
2610         }
2611     }
2612 
2613     rs->last_seen_block = pss->block;
2614     rs->last_page = pss->page;
2615 
2616     return pages;
2617 }
2618 
2619 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2620 {
2621     uint64_t pages = size / TARGET_PAGE_SIZE;
2622 
2623     if (zero) {
2624         stat64_add(&ram_counters.zero_pages, pages);
2625     } else {
2626         stat64_add(&ram_counters.normal_pages, pages);
2627         ram_transferred_add(size);
2628         qemu_file_credit_transfer(f, size);
2629     }
2630 }
2631 
2632 static uint64_t ram_bytes_total_with_ignored(void)
2633 {
2634     RAMBlock *block;
2635     uint64_t total = 0;
2636 
2637     RCU_READ_LOCK_GUARD();
2638 
2639     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2640         total += block->used_length;
2641     }
2642     return total;
2643 }
2644 
2645 uint64_t ram_bytes_total(void)
2646 {
2647     RAMBlock *block;
2648     uint64_t total = 0;
2649 
2650     RCU_READ_LOCK_GUARD();
2651 
2652     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2653         total += block->used_length;
2654     }
2655     return total;
2656 }
2657 
2658 static void xbzrle_load_setup(void)
2659 {
2660     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2661 }
2662 
2663 static void xbzrle_load_cleanup(void)
2664 {
2665     g_free(XBZRLE.decoded_buf);
2666     XBZRLE.decoded_buf = NULL;
2667 }
2668 
2669 static void ram_state_cleanup(RAMState **rsp)
2670 {
2671     if (*rsp) {
2672         migration_page_queue_free(*rsp);
2673         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2674         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2675         g_free(*rsp);
2676         *rsp = NULL;
2677     }
2678 }
2679 
2680 static void xbzrle_cleanup(void)
2681 {
2682     XBZRLE_cache_lock();
2683     if (XBZRLE.cache) {
2684         cache_fini(XBZRLE.cache);
2685         g_free(XBZRLE.encoded_buf);
2686         g_free(XBZRLE.current_buf);
2687         g_free(XBZRLE.zero_target_page);
2688         XBZRLE.cache = NULL;
2689         XBZRLE.encoded_buf = NULL;
2690         XBZRLE.current_buf = NULL;
2691         XBZRLE.zero_target_page = NULL;
2692     }
2693     XBZRLE_cache_unlock();
2694 }
2695 
2696 static void ram_save_cleanup(void *opaque)
2697 {
2698     RAMState **rsp = opaque;
2699     RAMBlock *block;
2700 
2701     /* We don't use dirty log with background snapshots */
2702     if (!migrate_background_snapshot()) {
2703         /* The caller holds the iothread lock or is in a BH, so there is
2704          * no writing race against the migration bitmap
2705          */
2706         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2707             /*
2708              * do not stop dirty logging without having started it, since
2709              * memory_global_dirty_log_stop will assert that
2710              * memory_global_dirty_log_start/stop are used in pairs
2711              */
2712             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2713         }
2714     }
2715 
2716     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2717         g_free(block->clear_bmap);
2718         block->clear_bmap = NULL;
2719         g_free(block->bmap);
2720         block->bmap = NULL;
2721     }
2722 
2723     xbzrle_cleanup();
2724     compress_threads_save_cleanup();
2725     ram_state_cleanup(rsp);
2726     g_free(migration_ops);
2727     migration_ops = NULL;
2728 }
2729 
2730 static void ram_state_reset(RAMState *rs)
2731 {
2732     int i;
2733 
2734     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2735         rs->pss[i].last_sent_block = NULL;
2736     }
2737 
2738     rs->last_seen_block = NULL;
2739     rs->last_page = 0;
2740     rs->last_version = ram_list.version;
2741     rs->xbzrle_enabled = false;
2742 }
2743 
2744 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2745 
2746 /* **** functions for postcopy ***** */
2747 
2748 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2749 {
2750     struct RAMBlock *block;
2751 
2752     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2753         unsigned long *bitmap = block->bmap;
2754         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2755         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2756 
2757         while (run_start < range) {
2758             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2759             ram_discard_range(block->idstr,
2760                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2761                               ((ram_addr_t)(run_end - run_start))
2762                                 << TARGET_PAGE_BITS);
2763             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2764         }
2765     }
2766 }
2767 
2768 /**
2769  * postcopy_send_discard_bm_ram: discard a RAMBlock
2770  *
2771  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2772  *
2773  * @ms: current migration state
2774  * @block: RAMBlock to discard
2775  */
2776 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2777 {
2778     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2779     unsigned long current;
2780     unsigned long *bitmap = block->bmap;
2781 
2782     for (current = 0; current < end; ) {
2783         unsigned long one = find_next_bit(bitmap, end, current);
2784         unsigned long zero, discard_length;
2785 
2786         if (one >= end) {
2787             break;
2788         }
2789 
2790         zero = find_next_zero_bit(bitmap, end, one + 1);
2791 
2792         if (zero >= end) {
2793             discard_length = end - one;
2794         } else {
2795             discard_length = zero - one;
2796         }
2797         postcopy_discard_send_range(ms, one, discard_length);
2798         current = one + discard_length;
2799     }
2800 }
2801 
2802 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2803 
2804 /**
2805  * postcopy_each_ram_send_discard: discard all RAMBlocks
2806  *
2807  * Utility for the outgoing postcopy code.
2808  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2809  *   passing it bitmap indexes and name.
2810  * (qemu_ram_foreach_block ends up passing unscaled lengths
2811  *  which would mean postcopy code would have to deal with target page)
2812  *
2813  * @ms: current migration state
2814  */
2815 static void postcopy_each_ram_send_discard(MigrationState *ms)
2816 {
2817     struct RAMBlock *block;
2818 
2819     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2820         postcopy_discard_send_init(ms, block->idstr);
2821 
2822         /*
2823          * Deal with TPS != HPS and huge pages.  It discards any partially sent
2824          * host-page size chunks and marks any partially dirty host-page size
2825          * chunks as all dirty.  In this case the host-page is the host-page
2826          * for the particular RAMBlock, i.e. it might be a huge page.
2827          */
2828         postcopy_chunk_hostpages_pass(ms, block);
2829 
2830         /*
2831          * Postcopy sends chunks of bitmap over the wire, but it
2832          * just needs indexes at this point, avoids it having
2833          * target page specific code.
2834          */
2835         postcopy_send_discard_bm_ram(ms, block);
2836         postcopy_discard_send_finish(ms);
2837     }
2838 }
2839 
2840 /**
2841  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2842  *
2843  * Helper for postcopy_each_ram_send_discard; it's called for each
2844  * RAMBlock to canonicalize its dirty bitmap at host-page
2845  * granularity.
2846  *
2847  * Postcopy requires that all target pages in a host page are dirty or
2848  * clean, not a mix.  This function canonicalizes the bitmap.
2849  *
2850  * @ms: current migration state
2851  * @block: block that contains the page we want to canonicalize
2852  */
2853 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2854 {
2855     RAMState *rs = ram_state;
2856     unsigned long *bitmap = block->bmap;
2857     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2858     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2859     unsigned long run_start;
2860 
2861     if (block->page_size == TARGET_PAGE_SIZE) {
2862         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2863         return;
2864     }
2865 
2866     /* Find a dirty page */
2867     run_start = find_next_bit(bitmap, pages, 0);
2868 
2869     while (run_start < pages) {
2870 
2871         /*
2872          * If the start of this run of pages is in the middle of a host
2873          * page, then we need to fixup this host page.
2874          */
2875         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2876             /* Find the end of this run */
2877             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2878             /*
2879              * If the end isn't at the start of a host page, then the
2880              * run doesn't finish at the end of a host page
2881              * and we need to discard.
2882              */
2883         }
2884 
2885         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2886             unsigned long page;
2887             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2888                                                              host_ratio);
2889             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2890 
2891             /* Clean up the bitmap */
2892             for (page = fixup_start_addr;
2893                  page < fixup_start_addr + host_ratio; page++) {
2894                 /*
2895                  * Remark them as dirty, updating the count for any pages
2896                  * that weren't previously dirty.
2897                  */
2898                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2899             }
2900         }
2901 
2902         /* Find the next dirty page for the next iteration */
2903         run_start = find_next_bit(bitmap, pages, run_start);
2904     }
2905 }
2906 
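/*
 * Worked example (illustration only): with host_ratio == 4 (e.g. 16 KiB host
 * pages and 4 KiB target pages), a dirty run covering target pages 5..9 is
 * widened so that host pages [4..7] and [8..11] end up fully dirty, and
 * migration_dirty_pages is bumped for the three newly dirtied pages (4, 10
 * and 11).
 */
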
2907 /**
2908  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2909  *
2910  * Transmit the set of pages to be discarded after precopy to the target;
2911  * these are pages that:
2912  *     a) Have been previously transmitted but are now dirty again
2913  *     b) Pages that have never been transmitted, this ensures that
2914  *        any pages on the destination that have been mapped by background
2915  *        tasks get discarded (transparent huge pages is the specific concern)
2916  * Hopefully this is pretty sparse
2917  *
2918  * @ms: current migration state
2919  */
2920 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2921 {
2922     RAMState *rs = ram_state;
2923 
2924     RCU_READ_LOCK_GUARD();
2925 
2926     /* This should be our last sync, the src is now paused */
2927     migration_bitmap_sync(rs);
2928 
2929     /* Easiest way to make sure we don't resume in the middle of a host-page */
2930     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2931     rs->last_seen_block = NULL;
2932     rs->last_page = 0;
2933 
2934     postcopy_each_ram_send_discard(ms);
2935 
2936     trace_ram_postcopy_send_discard_bitmap();
2937 }
2938 
2939 /**
2940  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2941  *
2942  * Returns zero on success
2943  *
2944  * @rbname: name of the RAMBlock of the request. NULL means the
2945  *          same as the last one.
2946  * @start: RAMBlock starting page
2947  * @length: RAMBlock size
2948  */
2949 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2950 {
2951     trace_ram_discard_range(rbname, start, length);
2952 
2953     RCU_READ_LOCK_GUARD();
2954     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2955 
2956     if (!rb) {
2957         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2958         return -1;
2959     }
2960 
2961     /*
2962      * On the source VM, we don't need to update the received bitmap since
2963      * we don't even have one.
2964      */
2965     if (rb->receivedmap) {
2966         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2967                      length >> qemu_target_page_bits());
2968     }
2969 
2970     return ram_block_discard_range(rb, start, length);
2971 }
2972 
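/*
 * Worked example (illustration only, assuming 4 KiB target pages): a call
 * with start == 0x200000 and length == 0x4000 clears receivedmap bits
 * 512..515 (0x200000 >> 12 == 512, 0x4000 >> 12 == 4) before discarding the
 * underlying host memory.
 */
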
2973 /*
2974  * For every allocation, we will try not to crash the VM if the
2975  * allocation failed.
2976  * allocation fails.
2977 static int xbzrle_init(void)
2978 {
2979     Error *local_err = NULL;
2980 
2981     if (!migrate_xbzrle()) {
2982         return 0;
2983     }
2984 
2985     XBZRLE_cache_lock();
2986 
2987     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2988     if (!XBZRLE.zero_target_page) {
2989         error_report("%s: Error allocating zero page", __func__);
2990         goto err_out;
2991     }
2992 
2993     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2994                               TARGET_PAGE_SIZE, &local_err);
2995     if (!XBZRLE.cache) {
2996         error_report_err(local_err);
2997         goto free_zero_page;
2998     }
2999 
3000     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3001     if (!XBZRLE.encoded_buf) {
3002         error_report("%s: Error allocating encoded_buf", __func__);
3003         goto free_cache;
3004     }
3005 
3006     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3007     if (!XBZRLE.current_buf) {
3008         error_report("%s: Error allocating current_buf", __func__);
3009         goto free_encoded_buf;
3010     }
3011 
3012     /* We are all good */
3013     XBZRLE_cache_unlock();
3014     return 0;
3015 
3016 free_encoded_buf:
3017     g_free(XBZRLE.encoded_buf);
3018     XBZRLE.encoded_buf = NULL;
3019 free_cache:
3020     cache_fini(XBZRLE.cache);
3021     XBZRLE.cache = NULL;
3022 free_zero_page:
3023     g_free(XBZRLE.zero_target_page);
3024     XBZRLE.zero_target_page = NULL;
3025 err_out:
3026     XBZRLE_cache_unlock();
3027     return -ENOMEM;
3028 }
3029 
3030 static int ram_state_init(RAMState **rsp)
3031 {
3032     *rsp = g_try_new0(RAMState, 1);
3033 
3034     if (!*rsp) {
3035         error_report("%s: Init ramstate fail", __func__);
3036         return -1;
3037     }
3038 
3039     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3040     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3041     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3042     (*rsp)->ram_bytes_total = ram_bytes_total();
3043 
3044     /*
3045      * Count the total number of pages used by ram blocks not including any
3046      * gaps due to alignment or unplugs.
3047      * This must match the initial values of the dirty bitmap.
3048      */
3049     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3050     ram_state_reset(*rsp);
3051 
3052     return 0;
3053 }
3054 
3055 static void ram_list_init_bitmaps(void)
3056 {
3057     MigrationState *ms = migrate_get_current();
3058     RAMBlock *block;
3059     unsigned long pages;
3060     uint8_t shift;
3061 
3062     /* Skip setting bitmap if there is no RAM */
3063     if (ram_bytes_total()) {
3064         shift = ms->clear_bitmap_shift;
3065         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3066             error_report("clear_bitmap_shift (%u) too big, using "
3067                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3068             shift = CLEAR_BITMAP_SHIFT_MAX;
3069         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3070             error_report("clear_bitmap_shift (%u) too small, using "
3071                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3072             shift = CLEAR_BITMAP_SHIFT_MIN;
3073         }
3074 
3075         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3076             pages = block->max_length >> TARGET_PAGE_BITS;
3077             /*
3078              * The initial dirty bitmap for migration must be set with all
3079              * ones to make sure we'll migrate every guest RAM page to
3080              * destination.
3081              * Here we set RAMBlock.bmap all to 1 because when starting a
3082              * new migration after a failed migration, ram_list.
3083              * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3084              * guest memory.
3085              */
3086             block->bmap = bitmap_new(pages);
3087             bitmap_set(block->bmap, 0, pages);
3088             block->clear_bmap_shift = shift;
3089             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3090         }
3091     }
3092 }
3093 
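/*
 * Sizing note (illustration only): each clear_bmap bit covers 1 << shift
 * target pages.  For example, with shift == 18 and 4 KiB target pages one
 * clear_bmap bit tracks 1 GiB of guest RAM, so a 16 GiB RAMBlock needs a
 * 16-bit clear_bmap while its dirty bmap needs 4M bits.
 */
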
3094 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3095 {
3096     unsigned long pages;
3097     RAMBlock *rb;
3098 
3099     RCU_READ_LOCK_GUARD();
3100 
3101     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3102         pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3103         rs->migration_dirty_pages -= pages;
3104     }
3105 }
3106 
3107 static void ram_init_bitmaps(RAMState *rs)
3108 {
3109     /* For memory_global_dirty_log_start below.  */
3110     qemu_mutex_lock_iothread();
3111     qemu_mutex_lock_ramlist();
3112 
3113     WITH_RCU_READ_LOCK_GUARD() {
3114         ram_list_init_bitmaps();
3115         /* We don't use dirty log with background snapshots */
3116         if (!migrate_background_snapshot()) {
3117             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3118             migration_bitmap_sync_precopy(rs);
3119         }
3120     }
3121     qemu_mutex_unlock_ramlist();
3122     qemu_mutex_unlock_iothread();
3123 
3124     /*
3125      * After an eventual first bitmap sync, fixup the initial bitmap
3126      * containing all 1s to exclude any discarded pages from migration.
3127      */
3128     migration_bitmap_clear_discarded_pages(rs);
3129 }
3130 
3131 static int ram_init_all(RAMState **rsp)
3132 {
3133     if (ram_state_init(rsp)) {
3134         return -1;
3135     }
3136 
3137     if (xbzrle_init()) {
3138         ram_state_cleanup(rsp);
3139         return -1;
3140     }
3141 
3142     ram_init_bitmaps(*rsp);
3143 
3144     return 0;
3145 }
3146 
3147 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3148 {
3149     RAMBlock *block;
3150     uint64_t pages = 0;
3151 
3152     /*
3153      * Postcopy is not using xbzrle/compression, so no need for that.
3154      * Also, since the source is already halted, we don't need to care
3155      * about dirty page logging either.
3156      */
3157 
3158     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3159         pages += bitmap_count_one(block->bmap,
3160                                   block->used_length >> TARGET_PAGE_BITS);
3161     }
3162 
3163     /* This may not be aligned with current bitmaps. Recalculate. */
3164     rs->migration_dirty_pages = pages;
3165 
3166     ram_state_reset(rs);
3167 
3168     /* Update RAMState cache of output QEMUFile */
3169     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3170 
3171     trace_ram_state_resume_prepare(pages);
3172 }
3173 
3174 /*
3175  * This function clears bits of the free pages reported by the caller from the
3176  * migration dirty bitmap. @addr is the host address corresponding to the
3177  * start of the contiguous guest free pages, and @len is the total bytes of
3178  * those pages.
3179  */
3180 void qemu_guest_free_page_hint(void *addr, size_t len)
3181 {
3182     RAMBlock *block;
3183     ram_addr_t offset;
3184     size_t used_len, start, npages;
3185     MigrationState *s = migrate_get_current();
3186 
3187     /* This function is currently expected to be used during live migration */
3188     if (!migration_is_setup_or_active(s->state)) {
3189         return;
3190     }
3191 
3192     for (; len > 0; len -= used_len, addr += used_len) {
3193         block = qemu_ram_block_from_host(addr, false, &offset);
3194         if (unlikely(!block || offset >= block->used_length)) {
3195             /*
3196              * The implementation might not support RAMBlock resize during
3197              * live migration, but it could happen in theory with future
3198              * updates. So we add a check here to capture that case.
3199              */
3200             error_report_once("%s unexpected error", __func__);
3201             return;
3202         }
3203 
3204         if (len <= block->used_length - offset) {
3205             used_len = len;
3206         } else {
3207             used_len = block->used_length - offset;
3208         }
3209 
3210         start = offset >> TARGET_PAGE_BITS;
3211         npages = used_len >> TARGET_PAGE_BITS;
3212 
3213         qemu_mutex_lock(&ram_state->bitmap_mutex);
3214         /*
3215          * The skipped free pages are equivalent to having been sent, from
3216          * clear_bmap's perspective, so clear the bits from the memory region
3217          * bitmap which are initially set. Otherwise those skipped pages will
3218          * be sent in the next round after syncing from the memory region bitmap.
3219          */
3220         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3221         ram_state->migration_dirty_pages -=
3222                       bitmap_count_one_with_offset(block->bmap, start, npages);
3223         bitmap_clear(block->bmap, start, npages);
3224         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3225     }
3226 }
3227 
3228 /*
3229  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3230  * a long-running RCU critical section.  When RCU reclaims in the code
3231  * start to become numerous, it will be necessary to reduce the
3232  * granularity of these critical sections.
3233  */
3234 
3235 /**
3236  * ram_save_setup: Setup RAM for migration
3237  *
3238  * Returns zero to indicate success and negative for error
3239  *
3240  * @f: QEMUFile where to send the data
3241  * @opaque: RAMState pointer
3242  */
3243 static int ram_save_setup(QEMUFile *f, void *opaque)
3244 {
3245     RAMState **rsp = opaque;
3246     RAMBlock *block;
3247     int ret;
3248 
3249     if (compress_threads_save_setup()) {
3250         return -1;
3251     }
3252 
3253     /* migration has already setup the bitmap, reuse it. */
3254     if (!migration_in_colo_state()) {
3255         if (ram_init_all(rsp) != 0) {
3256             compress_threads_save_cleanup();
3257             return -1;
3258         }
3259     }
3260     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3261 
3262     WITH_RCU_READ_LOCK_GUARD() {
3263         qemu_put_be64(f, ram_bytes_total_with_ignored()
3264                          | RAM_SAVE_FLAG_MEM_SIZE);
3265 
3266         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3267             qemu_put_byte(f, strlen(block->idstr));
3268             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3269             qemu_put_be64(f, block->used_length);
3270             if (migrate_postcopy_ram() && block->page_size !=
3271                                           qemu_host_page_size) {
3272                 qemu_put_be64(f, block->page_size);
3273             }
3274             if (migrate_ignore_shared()) {
3275                 qemu_put_be64(f, block->mr->addr);
3276             }
3277         }
3278     }
3279 
3280     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3281     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3282 
3283     migration_ops = g_malloc0(sizeof(MigrationOps));
3284     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3285     ret = multifd_send_sync_main(f);
3286     if (ret < 0) {
3287         return ret;
3288     }
3289 
3290     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3291     qemu_fflush(f);
3292 
3293     return 0;
3294 }
3295 
3296 /**
3297  * ram_save_iterate: iterative stage for migration
3298  *
3299  * Returns zero to indicate success and negative for error
3300  *
3301  * @f: QEMUFile where to send the data
3302  * @opaque: RAMState pointer
3303  */
3304 static int ram_save_iterate(QEMUFile *f, void *opaque)
3305 {
3306     RAMState **temp = opaque;
3307     RAMState *rs = *temp;
3308     int ret = 0;
3309     int i;
3310     int64_t t0;
3311     int done = 0;
3312 
3313     if (blk_mig_bulk_active()) {
3314         /* Avoid transferring ram during bulk phase of block migration as
3315          * the bulk phase will usually take a long time and transferring
3316          * ram updates during that time is pointless. */
3317         goto out;
3318     }
3319 
3320     /*
3321      * We'll hold this lock for a while, but that's okay for two reasons.
3322      * Firstly, the only other thread that can take it is the one calling
3323      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3324      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3325      * guarantees that we will release it on a regular basis.
3326      */
3327     qemu_mutex_lock(&rs->bitmap_mutex);
3328     WITH_RCU_READ_LOCK_GUARD() {
3329         if (ram_list.version != rs->last_version) {
3330             ram_state_reset(rs);
3331         }
3332 
3333         /* Read version before ram_list.blocks */
3334         smp_rmb();
3335 
3336         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3337 
3338         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3339         i = 0;
3340         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3341                postcopy_has_request(rs)) {
3342             int pages;
3343 
3344             if (qemu_file_get_error(f)) {
3345                 break;
3346             }
3347 
3348             pages = ram_find_and_save_block(rs);
3349             /* no more pages to send */
3350             if (pages == 0) {
3351                 done = 1;
3352                 break;
3353             }
3354 
3355             if (pages < 0) {
3356                 qemu_file_set_error(f, pages);
3357                 break;
3358             }
3359 
3360             rs->target_page_count += pages;
3361 
3362             /*
3363              * During postcopy, it is necessary to make sure one whole host
3364              * page is sent in one chunk.
3365              */
3366             if (migrate_postcopy_ram()) {
3367                 flush_compressed_data(rs);
3368             }
3369 
3370             /*
3371              * We want to check on the first iteration, just in case it was
3372              * the first time and we had to sync the dirty bitmap.
3373              * qemu_clock_get_ns() is a bit expensive, so we only check every
3374              * few iterations.
3375              */
3376             if ((i & 63) == 0) {
3377                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3378                               1000000;
3379                 if (t1 > MAX_WAIT) {
3380                     trace_ram_save_iterate_big_wait(t1, i);
3381                     break;
3382                 }
3383             }
3384             i++;
3385         }
3386     }
3387     qemu_mutex_unlock(&rs->bitmap_mutex);
3388 
3389     /*
3390      * Must occur before EOS (or any QEMUFile operation)
3391      * because of RDMA protocol.
3392      */
3393     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3394 
3395 out:
3396     if (ret >= 0
3397         && migration_is_setup_or_active(migrate_get_current()->state)) {
3398         ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3399         if (ret < 0) {
3400             return ret;
3401         }
3402 
3403         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3404         qemu_fflush(f);
3405         ram_transferred_add(8);
3406 
3407         ret = qemu_file_get_error(f);
3408     }
3409     if (ret < 0) {
3410         return ret;
3411     }
3412 
3413     return done;
3414 }
3415 
3416 /**
3417  * ram_save_complete: function called to send the remaining amount of ram
3418  *
3419  * Returns zero to indicate success or negative on error
3420  *
3421  * Called with iothread lock
3422  *
3423  * @f: QEMUFile where to send the data
3424  * @opaque: RAMState pointer
3425  */
3426 static int ram_save_complete(QEMUFile *f, void *opaque)
3427 {
3428     RAMState **temp = opaque;
3429     RAMState *rs = *temp;
3430     int ret = 0;
3431 
3432     rs->last_stage = !migration_in_colo_state();
3433 
3434     WITH_RCU_READ_LOCK_GUARD() {
3435         if (!migration_in_postcopy()) {
3436             migration_bitmap_sync_precopy(rs);
3437         }
3438 
3439         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3440 
3441         /* try transferring iterative blocks of memory */
3442 
3443         /* flush all remaining blocks regardless of rate limiting */
3444         qemu_mutex_lock(&rs->bitmap_mutex);
3445         while (true) {
3446             int pages;
3447 
3448             pages = ram_find_and_save_block(rs);
3449             /* no more blocks to send */
3450             if (pages == 0) {
3451                 break;
3452             }
3453             if (pages < 0) {
3454                 ret = pages;
3455                 break;
3456             }
3457         }
3458         qemu_mutex_unlock(&rs->bitmap_mutex);
3459 
3460         flush_compressed_data(rs);
3461         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3462     }
3463 
3464     if (ret < 0) {
3465         return ret;
3466     }
3467 
3468     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3469     if (ret < 0) {
3470         return ret;
3471     }
3472 
3473     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3474     qemu_fflush(f);
3475 
3476     return 0;
3477 }
3478 
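     /*
      * ram_state_pending_estimate: cheap estimate of the remaining dirty RAM.
      * It reports the current migration_dirty_pages count without forcing a
      * bitmap sync, so the value may be stale; ram_state_pending_exact()
      * below re-syncs when the estimate drops under the threshold.
      */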
3479 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3480                                        uint64_t *can_postcopy)
3481 {
3482     RAMState **temp = opaque;
3483     RAMState *rs = *temp;
3484 
3485     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3486 
3487     if (migrate_postcopy_ram()) {
3488         /* We can do postcopy, and all the data is postcopiable */
3489         *can_postcopy += remaining_size;
3490     } else {
3491         *must_precopy += remaining_size;
3492     }
3493 }
3494 
3495 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3496                                     uint64_t *can_postcopy)
3497 {
3498     MigrationState *s = migrate_get_current();
3499     RAMState **temp = opaque;
3500     RAMState *rs = *temp;
3501 
3502     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3503 
3504     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3505         qemu_mutex_lock_iothread();
3506         WITH_RCU_READ_LOCK_GUARD() {
3507             migration_bitmap_sync_precopy(rs);
3508         }
3509         qemu_mutex_unlock_iothread();
3510         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3511     }
3512 
3513     if (migrate_postcopy_ram()) {
3514         /* We can do postcopy, and all the data is postcopiable */
3515         *can_postcopy += remaining_size;
3516     } else {
3517         *must_precopy += remaining_size;
3518     }
3519 }
3520 
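     /*
      * load_xbzrle: decode one XBZRLE-encoded page from the stream.
      * Reads the encoding flag and encoded length, then applies the delta
      * on top of the current contents of @host.  Returns 0 on success and
      * -1 on a malformed header or a decode failure.
      */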
3521 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3522 {
3523     unsigned int xh_len;
3524     int xh_flags;
3525     uint8_t *loaded_data;
3526 
3527     /* extract RLE header */
3528     xh_flags = qemu_get_byte(f);
3529     xh_len = qemu_get_be16(f);
3530 
3531     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3532         error_report("Failed to load XBZRLE page - wrong compression!");
3533         return -1;
3534     }
3535 
3536     if (xh_len > TARGET_PAGE_SIZE) {
3537         error_report("Failed to load XBZRLE page - len overflow!");
3538         return -1;
3539     }
3540     loaded_data = XBZRLE.decoded_buf;
3541     /* load data and decode */
3542     /* it can change loaded_data to point to an internal buffer */
3543     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3544 
3545     /* decode RLE */
3546     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3547                              TARGET_PAGE_SIZE) == -1) {
3548         error_report("Failed to load XBZRLE page - decode error!");
3549         return -1;
3550     }
3551 
3552     return 0;
3553 }
3554 
3555 /**
3556  * ram_block_from_stream: read a RAMBlock id from the migration stream
3557  *
3558  * Must be called from within a rcu critical section.
3559  *
3560  * Returns a pointer from within the RCU-protected ram_list.
3561  *
3562  * @mis: the migration incoming state pointer
3563  * @f: QEMUFile where to read the data from
3564  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3565  * @channel: the channel we're using
3566  */
3567 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3568                                               QEMUFile *f, int flags,
3569                                               int channel)
3570 {
3571     RAMBlock *block = mis->last_recv_block[channel];
3572     char id[256];
3573     uint8_t len;
3574 
3575     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3576         if (!block) {
3577             error_report("Ack, bad migration stream!");
3578             return NULL;
3579         }
3580         return block;
3581     }
3582 
3583     len = qemu_get_byte(f);
3584     qemu_get_buffer(f, (uint8_t *)id, len);
3585     id[len] = 0;
3586 
3587     block = qemu_ram_block_by_name(id);
3588     if (!block) {
3589         error_report("Can't find block %s", id);
3590         return NULL;
3591     }
3592 
3593     if (ramblock_is_ignored(block)) {
3594         error_report("block %s should not be migrated !", id);
3595         return NULL;
3596     }
3597 
3598     mis->last_recv_block[channel] = block;
3599 
3600     return block;
3601 }
3602 
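     /* Map a RAMBlock offset to its host address, or NULL if out of range */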
3603 static inline void *host_from_ram_block_offset(RAMBlock *block,
3604                                                ram_addr_t offset)
3605 {
3606     if (!offset_in_ramblock(block, offset)) {
3607         return NULL;
3608     }
3609 
3610     return block->host + offset;
3611 }
3612 
3613 static void *host_page_from_ram_block_offset(RAMBlock *block,
3614                                              ram_addr_t offset)
3615 {
3616     /* Note: Explicitly no check against offset_in_ramblock(). */
3617     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3618                                    block->page_size);
3619 }
3620 
3621 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3622                                                          ram_addr_t offset)
3623 {
3624     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3625 }
3626 
3627 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3628                              ram_addr_t offset, bool record_bitmap)
3629 {
3630     if (!offset_in_ramblock(block, offset)) {
3631         return NULL;
3632     }
3633     if (!block->colo_cache) {
3634         error_report("%s: colo_cache is NULL in block :%s",
3635                      __func__, block->idstr);
3636         return NULL;
3637     }
3638 
3639     /*
3640      * During a COLO checkpoint, we need a bitmap of these migrated pages.
3641      * It helps us decide which pages in the RAM cache should be flushed
3642      * into the VM's RAM later.
3643      */
3644     if (record_bitmap &&
3645         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3646         ram_state->migration_dirty_pages++;
3647     }
3648     return block->colo_cache + offset;
3649 }
3650 
3651 /**
3652  * ram_handle_compressed: handle the zero page case
3653  *
3654  * If a page (or a whole RDMA chunk) has been
3655  * determined to be zero, then zap it.
3656  *
3657  * @host: host address for the zero page
3658  * @ch: what the page is filled from.  We only support zero
3659  * @size: size of the zero page
3660  */
3661 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3662 {
3663     if (ch != 0 || !buffer_is_zero(host, size)) {
3664         memset(host, ch, size);
3665     }
3666 }
3667 
3668 /* return the size after decompression, or negative value on error */
3669 static int
3670 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3671                      const uint8_t *source, size_t source_len)
3672 {
3673     int err;
3674 
3675     err = inflateReset(stream);
3676     if (err != Z_OK) {
3677         return -1;
3678     }
3679 
3680     stream->avail_in = source_len;
3681     stream->next_in = (uint8_t *)source;
3682     stream->avail_out = dest_len;
3683     stream->next_out = dest;
3684 
3685     err = inflate(stream, Z_NO_FLUSH);
3686     if (err != Z_STREAM_END) {
3687         return -1;
3688     }
3689 
3690     return stream->total_out;
3691 }
3692 
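     /*
      * Decompression worker thread: wait for a request on param->cond,
      * inflate param->compbuf into the destination page, then signal
      * decomp_done_cond so the slot can be reused.
      */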
3693 static void *do_data_decompress(void *opaque)
3694 {
3695     DecompressParam *param = opaque;
3696     unsigned long pagesize;
3697     uint8_t *des;
3698     int len, ret;
3699 
3700     qemu_mutex_lock(&param->mutex);
3701     while (!param->quit) {
3702         if (param->des) {
3703             des = param->des;
3704             len = param->len;
3705             param->des = 0;
3706             qemu_mutex_unlock(&param->mutex);
3707 
3708             pagesize = TARGET_PAGE_SIZE;
3709 
3710             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3711                                        param->compbuf, len);
3712             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3713                 error_report("decompress data failed");
3714                 qemu_file_set_error(decomp_file, ret);
3715             }
3716 
3717             qemu_mutex_lock(&decomp_done_lock);
3718             param->done = true;
3719             qemu_cond_signal(&decomp_done_cond);
3720             qemu_mutex_unlock(&decomp_done_lock);
3721 
3722             qemu_mutex_lock(&param->mutex);
3723         } else {
3724             qemu_cond_wait(&param->cond, &param->mutex);
3725         }
3726     }
3727     qemu_mutex_unlock(&param->mutex);
3728 
3729     return NULL;
3730 }
3731 
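     /* Wait until every decompression thread has finished its current page */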
3732 static int wait_for_decompress_done(void)
3733 {
3734     int idx, thread_count;
3735 
3736     if (!migrate_compress()) {
3737         return 0;
3738     }
3739 
3740     thread_count = migrate_decompress_threads();
3741     qemu_mutex_lock(&decomp_done_lock);
3742     for (idx = 0; idx < thread_count; idx++) {
3743         while (!decomp_param[idx].done) {
3744             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3745         }
3746     }
3747     qemu_mutex_unlock(&decomp_done_lock);
3748     return qemu_file_get_error(decomp_file);
3749 }
3750 
3751 static void compress_threads_load_cleanup(void)
3752 {
3753     int i, thread_count;
3754 
3755     if (!migrate_compress()) {
3756         return;
3757     }
3758     thread_count = migrate_decompress_threads();
3759     for (i = 0; i < thread_count; i++) {
3760         /*
3761          * We use it as an indicator of whether the thread is
3762          * properly initialized or not.
3763          */
3764         if (!decomp_param[i].compbuf) {
3765             break;
3766         }
3767 
3768         qemu_mutex_lock(&decomp_param[i].mutex);
3769         decomp_param[i].quit = true;
3770         qemu_cond_signal(&decomp_param[i].cond);
3771         qemu_mutex_unlock(&decomp_param[i].mutex);
3772     }
3773     for (i = 0; i < thread_count; i++) {
3774         if (!decomp_param[i].compbuf) {
3775             break;
3776         }
3777 
3778         qemu_thread_join(decompress_threads + i);
3779         qemu_mutex_destroy(&decomp_param[i].mutex);
3780         qemu_cond_destroy(&decomp_param[i].cond);
3781         inflateEnd(&decomp_param[i].stream);
3782         g_free(decomp_param[i].compbuf);
3783         decomp_param[i].compbuf = NULL;
3784     }
3785     g_free(decompress_threads);
3786     g_free(decomp_param);
3787     decompress_threads = NULL;
3788     decomp_param = NULL;
3789     decomp_file = NULL;
3790 }
3791 
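     /*
      * Allocate the decompression threads and their per-thread state.
      * Returns -1 (after cleaning up) if any zlib stream fails to init.
      */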
3792 static int compress_threads_load_setup(QEMUFile *f)
3793 {
3794     int i, thread_count;
3795 
3796     if (!migrate_compress()) {
3797         return 0;
3798     }
3799 
3800     thread_count = migrate_decompress_threads();
3801     decompress_threads = g_new0(QemuThread, thread_count);
3802     decomp_param = g_new0(DecompressParam, thread_count);
3803     qemu_mutex_init(&decomp_done_lock);
3804     qemu_cond_init(&decomp_done_cond);
3805     decomp_file = f;
3806     for (i = 0; i < thread_count; i++) {
3807         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3808             goto exit;
3809         }
3810 
3811         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3812         qemu_mutex_init(&decomp_param[i].mutex);
3813         qemu_cond_init(&decomp_param[i].cond);
3814         decomp_param[i].done = true;
3815         decomp_param[i].quit = false;
3816         qemu_thread_create(decompress_threads + i, "decompress",
3817                            do_data_decompress, decomp_param + i,
3818                            QEMU_THREAD_JOINABLE);
3819     }
3820     return 0;
3821 exit:
3822     compress_threads_load_cleanup();
3823     return -1;
3824 }
3825 
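     /*
      * Hand one compressed page to an idle decompression thread; if all
      * threads are busy, wait on decomp_done_cond until one becomes free.
      */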
3826 static void decompress_data_with_multi_threads(QEMUFile *f,
3827                                                void *host, int len)
3828 {
3829     int idx, thread_count;
3830 
3831     thread_count = migrate_decompress_threads();
3832     QEMU_LOCK_GUARD(&decomp_done_lock);
3833     while (true) {
3834         for (idx = 0; idx < thread_count; idx++) {
3835             if (decomp_param[idx].done) {
3836                 decomp_param[idx].done = false;
3837                 qemu_mutex_lock(&decomp_param[idx].mutex);
3838                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3839                 decomp_param[idx].des = host;
3840                 decomp_param[idx].len = len;
3841                 qemu_cond_signal(&decomp_param[idx].cond);
3842                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3843                 break;
3844             }
3845         }
3846         if (idx < thread_count) {
3847             break;
3848         } else {
3849             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3850         }
3851     }
3852 }
3853 
3854 static void colo_init_ram_state(void)
3855 {
3856     ram_state_init(&ram_state);
3857 }
3858 
3859 /*
3860  * COLO cache: this is for the secondary VM. We cache the whole
3861  * memory of the secondary VM; the global lock must be held when
3862  * calling this helper.
3863  */
3864 int colo_init_ram_cache(void)
3865 {
3866     RAMBlock *block;
3867 
3868     WITH_RCU_READ_LOCK_GUARD() {
3869         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3870             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3871                                                     NULL, false, false);
3872             if (!block->colo_cache) {
3873                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3874                              " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3875                              block->used_length);
3876                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3877                     if (block->colo_cache) {
3878                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3879                         block->colo_cache = NULL;
3880                     }
3881                 }
3882                 return -errno;
3883             }
3884             if (!machine_dump_guest_core(current_machine)) {
3885                 qemu_madvise(block->colo_cache, block->used_length,
3886                              QEMU_MADV_DONTDUMP);
3887             }
3888         }
3889     }
3890 
3891     /*
3892      * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3893      * decide which pages in the cache should be flushed into the SVM's RAM.
3894      * Here we use the same name 'ram_bitmap' as for migration.
3895      */
3896     if (ram_bytes_total()) {
3897         RAMBlock *block;
3898 
3899         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3900             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3901             block->bmap = bitmap_new(pages);
3902         }
3903     }
3904 
3905     colo_init_ram_state();
3906     return 0;
3907 }
3908 
3909 /* TODO: duplicated with ram_init_bitmaps */
3910 void colo_incoming_start_dirty_log(void)
3911 {
3912     RAMBlock *block = NULL;
3913     /* For memory_global_dirty_log_start below. */
3914     qemu_mutex_lock_iothread();
3915     qemu_mutex_lock_ramlist();
3916 
3917     memory_global_dirty_log_sync();
3918     WITH_RCU_READ_LOCK_GUARD() {
3919         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3920             ramblock_sync_dirty_bitmap(ram_state, block);
3921             /* Discard this dirty bitmap record */
3922             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3923         }
3924         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3925     }
3926     ram_state->migration_dirty_pages = 0;
3927     qemu_mutex_unlock_ramlist();
3928     qemu_mutex_unlock_iothread();
3929 }
3930 
3931 /* The global lock must be held when calling this helper */
3932 void colo_release_ram_cache(void)
3933 {
3934     RAMBlock *block;
3935 
3936     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3937     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3938         g_free(block->bmap);
3939         block->bmap = NULL;
3940     }
3941 
3942     WITH_RCU_READ_LOCK_GUARD() {
3943         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3944             if (block->colo_cache) {
3945                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3946                 block->colo_cache = NULL;
3947             }
3948         }
3949     }
3950     ram_state_cleanup(&ram_state);
3951 }
3952 
3953 /**
3954  * ram_load_setup: Setup RAM for migration incoming side
3955  *
3956  * Returns zero to indicate success and negative for error
3957  *
3958  * @f: QEMUFile where to receive the data
3959  * @opaque: RAMState pointer
3960  */
3961 static int ram_load_setup(QEMUFile *f, void *opaque)
3962 {
3963     if (compress_threads_load_setup(f)) {
3964         return -1;
3965     }
3966 
3967     xbzrle_load_setup();
3968     ramblock_recv_map_init();
3969 
3970     return 0;
3971 }
3972 
3973 static int ram_load_cleanup(void *opaque)
3974 {
3975     RAMBlock *rb;
3976 
3977     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3978         qemu_ram_block_writeback(rb);
3979     }
3980 
3981     xbzrle_load_cleanup();
3982     compress_threads_load_cleanup();
3983 
3984     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3985         g_free(rb->receivedmap);
3986         rb->receivedmap = NULL;
3987     }
3988 
3989     return 0;
3990 }
3991 
3992 /**
3993  * ram_postcopy_incoming_init: allocate postcopy data structures
3994  *
3995  * Returns 0 for success and negative if there was one error
3996  *
3997  * @mis: current migration incoming state
3998  *
3999  * Allocate the data structures etc. needed by incoming migration with
4000  * postcopy-ram. postcopy-ram's similarly named
4001  * postcopy_ram_incoming_init() does the work.
4002  */
4003 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4004 {
4005     return postcopy_ram_incoming_init(mis);
4006 }
4007 
4008 /**
4009  * ram_load_postcopy: load a page in postcopy case
4010  *
4011  * Returns 0 for success or -errno in case of error
4012  *
4013  * Called in postcopy mode by ram_load().
4014  * rcu_read_lock is taken prior to this being called.
4015  *
4016  * @f: QEMUFile to read the data from
4017  * @channel: the channel to use for loading
4018  */
4019 int ram_load_postcopy(QEMUFile *f, int channel)
4020 {
4021     int flags = 0, ret = 0;
4022     bool place_needed = false;
4023     bool matches_target_page_size = false;
4024     MigrationIncomingState *mis = migration_incoming_get_current();
4025     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4026 
4027     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4028         ram_addr_t addr;
4029         void *page_buffer = NULL;
4030         void *place_source = NULL;
4031         RAMBlock *block = NULL;
4032         uint8_t ch;
4033         int len;
4034 
4035         addr = qemu_get_be64(f);
4036 
4037         /*
4038          * If there is a QEMU file error, we should stop here; "addr"
4039          * may then be invalid.
4040          */
4041         ret = qemu_file_get_error(f);
4042         if (ret) {
4043             break;
4044         }
4045 
4046         flags = addr & ~TARGET_PAGE_MASK;
4047         addr &= TARGET_PAGE_MASK;
4048 
4049         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4050         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4051                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4052             block = ram_block_from_stream(mis, f, flags, channel);
4053             if (!block) {
4054                 ret = -EINVAL;
4055                 break;
4056             }
4057 
4058             /*
4059              * Relying on used_length is racy and can result in false positives.
4060              * We might place pages beyond used_length in case RAM was shrunk
4061              * while in postcopy, which is fine - trying to place via
4062              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4063              */
4064             if (!block->host || addr >= block->postcopy_length) {
4065                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4066                 ret = -EINVAL;
4067                 break;
4068             }
4069             tmp_page->target_pages++;
4070             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4071             /*
4072              * Postcopy requires that we place whole host pages atomically;
4073              * these may be huge pages for RAMBlocks that are backed by
4074              * hugetlbfs.
4075              * To make it atomic, the data is read into a temporary page
4076              * that's moved into place later.
4077              * The migration protocol uses, possibly smaller, target pages;
4078              * however, the source ensures it always sends all the components
4079              * of a host page in one chunk.
4080              */
4081             page_buffer = tmp_page->tmp_huge_page +
4082                           host_page_offset_from_ram_block_offset(block, addr);
4083             /* If all target pages are zero then we can optimise the placement */
4084             if (tmp_page->target_pages == 1) {
4085                 tmp_page->host_addr =
4086                     host_page_from_ram_block_offset(block, addr);
4087             } else if (tmp_page->host_addr !=
4088                        host_page_from_ram_block_offset(block, addr)) {
4089                 /* not the 1st TP within the HP */
4090                 error_report("Non-same host page detected on channel %d: "
4091                              "Target host page %p, received host page %p "
4092                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4093                              channel, tmp_page->host_addr,
4094                              host_page_from_ram_block_offset(block, addr),
4095                              block->idstr, addr, tmp_page->target_pages);
4096                 ret = -EINVAL;
4097                 break;
4098             }
4099 
4100             /*
4101              * If it's the last part of a host page then we place the host
4102              * page
4103              */
4104             if (tmp_page->target_pages ==
4105                 (block->page_size / TARGET_PAGE_SIZE)) {
4106                 place_needed = true;
4107             }
4108             place_source = tmp_page->tmp_huge_page;
4109         }
4110 
4111         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4112         case RAM_SAVE_FLAG_ZERO:
4113             ch = qemu_get_byte(f);
4114             /*
4115              * We can skip setting page_buffer when this is a zero page
4116              * and (block->page_size == TARGET_PAGE_SIZE).
4117              */
4118             if (ch || !matches_target_page_size) {
4119                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4120             }
4121             if (ch) {
4122                 tmp_page->all_zero = false;
4123             }
4124             break;
4125 
4126         case RAM_SAVE_FLAG_PAGE:
4127             tmp_page->all_zero = false;
4128             if (!matches_target_page_size) {
4129                 /* For huge pages, we always use temporary buffer */
4130                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4131             } else {
4132                 /*
4133                  * For small pages that matches target page size, we
4134                  * avoid the qemu_file copy.  Instead we directly use
4135                  * the buffer of QEMUFile to place the page.  Note: we
4136                  * cannot do any QEMUFile operation before using that
4137                  * buffer to make sure the buffer is valid when
4138                  * placing the page.
4139                  */
4140                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4141                                          TARGET_PAGE_SIZE);
4142             }
4143             break;
4144         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4145             tmp_page->all_zero = false;
4146             len = qemu_get_be32(f);
4147             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4148                 error_report("Invalid compressed data length: %d", len);
4149                 ret = -EINVAL;
4150                 break;
4151             }
4152             decompress_data_with_multi_threads(f, page_buffer, len);
4153             break;
4154 
4155         case RAM_SAVE_FLAG_EOS:
4156             /* normal exit */
4157             multifd_recv_sync_main();
4158             break;
4159         default:
4160             error_report("Unknown combination of migration flags: 0x%x"
4161                          " (postcopy mode)", flags);
4162             ret = -EINVAL;
4163             break;
4164         }
4165 
4166         /* Got the whole host page, wait for decompress before placing. */
4167         if (place_needed) {
4168             ret |= wait_for_decompress_done();
4169         }
4170 
4171         /* Detect any possible file errors */
4172         if (!ret && qemu_file_get_error(f)) {
4173             ret = qemu_file_get_error(f);
4174         }
4175 
4176         if (!ret && place_needed) {
4177             if (tmp_page->all_zero) {
4178                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4179             } else {
4180                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4181                                           place_source, block);
4182             }
4183             place_needed = false;
4184             postcopy_temp_page_reset(tmp_page);
4185         }
4186     }
4187 
4188     return ret;
4189 }
4190 
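     /* True while the incoming side is between the LISTENING and END states */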
4191 static bool postcopy_is_running(void)
4192 {
4193     PostcopyState ps = postcopy_state_get();
4194     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4195 }
4196 
4197 /*
4198  * Flush the contents of the RAM cache into the SVM's memory.
4199  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4200  */
4201 void colo_flush_ram_cache(void)
4202 {
4203     RAMBlock *block = NULL;
4204     void *dst_host;
4205     void *src_host;
4206     unsigned long offset = 0;
4207 
4208     memory_global_dirty_log_sync();
4209     WITH_RCU_READ_LOCK_GUARD() {
4210         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4211             ramblock_sync_dirty_bitmap(ram_state, block);
4212         }
4213     }
4214 
4215     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4216     WITH_RCU_READ_LOCK_GUARD() {
4217         block = QLIST_FIRST_RCU(&ram_list.blocks);
4218 
4219         while (block) {
4220             unsigned long num = 0;
4221 
4222             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4223             if (!offset_in_ramblock(block,
4224                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4225                 offset = 0;
4226                 num = 0;
4227                 block = QLIST_NEXT_RCU(block, next);
4228             } else {
4229                 unsigned long i = 0;
4230 
4231                 for (i = 0; i < num; i++) {
4232                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4233                 }
4234                 dst_host = block->host
4235                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4236                 src_host = block->colo_cache
4237                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4238                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4239                 offset += num;
4240             }
4241         }
4242     }
4243     trace_colo_flush_ram_cache_end();
4244 }
4245 
4246 /**
4247  * ram_load_precopy: load pages in precopy case
4248  *
4249  * Returns 0 for success or -errno in case of error
4250  *
4251  * Called in precopy mode by ram_load().
4252  * rcu_read_lock is taken prior to this being called.
4253  *
4254  * @f: QEMUFile to read the data from
4255  */
4256 static int ram_load_precopy(QEMUFile *f)
4257 {
4258     MigrationIncomingState *mis = migration_incoming_get_current();
4259     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4260     /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
4261     bool postcopy_advised = migration_incoming_postcopy_advised();
4262     if (!migrate_compress()) {
4263         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4264     }
4265 
4266     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4267         ram_addr_t addr, total_ram_bytes;
4268         void *host = NULL, *host_bak = NULL;
4269         uint8_t ch;
4270 
4271         /*
4272          * Yield periodically to let the main loop run, but an iteration of
4273          * the main loop is expensive, so only do it every so many iterations.
4274          */
4275         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4276             aio_co_schedule(qemu_get_current_aio_context(),
4277                             qemu_coroutine_self());
4278             qemu_coroutine_yield();
4279         }
4280         i++;
4281 
4282         addr = qemu_get_be64(f);
4283         flags = addr & ~TARGET_PAGE_MASK;
4284         addr &= TARGET_PAGE_MASK;
4285 
4286         if (flags & invalid_flags) {
4287             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4288                 error_report("Received an unexpected compressed page");
4289             }
4290 
4291             ret = -EINVAL;
4292             break;
4293         }
4294 
4295         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4296                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4297             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4298                                                     RAM_CHANNEL_PRECOPY);
4299 
4300             host = host_from_ram_block_offset(block, addr);
4301             /*
4302              * After entering the COLO stage, we should not load pages
4303              * into the SVM's memory directly; we put them into colo_cache first.
4304              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4305              * Previously, we copied all this memory in the COLO preparation
4306              * stage while the VM was stopped, which is a time-consuming process.
4307              * Here we optimize it with a trick: back up every page during the
4308              * migration process while COLO is enabled. Although this affects the
4309              * speed of the migration, it clearly reduces the downtime of backing
4310              * up all of the SVM's memory in the COLO preparation stage.
4311              */
4312             if (migration_incoming_colo_enabled()) {
4313                 if (migration_incoming_in_colo_state()) {
4314                     /* In COLO stage, put all pages into cache temporarily */
4315                     host = colo_cache_from_block_offset(block, addr, true);
4316                 } else {
4317                    /*
4318                     * In migration stage but before COLO stage,
4319                     * Put all pages into both cache and SVM's memory.
4320                     */
4321                     host_bak = colo_cache_from_block_offset(block, addr, false);
4322                 }
4323             }
4324             if (!host) {
4325                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4326                 ret = -EINVAL;
4327                 break;
4328             }
4329             if (!migration_incoming_in_colo_state()) {
4330                 ramblock_recv_bitmap_set(block, host);
4331             }
4332 
4333             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4334         }
4335 
4336         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4337         case RAM_SAVE_FLAG_MEM_SIZE:
4338             /* Synchronize RAM block list */
4339             total_ram_bytes = addr;
4340             while (!ret && total_ram_bytes) {
4341                 RAMBlock *block;
4342                 char id[256];
4343                 ram_addr_t length;
4344 
4345                 len = qemu_get_byte(f);
4346                 qemu_get_buffer(f, (uint8_t *)id, len);
4347                 id[len] = 0;
4348                 length = qemu_get_be64(f);
4349 
4350                 block = qemu_ram_block_by_name(id);
4351                 if (block && !qemu_ram_is_migratable(block)) {
4352                     error_report("block %s should not be migrated !", id);
4353                     ret = -EINVAL;
4354                 } else if (block) {
4355                     if (length != block->used_length) {
4356                         Error *local_err = NULL;
4357 
4358                         ret = qemu_ram_resize(block, length,
4359                                               &local_err);
4360                         if (local_err) {
4361                             error_report_err(local_err);
4362                         }
4363                     }
4364                     /* For postcopy we need to check hugepage sizes match */
4365                     if (postcopy_advised && migrate_postcopy_ram() &&
4366                         block->page_size != qemu_host_page_size) {
4367                         uint64_t remote_page_size = qemu_get_be64(f);
4368                         if (remote_page_size != block->page_size) {
4369                             error_report("Mismatched RAM page size %s "
4370                                          "(local) %zd != %" PRId64,
4371                                          id, block->page_size,
4372                                          remote_page_size);
4373                             ret = -EINVAL;
4374                         }
4375                     }
4376                     if (migrate_ignore_shared()) {
4377                         hwaddr addr = qemu_get_be64(f);
4378                         if (ramblock_is_ignored(block) &&
4379                             block->mr->addr != addr) {
4380                             error_report("Mismatched GPAs for block %s "
4381                                          "%" PRId64 "!= %" PRId64,
4382                                          id, (uint64_t)addr,
4383                                          (uint64_t)block->mr->addr);
4384                             ret = -EINVAL;
4385                         }
4386                     }
4387                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4388                                           block->idstr);
4389                 } else {
4390                     error_report("Unknown ramblock \"%s\", cannot "
4391                                  "accept migration", id);
4392                     ret = -EINVAL;
4393                 }
4394 
4395                 total_ram_bytes -= length;
4396             }
4397             break;
4398 
4399         case RAM_SAVE_FLAG_ZERO:
4400             ch = qemu_get_byte(f);
4401             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4402             break;
4403 
4404         case RAM_SAVE_FLAG_PAGE:
4405             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4406             break;
4407 
4408         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4409             len = qemu_get_be32(f);
4410             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4411                 error_report("Invalid compressed data length: %d", len);
4412                 ret = -EINVAL;
4413                 break;
4414             }
4415             decompress_data_with_multi_threads(f, host, len);
4416             break;
4417 
4418         case RAM_SAVE_FLAG_XBZRLE:
4419             if (load_xbzrle(f, addr, host) < 0) {
4420                 error_report("Failed to decompress XBZRLE page at "
4421                              RAM_ADDR_FMT, addr);
4422                 ret = -EINVAL;
4423                 break;
4424             }
4425             break;
4426         case RAM_SAVE_FLAG_EOS:
4427             /* normal exit */
4428             multifd_recv_sync_main();
4429             break;
4430         default:
4431             if (flags & RAM_SAVE_FLAG_HOOK) {
4432                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4433             } else {
4434                 error_report("Unknown combination of migration flags: 0x%x",
4435                              flags);
4436                 ret = -EINVAL;
4437             }
4438         }
4439         if (!ret) {
4440             ret = qemu_file_get_error(f);
4441         }
4442         if (!ret && host_bak) {
4443             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4444         }
4445     }
4446 
4447     ret |= wait_for_decompress_done();
4448     return ret;
4449 }
4450 
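     /*
      * ram_load: entry point for loading the "ram" section on the incoming
      * side.  Dispatches to ram_load_postcopy() or ram_load_precopy()
      * depending on whether postcopy is already running; only stream
      * version 4 is accepted.
      */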
4451 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4452 {
4453     int ret = 0;
4454     static uint64_t seq_iter;
4455     /*
4456      * If the system is running in postcopy mode, page insertions into host
4457      * memory must be atomic.
4458      */
4459     bool postcopy_running = postcopy_is_running();
4460 
4461     seq_iter++;
4462 
4463     if (version_id != 4) {
4464         return -EINVAL;
4465     }
4466 
4467     /*
4468      * This RCU critical section can be very long running.
4469      * When RCU reclaims in the code start to become numerous,
4470      * it will be necessary to reduce the granularity of this
4471      * critical section.
4472      */
4473     WITH_RCU_READ_LOCK_GUARD() {
4474         if (postcopy_running) {
4475             /*
4476              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4477              * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
4478              * service fast page faults.
4479              */
4480             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4481         } else {
4482             ret = ram_load_precopy(f);
4483         }
4484     }
4485     trace_ram_load_complete(ret, seq_iter);
4486 
4487     return ret;
4488 }
4489 
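     /* Postcopy is refused if any migratable RAMBlock is backed by pmem */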
4490 static bool ram_has_postcopy(void *opaque)
4491 {
4492     RAMBlock *rb;
4493     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4494         if (ramblock_is_pmem(rb)) {
4495             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4496                         "is not supported now!", rb->idstr, rb->host);
4497             return false;
4498         }
4499     }
4500 
4501     return migrate_postcopy_ram();
4502 }
4503 
4504 /* Sync all the dirty bitmap with destination VM.  */
4505 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4506 {
4507     RAMBlock *block;
4508     QEMUFile *file = s->to_dst_file;
4509     int ramblock_count = 0;
4510 
4511     trace_ram_dirty_bitmap_sync_start();
4512 
4513     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4514         qemu_savevm_send_recv_bitmap(file, block->idstr);
4515         trace_ram_dirty_bitmap_request(block->idstr);
4516         ramblock_count++;
4517     }
4518 
4519     trace_ram_dirty_bitmap_sync_wait();
4520 
4521     /* Wait until all the ramblocks' dirty bitmap synced */
4522     while (ramblock_count--) {
4523         qemu_sem_wait(&s->rp_state.rp_sem);
4524     }
4525 
4526     trace_ram_dirty_bitmap_sync_complete();
4527 
4528     return 0;
4529 }
4530 
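     /* Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem per block */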
4531 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4532 {
4533     qemu_sem_post(&s->rp_state.rp_sem);
4534 }
4535 
4536 /*
4537  * Read the received bitmap, revert it as the initial dirty bitmap.
4538  * This is only used when the postcopy migration is paused but wants
4539  * to resume from a middle point.
4540  */
4541 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4542 {
4543     int ret = -EINVAL;
4544     /* from_dst_file is always valid because we're within rp_thread */
4545     QEMUFile *file = s->rp_state.from_dst_file;
4546     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4547     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4548     uint64_t size, end_mark;
4549 
4550     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4551 
4552     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4553         error_report("%s: incorrect state %s", __func__,
4554                      MigrationStatus_str(s->state));
4555         return -EINVAL;
4556     }
4557 
4558     /*
4559      * Note: see comments in ramblock_recv_bitmap_send() on why we
4560      * need the endianness conversion and the padding.
4561      */
4562     local_size = ROUND_UP(local_size, 8);
4563 
4564     /* Add paddings */
4565     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4566 
4567     size = qemu_get_be64(file);
4568 
4569     /* The size of the bitmap should match with our ramblock */
4570     if (size != local_size) {
4571         error_report("%s: ramblock '%s' bitmap size mismatch "
4572                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4573                      block->idstr, size, local_size);
4574         ret = -EINVAL;
4575         goto out;
4576     }
4577 
4578     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4579     end_mark = qemu_get_be64(file);
4580 
4581     ret = qemu_file_get_error(file);
4582     if (ret || size != local_size) {
4583         error_report("%s: read bitmap failed for ramblock '%s': %d"
4584                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4585                      __func__, block->idstr, ret, local_size, size);
4586         ret = -EIO;
4587         goto out;
4588     }
4589 
4590     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4591         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4592                      __func__, block->idstr, end_mark);
4593         ret = -EINVAL;
4594         goto out;
4595     }
4596 
4597     /*
4598      * Endianness conversion. We are during postcopy (though paused).
4599      * The dirty bitmap won't change. We can directly modify it.
4600      */
4601     bitmap_from_le(block->bmap, le_bitmap, nbits);
4602 
4603     /*
4604      * What we received is "received bitmap". Revert it as the initial
4605      * dirty bitmap for this ramblock.
4606      */
4607     bitmap_complement(block->bmap, block->bmap, nbits);
4608 
4609     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4610     ramblock_dirty_bitmap_clear_discarded_pages(block);
4611 
4612     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4613     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4614 
4615     /*
4616      * We successfully synced the bitmap for the current ramblock. If this is
4617      * the last one to sync, we need to notify the main send thread.
4618      */
4619     ram_dirty_bitmap_reload_notify(s);
4620 
4621     ret = 0;
4622 out:
4623     g_free(le_bitmap);
4624     return ret;
4625 }
4626 
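     /*
      * Postcopy recovery: pull the received bitmaps back from the
      * destination, then rebuild the RAMState bookkeeping before the
      * migration resumes.
      */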
4627 static int ram_resume_prepare(MigrationState *s, void *opaque)
4628 {
4629     RAMState *rs = *(RAMState **)opaque;
4630     int ret;
4631 
4632     ret = ram_dirty_bitmap_sync_all(s, rs);
4633     if (ret) {
4634         return ret;
4635     }
4636 
4637     ram_state_resume_prepare(rs, s->to_dst_file);
4638 
4639     return 0;
4640 }
4641 
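     /* Send a final EOS on the postcopy preempt channel and flush it */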
4642 void postcopy_preempt_shutdown_file(MigrationState *s)
4643 {
4644     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4645     qemu_fflush(s->postcopy_qemufile_src);
4646 }
4647 
4648 static SaveVMHandlers savevm_ram_handlers = {
4649     .save_setup = ram_save_setup,
4650     .save_live_iterate = ram_save_iterate,
4651     .save_live_complete_postcopy = ram_save_complete,
4652     .save_live_complete_precopy = ram_save_complete,
4653     .has_postcopy = ram_has_postcopy,
4654     .state_pending_exact = ram_state_pending_exact,
4655     .state_pending_estimate = ram_state_pending_estimate,
4656     .load_state = ram_load,
4657     .save_cleanup = ram_save_cleanup,
4658     .load_setup = ram_load_setup,
4659     .load_cleanup = ram_load_cleanup,
4660     .resume_prepare = ram_resume_prepare,
4661 };
4662 
4663 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4664                                       size_t old_size, size_t new_size)
4665 {
4666     PostcopyState ps = postcopy_state_get();
4667     ram_addr_t offset;
4668     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4669     Error *err = NULL;
4670 
4671     if (ramblock_is_ignored(rb)) {
4672         return;
4673     }
4674 
4675     if (!migration_is_idle()) {
4676         /*
4677          * Precopy code on the source cannot deal with the size of RAM blocks
4678          * changing at random points in time - especially after sending the
4679          * RAM block sizes in the migration stream, they must no longer change.
4680          * Abort and indicate a proper reason.
4681          */
4682         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4683         migration_cancel(err);
4684         error_free(err);
4685     }
4686 
4687     switch (ps) {
4688     case POSTCOPY_INCOMING_ADVISE:
4689         /*
4690          * Update what ram_postcopy_incoming_init()->init_range() does at the
4691          * time postcopy was advised. Syncing RAM blocks with the source will
4692          * result in RAM resizes.
4693          */
4694         if (old_size < new_size) {
4695             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4696                 error_report("RAM block '%s' discard of resized RAM failed",
4697                              rb->idstr);
4698             }
4699         }
4700         rb->postcopy_length = new_size;
4701         break;
4702     case POSTCOPY_INCOMING_NONE:
4703     case POSTCOPY_INCOMING_RUNNING:
4704     case POSTCOPY_INCOMING_END:
4705         /*
4706          * Once our guest is running, postcopy no longer cares about
4707          * resizes. When growing, the new memory was not available on the
4708          * source, so no handling is needed.
4709          */
4710         break;
4711     default:
4712         error_report("RAM block '%s' resized during postcopy state: %d",
4713                      rb->idstr, ps);
4714         exit(-1);
4715     }
4716 }
4717 
4718 static RAMBlockNotifier ram_mig_ram_notifier = {
4719     .ram_block_resized = ram_mig_ram_block_resized,
4720 };
4721 
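     /* Register the "ram" savevm handlers and the RAM block resize notifier */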
4722 void ram_mig_init(void)
4723 {
4724     qemu_mutex_init(&XBZRLE.lock);
4725     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4726     ram_block_notifier_add(&ram_mig_ram_notifier);
4727 }
4728