xref: /openbmc/qemu/migration/ram.c (revision 90f9e35b)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/cpu-throttle.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58 #include "sysemu/runstate.h"
59 
60 #include "hw/boards.h" /* for machine_dump_guest_core() */
61 
62 #if defined(__linux__)
63 #include "qemu/userfaultfd.h"
64 #endif /* defined(__linux__) */
65 
66 /***********************************************************/
67 /* ram save/restore */
68 
69 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
70  * worked for pages that were filled with the same char.  We switched
71  * it to only search for the zero value.  It was then renamed to avoid
72  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
73  */
74 
75 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
76 #define RAM_SAVE_FLAG_ZERO     0x02
77 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
78 #define RAM_SAVE_FLAG_PAGE     0x08
79 #define RAM_SAVE_FLAG_EOS      0x10
80 #define RAM_SAVE_FLAG_CONTINUE 0x20
81 #define RAM_SAVE_FLAG_XBZRLE   0x40
82 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
83 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
84 
85 XBZRLECacheStats xbzrle_counters;
86 
87 /* This struct contains the XBZRLE cache and the static buffers
88    used by the compression */
89 static struct {
90     /* buffer used for XBZRLE encoding */
91     uint8_t *encoded_buf;
92     /* buffer for storing page content */
93     uint8_t *current_buf;
94     /* Cache for XBZRLE, Protected by lock. */
95     PageCache *cache;
96     QemuMutex lock;
97     /* it will store a page full of zeros */
98     uint8_t *zero_target_page;
99     /* buffer used for XBZRLE decoding */
100     uint8_t *decoded_buf;
101 } XBZRLE;
102 
103 static void XBZRLE_cache_lock(void)
104 {
105     if (migrate_use_xbzrle()) {
106         qemu_mutex_lock(&XBZRLE.lock);
107     }
108 }
109 
110 static void XBZRLE_cache_unlock(void)
111 {
112     if (migrate_use_xbzrle()) {
113         qemu_mutex_unlock(&XBZRLE.lock);
114     }
115 }
116 
117 /**
118  * xbzrle_cache_resize: resize the xbzrle cache
119  *
120  * This function is called from migrate_params_apply in main
121  * thread, possibly while a migration is in progress.  A running
122  * migration may be using the cache and might finish during this call,
123  * hence changes to the cache are protected by the XBZRLE.lock mutex.
124  *
125  * Returns 0 for success or -1 for error
126  *
127  * @new_size: new cache size
128  * @errp: set *errp if the check failed, with reason
129  */
130 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
131 {
132     PageCache *new_cache;
133     int64_t ret = 0;
134 
135     /* Check for truncation */
136     if (new_size != (size_t)new_size) {
137         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
138                    "exceeding address space");
139         return -1;
140     }
141 
142     if (new_size == migrate_xbzrle_cache_size()) {
143         /* nothing to do */
144         return 0;
145     }
146 
147     XBZRLE_cache_lock();
148 
149     if (XBZRLE.cache != NULL) {
150         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
151         if (!new_cache) {
152             ret = -1;
153             goto out;
154         }
155 
156         cache_fini(XBZRLE.cache);
157         XBZRLE.cache = new_cache;
158     }
159 out:
160     XBZRLE_cache_unlock();
161     return ret;
162 }
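
/*
 * Illustrative only: a minimal sketch (not part of the original file) of how
 * a caller such as migrate_params_apply(), mentioned in the comment above,
 * might drive xbzrle_cache_resize().  The example_* name is an assumption
 * made up for this example.
 */
#if 0 /* example sketch, not built */
static void example_apply_xbzrle_cache_size(uint64_t new_size, Error **errp)
{
    Error *local_err = NULL;

    if (xbzrle_cache_resize(new_size, &local_err) < 0) {
        /* forward the reason filled in by xbzrle_cache_resize() */
        error_propagate(errp, local_err);
        return;
    }
    /* on success the resized cache is already in use by the migration code */
}
#endif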
163 
164 bool ramblock_is_ignored(RAMBlock *block)
165 {
166     return !qemu_ram_is_migratable(block) ||
167            (migrate_ignore_shared() && qemu_ram_is_shared(block));
168 }
169 
170 #undef RAMBLOCK_FOREACH
171 
172 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
173 {
174     RAMBlock *block;
175     int ret = 0;
176 
177     RCU_READ_LOCK_GUARD();
178 
179     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
180         ret = func(block, opaque);
181         if (ret) {
182             break;
183         }
184     }
185     return ret;
186 }
187 
188 static void ramblock_recv_map_init(void)
189 {
190     RAMBlock *rb;
191 
192     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
193         assert(!rb->receivedmap);
194         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
195     }
196 }
197 
198 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
199 {
200     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
201                     rb->receivedmap);
202 }
203 
204 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
205 {
206     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
207 }
208 
209 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
210 {
211     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
212 }
213 
214 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
215                                     size_t nr)
216 {
217     bitmap_set_atomic(rb->receivedmap,
218                       ramblock_recv_bitmap_offset(host_addr, rb),
219                       nr);
220 }
221 
222 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
223 
224 /*
225  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
226  *
227  * Returns >0 if success with sent bytes, or <0 if error.
228  */
229 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
230                                   const char *block_name)
231 {
232     RAMBlock *block = qemu_ram_block_by_name(block_name);
233     unsigned long *le_bitmap, nbits;
234     uint64_t size;
235 
236     if (!block) {
237         error_report("%s: invalid block name: %s", __func__, block_name);
238         return -1;
239     }
240 
241     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
242 
243     /*
244      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
245      * machines we may need 4 more bytes for padding (see below
246      * comment). So extend it a bit beforehand.
247      */
248     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
249 
250     /*
251      * Always use little endian when sending the bitmap. This is
252      * required so that the bitmap stays valid even when source and
253      * destination VMs use different endianness. (Note: big endian won't work.)
254      */
255     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
256 
257     /* Size of the bitmap, in bytes */
258     size = DIV_ROUND_UP(nbits, 8);
259 
260     /*
261      * size is always aligned to 8 bytes for 64bit machines, but it
262      * may not be true for 32bit machines. We need this padding to
263      * make sure the migration can survive even between 32bit and
264      * 64bit machines.
265      */
266     size = ROUND_UP(size, 8);
267 
268     qemu_put_be64(file, size);
269     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
270     /*
271      * Mark as an end, in case the middle part is screwed up due to
272      * some "mysterious" reason.
273      */
274     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
275     qemu_fflush(file);
276 
277     g_free(le_bitmap);
278 
279     if (qemu_file_get_error(file)) {
280         return qemu_file_get_error(file);
281     }
282 
283     return size + sizeof(size);
284 }
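
/*
 * Illustrative only: a rough sketch of how the receiving side could consume
 * the stream produced above (8-byte size, padded little-endian bitmap,
 * 8-byte ending marker).  The function name and error handling here are
 * assumptions for the example; the real reload path lives elsewhere in the
 * migration code.
 */
#if 0 /* example sketch, not built */
static int example_recv_bitmap_load(QEMUFile *file, RAMBlock *block)
{
    uint64_t size = qemu_get_be64(file);
    g_autofree uint8_t *le_bitmap = g_malloc0(size);
    uint64_t end_mark;

    qemu_get_buffer(file, le_bitmap, size);
    end_mark = qemu_get_be64(file);

    if (qemu_file_get_error(file) ||
        end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        return -1;
    }
    /* ... convert from little endian and apply to block->receivedmap ... */
    return 0;
}
#endif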
285 
286 /*
287  * An outstanding page request, on the source, having been received
288  * and queued
289  */
290 struct RAMSrcPageRequest {
291     RAMBlock *rb;
292     hwaddr    offset;
293     hwaddr    len;
294 
295     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
296 };
297 
298 /* State of RAM for migration */
299 struct RAMState {
300     /* QEMUFile used for this migration */
301     QEMUFile *f;
302     /* UFFD file descriptor, used in 'write-tracking' migration */
303     int uffdio_fd;
304     /* Last block that we have visited searching for dirty pages */
305     RAMBlock *last_seen_block;
306     /* Last block from where we have sent data */
307     RAMBlock *last_sent_block;
308     /* Last dirty target page we have sent */
309     ram_addr_t last_page;
310     /* last ram version we have seen */
311     uint32_t last_version;
312     /* How many times we have dirtied too many pages */
313     int dirty_rate_high_cnt;
314     /* these variables are used for bitmap sync */
315     /* last time we did a full bitmap_sync */
316     int64_t time_last_bitmap_sync;
317     /* bytes transferred at start_time */
318     uint64_t bytes_xfer_prev;
319     /* number of dirty pages since start_time */
320     uint64_t num_dirty_pages_period;
321     /* xbzrle misses since the beginning of the period */
322     uint64_t xbzrle_cache_miss_prev;
323     /* Amount of xbzrle pages since the beginning of the period */
324     uint64_t xbzrle_pages_prev;
325     /* Amount of xbzrle encoded bytes since the beginning of the period */
326     uint64_t xbzrle_bytes_prev;
327     /* Start using XBZRLE (e.g., after the first round). */
328     bool xbzrle_enabled;
329     /* Are we on the last stage of migration */
330     bool last_stage;
331     /* compression statistics since the beginning of the period */
332     /* number of times no free thread was available to compress data */
333     uint64_t compress_thread_busy_prev;
334     /* amount of bytes after compression */
335     uint64_t compressed_size_prev;
336     /* amount of compressed pages */
337     uint64_t compress_pages_prev;
338 
339     /* total handled target pages at the beginning of period */
340     uint64_t target_page_count_prev;
341     /* total handled target pages since start */
342     uint64_t target_page_count;
343     /* number of dirty bits in the bitmap */
344     uint64_t migration_dirty_pages;
345     /* Protects modification of the bitmap and migration dirty pages */
346     QemuMutex bitmap_mutex;
347     /* The RAMBlock used in the last src_page_requests */
348     RAMBlock *last_req_rb;
349     /* Queue of outstanding page requests from the destination */
350     QemuMutex src_page_req_mutex;
351     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 };
353 typedef struct RAMState RAMState;
354 
355 static RAMState *ram_state;
356 
357 static NotifierWithReturnList precopy_notifier_list;
358 
359 /* Whether postcopy has queued requests */
360 static bool postcopy_has_request(RAMState *rs)
361 {
362     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
363 }
364 
365 void precopy_infrastructure_init(void)
366 {
367     notifier_with_return_list_init(&precopy_notifier_list);
368 }
369 
370 void precopy_add_notifier(NotifierWithReturn *n)
371 {
372     notifier_with_return_list_add(&precopy_notifier_list, n);
373 }
374 
375 void precopy_remove_notifier(NotifierWithReturn *n)
376 {
377     notifier_with_return_remove(n);
378 }
379 
380 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
381 {
382     PrecopyNotifyData pnd;
383     pnd.reason = reason;
384     pnd.errp = errp;
385 
386     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
387 }
388 
389 uint64_t ram_bytes_remaining(void)
390 {
391     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
392                        0;
393 }
394 
395 MigrationStats ram_counters;
396 
397 static void ram_transferred_add(uint64_t bytes)
398 {
399     if (runstate_is_running()) {
400         ram_counters.precopy_bytes += bytes;
401     } else if (migration_in_postcopy()) {
402         ram_counters.postcopy_bytes += bytes;
403     } else {
404         ram_counters.downtime_bytes += bytes;
405     }
406     ram_counters.transferred += bytes;
407 }
408 
409 /* used by the search for pages to send */
410 struct PageSearchStatus {
411     /* Current block being searched */
412     RAMBlock    *block;
413     /* Current page to search from */
414     unsigned long page;
415     /* Set once we wrap around */
416     bool         complete_round;
417 };
418 typedef struct PageSearchStatus PageSearchStatus;
419 
420 CompressionStats compression_counters;
421 
422 struct CompressParam {
423     bool done;
424     bool quit;
425     bool zero_page;
426     QEMUFile *file;
427     QemuMutex mutex;
428     QemuCond cond;
429     RAMBlock *block;
430     ram_addr_t offset;
431 
432     /* internally used fields */
433     z_stream stream;
434     uint8_t *originbuf;
435 };
436 typedef struct CompressParam CompressParam;
437 
438 struct DecompressParam {
439     bool done;
440     bool quit;
441     QemuMutex mutex;
442     QemuCond cond;
443     void *des;
444     uint8_t *compbuf;
445     int len;
446     z_stream stream;
447 };
448 typedef struct DecompressParam DecompressParam;
449 
450 static CompressParam *comp_param;
451 static QemuThread *compress_threads;
452 /* comp_done_cond is used to wake up the migration thread when
453  * one of the compression threads has finished the compression.
454  * comp_done_lock is used together with comp_done_cond.
455  */
456 static QemuMutex comp_done_lock;
457 static QemuCond comp_done_cond;
458 /* The empty QEMUFileOps will be used by file in CompressParam */
459 static const QEMUFileOps empty_ops = { };
460 
461 static QEMUFile *decomp_file;
462 static DecompressParam *decomp_param;
463 static QemuThread *decompress_threads;
464 static QemuMutex decomp_done_lock;
465 static QemuCond decomp_done_cond;
466 
467 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
468                                  ram_addr_t offset, uint8_t *source_buf);
469 
470 static void *do_data_compress(void *opaque)
471 {
472     CompressParam *param = opaque;
473     RAMBlock *block;
474     ram_addr_t offset;
475     bool zero_page;
476 
477     qemu_mutex_lock(&param->mutex);
478     while (!param->quit) {
479         if (param->block) {
480             block = param->block;
481             offset = param->offset;
482             param->block = NULL;
483             qemu_mutex_unlock(&param->mutex);
484 
485             zero_page = do_compress_ram_page(param->file, &param->stream,
486                                              block, offset, param->originbuf);
487 
488             qemu_mutex_lock(&comp_done_lock);
489             param->done = true;
490             param->zero_page = zero_page;
491             qemu_cond_signal(&comp_done_cond);
492             qemu_mutex_unlock(&comp_done_lock);
493 
494             qemu_mutex_lock(&param->mutex);
495         } else {
496             qemu_cond_wait(&param->cond, &param->mutex);
497         }
498     }
499     qemu_mutex_unlock(&param->mutex);
500 
501     return NULL;
502 }
503 
504 static void compress_threads_save_cleanup(void)
505 {
506     int i, thread_count;
507 
508     if (!migrate_use_compression() || !comp_param) {
509         return;
510     }
511 
512     thread_count = migrate_compress_threads();
513     for (i = 0; i < thread_count; i++) {
514         /*
515          * we use it as an indicator of whether the thread is
516          * properly initialized or not
517          */
518         if (!comp_param[i].file) {
519             break;
520         }
521 
522         qemu_mutex_lock(&comp_param[i].mutex);
523         comp_param[i].quit = true;
524         qemu_cond_signal(&comp_param[i].cond);
525         qemu_mutex_unlock(&comp_param[i].mutex);
526 
527         qemu_thread_join(compress_threads + i);
528         qemu_mutex_destroy(&comp_param[i].mutex);
529         qemu_cond_destroy(&comp_param[i].cond);
530         deflateEnd(&comp_param[i].stream);
531         g_free(comp_param[i].originbuf);
532         qemu_fclose(comp_param[i].file);
533         comp_param[i].file = NULL;
534     }
535     qemu_mutex_destroy(&comp_done_lock);
536     qemu_cond_destroy(&comp_done_cond);
537     g_free(compress_threads);
538     g_free(comp_param);
539     compress_threads = NULL;
540     comp_param = NULL;
541 }
542 
543 static int compress_threads_save_setup(void)
544 {
545     int i, thread_count;
546 
547     if (!migrate_use_compression()) {
548         return 0;
549     }
550     thread_count = migrate_compress_threads();
551     compress_threads = g_new0(QemuThread, thread_count);
552     comp_param = g_new0(CompressParam, thread_count);
553     qemu_cond_init(&comp_done_cond);
554     qemu_mutex_init(&comp_done_lock);
555     for (i = 0; i < thread_count; i++) {
556         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
557         if (!comp_param[i].originbuf) {
558             goto exit;
559         }
560 
561         if (deflateInit(&comp_param[i].stream,
562                         migrate_compress_level()) != Z_OK) {
563             g_free(comp_param[i].originbuf);
564             goto exit;
565         }
566 
567         /* comp_param[i].file is just used as a dummy buffer to save data,
568          * set its ops to empty.
569          */
570         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
571         comp_param[i].done = true;
572         comp_param[i].quit = false;
573         qemu_mutex_init(&comp_param[i].mutex);
574         qemu_cond_init(&comp_param[i].cond);
575         qemu_thread_create(compress_threads + i, "compress",
576                            do_data_compress, comp_param + i,
577                            QEMU_THREAD_JOINABLE);
578     }
579     return 0;
580 
581 exit:
582     compress_threads_save_cleanup();
583     return -1;
584 }
585 
586 /**
587  * save_page_header: write page header to wire
588  *
589  * If this is the 1st block, it also writes the block identification
590  *
591  * Returns the number of bytes written
592  *
593  * @f: QEMUFile where to send the data
594  * @block: block that contains the page we want to send
595  * @offset: offset inside the block for the page
596  *          in the lower bits, it contains flags
597  */
598 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
599                                ram_addr_t offset)
600 {
601     size_t size, len;
602 
603     if (block == rs->last_sent_block) {
604         offset |= RAM_SAVE_FLAG_CONTINUE;
605     }
606     qemu_put_be64(f, offset);
607     size = 8;
608 
609     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
610         len = strlen(block->idstr);
611         qemu_put_byte(f, len);
612         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
613         size += 1 + len;
614         rs->last_sent_block = block;
615     }
616     return size;
617 }
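
/*
 * For reference, the header layout produced above is:
 *
 *     8 bytes : page offset within the block, OR'ed with RAM_SAVE_FLAG_*
 *   [ 1 byte  : length of the block idstr                              ]
 *   [ N bytes : the idstr itself, without a trailing NUL               ]
 *
 * where the bracketed part is only present when RAM_SAVE_FLAG_CONTINUE is
 * not set, i.e. when the block differs from the last block sent.
 */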
618 
619 /**
620  * mig_throttle_guest_down: throttle down the guest
621  *
622  * Reduce amount of guest cpu execution to hopefully slow down memory
623  * writes. If guest dirty memory rate is reduced below the rate at
624  * which we can transfer pages to the destination then we should be
625  * able to complete migration. Some workloads dirty memory way too
626  * fast and will not effectively converge, even with auto-converge.
627  */
628 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
629                                     uint64_t bytes_dirty_threshold)
630 {
631     MigrationState *s = migrate_get_current();
632     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
633     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
634     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
635     int pct_max = s->parameters.max_cpu_throttle;
636 
637     uint64_t throttle_now = cpu_throttle_get_percentage();
638     uint64_t cpu_now, cpu_ideal, throttle_inc;
639 
640     /* We have not started throttling yet. Let's start it. */
641     if (!cpu_throttle_active()) {
642         cpu_throttle_set(pct_initial);
643     } else {
644         /* Throttling already on, just increase the rate */
645         if (!pct_tailslow) {
646             throttle_inc = pct_increment;
647         } else {
648             /* Compute the ideal CPU percentage used by Guest, which may
649              * make the dirty rate match the dirty rate threshold. */
650             cpu_now = 100 - throttle_now;
651             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
652                         bytes_dirty_period);
653             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
654         }
655         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
656     }
657 }
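
/*
 * Worked example for the tailslow branch above (numbers are hypothetical):
 * with the throttle currently at 40%, the guest keeps cpu_now = 60% of the
 * CPU.  If the dirty threshold works out to half of what was actually
 * dirtied in the period (bytes_dirty_threshold / bytes_dirty_period = 0.5),
 * the ideal guest share is cpu_ideal = 60 * 0.5 = 30, so throttle_inc =
 * MIN(60 - 30, cpu_throttle_increment) and the throttle moves towards 70%,
 * capped at max_cpu_throttle.
 */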
658 
659 void mig_throttle_counter_reset(void)
660 {
661     RAMState *rs = ram_state;
662 
663     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
664     rs->num_dirty_pages_period = 0;
665     rs->bytes_xfer_prev = ram_counters.transferred;
666 }
667 
668 /**
669  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
670  *
671  * @rs: current RAM state
672  * @current_addr: address for the zero page
673  *
674  * Update the xbzrle cache to reflect a page that's been sent as all 0.
675  * The important thing is that a stale (not-yet-0'd) page be replaced
676  * by the new data.
677  * As a bonus, if the page wasn't in the cache it gets added so that
678  * when a small write is made into the 0'd page it gets XBZRLE sent.
679  */
680 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
681 {
682     if (!rs->xbzrle_enabled) {
683         return;
684     }
685 
686     /* We don't care if this fails to allocate a new cache page
687      * as long as it updated an old one */
688     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
689                  ram_counters.dirty_sync_count);
690 }
691 
692 #define ENCODING_FLAG_XBZRLE 0x1
693 
694 /**
695  * save_xbzrle_page: compress and send current page
696  *
697  * Returns: 1 means that we wrote the page
698  *          0 means that the page is identical to the one already sent
699  *          -1 means that xbzrle would be longer than normal
700  *
701  * @rs: current RAM state
702  * @current_data: pointer to the address of the page contents
703  * @current_addr: addr of the page
704  * @block: block that contains the page we want to send
705  * @offset: offset inside the block for the page
706  */
707 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
708                             ram_addr_t current_addr, RAMBlock *block,
709                             ram_addr_t offset)
710 {
711     int encoded_len = 0, bytes_xbzrle;
712     uint8_t *prev_cached_page;
713 
714     if (!cache_is_cached(XBZRLE.cache, current_addr,
715                          ram_counters.dirty_sync_count)) {
716         xbzrle_counters.cache_miss++;
717         if (!rs->last_stage) {
718             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
719                              ram_counters.dirty_sync_count) == -1) {
720                 return -1;
721             } else {
722                 /* update *current_data when the page has been
723                    inserted into cache */
724                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
725             }
726         }
727         return -1;
728     }
729 
730     /*
731      * Reaching here means the page has hit the xbzrle cache, no matter what
732      * encoding result it is (normal encoding, overflow or skipping the page),
733      * count the page as encoded. This is used to calculate the encoding rate.
734      *
735      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
736      * 2nd page turns out to be skipped (i.e. no new bytes written to the
737      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
738      * skipped page included. In this way, the encoding rate can tell if the
739      * guest page is good for xbzrle encoding.
740      */
741     xbzrle_counters.pages++;
742     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
743 
744     /* save current buffer into memory */
745     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
746 
747     /* XBZRLE encoding (if there is no overflow) */
748     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
749                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
750                                        TARGET_PAGE_SIZE);
751 
752     /*
753      * Update the cache contents, so that it corresponds to the data
754      * sent, in all cases except where we skip the page.
755      */
756     if (!rs->last_stage && encoded_len != 0) {
757         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
758         /*
759          * In the case where we couldn't compress, ensure that the caller
760          * sends the data from the cache, since the guest might have
761          * changed the RAM since we copied it.
762          */
763         *current_data = prev_cached_page;
764     }
765 
766     if (encoded_len == 0) {
767         trace_save_xbzrle_page_skipping();
768         return 0;
769     } else if (encoded_len == -1) {
770         trace_save_xbzrle_page_overflow();
771         xbzrle_counters.overflow++;
772         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
773         return -1;
774     }
775 
776     /* Send XBZRLE based compressed page */
777     bytes_xbzrle = save_page_header(rs, rs->f, block,
778                                     offset | RAM_SAVE_FLAG_XBZRLE);
779     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
780     qemu_put_be16(rs->f, encoded_len);
781     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
782     bytes_xbzrle += encoded_len + 1 + 2;
783     /*
784      * Like compressed_size (please see update_compress_thread_counts),
785      * the xbzrle encoded bytes don't count the 8 byte header with
786      * RAM_SAVE_FLAG_CONTINUE.
787      */
788     xbzrle_counters.bytes += bytes_xbzrle - 8;
789     ram_transferred_add(bytes_xbzrle);
790 
791     return 1;
792 }
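
/*
 * On the wire an XBZRLE page is therefore the page header (with
 * RAM_SAVE_FLAG_XBZRLE set), one ENCODING_FLAG_XBZRLE byte, a big-endian
 * 16-bit encoded length and then encoded_len bytes of encoded data, which
 * is where the "encoded_len + 1 + 2" accounting above comes from.
 */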
793 
794 /**
795  * migration_bitmap_find_dirty: find the next dirty page from start
796  *
797  * Returns the page offset within memory region of the start of a dirty page
798  *
799  * @rs: current RAM state
800  * @rb: RAMBlock where to search for dirty pages
801  * @start: page where we start the search
802  */
803 static inline
804 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
805                                           unsigned long start)
806 {
807     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
808     unsigned long *bitmap = rb->bmap;
809 
810     if (ramblock_is_ignored(rb)) {
811         return size;
812     }
813 
814     return find_next_bit(bitmap, size, start);
815 }
816 
817 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
818                                                        unsigned long page)
819 {
820     uint8_t shift;
821     hwaddr size, start;
822 
823     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
824         return;
825     }
826 
827     shift = rb->clear_bmap_shift;
828     /*
829      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
830      * can make things easier sometimes since the start address
831      * of each small chunk will always be 64-page aligned, so the
832      * bitmap will always be aligned to unsigned long. We should
833      * even be able to remove this restriction but I'm simply
834      * keeping it.
835      */
836     assert(shift >= 6);
837 
838     size = 1ULL << (TARGET_PAGE_BITS + shift);
839     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
840     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
841     memory_region_clear_dirty_bitmap(rb->mr, start, size);
842 }
843 
844 static void
845 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
846                                                  unsigned long start,
847                                                  unsigned long npages)
848 {
849     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
850     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
851     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
852 
853     /*
854      * Clear pages from start to start + npages - 1, so the end boundary is
855      * exclusive.
856      */
857     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
858         migration_clear_memory_region_dirty_bitmap(rb, i);
859     }
860 }
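
/*
 * Worked example (hypothetical numbers): with clear_bmap_shift = 6 each
 * chunk covers 64 pages.  A request for start = 100, npages = 200 spans
 * pages [100, 300), so chunk_start = 64 and chunk_end = 320, and the loop
 * above clears the chunks starting at pages 64, 128, 192 and 256, i.e. the
 * smallest set of aligned chunks covering the requested range.
 */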
861 
862 /*
863  * colo_bitmap_find_dirty: find contiguous dirty pages from start
864  *
865  * Returns the page offset within memory region of the start of the contiguous
866  * dirty pages
867  *
868  * @rs: current RAM state
869  * @rb: RAMBlock where to search for dirty pages
870  * @start: page where we start the search
871  * @num: the number of contiguous dirty pages
872  */
873 static inline
874 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
875                                      unsigned long start, unsigned long *num)
876 {
877     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
878     unsigned long *bitmap = rb->bmap;
879     unsigned long first, next;
880 
881     *num = 0;
882 
883     if (ramblock_is_ignored(rb)) {
884         return size;
885     }
886 
887     first = find_next_bit(bitmap, size, start);
888     if (first >= size) {
889         return first;
890     }
891     next = find_next_zero_bit(bitmap, size, first + 1);
892     assert(next >= first);
893     *num = next - first;
894     return first;
895 }
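
/*
 * Worked example (hypothetical bitmap): if the dirty bitmap starts with
 * 0 0 1 1 1 0 ... and start = 0, then first = 2 and the next clear bit is
 * at 5, so *num = 3 and the caller gets a run of three contiguous dirty
 * pages beginning at page 2.
 */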
896 
897 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
898                                                 RAMBlock *rb,
899                                                 unsigned long page)
900 {
901     bool ret;
902 
903     /*
904      * Clear dirty bitmap if needed.  This _must_ be called before we
905      * send any page in the chunk because we need to make sure we can
906      * capture further page content changes when we sync the dirty log
907      * the next time.  So as long as we are going to send any page in
908      * the chunk, we clear the remote dirty bitmap for the whole chunk.
909      * Clearing it earlier won't be a problem, but clearing it too late will.
910      */
911     migration_clear_memory_region_dirty_bitmap(rb, page);
912 
913     ret = test_and_clear_bit(page, rb->bmap);
914     if (ret) {
915         rs->migration_dirty_pages--;
916     }
917 
918     return ret;
919 }
920 
921 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
922                                        void *opaque)
923 {
924     const hwaddr offset = section->offset_within_region;
925     const hwaddr size = int128_get64(section->size);
926     const unsigned long start = offset >> TARGET_PAGE_BITS;
927     const unsigned long npages = size >> TARGET_PAGE_BITS;
928     RAMBlock *rb = section->mr->ram_block;
929     uint64_t *cleared_bits = opaque;
930 
931     /*
932      * We don't grab ram_state->bitmap_mutex because we expect to run
933      * only when starting migration or during postcopy recovery where
934      * we don't have concurrent access.
935      */
936     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
937         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
938     }
939     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
940     bitmap_clear(rb->bmap, start, npages);
941 }
942 
943 /*
944  * Exclude all dirty pages from migration that fall into a discarded range as
945  * managed by a RamDiscardManager responsible for the mapped memory region of
946  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
947  *
948  * Discarded pages ("logically unplugged") have undefined content and must
949  * not get migrated, because even reading these pages for migration might
950  * result in undesired behavior.
951  *
952  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
953  *
954  * Note: The result is only stable while migrating (precopy/postcopy).
955  */
956 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
957 {
958     uint64_t cleared_bits = 0;
959 
960     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
961         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
962         MemoryRegionSection section = {
963             .mr = rb->mr,
964             .offset_within_region = 0,
965             .size = int128_make64(qemu_ram_get_used_length(rb)),
966         };
967 
968         ram_discard_manager_replay_discarded(rdm, &section,
969                                              dirty_bitmap_clear_section,
970                                              &cleared_bits);
971     }
972     return cleared_bits;
973 }
974 
975 /*
976  * Check if a host-page aligned page falls into a discarded range as managed by
977  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
978  *
979  * Note: The result is only stable while migrating (precopy/postcopy).
980  */
981 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
982 {
983     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
984         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
985         MemoryRegionSection section = {
986             .mr = rb->mr,
987             .offset_within_region = start,
988             .size = int128_make64(qemu_ram_pagesize(rb)),
989         };
990 
991         return !ram_discard_manager_is_populated(rdm, &section);
992     }
993     return false;
994 }
995 
996 /* Called with RCU critical section */
997 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
998 {
999     uint64_t new_dirty_pages =
1000         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1001 
1002     rs->migration_dirty_pages += new_dirty_pages;
1003     rs->num_dirty_pages_period += new_dirty_pages;
1004 }
1005 
1006 /**
1007  * ram_pagesize_summary: calculate all the pagesizes of a VM
1008  *
1009  * Returns a summary bitmap of the page sizes of all RAMBlocks
1010  *
1011  * For VMs with just normal pages this is equivalent to the host page
1012  * size. If it's got some huge pages then it's the OR of all the
1013  * different page sizes.
1014  */
1015 uint64_t ram_pagesize_summary(void)
1016 {
1017     RAMBlock *block;
1018     uint64_t summary = 0;
1019 
1020     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1021         summary |= block->page_size;
1022     }
1023 
1024     return summary;
1025 }
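
/*
 * For instance (hypothetical configuration), a VM whose RAMBlocks use 4 KiB
 * and 2 MiB pages yields 0x1000 | 0x200000 = 0x201000 as its summary.
 */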
1026 
1027 uint64_t ram_get_total_transferred_pages(void)
1028 {
1029     return  ram_counters.normal + ram_counters.duplicate +
1030                 compression_counters.pages + xbzrle_counters.pages;
1031 }
1032 
1033 static void migration_update_rates(RAMState *rs, int64_t end_time)
1034 {
1035     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1036     double compressed_size;
1037 
1038     /* calculate period counters */
1039     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1040                 / (end_time - rs->time_last_bitmap_sync);
1041 
1042     if (!page_count) {
1043         return;
1044     }
1045 
1046     if (migrate_use_xbzrle()) {
1047         double encoded_size, unencoded_size;
1048 
1049         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1050             rs->xbzrle_cache_miss_prev) / page_count;
1051         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1052         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1053                          TARGET_PAGE_SIZE;
1054         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1055         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1056             xbzrle_counters.encoding_rate = 0;
1057         } else {
1058             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1059         }
1060         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1061         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1062     }
1063 
1064     if (migrate_use_compression()) {
1065         compression_counters.busy_rate = (double)(compression_counters.busy -
1066             rs->compress_thread_busy_prev) / page_count;
1067         rs->compress_thread_busy_prev = compression_counters.busy;
1068 
1069         compressed_size = compression_counters.compressed_size -
1070                           rs->compressed_size_prev;
1071         if (compressed_size) {
1072             double uncompressed_size = (compression_counters.pages -
1073                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1074 
1075             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1076             compression_counters.compression_rate =
1077                                         uncompressed_size / compressed_size;
1078 
1079             rs->compress_pages_prev = compression_counters.pages;
1080             rs->compressed_size_prev = compression_counters.compressed_size;
1081         }
1082     }
1083 }
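
/*
 * Worked example for the compression ratio above (hypothetical numbers,
 * assuming 4 KiB target pages): if the compress threads handled 1024 pages
 * during the period (4 MiB of guest data) and emitted 1 MiB of compressed
 * output, compression_rate is 4 MiB / 1 MiB = 4.0.
 */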
1084 
1085 static void migration_trigger_throttle(RAMState *rs)
1086 {
1087     MigrationState *s = migrate_get_current();
1088     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1089 
1090     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1091     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1092     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1093 
1094     /* During block migration the auto-converge logic incorrectly detects
1095      * that ram migration makes no progress. Avoid this by disabling the
1096      * throttling logic during the bulk phase of block migration. */
1097     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1098         /* The following detection logic can be refined later. For now:
1099            Check to see if the ratio between dirtied bytes and the approx.
1100            amount of bytes that just got transferred since the last time
1101            we were in this routine reaches the threshold. If that happens
1102            twice, start or increase throttling. */
1103 
1104         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1105             (++rs->dirty_rate_high_cnt >= 2)) {
1106             trace_migration_throttle();
1107             rs->dirty_rate_high_cnt = 0;
1108             mig_throttle_guest_down(bytes_dirty_period,
1109                                     bytes_dirty_threshold);
1110         }
1111     }
1112 }
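
/*
 * Worked example (hypothetical numbers): with throttle_trigger_threshold set
 * to 50 and 1 GiB transferred during the last period, bytes_dirty_threshold
 * is 1 GiB * 50 / 100 = 512 MiB.  If the guest dirtied more than 512 MiB in
 * that period, and does so again in the next one, mig_throttle_guest_down()
 * is invoked to start or increase throttling.
 */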
1113 
1114 static void migration_bitmap_sync(RAMState *rs)
1115 {
1116     RAMBlock *block;
1117     int64_t end_time;
1118 
1119     ram_counters.dirty_sync_count++;
1120 
1121     if (!rs->time_last_bitmap_sync) {
1122         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1123     }
1124 
1125     trace_migration_bitmap_sync_start();
1126     memory_global_dirty_log_sync();
1127 
1128     qemu_mutex_lock(&rs->bitmap_mutex);
1129     WITH_RCU_READ_LOCK_GUARD() {
1130         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1131             ramblock_sync_dirty_bitmap(rs, block);
1132         }
1133         ram_counters.remaining = ram_bytes_remaining();
1134     }
1135     qemu_mutex_unlock(&rs->bitmap_mutex);
1136 
1137     memory_global_after_dirty_log_sync();
1138     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1139 
1140     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1141 
1142     /* more than 1 second = 1000 milliseconds */
1143     if (end_time > rs->time_last_bitmap_sync + 1000) {
1144         migration_trigger_throttle(rs);
1145 
1146         migration_update_rates(rs, end_time);
1147 
1148         rs->target_page_count_prev = rs->target_page_count;
1149 
1150         /* reset period counters */
1151         rs->time_last_bitmap_sync = end_time;
1152         rs->num_dirty_pages_period = 0;
1153         rs->bytes_xfer_prev = ram_counters.transferred;
1154     }
1155     if (migrate_use_events()) {
1156         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1157     }
1158 }
1159 
1160 static void migration_bitmap_sync_precopy(RAMState *rs)
1161 {
1162     Error *local_err = NULL;
1163 
1164     /*
1165      * The current notifier usage is just an optimization for migration, so we
1166      * don't stop the normal migration process in the error case.
1167      */
1168     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1169         error_report_err(local_err);
1170         local_err = NULL;
1171     }
1172 
1173     migration_bitmap_sync(rs);
1174 
1175     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1176         error_report_err(local_err);
1177     }
1178 }
1179 
1180 static void ram_release_page(const char *rbname, uint64_t offset)
1181 {
1182     if (!migrate_release_ram() || !migration_in_postcopy()) {
1183         return;
1184     }
1185 
1186     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1187 }
1188 
1189 /**
1190  * save_zero_page_to_file: send the zero page to the file
1191  *
1192  * Returns the size of data written to the file, or 0 if the page is not
1193  * a zero page
1194  *
1195  * @rs: current RAM state
1196  * @file: the file where the data is saved
1197  * @block: block that contains the page we want to send
1198  * @offset: offset inside the block for the page
1199  */
1200 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1201                                   RAMBlock *block, ram_addr_t offset)
1202 {
1203     uint8_t *p = block->host + offset;
1204     int len = 0;
1205 
1206     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1207         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1208         qemu_put_byte(file, 0);
1209         len += 1;
1210         ram_release_page(block->idstr, offset);
1211     }
1212     return len;
1213 }
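
/*
 * On the wire a zero page is therefore just the usual page header with
 * RAM_SAVE_FLAG_ZERO set, followed by a single 0x00 byte, instead of a full
 * TARGET_PAGE_SIZE worth of page data.
 */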
1214 
1215 /**
1216  * save_zero_page: send the zero page to the stream
1217  *
1218  * Returns the number of pages written.
1219  *
1220  * @rs: current RAM state
1221  * @block: block that contains the page we want to send
1222  * @offset: offset inside the block for the page
1223  */
1224 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1225 {
1226     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1227 
1228     if (len) {
1229         ram_counters.duplicate++;
1230         ram_transferred_add(len);
1231         return 1;
1232     }
1233     return -1;
1234 }
1235 
1236 /*
1237  * @pages: the number of pages written by the control path,
1238  *        < 0 - error
1239  *        > 0 - number of pages written
1240  *
1241  * Return true if the page has been saved, otherwise false is returned.
1242  */
1243 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1244                               int *pages)
1245 {
1246     uint64_t bytes_xmit = 0;
1247     int ret;
1248 
1249     *pages = -1;
1250     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1251                                 &bytes_xmit);
1252     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1253         return false;
1254     }
1255 
1256     if (bytes_xmit) {
1257         ram_transferred_add(bytes_xmit);
1258         *pages = 1;
1259     }
1260 
1261     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1262         return true;
1263     }
1264 
1265     if (bytes_xmit > 0) {
1266         ram_counters.normal++;
1267     } else if (bytes_xmit == 0) {
1268         ram_counters.duplicate++;
1269     }
1270 
1271     return true;
1272 }
1273 
1274 /*
1275  * directly send the page to the stream
1276  *
1277  * Returns the number of pages written.
1278  *
1279  * @rs: current RAM state
1280  * @block: block that contains the page we want to send
1281  * @offset: offset inside the block for the page
1282  * @buf: the page to be sent
1283  * @async: send the page asynchronously
1284  */
1285 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1286                             uint8_t *buf, bool async)
1287 {
1288     ram_transferred_add(save_page_header(rs, rs->f, block,
1289                                          offset | RAM_SAVE_FLAG_PAGE));
1290     if (async) {
1291         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1292                               migrate_release_ram() &
1293                               migration_in_postcopy());
1294     } else {
1295         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1296     }
1297     ram_transferred_add(TARGET_PAGE_SIZE);
1298     ram_counters.normal++;
1299     return 1;
1300 }
1301 
1302 /**
1303  * ram_save_page: send the given page to the stream
1304  *
1305  * Returns the number of pages written.
1306  *          < 0 - error
1307  *          >=0 - Number of pages written - this might legally be 0
1308  *                if xbzrle noticed the page was the same.
1309  *
1310  * @rs: current RAM state
1311  * @block: block that contains the page we want to send
1312  * @offset: offset inside the block for the page
1313  */
1314 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1315 {
1316     int pages = -1;
1317     uint8_t *p;
1318     bool send_async = true;
1319     RAMBlock *block = pss->block;
1320     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1321     ram_addr_t current_addr = block->offset + offset;
1322 
1323     p = block->host + offset;
1324     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1325 
1326     XBZRLE_cache_lock();
1327     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1328         pages = save_xbzrle_page(rs, &p, current_addr, block,
1329                                  offset);
1330         if (!rs->last_stage) {
1331             /* Can't send this cached data async, since the cache page
1332              * might get updated before it gets to the wire
1333              */
1334             send_async = false;
1335         }
1336     }
1337 
1338     /* XBZRLE overflow or normal page */
1339     if (pages == -1) {
1340         pages = save_normal_page(rs, block, offset, p, send_async);
1341     }
1342 
1343     XBZRLE_cache_unlock();
1344 
1345     return pages;
1346 }
1347 
1348 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1349                                  ram_addr_t offset)
1350 {
1351     if (multifd_queue_page(rs->f, block, offset) < 0) {
1352         return -1;
1353     }
1354     ram_counters.normal++;
1355 
1356     return 1;
1357 }
1358 
1359 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1360                                  ram_addr_t offset, uint8_t *source_buf)
1361 {
1362     RAMState *rs = ram_state;
1363     uint8_t *p = block->host + offset;
1364     int ret;
1365 
1366     if (save_zero_page_to_file(rs, f, block, offset)) {
1367         return true;
1368     }
1369 
1370     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1371 
1372     /*
1373      * copy it to an internal buffer to avoid it being modified by the VM,
1374      * so that we can catch errors during compression and
1375      * decompression
1376      */
1377     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1378     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1379     if (ret < 0) {
1380         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1381         error_report("compressed data failed!");
1382     }
1383     return false;
1384 }
1385 
1386 static void
1387 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1388 {
1389     ram_transferred_add(bytes_xmit);
1390 
1391     if (param->zero_page) {
1392         ram_counters.duplicate++;
1393         return;
1394     }
1395 
1396     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1397     compression_counters.compressed_size += bytes_xmit - 8;
1398     compression_counters.pages++;
1399 }
1400 
1401 static bool save_page_use_compression(RAMState *rs);
1402 
1403 static void flush_compressed_data(RAMState *rs)
1404 {
1405     int idx, len, thread_count;
1406 
1407     if (!save_page_use_compression(rs)) {
1408         return;
1409     }
1410     thread_count = migrate_compress_threads();
1411 
1412     qemu_mutex_lock(&comp_done_lock);
1413     for (idx = 0; idx < thread_count; idx++) {
1414         while (!comp_param[idx].done) {
1415             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1416         }
1417     }
1418     qemu_mutex_unlock(&comp_done_lock);
1419 
1420     for (idx = 0; idx < thread_count; idx++) {
1421         qemu_mutex_lock(&comp_param[idx].mutex);
1422         if (!comp_param[idx].quit) {
1423             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1424             /*
1425              * it's safe to fetch zero_page without holding comp_done_lock
1426              * as there is no further request submitted to the thread,
1427              * i.e., the thread should be waiting for a request at this point.
1428              */
1429             update_compress_thread_counts(&comp_param[idx], len);
1430         }
1431         qemu_mutex_unlock(&comp_param[idx].mutex);
1432     }
1433 }
1434 
1435 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1436                                        ram_addr_t offset)
1437 {
1438     param->block = block;
1439     param->offset = offset;
1440 }
1441 
1442 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1443                                            ram_addr_t offset)
1444 {
1445     int idx, thread_count, bytes_xmit = -1, pages = -1;
1446     bool wait = migrate_compress_wait_thread();
1447 
1448     thread_count = migrate_compress_threads();
1449     qemu_mutex_lock(&comp_done_lock);
1450 retry:
1451     for (idx = 0; idx < thread_count; idx++) {
1452         if (comp_param[idx].done) {
1453             comp_param[idx].done = false;
1454             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1455             qemu_mutex_lock(&comp_param[idx].mutex);
1456             set_compress_params(&comp_param[idx], block, offset);
1457             qemu_cond_signal(&comp_param[idx].cond);
1458             qemu_mutex_unlock(&comp_param[idx].mutex);
1459             pages = 1;
1460             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1461             break;
1462         }
1463     }
1464 
1465     /*
1466      * wait for a free thread if the user specifies 'compress-wait-thread',
1467      * otherwise we will post the page out in the main thread as a normal page.
1468      */
1469     if (pages < 0 && wait) {
1470         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1471         goto retry;
1472     }
1473     qemu_mutex_unlock(&comp_done_lock);
1474 
1475     return pages;
1476 }
1477 
1478 /**
1479  * find_dirty_block: find the next dirty page and update any state
1480  * associated with the search process.
1481  *
1482  * Returns true if a page is found
1483  *
1484  * @rs: current RAM state
1485  * @pss: data about the state of the current dirty page scan
1486  * @again: set to false if the search has scanned the whole of RAM
1487  */
1488 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1489 {
1490     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1491     if (pss->complete_round && pss->block == rs->last_seen_block &&
1492         pss->page >= rs->last_page) {
1493         /*
1494          * We've been once around the RAM and haven't found anything.
1495          * Give up.
1496          */
1497         *again = false;
1498         return false;
1499     }
1500     if (!offset_in_ramblock(pss->block,
1501                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1502         /* Didn't find anything in this RAM Block */
1503         pss->page = 0;
1504         pss->block = QLIST_NEXT_RCU(pss->block, next);
1505         if (!pss->block) {
1506             /*
1507              * If memory migration starts over, we will meet a dirtied page
1508              * which may still exist in the compression threads' ring, so we
1509              * should flush the compressed data to make sure the new page
1510              * is not overwritten by the old one in the destination.
1511              *
1512              * Also, if xbzrle is on, stop using the data compression at this
1513              * point. In theory, xbzrle can do better than compression.
1514              */
1515             flush_compressed_data(rs);
1516 
1517             /* Hit the end of the list */
1518             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1519             /* Flag that we've looped */
1520             pss->complete_round = true;
1521             /* After the first round, enable XBZRLE. */
1522             if (migrate_use_xbzrle()) {
1523                 rs->xbzrle_enabled = true;
1524             }
1525         }
1526         /* Didn't find anything this time, but try again on the new block */
1527         *again = true;
1528         return false;
1529     } else {
1530         /* Can go around again, but... */
1531         *again = true;
1532         /* We've found something so probably don't need to */
1533         return true;
1534     }
1535 }
1536 
1537 /**
1538  * unqueue_page: gets a page of the queue
1539  *
1540  * Helper for 'get_queued_page' - gets a page off the queue
1541  *
1542  * Returns the block of the page (or NULL if none available)
1543  *
1544  * @rs: current RAM state
1545  * @offset: used to return the offset within the RAMBlock
1546  */
1547 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1548 {
1549     struct RAMSrcPageRequest *entry;
1550     RAMBlock *block = NULL;
1551     size_t page_size;
1552 
1553     if (!postcopy_has_request(rs)) {
1554         return NULL;
1555     }
1556 
1557     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1558 
1559     /*
1560      * This should _never_ change even after we take the lock, because no one
1561      * should be taking anything off the request list other than us.
1562      */
1563     assert(postcopy_has_request(rs));
1564 
1565     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1566     block = entry->rb;
1567     *offset = entry->offset;
1568     page_size = qemu_ram_pagesize(block);
1569     /* Each page request should be a multiple of the ramblock page size */
1570     assert((entry->len % page_size) == 0);
1571 
1572     if (entry->len > page_size) {
1573         entry->len -= page_size;
1574         entry->offset += page_size;
1575     } else {
1576         memory_region_unref(block->mr);
1577         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1578         g_free(entry);
1579         migration_consume_urgent_request();
1580     }
1581 
1582     trace_unqueue_page(block->idstr, *offset,
1583                        test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1584 
1585     return block;
1586 }
1587 
1588 #if defined(__linux__)
1589 /**
1590  * poll_fault_page: try to get the next UFFD write fault page and, if a
1591  *   pending fault is found, return the RAM block pointer and page offset
1592  *
1593  * Returns pointer to the RAMBlock containing faulting page,
1594  *   NULL if no write faults are pending
1595  *
1596  * @rs: current RAM state
1597  * @offset: page offset from the beginning of the block
1598  */
1599 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1600 {
1601     struct uffd_msg uffd_msg;
1602     void *page_address;
1603     RAMBlock *block;
1604     int res;
1605 
1606     if (!migrate_background_snapshot()) {
1607         return NULL;
1608     }
1609 
1610     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1611     if (res <= 0) {
1612         return NULL;
1613     }
1614 
1615     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1616     block = qemu_ram_block_from_host(page_address, false, offset);
1617     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1618     return block;
1619 }
1620 
1621 /**
1622  * ram_save_release_protection: release UFFD write protection after
1623  *   a range of pages has been saved
1624  *
1625  * @rs: current RAM state
1626  * @pss: page-search-status structure
1627  * @start_page: index of the first page in the range relative to pss->block
1628  *
1629  * Returns 0 on success, negative value in case of an error
1630  */
1631 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1632         unsigned long start_page)
1633 {
1634     int res = 0;
1635 
1636     /* Check if page is from UFFD-managed region. */
1637     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1638         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1639         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1640 
1641         /* Flush async buffers before un-protect. */
1642         qemu_fflush(rs->f);
1643         /* Un-protect memory range. */
1644         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1645                 false, false);
1646     }
1647 
1648     return res;
1649 }
1650 
1651 /* ram_write_tracking_available: check if kernel supports required UFFD features
1652  *
1653  * Returns true if supported, false otherwise
1654  */
1655 bool ram_write_tracking_available(void)
1656 {
1657     uint64_t uffd_features;
1658     int res;
1659 
1660     res = uffd_query_features(&uffd_features);
1661     return (res == 0 &&
1662             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1663 }
1664 
1665 /* ram_write_tracking_compatible: check if guest configuration is
1666  *   compatible with 'write-tracking'
1667  *
1668  * Returns true if compatible, false otherwise
1669  */
1670 bool ram_write_tracking_compatible(void)
1671 {
1672     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1673     int uffd_fd;
1674     RAMBlock *block;
1675     bool ret = false;
1676 
1677     /* Open UFFD file descriptor */
1678     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1679     if (uffd_fd < 0) {
1680         return false;
1681     }
1682 
1683     RCU_READ_LOCK_GUARD();
1684 
1685     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1686         uint64_t uffd_ioctls;
1687 
1688         /* Nothing to do for read-only or MMIO-writable (ROM device) regions */
1689         if (block->mr->readonly || block->mr->rom_device) {
1690             continue;
1691         }
1692         /* Try to register block memory via UFFD-IO to track writes */
1693         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1694                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1695             goto out;
1696         }
1697         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1698             goto out;
1699         }
1700     }
1701     ret = true;
1702 
1703 out:
1704     uffd_close_fd(uffd_fd);
1705     return ret;
1706 }
1707 
1708 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1709                                        ram_addr_t size)
1710 {
1711     /*
1712      * We read one byte of each page; this will preallocate page tables if
1713      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1714      * where no page was populated yet. This might require adaptation when
1715      * supporting other mappings, like shmem.
1716      */
1717     for (; offset < size; offset += block->page_size) {
1718         char tmp = *((char *)block->host + offset);
1719 
1720         /* Don't optimize the read out */
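        /*
         * The empty inline asm with a "+r" constraint tells the compiler
         * that 'tmp' is both read and written, so the load above cannot be
         * eliminated as dead code, while no actual instruction is emitted.
         */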
1721         asm volatile("" : "+r" (tmp));
1722     }
1723 }
1724 
1725 static inline int populate_read_section(MemoryRegionSection *section,
1726                                         void *opaque)
1727 {
1728     const hwaddr size = int128_get64(section->size);
1729     hwaddr offset = section->offset_within_region;
1730     RAMBlock *block = section->mr->ram_block;
1731 
1732     populate_read_range(block, offset, size);
1733     return 0;
1734 }
1735 
1736 /*
1737  * ram_block_populate_read: preallocate page tables and populate pages in the
1738  *   RAM block by reading a byte of each page.
1739  *
1740  * Since it's solely used for userfault_fd WP feature, here we just
1741  *   hardcode page size to qemu_real_host_page_size.
1742  *
1743  * @block: RAM block to populate
1744  */
1745 static void ram_block_populate_read(RAMBlock *rb)
1746 {
1747     /*
1748      * Skip populating all pages that fall into a discarded range as managed by
1749      * a RamDiscardManager responsible for the mapped memory region of the
1750      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1751      * must not get populated automatically. We don't have to track
1752      * modifications via userfaultfd WP reliably, because these pages will
1753      * not be part of the migration stream either way -- see
1754      * ramblock_dirty_bitmap_exclude_discarded_pages().
1755      *
1756      * Note: The result is only stable while migrating (precopy/postcopy).
1757      */
1758     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1759         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1760         MemoryRegionSection section = {
1761             .mr = rb->mr,
1762             .offset_within_region = 0,
1763             .size = rb->mr->size,
1764         };
1765 
1766         ram_discard_manager_replay_populated(rdm, &section,
1767                                              populate_read_section, NULL);
1768     } else {
1769         populate_read_range(rb, 0, rb->used_length);
1770     }
1771 }
1772 
1773 /*
1774  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1775  */
1776 void ram_write_tracking_prepare(void)
1777 {
1778     RAMBlock *block;
1779 
1780     RCU_READ_LOCK_GUARD();
1781 
1782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1783         /* Nothing to do for read-only or MMIO-writable (ROM device) regions */
1784         if (block->mr->readonly || block->mr->rom_device) {
1785             continue;
1786         }
1787 
1788         /*
1789          * Populate pages of the RAM block before enabling userfault_fd
1790          * write protection.
1791          *
1792          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1793          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1794          * pages with pte_none() entries in the page table.
1795          */
1796         ram_block_populate_read(block);
1797     }
1798 }
1799 
1800 /*
1801  * ram_write_tracking_start: start UFFD-WP memory tracking
1802  *
1803  * Returns 0 for success or negative value in case of error
1804  */
1805 int ram_write_tracking_start(void)
1806 {
1807     int uffd_fd;
1808     RAMState *rs = ram_state;
1809     RAMBlock *block;
1810 
1811     /* Open UFFD file descriptor */
1812     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1813     if (uffd_fd < 0) {
1814         return uffd_fd;
1815     }
1816     rs->uffdio_fd = uffd_fd;
1817 
1818     RCU_READ_LOCK_GUARD();
1819 
1820     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1821         /* Nothing to do for read-only or MMIO-writable (ROM device) regions */
1822         if (block->mr->readonly || block->mr->rom_device) {
1823             continue;
1824         }
1825 
1826         /* Register block memory with UFFD to track writes */
1827         if (uffd_register_memory(rs->uffdio_fd, block->host,
1828                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1829             goto fail;
1830         }
1831         /* Apply UFFD write protection to the block memory range */
1832         if (uffd_change_protection(rs->uffdio_fd, block->host,
1833                 block->max_length, true, false)) {
1834             goto fail;
1835         }
1836         block->flags |= RAM_UF_WRITEPROTECT;
1837         memory_region_ref(block->mr);
1838 
1839         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1840                 block->host, block->max_length);
1841     }
1842 
1843     return 0;
1844 
1845 fail:
1846     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1847 
1848     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1849         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1850             continue;
1851         }
1852         /*
1853          * In case some memory block failed to be write-protected,
1854          * remove protection and unregister all RAM blocks that succeeded.
1855          */
1856         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1857                 false, false);
1858         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1859         /* Cleanup flags and remove reference */
1860         block->flags &= ~RAM_UF_WRITEPROTECT;
1861         memory_region_unref(block->mr);
1862     }
1863 
1864     uffd_close_fd(uffd_fd);
1865     rs->uffdio_fd = -1;
1866     return -1;
1867 }
1868 
1869 /**
1870  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1871  */
1872 void ram_write_tracking_stop(void)
1873 {
1874     RAMState *rs = ram_state;
1875     RAMBlock *block;
1876 
1877     RCU_READ_LOCK_GUARD();
1878 
1879     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1880         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1881             continue;
1882         }
1883         /* Remove protection and unregister all affected RAM blocks */
1884         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1885                 false, false);
1886         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1887 
1888         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1889                 block->host, block->max_length);
1890 
1891         /* Cleanup flags and remove reference */
1892         block->flags &= ~RAM_UF_WRITEPROTECT;
1893         memory_region_unref(block->mr);
1894     }
1895 
1896     /* Finally close UFFD file descriptor */
1897     uffd_close_fd(rs->uffdio_fd);
1898     rs->uffdio_fd = -1;
1899 }
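
/*
 * A rough sketch of how the UFFD-WP helpers above are expected to fit
 * together for a background snapshot; the actual call sites live elsewhere
 * in the migration code, so treat this only as an illustration:
 *
 *   if (ram_write_tracking_available() && ram_write_tracking_compatible()) {
 *       ram_write_tracking_prepare();   // populate pages first
 *       ram_write_tracking_start();     // register + write-protect blocks
 *       // ... save RAM, picking up faulting pages via poll_fault_page()
 *       // and dropping protection with ram_save_release_protection() ...
 *       ram_write_tracking_stop();      // un-protect and unregister
 *   }
 */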
1900 
1901 #else
1902 /* No target OS support, stubs just fail or ignore */
1903 
1904 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1905 {
1906     (void) rs;
1907     (void) offset;
1908 
1909     return NULL;
1910 }
1911 
1912 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1913         unsigned long start_page)
1914 {
1915     (void) rs;
1916     (void) pss;
1917     (void) start_page;
1918 
1919     return 0;
1920 }
1921 
1922 bool ram_write_tracking_available(void)
1923 {
1924     return false;
1925 }
1926 
1927 bool ram_write_tracking_compatible(void)
1928 {
1929     assert(0);
1930     return false;
1931 }
1932 
1933 int ram_write_tracking_start(void)
1934 {
1935     assert(0);
1936     return -1;
1937 }
1938 
1939 void ram_write_tracking_stop(void)
1940 {
1941     assert(0);
1942 }
1943 #endif /* defined(__linux__) */
1944 
1945 /**
1946  * get_queued_page: unqueue a page from the postcopy requests
1947  *
1948  * Skips pages that are already sent (!dirty)
1949  *
1950  * Returns true if a queued page is found
1951  *
1952  * @rs: current RAM state
1953  * @pss: data about the state of the current dirty page scan
1954  */
1955 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1956 {
1957     RAMBlock  *block;
1958     ram_addr_t offset;
1959 
1960     block = unqueue_page(rs, &offset);
1961 
1962     if (!block) {
1963         /*
1964          * Poll write faults too if background snapshot is enabled; that's
1965          * when vCPUs may have been blocked by write-protected pages.
1966          */
1967         block = poll_fault_page(rs, &offset);
1968     }
1969 
1970     if (block) {
1971         /*
1972          * We want the background search to continue from the queued page
1973          * since the guest is likely to want other pages near to the page
1974          * it just requested.
1975          */
1976         pss->block = block;
1977         pss->page = offset >> TARGET_PAGE_BITS;
1978 
1979         /*
1980          * This unqueued page would break the "one round" check, even if
1981          * it is really rare.
1982          */
1983         pss->complete_round = false;
1984     }
1985 
1986     return !!block;
1987 }
1988 
1989 /**
1990  * migration_page_queue_free: drop any remaining pages in the ram
1991  * request queue
1992  *
1993  * It should be empty at the end anyway, but in error cases there may
1994  * be some left.  In case any pages are left, we drop them.
1995  *
1996  */
1997 static void migration_page_queue_free(RAMState *rs)
1998 {
1999     struct RAMSrcPageRequest *mspr, *next_mspr;
2000     /* This queue should generally be empty - but in the case of a failed
2001      * migration it might have some entries left over.
2002      */
2003     RCU_READ_LOCK_GUARD();
2004     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2005         memory_region_unref(mspr->rb->mr);
2006         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2007         g_free(mspr);
2008     }
2009 }
2010 
2011 /**
2012  * ram_save_queue_pages: queue the page for transmission
2013  *
2014  * A request from postcopy destination for example.
2015  *
2016  * Returns zero on success or negative on error
2017  *
2018  * @rbname: Name of the RAMBlock of the request. NULL means the
2019  *          same as the last one.
2020  * @start: starting address from the start of the RAMBlock
2021  * @len: length (in bytes) to send
2022  */
2023 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2024 {
2025     RAMBlock *ramblock;
2026     RAMState *rs = ram_state;
2027 
2028     ram_counters.postcopy_requests++;
2029     RCU_READ_LOCK_GUARD();
2030 
2031     if (!rbname) {
2032         /* Reuse last RAMBlock */
2033         ramblock = rs->last_req_rb;
2034 
2035         if (!ramblock) {
2036             /*
2037              * Shouldn't happen, we can't reuse the last RAMBlock if
2038              * it's the 1st request.
2039              */
2040             error_report("ram_save_queue_pages no previous block");
2041             return -1;
2042         }
2043     } else {
2044         ramblock = qemu_ram_block_by_name(rbname);
2045 
2046         if (!ramblock) {
2047             /* We shouldn't be asked for a non-existent RAMBlock */
2048             error_report("ram_save_queue_pages no block '%s'", rbname);
2049             return -1;
2050         }
2051         rs->last_req_rb = ramblock;
2052     }
2053     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2054     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2055         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2056                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2057                      __func__, start, len, ramblock->used_length);
2058         return -1;
2059     }
2060 
2061     struct RAMSrcPageRequest *new_entry =
2062         g_malloc0(sizeof(struct RAMSrcPageRequest));
2063     new_entry->rb = ramblock;
2064     new_entry->offset = start;
2065     new_entry->len = len;
2066 
2067     memory_region_ref(ramblock->mr);
2068     qemu_mutex_lock(&rs->src_page_req_mutex);
2069     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2070     migration_make_urgent_request();
2071     qemu_mutex_unlock(&rs->src_page_req_mutex);
2072 
2073     return 0;
2074 }
2075 
2076 static bool save_page_use_compression(RAMState *rs)
2077 {
2078     if (!migrate_use_compression()) {
2079         return false;
2080     }
2081 
2082     /*
2083      * If xbzrle is enabled (e.g., after the first round of migration), stop
2084      * using data compression. In theory, xbzrle can do better than
2085      * compression.
2086      */
2087     if (rs->xbzrle_enabled) {
2088         return false;
2089     }
2090 
2091     return true;
2092 }
2093 
2094 /*
2095  * Try to compress the page before posting it out; return true if the page
2096  * has been properly handled by compression, otherwise it needs other
2097  * paths to handle it.
2098  */
2099 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2100 {
2101     if (!save_page_use_compression(rs)) {
2102         return false;
2103     }
2104 
2105     /*
2106      * When starting to process a new block, the first page of the
2107      * block should be sent out before other pages in the same block,
2108      * and all the pages in the last block should have been sent out.
2109      * Keeping this order is important, because the 'cont' flag
2110      * is used to avoid resending the block name.
2111      *
2112      * We post the first page as a normal page as compression will take
2113      * much CPU resource.
2114      */
2115     if (block != rs->last_sent_block) {
2116         flush_compressed_data(rs);
2117         return false;
2118     }
2119 
2120     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2121         return true;
2122     }
2123 
2124     compression_counters.busy++;
2125     return false;
2126 }
2127 
2128 /**
2129  * ram_save_target_page: save one target page
2130  *
2131  * Returns the number of pages written
2132  *
2133  * @rs: current RAM state
2134  * @pss: data about the page we want to send
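 *
 * The page is tried, in order, via the RDMA control path
 * (control_save_page), multi-threaded compression (save_compress_page),
 * zero-page detection (save_zero_page), multifd (when compression is off
 * and we are not in postcopy), and finally the normal ram_save_page() path.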
2135  */
2136 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2137 {
2138     RAMBlock *block = pss->block;
2139     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2140     int res;
2141 
2142     if (control_save_page(rs, block, offset, &res)) {
2143         return res;
2144     }
2145 
2146     if (save_compress_page(rs, block, offset)) {
2147         return 1;
2148     }
2149 
2150     res = save_zero_page(rs, block, offset);
2151     if (res > 0) {
2152         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2153          * page would be stale
2154          */
2155         if (!save_page_use_compression(rs)) {
2156             XBZRLE_cache_lock();
2157             xbzrle_cache_zero_page(rs, block->offset + offset);
2158             XBZRLE_cache_unlock();
2159         }
2160         return res;
2161     }
2162 
2163     /*
2164      * Do not use multifd for:
2165      * 1. Compression, as the first page in the new block should be posted
2166      *    out before sending the compressed page
2167      * 2. Postcopy, as one whole host page should be placed at a time
2168      */
2169     if (!save_page_use_compression(rs) && migrate_use_multifd()
2170         && !migration_in_postcopy()) {
2171         return ram_save_multifd_page(rs, block, offset);
2172     }
2173 
2174     return ram_save_page(rs, pss);
2175 }
2176 
2177 /**
2178  * ram_save_host_page: save a whole host page
2179  *
2180  * Starting at pss->page, send pages up to the end of the current host
2181  * page. It's valid for the initial offset to point into the middle of
2182  * a host page, in which case the remainder of the host page is sent.
2183  * Only dirty target pages are sent. Note that the host page size may
2184  * be a huge page for this block.
2185  * The saving stops at the boundary of the used_length of the block
2186  * if the RAMBlock isn't a multiple of the host page size.
2187  *
2188  * Returns the number of pages written or negative on error
2189  *
2190  * @rs: current RAM state
2191  * @pss: data about the page we want to send
2192  */
2193 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2194 {
2195     int tmppages, pages = 0;
2196     size_t pagesize_bits =
2197         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2198     unsigned long hostpage_boundary =
2199         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2200     unsigned long start_page = pss->page;
2201     int res;
2202 
2203     if (ramblock_is_ignored(pss->block)) {
2204         error_report("block %s should not be migrated !", pss->block->idstr);
2205         return 0;
2206     }
2207 
2208     do {
2209         /* Check if the page is dirty and if so send it */
2210         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2211             tmppages = ram_save_target_page(rs, pss);
2212             if (tmppages < 0) {
2213                 return tmppages;
2214             }
2215 
2216             pages += tmppages;
2217             /*
2218              * Allow rate limiting to happen in the middle of huge pages if
2219              * something is sent in the current iteration.
2220              */
2221             if (pagesize_bits > 1 && tmppages > 0) {
2222                 migration_rate_limit();
2223             }
2224         }
2225         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2226     } while ((pss->page < hostpage_boundary) &&
2227              offset_in_ramblock(pss->block,
2228                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2229     /* The offset we leave with is the min boundary of host page and block */
2230     pss->page = MIN(pss->page, hostpage_boundary);
2231 
2232     res = ram_save_release_protection(rs, pss, start_page);
2233     return (res < 0 ? res : pages);
2234 }
2235 
2236 /**
2237  * ram_find_and_save_block: finds a dirty page and sends it to f
2238  *
2239  * Called within an RCU critical section.
2240  *
2241  * Returns the number of pages written where zero means no dirty pages,
2242  * or negative on error
2243  *
2244  * @rs: current RAM state
2245  *
2246  * On systems where host-page-size > target-page-size it will send all the
2247  * pages in a host page that are dirty.
2248  */
2249 static int ram_find_and_save_block(RAMState *rs)
2250 {
2251     PageSearchStatus pss;
2252     int pages = 0;
2253     bool again, found;
2254 
2255     /* No dirty page as there is zero RAM */
2256     if (!ram_bytes_total()) {
2257         return pages;
2258     }
2259 
2260     pss.block = rs->last_seen_block;
2261     pss.page = rs->last_page;
2262     pss.complete_round = false;
2263 
2264     if (!pss.block) {
2265         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2266     }
2267 
2268     do {
2269         again = true;
2270         found = get_queued_page(rs, &pss);
2271 
2272         if (!found) {
2273             /* priority queue empty, so just search for something dirty */
2274             found = find_dirty_block(rs, &pss, &again);
2275         }
2276 
2277         if (found) {
2278             pages = ram_save_host_page(rs, &pss);
2279         }
2280     } while (!pages && again);
2281 
2282     rs->last_seen_block = pss.block;
2283     rs->last_page = pss.page;
2284 
2285     return pages;
2286 }
2287 
2288 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2289 {
2290     uint64_t pages = size / TARGET_PAGE_SIZE;
2291 
2292     if (zero) {
2293         ram_counters.duplicate += pages;
2294     } else {
2295         ram_counters.normal += pages;
2296         ram_transferred_add(size);
2297         qemu_update_position(f, size);
2298     }
2299 }
2300 
2301 static uint64_t ram_bytes_total_common(bool count_ignored)
2302 {
2303     RAMBlock *block;
2304     uint64_t total = 0;
2305 
2306     RCU_READ_LOCK_GUARD();
2307 
2308     if (count_ignored) {
2309         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2310             total += block->used_length;
2311         }
2312     } else {
2313         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2314             total += block->used_length;
2315         }
2316     }
2317     return total;
2318 }
2319 
2320 uint64_t ram_bytes_total(void)
2321 {
2322     return ram_bytes_total_common(false);
2323 }
2324 
2325 static void xbzrle_load_setup(void)
2326 {
2327     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2328 }
2329 
2330 static void xbzrle_load_cleanup(void)
2331 {
2332     g_free(XBZRLE.decoded_buf);
2333     XBZRLE.decoded_buf = NULL;
2334 }
2335 
2336 static void ram_state_cleanup(RAMState **rsp)
2337 {
2338     if (*rsp) {
2339         migration_page_queue_free(*rsp);
2340         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2341         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2342         g_free(*rsp);
2343         *rsp = NULL;
2344     }
2345 }
2346 
2347 static void xbzrle_cleanup(void)
2348 {
2349     XBZRLE_cache_lock();
2350     if (XBZRLE.cache) {
2351         cache_fini(XBZRLE.cache);
2352         g_free(XBZRLE.encoded_buf);
2353         g_free(XBZRLE.current_buf);
2354         g_free(XBZRLE.zero_target_page);
2355         XBZRLE.cache = NULL;
2356         XBZRLE.encoded_buf = NULL;
2357         XBZRLE.current_buf = NULL;
2358         XBZRLE.zero_target_page = NULL;
2359     }
2360     XBZRLE_cache_unlock();
2361 }
2362 
2363 static void ram_save_cleanup(void *opaque)
2364 {
2365     RAMState **rsp = opaque;
2366     RAMBlock *block;
2367 
2368     /* We don't use dirty log with background snapshots */
2369     if (!migrate_background_snapshot()) {
2370         /* The caller must hold the iothread lock or be in a BH, so there
2371          * is no write race against the migration bitmap
2372          */
2373         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2374             /*
2375              * Do not stop dirty logging without having started it, since
2376              * memory_global_dirty_log_stop will assert that
2377              * memory_global_dirty_log_start/stop are used in pairs
2378              */
2379             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2380         }
2381     }
2382 
2383     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2384         g_free(block->clear_bmap);
2385         block->clear_bmap = NULL;
2386         g_free(block->bmap);
2387         block->bmap = NULL;
2388     }
2389 
2390     xbzrle_cleanup();
2391     compress_threads_save_cleanup();
2392     ram_state_cleanup(rsp);
2393 }
2394 
2395 static void ram_state_reset(RAMState *rs)
2396 {
2397     rs->last_seen_block = NULL;
2398     rs->last_sent_block = NULL;
2399     rs->last_page = 0;
2400     rs->last_version = ram_list.version;
2401     rs->xbzrle_enabled = false;
2402 }
2403 
2404 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2405 
2406 /* **** functions for postcopy ***** */
2407 
2408 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2409 {
2410     struct RAMBlock *block;
2411 
2412     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2413         unsigned long *bitmap = block->bmap;
2414         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2415         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2416 
2417         while (run_start < range) {
2418             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2419             ram_discard_range(block->idstr,
2420                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2421                               ((ram_addr_t)(run_end - run_start))
2422                                 << TARGET_PAGE_BITS);
2423             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2424         }
2425     }
2426 }
2427 
2428 /**
2429  * postcopy_send_discard_bm_ram: discard a RAMBlock
2430  *
2431  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2432  *
2433  * @ms: current migration state
2434  * @block: RAMBlock to discard
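 *
 * Walks the dirty bitmap of @block and emits one discard range per
 * contiguous run of dirty target pages.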
2435  */
2436 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2437 {
2438     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2439     unsigned long current;
2440     unsigned long *bitmap = block->bmap;
2441 
2442     for (current = 0; current < end; ) {
2443         unsigned long one = find_next_bit(bitmap, end, current);
2444         unsigned long zero, discard_length;
2445 
2446         if (one >= end) {
2447             break;
2448         }
2449 
2450         zero = find_next_zero_bit(bitmap, end, one + 1);
2451 
2452         if (zero >= end) {
2453             discard_length = end - one;
2454         } else {
2455             discard_length = zero - one;
2456         }
2457         postcopy_discard_send_range(ms, one, discard_length);
2458         current = one + discard_length;
2459     }
2460 }
2461 
2462 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2463 
2464 /**
2465  * postcopy_each_ram_send_discard: discard all RAMBlocks
2466  *
2467  * Utility for the outgoing postcopy code.
2468  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2469  *   passing it bitmap indexes and name.
2470  * (qemu_ram_foreach_block ends up passing unscaled lengths
2471  *  which would mean postcopy code would have to deal with target page)
2472  *
2473  * @ms: current migration state
2474  */
2475 static void postcopy_each_ram_send_discard(MigrationState *ms)
2476 {
2477     struct RAMBlock *block;
2478 
2479     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2480         postcopy_discard_send_init(ms, block->idstr);
2481 
2482         /*
2483          * Deal with TPS != HPS and huge pages.  It discards any partially
2484          * sent host-page size chunks and marks any partially dirty host-page
2485          * size chunks as all dirty.  In this case the host page is the host
2486          * page for the particular RAMBlock, i.e. it might be a huge page.
2487          */
2488         postcopy_chunk_hostpages_pass(ms, block);
2489 
2490         /*
2491          * Postcopy sends chunks of bitmap over the wire, but it
2492          * just needs indexes at this point; this avoids it having
2493          * target-page-specific code.
2494          */
2495         postcopy_send_discard_bm_ram(ms, block);
2496         postcopy_discard_send_finish(ms);
2497     }
2498 }
2499 
2500 /**
2501  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2502  *
2503  * Helper for postcopy_each_ram_send_discard(); it is called once per
2504  * RAMBlock to canonicalize its dirty bitmap.
2506  *
2507  * Postcopy requires that all target pages in a hostpage are dirty or
2508  * clean, not a mix.  This function canonicalizes the bitmaps.
2509  *
2510  * @ms: current migration state
2511  * @block: block that contains the page we want to canonicalize
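 *
 * For example, with a host page spanning 4 target pages (host_ratio == 4),
 * a dirty run covering only target pages 5..6 is widened so that pages
 * 4..7 all end up dirty, keeping every host page either fully dirty or
 * fully clean.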
2512  */
2513 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2514 {
2515     RAMState *rs = ram_state;
2516     unsigned long *bitmap = block->bmap;
2517     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2518     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2519     unsigned long run_start;
2520 
2521     if (block->page_size == TARGET_PAGE_SIZE) {
2522         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2523         return;
2524     }
2525 
2526     /* Find a dirty page */
2527     run_start = find_next_bit(bitmap, pages, 0);
2528 
2529     while (run_start < pages) {
2530 
2531         /*
2532          * If the start of this run of pages is in the middle of a host
2533          * page, then we need to fixup this host page.
2534          */
2535         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2536             /* Find the end of this run */
2537             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2538             /*
2539              * If the end isn't at the start of a host page, then the
2540              * run doesn't finish at the end of a host page
2541              * and we need to discard.
2542              */
2543         }
2544 
2545         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2546             unsigned long page;
2547             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2548                                                              host_ratio);
2549             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2550 
2551             /* Clean up the bitmap */
2552             for (page = fixup_start_addr;
2553                  page < fixup_start_addr + host_ratio; page++) {
2554                 /*
2555                  * Remark them as dirty, updating the count for any pages
2556                  * that weren't previously dirty.
2557                  */
2558                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2559             }
2560         }
2561 
2562         /* Find the next dirty page for the next iteration */
2563         run_start = find_next_bit(bitmap, pages, run_start);
2564     }
2565 }
2566 
2567 /**
2568  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2569  *
2570  * Transmit the set of pages to be discarded after precopy to the target;
2571  * these are pages that:
2572  *     a) Have been previously transmitted but are now dirty again
2573  *     b) Have never been transmitted; this ensures that any pages on the
2574  *        destination that have been mapped by background tasks get
2575  *        discarded (transparent huge pages are the specific concern)
2576  * Hopefully this is pretty sparse.
2577  *
2578  * @ms: current migration state
2579  */
2580 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2581 {
2582     RAMState *rs = ram_state;
2583 
2584     RCU_READ_LOCK_GUARD();
2585 
2586     /* This should be our last sync, the src is now paused */
2587     migration_bitmap_sync(rs);
2588 
2589     /* Easiest way to make sure we don't resume in the middle of a host-page */
2590     rs->last_seen_block = NULL;
2591     rs->last_sent_block = NULL;
2592     rs->last_page = 0;
2593 
2594     postcopy_each_ram_send_discard(ms);
2595 
2596     trace_ram_postcopy_send_discard_bitmap();
2597 }
2598 
2599 /**
2600  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2601  *
2602  * Returns zero on success
2603  *
2604  * @rbname: name of the RAMBlock of the request
2605  * @start: byte offset of the first page to discard within the RAMBlock
2606  * @length: number of bytes to discard
2608  */
2609 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2610 {
2611     trace_ram_discard_range(rbname, start, length);
2612 
2613     RCU_READ_LOCK_GUARD();
2614     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2615 
2616     if (!rb) {
2617         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2618         return -1;
2619     }
2620 
2621     /*
2622      * On source VM, we don't need to update the received bitmap since
2623      * we don't even have one.
2624      */
2625     if (rb->receivedmap) {
2626         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2627                      length >> qemu_target_page_bits());
2628     }
2629 
2630     return ram_block_discard_range(rb, start, length);
2631 }
2632 
2633 /*
2634  * For every allocation, we will try not to crash the VM if the
2635  * allocation fails.
2636  */
2637 static int xbzrle_init(void)
2638 {
2639     Error *local_err = NULL;
2640 
2641     if (!migrate_use_xbzrle()) {
2642         return 0;
2643     }
2644 
2645     XBZRLE_cache_lock();
2646 
2647     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2648     if (!XBZRLE.zero_target_page) {
2649         error_report("%s: Error allocating zero page", __func__);
2650         goto err_out;
2651     }
2652 
2653     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2654                               TARGET_PAGE_SIZE, &local_err);
2655     if (!XBZRLE.cache) {
2656         error_report_err(local_err);
2657         goto free_zero_page;
2658     }
2659 
2660     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2661     if (!XBZRLE.encoded_buf) {
2662         error_report("%s: Error allocating encoded_buf", __func__);
2663         goto free_cache;
2664     }
2665 
2666     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2667     if (!XBZRLE.current_buf) {
2668         error_report("%s: Error allocating current_buf", __func__);
2669         goto free_encoded_buf;
2670     }
2671 
2672     /* We are all good */
2673     XBZRLE_cache_unlock();
2674     return 0;
2675 
2676 free_encoded_buf:
2677     g_free(XBZRLE.encoded_buf);
2678     XBZRLE.encoded_buf = NULL;
2679 free_cache:
2680     cache_fini(XBZRLE.cache);
2681     XBZRLE.cache = NULL;
2682 free_zero_page:
2683     g_free(XBZRLE.zero_target_page);
2684     XBZRLE.zero_target_page = NULL;
2685 err_out:
2686     XBZRLE_cache_unlock();
2687     return -ENOMEM;
2688 }
2689 
2690 static int ram_state_init(RAMState **rsp)
2691 {
2692     *rsp = g_try_new0(RAMState, 1);
2693 
2694     if (!*rsp) {
2695         error_report("%s: Init ramstate fail", __func__);
2696         return -1;
2697     }
2698 
2699     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2700     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2701     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2702 
2703     /*
2704      * Count the total number of pages used by ram blocks not including any
2705      * gaps due to alignment or unplugs.
2706      * This must match the initial value of the dirty bitmap.
2707      */
2708     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2709     ram_state_reset(*rsp);
2710 
2711     return 0;
2712 }
2713 
2714 static void ram_list_init_bitmaps(void)
2715 {
2716     MigrationState *ms = migrate_get_current();
2717     RAMBlock *block;
2718     unsigned long pages;
2719     uint8_t shift;
2720 
2721     /* Skip setting bitmap if there is no RAM */
2722     if (ram_bytes_total()) {
2723         shift = ms->clear_bitmap_shift;
2724         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2725             error_report("clear_bitmap_shift (%u) too big, using "
2726                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2727             shift = CLEAR_BITMAP_SHIFT_MAX;
2728         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2729             error_report("clear_bitmap_shift (%u) too small, using "
2730                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2731             shift = CLEAR_BITMAP_SHIFT_MIN;
2732         }
2733 
2734         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2735             pages = block->max_length >> TARGET_PAGE_BITS;
2736             /*
2737              * The initial dirty bitmap for migration must be set with all
2738              * ones to make sure we'll migrate every guest RAM page to
2739              * the destination.
2740              * Here we set RAMBlock.bmap all to 1 because when restarting
2741              * a migration after a failed one, ram_list.
2742              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2743              * guest memory.
2744              */
2745             block->bmap = bitmap_new(pages);
2746             bitmap_set(block->bmap, 0, pages);
2747             block->clear_bmap_shift = shift;
2748             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2749         }
2750     }
2751 }
2752 
2753 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2754 {
2755     unsigned long pages;
2756     RAMBlock *rb;
2757 
2758     RCU_READ_LOCK_GUARD();
2759 
2760     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2761         pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2762         rs->migration_dirty_pages -= pages;
2763     }
2764 }
2765 
2766 static void ram_init_bitmaps(RAMState *rs)
2767 {
2768     /* For memory_global_dirty_log_start below.  */
2769     qemu_mutex_lock_iothread();
2770     qemu_mutex_lock_ramlist();
2771 
2772     WITH_RCU_READ_LOCK_GUARD() {
2773         ram_list_init_bitmaps();
2774         /* We don't use dirty log with background snapshots */
2775         if (!migrate_background_snapshot()) {
2776             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2777             migration_bitmap_sync_precopy(rs);
2778         }
2779     }
2780     qemu_mutex_unlock_ramlist();
2781     qemu_mutex_unlock_iothread();
2782 
2783     /*
2784      * After an eventual first bitmap sync, fixup the initial bitmap
2785      * containing all 1s to exclude any discarded pages from migration.
2786      */
2787     migration_bitmap_clear_discarded_pages(rs);
2788 }
2789 
2790 static int ram_init_all(RAMState **rsp)
2791 {
2792     if (ram_state_init(rsp)) {
2793         return -1;
2794     }
2795 
2796     if (xbzrle_init()) {
2797         ram_state_cleanup(rsp);
2798         return -1;
2799     }
2800 
2801     ram_init_bitmaps(*rsp);
2802 
2803     return 0;
2804 }
2805 
2806 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2807 {
2808     RAMBlock *block;
2809     uint64_t pages = 0;
2810 
2811     /*
2812      * Postcopy is not using xbzrle/compression, so no need for that.
2813      * Also, since the source is already halted, we don't need to care
2814      * about dirty page logging either.
2815      */
2816 
2817     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2818         pages += bitmap_count_one(block->bmap,
2819                                   block->used_length >> TARGET_PAGE_BITS);
2820     }
2821 
2822     /* This may not be aligned with current bitmaps. Recalculate. */
2823     rs->migration_dirty_pages = pages;
2824 
2825     ram_state_reset(rs);
2826 
2827     /* Update RAMState cache of output QEMUFile */
2828     rs->f = out;
2829 
2830     trace_ram_state_resume_prepare(pages);
2831 }
2832 
2833 /*
2834  * This function clears bits of the free pages reported by the caller from the
2835  * migration dirty bitmap. @addr is the host address corresponding to the
2836  * start of the contiguous guest free pages, and @len is the total bytes of
2837  * those pages.
2838  */
2839 void qemu_guest_free_page_hint(void *addr, size_t len)
2840 {
2841     RAMBlock *block;
2842     ram_addr_t offset;
2843     size_t used_len, start, npages;
2844     MigrationState *s = migrate_get_current();
2845 
2846     /* This function is currently expected to be used during live migration */
2847     if (!migration_is_setup_or_active(s->state)) {
2848         return;
2849     }
2850 
2851     for (; len > 0; len -= used_len, addr += used_len) {
2852         block = qemu_ram_block_from_host(addr, false, &offset);
2853         if (unlikely(!block || offset >= block->used_length)) {
2854             /*
2855              * The implementation might not support RAMBlock resize during
2856              * live migration, but it could happen in theory with future
2857              * updates. So we add a check here to capture that case.
2858              */
2859             error_report_once("%s unexpected error", __func__);
2860             return;
2861         }
2862 
2863         if (len <= block->used_length - offset) {
2864             used_len = len;
2865         } else {
2866             used_len = block->used_length - offset;
2867         }
2868 
2869         start = offset >> TARGET_PAGE_BITS;
2870         npages = used_len >> TARGET_PAGE_BITS;
2871 
2872         qemu_mutex_lock(&ram_state->bitmap_mutex);
2873         /*
2874          * The skipped free pages are equivalent to having been sent from
2875          * clear_bmap's perspective, so clear the bits from the memory region
2876          * bitmap which are initially set. Otherwise those skipped pages will
2877          * be sent in the next round after syncing from the memory region bitmap.
2878          */
2879         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2880         ram_state->migration_dirty_pages -=
2881                       bitmap_count_one_with_offset(block->bmap, start, npages);
2882         bitmap_clear(block->bmap, start, npages);
2883         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2884     }
2885 }
2886 
2887 /*
2888  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2889  * a long-running RCU critical section.  When RCU reclaims in the code
2890  * start to become numerous it will be necessary to reduce the
2891  * granularity of these critical sections.
2892  */
2893 
2894 /**
2895  * ram_save_setup: Setup RAM for migration
2896  *
2897  * Returns zero to indicate success and negative for error
2898  *
2899  * @f: QEMUFile where to send the data
2900  * @opaque: RAMState pointer
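 *
 * The setup section starts with the total RAM size tagged with
 * RAM_SAVE_FLAG_MEM_SIZE, followed, for each migratable block, by the
 * idstr length, the idstr itself, the used length and (depending on the
 * configuration) the page size and memory region address, and ends with
 * RAM_SAVE_FLAG_EOS.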
2901  */
2902 static int ram_save_setup(QEMUFile *f, void *opaque)
2903 {
2904     RAMState **rsp = opaque;
2905     RAMBlock *block;
2906 
2907     if (compress_threads_save_setup()) {
2908         return -1;
2909     }
2910 
2911     /* migration has already set up the bitmap, reuse it. */
2912     if (!migration_in_colo_state()) {
2913         if (ram_init_all(rsp) != 0) {
2914             compress_threads_save_cleanup();
2915             return -1;
2916         }
2917     }
2918     (*rsp)->f = f;
2919 
2920     WITH_RCU_READ_LOCK_GUARD() {
2921         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2922 
2923         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2924             qemu_put_byte(f, strlen(block->idstr));
2925             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2926             qemu_put_be64(f, block->used_length);
2927             if (migrate_postcopy_ram() && block->page_size !=
2928                                           qemu_host_page_size) {
2929                 qemu_put_be64(f, block->page_size);
2930             }
2931             if (migrate_ignore_shared()) {
2932                 qemu_put_be64(f, block->mr->addr);
2933             }
2934         }
2935     }
2936 
2937     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2938     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2939 
2940     multifd_send_sync_main(f);
2941     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2942     qemu_fflush(f);
2943 
2944     return 0;
2945 }
2946 
2947 /**
2948  * ram_save_iterate: iterative stage for migration
2949  *
2950  * Returns zero to indicate success and negative for error
2951  *
2952  * @f: QEMUFile where to send the data
2953  * @opaque: RAMState pointer
2954  */
2955 static int ram_save_iterate(QEMUFile *f, void *opaque)
2956 {
2957     RAMState **temp = opaque;
2958     RAMState *rs = *temp;
2959     int ret = 0;
2960     int i;
2961     int64_t t0;
2962     int done = 0;
2963 
2964     if (blk_mig_bulk_active()) {
2965         /* Avoid transferring ram during bulk phase of block migration as
2966          * the bulk phase will usually take a long time and transferring
2967          * ram updates during that time is pointless. */
2968         goto out;
2969     }
2970 
2971     /*
2972      * We'll hold this lock for a while, but it's okay for two reasons.
2973      * Firstly, the only other thread that could take it is the one calling
2974      * qemu_guest_free_page_hint(), which should be rare; secondly, see
2975      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2976      * guarantees that we release it on a regular basis.
2977      */
2978     qemu_mutex_lock(&rs->bitmap_mutex);
2979     WITH_RCU_READ_LOCK_GUARD() {
2980         if (ram_list.version != rs->last_version) {
2981             ram_state_reset(rs);
2982         }
2983 
2984         /* Read version before ram_list.blocks */
2985         smp_rmb();
2986 
2987         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2988 
2989         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2990         i = 0;
2991         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2992                postcopy_has_request(rs)) {
2993             int pages;
2994 
2995             if (qemu_file_get_error(f)) {
2996                 break;
2997             }
2998 
2999             pages = ram_find_and_save_block(rs);
3000             /* no more pages to send */
3001             if (pages == 0) {
3002                 done = 1;
3003                 break;
3004             }
3005 
3006             if (pages < 0) {
3007                 qemu_file_set_error(f, pages);
3008                 break;
3009             }
3010 
3011             rs->target_page_count += pages;
3012 
3013             /*
3014              * During postcopy, it is necessary to make sure one whole host
3015              * page is sent in one chunk.
3016              */
3017             if (migrate_postcopy_ram()) {
3018                 flush_compressed_data(rs);
3019             }
3020 
3021             /*
3022              * We want to check in the 1st loop, just in case it was the 1st
3023              * time and we had to sync the dirty bitmap.
3024              * qemu_clock_get_ns() is a bit expensive, so we only check every
3025              * few iterations.
3026              */
3027             if ((i & 63) == 0) {
3028                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3029                               1000000;
3030                 if (t1 > MAX_WAIT) {
3031                     trace_ram_save_iterate_big_wait(t1, i);
3032                     break;
3033                 }
3034             }
3035             i++;
3036         }
3037     }
3038     qemu_mutex_unlock(&rs->bitmap_mutex);
3039 
3040     /*
3041      * Must occur before EOS (or any QEMUFile operation)
3042      * because of RDMA protocol.
3043      */
3044     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3045 
3046 out:
3047     if (ret >= 0
3048         && migration_is_setup_or_active(migrate_get_current()->state)) {
3049         multifd_send_sync_main(rs->f);
3050         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3051         qemu_fflush(f);
3052         ram_transferred_add(8);
3053 
3054         ret = qemu_file_get_error(f);
3055     }
3056     if (ret < 0) {
3057         return ret;
3058     }
3059 
3060     return done;
3061 }
3062 
3063 /**
3064  * ram_save_complete: function called to send the remaining amount of ram
3065  *
3066  * Returns zero to indicate success or negative on error
3067  *
3068  * Called with iothread lock
3069  *
3070  * @f: QEMUFile where to send the data
3071  * @opaque: RAMState pointer
3072  */
3073 static int ram_save_complete(QEMUFile *f, void *opaque)
3074 {
3075     RAMState **temp = opaque;
3076     RAMState *rs = *temp;
3077     int ret = 0;
3078 
3079     rs->last_stage = !migration_in_colo_state();
3080 
3081     WITH_RCU_READ_LOCK_GUARD() {
3082         if (!migration_in_postcopy()) {
3083             migration_bitmap_sync_precopy(rs);
3084         }
3085 
3086         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3087 
3088         /* try transferring iterative blocks of memory */
3089 
3090         /* flush all remaining blocks regardless of rate limiting */
3091         while (true) {
3092             int pages;
3093 
3094             pages = ram_find_and_save_block(rs);
3095             /* no more blocks to send */
3096             if (pages == 0) {
3097                 break;
3098             }
3099             if (pages < 0) {
3100                 ret = pages;
3101                 break;
3102             }
3103         }
3104 
3105         flush_compressed_data(rs);
3106         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3107     }
3108 
3109     if (ret >= 0) {
3110         multifd_send_sync_main(rs->f);
3111         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3112         qemu_fflush(f);
3113     }
3114 
3115     return ret;
3116 }
3117 
3118 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3119                              uint64_t *res_precopy_only,
3120                              uint64_t *res_compatible,
3121                              uint64_t *res_postcopy_only)
3122 {
3123     RAMState **temp = opaque;
3124     RAMState *rs = *temp;
3125     uint64_t remaining_size;
3126 
3127     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3128 
3129     if (!migration_in_postcopy() &&
3130         remaining_size < max_size) {
3131         qemu_mutex_lock_iothread();
3132         WITH_RCU_READ_LOCK_GUARD() {
3133             migration_bitmap_sync_precopy(rs);
3134         }
3135         qemu_mutex_unlock_iothread();
3136         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3137     }
3138 
3139     if (migrate_postcopy_ram()) {
3140         /* We can do postcopy, and all the data is postcopiable */
3141         *res_compatible += remaining_size;
3142     } else {
3143         *res_precopy_only += remaining_size;
3144     }
3145 }
3146 
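/**
 * load_xbzrle: decode one XBZRLE-encoded page from the stream
 *
 * The wire format is a one-byte encoding flag (which must be
 * ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length, and the
 * encoded data, which is decoded on top of the existing contents at @host.
 *
 * Returns 0 on success, -1 on error
 */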
3147 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3148 {
3149     unsigned int xh_len;
3150     int xh_flags;
3151     uint8_t *loaded_data;
3152 
3153     /* extract RLE header */
3154     xh_flags = qemu_get_byte(f);
3155     xh_len = qemu_get_be16(f);
3156 
3157     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3158         error_report("Failed to load XBZRLE page - wrong compression!");
3159         return -1;
3160     }
3161 
3162     if (xh_len > TARGET_PAGE_SIZE) {
3163         error_report("Failed to load XBZRLE page - len overflow!");
3164         return -1;
3165     }
3166     loaded_data = XBZRLE.decoded_buf;
3167     /* load data and decode */
3168     /* it can change loaded_data to point to an internal buffer */
3169     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3170 
3171     /* decode RLE */
3172     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3173                              TARGET_PAGE_SIZE) == -1) {
3174         error_report("Failed to load XBZRLE page - decode error!");
3175         return -1;
3176     }
3177 
3178     return 0;
3179 }
3180 
3181 /**
3182  * ram_block_from_stream: read a RAMBlock id from the migration stream
3183  *
3184  * Must be called from within an RCU critical section.
3185  *
3186  * Returns a pointer from within the RCU-protected ram_list.
3187  *
3188  * @f: QEMUFile where to read the data from
3189  * @flags: Page flags (mostly to see if it's a continuation of previous block)
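 *
 * On the wire a block reference is either the RAM_SAVE_FLAG_CONTINUE flag
 * (reuse the previously named block) or a one-byte idstr length followed
 * by the idstr itself.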
3190  */
3191 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3192 {
3193     static RAMBlock *block;
3194     char id[256];
3195     uint8_t len;
3196 
3197     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3198         if (!block) {
3199             error_report("Ack, bad migration stream!");
3200             return NULL;
3201         }
3202         return block;
3203     }
3204 
3205     len = qemu_get_byte(f);
3206     qemu_get_buffer(f, (uint8_t *)id, len);
3207     id[len] = 0;
3208 
3209     block = qemu_ram_block_by_name(id);
3210     if (!block) {
3211         error_report("Can't find block %s", id);
3212         return NULL;
3213     }
3214 
3215     if (ramblock_is_ignored(block)) {
3216         error_report("block %s should not be migrated !", id);
3217         return NULL;
3218     }
3219 
3220     return block;
3221 }
3222 
3223 static inline void *host_from_ram_block_offset(RAMBlock *block,
3224                                                ram_addr_t offset)
3225 {
3226     if (!offset_in_ramblock(block, offset)) {
3227         return NULL;
3228     }
3229 
3230     return block->host + offset;
3231 }
3232 
3233 static void *host_page_from_ram_block_offset(RAMBlock *block,
3234                                              ram_addr_t offset)
3235 {
3236     /* Note: Explicitly no check against offset_in_ramblock(). */
3237     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3238                                    block->page_size);
3239 }
3240 
3241 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3242                                                          ram_addr_t offset)
3243 {
3244     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3245 }
3246 
3247 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3248                              ram_addr_t offset, bool record_bitmap)
3249 {
3250     if (!offset_in_ramblock(block, offset)) {
3251         return NULL;
3252     }
3253     if (!block->colo_cache) {
3254         error_report("%s: colo_cache is NULL in block :%s",
3255                      __func__, block->idstr);
3256         return NULL;
3257     }
3258 
3259     /*
3260      * During a COLO checkpoint, we need a bitmap of these migrated pages.
3261      * It helps us decide which pages in the RAM cache should be flushed
3262      * into the VM's RAM later.
3263      */
3264     if (record_bitmap &&
3265         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3266         ram_state->migration_dirty_pages++;
3267     }
3268     return block->colo_cache + offset;
3269 }
3270 
3271 /**
3272  * ram_handle_compressed: handle the zero page case
3273  *
3274  * If a page (or a whole RDMA chunk) has been
3275  * determined to be zero, then zap it.
3276  *
3277  * @host: host address for the zero page
3278  * @ch: the byte the page is filled with.  We only support zero
3279  * @size: size of the zero page
3280  */
3281 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3282 {
3283     if (ch != 0 || !buffer_is_zero(host, size)) {
3284         memset(host, ch, size);
3285     }
3286 }
3287 
3288 /* return the size after decompression, or negative value on error */
3289 static int
3290 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3291                      const uint8_t *source, size_t source_len)
3292 {
3293     int err;
3294 
3295     err = inflateReset(stream);
3296     if (err != Z_OK) {
3297         return -1;
3298     }
3299 
3300     stream->avail_in = source_len;
3301     stream->next_in = (uint8_t *)source;
3302     stream->avail_out = dest_len;
3303     stream->next_out = dest;
3304 
3305     err = inflate(stream, Z_NO_FLUSH);
3306     if (err != Z_STREAM_END) {
3307         return -1;
3308     }
3309 
3310     return stream->total_out;
3311 }
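
/*
 * Illustrative counterpart (a sketch, not the original code): the deflate
 * side that produces the data qemu_uncompress_data() consumes.  It assumes
 * "stream" was already set up with deflateInit() and that dest_len is at
 * least compressBound(source_len) so Z_FINISH can complete in one call.
 */
static G_GNUC_UNUSED int
qemu_compress_data_sketch(z_stream *stream, uint8_t *dest, size_t dest_len,
                          const uint8_t *source, size_t source_len)
{
    int err;

    err = deflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = deflate(stream, Z_FINISH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    /* size after compression */
    return stream->total_out;
}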
3312 
3313 static void *do_data_decompress(void *opaque)
3314 {
3315     DecompressParam *param = opaque;
3316     unsigned long pagesize;
3317     uint8_t *des;
3318     int len, ret;
3319 
3320     qemu_mutex_lock(&param->mutex);
3321     while (!param->quit) {
3322         if (param->des) {
3323             des = param->des;
3324             len = param->len;
3325             param->des = 0;
3326             qemu_mutex_unlock(&param->mutex);
3327 
3328             pagesize = TARGET_PAGE_SIZE;
3329 
3330             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3331                                        param->compbuf, len);
3332             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3333                 error_report("decompress data failed");
3334                 qemu_file_set_error(decomp_file, ret);
3335             }
3336 
3337             qemu_mutex_lock(&decomp_done_lock);
3338             param->done = true;
3339             qemu_cond_signal(&decomp_done_cond);
3340             qemu_mutex_unlock(&decomp_done_lock);
3341 
3342             qemu_mutex_lock(&param->mutex);
3343         } else {
3344             qemu_cond_wait(&param->cond, &param->mutex);
3345         }
3346     }
3347     qemu_mutex_unlock(&param->mutex);
3348 
3349     return NULL;
3350 }
3351 
3352 static int wait_for_decompress_done(void)
3353 {
3354     int idx, thread_count;
3355 
3356     if (!migrate_use_compression()) {
3357         return 0;
3358     }
3359 
3360     thread_count = migrate_decompress_threads();
3361     qemu_mutex_lock(&decomp_done_lock);
3362     for (idx = 0; idx < thread_count; idx++) {
3363         while (!decomp_param[idx].done) {
3364             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3365         }
3366     }
3367     qemu_mutex_unlock(&decomp_done_lock);
3368     return qemu_file_get_error(decomp_file);
3369 }
3370 
3371 static void compress_threads_load_cleanup(void)
3372 {
3373     int i, thread_count;
3374 
3375     if (!migrate_use_compression()) {
3376         return;
3377     }
3378     thread_count = migrate_decompress_threads();
3379     for (i = 0; i < thread_count; i++) {
3380         /*
3381          * We use compbuf as an indicator of whether the thread has
3382          * been properly initialized or not.
3383          */
3384         if (!decomp_param[i].compbuf) {
3385             break;
3386         }
3387 
3388         qemu_mutex_lock(&decomp_param[i].mutex);
3389         decomp_param[i].quit = true;
3390         qemu_cond_signal(&decomp_param[i].cond);
3391         qemu_mutex_unlock(&decomp_param[i].mutex);
3392     }
3393     for (i = 0; i < thread_count; i++) {
3394         if (!decomp_param[i].compbuf) {
3395             break;
3396         }
3397 
3398         qemu_thread_join(decompress_threads + i);
3399         qemu_mutex_destroy(&decomp_param[i].mutex);
3400         qemu_cond_destroy(&decomp_param[i].cond);
3401         inflateEnd(&decomp_param[i].stream);
3402         g_free(decomp_param[i].compbuf);
3403         decomp_param[i].compbuf = NULL;
3404     }
3405     g_free(decompress_threads);
3406     g_free(decomp_param);
3407     decompress_threads = NULL;
3408     decomp_param = NULL;
3409     decomp_file = NULL;
3410 }
3411 
3412 static int compress_threads_load_setup(QEMUFile *f)
3413 {
3414     int i, thread_count;
3415 
3416     if (!migrate_use_compression()) {
3417         return 0;
3418     }
3419 
3420     thread_count = migrate_decompress_threads();
3421     decompress_threads = g_new0(QemuThread, thread_count);
3422     decomp_param = g_new0(DecompressParam, thread_count);
3423     qemu_mutex_init(&decomp_done_lock);
3424     qemu_cond_init(&decomp_done_cond);
3425     decomp_file = f;
3426     for (i = 0; i < thread_count; i++) {
3427         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3428             goto exit;
3429         }
3430 
3431         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3432         qemu_mutex_init(&decomp_param[i].mutex);
3433         qemu_cond_init(&decomp_param[i].cond);
3434         decomp_param[i].done = true;
3435         decomp_param[i].quit = false;
3436         qemu_thread_create(decompress_threads + i, "decompress",
3437                            do_data_decompress, decomp_param + i,
3438                            QEMU_THREAD_JOINABLE);
3439     }
3440     return 0;
3441 exit:
3442     compress_threads_load_cleanup();
3443     return -1;
3444 }
3445 
3446 static void decompress_data_with_multi_threads(QEMUFile *f,
3447                                                void *host, int len)
3448 {
3449     int idx, thread_count;
3450 
3451     thread_count = migrate_decompress_threads();
3452     QEMU_LOCK_GUARD(&decomp_done_lock);
3453     while (true) {
3454         for (idx = 0; idx < thread_count; idx++) {
3455             if (decomp_param[idx].done) {
3456                 decomp_param[idx].done = false;
3457                 qemu_mutex_lock(&decomp_param[idx].mutex);
3458                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3459                 decomp_param[idx].des = host;
3460                 decomp_param[idx].len = len;
3461                 qemu_cond_signal(&decomp_param[idx].cond);
3462                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3463                 break;
3464             }
3465         }
3466         if (idx < thread_count) {
3467             break;
3468         } else {
3469             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3470         }
3471     }
3472 }
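
/*
 * Illustrative flow (a hypothetical caller, assuming the compress
 * capability is enabled): how the helpers above fit together -- set up the
 * worker threads, hand a compressed page to a free worker, drain the
 * workers, then tear everything down.
 */
static G_GNUC_UNUSED int decompress_pipeline_sketch(QEMUFile *f, void *host,
                                                    int len)
{
    int ret;

    if (compress_threads_load_setup(f)) {
        return -1;
    }
    decompress_data_with_multi_threads(f, host, len);   /* queue one page */
    ret = wait_for_decompress_done();                   /* drain the workers */
    compress_threads_load_cleanup();
    return ret;
}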
3473 
3474 static void colo_init_ram_state(void)
3475 {
3476     ram_state_init(&ram_state);
3477 }
3478 
3479 /*
3480  * COLO cache: this is for the secondary VM, where we cache the whole
3481  * memory of the secondary VM.  The global lock must be held when
3482  * calling this helper.
3483  */
3484 int colo_init_ram_cache(void)
3485 {
3486     RAMBlock *block;
3487 
3488     WITH_RCU_READ_LOCK_GUARD() {
3489         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3490             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3491                                                     NULL, false, false);
3492             if (!block->colo_cache) {
3493                 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3494                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3495                              block->used_length);
3496                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3497                     if (block->colo_cache) {
3498                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3499                         block->colo_cache = NULL;
3500                     }
3501                 }
3502                 return -errno;
3503             }
3504             if (!machine_dump_guest_core(current_machine)) {
3505                 qemu_madvise(block->colo_cache, block->used_length,
3506                              QEMU_MADV_DONTDUMP);
3507             }
3508         }
3509     }
3510 
3511     /*
3512      * Record the dirty pages sent by the PVM.  We use this dirty bitmap to
3513      * decide which pages in the cache should be flushed into the SVM's RAM.
3514      * Here we use the same name 'ram_bitmap' as for migration.
3515      */
3516     if (ram_bytes_total()) {
3517         RAMBlock *block;
3518 
3519         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3520             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3521             block->bmap = bitmap_new(pages);
3522         }
3523     }
3524 
3525     colo_init_ram_state();
3526     return 0;
3527 }
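
/*
 * Sizing note (illustrative arithmetic, assuming 4 KiB target pages): a
 * 4 GiB RAMBlock has 4 GiB / 4 KiB = 1,048,576 pages, so the per-block
 * bitmap allocated above needs 1,048,576 bits, i.e. 128 KiB.
 */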
3528 
3529 /* TODO: duplicated with ram_init_bitmaps */
3530 void colo_incoming_start_dirty_log(void)
3531 {
3532     RAMBlock *block = NULL;
3533     /* For memory_global_dirty_log_start below. */
3534     qemu_mutex_lock_iothread();
3535     qemu_mutex_lock_ramlist();
3536 
3537     memory_global_dirty_log_sync();
3538     WITH_RCU_READ_LOCK_GUARD() {
3539         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3540             ramblock_sync_dirty_bitmap(ram_state, block);
3541             /* Discard this dirty bitmap record */
3542             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3543         }
3544         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3545     }
3546     ram_state->migration_dirty_pages = 0;
3547     qemu_mutex_unlock_ramlist();
3548     qemu_mutex_unlock_iothread();
3549 }
3550 
3551 /* The global lock must be held to call this helper */
3552 void colo_release_ram_cache(void)
3553 {
3554     RAMBlock *block;
3555 
3556     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3557     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3558         g_free(block->bmap);
3559         block->bmap = NULL;
3560     }
3561 
3562     WITH_RCU_READ_LOCK_GUARD() {
3563         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3564             if (block->colo_cache) {
3565                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3566                 block->colo_cache = NULL;
3567             }
3568         }
3569     }
3570     ram_state_cleanup(&ram_state);
3571 }
3572 
3573 /**
3574  * ram_load_setup: Setup RAM for migration incoming side
3575  *
3576  * Returns zero to indicate success and negative for error
3577  *
3578  * @f: QEMUFile where to receive the data
3579  * @opaque: RAMState pointer
3580  */
3581 static int ram_load_setup(QEMUFile *f, void *opaque)
3582 {
3583     if (compress_threads_load_setup(f)) {
3584         return -1;
3585     }
3586 
3587     xbzrle_load_setup();
3588     ramblock_recv_map_init();
3589 
3590     return 0;
3591 }
3592 
3593 static int ram_load_cleanup(void *opaque)
3594 {
3595     RAMBlock *rb;
3596 
3597     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3598         qemu_ram_block_writeback(rb);
3599     }
3600 
3601     xbzrle_load_cleanup();
3602     compress_threads_load_cleanup();
3603 
3604     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3605         g_free(rb->receivedmap);
3606         rb->receivedmap = NULL;
3607     }
3608 
3609     return 0;
3610 }
3611 
3612 /**
3613  * ram_postcopy_incoming_init: allocate postcopy data structures
3614  *
3615  * Returns 0 for success and negative if there was one error
3616  *
3617  * @mis: current migration incoming state
3618  *
3619  * Allocate the data structures etc. needed by incoming migration with
3620  * postcopy-ram.  postcopy-ram's similarly named
3621  * postcopy_ram_incoming_init does the work.
3622  */
3623 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3624 {
3625     return postcopy_ram_incoming_init(mis);
3626 }
3627 
3628 /**
3629  * ram_load_postcopy: load a page in postcopy case
3630  *
3631  * Returns 0 for success or -errno in case of error
3632  *
3633  * Called in postcopy mode by ram_load().
3634  * rcu_read_lock is taken prior to this being called.
3635  *
3636  * @f: QEMUFile where to send the data
3637  */
3638 static int ram_load_postcopy(QEMUFile *f)
3639 {
3640     int flags = 0, ret = 0;
3641     bool place_needed = false;
3642     bool matches_target_page_size = false;
3643     MigrationIncomingState *mis = migration_incoming_get_current();
3644     /* Temporary page that is later 'placed' */
3645     void *postcopy_host_page = mis->postcopy_tmp_page;
3646     void *host_page = NULL;
3647     bool all_zero = true;
3648     int target_pages = 0;
3649 
3650     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3651         ram_addr_t addr;
3652         void *page_buffer = NULL;
3653         void *place_source = NULL;
3654         RAMBlock *block = NULL;
3655         uint8_t ch;
3656         int len;
3657 
3658         addr = qemu_get_be64(f);
3659 
3660         /*
3661          * If qemu file error, we should stop here, and then "addr"
3662          * may be invalid
3663          */
3664         ret = qemu_file_get_error(f);
3665         if (ret) {
3666             break;
3667         }
3668 
3669         flags = addr & ~TARGET_PAGE_MASK;
3670         addr &= TARGET_PAGE_MASK;
3671 
3672         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3673         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3674                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3675             block = ram_block_from_stream(f, flags);
3676             if (!block) {
3677                 ret = -EINVAL;
3678                 break;
3679             }
3680 
3681             /*
3682              * Relying on used_length is racy and can result in false positives.
3683              * We might place pages beyond used_length in case RAM was shrunk
3684              * while in postcopy, which is fine - trying to place via
3685              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3686              */
3687             if (!block->host || addr >= block->postcopy_length) {
3688                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3689                 ret = -EINVAL;
3690                 break;
3691             }
3692             target_pages++;
3693             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3694             /*
3695              * Postcopy requires that we place whole host pages atomically;
3696              * these may be huge pages for RAMBlocks that are backed by
3697              * hugetlbfs.
3698              * To make it atomic, the data is read into a temporary page
3699              * that's moved into place later.
3700              * The migration protocol uses,  possibly smaller, target-pages
3701              * The migration protocol uses (possibly smaller) target pages;
3702              * however, the source ensures it always sends all the components
3703              */
3704             page_buffer = postcopy_host_page +
3705                           host_page_offset_from_ram_block_offset(block, addr);
3706             /* If all target pages are zero, we can optimise the placement */
3707             if (target_pages == 1) {
3708                 host_page = host_page_from_ram_block_offset(block, addr);
3709             } else if (host_page != host_page_from_ram_block_offset(block,
3710                                                                     addr)) {
3711                 /* not the first target page within the host page */
3712                 error_report("Non-same host page %p/%p", host_page,
3713                              host_page_from_ram_block_offset(block, addr));
3714                 ret = -EINVAL;
3715                 break;
3716             }
3717 
3718             /*
3719              * If it's the last part of a host page then we place the host
3720              * page
3721              */
3722             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3723                 place_needed = true;
3724             }
3725             place_source = postcopy_host_page;
3726         }
3727 
3728         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3729         case RAM_SAVE_FLAG_ZERO:
3730             ch = qemu_get_byte(f);
3731             /*
3732              * We can skip setting page_buffer when this is a zero page
3733              * and (block->page_size == TARGET_PAGE_SIZE).
3734              */
3735             if (ch || !matches_target_page_size) {
3736                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3737             }
3738             if (ch) {
3739                 all_zero = false;
3740             }
3741             break;
3742 
3743         case RAM_SAVE_FLAG_PAGE:
3744             all_zero = false;
3745             if (!matches_target_page_size) {
3746                 /* For huge pages, we always use temporary buffer */
3747                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3748             } else {
3749                 /*
3750                  * For small pages that match the target page size, we
3751                  * avoid the qemu_file copy.  Instead we directly use
3752                  * the buffer of QEMUFile to place the page.  Note: we
3753                  * cannot do any QEMUFile operation before using that
3754                  * buffer to make sure the buffer is valid when
3755                  * placing the page.
3756                  */
3757                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3758                                          TARGET_PAGE_SIZE);
3759             }
3760             break;
3761         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3762             all_zero = false;
3763             len = qemu_get_be32(f);
3764             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3765                 error_report("Invalid compressed data length: %d", len);
3766                 ret = -EINVAL;
3767                 break;
3768             }
3769             decompress_data_with_multi_threads(f, page_buffer, len);
3770             break;
3771 
3772         case RAM_SAVE_FLAG_EOS:
3773             /* normal exit */
3774             multifd_recv_sync_main();
3775             break;
3776         default:
3777             error_report("Unknown combination of migration flags: 0x%x"
3778                          " (postcopy mode)", flags);
3779             ret = -EINVAL;
3780             break;
3781         }
3782 
3783         /* Got the whole host page, wait for decompress before placing. */
3784         if (place_needed) {
3785             ret |= wait_for_decompress_done();
3786         }
3787 
3788         /* Detect any possible file errors */
3789         if (!ret && qemu_file_get_error(f)) {
3790             ret = qemu_file_get_error(f);
3791         }
3792 
3793         if (!ret && place_needed) {
3794             if (all_zero) {
3795                 ret = postcopy_place_page_zero(mis, host_page, block);
3796             } else {
3797                 ret = postcopy_place_page(mis, host_page, place_source,
3798                                           block);
3799             }
3800             place_needed = false;
3801             target_pages = 0;
3802             /* Assume we have a zero page until we detect something different */
3803             all_zero = true;
3804         }
3805     }
3806 
3807     return ret;
3808 }
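
/*
 * Worked example (illustrative, assuming 4 KiB target pages): for a
 * hugetlbfs-backed RAMBlock with a 2 MiB host page size, place_needed only
 * becomes true once target_pages reaches 2 MiB / 4 KiB = 512, i.e. after
 * every target page of the host page has been received into
 * postcopy_host_page.
 */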
3809 
3810 static bool postcopy_is_advised(void)
3811 {
3812     PostcopyState ps = postcopy_state_get();
3813     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3814 }
3815 
3816 static bool postcopy_is_running(void)
3817 {
3818     PostcopyState ps = postcopy_state_get();
3819     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3820 }
3821 
3822 /*
3823  * Flush content of RAM cache into SVM's memory.
3824  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3825  */
3826 void colo_flush_ram_cache(void)
3827 {
3828     RAMBlock *block = NULL;
3829     void *dst_host;
3830     void *src_host;
3831     unsigned long offset = 0;
3832 
3833     memory_global_dirty_log_sync();
3834     WITH_RCU_READ_LOCK_GUARD() {
3835         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3836             ramblock_sync_dirty_bitmap(ram_state, block);
3837         }
3838     }
3839 
3840     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3841     WITH_RCU_READ_LOCK_GUARD() {
3842         block = QLIST_FIRST_RCU(&ram_list.blocks);
3843 
3844         while (block) {
3845             unsigned long num = 0;
3846 
3847             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3848             if (!offset_in_ramblock(block,
3849                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3850                 offset = 0;
3851                 num = 0;
3852                 block = QLIST_NEXT_RCU(block, next);
3853             } else {
3854                 unsigned long i = 0;
3855 
3856                 for (i = 0; i < num; i++) {
3857                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3858                 }
3859                 dst_host = block->host
3860                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3861                 src_host = block->colo_cache
3862                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3863                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3864                 offset += num;
3865             }
3866         }
3867     }
3868     trace_colo_flush_ram_cache_end();
3869 }
3870 
3871 /**
3872  * ram_load_precopy: load pages in precopy case
3873  *
3874  * Returns 0 for success or -errno in case of error
3875  *
3876  * Called in precopy mode by ram_load().
3877  * rcu_read_lock is taken prior to this being called.
3878  *
3879  * @f: QEMUFile where to send the data
3880  */
3881 static int ram_load_precopy(QEMUFile *f)
3882 {
3883     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3884     /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3885     bool postcopy_advised = postcopy_is_advised();
3886     if (!migrate_use_compression()) {
3887         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3888     }
3889 
3890     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3891         ram_addr_t addr, total_ram_bytes;
3892         void *host = NULL, *host_bak = NULL;
3893         uint8_t ch;
3894 
3895         /*
3896          * Yield periodically to let the main loop run, but an iteration of
3897          * the main loop is expensive, so only do it every so many iterations.
3898          */
3899         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3900             aio_co_schedule(qemu_get_current_aio_context(),
3901                             qemu_coroutine_self());
3902             qemu_coroutine_yield();
3903         }
3904         i++;
3905 
3906         addr = qemu_get_be64(f);
3907         flags = addr & ~TARGET_PAGE_MASK;
3908         addr &= TARGET_PAGE_MASK;
3909 
3910         if (flags & invalid_flags) {
3911             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3912                 error_report("Received an unexpected compressed page");
3913             }
3914 
3915             ret = -EINVAL;
3916             break;
3917         }
3918 
3919         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3920                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3921             RAMBlock *block = ram_block_from_stream(f, flags);
3922 
3923             host = host_from_ram_block_offset(block, addr);
3924             /*
3925              * After entering the COLO stage we should not load pages into
3926              * the SVM's memory directly; we put them into colo_cache first.
3927              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3928              * Previously, we copied all of this memory in the COLO preparation
3929              * stage, during which the VM had to be stopped, which is time
3930              * consuming.  Here we optimize it with a trick: back up every page
3931              * during migration while COLO is enabled.  Although this slows the
3932              * migration down, it clearly reduces the downtime of backing up
3933              * all of the SVM's memory in the COLO preparation stage.
3934              */
3935             if (migration_incoming_colo_enabled()) {
3936                 if (migration_incoming_in_colo_state()) {
3937                     /* In COLO stage, put all pages into cache temporarily */
3938                     host = colo_cache_from_block_offset(block, addr, true);
3939                 } else {
3940                    /*
3941                     * In the migration stage but before the COLO stage,
3942                     * put all pages into both the cache and the SVM's memory.
3943                     */
3944                     host_bak = colo_cache_from_block_offset(block, addr, false);
3945                 }
3946             }
3947             if (!host) {
3948                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3949                 ret = -EINVAL;
3950                 break;
3951             }
3952             if (!migration_incoming_in_colo_state()) {
3953                 ramblock_recv_bitmap_set(block, host);
3954             }
3955 
3956             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3957         }
3958 
3959         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3960         case RAM_SAVE_FLAG_MEM_SIZE:
3961             /* Synchronize RAM block list */
3962             total_ram_bytes = addr;
3963             while (!ret && total_ram_bytes) {
3964                 RAMBlock *block;
3965                 char id[256];
3966                 ram_addr_t length;
3967 
3968                 len = qemu_get_byte(f);
3969                 qemu_get_buffer(f, (uint8_t *)id, len);
3970                 id[len] = 0;
3971                 length = qemu_get_be64(f);
3972 
3973                 block = qemu_ram_block_by_name(id);
3974                 if (block && !qemu_ram_is_migratable(block)) {
3975                     error_report("block %s should not be migrated!", id);
3976                     ret = -EINVAL;
3977                 } else if (block) {
3978                     if (length != block->used_length) {
3979                         Error *local_err = NULL;
3980 
3981                         ret = qemu_ram_resize(block, length,
3982                                               &local_err);
3983                         if (local_err) {
3984                             error_report_err(local_err);
3985                         }
3986                     }
3987                     /* For postcopy we need to check hugepage sizes match */
3988                     if (postcopy_advised && migrate_postcopy_ram() &&
3989                         block->page_size != qemu_host_page_size) {
3990                         uint64_t remote_page_size = qemu_get_be64(f);
3991                         if (remote_page_size != block->page_size) {
3992                             error_report("Mismatched RAM page size %s "
3993                                          "(local) %zd != %" PRId64,
3994                                          id, block->page_size,
3995                                          remote_page_size);
3996                             ret = -EINVAL;
3997                         }
3998                     }
3999                     if (migrate_ignore_shared()) {
4000                         hwaddr addr = qemu_get_be64(f);
4001                         if (ramblock_is_ignored(block) &&
4002                             block->mr->addr != addr) {
4003                             error_report("Mismatched GPAs for block %s "
4004                                          "%" PRId64 " != %" PRId64,
4005                                          id, (uint64_t)addr,
4006                                          (uint64_t)block->mr->addr);
4007                             ret = -EINVAL;
4008                         }
4009                     }
4010                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4011                                           block->idstr);
4012                 } else {
4013                     error_report("Unknown ramblock \"%s\", cannot "
4014                                  "accept migration", id);
4015                     ret = -EINVAL;
4016                 }
4017 
4018                 total_ram_bytes -= length;
4019             }
4020             break;
4021 
4022         case RAM_SAVE_FLAG_ZERO:
4023             ch = qemu_get_byte(f);
4024             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4025             break;
4026 
4027         case RAM_SAVE_FLAG_PAGE:
4028             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4029             break;
4030 
4031         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4032             len = qemu_get_be32(f);
4033             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4034                 error_report("Invalid compressed data length: %d", len);
4035                 ret = -EINVAL;
4036                 break;
4037             }
4038             decompress_data_with_multi_threads(f, host, len);
4039             break;
4040 
4041         case RAM_SAVE_FLAG_XBZRLE:
4042             if (load_xbzrle(f, addr, host) < 0) {
4043                 error_report("Failed to decompress XBZRLE page at "
4044                              RAM_ADDR_FMT, addr);
4045                 ret = -EINVAL;
4046                 break;
4047             }
4048             break;
4049         case RAM_SAVE_FLAG_EOS:
4050             /* normal exit */
4051             multifd_recv_sync_main();
4052             break;
4053         default:
4054             if (flags & RAM_SAVE_FLAG_HOOK) {
4055                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4056             } else {
4057                 error_report("Unknown combination of migration flags: 0x%x",
4058                              flags);
4059                 ret = -EINVAL;
4060             }
4061         }
4062         if (!ret) {
4063             ret = qemu_file_get_error(f);
4064         }
4065         if (!ret && host_bak) {
4066             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4067         }
4068     }
4069 
4070     ret |= wait_for_decompress_done();
4071     return ret;
4072 }
4073 
4074 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4075 {
4076     int ret = 0;
4077     static uint64_t seq_iter;
4078     /*
4079      * If the system is running in postcopy mode, page inserts into host
4080      * memory must be atomic.
4081      */
4082     bool postcopy_running = postcopy_is_running();
4083 
4084     seq_iter++;
4085 
4086     if (version_id != 4) {
4087         return -EINVAL;
4088     }
4089 
4090     /*
4091      * This RCU critical section can be very long running.
4092      * When RCU reclamations in the code become numerous,
4093      * it will be necessary to reduce the granularity of this
4094      * critical section.
4095      */
4096     WITH_RCU_READ_LOCK_GUARD() {
4097         if (postcopy_running) {
4098             ret = ram_load_postcopy(f);
4099         } else {
4100             ret = ram_load_precopy(f);
4101         }
4102     }
4103     trace_ram_load_complete(ret, seq_iter);
4104 
4105     return ret;
4106 }
4107 
4108 static bool ram_has_postcopy(void *opaque)
4109 {
4110     RAMBlock *rb;
4111     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4112         if (ramblock_is_pmem(rb)) {
4113             info_report("Block: %s, host: %p is NVDIMM memory; postcopy "
4114                          "is not supported yet!", rb->idstr, rb->host);
4115             return false;
4116         }
4117     }
4118 
4119     return migrate_postcopy_ram();
4120 }
4121 
4122 /* Sync all the dirty bitmap with destination VM.  */
4123 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4124 {
4125     RAMBlock *block;
4126     QEMUFile *file = s->to_dst_file;
4127     int ramblock_count = 0;
4128 
4129     trace_ram_dirty_bitmap_sync_start();
4130 
4131     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4132         qemu_savevm_send_recv_bitmap(file, block->idstr);
4133         trace_ram_dirty_bitmap_request(block->idstr);
4134         ramblock_count++;
4135     }
4136 
4137     trace_ram_dirty_bitmap_sync_wait();
4138 
4139     /* Wait until all the ramblocks' dirty bitmaps are synced */
4140     while (ramblock_count--) {
4141         qemu_sem_wait(&s->rp_state.rp_sem);
4142     }
4143 
4144     trace_ram_dirty_bitmap_sync_complete();
4145 
4146     return 0;
4147 }
4148 
4149 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4150 {
4151     qemu_sem_post(&s->rp_state.rp_sem);
4152 }
4153 
4154 /*
4155  * Read the received bitmap, revert it as the initial dirty bitmap.
4156  * This is only used when the postcopy migration is paused but wants
4157  * to resume from a middle point.
4158  */
4159 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4160 {
4161     int ret = -EINVAL;
4162     /* from_dst_file is always valid because we're within rp_thread */
4163     QEMUFile *file = s->rp_state.from_dst_file;
4164     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4165     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4166     uint64_t size, end_mark;
4167 
4168     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4169 
4170     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4171         error_report("%s: incorrect state %s", __func__,
4172                      MigrationStatus_str(s->state));
4173         return -EINVAL;
4174     }
4175 
4176     /*
4177      * Note: see comments in ramblock_recv_bitmap_send() on why we
4178      * need the endianness conversion, and the paddings.
4179      */
4180     local_size = ROUND_UP(local_size, 8);
4181 
4182     /* Add paddings */
4183     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4184 
4185     size = qemu_get_be64(file);
4186 
4187     /* The size of the bitmap should match with our ramblock */
4188     if (size != local_size) {
4189         error_report("%s: ramblock '%s' bitmap size mismatch "
4190                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4191                      block->idstr, size, local_size);
4192         ret = -EINVAL;
4193         goto out;
4194     }
4195 
4196     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4197     end_mark = qemu_get_be64(file);
4198 
4199     ret = qemu_file_get_error(file);
4200     if (ret || size != local_size) {
4201         error_report("%s: read bitmap failed for ramblock '%s': %d"
4202                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4203                      __func__, block->idstr, ret, local_size, size);
4204         ret = -EIO;
4205         goto out;
4206     }
4207 
4208     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4209         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4210                      __func__, block->idstr, end_mark);
4211         ret = -EINVAL;
4212         goto out;
4213     }
4214 
4215     /*
4216      * Endianness conversion. We are during postcopy (though paused).
4217      * The dirty bitmap won't change. We can directly modify it.
4218      */
4219     bitmap_from_le(block->bmap, le_bitmap, nbits);
4220 
4221     /*
4222      * What we received is "received bitmap". Revert it as the initial
4223      * dirty bitmap for this ramblock.
4224      */
4225     bitmap_complement(block->bmap, block->bmap, nbits);
4226 
4227     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4228     ramblock_dirty_bitmap_clear_discarded_pages(block);
4229 
4230     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4231     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4232 
4233     /*
4234      * We succeeded in syncing the bitmap for the current ramblock. If this is
4235      * the last one to sync, we need to notify the main send thread.
4236      */
4237     ram_dirty_bitmap_reload_notify(s);
4238 
4239     ret = 0;
4240 out:
4241     g_free(le_bitmap);
4242     return ret;
4243 }
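
/*
 * Illustrative sketch (an assumption about the sender, mirroring what the
 * reader above expects): the received-bitmap message is a big-endian size,
 * the little-endian bitmap padded to a multiple of 8 bytes, and a final
 * RAMBLOCK_RECV_BITMAP_ENDING marker.
 */
static G_GNUC_UNUSED void recv_bitmap_put_sketch(QEMUFile *f,
                                                 const uint8_t *le_bitmap,
                                                 uint64_t size)
{
    qemu_put_be64(f, size);                        /* bitmap size in bytes */
    qemu_put_buffer(f, le_bitmap, size);           /* little-endian payload */
    qemu_put_be64(f, RAMBLOCK_RECV_BITMAP_ENDING); /* end mark */
}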
4244 
4245 static int ram_resume_prepare(MigrationState *s, void *opaque)
4246 {
4247     RAMState *rs = *(RAMState **)opaque;
4248     int ret;
4249 
4250     ret = ram_dirty_bitmap_sync_all(s, rs);
4251     if (ret) {
4252         return ret;
4253     }
4254 
4255     ram_state_resume_prepare(rs, s->to_dst_file);
4256 
4257     return 0;
4258 }
4259 
4260 static SaveVMHandlers savevm_ram_handlers = {
4261     .save_setup = ram_save_setup,
4262     .save_live_iterate = ram_save_iterate,
4263     .save_live_complete_postcopy = ram_save_complete,
4264     .save_live_complete_precopy = ram_save_complete,
4265     .has_postcopy = ram_has_postcopy,
4266     .save_live_pending = ram_save_pending,
4267     .load_state = ram_load,
4268     .save_cleanup = ram_save_cleanup,
4269     .load_setup = ram_load_setup,
4270     .load_cleanup = ram_load_cleanup,
4271     .resume_prepare = ram_resume_prepare,
4272 };
4273 
4274 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4275                                       size_t old_size, size_t new_size)
4276 {
4277     PostcopyState ps = postcopy_state_get();
4278     ram_addr_t offset;
4279     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4280     Error *err = NULL;
4281 
4282     if (ramblock_is_ignored(rb)) {
4283         return;
4284     }
4285 
4286     if (!migration_is_idle()) {
4287         /*
4288          * Precopy code on the source cannot deal with the size of RAM blocks
4289          * changing at random points in time - especially after sending the
4290          * RAM block sizes in the migration stream, they must no longer change.
4291          * Abort and indicate a proper reason.
4292          */
4293         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4294         migration_cancel(err);
4295         error_free(err);
4296     }
4297 
4298     switch (ps) {
4299     case POSTCOPY_INCOMING_ADVISE:
4300         /*
4301          * Update what ram_postcopy_incoming_init()->init_range() does at the
4302          * time postcopy was advised. Syncing RAM blocks with the source will
4303          * result in RAM resizes.
4304          */
4305         if (old_size < new_size) {
4306             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4307                 error_report("RAM block '%s' discard of resized RAM failed",
4308                              rb->idstr);
4309             }
4310         }
4311         rb->postcopy_length = new_size;
4312         break;
4313     case POSTCOPY_INCOMING_NONE:
4314     case POSTCOPY_INCOMING_RUNNING:
4315     case POSTCOPY_INCOMING_END:
4316         /*
4317          * Once our guest is running, postcopy no longer cares about
4318          * resizes. When growing, the new memory was not available on the
4319          * source, so no handler is needed.
4320          */
4321         break;
4322     default:
4323         error_report("RAM block '%s' resized during postcopy state: %d",
4324                      rb->idstr, ps);
4325         exit(-1);
4326     }
4327 }
4328 
4329 static RAMBlockNotifier ram_mig_ram_notifier = {
4330     .ram_block_resized = ram_mig_ram_block_resized,
4331 };
4332 
4333 void ram_mig_init(void)
4334 {
4335     qemu_mutex_init(&XBZRLE.lock);
4336     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4337     ram_block_notifier_add(&ram_mig_ram_notifier);
4338 }
4339