xref: /openbmc/qemu/migration/ram.c (revision dbd9e084)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58 
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
62 
63 /***********************************************************/
64 /* ram save/restore */
65 
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
67  * worked for pages that were filled with the same char.  We switched
68  * it to only search for the zero value, and renamed it to avoid
69  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
70  */
71 
72 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO     0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE     0x08
76 #define RAM_SAVE_FLAG_EOS      0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE   0x40
79 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
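/*
 * Each page record on the wire starts with a 64-bit word that holds the page
 * offset within its RAMBlock OR'ed with the RAM_SAVE_FLAG_* values above;
 * see save_page_header() below.
 */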
81 
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
83 {
84     return buffer_is_zero(p, size);
85 }
86 
87 XBZRLECacheStats xbzrle_counters;
88 
89 /* struct containing the XBZRLE cache and the static page buffers
90    used by the compression */
91 static struct {
92     /* buffer used for XBZRLE encoding */
93     uint8_t *encoded_buf;
94     /* buffer for storing page content */
95     uint8_t *current_buf;
96     /* Cache for XBZRLE, Protected by lock. */
97     PageCache *cache;
98     QemuMutex lock;
99     /* it will store a page full of zeros */
100     uint8_t *zero_target_page;
101     /* buffer used for XBZRLE decoding */
102     uint8_t *decoded_buf;
103 } XBZRLE;
104 
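/*
 * XBZRLE.lock protects the cache against a concurrent resize from the main
 * thread (xbzrle_cache_resize()) while the migration thread performs lookups
 * and inserts.  It is only taken when XBZRLE is in use.
 */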
105 static void XBZRLE_cache_lock(void)
106 {
107     if (migrate_use_xbzrle()) {
108         qemu_mutex_lock(&XBZRLE.lock);
109     }
110 }
111 
112 static void XBZRLE_cache_unlock(void)
113 {
114     if (migrate_use_xbzrle()) {
115         qemu_mutex_unlock(&XBZRLE.lock);
116     }
117 }
118 
119 /**
120  * xbzrle_cache_resize: resize the xbzrle cache
121  *
122  * This function is called from migrate_params_apply in the main
123  * thread, possibly while a migration is in progress.  A running
124  * migration may be using the cache and might finish during this call,
125  * hence changes to the cache are protected by XBZRLE.lock.
126  *
127  * Returns 0 for success or -1 for error
128  *
129  * @new_size: new cache size
130  * @errp: set to the reason if the check failed
131  */
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
133 {
134     PageCache *new_cache;
135     int64_t ret = 0;
136 
137     /* Check for truncation */
138     if (new_size != (size_t)new_size) {
139         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140                    "exceeding address space");
141         return -1;
142     }
143 
144     if (new_size == migrate_xbzrle_cache_size()) {
145         /* nothing to do */
146         return 0;
147     }
148 
149     XBZRLE_cache_lock();
150 
151     if (XBZRLE.cache != NULL) {
152         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153         if (!new_cache) {
154             ret = -1;
155             goto out;
156         }
157 
158         cache_fini(XBZRLE.cache);
159         XBZRLE.cache = new_cache;
160     }
161 out:
162     XBZRLE_cache_unlock();
163     return ret;
164 }
165 
166 bool ramblock_is_ignored(RAMBlock *block)
167 {
168     return !qemu_ram_is_migratable(block) ||
169            (migrate_ignore_shared() && qemu_ram_is_shared(block));
170 }
171 
172 #undef RAMBLOCK_FOREACH
173 
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
175 {
176     RAMBlock *block;
177     int ret = 0;
178 
179     RCU_READ_LOCK_GUARD();
180 
181     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182         ret = func(block, opaque);
183         if (ret) {
184             break;
185         }
186     }
187     return ret;
188 }
189 
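/*
 * receivedmap keeps one bit per target page of a RAMBlock, recording whether
 * the destination has already received that page.  It is allocated on the
 * incoming side and consulted mainly during postcopy.
 */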
190 static void ramblock_recv_map_init(void)
191 {
192     RAMBlock *rb;
193 
194     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195         assert(!rb->receivedmap);
196         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197     }
198 }
199 
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
201 {
202     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203                     rb->receivedmap);
204 }
205 
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
207 {
208     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
209 }
210 
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
212 {
213     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
214 }
215 
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217                                     size_t nr)
218 {
219     bitmap_set_atomic(rb->receivedmap,
220                       ramblock_recv_bitmap_offset(host_addr, rb),
221                       nr);
222 }
223 
224 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
225 
226 /*
227  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
228  *
229  * Returns the number of bytes sent (>0) on success, or <0 on error.
230  */
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232                                   const char *block_name)
233 {
234     RAMBlock *block = qemu_ram_block_by_name(block_name);
235     unsigned long *le_bitmap, nbits;
236     uint64_t size;
237 
238     if (!block) {
239         error_report("%s: invalid block name: %s", __func__, block_name);
240         return -1;
241     }
242 
243     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
244 
245     /*
246      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247      * machines we may need 4 more bytes for padding (see below
248      * comment). So extend it a bit beforehand.
249      */
250     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
251 
252     /*
253      * Always use little endian when sending the bitmap, so that it is
254      * interpreted correctly even when the source and destination VMs
255      * do not use the same endianness. (Note: big endian won't work.)
256      */
257     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
258 
259     /* Size of the bitmap, in bytes */
260     size = DIV_ROUND_UP(nbits, 8);
261 
262     /*
263      * size is always aligned to 8 bytes on 64bit machines, but that
264      * may not be true on 32bit machines. We need this padding to
265      * make sure the migration can survive even between 32bit and
266      * 64bit machines.
267      */
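    /* e.g. nbits = 100 gives size = 13, rounded up to 16 bytes */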
268     size = ROUND_UP(size, 8);
269 
270     qemu_put_be64(file, size);
271     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
272     /*
273      * Write an end marker, so the destination can detect if the middle
274      * part got corrupted for some "mysterious" reason.
275      */
276     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277     qemu_fflush(file);
278 
279     g_free(le_bitmap);
280 
281     if (qemu_file_get_error(file)) {
282         return qemu_file_get_error(file);
283     }
284 
285     return size + sizeof(size);
286 }
287 
288 /*
289  * An outstanding page request, on the source, having been received
290  * and queued
291  */
292 struct RAMSrcPageRequest {
293     RAMBlock *rb;
294     hwaddr    offset;
295     hwaddr    len;
296 
297     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
298 };
299 
300 /* State of RAM for migration */
301 struct RAMState {
302     /* QEMUFile used for this migration */
303     QEMUFile *f;
304     /* UFFD file descriptor, used in 'write-tracking' migration */
305     int uffdio_fd;
306     /* Last block that we have visited searching for dirty pages */
307     RAMBlock *last_seen_block;
308     /* Last block from where we have sent data */
309     RAMBlock *last_sent_block;
310     /* Last dirty target page we have sent */
311     ram_addr_t last_page;
312     /* last ram version we have seen */
313     uint32_t last_version;
314     /* How many times we have dirtied too many pages */
315     int dirty_rate_high_cnt;
316     /* these variables are used for bitmap sync */
317     /* last time we did a full bitmap_sync */
318     int64_t time_last_bitmap_sync;
319     /* bytes transferred at the start of the current period */
320     uint64_t bytes_xfer_prev;
321     /* number of dirty pages since the start of the current period */
322     uint64_t num_dirty_pages_period;
323     /* xbzrle misses since the beginning of the period */
324     uint64_t xbzrle_cache_miss_prev;
325     /* Amount of xbzrle pages since the beginning of the period */
326     uint64_t xbzrle_pages_prev;
327     /* Amount of xbzrle encoded bytes since the beginning of the period */
328     uint64_t xbzrle_bytes_prev;
329     /* Start using XBZRLE (e.g., after the first round). */
330     bool xbzrle_enabled;
331 
332     /* compression statistics since the beginning of the period */
333     /* number of times there was no free thread to compress data */
334     uint64_t compress_thread_busy_prev;
335     /* number of bytes after compression */
336     uint64_t compressed_size_prev;
337     /* number of compressed pages */
338     uint64_t compress_pages_prev;
339 
340     /* total handled target pages at the beginning of period */
341     uint64_t target_page_count_prev;
342     /* total handled target pages since start */
343     uint64_t target_page_count;
344     /* number of dirty bits in the bitmap */
345     uint64_t migration_dirty_pages;
346     /* Protects modification of the bitmap and migration dirty pages */
347     QemuMutex bitmap_mutex;
348     /* The RAMBlock used in the last src_page_requests */
349     RAMBlock *last_req_rb;
350     /* Queue of outstanding page requests from the destination */
351     QemuMutex src_page_req_mutex;
352     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
353 };
354 typedef struct RAMState RAMState;
355 
356 static RAMState *ram_state;
357 
358 static NotifierWithReturnList precopy_notifier_list;
359 
360 void precopy_infrastructure_init(void)
361 {
362     notifier_with_return_list_init(&precopy_notifier_list);
363 }
364 
365 void precopy_add_notifier(NotifierWithReturn *n)
366 {
367     notifier_with_return_list_add(&precopy_notifier_list, n);
368 }
369 
370 void precopy_remove_notifier(NotifierWithReturn *n)
371 {
372     notifier_with_return_remove(n);
373 }
374 
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
376 {
377     PrecopyNotifyData pnd;
378     pnd.reason = reason;
379     pnd.errp = errp;
380 
381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382 }
383 
384 uint64_t ram_bytes_remaining(void)
385 {
386     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
387                        0;
388 }
389 
390 MigrationStats ram_counters;
391 
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394     /* Current block being searched */
395     RAMBlock    *block;
396     /* Current page to search from */
397     unsigned long page;
398     /* Set once we wrap around */
399     bool         complete_round;
400 };
401 typedef struct PageSearchStatus PageSearchStatus;
402 
403 CompressionStats compression_counters;
404 
405 struct CompressParam {
406     bool done;
407     bool quit;
408     bool zero_page;
409     QEMUFile *file;
410     QemuMutex mutex;
411     QemuCond cond;
412     RAMBlock *block;
413     ram_addr_t offset;
414 
415     /* internally used fields */
416     z_stream stream;
417     uint8_t *originbuf;
418 };
419 typedef struct CompressParam CompressParam;
420 
421 struct DecompressParam {
422     bool done;
423     bool quit;
424     QemuMutex mutex;
425     QemuCond cond;
426     void *des;
427     uint8_t *compbuf;
428     int len;
429     z_stream stream;
430 };
431 typedef struct DecompressParam DecompressParam;
432 
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436  * one of the compression threads has finished the compression.
437  * comp_done_lock is used together with comp_done_cond.
438  */
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
443 
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
449 
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451                                  ram_addr_t offset, uint8_t *source_buf);
452 
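/*
 * do_data_compress: body of a compression worker thread.  It waits on
 * param->cond for a page to be posted in param->block/param->offset,
 * compresses it into param->file, then marks itself done and signals
 * comp_done_cond so the migration thread can collect the output.
 */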
453 static void *do_data_compress(void *opaque)
454 {
455     CompressParam *param = opaque;
456     RAMBlock *block;
457     ram_addr_t offset;
458     bool zero_page;
459 
460     qemu_mutex_lock(&param->mutex);
461     while (!param->quit) {
462         if (param->block) {
463             block = param->block;
464             offset = param->offset;
465             param->block = NULL;
466             qemu_mutex_unlock(&param->mutex);
467 
468             zero_page = do_compress_ram_page(param->file, &param->stream,
469                                              block, offset, param->originbuf);
470 
471             qemu_mutex_lock(&comp_done_lock);
472             param->done = true;
473             param->zero_page = zero_page;
474             qemu_cond_signal(&comp_done_cond);
475             qemu_mutex_unlock(&comp_done_lock);
476 
477             qemu_mutex_lock(&param->mutex);
478         } else {
479             qemu_cond_wait(&param->cond, &param->mutex);
480         }
481     }
482     qemu_mutex_unlock(&param->mutex);
483 
484     return NULL;
485 }
486 
487 static void compress_threads_save_cleanup(void)
488 {
489     int i, thread_count;
490 
491     if (!migrate_use_compression() || !comp_param) {
492         return;
493     }
494 
495     thread_count = migrate_compress_threads();
496     for (i = 0; i < thread_count; i++) {
497         /*
498          * we use it as an indicator of whether the thread has been
499          * properly initialized or not
500          */
501         if (!comp_param[i].file) {
502             break;
503         }
504 
505         qemu_mutex_lock(&comp_param[i].mutex);
506         comp_param[i].quit = true;
507         qemu_cond_signal(&comp_param[i].cond);
508         qemu_mutex_unlock(&comp_param[i].mutex);
509 
510         qemu_thread_join(compress_threads + i);
511         qemu_mutex_destroy(&comp_param[i].mutex);
512         qemu_cond_destroy(&comp_param[i].cond);
513         deflateEnd(&comp_param[i].stream);
514         g_free(comp_param[i].originbuf);
515         qemu_fclose(comp_param[i].file);
516         comp_param[i].file = NULL;
517     }
518     qemu_mutex_destroy(&comp_done_lock);
519     qemu_cond_destroy(&comp_done_cond);
520     g_free(compress_threads);
521     g_free(comp_param);
522     compress_threads = NULL;
523     comp_param = NULL;
524 }
525 
526 static int compress_threads_save_setup(void)
527 {
528     int i, thread_count;
529 
530     if (!migrate_use_compression()) {
531         return 0;
532     }
533     thread_count = migrate_compress_threads();
534     compress_threads = g_new0(QemuThread, thread_count);
535     comp_param = g_new0(CompressParam, thread_count);
536     qemu_cond_init(&comp_done_cond);
537     qemu_mutex_init(&comp_done_lock);
538     for (i = 0; i < thread_count; i++) {
539         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540         if (!comp_param[i].originbuf) {
541             goto exit;
542         }
543 
544         if (deflateInit(&comp_param[i].stream,
545                         migrate_compress_level()) != Z_OK) {
546             g_free(comp_param[i].originbuf);
547             goto exit;
548         }
549 
550         /* comp_param[i].file is just used as a dummy buffer to save data,
551          * set its ops to empty.
552          */
553         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
554         comp_param[i].done = true;
555         comp_param[i].quit = false;
556         qemu_mutex_init(&comp_param[i].mutex);
557         qemu_cond_init(&comp_param[i].cond);
558         qemu_thread_create(compress_threads + i, "compress",
559                            do_data_compress, comp_param + i,
560                            QEMU_THREAD_JOINABLE);
561     }
562     return 0;
563 
564 exit:
565     compress_threads_save_cleanup();
566     return -1;
567 }
568 
569 /**
570  * save_page_header: write page header to wire
571  *
572  * If the block differs from the last one sent, it also writes the block identification
573  *
574  * Returns the number of bytes written
575  *
576  * @f: QEMUFile where to send the data
577  * @block: block that contains the page we want to send
578  * @offset: offset inside the block for the page
579  *          in the lower bits, it contains flags
580  */
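/*
 * Resulting wire layout (a sketch derived from the code below):
 *   8 bytes: offset | RAM_SAVE_FLAG_* flags
 *   if RAM_SAVE_FLAG_CONTINUE is not set:
 *     1 byte : strlen(block->idstr)
 *     N bytes: block->idstr (not NUL-terminated)
 */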
581 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
582                                ram_addr_t offset)
583 {
584     size_t size, len;
585 
586     if (block == rs->last_sent_block) {
587         offset |= RAM_SAVE_FLAG_CONTINUE;
588     }
589     qemu_put_be64(f, offset);
590     size = 8;
591 
592     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593         len = strlen(block->idstr);
594         qemu_put_byte(f, len);
595         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596         size += 1 + len;
597         rs->last_sent_block = block;
598     }
599     return size;
600 }
601 
602 /**
603  * mig_throttle_guest_down: throttle down the guest
604  *
605  * Reduce amount of guest cpu execution to hopefully slow down memory
606  * writes. If guest dirty memory rate is reduced below the rate at
607  * which we can transfer pages to the destination then we should be
608  * able to complete migration. Some workloads dirty memory way too
609  * fast and will not effectively converge, even with auto-converge.
610  */
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612                                     uint64_t bytes_dirty_threshold)
613 {
614     MigrationState *s = migrate_get_current();
615     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618     int pct_max = s->parameters.max_cpu_throttle;
619 
620     uint64_t throttle_now = cpu_throttle_get_percentage();
621     uint64_t cpu_now, cpu_ideal, throttle_inc;
622 
623     /* We have not started throttling yet. Let's start it. */
624     if (!cpu_throttle_active()) {
625         cpu_throttle_set(pct_initial);
626     } else {
627         /* Throttling already on, just increase the rate */
628         if (!pct_tailslow) {
629             throttle_inc = pct_increment;
630         } else {
631             /* Compute the ideal CPU percentage used by the guest, which
632              * should make the dirty rate match the dirty rate threshold. */
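            /* Illustrative example: throttle at 40% gives cpu_now = 60; if the
             * guest dirtied twice the threshold, cpu_ideal = 30 and the
             * increment is capped at pct_increment. */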
633             cpu_now = 100 - throttle_now;
634             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635                         bytes_dirty_period);
636             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
637         }
638         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
639     }
640 }
641 
642 /**
643  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
644  *
645  * @rs: current RAM state
646  * @current_addr: address for the zero page
647  *
648  * Update the xbzrle cache to reflect a page that's been sent as all 0.
649  * The important thing is that a stale (not-yet-0'd) page be replaced
650  * by the new data.
651  * As a bonus, if the page wasn't in the cache it gets added so that
652  * when a small write is made into the 0'd page it gets XBZRLE sent.
653  */
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
655 {
656     if (!rs->xbzrle_enabled) {
657         return;
658     }
659 
660     /* We don't care if this fails to allocate a new cache page
661      * as long as it updated an old one */
662     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663                  ram_counters.dirty_sync_count);
664 }
665 
666 #define ENCODING_FLAG_XBZRLE 0x1
667 
668 /**
669  * save_xbzrle_page: compress and send current page
670  *
671  * Returns: 1 means that we wrote the page
672  *          0 means that page is identical to the one already sent
673  *          -1 means that xbzrle would be longer than normal
674  *
675  * @rs: current RAM state
676  * @current_data: pointer to the address of the page contents
677  * @current_addr: addr of the page
678  * @block: block that contains the page we want to send
679  * @offset: offset inside the block for the page
680  * @last_stage: if we are at the completion stage
681  */
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683                             ram_addr_t current_addr, RAMBlock *block,
684                             ram_addr_t offset, bool last_stage)
685 {
686     int encoded_len = 0, bytes_xbzrle;
687     uint8_t *prev_cached_page;
688 
689     if (!cache_is_cached(XBZRLE.cache, current_addr,
690                          ram_counters.dirty_sync_count)) {
691         xbzrle_counters.cache_miss++;
692         if (!last_stage) {
693             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694                              ram_counters.dirty_sync_count) == -1) {
695                 return -1;
696             } else {
697                 /* update *current_data when the page has been
698                    inserted into the cache */
699                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
700             }
701         }
702         return -1;
703     }
704 
705     /*
706      * Reaching here means the page has hit the xbzrle cache, no matter what
707      * encoding result it is (normal encoding, overflow or skipping the page),
708      * count the page as encoded. This is used to calculate the encoding rate.
709      *
710      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711      * 2nd page turns out to be skipped (i.e. no new bytes written to the
712      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713      * skipped page included. In this way, the encoding rate can tell if the
714      * guest page is good for xbzrle encoding.
715      */
716     xbzrle_counters.pages++;
717     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
718 
719     /* save current buffer into memory */
720     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
721 
722     /* XBZRLE encoding (if there is no overflow) */
723     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725                                        TARGET_PAGE_SIZE);
726 
727     /*
728      * Update the cache contents, so that it corresponds to the data
729      * sent, in all cases except where we skip the page.
730      */
731     if (!last_stage && encoded_len != 0) {
732         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
733         /*
734          * In the case where we couldn't compress, ensure that the caller
735          * sends the data from the cache, since the guest might have
736          * changed the RAM since we copied it.
737          */
738         *current_data = prev_cached_page;
739     }
740 
741     if (encoded_len == 0) {
742         trace_save_xbzrle_page_skipping();
743         return 0;
744     } else if (encoded_len == -1) {
745         trace_save_xbzrle_page_overflow();
746         xbzrle_counters.overflow++;
747         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748         return -1;
749     }
750 
751     /* Send XBZRLE based compressed page */
752     bytes_xbzrle = save_page_header(rs, rs->f, block,
753                                     offset | RAM_SAVE_FLAG_XBZRLE);
754     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755     qemu_put_be16(rs->f, encoded_len);
756     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
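    /* 1 byte for ENCODING_FLAG_XBZRLE plus 2 bytes for the be16 encoded_len */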
757     bytes_xbzrle += encoded_len + 1 + 2;
758     /*
759      * Like compressed_size (please see update_compress_thread_counts),
760      * the xbzrle encoded bytes don't count the 8 byte header with
761      * RAM_SAVE_FLAG_CONTINUE.
762      */
763     xbzrle_counters.bytes += bytes_xbzrle - 8;
764     ram_counters.transferred += bytes_xbzrle;
765 
766     return 1;
767 }
768 
769 /**
770  * migration_bitmap_find_dirty: find the next dirty page from start
771  *
772  * Returns the page offset within memory region of the start of a dirty page
773  *
774  * @rs: current RAM state
775  * @rb: RAMBlock where to search for dirty pages
776  * @start: page where we start the search
777  */
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780                                           unsigned long start)
781 {
782     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783     unsigned long *bitmap = rb->bmap;
784 
785     if (ramblock_is_ignored(rb)) {
786         return size;
787     }
788 
789     return find_next_bit(bitmap, size, start);
790 }
791 
792 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
793                                                        unsigned long page)
794 {
795     uint8_t shift;
796     hwaddr size, start;
797 
798     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
799         return;
800     }
801 
802     shift = rb->clear_bmap_shift;
803     /*
804      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It can
805      * make things easier sometimes, since the start address of each
806      * small chunk is then always aligned to 64 pages, so the bitmap
807      * is always aligned to an unsigned long.  We should even be able
808      * to remove this restriction, but we simply keep it for
809      * simplicity.
810      */
811     assert(shift >= 6);
812 
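    /* e.g. shift = 18 with 4KiB target pages means 1GiB is cleared per chunk */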
813     size = 1ULL << (TARGET_PAGE_BITS + shift);
814     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
815     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
816     memory_region_clear_dirty_bitmap(rb->mr, start, size);
817 }
818 
819 static void
820 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
821                                                  unsigned long start,
822                                                  unsigned long npages)
823 {
824     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
825     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
826     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
827 
828     /*
829      * Clear pages from start to start + npages - 1, so the end boundary is
830      * exclusive.
831      */
832     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
833         migration_clear_memory_region_dirty_bitmap(rb, i);
834     }
835 }
836 
837 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
838                                                 RAMBlock *rb,
839                                                 unsigned long page)
840 {
841     bool ret;
842 
843     /*
844      * Clear the dirty bitmap if needed.  This _must_ be called before we
845      * send any page in the chunk, because we need to make sure we can
846      * capture further page content changes the next time we sync the
847      * dirty log.  So as long as we are going to send any page in the
848      * chunk, we clear the remote dirty bitmap for the whole chunk.
849      * Clearing it earlier won't be a problem, but too late will.
850      */
851     migration_clear_memory_region_dirty_bitmap(rb, page);
852 
853     ret = test_and_clear_bit(page, rb->bmap);
854     if (ret) {
855         rs->migration_dirty_pages--;
856     }
857 
858     return ret;
859 }
860 
861 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
862                                        void *opaque)
863 {
864     const hwaddr offset = section->offset_within_region;
865     const hwaddr size = int128_get64(section->size);
866     const unsigned long start = offset >> TARGET_PAGE_BITS;
867     const unsigned long npages = size >> TARGET_PAGE_BITS;
868     RAMBlock *rb = section->mr->ram_block;
869     uint64_t *cleared_bits = opaque;
870 
871     /*
872      * We don't grab ram_state->bitmap_mutex because we expect to run
873      * only when starting migration or during postcopy recovery where
874      * we don't have concurrent access.
875      */
876     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
877         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
878     }
879     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
880     bitmap_clear(rb->bmap, start, npages);
881 }
882 
883 /*
884  * Exclude all dirty pages from migration that fall into a discarded range as
885  * managed by a RamDiscardManager responsible for the mapped memory region of
886  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
887  *
888  * Discarded pages ("logically unplugged") have undefined content and must
889  * not get migrated, because even reading these pages for migration might
890  * result in undesired behavior.
891  *
892  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
893  *
894  * Note: The result is only stable while migrating (precopy/postcopy).
895  */
896 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
897 {
898     uint64_t cleared_bits = 0;
899 
900     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
901         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
902         MemoryRegionSection section = {
903             .mr = rb->mr,
904             .offset_within_region = 0,
905             .size = int128_make64(qemu_ram_get_used_length(rb)),
906         };
907 
908         ram_discard_manager_replay_discarded(rdm, &section,
909                                              dirty_bitmap_clear_section,
910                                              &cleared_bits);
911     }
912     return cleared_bits;
913 }
914 
915 /*
916  * Check if a host-page aligned page falls into a discarded range as managed by
917  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
918  *
919  * Note: The result is only stable while migrating (precopy/postcopy).
920  */
921 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
922 {
923     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
924         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
925         MemoryRegionSection section = {
926             .mr = rb->mr,
927             .offset_within_region = start,
928             .size = int128_make64(qemu_ram_pagesize(rb)),
929         };
930 
931         return !ram_discard_manager_is_populated(rdm, &section);
932     }
933     return false;
934 }
935 
936 /* Called with RCU critical section */
937 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
938 {
939     uint64_t new_dirty_pages =
940         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
941 
942     rs->migration_dirty_pages += new_dirty_pages;
943     rs->num_dirty_pages_period += new_dirty_pages;
944 }
945 
946 /**
947  * ram_pagesize_summary: calculate all the pagesizes of a VM
948  *
949  * Returns a summary bitmap of the page sizes of all RAMBlocks
950  *
951  * For VMs with just normal pages this is equivalent to the host page
952  * size. If it has some huge pages, then it's the OR of all the
953  * different page sizes.
954  */
955 uint64_t ram_pagesize_summary(void)
956 {
957     RAMBlock *block;
958     uint64_t summary = 0;
959 
960     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
961         summary |= block->page_size;
962     }
963 
964     return summary;
965 }
966 
967 uint64_t ram_get_total_transferred_pages(void)
968 {
969     return  ram_counters.normal + ram_counters.duplicate +
970                 compression_counters.pages + xbzrle_counters.pages;
971 }
972 
973 static void migration_update_rates(RAMState *rs, int64_t end_time)
974 {
975     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
976     double compressed_size;
977 
978     /* calculate period counters */
979     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
980                 / (end_time - rs->time_last_bitmap_sync);
981 
982     if (!page_count) {
983         return;
984     }
985 
986     if (migrate_use_xbzrle()) {
987         double encoded_size, unencoded_size;
988 
989         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
990             rs->xbzrle_cache_miss_prev) / page_count;
991         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
992         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
993                          TARGET_PAGE_SIZE;
994         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
995         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
996             xbzrle_counters.encoding_rate = 0;
997         } else {
998             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
999         }
1000         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1001         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1002     }
1003 
1004     if (migrate_use_compression()) {
1005         compression_counters.busy_rate = (double)(compression_counters.busy -
1006             rs->compress_thread_busy_prev) / page_count;
1007         rs->compress_thread_busy_prev = compression_counters.busy;
1008 
1009         compressed_size = compression_counters.compressed_size -
1010                           rs->compressed_size_prev;
1011         if (compressed_size) {
1012             double uncompressed_size = (compression_counters.pages -
1013                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1014 
1015             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1016             compression_counters.compression_rate =
1017                                         uncompressed_size / compressed_size;
1018 
1019             rs->compress_pages_prev = compression_counters.pages;
1020             rs->compressed_size_prev = compression_counters.compressed_size;
1021         }
1022     }
1023 }
1024 
1025 static void migration_trigger_throttle(RAMState *rs)
1026 {
1027     MigrationState *s = migrate_get_current();
1028     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1029 
1030     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1031     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1032     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
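    /* e.g. with threshold = 50, throttling is considered once the guest
     * dirtied more than half as many bytes as were transferred this period. */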
1033 
1034     /* During block migration the auto-converge logic incorrectly detects
1035      * that ram migration makes no progress. Avoid this by disabling the
1036      * throttling logic during the bulk phase of block migration. */
1037     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1038         /* The following detection logic can be refined later. For now:
1039            Check to see if the ratio between dirtied bytes and the approx.
1040            amount of bytes that just got transferred since the last time
1041            we were in this routine reaches the threshold. If that happens
1042            twice, start or increase throttling. */
1043 
1044         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1045             (++rs->dirty_rate_high_cnt >= 2)) {
1046             trace_migration_throttle();
1047             rs->dirty_rate_high_cnt = 0;
1048             mig_throttle_guest_down(bytes_dirty_period,
1049                                     bytes_dirty_threshold);
1050         }
1051     }
1052 }
1053 
1054 static void migration_bitmap_sync(RAMState *rs)
1055 {
1056     RAMBlock *block;
1057     int64_t end_time;
1058 
1059     ram_counters.dirty_sync_count++;
1060 
1061     if (!rs->time_last_bitmap_sync) {
1062         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1063     }
1064 
1065     trace_migration_bitmap_sync_start();
1066     memory_global_dirty_log_sync();
1067 
1068     qemu_mutex_lock(&rs->bitmap_mutex);
1069     WITH_RCU_READ_LOCK_GUARD() {
1070         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1071             ramblock_sync_dirty_bitmap(rs, block);
1072         }
1073         ram_counters.remaining = ram_bytes_remaining();
1074     }
1075     qemu_mutex_unlock(&rs->bitmap_mutex);
1076 
1077     memory_global_after_dirty_log_sync();
1078     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1079 
1080     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1081 
1082     /* more than 1 second = 1000 milliseconds */
1083     if (end_time > rs->time_last_bitmap_sync + 1000) {
1084         migration_trigger_throttle(rs);
1085 
1086         migration_update_rates(rs, end_time);
1087 
1088         rs->target_page_count_prev = rs->target_page_count;
1089 
1090         /* reset period counters */
1091         rs->time_last_bitmap_sync = end_time;
1092         rs->num_dirty_pages_period = 0;
1093         rs->bytes_xfer_prev = ram_counters.transferred;
1094     }
1095     if (migrate_use_events()) {
1096         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1097     }
1098 }
1099 
1100 static void migration_bitmap_sync_precopy(RAMState *rs)
1101 {
1102     Error *local_err = NULL;
1103 
1104     /*
1105      * The current notifier usage is just an optimization for migration, so we
1106      * don't stop the normal migration process in the error case.
1107      */
1108     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1109         error_report_err(local_err);
1110         local_err = NULL;
1111     }
1112 
1113     migration_bitmap_sync(rs);
1114 
1115     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1116         error_report_err(local_err);
1117     }
1118 }
1119 
1120 /**
1121  * save_zero_page_to_file: send the zero page to the file
1122  *
1123  * Returns the size of data written to the file, 0 means the page is not
1124  * a zero page
1125  *
1126  * @rs: current RAM state
1127  * @file: the file where the data is saved
1128  * @block: block that contains the page we want to send
1129  * @offset: offset inside the block for the page
1130  */
1131 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1132                                   RAMBlock *block, ram_addr_t offset)
1133 {
1134     uint8_t *p = block->host + offset;
1135     int len = 0;
1136 
1137     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1138         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1139         qemu_put_byte(file, 0);
1140         len += 1;
1141     }
1142     return len;
1143 }
1144 
1145 /**
1146  * save_zero_page: send the zero page to the stream
1147  *
1148  * Returns the number of pages written.
1149  *
1150  * @rs: current RAM state
1151  * @block: block that contains the page we want to send
1152  * @offset: offset inside the block for the page
1153  */
1154 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1155 {
1156     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1157 
1158     if (len) {
1159         ram_counters.duplicate++;
1160         ram_counters.transferred += len;
1161         return 1;
1162     }
1163     return -1;
1164 }
1165 
1166 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1167 {
1168     if (!migrate_release_ram() || !migration_in_postcopy()) {
1169         return;
1170     }
1171 
1172     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1173 }
1174 
1175 /*
1176  * @pages: the number of pages written by the control path,
1177  *        < 0 - error
1178  *        > 0 - number of pages written
1179  *
1180  * Return true if the page has been saved, otherwise false is returned.
1181  */
1182 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1183                               int *pages)
1184 {
1185     uint64_t bytes_xmit = 0;
1186     int ret;
1187 
1188     *pages = -1;
1189     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1190                                 &bytes_xmit);
1191     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1192         return false;
1193     }
1194 
1195     if (bytes_xmit) {
1196         ram_counters.transferred += bytes_xmit;
1197         *pages = 1;
1198     }
1199 
1200     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1201         return true;
1202     }
1203 
1204     if (bytes_xmit > 0) {
1205         ram_counters.normal++;
1206     } else if (bytes_xmit == 0) {
1207         ram_counters.duplicate++;
1208     }
1209 
1210     return true;
1211 }
1212 
1213 /*
1214  * directly send the page to the stream
1215  *
1216  * Returns the number of pages written.
1217  *
1218  * @rs: current RAM state
1219  * @block: block that contains the page we want to send
1220  * @offset: offset inside the block for the page
1221  * @buf: the page to be sent
1222  * @async: send the page asynchronously
1223  */
1224 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1225                             uint8_t *buf, bool async)
1226 {
1227     ram_counters.transferred += save_page_header(rs, rs->f, block,
1228                                                  offset | RAM_SAVE_FLAG_PAGE);
1229     if (async) {
1230         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1231                               migrate_release_ram() &&
1232                               migration_in_postcopy());
1233     } else {
1234         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1235     }
1236     ram_counters.transferred += TARGET_PAGE_SIZE;
1237     ram_counters.normal++;
1238     return 1;
1239 }
1240 
1241 /**
1242  * ram_save_page: send the given page to the stream
1243  *
1244  * Returns the number of pages written.
1245  *          < 0 - error
1246  *          >=0 - Number of pages written - this might legally be 0
1247  *                if xbzrle noticed the page was the same.
1248  *
1249  * @rs: current RAM state
1250  * @block: block that contains the page we want to send
1251  * @offset: offset inside the block for the page
1252  * @last_stage: if we are at the completion stage
1253  */
1254 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1255 {
1256     int pages = -1;
1257     uint8_t *p;
1258     bool send_async = true;
1259     RAMBlock *block = pss->block;
1260     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1261     ram_addr_t current_addr = block->offset + offset;
1262 
1263     p = block->host + offset;
1264     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1265 
1266     XBZRLE_cache_lock();
1267     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1268         pages = save_xbzrle_page(rs, &p, current_addr, block,
1269                                  offset, last_stage);
1270         if (!last_stage) {
1271             /* Can't send this cached data async, since the cache page
1272              * might get updated before it gets to the wire
1273              */
1274             send_async = false;
1275         }
1276     }
1277 
1278     /* XBZRLE overflow or normal page */
1279     if (pages == -1) {
1280         pages = save_normal_page(rs, block, offset, p, send_async);
1281     }
1282 
1283     XBZRLE_cache_unlock();
1284 
1285     return pages;
1286 }
1287 
1288 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1289                                  ram_addr_t offset)
1290 {
1291     if (multifd_queue_page(rs->f, block, offset) < 0) {
1292         return -1;
1293     }
1294     ram_counters.normal++;
1295 
1296     return 1;
1297 }
1298 
1299 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1300                                  ram_addr_t offset, uint8_t *source_buf)
1301 {
1302     RAMState *rs = ram_state;
1303     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1304     bool zero_page = false;
1305     int ret;
1306 
1307     if (save_zero_page_to_file(rs, f, block, offset)) {
1308         zero_page = true;
1309         goto exit;
1310     }
1311 
1312     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1313 
1314     /*
1315      * copy it to an internal buffer to avoid it being modified by the VM,
1316      * so that we can catch any error during compression and
1317      * decompression
1318      */
1319     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1320     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1321     if (ret < 0) {
1322         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1323         error_report("compressed data failed!");
1324         return false;
1325     }
1326 
1327 exit:
1328     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1329     return zero_page;
1330 }
1331 
1332 static void
1333 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1334 {
1335     ram_counters.transferred += bytes_xmit;
1336 
1337     if (param->zero_page) {
1338         ram_counters.duplicate++;
1339         return;
1340     }
1341 
1342     /* Don't count the 8-byte header with RAM_SAVE_FLAG_CONTINUE. */
1343     compression_counters.compressed_size += bytes_xmit - 8;
1344     compression_counters.pages++;
1345 }
1346 
1347 static bool save_page_use_compression(RAMState *rs);
1348 
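/*
 * flush_compressed_data: wait for every compression thread to finish its
 * current page, then push any buffered compressed output into the migration
 * stream.  Called, for example, when the dirty-page scan wraps around (see
 * find_dirty_block()), so that an old compressed copy of a page cannot
 * overwrite a newer one on the destination.
 */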
1349 static void flush_compressed_data(RAMState *rs)
1350 {
1351     int idx, len, thread_count;
1352 
1353     if (!save_page_use_compression(rs)) {
1354         return;
1355     }
1356     thread_count = migrate_compress_threads();
1357 
1358     qemu_mutex_lock(&comp_done_lock);
1359     for (idx = 0; idx < thread_count; idx++) {
1360         while (!comp_param[idx].done) {
1361             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1362         }
1363     }
1364     qemu_mutex_unlock(&comp_done_lock);
1365 
1366     for (idx = 0; idx < thread_count; idx++) {
1367         qemu_mutex_lock(&comp_param[idx].mutex);
1368         if (!comp_param[idx].quit) {
1369             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1370             /*
1371              * it's safe to fetch zero_page without holding comp_done_lock
1372              * as there is no further request submitted to the thread,
1373              * i.e., the thread should be waiting for a request at this point.
1374              */
1375             update_compress_thread_counts(&comp_param[idx], len);
1376         }
1377         qemu_mutex_unlock(&comp_param[idx].mutex);
1378     }
1379 }
1380 
1381 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1382                                        ram_addr_t offset)
1383 {
1384     param->block = block;
1385     param->offset = offset;
1386 }
1387 
1388 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1389                                            ram_addr_t offset)
1390 {
1391     int idx, thread_count, bytes_xmit = -1, pages = -1;
1392     bool wait = migrate_compress_wait_thread();
1393 
1394     thread_count = migrate_compress_threads();
1395     qemu_mutex_lock(&comp_done_lock);
1396 retry:
1397     for (idx = 0; idx < thread_count; idx++) {
1398         if (comp_param[idx].done) {
1399             comp_param[idx].done = false;
1400             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1401             qemu_mutex_lock(&comp_param[idx].mutex);
1402             set_compress_params(&comp_param[idx], block, offset);
1403             qemu_cond_signal(&comp_param[idx].cond);
1404             qemu_mutex_unlock(&comp_param[idx].mutex);
1405             pages = 1;
1406             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1407             break;
1408         }
1409     }
1410 
1411     /*
1412      * wait for a free thread if the user specifies 'compress-wait-thread',
1413      * otherwise we will post the page out in the main thread as a normal page.
1414      */
1415     if (pages < 0 && wait) {
1416         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1417         goto retry;
1418     }
1419     qemu_mutex_unlock(&comp_done_lock);
1420 
1421     return pages;
1422 }
1423 
1424 /**
1425  * find_dirty_block: find the next dirty page and update any state
1426  * associated with the search process.
1427  *
1428  * Returns true if a page is found
1429  *
1430  * @rs: current RAM state
1431  * @pss: data about the state of the current dirty page scan
1432  * @again: set to false if the search has scanned the whole of RAM
1433  */
1434 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1435 {
1436     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1437     if (pss->complete_round && pss->block == rs->last_seen_block &&
1438         pss->page >= rs->last_page) {
1439         /*
1440          * We've been once around the RAM and haven't found anything.
1441          * Give up.
1442          */
1443         *again = false;
1444         return false;
1445     }
1446     if (!offset_in_ramblock(pss->block,
1447                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1448         /* Didn't find anything in this RAM Block */
1449         pss->page = 0;
1450         pss->block = QLIST_NEXT_RCU(pss->block, next);
1451         if (!pss->block) {
1452             /*
1453              * If memory migration starts over, we will meet a dirtied page
1454              * which may still exist in the compression threads' ring, so we
1455              * should flush the compressed data to make sure the new page
1456              * is not overwritten by the old one on the destination.
1457              *
1458              * Also, if xbzrle is on, stop using the data compression at this
1459              * point. In theory, xbzrle can do better than compression.
1460              */
1461             flush_compressed_data(rs);
1462 
1463             /* Hit the end of the list */
1464             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1465             /* Flag that we've looped */
1466             pss->complete_round = true;
1467             /* After the first round, enable XBZRLE. */
1468             if (migrate_use_xbzrle()) {
1469                 rs->xbzrle_enabled = true;
1470             }
1471         }
1472         /* Didn't find anything this time, but try again on the new block */
1473         *again = true;
1474         return false;
1475     } else {
1476         /* Can go around again, but... */
1477         *again = true;
1478         /* We've found something so probably don't need to */
1479         return true;
1480     }
1481 }
1482 
1483 /**
1484  * unqueue_page: gets a page of the queue
1485  *
1486  * Helper for 'get_queued_page' - gets a page off the queue
1487  *
1488  * Returns the block of the page (or NULL if none available)
1489  *
1490  * @rs: current RAM state
1491  * @offset: used to return the offset within the RAMBlock
1492  */
1493 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1494 {
1495     RAMBlock *block = NULL;
1496 
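    /* Lock-free fast path: if the queue looks empty, skip taking the mutex;
     * a request racing in here will simply be handled on the next call. */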
1497     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1498         return NULL;
1499     }
1500 
1501     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1502     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1503         struct RAMSrcPageRequest *entry =
1504                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1505         block = entry->rb;
1506         *offset = entry->offset;
1507 
1508         if (entry->len > TARGET_PAGE_SIZE) {
1509             entry->len -= TARGET_PAGE_SIZE;
1510             entry->offset += TARGET_PAGE_SIZE;
1511         } else {
1512             memory_region_unref(block->mr);
1513             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1514             g_free(entry);
1515             migration_consume_urgent_request();
1516         }
1517     }
1518 
1519     return block;
1520 }
1521 
1522 #if defined(__linux__)
1523 /**
1524  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1525  *   is found, return RAM block pointer and page offset
1526  *
1527  * Returns pointer to the RAMBlock containing faulting page,
1528  *   NULL if no write faults are pending
1529  *
1530  * @rs: current RAM state
1531  * @offset: page offset from the beginning of the block
1532  */
1533 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1534 {
1535     struct uffd_msg uffd_msg;
1536     void *page_address;
1537     RAMBlock *block;
1538     int res;
1539 
1540     if (!migrate_background_snapshot()) {
1541         return NULL;
1542     }
1543 
1544     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1545     if (res <= 0) {
1546         return NULL;
1547     }
1548 
1549     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1550     block = qemu_ram_block_from_host(page_address, false, offset);
1551     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1552     return block;
1553 }
1554 
1555 /**
1556  * ram_save_release_protection: release UFFD write protection after
1557  *   a range of pages has been saved
1558  *
1559  * @rs: current RAM state
1560  * @pss: page-search-status structure
1561  * @start_page: index of the first page in the range relative to pss->block
1562  *
1563  * Returns 0 on success, negative value in case of an error
1564  */
1565 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1566         unsigned long start_page)
1567 {
1568     int res = 0;
1569 
1570     /* Check if page is from UFFD-managed region. */
1571     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1572         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1573         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1574 
1575         /* Flush async buffers before un-protect. */
1576         qemu_fflush(rs->f);
1577         /* Un-protect memory range. */
1578         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1579                 false, false);
1580     }
1581 
1582     return res;
1583 }
1584 
1585 /* ram_write_tracking_available: check if kernel supports required UFFD features
1586  *
1587  * Returns true if supported, false otherwise
1588  */
1589 bool ram_write_tracking_available(void)
1590 {
1591     uint64_t uffd_features;
1592     int res;
1593 
1594     res = uffd_query_features(&uffd_features);
1595     return (res == 0 &&
1596             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1597 }
1598 
1599 /* ram_write_tracking_compatible: check if guest configuration is
1600  *   compatible with 'write-tracking'
1601  *
1602  * Returns true if compatible, false otherwise
1603  */
1604 bool ram_write_tracking_compatible(void)
1605 {
1606     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1607     int uffd_fd;
1608     RAMBlock *block;
1609     bool ret = false;
1610 
1611     /* Open UFFD file descriptor */
1612     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1613     if (uffd_fd < 0) {
1614         return false;
1615     }
1616 
1617     RCU_READ_LOCK_GUARD();
1618 
1619     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1620         uint64_t uffd_ioctls;
1621 
1622         /* Nothing to do with read-only and MMIO-writable regions */
1623         if (block->mr->readonly || block->mr->rom_device) {
1624             continue;
1625         }
1626         /* Try to register block memory via UFFD-IO to track writes */
1627         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1628                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1629             goto out;
1630         }
1631         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1632             goto out;
1633         }
1634     }
1635     ret = true;
1636 
1637 out:
1638     uffd_close_fd(uffd_fd);
1639     return ret;
1640 }
1641 
1642 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1643                                        ram_addr_t size)
1644 {
1645     /*
1646      * We read one byte of each page; this will preallocate page tables if
1647      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1648      * where no page was populated yet. This might require adaptation when
1649      * supporting other mappings, like shmem.
1650      */
1651     for (; offset < size; offset += block->page_size) {
1652         char tmp = *((char *)block->host + offset);
1653 
1654         /* Don't optimize the read out */
1655         asm volatile("" : "+r" (tmp));
1656     }
1657 }
1658 
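/*
 * populate_read_section: callback for ram_discard_manager_replay_populated()
 *
 * Invoked for each populated part of the memory region section; reads
 * through the corresponding range of the backing RAMBlock via
 * populate_read_range().
 *
 * Returns 0 so the replay continues over all populated sections.
 */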
1659 static inline int populate_read_section(MemoryRegionSection *section,
1660                                         void *opaque)
1661 {
1662     const hwaddr size = int128_get64(section->size);
1663     hwaddr offset = section->offset_within_region;
1664     RAMBlock *block = section->mr->ram_block;
1665 
1666     populate_read_range(block, offset, size);
1667     return 0;
1668 }
1669 
1670 /*
1671  * ram_block_populate_read: preallocate page tables and populate pages in the
1672  *   RAM block by reading a byte of each page.
1673  *
1674  * Since it's solely used for userfault_fd WP feature, here we just
1675  *   hardcode page size to qemu_real_host_page_size.
1676  *
1677  * @rb: RAM block to populate
1678  */
1679 static void ram_block_populate_read(RAMBlock *rb)
1680 {
1681     /*
1682      * Skip populating all pages that fall into a discarded range as managed by
1683      * a RamDiscardManager responsible for the mapped memory region of the
1684      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1685      * must not get populated automatically. We don't have to track
1686      * modifications via userfaultfd WP reliably, because these pages will
1687      * not be part of the migration stream either way -- see
1688      * ramblock_dirty_bitmap_exclude_discarded_pages().
1689      *
1690      * Note: The result is only stable while migrating (precopy/postcopy).
1691      */
1692     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1693         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1694         MemoryRegionSection section = {
1695             .mr = rb->mr,
1696             .offset_within_region = 0,
1697             .size = rb->mr->size,
1698         };
1699 
1700         ram_discard_manager_replay_populated(rdm, &section,
1701                                              populate_read_section, NULL);
1702     } else {
1703         populate_read_range(rb, 0, rb->used_length);
1704     }
1705 }
1706 
1707 /*
1708  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1709  */
1710 void ram_write_tracking_prepare(void)
1711 {
1712     RAMBlock *block;
1713 
1714     RCU_READ_LOCK_GUARD();
1715 
1716     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1717         /* Nothing to do with read-only and MMIO-writable regions */
1718         if (block->mr->readonly || block->mr->rom_device) {
1719             continue;
1720         }
1721 
1722         /*
1723          * Populate pages of the RAM block before enabling userfault_fd
1724          * write protection.
1725          *
1726          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1727          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1728          * pages with pte_none() entries in page table.
1729          */
1730         ram_block_populate_read(block);
1731     }
1732 }
1733 
1734 /*
1735  * ram_write_tracking_start: start UFFD-WP memory tracking
1736  *
1737  * Returns 0 for success or negative value in case of error
1738  */
1739 int ram_write_tracking_start(void)
1740 {
1741     int uffd_fd;
1742     RAMState *rs = ram_state;
1743     RAMBlock *block;
1744 
1745     /* Open UFFD file descriptor */
1746     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1747     if (uffd_fd < 0) {
1748         return uffd_fd;
1749     }
1750     rs->uffdio_fd = uffd_fd;
1751 
1752     RCU_READ_LOCK_GUARD();
1753 
1754     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1755         /* Nothing to do with read-only and MMIO-writable regions */
1756         if (block->mr->readonly || block->mr->rom_device) {
1757             continue;
1758         }
1759 
1760         /* Register block memory with UFFD to track writes */
1761         if (uffd_register_memory(rs->uffdio_fd, block->host,
1762                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1763             goto fail;
1764         }
1765         /* Apply UFFD write protection to the block memory range */
1766         if (uffd_change_protection(rs->uffdio_fd, block->host,
1767                 block->max_length, true, false)) {
1768             goto fail;
1769         }
1770         block->flags |= RAM_UF_WRITEPROTECT;
1771         memory_region_ref(block->mr);
1772 
1773         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1774                 block->host, block->max_length);
1775     }
1776 
1777     return 0;
1778 
1779 fail:
1780     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1781 
1782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1783         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1784             continue;
1785         }
1786         /*
1787          * In case some memory block failed to be write-protected
1788          * remove protection and unregister all succeeded RAM blocks
1789          */
1790         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1791                 false, false);
1792         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1793         /* Cleanup flags and remove reference */
1794         block->flags &= ~RAM_UF_WRITEPROTECT;
1795         memory_region_unref(block->mr);
1796     }
1797 
1798     uffd_close_fd(uffd_fd);
1799     rs->uffdio_fd = -1;
1800     return -1;
1801 }
1802 
1803 /**
1804  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1805  */
1806 void ram_write_tracking_stop(void)
1807 {
1808     RAMState *rs = ram_state;
1809     RAMBlock *block;
1810 
1811     RCU_READ_LOCK_GUARD();
1812 
1813     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1814         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1815             continue;
1816         }
1817         /* Remove protection and unregister all affected RAM blocks */
1818         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1819                 false, false);
1820         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1821 
1822         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1823                 block->host, block->max_length);
1824 
1825         /* Cleanup flags and remove reference */
1826         block->flags &= ~RAM_UF_WRITEPROTECT;
1827         memory_region_unref(block->mr);
1828     }
1829 
1830     /* Finally close UFFD file descriptor */
1831     uffd_close_fd(rs->uffdio_fd);
1832     rs->uffdio_fd = -1;
1833 }
1834 
1835 #else
1836 /* No target OS support, stubs just fail or ignore */
1837 
1838 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1839 {
1840     (void) rs;
1841     (void) offset;
1842 
1843     return NULL;
1844 }
1845 
1846 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1847         unsigned long start_page)
1848 {
1849     (void) rs;
1850     (void) pss;
1851     (void) start_page;
1852 
1853     return 0;
1854 }
1855 
1856 bool ram_write_tracking_available(void)
1857 {
1858     return false;
1859 }
1860 
1861 bool ram_write_tracking_compatible(void)
1862 {
1863     assert(0);
1864     return false;
1865 }
1866 
1867 int ram_write_tracking_start(void)
1868 {
1869     assert(0);
1870     return -1;
1871 }
1872 
1873 void ram_write_tracking_stop(void)
1874 {
1875     assert(0);
1876 }
1877 #endif /* defined(__linux__) */
1878 
1879 /**
1880  * get_queued_page: unqueue a page from the postcopy requests
1881  *
1882  * Skips pages that are already sent (!dirty)
1883  *
1884  * Returns true if a queued page is found
1885  *
1886  * @rs: current RAM state
1887  * @pss: data about the state of the current dirty page scan
1888  */
1889 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1890 {
1891     RAMBlock  *block;
1892     ram_addr_t offset;
1893     bool dirty;
1894 
1895     do {
1896         block = unqueue_page(rs, &offset);
1897         /*
1898          * We're sending this page, and since it's postcopy nothing else
1899          * will dirty it; we must make sure it doesn't get sent again
1900          * even if this queue request was received after the background
1901          * search already sent it.
1902          */
1903         if (block) {
1904             unsigned long page;
1905 
1906             page = offset >> TARGET_PAGE_BITS;
1907             dirty = test_bit(page, block->bmap);
1908             if (!dirty) {
1909                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1910                                                 page);
1911             } else {
1912                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1913             }
1914         }
1915 
1916     } while (block && !dirty);
1917 
1918     if (!block) {
1919         /*
1920          * Poll write faults too if background snapshot is enabled; that's
1921          * when vCPUs can get blocked by the write-protected pages.
1922          */
1923         block = poll_fault_page(rs, &offset);
1924     }
1925 
1926     if (block) {
1927         /*
1928          * We want the background search to continue from the queued page
1929          * since the guest is likely to want other pages near to the page
1930          * it just requested.
1931          */
1932         pss->block = block;
1933         pss->page = offset >> TARGET_PAGE_BITS;
1934 
1935         /*
1936          * This unqueued page would break the "one round" check, even though
1937          * it is really rare.
1938          */
1939         pss->complete_round = false;
1940     }
1941 
1942     return !!block;
1943 }
1944 
1945 /**
1946  * migration_page_queue_free: drop any remaining pages in the ram
1947  * request queue
1948  *
1949  * It should be empty at the end anyway, but in error cases there may
1950  * be some left.  In case any pages are left, we drop them.
1951  *
1952  */
1953 static void migration_page_queue_free(RAMState *rs)
1954 {
1955     struct RAMSrcPageRequest *mspr, *next_mspr;
1956     /* This queue generally should be empty - but in the case of a failed
1957      * migration it might have some leftovers in it.
1958      */
1959     RCU_READ_LOCK_GUARD();
1960     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1961         memory_region_unref(mspr->rb->mr);
1962         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1963         g_free(mspr);
1964     }
1965 }
1966 
1967 /**
1968  * ram_save_queue_pages: queue the page for transmission
1969  *
1970  * A request from postcopy destination for example.
1971  *
1972  * Returns zero on success or negative on error
1973  *
1974  * @rbname: Name of the RAMBlock of the request. NULL means the
1975  *          same as the last one.
1976  * @start: starting address from the start of the RAMBlock
1977  * @len: length (in bytes) to send
1978  */
1979 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1980 {
1981     RAMBlock *ramblock;
1982     RAMState *rs = ram_state;
1983 
1984     ram_counters.postcopy_requests++;
1985     RCU_READ_LOCK_GUARD();
1986 
1987     if (!rbname) {
1988         /* Reuse last RAMBlock */
1989         ramblock = rs->last_req_rb;
1990 
1991         if (!ramblock) {
1992             /*
1993              * Shouldn't happen, we can't reuse the last RAMBlock if
1994              * it's the 1st request.
1995              */
1996             error_report("ram_save_queue_pages no previous block");
1997             return -1;
1998         }
1999     } else {
2000         ramblock = qemu_ram_block_by_name(rbname);
2001 
2002         if (!ramblock) {
2003             /* We shouldn't be asked for a non-existent RAMBlock */
2004             error_report("ram_save_queue_pages no block '%s'", rbname);
2005             return -1;
2006         }
2007         rs->last_req_rb = ramblock;
2008     }
2009     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2010     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2011         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2012                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2013                      __func__, start, len, ramblock->used_length);
2014         return -1;
2015     }
2016 
2017     struct RAMSrcPageRequest *new_entry =
2018         g_malloc0(sizeof(struct RAMSrcPageRequest));
2019     new_entry->rb = ramblock;
2020     new_entry->offset = start;
2021     new_entry->len = len;
2022 
2023     memory_region_ref(ramblock->mr);
2024     qemu_mutex_lock(&rs->src_page_req_mutex);
2025     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2026     migration_make_urgent_request();
2027     qemu_mutex_unlock(&rs->src_page_req_mutex);
2028 
2029     return 0;
2030 }
2031 
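/*
 * save_page_use_compression: check whether a page should go through the
 *   compression path
 *
 * Returns false if compression is not enabled, or if xbzrle has already
 * taken over (after the first round of migration); true otherwise.
 */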
2032 static bool save_page_use_compression(RAMState *rs)
2033 {
2034     if (!migrate_use_compression()) {
2035         return false;
2036     }
2037 
2038     /*
2039      * If xbzrle is enabled (e.g., after the first round of migration), stop
2040      * using the data compression. In theory, xbzrle can do better than
2041      * compression.
2042      */
2043     if (rs->xbzrle_enabled) {
2044         return false;
2045     }
2046 
2047     return true;
2048 }
2049 
2050 /*
2051  * try to compress the page before posting it out, return true if the page
2052  * has been properly handled by compression, otherwise needs other
2053  * paths to handle it
2054  */
2055 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2056 {
2057     if (!save_page_use_compression(rs)) {
2058         return false;
2059     }
2060 
2061     /*
2062      * When starting the process of a new block, the first page of
2063      * the block should be sent out before other pages in the same
2064      * block, and all the pages in the last block should have been sent
2065      * out. Keeping this order is important, because the 'cont' flag
2066      * is used to avoid resending the block name.
2067      *
2068      * We post the first page as a normal page as compression will take
2069      * a lot of CPU resources.
2070      */
2071     if (block != rs->last_sent_block) {
2072         flush_compressed_data(rs);
2073         return false;
2074     }
2075 
2076     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2077         return true;
2078     }
2079 
2080     compression_counters.busy++;
2081     return false;
2082 }
2083 
2084 /**
2085  * ram_save_target_page: save one target page
2086  *
2087  * Returns the number of pages written
2088  *
2089  * @rs: current RAM state
2090  * @pss: data about the page we want to send
2091  * @last_stage: if we are at the completion stage
2092  */
2093 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2094                                 bool last_stage)
2095 {
2096     RAMBlock *block = pss->block;
2097     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2098     int res;
2099 
2100     if (control_save_page(rs, block, offset, &res)) {
2101         return res;
2102     }
2103 
2104     if (save_compress_page(rs, block, offset)) {
2105         return 1;
2106     }
2107 
2108     res = save_zero_page(rs, block, offset);
2109     if (res > 0) {
2110         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2111          * page would be stale
2112          */
2113         if (!save_page_use_compression(rs)) {
2114             XBZRLE_cache_lock();
2115             xbzrle_cache_zero_page(rs, block->offset + offset);
2116             XBZRLE_cache_unlock();
2117         }
2118         ram_release_pages(block->idstr, offset, res);
2119         return res;
2120     }
2121 
2122     /*
2123      * Do not use multifd for:
2124      * 1. Compression as the first page in the new block should be posted out
2125      *    before sending the compressed page
2126      * 2. In postcopy as one whole host page should be placed
2127      */
2128     if (!save_page_use_compression(rs) && migrate_use_multifd()
2129         && !migration_in_postcopy()) {
2130         return ram_save_multifd_page(rs, block, offset);
2131     }
2132 
2133     return ram_save_page(rs, pss, last_stage);
2134 }
2135 
2136 /**
2137  * ram_save_host_page: save a whole host page
2138  *
2139  * Starting at *offset send pages up to the end of the current host
2140  * page. It's valid for the initial offset to point into the middle of
2141  * a host page, in which case the remainder of the host page is sent.
2142  * Only dirty target pages are sent. Note that the host page size may
2143  * be a huge page for this block.
2144  * The saving stops at the boundary of the used_length of the block
2145  * if the RAMBlock isn't a multiple of the host page size.
2146  *
2147  * Returns the number of pages written or negative on error
2148  *
2149  * @rs: current RAM state
2150  * @ms: current migration state
2151  * @pss: data about the page we want to send
2152  * @last_stage: if we are at the completion stage
2153  */
2154 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2155                               bool last_stage)
2156 {
2157     int tmppages, pages = 0;
2158     size_t pagesize_bits =
2159         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2160     unsigned long hostpage_boundary =
2161         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2162     unsigned long start_page = pss->page;
2163     int res;
2164 
2165     if (ramblock_is_ignored(pss->block)) {
2166         error_report("block %s should not be migrated !", pss->block->idstr);
2167         return 0;
2168     }
2169 
2170     do {
2171         /* Check if the page is dirty and if so, send it */
2172         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2173             tmppages = ram_save_target_page(rs, pss, last_stage);
2174             if (tmppages < 0) {
2175                 return tmppages;
2176             }
2177 
2178             pages += tmppages;
2179             /*
2180              * Allow rate limiting to happen in the middle of huge pages if
2181              * something is sent in the current iteration.
2182              */
2183             if (pagesize_bits > 1 && tmppages > 0) {
2184                 migration_rate_limit();
2185             }
2186         }
2187         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2188     } while ((pss->page < hostpage_boundary) &&
2189              offset_in_ramblock(pss->block,
2190                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2191     /* The offset we leave with is the min boundary of host page and block */
2192     pss->page = MIN(pss->page, hostpage_boundary) - 1;
2193 
2194     res = ram_save_release_protection(rs, pss, start_page);
2195     return (res < 0 ? res : pages);
2196 }
2197 
2198 /**
2199  * ram_find_and_save_block: finds a dirty page and sends it to f
2200  *
2201  * Called within an RCU critical section.
2202  *
2203  * Returns the number of pages written where zero means no dirty pages,
2204  * or negative on error
2205  *
2206  * @rs: current RAM state
2207  * @last_stage: if we are at the completion stage
2208  *
2209  * On systems where host-page-size > target-page-size it will send all the
2210  * pages in a host page that are dirty.
2211  */
2212 
2213 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2214 {
2215     PageSearchStatus pss;
2216     int pages = 0;
2217     bool again, found;
2218 
2219     /* No dirty page as there is zero RAM */
2220     if (!ram_bytes_total()) {
2221         return pages;
2222     }
2223 
2224     pss.block = rs->last_seen_block;
2225     pss.page = rs->last_page;
2226     pss.complete_round = false;
2227 
2228     if (!pss.block) {
2229         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2230     }
2231 
2232     do {
2233         again = true;
2234         found = get_queued_page(rs, &pss);
2235 
2236         if (!found) {
2237             /* priority queue empty, so just search for something dirty */
2238             found = find_dirty_block(rs, &pss, &again);
2239         }
2240 
2241         if (found) {
2242             pages = ram_save_host_page(rs, &pss, last_stage);
2243         }
2244     } while (!pages && again);
2245 
2246     rs->last_seen_block = pss.block;
2247     rs->last_page = pss.page;
2248 
2249     return pages;
2250 }
2251 
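/*
 * acct_update_position: update RAM migration counters for @size bytes
 *
 * Zero pages only bump the duplicate counter; otherwise the normal and
 * transferred counters are updated and the QEMUFile position is advanced.
 */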
2252 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2253 {
2254     uint64_t pages = size / TARGET_PAGE_SIZE;
2255 
2256     if (zero) {
2257         ram_counters.duplicate += pages;
2258     } else {
2259         ram_counters.normal += pages;
2260         ram_counters.transferred += size;
2261         qemu_update_position(f, size);
2262     }
2263 }
2264 
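/*
 * ram_bytes_total_common: sum the used_length of the RAM blocks
 *
 * @count_ignored: when true, blocks that are otherwise ignored for
 *                 migration are counted too; when false, only the
 *                 non-ignored blocks are counted.
 */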
2265 static uint64_t ram_bytes_total_common(bool count_ignored)
2266 {
2267     RAMBlock *block;
2268     uint64_t total = 0;
2269 
2270     RCU_READ_LOCK_GUARD();
2271 
2272     if (count_ignored) {
2273         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2274             total += block->used_length;
2275         }
2276     } else {
2277         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2278             total += block->used_length;
2279         }
2280     }
2281     return total;
2282 }
2283 
2284 uint64_t ram_bytes_total(void)
2285 {
2286     return ram_bytes_total_common(false);
2287 }
2288 
2289 static void xbzrle_load_setup(void)
2290 {
2291     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2292 }
2293 
2294 static void xbzrle_load_cleanup(void)
2295 {
2296     g_free(XBZRLE.decoded_buf);
2297     XBZRLE.decoded_buf = NULL;
2298 }
2299 
2300 static void ram_state_cleanup(RAMState **rsp)
2301 {
2302     if (*rsp) {
2303         migration_page_queue_free(*rsp);
2304         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2305         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2306         g_free(*rsp);
2307         *rsp = NULL;
2308     }
2309 }
2310 
2311 static void xbzrle_cleanup(void)
2312 {
2313     XBZRLE_cache_lock();
2314     if (XBZRLE.cache) {
2315         cache_fini(XBZRLE.cache);
2316         g_free(XBZRLE.encoded_buf);
2317         g_free(XBZRLE.current_buf);
2318         g_free(XBZRLE.zero_target_page);
2319         XBZRLE.cache = NULL;
2320         XBZRLE.encoded_buf = NULL;
2321         XBZRLE.current_buf = NULL;
2322         XBZRLE.zero_target_page = NULL;
2323     }
2324     XBZRLE_cache_unlock();
2325 }
2326 
2327 static void ram_save_cleanup(void *opaque)
2328 {
2329     RAMState **rsp = opaque;
2330     RAMBlock *block;
2331 
2332     /* We don't use dirty log with background snapshots */
2333     if (!migrate_background_snapshot()) {
2334         /* The caller must hold the iothread lock or be in a BH, so there is
2335          * no write race against the migration bitmap.
2336          */
2337         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2338             /*
2339              * do not stop dirty log without starting it, since
2340              * memory_global_dirty_log_stop will assert that
2341              * memory_global_dirty_log_start/stop are used in pairs
2342              */
2343             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2344         }
2345     }
2346 
2347     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2348         g_free(block->clear_bmap);
2349         block->clear_bmap = NULL;
2350         g_free(block->bmap);
2351         block->bmap = NULL;
2352     }
2353 
2354     xbzrle_cleanup();
2355     compress_threads_save_cleanup();
2356     ram_state_cleanup(rsp);
2357 }
2358 
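/*
 * ram_state_reset: restart the dirty page scan from the beginning
 *
 * Forgets the last seen/sent block and page, records the current
 * ram_list version and starts with xbzrle disabled again.
 */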
2359 static void ram_state_reset(RAMState *rs)
2360 {
2361     rs->last_seen_block = NULL;
2362     rs->last_sent_block = NULL;
2363     rs->last_page = 0;
2364     rs->last_version = ram_list.version;
2365     rs->xbzrle_enabled = false;
2366 }
2367 
2368 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2369 
2370 /*
2371  * 'expected' is the value you expect the bitmap mostly to be full
2372  * of; it won't bother printing lines that are all this value.
2373  * If 'todump' is null the migration bitmap is dumped.
2374  */
2375 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2376                            unsigned long pages)
2377 {
2378     int64_t cur;
2379     int64_t linelen = 128;
2380     char linebuf[129];
2381 
2382     for (cur = 0; cur < pages; cur += linelen) {
2383         int64_t curb;
2384         bool found = false;
2385         /*
2386          * Last line; catch the case where the line length
2387          * is longer than remaining ram
2388          */
2389         if (cur + linelen > pages) {
2390             linelen = pages - cur;
2391         }
2392         for (curb = 0; curb < linelen; curb++) {
2393             bool thisbit = test_bit(cur + curb, todump);
2394             linebuf[curb] = thisbit ? '1' : '.';
2395             found = found || (thisbit != expected);
2396         }
2397         if (found) {
2398             linebuf[curb] = '\0';
2399             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2400         }
2401     }
2402 }
2403 
2404 /* **** functions for postcopy ***** */
2405 
2406 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2407 {
2408     struct RAMBlock *block;
2409 
2410     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2411         unsigned long *bitmap = block->bmap;
2412         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2413         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2414 
2415         while (run_start < range) {
2416             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2417             ram_discard_range(block->idstr,
2418                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2419                               ((ram_addr_t)(run_end - run_start))
2420                                 << TARGET_PAGE_BITS);
2421             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2422         }
2423     }
2424 }
2425 
2426 /**
2427  * postcopy_send_discard_bm_ram: discard a RAMBlock
2428  *
2429  * Returns zero on success
2430  *
2431  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2432  *
2433  * @ms: current migration state
2434  * @block: RAMBlock to discard
2435  */
2436 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2437 {
2438     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2439     unsigned long current;
2440     unsigned long *bitmap = block->bmap;
2441 
2442     for (current = 0; current < end; ) {
2443         unsigned long one = find_next_bit(bitmap, end, current);
2444         unsigned long zero, discard_length;
2445 
2446         if (one >= end) {
2447             break;
2448         }
2449 
2450         zero = find_next_zero_bit(bitmap, end, one + 1);
2451 
2452         if (zero >= end) {
2453             discard_length = end - one;
2454         } else {
2455             discard_length = zero - one;
2456         }
2457         postcopy_discard_send_range(ms, one, discard_length);
2458         current = one + discard_length;
2459     }
2460 
2461     return 0;
2462 }
2463 
2464 /**
2465  * postcopy_each_ram_send_discard: discard all RAMBlocks
2466  *
2467  * Returns 0 for success or negative for error
2468  *
2469  * Utility for the outgoing postcopy code.
2470  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2471  *   passing it bitmap indexes and name.
2472  * (qemu_ram_foreach_block ends up passing unscaled lengths
2473  *  which would mean postcopy code would have to deal with target page)
2474  *
2475  * @ms: current migration state
2476  */
2477 static int postcopy_each_ram_send_discard(MigrationState *ms)
2478 {
2479     struct RAMBlock *block;
2480     int ret;
2481 
2482     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2483         postcopy_discard_send_init(ms, block->idstr);
2484 
2485         /*
2486          * Postcopy sends chunks of bitmap over the wire, but it
2487          * just needs indexes at this point, avoids it having
2488          * target page specific code.
2489          */
2490         ret = postcopy_send_discard_bm_ram(ms, block);
2491         postcopy_discard_send_finish(ms);
2492         if (ret) {
2493             return ret;
2494         }
2495     }
2496 
2497     return 0;
2498 }
2499 
2500 /**
2501  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2502  *
2503  * Helper for postcopy_chunk_hostpages; it's called once per RAMBlock
2504  * to canonicalize that block's dirty bitmap at host-page granularity.
2505  *
2506  * Postcopy requires that all target pages in a host page are either
2507  * all dirty or all clean, not a mix.  This function marks any host
2508  * page that is partially dirty as fully dirty.
2509  *
2510  * @ms: current migration state
2511  * @block: block that contains the page we want to canonicalize
2512  */
2513 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2514 {
2515     RAMState *rs = ram_state;
2516     unsigned long *bitmap = block->bmap;
2517     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2518     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2519     unsigned long run_start;
2520 
2521     if (block->page_size == TARGET_PAGE_SIZE) {
2522         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2523         return;
2524     }
2525 
2526     /* Find a dirty page */
2527     run_start = find_next_bit(bitmap, pages, 0);
2528 
2529     while (run_start < pages) {
2530 
2531         /*
2532          * If the start of this run of pages is in the middle of a host
2533          * page, then we need to fixup this host page.
2534          */
2535         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2536             /* Find the end of this run */
2537             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2538             /*
2539              * If the end isn't at the start of a host page, then the
2540              * run doesn't finish at the end of a host page
2541              * and we need to discard.
2542              */
2543         }
2544 
2545         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2546             unsigned long page;
2547             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2548                                                              host_ratio);
2549             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2550 
2551             /* Clean up the bitmap */
2552             for (page = fixup_start_addr;
2553                  page < fixup_start_addr + host_ratio; page++) {
2554                 /*
2555                  * Remark them as dirty, updating the count for any pages
2556                  * that weren't previously dirty.
2557                  */
2558                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2559             }
2560         }
2561 
2562         /* Find the next dirty page for the next iteration */
2563         run_start = find_next_bit(bitmap, pages, run_start);
2564     }
2565 }
2566 
2567 /**
2568  * postcopy_chunk_hostpages: discard any partially sent host page
2569  *
2570  * Utility for the outgoing postcopy code.
2571  *
2572  * Discard any partially sent host-page size chunks, mark any partially
2573  * dirty host-page size chunks as all dirty.  In this case the host-page
2574  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2575  *
2576  * Returns zero on success
2577  *
2578  * @ms: current migration state
2579  * @block: block we want to work with
2580  */
2581 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2582 {
2583     postcopy_discard_send_init(ms, block->idstr);
2584 
2585     /*
2586      * Ensure that all partially dirty host pages are made fully dirty.
2587      */
2588     postcopy_chunk_hostpages_pass(ms, block);
2589 
2590     postcopy_discard_send_finish(ms);
2591     return 0;
2592 }
2593 
2594 /**
2595  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2596  *
2597  * Returns zero on success
2598  *
2599  * Transmit the set of pages to be discarded after precopy to the target;
2600  * these are pages that:
2601  *     a) Have been previously transmitted but are now dirty again
2602  *     b) Pages that have never been transmitted, this ensures that
2603  *        any pages on the destination that have been mapped by background
2604  *        tasks get discarded (transparent huge pages is the specific concern)
2605  * Hopefully this is pretty sparse
2606  *
2607  * @ms: current migration state
2608  */
2609 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2610 {
2611     RAMState *rs = ram_state;
2612     RAMBlock *block;
2613     int ret;
2614 
2615     RCU_READ_LOCK_GUARD();
2616 
2617     /* This should be our last sync, the src is now paused */
2618     migration_bitmap_sync(rs);
2619 
2620     /* Easiest way to make sure we don't resume in the middle of a host-page */
2621     rs->last_seen_block = NULL;
2622     rs->last_sent_block = NULL;
2623     rs->last_page = 0;
2624 
2625     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2626         /* Deal with TPS != HPS and huge pages */
2627         ret = postcopy_chunk_hostpages(ms, block);
2628         if (ret) {
2629             return ret;
2630         }
2631 
2632 #ifdef DEBUG_POSTCOPY
2633         ram_debug_dump_bitmap(block->bmap, true,
2634                               block->used_length >> TARGET_PAGE_BITS);
2635 #endif
2636     }
2637     trace_ram_postcopy_send_discard_bitmap();
2638 
2639     return postcopy_each_ram_send_discard(ms);
2640 }
2641 
2642 /**
2643  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2644  *
2645  * Returns zero on success
2646  *
2647  * @rbname: name of the RAMBlock of the request. NULL means the
2648  *          same as the last one.
2649  * @start: starting offset (bytes) within the RAMBlock
2650  * @length: length (bytes) of the range to discard
2651  */
2652 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2653 {
2654     trace_ram_discard_range(rbname, start, length);
2655 
2656     RCU_READ_LOCK_GUARD();
2657     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2658 
2659     if (!rb) {
2660         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2661         return -1;
2662     }
2663 
2664     /*
2665      * On source VM, we don't need to update the received bitmap since
2666      * we don't even have one.
2667      */
2668     if (rb->receivedmap) {
2669         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2670                      length >> qemu_target_page_bits());
2671     }
2672 
2673     return ram_block_discard_range(rb, start, length);
2674 }
2675 
2676 /*
2677  * For every allocation, we will try not to crash the VM if the
2678  * allocation fails.
2679  */
2680 static int xbzrle_init(void)
2681 {
2682     Error *local_err = NULL;
2683 
2684     if (!migrate_use_xbzrle()) {
2685         return 0;
2686     }
2687 
2688     XBZRLE_cache_lock();
2689 
2690     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2691     if (!XBZRLE.zero_target_page) {
2692         error_report("%s: Error allocating zero page", __func__);
2693         goto err_out;
2694     }
2695 
2696     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2697                               TARGET_PAGE_SIZE, &local_err);
2698     if (!XBZRLE.cache) {
2699         error_report_err(local_err);
2700         goto free_zero_page;
2701     }
2702 
2703     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2704     if (!XBZRLE.encoded_buf) {
2705         error_report("%s: Error allocating encoded_buf", __func__);
2706         goto free_cache;
2707     }
2708 
2709     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2710     if (!XBZRLE.current_buf) {
2711         error_report("%s: Error allocating current_buf", __func__);
2712         goto free_encoded_buf;
2713     }
2714 
2715     /* We are all good */
2716     XBZRLE_cache_unlock();
2717     return 0;
2718 
2719 free_encoded_buf:
2720     g_free(XBZRLE.encoded_buf);
2721     XBZRLE.encoded_buf = NULL;
2722 free_cache:
2723     cache_fini(XBZRLE.cache);
2724     XBZRLE.cache = NULL;
2725 free_zero_page:
2726     g_free(XBZRLE.zero_target_page);
2727     XBZRLE.zero_target_page = NULL;
2728 err_out:
2729     XBZRLE_cache_unlock();
2730     return -ENOMEM;
2731 }
2732 
2733 static int ram_state_init(RAMState **rsp)
2734 {
2735     *rsp = g_try_new0(RAMState, 1);
2736 
2737     if (!*rsp) {
2738         error_report("%s: Init ramstate fail", __func__);
2739         return -1;
2740     }
2741 
2742     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2743     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2744     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2745 
2746     /*
2747      * Count the total number of pages used by ram blocks not including any
2748      * gaps due to alignment or unplugs.
2749      * This must match the initial value of the dirty bitmap.
2750      */
2751     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2752     ram_state_reset(*rsp);
2753 
2754     return 0;
2755 }
2756 
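/*
 * ram_list_init_bitmaps: allocate the per-RAMBlock migration bitmaps
 *
 * For every block that will be migrated, allocate a dirty bitmap set to
 * all ones plus the clear_bmap used for lazy dirty-log clearing, after
 * clamping clear_bitmap_shift into its supported range.
 */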
2757 static void ram_list_init_bitmaps(void)
2758 {
2759     MigrationState *ms = migrate_get_current();
2760     RAMBlock *block;
2761     unsigned long pages;
2762     uint8_t shift;
2763 
2764     /* Skip setting bitmap if there is no RAM */
2765     if (ram_bytes_total()) {
2766         shift = ms->clear_bitmap_shift;
2767         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2768             error_report("clear_bitmap_shift (%u) too big, using "
2769                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2770             shift = CLEAR_BITMAP_SHIFT_MAX;
2771         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2772             error_report("clear_bitmap_shift (%u) too small, using "
2773                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2774             shift = CLEAR_BITMAP_SHIFT_MIN;
2775         }
2776 
2777         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2778             pages = block->max_length >> TARGET_PAGE_BITS;
2779             /*
2780              * The initial dirty bitmap for migration must be set with all
2781              * ones to make sure we'll migrate every guest RAM page to
2782              * destination.
2783              * Here we set RAMBlock.bmap all to 1 because when we restart a
2784              * new migration after a failed one, ram_list.
2785              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
2786              * guest memory.
2787              */
2788             block->bmap = bitmap_new(pages);
2789             bitmap_set(block->bmap, 0, pages);
2790             block->clear_bmap_shift = shift;
2791             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2792         }
2793     }
2794 }
2795 
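/*
 * migration_bitmap_clear_discarded_pages: exclude discarded pages
 *
 * Clears pages that were discarded via a RamDiscardManager from each
 * block's dirty bitmap and subtracts them from the dirty page count.
 */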
2796 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2797 {
2798     unsigned long pages;
2799     RAMBlock *rb;
2800 
2801     RCU_READ_LOCK_GUARD();
2802 
2803     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2804             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2805             rs->migration_dirty_pages -= pages;
2806     }
2807 }
2808 
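/*
 * ram_init_bitmaps: set up dirty tracking for a fresh migration
 *
 * Allocates the migration bitmaps and, unless running a background
 * snapshot, starts global dirty logging and does the first bitmap sync;
 * discarded pages are then removed from the initial all-ones bitmap.
 */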
2809 static void ram_init_bitmaps(RAMState *rs)
2810 {
2811     /* For memory_global_dirty_log_start below.  */
2812     qemu_mutex_lock_iothread();
2813     qemu_mutex_lock_ramlist();
2814 
2815     WITH_RCU_READ_LOCK_GUARD() {
2816         ram_list_init_bitmaps();
2817         /* We don't use dirty log with background snapshots */
2818         if (!migrate_background_snapshot()) {
2819             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2820             migration_bitmap_sync_precopy(rs);
2821         }
2822     }
2823     qemu_mutex_unlock_ramlist();
2824     qemu_mutex_unlock_iothread();
2825 
2826     /*
2827      * After an eventual first bitmap sync, fixup the initial bitmap
2828      * containing all 1s to exclude any discarded pages from migration.
2829      */
2830     migration_bitmap_clear_discarded_pages(rs);
2831 }
2832 
2833 static int ram_init_all(RAMState **rsp)
2834 {
2835     if (ram_state_init(rsp)) {
2836         return -1;
2837     }
2838 
2839     if (xbzrle_init()) {
2840         ram_state_cleanup(rsp);
2841         return -1;
2842     }
2843 
2844     ram_init_bitmaps(*rsp);
2845 
2846     return 0;
2847 }
2848 
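/*
 * ram_state_resume_prepare: prepare RAMState for a postcopy resume
 *
 * Recounts the dirty pages from the per-block bitmaps (the cached value
 * may be stale), resets the scan state and caches the new output file.
 */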
2849 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2850 {
2851     RAMBlock *block;
2852     uint64_t pages = 0;
2853 
2854     /*
2855      * Postcopy is not using xbzrle/compression, so no need for that.
2856      * Also, since the source is already halted, we don't need to care
2857      * about dirty page logging either.
2858      */
2859 
2860     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2861         pages += bitmap_count_one(block->bmap,
2862                                   block->used_length >> TARGET_PAGE_BITS);
2863     }
2864 
2865     /* This may not be aligned with current bitmaps. Recalculate. */
2866     rs->migration_dirty_pages = pages;
2867 
2868     ram_state_reset(rs);
2869 
2870     /* Update RAMState cache of output QEMUFile */
2871     rs->f = out;
2872 
2873     trace_ram_state_resume_prepare(pages);
2874 }
2875 
2876 /*
2877  * This function clears bits of the free pages reported by the caller from the
2878  * migration dirty bitmap. @addr is the host address corresponding to the
2879  * start of the contiguous guest free pages, and @len is the total bytes of
2880  * those pages.
2881  */
2882 void qemu_guest_free_page_hint(void *addr, size_t len)
2883 {
2884     RAMBlock *block;
2885     ram_addr_t offset;
2886     size_t used_len, start, npages;
2887     MigrationState *s = migrate_get_current();
2888 
2889     /* This function is currently expected to be used during live migration */
2890     if (!migration_is_setup_or_active(s->state)) {
2891         return;
2892     }
2893 
2894     for (; len > 0; len -= used_len, addr += used_len) {
2895         block = qemu_ram_block_from_host(addr, false, &offset);
2896         if (unlikely(!block || offset >= block->used_length)) {
2897             /*
2898              * The implementation might not support RAMBlock resize during
2899              * live migration, but it could happen in theory with future
2900              * updates. So we add a check here to capture that case.
2901              */
2902             error_report_once("%s unexpected error", __func__);
2903             return;
2904         }
2905 
2906         if (len <= block->used_length - offset) {
2907             used_len = len;
2908         } else {
2909             used_len = block->used_length - offset;
2910         }
2911 
2912         start = offset >> TARGET_PAGE_BITS;
2913         npages = used_len >> TARGET_PAGE_BITS;
2914 
2915         qemu_mutex_lock(&ram_state->bitmap_mutex);
2916         /*
2917          * The skipped free pages are equivalent to having been sent from clear_bmap's
2918          * perspective, so clear the bits from the memory region bitmap which
2919          * are initially set. Otherwise those skipped pages will be sent in
2920          * the next round after syncing from the memory region bitmap.
2921          */
2922         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2923         ram_state->migration_dirty_pages -=
2924                       bitmap_count_one_with_offset(block->bmap, start, npages);
2925         bitmap_clear(block->bmap, start, npages);
2926         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2927     }
2928 }
2929 
2930 /*
2931  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2932  * a long-running RCU critical section.  When rcu-reclaims in the code
2933  * start to become numerous it will be necessary to reduce the
2934  * granularity of these critical sections.
2935  */
2936 
2937 /**
2938  * ram_save_setup: Setup RAM for migration
2939  *
2940  * Returns zero to indicate success and negative for error
2941  *
2942  * @f: QEMUFile where to send the data
2943  * @opaque: RAMState pointer
2944  */
2945 static int ram_save_setup(QEMUFile *f, void *opaque)
2946 {
2947     RAMState **rsp = opaque;
2948     RAMBlock *block;
2949 
2950     if (compress_threads_save_setup()) {
2951         return -1;
2952     }
2953 
2954     /* migration has already set up the bitmap, reuse it. */
2955     if (!migration_in_colo_state()) {
2956         if (ram_init_all(rsp) != 0) {
2957             compress_threads_save_cleanup();
2958             return -1;
2959         }
2960     }
2961     (*rsp)->f = f;
2962 
2963     WITH_RCU_READ_LOCK_GUARD() {
2964         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2965 
2966         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2967             qemu_put_byte(f, strlen(block->idstr));
2968             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2969             qemu_put_be64(f, block->used_length);
2970             if (migrate_postcopy_ram() && block->page_size !=
2971                                           qemu_host_page_size) {
2972                 qemu_put_be64(f, block->page_size);
2973             }
2974             if (migrate_ignore_shared()) {
2975                 qemu_put_be64(f, block->mr->addr);
2976             }
2977         }
2978     }
2979 
2980     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2981     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2982 
2983     multifd_send_sync_main(f);
2984     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2985     qemu_fflush(f);
2986 
2987     return 0;
2988 }
2989 
2990 /**
2991  * ram_save_iterate: iterative stage for migration
2992  *
2993  * Returns zero to indicate success and negative for error
2994  *
2995  * @f: QEMUFile where to send the data
2996  * @opaque: RAMState pointer
2997  */
2998 static int ram_save_iterate(QEMUFile *f, void *opaque)
2999 {
3000     RAMState **temp = opaque;
3001     RAMState *rs = *temp;
3002     int ret = 0;
3003     int i;
3004     int64_t t0;
3005     int done = 0;
3006 
3007     if (blk_mig_bulk_active()) {
3008         /* Avoid transferring ram during bulk phase of block migration as
3009          * the bulk phase will usually take a long time and transferring
3010          * ram updates during that time is pointless. */
3011         goto out;
3012     }
3013 
3014     /*
3015      * We'll hold this lock for a while, but it's okay for two reasons.
3016      * Firstly, the only other thread that can take it is the one calling
3017      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3018      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3019      * guarantees that we'll at least release it on a regular basis.
3020      */
3021     qemu_mutex_lock(&rs->bitmap_mutex);
3022     WITH_RCU_READ_LOCK_GUARD() {
3023         if (ram_list.version != rs->last_version) {
3024             ram_state_reset(rs);
3025         }
3026 
3027         /* Read version before ram_list.blocks */
3028         smp_rmb();
3029 
3030         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3031 
3032         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3033         i = 0;
3034         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3035                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3036             int pages;
3037 
3038             if (qemu_file_get_error(f)) {
3039                 break;
3040             }
3041 
3042             pages = ram_find_and_save_block(rs, false);
3043             /* no more pages to send */
3044             if (pages == 0) {
3045                 done = 1;
3046                 break;
3047             }
3048 
3049             if (pages < 0) {
3050                 qemu_file_set_error(f, pages);
3051                 break;
3052             }
3053 
3054             rs->target_page_count += pages;
3055 
3056             /*
3057              * During postcopy, it is necessary to make sure one whole host
3058              * page is sent in one chunk.
3059              */
3060             if (migrate_postcopy_ram()) {
3061                 flush_compressed_data(rs);
3062             }
3063 
3064             /*
3065              * We want to check in the 1st loop, just in case it was the 1st
3066              * time and we had to sync the dirty bitmap.
3067              * qemu_clock_get_ns() is a bit expensive, so we only check every
3068              * few iterations.
3069              */
3070             if ((i & 63) == 0) {
3071                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3072                               1000000;
3073                 if (t1 > MAX_WAIT) {
3074                     trace_ram_save_iterate_big_wait(t1, i);
3075                     break;
3076                 }
3077             }
3078             i++;
3079         }
3080     }
3081     qemu_mutex_unlock(&rs->bitmap_mutex);
3082 
3083     /*
3084      * Must occur before EOS (or any QEMUFile operation)
3085      * because of RDMA protocol.
3086      */
3087     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3088 
3089 out:
3090     if (ret >= 0
3091         && migration_is_setup_or_active(migrate_get_current()->state)) {
3092         multifd_send_sync_main(rs->f);
3093         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3094         qemu_fflush(f);
3095         ram_counters.transferred += 8;
3096 
3097         ret = qemu_file_get_error(f);
3098     }
3099     if (ret < 0) {
3100         return ret;
3101     }
3102 
3103     return done;
3104 }
3105 
3106 /**
3107  * ram_save_complete: function called to send the remaining amount of ram
3108  *
3109  * Returns zero to indicate success or negative on error
3110  *
3111  * Called with iothread lock
3112  *
3113  * @f: QEMUFile where to send the data
3114  * @opaque: RAMState pointer
3115  */
3116 static int ram_save_complete(QEMUFile *f, void *opaque)
3117 {
3118     RAMState **temp = opaque;
3119     RAMState *rs = *temp;
3120     int ret = 0;
3121 
3122     WITH_RCU_READ_LOCK_GUARD() {
3123         if (!migration_in_postcopy()) {
3124             migration_bitmap_sync_precopy(rs);
3125         }
3126 
3127         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3128 
3129         /* try transferring iterative blocks of memory */
3130 
3131         /* flush all remaining blocks regardless of rate limiting */
3132         while (true) {
3133             int pages;
3134 
3135             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3136             /* no more blocks to send */
3137             if (pages == 0) {
3138                 break;
3139             }
3140             if (pages < 0) {
3141                 ret = pages;
3142                 break;
3143             }
3144         }
3145 
3146         flush_compressed_data(rs);
3147         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3148     }
3149 
3150     if (ret >= 0) {
3151         multifd_send_sync_main(rs->f);
3152         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3153         qemu_fflush(f);
3154     }
3155 
3156     return ret;
3157 }
3158 
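/*
 * ram_save_pending: report how much RAM is still left to send
 *
 * If the estimate drops below @max_size (and we are not in postcopy),
 * re-sync the dirty bitmap first.  The remainder is reported as
 * postcopiable when postcopy-ram is enabled, otherwise as precopy-only.
 */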
3159 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3160                              uint64_t *res_precopy_only,
3161                              uint64_t *res_compatible,
3162                              uint64_t *res_postcopy_only)
3163 {
3164     RAMState **temp = opaque;
3165     RAMState *rs = *temp;
3166     uint64_t remaining_size;
3167 
3168     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3169 
3170     if (!migration_in_postcopy() &&
3171         remaining_size < max_size) {
3172         qemu_mutex_lock_iothread();
3173         WITH_RCU_READ_LOCK_GUARD() {
3174             migration_bitmap_sync_precopy(rs);
3175         }
3176         qemu_mutex_unlock_iothread();
3177         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3178     }
3179 
3180     if (migrate_postcopy_ram()) {
3181         /* We can do postcopy, and all the data is postcopiable */
3182         *res_compatible += remaining_size;
3183     } else {
3184         *res_precopy_only += remaining_size;
3185     }
3186 }
3187 
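/*
 * load_xbzrle: read and decode one XBZRLE-compressed page
 *
 * Reads the encoding flag and length from the stream and decodes the
 * delta on top of the current contents of @host.
 *
 * Returns 0 on success, -1 on a malformed page
 */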
3188 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3189 {
3190     unsigned int xh_len;
3191     int xh_flags;
3192     uint8_t *loaded_data;
3193 
3194     /* extract RLE header */
3195     xh_flags = qemu_get_byte(f);
3196     xh_len = qemu_get_be16(f);
3197 
3198     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3199         error_report("Failed to load XBZRLE page - wrong compression!");
3200         return -1;
3201     }
3202 
3203     if (xh_len > TARGET_PAGE_SIZE) {
3204         error_report("Failed to load XBZRLE page - len overflow!");
3205         return -1;
3206     }
3207     loaded_data = XBZRLE.decoded_buf;
3208     /* load data and decode */
3209     /* it can change loaded_data to point to an internal buffer */
3210     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3211 
3212     /* decode RLE */
3213     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3214                              TARGET_PAGE_SIZE) == -1) {
3215         error_report("Failed to load XBZRLE page - decode error!");
3216         return -1;
3217     }
3218 
3219     return 0;
3220 }
3221 
3222 /**
3223  * ram_block_from_stream: read a RAMBlock id from the migration stream
3224  *
3225  * Must be called from within a rcu critical section.
3226  *
3227  * Returns a pointer from within the RCU-protected ram_list.
3228  *
3229  * @f: QEMUFile where to read the data from
3230  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3231  */
3232 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3233 {
3234     static RAMBlock *block;
3235     char id[256];
3236     uint8_t len;
3237 
3238     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3239         if (!block) {
3240             error_report("Ack, bad migration stream!");
3241             return NULL;
3242         }
3243         return block;
3244     }
3245 
3246     len = qemu_get_byte(f);
3247     qemu_get_buffer(f, (uint8_t *)id, len);
3248     id[len] = 0;
3249 
3250     block = qemu_ram_block_by_name(id);
3251     if (!block) {
3252         error_report("Can't find block %s", id);
3253         return NULL;
3254     }
3255 
3256     if (ramblock_is_ignored(block)) {
3257         error_report("block %s should not be migrated!", id);
3258         return NULL;
3259     }
3260 
3261     return block;
3262 }
3263 
3264 static inline void *host_from_ram_block_offset(RAMBlock *block,
3265                                                ram_addr_t offset)
3266 {
3267     if (!offset_in_ramblock(block, offset)) {
3268         return NULL;
3269     }
3270 
3271     return block->host + offset;
3272 }
3273 
3274 static void *host_page_from_ram_block_offset(RAMBlock *block,
3275                                              ram_addr_t offset)
3276 {
3277     /* Note: Explicitly no check against offset_in_ramblock(). */
3278     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3279                                    block->page_size);
3280 }
3281 
3282 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3283                                                          ram_addr_t offset)
3284 {
3285     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3286 }
3287 
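/*
 * Worked example for the two helpers above (illustrative only, assuming
 * block->host is aligned to block->page_size, e.g. a hugetlbfs-backed
 * block with 2 MiB host pages and 4 KiB target pages): for offset
 * 0x201000,
 *
 *     host_page_from_ram_block_offset(block, 0x201000)
 *         == block->host + 0x200000
 *     host_page_offset_from_ram_block_offset(block, 0x201000)
 *         == 0x1000
 *
 * i.e. the first helper yields the start of the host page that has to be
 * placed atomically, the second the position of this target page within it.
 */
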
3288 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3289                              ram_addr_t offset, bool record_bitmap)
3290 {
3291     if (!offset_in_ramblock(block, offset)) {
3292         return NULL;
3293     }
3294     if (!block->colo_cache) {
3295         error_report("%s: colo_cache is NULL in block: %s",
3296                      __func__, block->idstr);
3297         return NULL;
3298     }
3299 
3300     /*
3301      * During a COLO checkpoint we need a bitmap of the migrated pages.
3302      * It helps us decide which pages in the RAM cache should be flushed
3303      * into the SVM's RAM later.
3304      */
3305     if (record_bitmap &&
3306         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3307         ram_state->migration_dirty_pages++;
3308     }
3309     return block->colo_cache + offset;
3310 }
3311 
3312 /**
3313  * ram_handle_compressed: handle the zero page case
3314  *
3315  * If a page (or a whole RDMA chunk) has been
3316  * determined to be zero, then zap it.
3317  *
3318  * @host: host address for the zero page
3319  * @ch: what the page is filled from.  We only support zero
3320  * @ch: the byte the page is filled with; only zero is supported
3321  */
3322 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3323 {
3324     if (ch != 0 || !is_zero_range(host, size)) {
3325         memset(host, ch, size);
3326     }
3327 }
3328 
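/*
 * A minimal usage sketch, mirroring the RAM_SAVE_FLAG_ZERO handling in
 * ram_load_precopy() further down:
 *
 *     ch = qemu_get_byte(f);
 *     ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
 *
 * The memset() is skipped when @ch is zero and the destination range is
 * already zero, so pages that were never touched on the destination are
 * not written to (and therefore not needlessly allocated).
 */
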
3329 /* return the size after decompression, or a negative value on error */
3330 static int
3331 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3332                      const uint8_t *source, size_t source_len)
3333 {
3334     int err;
3335 
3336     err = inflateReset(stream);
3337     if (err != Z_OK) {
3338         return -1;
3339     }
3340 
3341     stream->avail_in = source_len;
3342     stream->next_in = (uint8_t *)source;
3343     stream->avail_out = dest_len;
3344     stream->next_out = dest;
3345 
3346     err = inflate(stream, Z_NO_FLUSH);
3347     if (err != Z_STREAM_END) {
3348         return -1;
3349     }
3350 
3351     return stream->total_out;
3352 }
3353 
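/*
 * Each compressed page is a self-contained deflate stream, which is why
 * the single inflate() call above must return Z_STREAM_END; even a
 * "successful" Z_OK that merely ran out of input is treated as corrupt
 * data.  The matching producer is assumed to use the usual one-shot
 * zlib pattern (a sketch, not a quote of the save-side code):
 *
 *     deflateReset(stream);
 *     stream->next_in   = (uint8_t *)page;
 *     stream->avail_in  = TARGET_PAGE_SIZE;
 *     stream->next_out  = compbuf;
 *     stream->avail_out = compressBound(TARGET_PAGE_SIZE);
 *     deflate(stream, Z_FINISH);   // expected to return Z_STREAM_END
 */
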
3354 static void *do_data_decompress(void *opaque)
3355 {
3356     DecompressParam *param = opaque;
3357     unsigned long pagesize;
3358     uint8_t *des;
3359     int len, ret;
3360 
3361     qemu_mutex_lock(&param->mutex);
3362     while (!param->quit) {
3363         if (param->des) {
3364             des = param->des;
3365             len = param->len;
3366             param->des = 0;
3367             qemu_mutex_unlock(&param->mutex);
3368 
3369             pagesize = TARGET_PAGE_SIZE;
3370 
3371             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3372                                        param->compbuf, len);
3373             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3374                 error_report("decompress data failed");
3375                 qemu_file_set_error(decomp_file, ret);
3376             }
3377 
3378             qemu_mutex_lock(&decomp_done_lock);
3379             param->done = true;
3380             qemu_cond_signal(&decomp_done_cond);
3381             qemu_mutex_unlock(&decomp_done_lock);
3382 
3383             qemu_mutex_lock(&param->mutex);
3384         } else {
3385             qemu_cond_wait(&param->cond, &param->mutex);
3386         }
3387     }
3388     qemu_mutex_unlock(&param->mutex);
3389 
3390     return NULL;
3391 }
3392 
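/*
 * Hand-off with the feeder (decompress_data_with_multi_threads(), further
 * down in this file), sketched and slightly simplified:
 *
 *   feeder:  param->done = false;                (under decomp_done_lock)
 *            lock(param->mutex);
 *            read the compressed page into param->compbuf;
 *            param->des = host; param->len = len;
 *            signal(param->cond); unlock(param->mutex);
 *
 *   worker:  wakes up, copies des/len, clears param->des, drops
 *            param->mutex, decompresses into des, then sets
 *            param->done = true and signals decomp_done_cond under
 *            decomp_done_lock.
 */
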
3393 static int wait_for_decompress_done(void)
3394 {
3395     int idx, thread_count;
3396 
3397     if (!migrate_use_compression()) {
3398         return 0;
3399     }
3400 
3401     thread_count = migrate_decompress_threads();
3402     qemu_mutex_lock(&decomp_done_lock);
3403     for (idx = 0; idx < thread_count; idx++) {
3404         while (!decomp_param[idx].done) {
3405             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3406         }
3407     }
3408     qemu_mutex_unlock(&decomp_done_lock);
3409     return qemu_file_get_error(decomp_file);
3410 }
3411 
3412 static void compress_threads_load_cleanup(void)
3413 {
3414     int i, thread_count;
3415 
3416     if (!migrate_use_compression()) {
3417         return;
3418     }
3419     thread_count = migrate_decompress_threads();
3420     for (i = 0; i < thread_count; i++) {
3421         /*
3422          * We use compbuf as an indicator of whether the thread was
3423          * properly initialized.
3424          */
3425         if (!decomp_param[i].compbuf) {
3426             break;
3427         }
3428 
3429         qemu_mutex_lock(&decomp_param[i].mutex);
3430         decomp_param[i].quit = true;
3431         qemu_cond_signal(&decomp_param[i].cond);
3432         qemu_mutex_unlock(&decomp_param[i].mutex);
3433     }
3434     for (i = 0; i < thread_count; i++) {
3435         if (!decomp_param[i].compbuf) {
3436             break;
3437         }
3438 
3439         qemu_thread_join(decompress_threads + i);
3440         qemu_mutex_destroy(&decomp_param[i].mutex);
3441         qemu_cond_destroy(&decomp_param[i].cond);
3442         inflateEnd(&decomp_param[i].stream);
3443         g_free(decomp_param[i].compbuf);
3444         decomp_param[i].compbuf = NULL;
3445     }
3446     g_free(decompress_threads);
3447     g_free(decomp_param);
3448     decompress_threads = NULL;
3449     decomp_param = NULL;
3450     decomp_file = NULL;
3451 }
3452 
3453 static int compress_threads_load_setup(QEMUFile *f)
3454 {
3455     int i, thread_count;
3456 
3457     if (!migrate_use_compression()) {
3458         return 0;
3459     }
3460 
3461     thread_count = migrate_decompress_threads();
3462     decompress_threads = g_new0(QemuThread, thread_count);
3463     decomp_param = g_new0(DecompressParam, thread_count);
3464     qemu_mutex_init(&decomp_done_lock);
3465     qemu_cond_init(&decomp_done_cond);
3466     decomp_file = f;
3467     for (i = 0; i < thread_count; i++) {
3468         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3469             goto exit;
3470         }
3471 
3472         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3473         qemu_mutex_init(&decomp_param[i].mutex);
3474         qemu_cond_init(&decomp_param[i].cond);
3475         decomp_param[i].done = true;
3476         decomp_param[i].quit = false;
3477         qemu_thread_create(decompress_threads + i, "decompress",
3478                            do_data_decompress, decomp_param + i,
3479                            QEMU_THREAD_JOINABLE);
3480     }
3481     return 0;
3482 exit:
3483     compress_threads_load_cleanup();
3484     return -1;
3485 }
3486 
3487 static void decompress_data_with_multi_threads(QEMUFile *f,
3488                                                void *host, int len)
3489 {
3490     int idx, thread_count;
3491 
3492     thread_count = migrate_decompress_threads();
3493     QEMU_LOCK_GUARD(&decomp_done_lock);
3494     while (true) {
3495         for (idx = 0; idx < thread_count; idx++) {
3496             if (decomp_param[idx].done) {
3497                 decomp_param[idx].done = false;
3498                 qemu_mutex_lock(&decomp_param[idx].mutex);
3499                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3500                 decomp_param[idx].des = host;
3501                 decomp_param[idx].len = len;
3502                 qemu_cond_signal(&decomp_param[idx].cond);
3503                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3504                 break;
3505             }
3506         }
3507         if (idx < thread_count) {
3508             break;
3509         } else {
3510             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3511         }
3512     }
3513 }
3514 
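/*
 * Note on error handling: a failed decompression does not abort the
 * worker.  When decompress_error_check is set, do_data_decompress()
 * records the error on decomp_file via qemu_file_set_error(), and it
 * only surfaces to the caller when wait_for_decompress_done() returns
 * qemu_file_get_error(decomp_file).
 */
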
3515 static void colo_init_ram_state(void)
3516 {
3517     ram_state_init(&ram_state);
3518 }
3519 
3520 /*
3521  * COLO cache: this is for the secondary VM; we cache the whole
3522  * memory of the secondary VM.  The global lock must be held when
3523  * calling this helper.
3524  */
3525 int colo_init_ram_cache(void)
3526 {
3527     RAMBlock *block;
3528 
3529     WITH_RCU_READ_LOCK_GUARD() {
3530         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3531             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3532                                                     NULL, false, false);
3533             if (!block->colo_cache) {
3534                 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3535                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3536                              block->used_length);
3537                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3538                     if (block->colo_cache) {
3539                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3540                         block->colo_cache = NULL;
3541                     }
3542                 }
3543                 return -errno;
3544             }
3545         }
3546     }
3547 
3548     /*
3549      * Record the dirty pages that were sent by the PVM; we use this dirty
3550      * bitmap to decide which pages in the cache should be flushed into the
3551      * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
3552      */
3553     if (ram_bytes_total()) {
3554         RAMBlock *block;
3555 
3556         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3557             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3558             block->bmap = bitmap_new(pages);
3559         }
3560     }
3561 
3562     colo_init_ram_state();
3563     return 0;
3564 }
3565 
3566 /* TODO: duplicated with ram_init_bitmaps */
3567 void colo_incoming_start_dirty_log(void)
3568 {
3569     RAMBlock *block = NULL;
3570     /* For memory_global_dirty_log_start below. */
3571     qemu_mutex_lock_iothread();
3572     qemu_mutex_lock_ramlist();
3573 
3574     memory_global_dirty_log_sync();
3575     WITH_RCU_READ_LOCK_GUARD() {
3576         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3577             ramblock_sync_dirty_bitmap(ram_state, block);
3578             /* Discard this dirty bitmap record */
3579             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3580         }
3581         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3582     }
3583     ram_state->migration_dirty_pages = 0;
3584     qemu_mutex_unlock_ramlist();
3585     qemu_mutex_unlock_iothread();
3586 }
3587 
3588 /* The global lock must be held when calling this helper */
3589 void colo_release_ram_cache(void)
3590 {
3591     RAMBlock *block;
3592 
3593     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3594     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3595         g_free(block->bmap);
3596         block->bmap = NULL;
3597     }
3598 
3599     WITH_RCU_READ_LOCK_GUARD() {
3600         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3601             if (block->colo_cache) {
3602                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3603                 block->colo_cache = NULL;
3604             }
3605         }
3606     }
3607     ram_state_cleanup(&ram_state);
3608 }
3609 
3610 /**
3611  * ram_load_setup: Setup RAM for migration incoming side
3612  *
3613  * Returns zero to indicate success and negative for error
3614  *
3615  * @f: QEMUFile where to receive the data
3616  * @opaque: RAMState pointer
3617  */
3618 static int ram_load_setup(QEMUFile *f, void *opaque)
3619 {
3620     if (compress_threads_load_setup(f)) {
3621         return -1;
3622     }
3623 
3624     xbzrle_load_setup();
3625     ramblock_recv_map_init();
3626 
3627     return 0;
3628 }
3629 
3630 static int ram_load_cleanup(void *opaque)
3631 {
3632     RAMBlock *rb;
3633 
3634     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3635         qemu_ram_block_writeback(rb);
3636     }
3637 
3638     xbzrle_load_cleanup();
3639     compress_threads_load_cleanup();
3640 
3641     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3642         g_free(rb->receivedmap);
3643         rb->receivedmap = NULL;
3644     }
3645 
3646     return 0;
3647 }
3648 
3649 /**
3650  * ram_postcopy_incoming_init: allocate postcopy data structures
3651  *
3652  * Returns 0 for success and negative if there was one error
3653  *
3654  * @mis: current migration incoming state
3655  *
3656  * Allocate the data structures etc. needed by incoming migration with
3657  * postcopy-ram.  postcopy-ram's similarly named
3658  * postcopy_ram_incoming_init() does the work.
3659  */
3660 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3661 {
3662     return postcopy_ram_incoming_init(mis);
3663 }
3664 
3665 /**
3666  * ram_load_postcopy: load a page in postcopy case
3667  *
3668  * Returns 0 for success or -errno in case of error
3669  *
3670  * Called in postcopy mode by ram_load().
3671  * rcu_read_lock is taken prior to this being called.
3672  *
3673  * @f: QEMUFile to receive the data from
3674  */
3675 static int ram_load_postcopy(QEMUFile *f)
3676 {
3677     int flags = 0, ret = 0;
3678     bool place_needed = false;
3679     bool matches_target_page_size = false;
3680     MigrationIncomingState *mis = migration_incoming_get_current();
3681     /* Temporary page that is later 'placed' */
3682     void *postcopy_host_page = mis->postcopy_tmp_page;
3683     void *host_page = NULL;
3684     bool all_zero = true;
3685     int target_pages = 0;
3686 
3687     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3688         ram_addr_t addr;
3689         void *page_buffer = NULL;
3690         void *place_source = NULL;
3691         RAMBlock *block = NULL;
3692         uint8_t ch;
3693         int len;
3694 
3695         addr = qemu_get_be64(f);
3696 
3697         /*
3698          * If there was a QEMU file error, we should stop here; "addr"
3699          * may be invalid.
3700          */
3701         ret = qemu_file_get_error(f);
3702         if (ret) {
3703             break;
3704         }
3705 
3706         flags = addr & ~TARGET_PAGE_MASK;
3707         addr &= TARGET_PAGE_MASK;
3708 
3709         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3710         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3711                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3712             block = ram_block_from_stream(f, flags);
3713             if (!block) {
3714                 ret = -EINVAL;
3715                 break;
3716             }
3717 
3718             /*
3719              * Relying on used_length is racy and can result in false positives.
3720              * We might place pages beyond used_length in case RAM was shrunk
3721              * while in postcopy, which is fine - trying to place via
3722              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3723              */
3724             if (!block->host || addr >= block->postcopy_length) {
3725                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3726                 ret = -EINVAL;
3727                 break;
3728             }
3729             target_pages++;
3730             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3731             /*
3732              * Postcopy requires that we place whole host pages atomically;
3733              * these may be huge pages for RAMBlocks that are backed by
3734              * hugetlbfs.
3735              * To make it atomic, the data is read into a temporary page
3736              * that's moved into place later.
3737              * The migration protocol uses (possibly smaller) target pages;
3738              * however, the source ensures it always sends all the components
3739              * of a host page in one chunk.
3740              */
3741             page_buffer = postcopy_host_page +
3742                           host_page_offset_from_ram_block_offset(block, addr);
3743             /* If all target pages are zero then we can optimise the place */
3744             if (target_pages == 1) {
3745                 host_page = host_page_from_ram_block_offset(block, addr);
3746             } else if (host_page != host_page_from_ram_block_offset(block,
3747                                                                     addr)) {
3748                 /* not the first target page within the host page */
3749                 error_report("Non-same host page %p/%p", host_page,
3750                              host_page_from_ram_block_offset(block, addr));
3751                 ret = -EINVAL;
3752                 break;
3753             }
3754 
3755             /*
3756              * If it's the last part of a host page then we place the
3757              * host page.
3758              */
3759             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3760                 place_needed = true;
3761             }
3762             place_source = postcopy_host_page;
3763         }
3764 
3765         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3766         case RAM_SAVE_FLAG_ZERO:
3767             ch = qemu_get_byte(f);
3768             /*
3769              * We can skip setting page_buffer when this is a zero page
3770              * and block->page_size == TARGET_PAGE_SIZE.
3771              */
3772             if (ch || !matches_target_page_size) {
3773                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3774             }
3775             if (ch) {
3776                 all_zero = false;
3777             }
3778             break;
3779 
3780         case RAM_SAVE_FLAG_PAGE:
3781             all_zero = false;
3782             if (!matches_target_page_size) {
3783                 /* For huge pages, we always use temporary buffer */
3784                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3785             } else {
3786                 /*
3787                  * For small pages that match the target page size, we
3788                  * avoid the qemu_file copy.  Instead we directly use
3789                  * the buffer of QEMUFile to place the page.  Note: we
3790                  * cannot do any QEMUFile operation before using that
3791                  * buffer to make sure the buffer is valid when
3792                  * placing the page.
3793                  */
3794                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3795                                          TARGET_PAGE_SIZE);
3796             }
3797             break;
3798         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3799             all_zero = false;
3800             len = qemu_get_be32(f);
3801             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3802                 error_report("Invalid compressed data length: %d", len);
3803                 ret = -EINVAL;
3804                 break;
3805             }
3806             decompress_data_with_multi_threads(f, page_buffer, len);
3807             break;
3808 
3809         case RAM_SAVE_FLAG_EOS:
3810             /* normal exit */
3811             multifd_recv_sync_main();
3812             break;
3813         default:
3814             error_report("Unknown combination of migration flags: 0x%x"
3815                          " (postcopy mode)", flags);
3816             ret = -EINVAL;
3817             break;
3818         }
3819 
3820         /* Got the whole host page, wait for decompress before placing. */
3821         if (place_needed) {
3822             ret |= wait_for_decompress_done();
3823         }
3824 
3825         /* Detect any possible file errors */
3826         if (!ret && qemu_file_get_error(f)) {
3827             ret = qemu_file_get_error(f);
3828         }
3829 
3830         if (!ret && place_needed) {
3831             if (all_zero) {
3832                 ret = postcopy_place_page_zero(mis, host_page, block);
3833             } else {
3834                 ret = postcopy_place_page(mis, host_page, place_source,
3835                                           block);
3836             }
3837             place_needed = false;
3838             target_pages = 0;
3839             /* Assume we have a zero page until we detect something different */
3840             all_zero = true;
3841         }
3842     }
3843 
3844     return ret;
3845 }
3846 
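/*
 * Worked example for the host-page assembly above (the numbers are only
 * an illustration): with a hugetlbfs-backed block using 2 MiB host pages
 * and 4 KiB target pages, block->page_size / TARGET_PAGE_SIZE == 512, so
 * place_needed only becomes true on the 512th target page of each host
 * page.  All 512 chunks accumulate in postcopy_host_page and are then
 * placed in one go by postcopy_place_page() (or postcopy_place_page_zero()
 * if every chunk was zero), keeping the userfault placement atomic.
 */
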
3847 static bool postcopy_is_advised(void)
3848 {
3849     PostcopyState ps = postcopy_state_get();
3850     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3851 }
3852 
3853 static bool postcopy_is_running(void)
3854 {
3855     PostcopyState ps = postcopy_state_get();
3856     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3857 }
3858 
3859 /*
3860  * Flush the content of the RAM cache into the SVM's memory.
3861  * Only flush the pages that were dirtied by the PVM, the SVM, or both.
3862  */
3863 void colo_flush_ram_cache(void)
3864 {
3865     RAMBlock *block = NULL;
3866     void *dst_host;
3867     void *src_host;
3868     unsigned long offset = 0;
3869 
3870     memory_global_dirty_log_sync();
3871     qemu_mutex_lock(&ram_state->bitmap_mutex);
3872     WITH_RCU_READ_LOCK_GUARD() {
3873         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3874             ramblock_sync_dirty_bitmap(ram_state, block);
3875         }
3876     }
3877 
3878     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3879     WITH_RCU_READ_LOCK_GUARD() {
3880         block = QLIST_FIRST_RCU(&ram_list.blocks);
3881 
3882         while (block) {
3883             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3884 
3885             if (!offset_in_ramblock(block,
3886                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3887                 offset = 0;
3888                 block = QLIST_NEXT_RCU(block, next);
3889             } else {
3890                 migration_bitmap_clear_dirty(ram_state, block, offset);
3891                 dst_host = block->host
3892                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3893                 src_host = block->colo_cache
3894                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3895                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3896             }
3897         }
3898     }
3899     trace_colo_flush_ram_cache_end();
3900     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3901 }
3902 
3903 /**
3904  * ram_load_precopy: load pages in precopy case
3905  *
3906  * Returns 0 for success or -errno in case of error
3907  *
3908  * Called in precopy mode by ram_load().
3909  * rcu_read_lock is taken prior to this being called.
3910  *
3911  * @f: QEMUFile to receive the data from
3912  */
3913 static int ram_load_precopy(QEMUFile *f)
3914 {
3915     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3916     /* ADVISE comes earlier; it indicates the source has postcopy capability enabled */
3917     bool postcopy_advised = postcopy_is_advised();
3918     if (!migrate_use_compression()) {
3919         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3920     }
3921 
3922     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3923         ram_addr_t addr, total_ram_bytes;
3924         void *host = NULL, *host_bak = NULL;
3925         uint8_t ch;
3926 
3927         /*
3928          * Yield periodically to let the main loop run, but an iteration of
3929          * the main loop is expensive, so only do it every so many iterations.
3930          */
3931         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3932             aio_co_schedule(qemu_get_current_aio_context(),
3933                             qemu_coroutine_self());
3934             qemu_coroutine_yield();
3935         }
3936         i++;
3937 
3938         addr = qemu_get_be64(f);
3939         flags = addr & ~TARGET_PAGE_MASK;
3940         addr &= TARGET_PAGE_MASK;
3941 
3942         if (flags & invalid_flags) {
3943             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3944                 error_report("Received an unexpected compressed page");
3945             }
3946 
3947             ret = -EINVAL;
3948             break;
3949         }
3950 
3951         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3952                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3953             RAMBlock *block = ram_block_from_stream(f, flags);
3954 
3955             host = host_from_ram_block_offset(block, addr);
3956             /*
3957              * After entering the COLO stage we must not load pages into the
3958              * SVM's memory directly; we put them into colo_cache first.
3959              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3960              * Previously we copied all of this memory in the COLO preparation
3961              * stage, which required stopping the VM and was time-consuming.
3962              * Here we optimize it by backing up every page during the
3963              * migration process while COLO is enabled.  Although this slows
3964              * the migration down a little, it clearly reduces the downtime of
3965              * backing up all of the SVM's memory in the COLO preparation stage.
3966              */
3967             if (migration_incoming_colo_enabled()) {
3968                 if (migration_incoming_in_colo_state()) {
3969                     /* In COLO stage, put all pages into cache temporarily */
3970                     host = colo_cache_from_block_offset(block, addr, true);
3971                 } else {
3972                     /*
3973                      * In the migration stage but before the COLO stage,
3974                      * put all pages into both the cache and the SVM's memory.
3975                      */
3976                     host_bak = colo_cache_from_block_offset(block, addr, false);
3977                 }
3978             }
3979             if (!host) {
3980                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3981                 ret = -EINVAL;
3982                 break;
3983             }
3984             if (!migration_incoming_in_colo_state()) {
3985                 ramblock_recv_bitmap_set(block, host);
3986             }
3987 
3988             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3989         }
3990 
3991         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3992         case RAM_SAVE_FLAG_MEM_SIZE:
3993             /* Synchronize RAM block list */
3994             total_ram_bytes = addr;
3995             while (!ret && total_ram_bytes) {
3996                 RAMBlock *block;
3997                 char id[256];
3998                 ram_addr_t length;
3999 
4000                 len = qemu_get_byte(f);
4001                 qemu_get_buffer(f, (uint8_t *)id, len);
4002                 id[len] = 0;
4003                 length = qemu_get_be64(f);
4004 
4005                 block = qemu_ram_block_by_name(id);
4006                 if (block && !qemu_ram_is_migratable(block)) {
4007                     error_report("block %s should not be migrated!", id);
4008                     ret = -EINVAL;
4009                 } else if (block) {
4010                     if (length != block->used_length) {
4011                         Error *local_err = NULL;
4012 
4013                         ret = qemu_ram_resize(block, length,
4014                                               &local_err);
4015                         if (local_err) {
4016                             error_report_err(local_err);
4017                         }
4018                     }
4019                     /* For postcopy we need to check hugepage sizes match */
4020                     if (postcopy_advised && migrate_postcopy_ram() &&
4021                         block->page_size != qemu_host_page_size) {
4022                         uint64_t remote_page_size = qemu_get_be64(f);
4023                         if (remote_page_size != block->page_size) {
4024                             error_report("Mismatched RAM page size %s "
4025                                          "(local) %zd != %" PRId64,
4026                                          id, block->page_size,
4027                                          remote_page_size);
4028                             ret = -EINVAL;
4029                         }
4030                     }
4031                     if (migrate_ignore_shared()) {
4032                         hwaddr addr = qemu_get_be64(f);
4033                         if (ramblock_is_ignored(block) &&
4034                             block->mr->addr != addr) {
4035                             error_report("Mismatched GPAs for block %s "
4036                                          "%" PRId64 " != %" PRId64,
4037                                          id, (uint64_t)addr,
4038                                          (uint64_t)block->mr->addr);
4039                             ret = -EINVAL;
4040                         }
4041                     }
4042                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4043                                           block->idstr);
4044                 } else {
4045                     error_report("Unknown ramblock \"%s\", cannot "
4046                                  "accept migration", id);
4047                     ret = -EINVAL;
4048                 }
4049 
4050                 total_ram_bytes -= length;
4051             }
4052             break;
4053 
4054         case RAM_SAVE_FLAG_ZERO:
4055             ch = qemu_get_byte(f);
4056             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4057             break;
4058 
4059         case RAM_SAVE_FLAG_PAGE:
4060             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4061             break;
4062 
4063         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4064             len = qemu_get_be32(f);
4065             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4066                 error_report("Invalid compressed data length: %d", len);
4067                 ret = -EINVAL;
4068                 break;
4069             }
4070             decompress_data_with_multi_threads(f, host, len);
4071             break;
4072 
4073         case RAM_SAVE_FLAG_XBZRLE:
4074             if (load_xbzrle(f, addr, host) < 0) {
4075                 error_report("Failed to decompress XBZRLE page at "
4076                              RAM_ADDR_FMT, addr);
4077                 ret = -EINVAL;
4078                 break;
4079             }
4080             break;
4081         case RAM_SAVE_FLAG_EOS:
4082             /* normal exit */
4083             multifd_recv_sync_main();
4084             break;
4085         default:
4086             if (flags & RAM_SAVE_FLAG_HOOK) {
4087                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4088             } else {
4089                 error_report("Unknown combination of migration flags: 0x%x",
4090                              flags);
4091                 ret = -EINVAL;
4092             }
4093         }
4094         if (!ret) {
4095             ret = qemu_file_get_error(f);
4096         }
4097         if (!ret && host_bak) {
4098             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4099         }
4100     }
4101 
4102     ret |= wait_for_decompress_done();
4103     return ret;
4104 }
4105 
4106 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4107 {
4108     int ret = 0;
4109     static uint64_t seq_iter;
4110     /*
4111      * If the system is running in postcopy mode, page inserts into host
4112      * memory must be atomic.
4113      */
4114     bool postcopy_running = postcopy_is_running();
4115 
4116     seq_iter++;
4117 
4118     if (version_id != 4) {
4119         return -EINVAL;
4120     }
4121 
4122     /*
4123      * This RCU critical section can be very long running.
4124      * When RCU reclaim operations in the code become numerous,
4125      * it will be necessary to reduce the granularity of this
4126      * critical section.
4127      */
4128     WITH_RCU_READ_LOCK_GUARD() {
4129         if (postcopy_running) {
4130             ret = ram_load_postcopy(f);
4131         } else {
4132             ret = ram_load_precopy(f);
4133         }
4134     }
4135     trace_ram_load_complete(ret, seq_iter);
4136 
4137     return ret;
4138 }
4139 
4140 static bool ram_has_postcopy(void *opaque)
4141 {
4142     RAMBlock *rb;
4143     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4144         if (ramblock_is_pmem(rb)) {
4145             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4146                         "is not supported now!", rb->idstr, rb->host);
4147             return false;
4148         }
4149     }
4150 
4151     return migrate_postcopy_ram();
4152 }
4153 
4154 /* Sync all the dirty bitmap with destination VM.  */
4155 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4156 {
4157     RAMBlock *block;
4158     QEMUFile *file = s->to_dst_file;
4159     int ramblock_count = 0;
4160 
4161     trace_ram_dirty_bitmap_sync_start();
4162 
4163     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4164         qemu_savevm_send_recv_bitmap(file, block->idstr);
4165         trace_ram_dirty_bitmap_request(block->idstr);
4166         ramblock_count++;
4167     }
4168 
4169     trace_ram_dirty_bitmap_sync_wait();
4170 
4171     /* Wait until all the ramblocks' dirty bitmaps are synced */
4172     while (ramblock_count--) {
4173         qemu_sem_wait(&s->rp_state.rp_sem);
4174     }
4175 
4176     trace_ram_dirty_bitmap_sync_complete();
4177 
4178     return 0;
4179 }
4180 
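/*
 * The wait above pairs with ram_dirty_bitmap_reload_notify() below: for
 * every block we request a bitmap with qemu_savevm_send_recv_bitmap(),
 * and each bitmap that the return-path thread finishes loading in
 * ram_dirty_bitmap_reload() posts rp_sem exactly once.
 */
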
4181 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4182 {
4183     qemu_sem_post(&s->rp_state.rp_sem);
4184 }
4185 
4186 /*
4187  * Read the received bitmap and invert it to form the initial dirty bitmap.
4188  * This is only used when a postcopy migration is paused and we want to
4189  * resume it from a middle point.
4190  */
4191 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4192 {
4193     int ret = -EINVAL;
4194     /* from_dst_file is always valid because we're within rp_thread */
4195     QEMUFile *file = s->rp_state.from_dst_file;
4196     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4197     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4198     uint64_t size, end_mark;
4199 
4200     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4201 
4202     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4203         error_report("%s: incorrect state %s", __func__,
4204                      MigrationStatus_str(s->state));
4205         return -EINVAL;
4206     }
4207 
4208     /*
4209      * Note: see comments in ramblock_recv_bitmap_send() on why we
4210      * need the endianness conversion, and the padding.
4211      */
4212     local_size = ROUND_UP(local_size, 8);
4213 
4214     /* Add paddings */
4215     /* Add padding */
4216 
4217     size = qemu_get_be64(file);
4218 
4219     /* The size of the bitmap should match our ramblock's */
4220     if (size != local_size) {
4221         error_report("%s: ramblock '%s' bitmap size mismatch "
4222                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4223                      block->idstr, size, local_size);
4224         ret = -EINVAL;
4225         goto out;
4226     }
4227 
4228     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4229     end_mark = qemu_get_be64(file);
4230 
4231     ret = qemu_file_get_error(file);
4232     if (ret || size != local_size) {
4233         error_report("%s: read bitmap failed for ramblock '%s': %d"
4234                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4235                      __func__, block->idstr, ret, local_size, size);
4236         ret = -EIO;
4237         goto out;
4238     }
4239 
4240     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4241         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4242                      __func__, block->idstr, end_mark);
4243         ret = -EINVAL;
4244         goto out;
4245     }
4246 
4247     /*
4248      * Endianness conversion. We are during postcopy (though paused).
4249      * The dirty bitmap won't change. We can directly modify it.
4250      */
4251     bitmap_from_le(block->bmap, le_bitmap, nbits);
4252 
4253     /*
4254      * What we received is the "received bitmap".  Invert it to form the
4255      * initial dirty bitmap for this ramblock.
4256      */
4257     bitmap_complement(block->bmap, block->bmap, nbits);
4258 
4259     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4260     ramblock_dirty_bitmap_clear_discarded_pages(block);
4261 
4262     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4263     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4264 
4265     /*
4266      * We succeeded in syncing the bitmap for the current ramblock.  If this
4267      * is the last one to sync, we need to notify the main send thread.
4268      */
4269     ram_dirty_bitmap_reload_notify(s);
4270 
4271     ret = 0;
4272 out:
4273     g_free(le_bitmap);
4274     return ret;
4275 }
4276 
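/*
 * For reference, the layout that ram_dirty_bitmap_reload() expects on the
 * return path (reconstructed from the reads above; see
 * ramblock_recv_bitmap_send() for the sending side):
 *
 *     uint64_t size;          // big endian, bitmap bytes, multiple of 8
 *     uint8_t  bitmap[size];  // little-endian long layout, padded
 *     uint64_t end_mark;      // big endian, RAMBLOCK_RECV_BITMAP_ENDING
 *
 * The received bits mark pages that already arrived on the destination,
 * so bitmap_complement() turns them into the pages still to be sent.
 */
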
4277 static int ram_resume_prepare(MigrationState *s, void *opaque)
4278 {
4279     RAMState *rs = *(RAMState **)opaque;
4280     int ret;
4281 
4282     ret = ram_dirty_bitmap_sync_all(s, rs);
4283     if (ret) {
4284         return ret;
4285     }
4286 
4287     ram_state_resume_prepare(rs, s->to_dst_file);
4288 
4289     return 0;
4290 }
4291 
4292 static SaveVMHandlers savevm_ram_handlers = {
4293     .save_setup = ram_save_setup,
4294     .save_live_iterate = ram_save_iterate,
4295     .save_live_complete_postcopy = ram_save_complete,
4296     .save_live_complete_precopy = ram_save_complete,
4297     .has_postcopy = ram_has_postcopy,
4298     .save_live_pending = ram_save_pending,
4299     .load_state = ram_load,
4300     .save_cleanup = ram_save_cleanup,
4301     .load_setup = ram_load_setup,
4302     .load_cleanup = ram_load_cleanup,
4303     .resume_prepare = ram_resume_prepare,
4304 };
4305 
4306 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4307                                       size_t old_size, size_t new_size)
4308 {
4309     PostcopyState ps = postcopy_state_get();
4310     ram_addr_t offset;
4311     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4312     Error *err = NULL;
4313 
4314     if (ramblock_is_ignored(rb)) {
4315         return;
4316     }
4317 
4318     if (!migration_is_idle()) {
4319         /*
4320          * Precopy code on the source cannot deal with the size of RAM blocks
4321          * changing at random points in time - especially after sending the
4322          * RAM block sizes in the migration stream, they must no longer change.
4323          * Abort and indicate a proper reason.
4324          */
4325         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4326         migrate_set_error(migrate_get_current(), err);
4327         error_free(err);
4328         migration_cancel();
4329     }
4330 
4331     switch (ps) {
4332     case POSTCOPY_INCOMING_ADVISE:
4333         /*
4334          * Update what ram_postcopy_incoming_init()->init_range() does at the
4335          * time postcopy was advised. Syncing RAM blocks with the source will
4336          * result in RAM resizes.
4337          */
4338         if (old_size < new_size) {
4339             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4340                 error_report("RAM block '%s' discard of resized RAM failed",
4341                              rb->idstr);
4342             }
4343         }
4344         rb->postcopy_length = new_size;
4345         break;
4346     case POSTCOPY_INCOMING_NONE:
4347     case POSTCOPY_INCOMING_RUNNING:
4348     case POSTCOPY_INCOMING_END:
4349         /*
4350          * Once our guest is running, postcopy no longer cares about
4351          * resizes.  When growing, the new memory was not available on the
4352          * source, so no handler is needed.
4353          */
4354         break;
4355     default:
4356         error_report("RAM block '%s' resized during postcopy state: %d",
4357                      rb->idstr, ps);
4358         exit(-1);
4359     }
4360 }
4361 
4362 static RAMBlockNotifier ram_mig_ram_notifier = {
4363     .ram_block_resized = ram_mig_ram_block_resized,
4364 };
4365 
4366 void ram_mig_init(void)
4367 {
4368     qemu_mutex_init(&XBZRLE.lock);
4369     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4370     ram_block_notifier_add(&ram_mig_ram_notifier);
4371 }
4372