xref: /openbmc/qemu/migration/ram.c (revision 927f93e0)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
62 
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
66 
67 /***********************************************************/
68 /* ram save/restore */
69 
70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
71  * worked for pages that where filled with the same char.  We switched
72  * it to only search for the zero value.  And to avoid confusion with
73  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
74  */
75 
76 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
77 #define RAM_SAVE_FLAG_ZERO     0x02
78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
79 #define RAM_SAVE_FLAG_PAGE     0x08
80 #define RAM_SAVE_FLAG_EOS      0x10
81 #define RAM_SAVE_FLAG_CONTINUE 0x20
82 #define RAM_SAVE_FLAG_XBZRLE   0x40
83 /* 0x80 is reserved in migration.h start with 0x100 next */
84 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
85 
86 XBZRLECacheStats xbzrle_counters;
87 
88 /* struct contains XBZRLE cache and a static page
89    used by the compression */
90 static struct {
91     /* buffer used for XBZRLE encoding */
92     uint8_t *encoded_buf;
93     /* buffer for storing page content */
94     uint8_t *current_buf;
95     /* Cache for XBZRLE, Protected by lock. */
96     PageCache *cache;
97     QemuMutex lock;
98     /* it will store a page full of zeros */
99     uint8_t *zero_target_page;
100     /* buffer used for XBZRLE decoding */
101     uint8_t *decoded_buf;
102 } XBZRLE;
103 
104 static void XBZRLE_cache_lock(void)
105 {
106     if (migrate_use_xbzrle()) {
107         qemu_mutex_lock(&XBZRLE.lock);
108     }
109 }
110 
111 static void XBZRLE_cache_unlock(void)
112 {
113     if (migrate_use_xbzrle()) {
114         qemu_mutex_unlock(&XBZRLE.lock);
115     }
116 }
117 
118 /**
119  * xbzrle_cache_resize: resize the xbzrle cache
120  *
121  * This function is called from migrate_params_apply in main
122  * thread, possibly while a migration is in progress.  A running
123  * migration may be using the cache and might finish during this call,
124  * hence changes to the cache are protected by XBZRLE.lock().
125  *
126  * Returns 0 for success or -1 for error
127  *
128  * @new_size: new cache size
129  * @errp: set *errp if the check failed, with reason
130  */
131 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
132 {
133     PageCache *new_cache;
134     int64_t ret = 0;
135 
136     /* Check for truncation */
137     if (new_size != (size_t)new_size) {
138         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
139                    "exceeding address space");
140         return -1;
141     }
142 
143     if (new_size == migrate_xbzrle_cache_size()) {
144         /* nothing to do */
145         return 0;
146     }
147 
148     XBZRLE_cache_lock();
149 
150     if (XBZRLE.cache != NULL) {
151         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
152         if (!new_cache) {
153             ret = -1;
154             goto out;
155         }
156 
157         cache_fini(XBZRLE.cache);
158         XBZRLE.cache = new_cache;
159     }
160 out:
161     XBZRLE_cache_unlock();
162     return ret;
163 }
164 
165 bool ramblock_is_ignored(RAMBlock *block)
166 {
167     return !qemu_ram_is_migratable(block) ||
168            (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 }
170 
171 #undef RAMBLOCK_FOREACH
172 
173 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
174 {
175     RAMBlock *block;
176     int ret = 0;
177 
178     RCU_READ_LOCK_GUARD();
179 
180     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
181         ret = func(block, opaque);
182         if (ret) {
183             break;
184         }
185     }
186     return ret;
187 }
188 
189 static void ramblock_recv_map_init(void)
190 {
191     RAMBlock *rb;
192 
193     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
194         assert(!rb->receivedmap);
195         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
196     }
197 }
198 
199 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
200 {
201     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
202                     rb->receivedmap);
203 }
204 
205 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
206 {
207     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 }
209 
210 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
211 {
212     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 }
214 
215 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
216                                     size_t nr)
217 {
218     bitmap_set_atomic(rb->receivedmap,
219                       ramblock_recv_bitmap_offset(host_addr, rb),
220                       nr);
221 }
222 
223 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
224 
225 /*
226  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
227  *
228  * Returns >0 if success with sent bytes, or <0 if error.
229  */
230 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
231                                   const char *block_name)
232 {
233     RAMBlock *block = qemu_ram_block_by_name(block_name);
234     unsigned long *le_bitmap, nbits;
235     uint64_t size;
236 
237     if (!block) {
238         error_report("%s: invalid block name: %s", __func__, block_name);
239         return -1;
240     }
241 
242     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243 
244     /*
245      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
246      * machines we may need 4 more bytes for padding (see below
247      * comment). So extend it a bit before hand.
248      */
249     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 
251     /*
252      * Always use little endian when sending the bitmap. This is
253      * required that when source and destination VMs are not using the
254      * same endianness. (Note: big endian won't work.)
255      */
256     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
257 
258     /* Size of the bitmap, in bytes */
259     size = DIV_ROUND_UP(nbits, 8);
260 
261     /*
262      * size is always aligned to 8 bytes for 64bit machines, but it
263      * may not be true for 32bit machines. We need this padding to
264      * make sure the migration can survive even between 32bit and
265      * 64bit machines.
266      */
267     size = ROUND_UP(size, 8);
268 
269     qemu_put_be64(file, size);
270     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
271     /*
272      * Mark as an end, in case the middle part is screwed up due to
273      * some "mysterious" reason.
274      */
275     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
276     qemu_fflush(file);
277 
278     g_free(le_bitmap);
279 
280     if (qemu_file_get_error(file)) {
281         return qemu_file_get_error(file);
282     }
283 
284     return size + sizeof(size);
285 }
286 
287 /*
288  * An outstanding page request, on the source, having been received
289  * and queued
290  */
291 struct RAMSrcPageRequest {
292     RAMBlock *rb;
293     hwaddr    offset;
294     hwaddr    len;
295 
296     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 };
298 
299 typedef struct {
300     /*
301      * Cached ramblock/offset values if preempted.  They're only meaningful if
302      * preempted==true below.
303      */
304     RAMBlock *ram_block;
305     unsigned long ram_page;
306     /*
307      * Whether a postcopy preemption just happened.  Will be reset after
308      * precopy recovered to background migration.
309      */
310     bool preempted;
311 } PostcopyPreemptState;
312 
313 /* State of RAM for migration */
314 struct RAMState {
315     /* QEMUFile used for this migration */
316     QEMUFile *f;
317     /* UFFD file descriptor, used in 'write-tracking' migration */
318     int uffdio_fd;
319     /* Last block that we have visited searching for dirty pages */
320     RAMBlock *last_seen_block;
321     /* Last block from where we have sent data */
322     RAMBlock *last_sent_block;
323     /* Last dirty target page we have sent */
324     ram_addr_t last_page;
325     /* last ram version we have seen */
326     uint32_t last_version;
327     /* How many times we have dirty too many pages */
328     int dirty_rate_high_cnt;
329     /* these variables are used for bitmap sync */
330     /* last time we did a full bitmap_sync */
331     int64_t time_last_bitmap_sync;
332     /* bytes transferred at start_time */
333     uint64_t bytes_xfer_prev;
334     /* number of dirty pages since start_time */
335     uint64_t num_dirty_pages_period;
336     /* xbzrle misses since the beginning of the period */
337     uint64_t xbzrle_cache_miss_prev;
338     /* Amount of xbzrle pages since the beginning of the period */
339     uint64_t xbzrle_pages_prev;
340     /* Amount of xbzrle encoded bytes since the beginning of the period */
341     uint64_t xbzrle_bytes_prev;
342     /* Start using XBZRLE (e.g., after the first round). */
343     bool xbzrle_enabled;
344     /* Are we on the last stage of migration */
345     bool last_stage;
346     /* compression statistics since the beginning of the period */
347     /* amount of count that no free thread to compress data */
348     uint64_t compress_thread_busy_prev;
349     /* amount bytes after compression */
350     uint64_t compressed_size_prev;
351     /* amount of compressed pages */
352     uint64_t compress_pages_prev;
353 
354     /* total handled target pages at the beginning of period */
355     uint64_t target_page_count_prev;
356     /* total handled target pages since start */
357     uint64_t target_page_count;
358     /* number of dirty bits in the bitmap */
359     uint64_t migration_dirty_pages;
360     /* Protects modification of the bitmap and migration dirty pages */
361     QemuMutex bitmap_mutex;
362     /* The RAMBlock used in the last src_page_requests */
363     RAMBlock *last_req_rb;
364     /* Queue of outstanding page requests from the destination */
365     QemuMutex src_page_req_mutex;
366     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
367 
368     /* Postcopy preemption informations */
369     PostcopyPreemptState postcopy_preempt_state;
370     /*
371      * Current channel we're using on src VM.  Only valid if postcopy-preempt
372      * is enabled.
373      */
374     unsigned int postcopy_channel;
375 };
376 typedef struct RAMState RAMState;
377 
378 static RAMState *ram_state;
379 
380 static NotifierWithReturnList precopy_notifier_list;
381 
382 static void postcopy_preempt_reset(RAMState *rs)
383 {
384     memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
385 }
386 
387 /* Whether postcopy has queued requests? */
388 static bool postcopy_has_request(RAMState *rs)
389 {
390     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
391 }
392 
393 void precopy_infrastructure_init(void)
394 {
395     notifier_with_return_list_init(&precopy_notifier_list);
396 }
397 
398 void precopy_add_notifier(NotifierWithReturn *n)
399 {
400     notifier_with_return_list_add(&precopy_notifier_list, n);
401 }
402 
403 void precopy_remove_notifier(NotifierWithReturn *n)
404 {
405     notifier_with_return_remove(n);
406 }
407 
408 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
409 {
410     PrecopyNotifyData pnd;
411     pnd.reason = reason;
412     pnd.errp = errp;
413 
414     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
415 }
416 
417 uint64_t ram_bytes_remaining(void)
418 {
419     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
420                        0;
421 }
422 
423 MigrationStats ram_counters;
424 
425 static void ram_transferred_add(uint64_t bytes)
426 {
427     if (runstate_is_running()) {
428         ram_counters.precopy_bytes += bytes;
429     } else if (migration_in_postcopy()) {
430         ram_counters.postcopy_bytes += bytes;
431     } else {
432         ram_counters.downtime_bytes += bytes;
433     }
434     ram_counters.transferred += bytes;
435 }
436 
437 /* used by the search for pages to send */
438 struct PageSearchStatus {
439     /* Current block being searched */
440     RAMBlock    *block;
441     /* Current page to search from */
442     unsigned long page;
443     /* Set once we wrap around */
444     bool         complete_round;
445     /*
446      * [POSTCOPY-ONLY] Whether current page is explicitly requested by
447      * postcopy.  When set, the request is "urgent" because the dest QEMU
448      * threads are waiting for us.
449      */
450     bool         postcopy_requested;
451     /*
452      * [POSTCOPY-ONLY] The target channel to use to send current page.
453      *
454      * Note: This may _not_ match with the value in postcopy_requested
455      * above. Let's imagine the case where the postcopy request is exactly
456      * the page that we're sending in progress during precopy. In this case
457      * we'll have postcopy_requested set to true but the target channel
458      * will be the precopy channel (so that we don't split brain on that
459      * specific page since the precopy channel already contains partial of
460      * that page data).
461      *
462      * Besides that specific use case, postcopy_target_channel should
463      * always be equal to postcopy_requested, because by default we send
464      * postcopy pages via postcopy preempt channel.
465      */
466     bool         postcopy_target_channel;
467 };
468 typedef struct PageSearchStatus PageSearchStatus;
469 
470 CompressionStats compression_counters;
471 
472 struct CompressParam {
473     bool done;
474     bool quit;
475     bool zero_page;
476     QEMUFile *file;
477     QemuMutex mutex;
478     QemuCond cond;
479     RAMBlock *block;
480     ram_addr_t offset;
481 
482     /* internally used fields */
483     z_stream stream;
484     uint8_t *originbuf;
485 };
486 typedef struct CompressParam CompressParam;
487 
488 struct DecompressParam {
489     bool done;
490     bool quit;
491     QemuMutex mutex;
492     QemuCond cond;
493     void *des;
494     uint8_t *compbuf;
495     int len;
496     z_stream stream;
497 };
498 typedef struct DecompressParam DecompressParam;
499 
500 static CompressParam *comp_param;
501 static QemuThread *compress_threads;
502 /* comp_done_cond is used to wake up the migration thread when
503  * one of the compression threads has finished the compression.
504  * comp_done_lock is used to co-work with comp_done_cond.
505  */
506 static QemuMutex comp_done_lock;
507 static QemuCond comp_done_cond;
508 
509 static QEMUFile *decomp_file;
510 static DecompressParam *decomp_param;
511 static QemuThread *decompress_threads;
512 static QemuMutex decomp_done_lock;
513 static QemuCond decomp_done_cond;
514 
515 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
516                                  ram_addr_t offset, uint8_t *source_buf);
517 
518 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
519                                      bool postcopy_requested);
520 
521 static void *do_data_compress(void *opaque)
522 {
523     CompressParam *param = opaque;
524     RAMBlock *block;
525     ram_addr_t offset;
526     bool zero_page;
527 
528     qemu_mutex_lock(&param->mutex);
529     while (!param->quit) {
530         if (param->block) {
531             block = param->block;
532             offset = param->offset;
533             param->block = NULL;
534             qemu_mutex_unlock(&param->mutex);
535 
536             zero_page = do_compress_ram_page(param->file, &param->stream,
537                                              block, offset, param->originbuf);
538 
539             qemu_mutex_lock(&comp_done_lock);
540             param->done = true;
541             param->zero_page = zero_page;
542             qemu_cond_signal(&comp_done_cond);
543             qemu_mutex_unlock(&comp_done_lock);
544 
545             qemu_mutex_lock(&param->mutex);
546         } else {
547             qemu_cond_wait(&param->cond, &param->mutex);
548         }
549     }
550     qemu_mutex_unlock(&param->mutex);
551 
552     return NULL;
553 }
554 
555 static void compress_threads_save_cleanup(void)
556 {
557     int i, thread_count;
558 
559     if (!migrate_use_compression() || !comp_param) {
560         return;
561     }
562 
563     thread_count = migrate_compress_threads();
564     for (i = 0; i < thread_count; i++) {
565         /*
566          * we use it as a indicator which shows if the thread is
567          * properly init'd or not
568          */
569         if (!comp_param[i].file) {
570             break;
571         }
572 
573         qemu_mutex_lock(&comp_param[i].mutex);
574         comp_param[i].quit = true;
575         qemu_cond_signal(&comp_param[i].cond);
576         qemu_mutex_unlock(&comp_param[i].mutex);
577 
578         qemu_thread_join(compress_threads + i);
579         qemu_mutex_destroy(&comp_param[i].mutex);
580         qemu_cond_destroy(&comp_param[i].cond);
581         deflateEnd(&comp_param[i].stream);
582         g_free(comp_param[i].originbuf);
583         qemu_fclose(comp_param[i].file);
584         comp_param[i].file = NULL;
585     }
586     qemu_mutex_destroy(&comp_done_lock);
587     qemu_cond_destroy(&comp_done_cond);
588     g_free(compress_threads);
589     g_free(comp_param);
590     compress_threads = NULL;
591     comp_param = NULL;
592 }
593 
594 static int compress_threads_save_setup(void)
595 {
596     int i, thread_count;
597 
598     if (!migrate_use_compression()) {
599         return 0;
600     }
601     thread_count = migrate_compress_threads();
602     compress_threads = g_new0(QemuThread, thread_count);
603     comp_param = g_new0(CompressParam, thread_count);
604     qemu_cond_init(&comp_done_cond);
605     qemu_mutex_init(&comp_done_lock);
606     for (i = 0; i < thread_count; i++) {
607         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
608         if (!comp_param[i].originbuf) {
609             goto exit;
610         }
611 
612         if (deflateInit(&comp_param[i].stream,
613                         migrate_compress_level()) != Z_OK) {
614             g_free(comp_param[i].originbuf);
615             goto exit;
616         }
617 
618         /* comp_param[i].file is just used as a dummy buffer to save data,
619          * set its ops to empty.
620          */
621         comp_param[i].file = qemu_file_new_output(
622             QIO_CHANNEL(qio_channel_null_new()));
623         comp_param[i].done = true;
624         comp_param[i].quit = false;
625         qemu_mutex_init(&comp_param[i].mutex);
626         qemu_cond_init(&comp_param[i].cond);
627         qemu_thread_create(compress_threads + i, "compress",
628                            do_data_compress, comp_param + i,
629                            QEMU_THREAD_JOINABLE);
630     }
631     return 0;
632 
633 exit:
634     compress_threads_save_cleanup();
635     return -1;
636 }
637 
638 /**
639  * save_page_header: write page header to wire
640  *
641  * If this is the 1st block, it also writes the block identification
642  *
643  * Returns the number of bytes written
644  *
645  * @f: QEMUFile where to send the data
646  * @block: block that contains the page we want to send
647  * @offset: offset inside the block for the page
648  *          in the lower bits, it contains flags
649  */
650 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
651                                ram_addr_t offset)
652 {
653     size_t size, len;
654 
655     if (block == rs->last_sent_block) {
656         offset |= RAM_SAVE_FLAG_CONTINUE;
657     }
658     qemu_put_be64(f, offset);
659     size = 8;
660 
661     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
662         len = strlen(block->idstr);
663         qemu_put_byte(f, len);
664         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
665         size += 1 + len;
666         rs->last_sent_block = block;
667     }
668     return size;
669 }
670 
671 /**
672  * mig_throttle_guest_down: throttle down the guest
673  *
674  * Reduce amount of guest cpu execution to hopefully slow down memory
675  * writes. If guest dirty memory rate is reduced below the rate at
676  * which we can transfer pages to the destination then we should be
677  * able to complete migration. Some workloads dirty memory way too
678  * fast and will not effectively converge, even with auto-converge.
679  */
680 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
681                                     uint64_t bytes_dirty_threshold)
682 {
683     MigrationState *s = migrate_get_current();
684     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
685     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
686     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
687     int pct_max = s->parameters.max_cpu_throttle;
688 
689     uint64_t throttle_now = cpu_throttle_get_percentage();
690     uint64_t cpu_now, cpu_ideal, throttle_inc;
691 
692     /* We have not started throttling yet. Let's start it. */
693     if (!cpu_throttle_active()) {
694         cpu_throttle_set(pct_initial);
695     } else {
696         /* Throttling already on, just increase the rate */
697         if (!pct_tailslow) {
698             throttle_inc = pct_increment;
699         } else {
700             /* Compute the ideal CPU percentage used by Guest, which may
701              * make the dirty rate match the dirty rate threshold. */
702             cpu_now = 100 - throttle_now;
703             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
704                         bytes_dirty_period);
705             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
706         }
707         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
708     }
709 }
710 
711 void mig_throttle_counter_reset(void)
712 {
713     RAMState *rs = ram_state;
714 
715     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
716     rs->num_dirty_pages_period = 0;
717     rs->bytes_xfer_prev = ram_counters.transferred;
718 }
719 
720 /**
721  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
722  *
723  * @rs: current RAM state
724  * @current_addr: address for the zero page
725  *
726  * Update the xbzrle cache to reflect a page that's been sent as all 0.
727  * The important thing is that a stale (not-yet-0'd) page be replaced
728  * by the new data.
729  * As a bonus, if the page wasn't in the cache it gets added so that
730  * when a small write is made into the 0'd page it gets XBZRLE sent.
731  */
732 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
733 {
734     if (!rs->xbzrle_enabled) {
735         return;
736     }
737 
738     /* We don't care if this fails to allocate a new cache page
739      * as long as it updated an old one */
740     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
741                  ram_counters.dirty_sync_count);
742 }
743 
744 #define ENCODING_FLAG_XBZRLE 0x1
745 
746 /**
747  * save_xbzrle_page: compress and send current page
748  *
749  * Returns: 1 means that we wrote the page
750  *          0 means that page is identical to the one already sent
751  *          -1 means that xbzrle would be longer than normal
752  *
753  * @rs: current RAM state
754  * @current_data: pointer to the address of the page contents
755  * @current_addr: addr of the page
756  * @block: block that contains the page we want to send
757  * @offset: offset inside the block for the page
758  */
759 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
760                             ram_addr_t current_addr, RAMBlock *block,
761                             ram_addr_t offset)
762 {
763     int encoded_len = 0, bytes_xbzrle;
764     uint8_t *prev_cached_page;
765 
766     if (!cache_is_cached(XBZRLE.cache, current_addr,
767                          ram_counters.dirty_sync_count)) {
768         xbzrle_counters.cache_miss++;
769         if (!rs->last_stage) {
770             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
771                              ram_counters.dirty_sync_count) == -1) {
772                 return -1;
773             } else {
774                 /* update *current_data when the page has been
775                    inserted into cache */
776                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
777             }
778         }
779         return -1;
780     }
781 
782     /*
783      * Reaching here means the page has hit the xbzrle cache, no matter what
784      * encoding result it is (normal encoding, overflow or skipping the page),
785      * count the page as encoded. This is used to calculate the encoding rate.
786      *
787      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
788      * 2nd page turns out to be skipped (i.e. no new bytes written to the
789      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
790      * skipped page included. In this way, the encoding rate can tell if the
791      * guest page is good for xbzrle encoding.
792      */
793     xbzrle_counters.pages++;
794     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
795 
796     /* save current buffer into memory */
797     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
798 
799     /* XBZRLE encoding (if there is no overflow) */
800     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
801                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
802                                        TARGET_PAGE_SIZE);
803 
804     /*
805      * Update the cache contents, so that it corresponds to the data
806      * sent, in all cases except where we skip the page.
807      */
808     if (!rs->last_stage && encoded_len != 0) {
809         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
810         /*
811          * In the case where we couldn't compress, ensure that the caller
812          * sends the data from the cache, since the guest might have
813          * changed the RAM since we copied it.
814          */
815         *current_data = prev_cached_page;
816     }
817 
818     if (encoded_len == 0) {
819         trace_save_xbzrle_page_skipping();
820         return 0;
821     } else if (encoded_len == -1) {
822         trace_save_xbzrle_page_overflow();
823         xbzrle_counters.overflow++;
824         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
825         return -1;
826     }
827 
828     /* Send XBZRLE based compressed page */
829     bytes_xbzrle = save_page_header(rs, rs->f, block,
830                                     offset | RAM_SAVE_FLAG_XBZRLE);
831     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
832     qemu_put_be16(rs->f, encoded_len);
833     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
834     bytes_xbzrle += encoded_len + 1 + 2;
835     /*
836      * Like compressed_size (please see update_compress_thread_counts),
837      * the xbzrle encoded bytes don't count the 8 byte header with
838      * RAM_SAVE_FLAG_CONTINUE.
839      */
840     xbzrle_counters.bytes += bytes_xbzrle - 8;
841     ram_transferred_add(bytes_xbzrle);
842 
843     return 1;
844 }
845 
846 /**
847  * migration_bitmap_find_dirty: find the next dirty page from start
848  *
849  * Returns the page offset within memory region of the start of a dirty page
850  *
851  * @rs: current RAM state
852  * @rb: RAMBlock where to search for dirty pages
853  * @start: page where we start the search
854  */
855 static inline
856 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
857                                           unsigned long start)
858 {
859     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
860     unsigned long *bitmap = rb->bmap;
861 
862     if (ramblock_is_ignored(rb)) {
863         return size;
864     }
865 
866     return find_next_bit(bitmap, size, start);
867 }
868 
869 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
870                                                        unsigned long page)
871 {
872     uint8_t shift;
873     hwaddr size, start;
874 
875     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
876         return;
877     }
878 
879     shift = rb->clear_bmap_shift;
880     /*
881      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
882      * can make things easier sometimes since then start address
883      * of the small chunk will always be 64 pages aligned so the
884      * bitmap will always be aligned to unsigned long. We should
885      * even be able to remove this restriction but I'm simply
886      * keeping it.
887      */
888     assert(shift >= 6);
889 
890     size = 1ULL << (TARGET_PAGE_BITS + shift);
891     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
892     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
893     memory_region_clear_dirty_bitmap(rb->mr, start, size);
894 }
895 
896 static void
897 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
898                                                  unsigned long start,
899                                                  unsigned long npages)
900 {
901     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
902     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
903     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
904 
905     /*
906      * Clear pages from start to start + npages - 1, so the end boundary is
907      * exclusive.
908      */
909     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
910         migration_clear_memory_region_dirty_bitmap(rb, i);
911     }
912 }
913 
914 /*
915  * colo_bitmap_find_diry:find contiguous dirty pages from start
916  *
917  * Returns the page offset within memory region of the start of the contiguout
918  * dirty page
919  *
920  * @rs: current RAM state
921  * @rb: RAMBlock where to search for dirty pages
922  * @start: page where we start the search
923  * @num: the number of contiguous dirty pages
924  */
925 static inline
926 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
927                                      unsigned long start, unsigned long *num)
928 {
929     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
930     unsigned long *bitmap = rb->bmap;
931     unsigned long first, next;
932 
933     *num = 0;
934 
935     if (ramblock_is_ignored(rb)) {
936         return size;
937     }
938 
939     first = find_next_bit(bitmap, size, start);
940     if (first >= size) {
941         return first;
942     }
943     next = find_next_zero_bit(bitmap, size, first + 1);
944     assert(next >= first);
945     *num = next - first;
946     return first;
947 }
948 
949 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
950                                                 RAMBlock *rb,
951                                                 unsigned long page)
952 {
953     bool ret;
954 
955     /*
956      * Clear dirty bitmap if needed.  This _must_ be called before we
957      * send any of the page in the chunk because we need to make sure
958      * we can capture further page content changes when we sync dirty
959      * log the next time.  So as long as we are going to send any of
960      * the page in the chunk we clear the remote dirty bitmap for all.
961      * Clearing it earlier won't be a problem, but too late will.
962      */
963     migration_clear_memory_region_dirty_bitmap(rb, page);
964 
965     ret = test_and_clear_bit(page, rb->bmap);
966     if (ret) {
967         rs->migration_dirty_pages--;
968     }
969 
970     return ret;
971 }
972 
973 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
974                                        void *opaque)
975 {
976     const hwaddr offset = section->offset_within_region;
977     const hwaddr size = int128_get64(section->size);
978     const unsigned long start = offset >> TARGET_PAGE_BITS;
979     const unsigned long npages = size >> TARGET_PAGE_BITS;
980     RAMBlock *rb = section->mr->ram_block;
981     uint64_t *cleared_bits = opaque;
982 
983     /*
984      * We don't grab ram_state->bitmap_mutex because we expect to run
985      * only when starting migration or during postcopy recovery where
986      * we don't have concurrent access.
987      */
988     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
989         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
990     }
991     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
992     bitmap_clear(rb->bmap, start, npages);
993 }
994 
995 /*
996  * Exclude all dirty pages from migration that fall into a discarded range as
997  * managed by a RamDiscardManager responsible for the mapped memory region of
998  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
999  *
1000  * Discarded pages ("logically unplugged") have undefined content and must
1001  * not get migrated, because even reading these pages for migration might
1002  * result in undesired behavior.
1003  *
1004  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1005  *
1006  * Note: The result is only stable while migrating (precopy/postcopy).
1007  */
1008 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1009 {
1010     uint64_t cleared_bits = 0;
1011 
1012     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1013         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1014         MemoryRegionSection section = {
1015             .mr = rb->mr,
1016             .offset_within_region = 0,
1017             .size = int128_make64(qemu_ram_get_used_length(rb)),
1018         };
1019 
1020         ram_discard_manager_replay_discarded(rdm, &section,
1021                                              dirty_bitmap_clear_section,
1022                                              &cleared_bits);
1023     }
1024     return cleared_bits;
1025 }
1026 
1027 /*
1028  * Check if a host-page aligned page falls into a discarded range as managed by
1029  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1030  *
1031  * Note: The result is only stable while migrating (precopy/postcopy).
1032  */
1033 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1034 {
1035     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1036         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1037         MemoryRegionSection section = {
1038             .mr = rb->mr,
1039             .offset_within_region = start,
1040             .size = int128_make64(qemu_ram_pagesize(rb)),
1041         };
1042 
1043         return !ram_discard_manager_is_populated(rdm, &section);
1044     }
1045     return false;
1046 }
1047 
1048 /* Called with RCU critical section */
1049 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1050 {
1051     uint64_t new_dirty_pages =
1052         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1053 
1054     rs->migration_dirty_pages += new_dirty_pages;
1055     rs->num_dirty_pages_period += new_dirty_pages;
1056 }
1057 
1058 /**
1059  * ram_pagesize_summary: calculate all the pagesizes of a VM
1060  *
1061  * Returns a summary bitmap of the page sizes of all RAMBlocks
1062  *
1063  * For VMs with just normal pages this is equivalent to the host page
1064  * size. If it's got some huge pages then it's the OR of all the
1065  * different page sizes.
1066  */
1067 uint64_t ram_pagesize_summary(void)
1068 {
1069     RAMBlock *block;
1070     uint64_t summary = 0;
1071 
1072     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1073         summary |= block->page_size;
1074     }
1075 
1076     return summary;
1077 }
1078 
1079 uint64_t ram_get_total_transferred_pages(void)
1080 {
1081     return  ram_counters.normal + ram_counters.duplicate +
1082                 compression_counters.pages + xbzrle_counters.pages;
1083 }
1084 
1085 static void migration_update_rates(RAMState *rs, int64_t end_time)
1086 {
1087     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1088     double compressed_size;
1089 
1090     /* calculate period counters */
1091     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1092                 / (end_time - rs->time_last_bitmap_sync);
1093 
1094     if (!page_count) {
1095         return;
1096     }
1097 
1098     if (migrate_use_xbzrle()) {
1099         double encoded_size, unencoded_size;
1100 
1101         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1102             rs->xbzrle_cache_miss_prev) / page_count;
1103         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1104         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1105                          TARGET_PAGE_SIZE;
1106         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1107         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1108             xbzrle_counters.encoding_rate = 0;
1109         } else {
1110             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1111         }
1112         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1113         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1114     }
1115 
1116     if (migrate_use_compression()) {
1117         compression_counters.busy_rate = (double)(compression_counters.busy -
1118             rs->compress_thread_busy_prev) / page_count;
1119         rs->compress_thread_busy_prev = compression_counters.busy;
1120 
1121         compressed_size = compression_counters.compressed_size -
1122                           rs->compressed_size_prev;
1123         if (compressed_size) {
1124             double uncompressed_size = (compression_counters.pages -
1125                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1126 
1127             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1128             compression_counters.compression_rate =
1129                                         uncompressed_size / compressed_size;
1130 
1131             rs->compress_pages_prev = compression_counters.pages;
1132             rs->compressed_size_prev = compression_counters.compressed_size;
1133         }
1134     }
1135 }
1136 
1137 static void migration_trigger_throttle(RAMState *rs)
1138 {
1139     MigrationState *s = migrate_get_current();
1140     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1141 
1142     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1143     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1144     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1145 
1146     /* During block migration the auto-converge logic incorrectly detects
1147      * that ram migration makes no progress. Avoid this by disabling the
1148      * throttling logic during the bulk phase of block migration. */
1149     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1150         /* The following detection logic can be refined later. For now:
1151            Check to see if the ratio between dirtied bytes and the approx.
1152            amount of bytes that just got transferred since the last time
1153            we were in this routine reaches the threshold. If that happens
1154            twice, start or increase throttling. */
1155 
1156         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1157             (++rs->dirty_rate_high_cnt >= 2)) {
1158             trace_migration_throttle();
1159             rs->dirty_rate_high_cnt = 0;
1160             mig_throttle_guest_down(bytes_dirty_period,
1161                                     bytes_dirty_threshold);
1162         }
1163     }
1164 }
1165 
1166 static void migration_bitmap_sync(RAMState *rs)
1167 {
1168     RAMBlock *block;
1169     int64_t end_time;
1170 
1171     ram_counters.dirty_sync_count++;
1172 
1173     if (!rs->time_last_bitmap_sync) {
1174         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1175     }
1176 
1177     trace_migration_bitmap_sync_start();
1178     memory_global_dirty_log_sync();
1179 
1180     qemu_mutex_lock(&rs->bitmap_mutex);
1181     WITH_RCU_READ_LOCK_GUARD() {
1182         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1183             ramblock_sync_dirty_bitmap(rs, block);
1184         }
1185         ram_counters.remaining = ram_bytes_remaining();
1186     }
1187     qemu_mutex_unlock(&rs->bitmap_mutex);
1188 
1189     memory_global_after_dirty_log_sync();
1190     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1191 
1192     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1193 
1194     /* more than 1 second = 1000 millisecons */
1195     if (end_time > rs->time_last_bitmap_sync + 1000) {
1196         migration_trigger_throttle(rs);
1197 
1198         migration_update_rates(rs, end_time);
1199 
1200         rs->target_page_count_prev = rs->target_page_count;
1201 
1202         /* reset period counters */
1203         rs->time_last_bitmap_sync = end_time;
1204         rs->num_dirty_pages_period = 0;
1205         rs->bytes_xfer_prev = ram_counters.transferred;
1206     }
1207     if (migrate_use_events()) {
1208         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1209     }
1210 }
1211 
1212 static void migration_bitmap_sync_precopy(RAMState *rs)
1213 {
1214     Error *local_err = NULL;
1215 
1216     /*
1217      * The current notifier usage is just an optimization to migration, so we
1218      * don't stop the normal migration process in the error case.
1219      */
1220     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1221         error_report_err(local_err);
1222         local_err = NULL;
1223     }
1224 
1225     migration_bitmap_sync(rs);
1226 
1227     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1228         error_report_err(local_err);
1229     }
1230 }
1231 
1232 static void ram_release_page(const char *rbname, uint64_t offset)
1233 {
1234     if (!migrate_release_ram() || !migration_in_postcopy()) {
1235         return;
1236     }
1237 
1238     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1239 }
1240 
1241 /**
1242  * save_zero_page_to_file: send the zero page to the file
1243  *
1244  * Returns the size of data written to the file, 0 means the page is not
1245  * a zero page
1246  *
1247  * @rs: current RAM state
1248  * @file: the file where the data is saved
1249  * @block: block that contains the page we want to send
1250  * @offset: offset inside the block for the page
1251  */
1252 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1253                                   RAMBlock *block, ram_addr_t offset)
1254 {
1255     uint8_t *p = block->host + offset;
1256     int len = 0;
1257 
1258     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1259         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1260         qemu_put_byte(file, 0);
1261         len += 1;
1262         ram_release_page(block->idstr, offset);
1263     }
1264     return len;
1265 }
1266 
1267 /**
1268  * save_zero_page: send the zero page to the stream
1269  *
1270  * Returns the number of pages written.
1271  *
1272  * @rs: current RAM state
1273  * @block: block that contains the page we want to send
1274  * @offset: offset inside the block for the page
1275  */
1276 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1277 {
1278     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1279 
1280     if (len) {
1281         ram_counters.duplicate++;
1282         ram_transferred_add(len);
1283         return 1;
1284     }
1285     return -1;
1286 }
1287 
1288 /*
1289  * @pages: the number of pages written by the control path,
1290  *        < 0 - error
1291  *        > 0 - number of pages written
1292  *
1293  * Return true if the pages has been saved, otherwise false is returned.
1294  */
1295 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1296                               int *pages)
1297 {
1298     uint64_t bytes_xmit = 0;
1299     int ret;
1300 
1301     *pages = -1;
1302     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1303                                 &bytes_xmit);
1304     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1305         return false;
1306     }
1307 
1308     if (bytes_xmit) {
1309         ram_transferred_add(bytes_xmit);
1310         *pages = 1;
1311     }
1312 
1313     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1314         return true;
1315     }
1316 
1317     if (bytes_xmit > 0) {
1318         ram_counters.normal++;
1319     } else if (bytes_xmit == 0) {
1320         ram_counters.duplicate++;
1321     }
1322 
1323     return true;
1324 }
1325 
1326 /*
1327  * directly send the page to the stream
1328  *
1329  * Returns the number of pages written.
1330  *
1331  * @rs: current RAM state
1332  * @block: block that contains the page we want to send
1333  * @offset: offset inside the block for the page
1334  * @buf: the page to be sent
1335  * @async: send to page asyncly
1336  */
1337 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1338                             uint8_t *buf, bool async)
1339 {
1340     ram_transferred_add(save_page_header(rs, rs->f, block,
1341                                          offset | RAM_SAVE_FLAG_PAGE));
1342     if (async) {
1343         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1344                               migrate_release_ram() &&
1345                               migration_in_postcopy());
1346     } else {
1347         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1348     }
1349     ram_transferred_add(TARGET_PAGE_SIZE);
1350     ram_counters.normal++;
1351     return 1;
1352 }
1353 
1354 /**
1355  * ram_save_page: send the given page to the stream
1356  *
1357  * Returns the number of pages written.
1358  *          < 0 - error
1359  *          >=0 - Number of pages written - this might legally be 0
1360  *                if xbzrle noticed the page was the same.
1361  *
1362  * @rs: current RAM state
1363  * @block: block that contains the page we want to send
1364  * @offset: offset inside the block for the page
1365  */
1366 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1367 {
1368     int pages = -1;
1369     uint8_t *p;
1370     bool send_async = true;
1371     RAMBlock *block = pss->block;
1372     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1373     ram_addr_t current_addr = block->offset + offset;
1374 
1375     p = block->host + offset;
1376     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1377 
1378     XBZRLE_cache_lock();
1379     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1380         pages = save_xbzrle_page(rs, &p, current_addr, block,
1381                                  offset);
1382         if (!rs->last_stage) {
1383             /* Can't send this cached data async, since the cache page
1384              * might get updated before it gets to the wire
1385              */
1386             send_async = false;
1387         }
1388     }
1389 
1390     /* XBZRLE overflow or normal page */
1391     if (pages == -1) {
1392         pages = save_normal_page(rs, block, offset, p, send_async);
1393     }
1394 
1395     XBZRLE_cache_unlock();
1396 
1397     return pages;
1398 }
1399 
1400 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1401                                  ram_addr_t offset)
1402 {
1403     if (multifd_queue_page(rs->f, block, offset) < 0) {
1404         return -1;
1405     }
1406     ram_counters.normal++;
1407 
1408     return 1;
1409 }
1410 
1411 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1412                                  ram_addr_t offset, uint8_t *source_buf)
1413 {
1414     RAMState *rs = ram_state;
1415     uint8_t *p = block->host + offset;
1416     int ret;
1417 
1418     if (save_zero_page_to_file(rs, f, block, offset)) {
1419         return true;
1420     }
1421 
1422     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1423 
1424     /*
1425      * copy it to a internal buffer to avoid it being modified by VM
1426      * so that we can catch up the error during compression and
1427      * decompression
1428      */
1429     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1430     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1431     if (ret < 0) {
1432         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1433         error_report("compressed data failed!");
1434     }
1435     return false;
1436 }
1437 
1438 static void
1439 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1440 {
1441     ram_transferred_add(bytes_xmit);
1442 
1443     if (param->zero_page) {
1444         ram_counters.duplicate++;
1445         return;
1446     }
1447 
1448     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1449     compression_counters.compressed_size += bytes_xmit - 8;
1450     compression_counters.pages++;
1451 }
1452 
1453 static bool save_page_use_compression(RAMState *rs);
1454 
1455 static void flush_compressed_data(RAMState *rs)
1456 {
1457     int idx, len, thread_count;
1458 
1459     if (!save_page_use_compression(rs)) {
1460         return;
1461     }
1462     thread_count = migrate_compress_threads();
1463 
1464     qemu_mutex_lock(&comp_done_lock);
1465     for (idx = 0; idx < thread_count; idx++) {
1466         while (!comp_param[idx].done) {
1467             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1468         }
1469     }
1470     qemu_mutex_unlock(&comp_done_lock);
1471 
1472     for (idx = 0; idx < thread_count; idx++) {
1473         qemu_mutex_lock(&comp_param[idx].mutex);
1474         if (!comp_param[idx].quit) {
1475             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1476             /*
1477              * it's safe to fetch zero_page without holding comp_done_lock
1478              * as there is no further request submitted to the thread,
1479              * i.e, the thread should be waiting for a request at this point.
1480              */
1481             update_compress_thread_counts(&comp_param[idx], len);
1482         }
1483         qemu_mutex_unlock(&comp_param[idx].mutex);
1484     }
1485 }
1486 
1487 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1488                                        ram_addr_t offset)
1489 {
1490     param->block = block;
1491     param->offset = offset;
1492 }
1493 
1494 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1495                                            ram_addr_t offset)
1496 {
1497     int idx, thread_count, bytes_xmit = -1, pages = -1;
1498     bool wait = migrate_compress_wait_thread();
1499 
1500     thread_count = migrate_compress_threads();
1501     qemu_mutex_lock(&comp_done_lock);
1502 retry:
1503     for (idx = 0; idx < thread_count; idx++) {
1504         if (comp_param[idx].done) {
1505             comp_param[idx].done = false;
1506             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1507             qemu_mutex_lock(&comp_param[idx].mutex);
1508             set_compress_params(&comp_param[idx], block, offset);
1509             qemu_cond_signal(&comp_param[idx].cond);
1510             qemu_mutex_unlock(&comp_param[idx].mutex);
1511             pages = 1;
1512             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1513             break;
1514         }
1515     }
1516 
1517     /*
1518      * wait for the free thread if the user specifies 'compress-wait-thread',
1519      * otherwise we will post the page out in the main thread as normal page.
1520      */
1521     if (pages < 0 && wait) {
1522         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1523         goto retry;
1524     }
1525     qemu_mutex_unlock(&comp_done_lock);
1526 
1527     return pages;
1528 }
1529 
1530 /**
1531  * find_dirty_block: find the next dirty page and update any state
1532  * associated with the search process.
1533  *
1534  * Returns true if a page is found
1535  *
1536  * @rs: current RAM state
1537  * @pss: data about the state of the current dirty page scan
1538  * @again: set to false if the search has scanned the whole of RAM
1539  */
1540 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1541 {
1542     /*
1543      * This is not a postcopy requested page, mark it "not urgent", and use
1544      * precopy channel to send it.
1545      */
1546     pss->postcopy_requested = false;
1547     pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
1548 
1549     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1550     if (pss->complete_round && pss->block == rs->last_seen_block &&
1551         pss->page >= rs->last_page) {
1552         /*
1553          * We've been once around the RAM and haven't found anything.
1554          * Give up.
1555          */
1556         *again = false;
1557         return false;
1558     }
1559     if (!offset_in_ramblock(pss->block,
1560                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1561         /* Didn't find anything in this RAM Block */
1562         pss->page = 0;
1563         pss->block = QLIST_NEXT_RCU(pss->block, next);
1564         if (!pss->block) {
1565             /*
1566              * If memory migration starts over, we will meet a dirtied page
1567              * which may still exists in compression threads's ring, so we
1568              * should flush the compressed data to make sure the new page
1569              * is not overwritten by the old one in the destination.
1570              *
1571              * Also If xbzrle is on, stop using the data compression at this
1572              * point. In theory, xbzrle can do better than compression.
1573              */
1574             flush_compressed_data(rs);
1575 
1576             /* Hit the end of the list */
1577             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1578             /* Flag that we've looped */
1579             pss->complete_round = true;
1580             /* After the first round, enable XBZRLE. */
1581             if (migrate_use_xbzrle()) {
1582                 rs->xbzrle_enabled = true;
1583             }
1584         }
1585         /* Didn't find anything this time, but try again on the new block */
1586         *again = true;
1587         return false;
1588     } else {
1589         /* Can go around again, but... */
1590         *again = true;
1591         /* We've found something so probably don't need to */
1592         return true;
1593     }
1594 }
1595 
1596 /**
1597  * unqueue_page: gets a page of the queue
1598  *
1599  * Helper for 'get_queued_page' - gets a page off the queue
1600  *
1601  * Returns the block of the page (or NULL if none available)
1602  *
1603  * @rs: current RAM state
1604  * @offset: used to return the offset within the RAMBlock
1605  */
1606 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1607 {
1608     struct RAMSrcPageRequest *entry;
1609     RAMBlock *block = NULL;
1610     size_t page_size;
1611 
1612     if (!postcopy_has_request(rs)) {
1613         return NULL;
1614     }
1615 
1616     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1617 
1618     /*
1619      * This should _never_ change even after we take the lock, because no one
1620      * should be taking anything off the request list other than us.
1621      */
1622     assert(postcopy_has_request(rs));
1623 
1624     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1625     block = entry->rb;
1626     *offset = entry->offset;
1627     page_size = qemu_ram_pagesize(block);
1628     /* Each page request should only be multiple page size of the ramblock */
1629     assert((entry->len % page_size) == 0);
1630 
1631     if (entry->len > page_size) {
1632         entry->len -= page_size;
1633         entry->offset += page_size;
1634     } else {
1635         memory_region_unref(block->mr);
1636         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1637         g_free(entry);
1638         migration_consume_urgent_request();
1639     }
1640 
1641     trace_unqueue_page(block->idstr, *offset,
1642                        test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1643 
1644     return block;
1645 }
1646 
1647 #if defined(__linux__)
1648 /**
1649  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1650  *   is found, return RAM block pointer and page offset
1651  *
1652  * Returns pointer to the RAMBlock containing faulting page,
1653  *   NULL if no write faults are pending
1654  *
1655  * @rs: current RAM state
1656  * @offset: page offset from the beginning of the block
1657  */
1658 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1659 {
1660     struct uffd_msg uffd_msg;
1661     void *page_address;
1662     RAMBlock *block;
1663     int res;
1664 
1665     if (!migrate_background_snapshot()) {
1666         return NULL;
1667     }
1668 
1669     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1670     if (res <= 0) {
1671         return NULL;
1672     }
1673 
1674     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1675     block = qemu_ram_block_from_host(page_address, false, offset);
1676     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1677     return block;
1678 }
1679 
1680 /**
1681  * ram_save_release_protection: release UFFD write protection after
1682  *   a range of pages has been saved
1683  *
1684  * @rs: current RAM state
1685  * @pss: page-search-status structure
1686  * @start_page: index of the first page in the range relative to pss->block
1687  *
1688  * Returns 0 on success, negative value in case of an error
1689 */
1690 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1691         unsigned long start_page)
1692 {
1693     int res = 0;
1694 
1695     /* Check if page is from UFFD-managed region. */
1696     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1697         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1698         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1699 
1700         /* Flush async buffers before un-protect. */
1701         qemu_fflush(rs->f);
1702         /* Un-protect memory range. */
1703         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1704                 false, false);
1705     }
1706 
1707     return res;
1708 }
1709 
1710 /* ram_write_tracking_available: check if kernel supports required UFFD features
1711  *
1712  * Returns true if supports, false otherwise
1713  */
1714 bool ram_write_tracking_available(void)
1715 {
1716     uint64_t uffd_features;
1717     int res;
1718 
1719     res = uffd_query_features(&uffd_features);
1720     return (res == 0 &&
1721             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1722 }
1723 
1724 /* ram_write_tracking_compatible: check if guest configuration is
1725  *   compatible with 'write-tracking'
1726  *
1727  * Returns true if compatible, false otherwise
1728  */
1729 bool ram_write_tracking_compatible(void)
1730 {
1731     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1732     int uffd_fd;
1733     RAMBlock *block;
1734     bool ret = false;
1735 
1736     /* Open UFFD file descriptor */
1737     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1738     if (uffd_fd < 0) {
1739         return false;
1740     }
1741 
1742     RCU_READ_LOCK_GUARD();
1743 
1744     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1745         uint64_t uffd_ioctls;
1746 
1747         /* Nothing to do with read-only and MMIO-writable regions */
1748         if (block->mr->readonly || block->mr->rom_device) {
1749             continue;
1750         }
1751         /* Try to register block memory via UFFD-IO to track writes */
1752         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1753                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1754             goto out;
1755         }
1756         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1757             goto out;
1758         }
1759     }
1760     ret = true;
1761 
1762 out:
1763     uffd_close_fd(uffd_fd);
1764     return ret;
1765 }
1766 
1767 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1768                                        ram_addr_t size)
1769 {
1770     /*
1771      * We read one byte of each page; this will preallocate page tables if
1772      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1773      * where no page was populated yet. This might require adaption when
1774      * supporting other mappings, like shmem.
1775      */
1776     for (; offset < size; offset += block->page_size) {
1777         char tmp = *((char *)block->host + offset);
1778 
1779         /* Don't optimize the read out */
1780         asm volatile("" : "+r" (tmp));
1781     }
1782 }
1783 
1784 static inline int populate_read_section(MemoryRegionSection *section,
1785                                         void *opaque)
1786 {
1787     const hwaddr size = int128_get64(section->size);
1788     hwaddr offset = section->offset_within_region;
1789     RAMBlock *block = section->mr->ram_block;
1790 
1791     populate_read_range(block, offset, size);
1792     return 0;
1793 }
1794 
1795 /*
1796  * ram_block_populate_read: preallocate page tables and populate pages in the
1797  *   RAM block by reading a byte of each page.
1798  *
1799  * Since it's solely used for userfault_fd WP feature, here we just
1800  *   hardcode page size to qemu_real_host_page_size.
1801  *
1802  * @block: RAM block to populate
1803  */
1804 static void ram_block_populate_read(RAMBlock *rb)
1805 {
1806     /*
1807      * Skip populating all pages that fall into a discarded range as managed by
1808      * a RamDiscardManager responsible for the mapped memory region of the
1809      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1810      * must not get populated automatically. We don't have to track
1811      * modifications via userfaultfd WP reliably, because these pages will
1812      * not be part of the migration stream either way -- see
1813      * ramblock_dirty_bitmap_exclude_discarded_pages().
1814      *
1815      * Note: The result is only stable while migrating (precopy/postcopy).
1816      */
1817     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1818         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1819         MemoryRegionSection section = {
1820             .mr = rb->mr,
1821             .offset_within_region = 0,
1822             .size = rb->mr->size,
1823         };
1824 
1825         ram_discard_manager_replay_populated(rdm, &section,
1826                                              populate_read_section, NULL);
1827     } else {
1828         populate_read_range(rb, 0, rb->used_length);
1829     }
1830 }
1831 
1832 /*
1833  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1834  */
1835 void ram_write_tracking_prepare(void)
1836 {
1837     RAMBlock *block;
1838 
1839     RCU_READ_LOCK_GUARD();
1840 
1841     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1842         /* Nothing to do with read-only and MMIO-writable regions */
1843         if (block->mr->readonly || block->mr->rom_device) {
1844             continue;
1845         }
1846 
1847         /*
1848          * Populate pages of the RAM block before enabling userfault_fd
1849          * write protection.
1850          *
1851          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1852          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1853          * pages with pte_none() entries in page table.
1854          */
1855         ram_block_populate_read(block);
1856     }
1857 }
1858 
1859 /*
1860  * ram_write_tracking_start: start UFFD-WP memory tracking
1861  *
1862  * Returns 0 for success or negative value in case of error
1863  */
1864 int ram_write_tracking_start(void)
1865 {
1866     int uffd_fd;
1867     RAMState *rs = ram_state;
1868     RAMBlock *block;
1869 
1870     /* Open UFFD file descriptor */
1871     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1872     if (uffd_fd < 0) {
1873         return uffd_fd;
1874     }
1875     rs->uffdio_fd = uffd_fd;
1876 
1877     RCU_READ_LOCK_GUARD();
1878 
1879     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1880         /* Nothing to do with read-only and MMIO-writable regions */
1881         if (block->mr->readonly || block->mr->rom_device) {
1882             continue;
1883         }
1884 
1885         /* Register block memory with UFFD to track writes */
1886         if (uffd_register_memory(rs->uffdio_fd, block->host,
1887                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1888             goto fail;
1889         }
1890         /* Apply UFFD write protection to the block memory range */
1891         if (uffd_change_protection(rs->uffdio_fd, block->host,
1892                 block->max_length, true, false)) {
1893             goto fail;
1894         }
1895         block->flags |= RAM_UF_WRITEPROTECT;
1896         memory_region_ref(block->mr);
1897 
1898         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1899                 block->host, block->max_length);
1900     }
1901 
1902     return 0;
1903 
1904 fail:
1905     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1906 
1907     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1908         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1909             continue;
1910         }
1911         /*
1912          * In case some memory block failed to be write-protected
1913          * remove protection and unregister all succeeded RAM blocks
1914          */
1915         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1916                 false, false);
1917         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1918         /* Cleanup flags and remove reference */
1919         block->flags &= ~RAM_UF_WRITEPROTECT;
1920         memory_region_unref(block->mr);
1921     }
1922 
1923     uffd_close_fd(uffd_fd);
1924     rs->uffdio_fd = -1;
1925     return -1;
1926 }
1927 
1928 /**
1929  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1930  */
1931 void ram_write_tracking_stop(void)
1932 {
1933     RAMState *rs = ram_state;
1934     RAMBlock *block;
1935 
1936     RCU_READ_LOCK_GUARD();
1937 
1938     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1939         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1940             continue;
1941         }
1942         /* Remove protection and unregister all affected RAM blocks */
1943         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1944                 false, false);
1945         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1946 
1947         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1948                 block->host, block->max_length);
1949 
1950         /* Cleanup flags and remove reference */
1951         block->flags &= ~RAM_UF_WRITEPROTECT;
1952         memory_region_unref(block->mr);
1953     }
1954 
1955     /* Finally close UFFD file descriptor */
1956     uffd_close_fd(rs->uffdio_fd);
1957     rs->uffdio_fd = -1;
1958 }
1959 
1960 #else
1961 /* No target OS support, stubs just fail or ignore */
1962 
1963 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1964 {
1965     (void) rs;
1966     (void) offset;
1967 
1968     return NULL;
1969 }
1970 
1971 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1972         unsigned long start_page)
1973 {
1974     (void) rs;
1975     (void) pss;
1976     (void) start_page;
1977 
1978     return 0;
1979 }
1980 
1981 bool ram_write_tracking_available(void)
1982 {
1983     return false;
1984 }
1985 
1986 bool ram_write_tracking_compatible(void)
1987 {
1988     assert(0);
1989     return false;
1990 }
1991 
1992 int ram_write_tracking_start(void)
1993 {
1994     assert(0);
1995     return -1;
1996 }
1997 
1998 void ram_write_tracking_stop(void)
1999 {
2000     assert(0);
2001 }
2002 #endif /* defined(__linux__) */
2003 
2004 /*
2005  * Check whether two addr/offset of the ramblock falls onto the same host huge
2006  * page.  Returns true if so, false otherwise.
2007  */
2008 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
2009                                      uint64_t addr2)
2010 {
2011     size_t page_size = qemu_ram_pagesize(rb);
2012 
2013     addr1 = ROUND_DOWN(addr1, page_size);
2014     addr2 = ROUND_DOWN(addr2, page_size);
2015 
2016     return addr1 == addr2;
2017 }
2018 
2019 /*
2020  * Whether a previous preempted precopy huge page contains current requested
2021  * page?  Returns true if so, false otherwise.
2022  *
2023  * This should really happen very rarely, because it means when we were sending
2024  * during background migration for postcopy we're sending exactly the page that
2025  * some vcpu got faulted on on dest node.  When it happens, we probably don't
2026  * need to do much but drop the request, because we know right after we restore
2027  * the precopy stream it'll be serviced.  It'll slightly affect the order of
2028  * postcopy requests to be serviced (e.g. it'll be the same as we move current
2029  * request to the end of the queue) but it shouldn't be a big deal.  The most
2030  * imporant thing is we can _never_ try to send a partial-sent huge page on the
2031  * POSTCOPY channel again, otherwise that huge page will got "split brain" on
2032  * two channels (PRECOPY, POSTCOPY).
2033  */
2034 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
2035                                         ram_addr_t offset)
2036 {
2037     PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2038 
2039     /* No preemption at all? */
2040     if (!state->preempted) {
2041         return false;
2042     }
2043 
2044     /* Not even the same ramblock? */
2045     if (state->ram_block != block) {
2046         return false;
2047     }
2048 
2049     return offset_on_same_huge_page(block, offset,
2050                                     state->ram_page << TARGET_PAGE_BITS);
2051 }
2052 
2053 /**
2054  * get_queued_page: unqueue a page from the postcopy requests
2055  *
2056  * Skips pages that are already sent (!dirty)
2057  *
2058  * Returns true if a queued page is found
2059  *
2060  * @rs: current RAM state
2061  * @pss: data about the state of the current dirty page scan
2062  */
2063 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2064 {
2065     RAMBlock  *block;
2066     ram_addr_t offset;
2067 
2068     block = unqueue_page(rs, &offset);
2069 
2070     if (block) {
2071         /* See comment above postcopy_preempted_contains() */
2072         if (postcopy_preempted_contains(rs, block, offset)) {
2073             trace_postcopy_preempt_hit(block->idstr, offset);
2074             /*
2075              * If what we preempted previously was exactly what we're
2076              * requesting right now, restore the preempted precopy
2077              * immediately, boosting its priority as it's requested by
2078              * postcopy.
2079              */
2080             postcopy_preempt_restore(rs, pss, true);
2081             return true;
2082         }
2083     } else {
2084         /*
2085          * Poll write faults too if background snapshot is enabled; that's
2086          * when we have vcpus got blocked by the write protected pages.
2087          */
2088         block = poll_fault_page(rs, &offset);
2089     }
2090 
2091     if (block) {
2092         /*
2093          * We want the background search to continue from the queued page
2094          * since the guest is likely to want other pages near to the page
2095          * it just requested.
2096          */
2097         pss->block = block;
2098         pss->page = offset >> TARGET_PAGE_BITS;
2099 
2100         /*
2101          * This unqueued page would break the "one round" check, even is
2102          * really rare.
2103          */
2104         pss->complete_round = false;
2105         /* Mark it an urgent request, meanwhile using POSTCOPY channel */
2106         pss->postcopy_requested = true;
2107         pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
2108     }
2109 
2110     return !!block;
2111 }
2112 
2113 /**
2114  * migration_page_queue_free: drop any remaining pages in the ram
2115  * request queue
2116  *
2117  * It should be empty at the end anyway, but in error cases there may
2118  * be some left.  in case that there is any page left, we drop it.
2119  *
2120  */
2121 static void migration_page_queue_free(RAMState *rs)
2122 {
2123     struct RAMSrcPageRequest *mspr, *next_mspr;
2124     /* This queue generally should be empty - but in the case of a failed
2125      * migration might have some droppings in.
2126      */
2127     RCU_READ_LOCK_GUARD();
2128     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2129         memory_region_unref(mspr->rb->mr);
2130         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2131         g_free(mspr);
2132     }
2133 }
2134 
2135 /**
2136  * ram_save_queue_pages: queue the page for transmission
2137  *
2138  * A request from postcopy destination for example.
2139  *
2140  * Returns zero on success or negative on error
2141  *
2142  * @rbname: Name of the RAMBLock of the request. NULL means the
2143  *          same that last one.
2144  * @start: starting address from the start of the RAMBlock
2145  * @len: length (in bytes) to send
2146  */
2147 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2148 {
2149     RAMBlock *ramblock;
2150     RAMState *rs = ram_state;
2151 
2152     ram_counters.postcopy_requests++;
2153     RCU_READ_LOCK_GUARD();
2154 
2155     if (!rbname) {
2156         /* Reuse last RAMBlock */
2157         ramblock = rs->last_req_rb;
2158 
2159         if (!ramblock) {
2160             /*
2161              * Shouldn't happen, we can't reuse the last RAMBlock if
2162              * it's the 1st request.
2163              */
2164             error_report("ram_save_queue_pages no previous block");
2165             return -1;
2166         }
2167     } else {
2168         ramblock = qemu_ram_block_by_name(rbname);
2169 
2170         if (!ramblock) {
2171             /* We shouldn't be asked for a non-existent RAMBlock */
2172             error_report("ram_save_queue_pages no block '%s'", rbname);
2173             return -1;
2174         }
2175         rs->last_req_rb = ramblock;
2176     }
2177     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2178     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2179         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2180                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2181                      __func__, start, len, ramblock->used_length);
2182         return -1;
2183     }
2184 
2185     struct RAMSrcPageRequest *new_entry =
2186         g_new0(struct RAMSrcPageRequest, 1);
2187     new_entry->rb = ramblock;
2188     new_entry->offset = start;
2189     new_entry->len = len;
2190 
2191     memory_region_ref(ramblock->mr);
2192     qemu_mutex_lock(&rs->src_page_req_mutex);
2193     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2194     migration_make_urgent_request();
2195     qemu_mutex_unlock(&rs->src_page_req_mutex);
2196 
2197     return 0;
2198 }
2199 
2200 static bool save_page_use_compression(RAMState *rs)
2201 {
2202     if (!migrate_use_compression()) {
2203         return false;
2204     }
2205 
2206     /*
2207      * If xbzrle is enabled (e.g., after first round of migration), stop
2208      * using the data compression. In theory, xbzrle can do better than
2209      * compression.
2210      */
2211     if (rs->xbzrle_enabled) {
2212         return false;
2213     }
2214 
2215     return true;
2216 }
2217 
2218 /*
2219  * try to compress the page before posting it out, return true if the page
2220  * has been properly handled by compression, otherwise needs other
2221  * paths to handle it
2222  */
2223 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2224 {
2225     if (!save_page_use_compression(rs)) {
2226         return false;
2227     }
2228 
2229     /*
2230      * When starting the process of a new block, the first page of
2231      * the block should be sent out before other pages in the same
2232      * block, and all the pages in last block should have been sent
2233      * out, keeping this order is important, because the 'cont' flag
2234      * is used to avoid resending the block name.
2235      *
2236      * We post the fist page as normal page as compression will take
2237      * much CPU resource.
2238      */
2239     if (block != rs->last_sent_block) {
2240         flush_compressed_data(rs);
2241         return false;
2242     }
2243 
2244     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2245         return true;
2246     }
2247 
2248     compression_counters.busy++;
2249     return false;
2250 }
2251 
2252 /**
2253  * ram_save_target_page: save one target page
2254  *
2255  * Returns the number of pages written
2256  *
2257  * @rs: current RAM state
2258  * @pss: data about the page we want to send
2259  */
2260 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2261 {
2262     RAMBlock *block = pss->block;
2263     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2264     int res;
2265 
2266     if (control_save_page(rs, block, offset, &res)) {
2267         return res;
2268     }
2269 
2270     if (save_compress_page(rs, block, offset)) {
2271         return 1;
2272     }
2273 
2274     res = save_zero_page(rs, block, offset);
2275     if (res > 0) {
2276         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2277          * page would be stale
2278          */
2279         if (!save_page_use_compression(rs)) {
2280             XBZRLE_cache_lock();
2281             xbzrle_cache_zero_page(rs, block->offset + offset);
2282             XBZRLE_cache_unlock();
2283         }
2284         return res;
2285     }
2286 
2287     /*
2288      * Do not use multifd for:
2289      * 1. Compression as the first page in the new block should be posted out
2290      *    before sending the compressed page
2291      * 2. In postcopy as one whole host page should be placed
2292      */
2293     if (!save_page_use_compression(rs) && migrate_use_multifd()
2294         && !migration_in_postcopy()) {
2295         return ram_save_multifd_page(rs, block, offset);
2296     }
2297 
2298     return ram_save_page(rs, pss);
2299 }
2300 
2301 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2302 {
2303     MigrationState *ms = migrate_get_current();
2304 
2305     /* Not enabled eager preempt?  Then never do that. */
2306     if (!migrate_postcopy_preempt()) {
2307         return false;
2308     }
2309 
2310     /* If the user explicitly disabled breaking of huge page, skip */
2311     if (!ms->postcopy_preempt_break_huge) {
2312         return false;
2313     }
2314 
2315     /* If the ramblock we're sending is a small page?  Never bother. */
2316     if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2317         return false;
2318     }
2319 
2320     /* Not in postcopy at all? */
2321     if (!migration_in_postcopy()) {
2322         return false;
2323     }
2324 
2325     /*
2326      * If we're already handling a postcopy request, don't preempt as this page
2327      * has got the same high priority.
2328      */
2329     if (pss->postcopy_requested) {
2330         return false;
2331     }
2332 
2333     /* If there's postcopy requests, then check it up! */
2334     return postcopy_has_request(rs);
2335 }
2336 
2337 /* Returns true if we preempted precopy, false otherwise */
2338 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2339 {
2340     PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2341 
2342     trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2343 
2344     /*
2345      * Time to preempt precopy. Cache current PSS into preempt state, so that
2346      * after handling the postcopy pages we can recover to it.  We need to do
2347      * so because the dest VM will have partial of the precopy huge page kept
2348      * over in its tmp huge page caches; better move on with it when we can.
2349      */
2350     p_state->ram_block = pss->block;
2351     p_state->ram_page = pss->page;
2352     p_state->preempted = true;
2353 }
2354 
2355 /* Whether we're preempted by a postcopy request during sending a huge page */
2356 static bool postcopy_preempt_triggered(RAMState *rs)
2357 {
2358     return rs->postcopy_preempt_state.preempted;
2359 }
2360 
2361 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2362                                      bool postcopy_requested)
2363 {
2364     PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2365 
2366     assert(state->preempted);
2367 
2368     pss->block = state->ram_block;
2369     pss->page = state->ram_page;
2370 
2371     /* Whether this is a postcopy request? */
2372     pss->postcopy_requested = postcopy_requested;
2373     /*
2374      * When restoring a preempted page, the old data resides in PRECOPY
2375      * slow channel, even if postcopy_requested is set.  So always use
2376      * PRECOPY channel here.
2377      */
2378     pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
2379 
2380     trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2381 
2382     /* Reset preempt state, most importantly, set preempted==false */
2383     postcopy_preempt_reset(rs);
2384 }
2385 
2386 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2387 {
2388     MigrationState *s = migrate_get_current();
2389     unsigned int channel = pss->postcopy_target_channel;
2390     QEMUFile *next;
2391 
2392     if (channel != rs->postcopy_channel) {
2393         if (channel == RAM_CHANNEL_PRECOPY) {
2394             next = s->to_dst_file;
2395         } else {
2396             next = s->postcopy_qemufile_src;
2397         }
2398         /* Update and cache the current channel */
2399         rs->f = next;
2400         rs->postcopy_channel = channel;
2401 
2402         /*
2403          * If channel switched, reset last_sent_block since the old sent block
2404          * may not be on the same channel.
2405          */
2406         rs->last_sent_block = NULL;
2407 
2408         trace_postcopy_preempt_switch_channel(channel);
2409     }
2410 
2411     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2412 }
2413 
2414 /* We need to make sure rs->f always points to the default channel elsewhere */
2415 static void postcopy_preempt_reset_channel(RAMState *rs)
2416 {
2417     if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2418         rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2419         rs->f = migrate_get_current()->to_dst_file;
2420         trace_postcopy_preempt_reset_channel();
2421     }
2422 }
2423 
2424 /**
2425  * ram_save_host_page: save a whole host page
2426  *
2427  * Starting at *offset send pages up to the end of the current host
2428  * page. It's valid for the initial offset to point into the middle of
2429  * a host page in which case the remainder of the hostpage is sent.
2430  * Only dirty target pages are sent. Note that the host page size may
2431  * be a huge page for this block.
2432  * The saving stops at the boundary of the used_length of the block
2433  * if the RAMBlock isn't a multiple of the host page size.
2434  *
2435  * Returns the number of pages written or negative on error
2436  *
2437  * @rs: current RAM state
2438  * @pss: data about the page we want to send
2439  */
2440 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2441 {
2442     int tmppages, pages = 0;
2443     size_t pagesize_bits =
2444         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2445     unsigned long hostpage_boundary =
2446         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2447     unsigned long start_page = pss->page;
2448     int res;
2449 
2450     if (ramblock_is_ignored(pss->block)) {
2451         error_report("block %s should not be migrated !", pss->block->idstr);
2452         return 0;
2453     }
2454 
2455     if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2456         postcopy_preempt_choose_channel(rs, pss);
2457     }
2458 
2459     do {
2460         if (postcopy_needs_preempt(rs, pss)) {
2461             postcopy_do_preempt(rs, pss);
2462             break;
2463         }
2464 
2465         /* Check the pages is dirty and if it is send it */
2466         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2467             tmppages = ram_save_target_page(rs, pss);
2468             if (tmppages < 0) {
2469                 return tmppages;
2470             }
2471 
2472             pages += tmppages;
2473             /*
2474              * Allow rate limiting to happen in the middle of huge pages if
2475              * something is sent in the current iteration.
2476              */
2477             if (pagesize_bits > 1 && tmppages > 0) {
2478                 migration_rate_limit();
2479             }
2480         }
2481         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2482     } while ((pss->page < hostpage_boundary) &&
2483              offset_in_ramblock(pss->block,
2484                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2485     /* The offset we leave with is the min boundary of host page and block */
2486     pss->page = MIN(pss->page, hostpage_boundary);
2487 
2488     /*
2489      * When with postcopy preempt mode, flush the data as soon as possible for
2490      * postcopy requests, because we've already sent a whole huge page, so the
2491      * dst node should already have enough resource to atomically filling in
2492      * the current missing page.
2493      *
2494      * More importantly, when using separate postcopy channel, we must do
2495      * explicit flush or it won't flush until the buffer is full.
2496      */
2497     if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2498         qemu_fflush(rs->f);
2499     }
2500 
2501     res = ram_save_release_protection(rs, pss, start_page);
2502     return (res < 0 ? res : pages);
2503 }
2504 
2505 /**
2506  * ram_find_and_save_block: finds a dirty page and sends it to f
2507  *
2508  * Called within an RCU critical section.
2509  *
2510  * Returns the number of pages written where zero means no dirty pages,
2511  * or negative on error
2512  *
2513  * @rs: current RAM state
2514  *
2515  * On systems where host-page-size > target-page-size it will send all the
2516  * pages in a host page that are dirty.
2517  */
2518 static int ram_find_and_save_block(RAMState *rs)
2519 {
2520     PageSearchStatus pss;
2521     int pages = 0;
2522     bool again, found;
2523 
2524     /* No dirty page as there is zero RAM */
2525     if (!ram_bytes_total()) {
2526         return pages;
2527     }
2528 
2529     pss.block = rs->last_seen_block;
2530     pss.page = rs->last_page;
2531     pss.complete_round = false;
2532 
2533     if (!pss.block) {
2534         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2535     }
2536 
2537     do {
2538         again = true;
2539         found = get_queued_page(rs, &pss);
2540 
2541         if (!found) {
2542             /*
2543              * Recover previous precopy ramblock/offset if postcopy has
2544              * preempted precopy.  Otherwise find the next dirty bit.
2545              */
2546             if (postcopy_preempt_triggered(rs)) {
2547                 postcopy_preempt_restore(rs, &pss, false);
2548                 found = true;
2549             } else {
2550                 /* priority queue empty, so just search for something dirty */
2551                 found = find_dirty_block(rs, &pss, &again);
2552             }
2553         }
2554 
2555         if (found) {
2556             pages = ram_save_host_page(rs, &pss);
2557         }
2558     } while (!pages && again);
2559 
2560     rs->last_seen_block = pss.block;
2561     rs->last_page = pss.page;
2562 
2563     return pages;
2564 }
2565 
2566 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2567 {
2568     uint64_t pages = size / TARGET_PAGE_SIZE;
2569 
2570     if (zero) {
2571         ram_counters.duplicate += pages;
2572     } else {
2573         ram_counters.normal += pages;
2574         ram_transferred_add(size);
2575         qemu_file_credit_transfer(f, size);
2576     }
2577 }
2578 
2579 static uint64_t ram_bytes_total_common(bool count_ignored)
2580 {
2581     RAMBlock *block;
2582     uint64_t total = 0;
2583 
2584     RCU_READ_LOCK_GUARD();
2585 
2586     if (count_ignored) {
2587         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2588             total += block->used_length;
2589         }
2590     } else {
2591         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2592             total += block->used_length;
2593         }
2594     }
2595     return total;
2596 }
2597 
2598 uint64_t ram_bytes_total(void)
2599 {
2600     return ram_bytes_total_common(false);
2601 }
2602 
2603 static void xbzrle_load_setup(void)
2604 {
2605     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2606 }
2607 
2608 static void xbzrle_load_cleanup(void)
2609 {
2610     g_free(XBZRLE.decoded_buf);
2611     XBZRLE.decoded_buf = NULL;
2612 }
2613 
2614 static void ram_state_cleanup(RAMState **rsp)
2615 {
2616     if (*rsp) {
2617         migration_page_queue_free(*rsp);
2618         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2619         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2620         g_free(*rsp);
2621         *rsp = NULL;
2622     }
2623 }
2624 
2625 static void xbzrle_cleanup(void)
2626 {
2627     XBZRLE_cache_lock();
2628     if (XBZRLE.cache) {
2629         cache_fini(XBZRLE.cache);
2630         g_free(XBZRLE.encoded_buf);
2631         g_free(XBZRLE.current_buf);
2632         g_free(XBZRLE.zero_target_page);
2633         XBZRLE.cache = NULL;
2634         XBZRLE.encoded_buf = NULL;
2635         XBZRLE.current_buf = NULL;
2636         XBZRLE.zero_target_page = NULL;
2637     }
2638     XBZRLE_cache_unlock();
2639 }
2640 
2641 static void ram_save_cleanup(void *opaque)
2642 {
2643     RAMState **rsp = opaque;
2644     RAMBlock *block;
2645 
2646     /* We don't use dirty log with background snapshots */
2647     if (!migrate_background_snapshot()) {
2648         /* caller have hold iothread lock or is in a bh, so there is
2649          * no writing race against the migration bitmap
2650          */
2651         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2652             /*
2653              * do not stop dirty log without starting it, since
2654              * memory_global_dirty_log_stop will assert that
2655              * memory_global_dirty_log_start/stop used in pairs
2656              */
2657             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2658         }
2659     }
2660 
2661     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2662         g_free(block->clear_bmap);
2663         block->clear_bmap = NULL;
2664         g_free(block->bmap);
2665         block->bmap = NULL;
2666     }
2667 
2668     xbzrle_cleanup();
2669     compress_threads_save_cleanup();
2670     ram_state_cleanup(rsp);
2671 }
2672 
2673 static void ram_state_reset(RAMState *rs)
2674 {
2675     rs->last_seen_block = NULL;
2676     rs->last_sent_block = NULL;
2677     rs->last_page = 0;
2678     rs->last_version = ram_list.version;
2679     rs->xbzrle_enabled = false;
2680     postcopy_preempt_reset(rs);
2681     rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2682 }
2683 
2684 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2685 
2686 /* **** functions for postcopy ***** */
2687 
2688 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2689 {
2690     struct RAMBlock *block;
2691 
2692     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2693         unsigned long *bitmap = block->bmap;
2694         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2695         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2696 
2697         while (run_start < range) {
2698             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2699             ram_discard_range(block->idstr,
2700                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2701                               ((ram_addr_t)(run_end - run_start))
2702                                 << TARGET_PAGE_BITS);
2703             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2704         }
2705     }
2706 }
2707 
2708 /**
2709  * postcopy_send_discard_bm_ram: discard a RAMBlock
2710  *
2711  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2712  *
2713  * @ms: current migration state
2714  * @block: RAMBlock to discard
2715  */
2716 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2717 {
2718     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2719     unsigned long current;
2720     unsigned long *bitmap = block->bmap;
2721 
2722     for (current = 0; current < end; ) {
2723         unsigned long one = find_next_bit(bitmap, end, current);
2724         unsigned long zero, discard_length;
2725 
2726         if (one >= end) {
2727             break;
2728         }
2729 
2730         zero = find_next_zero_bit(bitmap, end, one + 1);
2731 
2732         if (zero >= end) {
2733             discard_length = end - one;
2734         } else {
2735             discard_length = zero - one;
2736         }
2737         postcopy_discard_send_range(ms, one, discard_length);
2738         current = one + discard_length;
2739     }
2740 }
2741 
2742 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2743 
2744 /**
2745  * postcopy_each_ram_send_discard: discard all RAMBlocks
2746  *
2747  * Utility for the outgoing postcopy code.
2748  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2749  *   passing it bitmap indexes and name.
2750  * (qemu_ram_foreach_block ends up passing unscaled lengths
2751  *  which would mean postcopy code would have to deal with target page)
2752  *
2753  * @ms: current migration state
2754  */
2755 static void postcopy_each_ram_send_discard(MigrationState *ms)
2756 {
2757     struct RAMBlock *block;
2758 
2759     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2760         postcopy_discard_send_init(ms, block->idstr);
2761 
2762         /*
2763          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2764          * host-page size chunks, mark any partially dirty host-page size
2765          * chunks as all dirty.  In this case the host-page is the host-page
2766          * for the particular RAMBlock, i.e. it might be a huge page.
2767          */
2768         postcopy_chunk_hostpages_pass(ms, block);
2769 
2770         /*
2771          * Postcopy sends chunks of bitmap over the wire, but it
2772          * just needs indexes at this point, avoids it having
2773          * target page specific code.
2774          */
2775         postcopy_send_discard_bm_ram(ms, block);
2776         postcopy_discard_send_finish(ms);
2777     }
2778 }
2779 
2780 /**
2781  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2782  *
2783  * Helper for postcopy_chunk_hostpages; it's called twice to
2784  * canonicalize the two bitmaps, that are similar, but one is
2785  * inverted.
2786  *
2787  * Postcopy requires that all target pages in a hostpage are dirty or
2788  * clean, not a mix.  This function canonicalizes the bitmaps.
2789  *
2790  * @ms: current migration state
2791  * @block: block that contains the page we want to canonicalize
2792  */
2793 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2794 {
2795     RAMState *rs = ram_state;
2796     unsigned long *bitmap = block->bmap;
2797     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2798     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2799     unsigned long run_start;
2800 
2801     if (block->page_size == TARGET_PAGE_SIZE) {
2802         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2803         return;
2804     }
2805 
2806     /* Find a dirty page */
2807     run_start = find_next_bit(bitmap, pages, 0);
2808 
2809     while (run_start < pages) {
2810 
2811         /*
2812          * If the start of this run of pages is in the middle of a host
2813          * page, then we need to fixup this host page.
2814          */
2815         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2816             /* Find the end of this run */
2817             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2818             /*
2819              * If the end isn't at the start of a host page, then the
2820              * run doesn't finish at the end of a host page
2821              * and we need to discard.
2822              */
2823         }
2824 
2825         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2826             unsigned long page;
2827             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2828                                                              host_ratio);
2829             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2830 
2831             /* Clean up the bitmap */
2832             for (page = fixup_start_addr;
2833                  page < fixup_start_addr + host_ratio; page++) {
2834                 /*
2835                  * Remark them as dirty, updating the count for any pages
2836                  * that weren't previously dirty.
2837                  */
2838                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2839             }
2840         }
2841 
2842         /* Find the next dirty page for the next iteration */
2843         run_start = find_next_bit(bitmap, pages, run_start);
2844     }
2845 }
2846 
2847 /**
2848  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2849  *
2850  * Transmit the set of pages to be discarded after precopy to the target
2851  * these are pages that:
2852  *     a) Have been previously transmitted but are now dirty again
2853  *     b) Pages that have never been transmitted, this ensures that
2854  *        any pages on the destination that have been mapped by background
2855  *        tasks get discarded (transparent huge pages is the specific concern)
2856  * Hopefully this is pretty sparse
2857  *
2858  * @ms: current migration state
2859  */
2860 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2861 {
2862     RAMState *rs = ram_state;
2863 
2864     RCU_READ_LOCK_GUARD();
2865 
2866     /* This should be our last sync, the src is now paused */
2867     migration_bitmap_sync(rs);
2868 
2869     /* Easiest way to make sure we don't resume in the middle of a host-page */
2870     rs->last_seen_block = NULL;
2871     rs->last_sent_block = NULL;
2872     rs->last_page = 0;
2873 
2874     postcopy_each_ram_send_discard(ms);
2875 
2876     trace_ram_postcopy_send_discard_bitmap();
2877 }
2878 
2879 /**
2880  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2881  *
2882  * Returns zero on success
2883  *
2884  * @rbname: name of the RAMBlock of the request. NULL means the
2885  *          same that last one.
2886  * @start: RAMBlock starting page
2887  * @length: RAMBlock size
2888  */
2889 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2890 {
2891     trace_ram_discard_range(rbname, start, length);
2892 
2893     RCU_READ_LOCK_GUARD();
2894     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2895 
2896     if (!rb) {
2897         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2898         return -1;
2899     }
2900 
2901     /*
2902      * On source VM, we don't need to update the received bitmap since
2903      * we don't even have one.
2904      */
2905     if (rb->receivedmap) {
2906         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2907                      length >> qemu_target_page_bits());
2908     }
2909 
2910     return ram_block_discard_range(rb, start, length);
2911 }
2912 
2913 /*
2914  * For every allocation, we will try not to crash the VM if the
2915  * allocation failed.
2916  */
2917 static int xbzrle_init(void)
2918 {
2919     Error *local_err = NULL;
2920 
2921     if (!migrate_use_xbzrle()) {
2922         return 0;
2923     }
2924 
2925     XBZRLE_cache_lock();
2926 
2927     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2928     if (!XBZRLE.zero_target_page) {
2929         error_report("%s: Error allocating zero page", __func__);
2930         goto err_out;
2931     }
2932 
2933     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2934                               TARGET_PAGE_SIZE, &local_err);
2935     if (!XBZRLE.cache) {
2936         error_report_err(local_err);
2937         goto free_zero_page;
2938     }
2939 
2940     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2941     if (!XBZRLE.encoded_buf) {
2942         error_report("%s: Error allocating encoded_buf", __func__);
2943         goto free_cache;
2944     }
2945 
2946     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2947     if (!XBZRLE.current_buf) {
2948         error_report("%s: Error allocating current_buf", __func__);
2949         goto free_encoded_buf;
2950     }
2951 
2952     /* We are all good */
2953     XBZRLE_cache_unlock();
2954     return 0;
2955 
2956 free_encoded_buf:
2957     g_free(XBZRLE.encoded_buf);
2958     XBZRLE.encoded_buf = NULL;
2959 free_cache:
2960     cache_fini(XBZRLE.cache);
2961     XBZRLE.cache = NULL;
2962 free_zero_page:
2963     g_free(XBZRLE.zero_target_page);
2964     XBZRLE.zero_target_page = NULL;
2965 err_out:
2966     XBZRLE_cache_unlock();
2967     return -ENOMEM;
2968 }
2969 
2970 static int ram_state_init(RAMState **rsp)
2971 {
2972     *rsp = g_try_new0(RAMState, 1);
2973 
2974     if (!*rsp) {
2975         error_report("%s: Init ramstate fail", __func__);
2976         return -1;
2977     }
2978 
2979     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2980     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2981     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2982 
2983     /*
2984      * Count the total number of pages used by ram blocks not including any
2985      * gaps due to alignment or unplugs.
2986      * This must match with the initial values of dirty bitmap.
2987      */
2988     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2989     ram_state_reset(*rsp);
2990 
2991     return 0;
2992 }
2993 
2994 static void ram_list_init_bitmaps(void)
2995 {
2996     MigrationState *ms = migrate_get_current();
2997     RAMBlock *block;
2998     unsigned long pages;
2999     uint8_t shift;
3000 
3001     /* Skip setting bitmap if there is no RAM */
3002     if (ram_bytes_total()) {
3003         shift = ms->clear_bitmap_shift;
3004         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3005             error_report("clear_bitmap_shift (%u) too big, using "
3006                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3007             shift = CLEAR_BITMAP_SHIFT_MAX;
3008         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3009             error_report("clear_bitmap_shift (%u) too small, using "
3010                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3011             shift = CLEAR_BITMAP_SHIFT_MIN;
3012         }
3013 
3014         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3015             pages = block->max_length >> TARGET_PAGE_BITS;
3016             /*
3017              * The initial dirty bitmap for migration must be set with all
3018              * ones to make sure we'll migrate every guest RAM page to
3019              * destination.
3020              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3021              * new migration after a failed migration, ram_list.
3022              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3023              * guest memory.
3024              */
3025             block->bmap = bitmap_new(pages);
3026             bitmap_set(block->bmap, 0, pages);
3027             block->clear_bmap_shift = shift;
3028             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3029         }
3030     }
3031 }
3032 
3033 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3034 {
3035     unsigned long pages;
3036     RAMBlock *rb;
3037 
3038     RCU_READ_LOCK_GUARD();
3039 
3040     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3041             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3042             rs->migration_dirty_pages -= pages;
3043     }
3044 }
3045 
3046 static void ram_init_bitmaps(RAMState *rs)
3047 {
3048     /* For memory_global_dirty_log_start below.  */
3049     qemu_mutex_lock_iothread();
3050     qemu_mutex_lock_ramlist();
3051 
3052     WITH_RCU_READ_LOCK_GUARD() {
3053         ram_list_init_bitmaps();
3054         /* We don't use dirty log with background snapshots */
3055         if (!migrate_background_snapshot()) {
3056             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3057             migration_bitmap_sync_precopy(rs);
3058         }
3059     }
3060     qemu_mutex_unlock_ramlist();
3061     qemu_mutex_unlock_iothread();
3062 
3063     /*
3064      * After an eventual first bitmap sync, fixup the initial bitmap
3065      * containing all 1s to exclude any discarded pages from migration.
3066      */
3067     migration_bitmap_clear_discarded_pages(rs);
3068 }
3069 
3070 static int ram_init_all(RAMState **rsp)
3071 {
3072     if (ram_state_init(rsp)) {
3073         return -1;
3074     }
3075 
3076     if (xbzrle_init()) {
3077         ram_state_cleanup(rsp);
3078         return -1;
3079     }
3080 
3081     ram_init_bitmaps(*rsp);
3082 
3083     return 0;
3084 }
3085 
3086 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3087 {
3088     RAMBlock *block;
3089     uint64_t pages = 0;
3090 
3091     /*
3092      * Postcopy is not using xbzrle/compression, so no need for that.
3093      * Also, since source are already halted, we don't need to care
3094      * about dirty page logging as well.
3095      */
3096 
3097     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3098         pages += bitmap_count_one(block->bmap,
3099                                   block->used_length >> TARGET_PAGE_BITS);
3100     }
3101 
3102     /* This may not be aligned with current bitmaps. Recalculate. */
3103     rs->migration_dirty_pages = pages;
3104 
3105     ram_state_reset(rs);
3106 
3107     /* Update RAMState cache of output QEMUFile */
3108     rs->f = out;
3109 
3110     trace_ram_state_resume_prepare(pages);
3111 }
3112 
3113 /*
3114  * This function clears bits of the free pages reported by the caller from the
3115  * migration dirty bitmap. @addr is the host address corresponding to the
3116  * start of the continuous guest free pages, and @len is the total bytes of
3117  * those pages.
3118  */
3119 void qemu_guest_free_page_hint(void *addr, size_t len)
3120 {
3121     RAMBlock *block;
3122     ram_addr_t offset;
3123     size_t used_len, start, npages;
3124     MigrationState *s = migrate_get_current();
3125 
3126     /* This function is currently expected to be used during live migration */
3127     if (!migration_is_setup_or_active(s->state)) {
3128         return;
3129     }
3130 
3131     for (; len > 0; len -= used_len, addr += used_len) {
3132         block = qemu_ram_block_from_host(addr, false, &offset);
3133         if (unlikely(!block || offset >= block->used_length)) {
3134             /*
3135              * The implementation might not support RAMBlock resize during
3136              * live migration, but it could happen in theory with future
3137              * updates. So we add a check here to capture that case.
3138              */
3139             error_report_once("%s unexpected error", __func__);
3140             return;
3141         }
3142 
3143         if (len <= block->used_length - offset) {
3144             used_len = len;
3145         } else {
3146             used_len = block->used_length - offset;
3147         }
3148 
3149         start = offset >> TARGET_PAGE_BITS;
3150         npages = used_len >> TARGET_PAGE_BITS;
3151 
3152         qemu_mutex_lock(&ram_state->bitmap_mutex);
3153         /*
3154          * The skipped free pages are equavalent to be sent from clear_bmap's
3155          * perspective, so clear the bits from the memory region bitmap which
3156          * are initially set. Otherwise those skipped pages will be sent in
3157          * the next round after syncing from the memory region bitmap.
3158          */
3159         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3160         ram_state->migration_dirty_pages -=
3161                       bitmap_count_one_with_offset(block->bmap, start, npages);
3162         bitmap_clear(block->bmap, start, npages);
3163         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3164     }
3165 }
3166 
3167 /*
3168  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3169  * long-running RCU critical section.  When rcu-reclaims in the code
3170  * start to become numerous it will be necessary to reduce the
3171  * granularity of these critical sections.
3172  */
3173 
3174 /**
3175  * ram_save_setup: Setup RAM for migration
3176  *
3177  * Returns zero to indicate success and negative for error
3178  *
3179  * @f: QEMUFile where to send the data
3180  * @opaque: RAMState pointer
3181  */
3182 static int ram_save_setup(QEMUFile *f, void *opaque)
3183 {
3184     RAMState **rsp = opaque;
3185     RAMBlock *block;
3186     int ret;
3187 
3188     if (compress_threads_save_setup()) {
3189         return -1;
3190     }
3191 
3192     /* migration has already setup the bitmap, reuse it. */
3193     if (!migration_in_colo_state()) {
3194         if (ram_init_all(rsp) != 0) {
3195             compress_threads_save_cleanup();
3196             return -1;
3197         }
3198     }
3199     (*rsp)->f = f;
3200 
3201     WITH_RCU_READ_LOCK_GUARD() {
3202         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3203 
3204         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3205             qemu_put_byte(f, strlen(block->idstr));
3206             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3207             qemu_put_be64(f, block->used_length);
3208             if (migrate_postcopy_ram() && block->page_size !=
3209                                           qemu_host_page_size) {
3210                 qemu_put_be64(f, block->page_size);
3211             }
3212             if (migrate_ignore_shared()) {
3213                 qemu_put_be64(f, block->mr->addr);
3214             }
3215         }
3216     }
3217 
3218     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3219     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3220 
3221     ret =  multifd_send_sync_main(f);
3222     if (ret < 0) {
3223         return ret;
3224     }
3225 
3226     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3227     qemu_fflush(f);
3228 
3229     return 0;
3230 }
3231 
3232 /**
3233  * ram_save_iterate: iterative stage for migration
3234  *
3235  * Returns zero to indicate success and negative for error
3236  *
3237  * @f: QEMUFile where to send the data
3238  * @opaque: RAMState pointer
3239  */
3240 static int ram_save_iterate(QEMUFile *f, void *opaque)
3241 {
3242     RAMState **temp = opaque;
3243     RAMState *rs = *temp;
3244     int ret = 0;
3245     int i;
3246     int64_t t0;
3247     int done = 0;
3248 
3249     if (blk_mig_bulk_active()) {
3250         /* Avoid transferring ram during bulk phase of block migration as
3251          * the bulk phase will usually take a long time and transferring
3252          * ram updates during that time is pointless. */
3253         goto out;
3254     }
3255 
3256     /*
3257      * We'll take this lock a little bit long, but it's okay for two reasons.
3258      * Firstly, the only possible other thread to take it is who calls
3259      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3260      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3261      * guarantees that we'll at least released it in a regular basis.
3262      */
3263     qemu_mutex_lock(&rs->bitmap_mutex);
3264     WITH_RCU_READ_LOCK_GUARD() {
3265         if (ram_list.version != rs->last_version) {
3266             ram_state_reset(rs);
3267         }
3268 
3269         /* Read version before ram_list.blocks */
3270         smp_rmb();
3271 
3272         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3273 
3274         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3275         i = 0;
3276         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3277                postcopy_has_request(rs)) {
3278             int pages;
3279 
3280             if (qemu_file_get_error(f)) {
3281                 break;
3282             }
3283 
3284             pages = ram_find_and_save_block(rs);
3285             /* no more pages to sent */
3286             if (pages == 0) {
3287                 done = 1;
3288                 break;
3289             }
3290 
3291             if (pages < 0) {
3292                 qemu_file_set_error(f, pages);
3293                 break;
3294             }
3295 
3296             rs->target_page_count += pages;
3297 
3298             /*
3299              * During postcopy, it is necessary to make sure one whole host
3300              * page is sent in one chunk.
3301              */
3302             if (migrate_postcopy_ram()) {
3303                 flush_compressed_data(rs);
3304             }
3305 
3306             /*
3307              * we want to check in the 1st loop, just in case it was the 1st
3308              * time and we had to sync the dirty bitmap.
3309              * qemu_clock_get_ns() is a bit expensive, so we only check each
3310              * some iterations
3311              */
3312             if ((i & 63) == 0) {
3313                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3314                               1000000;
3315                 if (t1 > MAX_WAIT) {
3316                     trace_ram_save_iterate_big_wait(t1, i);
3317                     break;
3318                 }
3319             }
3320             i++;
3321         }
3322     }
3323     qemu_mutex_unlock(&rs->bitmap_mutex);
3324 
3325     postcopy_preempt_reset_channel(rs);
3326 
3327     /*
3328      * Must occur before EOS (or any QEMUFile operation)
3329      * because of RDMA protocol.
3330      */
3331     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3332 
3333 out:
3334     if (ret >= 0
3335         && migration_is_setup_or_active(migrate_get_current()->state)) {
3336         ret = multifd_send_sync_main(rs->f);
3337         if (ret < 0) {
3338             return ret;
3339         }
3340 
3341         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3342         qemu_fflush(f);
3343         ram_transferred_add(8);
3344 
3345         ret = qemu_file_get_error(f);
3346     }
3347     if (ret < 0) {
3348         return ret;
3349     }
3350 
3351     return done;
3352 }
3353 
3354 /**
3355  * ram_save_complete: function called to send the remaining amount of ram
3356  *
3357  * Returns zero to indicate success or negative on error
3358  *
3359  * Called with iothread lock
3360  *
3361  * @f: QEMUFile where to send the data
3362  * @opaque: RAMState pointer
3363  */
3364 static int ram_save_complete(QEMUFile *f, void *opaque)
3365 {
3366     RAMState **temp = opaque;
3367     RAMState *rs = *temp;
3368     int ret = 0;
3369 
3370     rs->last_stage = !migration_in_colo_state();
3371 
3372     WITH_RCU_READ_LOCK_GUARD() {
3373         if (!migration_in_postcopy()) {
3374             migration_bitmap_sync_precopy(rs);
3375         }
3376 
3377         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3378 
3379         /* try transferring iterative blocks of memory */
3380 
3381         /* flush all remaining blocks regardless of rate limiting */
3382         while (true) {
3383             int pages;
3384 
3385             pages = ram_find_and_save_block(rs);
3386             /* no more blocks to sent */
3387             if (pages == 0) {
3388                 break;
3389             }
3390             if (pages < 0) {
3391                 ret = pages;
3392                 break;
3393             }
3394         }
3395 
3396         flush_compressed_data(rs);
3397         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3398     }
3399 
3400     if (ret < 0) {
3401         return ret;
3402     }
3403 
3404     postcopy_preempt_reset_channel(rs);
3405 
3406     ret = multifd_send_sync_main(rs->f);
3407     if (ret < 0) {
3408         return ret;
3409     }
3410 
3411     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3412     qemu_fflush(f);
3413 
3414     return 0;
3415 }
3416 
3417 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3418                              uint64_t *res_precopy_only,
3419                              uint64_t *res_compatible,
3420                              uint64_t *res_postcopy_only)
3421 {
3422     RAMState **temp = opaque;
3423     RAMState *rs = *temp;
3424     uint64_t remaining_size;
3425 
3426     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3427 
3428     if (!migration_in_postcopy() &&
3429         remaining_size < max_size) {
3430         qemu_mutex_lock_iothread();
3431         WITH_RCU_READ_LOCK_GUARD() {
3432             migration_bitmap_sync_precopy(rs);
3433         }
3434         qemu_mutex_unlock_iothread();
3435         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3436     }
3437 
3438     if (migrate_postcopy_ram()) {
3439         /* We can do postcopy, and all the data is postcopiable */
3440         *res_compatible += remaining_size;
3441     } else {
3442         *res_precopy_only += remaining_size;
3443     }
3444 }
3445 
3446 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3447 {
3448     unsigned int xh_len;
3449     int xh_flags;
3450     uint8_t *loaded_data;
3451 
3452     /* extract RLE header */
3453     xh_flags = qemu_get_byte(f);
3454     xh_len = qemu_get_be16(f);
3455 
3456     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3457         error_report("Failed to load XBZRLE page - wrong compression!");
3458         return -1;
3459     }
3460 
3461     if (xh_len > TARGET_PAGE_SIZE) {
3462         error_report("Failed to load XBZRLE page - len overflow!");
3463         return -1;
3464     }
3465     loaded_data = XBZRLE.decoded_buf;
3466     /* load data and decode */
3467     /* it can change loaded_data to point to an internal buffer */
3468     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3469 
3470     /* decode RLE */
3471     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3472                              TARGET_PAGE_SIZE) == -1) {
3473         error_report("Failed to load XBZRLE page - decode error!");
3474         return -1;
3475     }
3476 
3477     return 0;
3478 }
3479 
3480 /**
3481  * ram_block_from_stream: read a RAMBlock id from the migration stream
3482  *
3483  * Must be called from within a rcu critical section.
3484  *
3485  * Returns a pointer from within the RCU-protected ram_list.
3486  *
3487  * @mis: the migration incoming state pointer
3488  * @f: QEMUFile where to read the data from
3489  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3490  * @channel: the channel we're using
3491  */
3492 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3493                                               QEMUFile *f, int flags,
3494                                               int channel)
3495 {
3496     RAMBlock *block = mis->last_recv_block[channel];
3497     char id[256];
3498     uint8_t len;
3499 
3500     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3501         if (!block) {
3502             error_report("Ack, bad migration stream!");
3503             return NULL;
3504         }
3505         return block;
3506     }
3507 
3508     len = qemu_get_byte(f);
3509     qemu_get_buffer(f, (uint8_t *)id, len);
3510     id[len] = 0;
3511 
3512     block = qemu_ram_block_by_name(id);
3513     if (!block) {
3514         error_report("Can't find block %s", id);
3515         return NULL;
3516     }
3517 
3518     if (ramblock_is_ignored(block)) {
3519         error_report("block %s should not be migrated !", id);
3520         return NULL;
3521     }
3522 
3523     mis->last_recv_block[channel] = block;
3524 
3525     return block;
3526 }
3527 
3528 static inline void *host_from_ram_block_offset(RAMBlock *block,
3529                                                ram_addr_t offset)
3530 {
3531     if (!offset_in_ramblock(block, offset)) {
3532         return NULL;
3533     }
3534 
3535     return block->host + offset;
3536 }
3537 
3538 static void *host_page_from_ram_block_offset(RAMBlock *block,
3539                                              ram_addr_t offset)
3540 {
3541     /* Note: Explicitly no check against offset_in_ramblock(). */
3542     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3543                                    block->page_size);
3544 }
3545 
3546 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3547                                                          ram_addr_t offset)
3548 {
3549     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3550 }
3551 
3552 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3553                              ram_addr_t offset, bool record_bitmap)
3554 {
3555     if (!offset_in_ramblock(block, offset)) {
3556         return NULL;
3557     }
3558     if (!block->colo_cache) {
3559         error_report("%s: colo_cache is NULL in block :%s",
3560                      __func__, block->idstr);
3561         return NULL;
3562     }
3563 
3564     /*
3565     * During colo checkpoint, we need bitmap of these migrated pages.
3566     * It help us to decide which pages in ram cache should be flushed
3567     * into VM's RAM later.
3568     */
3569     if (record_bitmap &&
3570         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3571         ram_state->migration_dirty_pages++;
3572     }
3573     return block->colo_cache + offset;
3574 }
3575 
3576 /**
3577  * ram_handle_compressed: handle the zero page case
3578  *
3579  * If a page (or a whole RDMA chunk) has been
3580  * determined to be zero, then zap it.
3581  *
3582  * @host: host address for the zero page
3583  * @ch: what the page is filled from.  We only support zero
3584  * @size: size of the zero page
3585  */
3586 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3587 {
3588     if (ch != 0 || !buffer_is_zero(host, size)) {
3589         memset(host, ch, size);
3590     }
3591 }
3592 
3593 /* return the size after decompression, or negative value on error */
3594 static int
3595 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3596                      const uint8_t *source, size_t source_len)
3597 {
3598     int err;
3599 
3600     err = inflateReset(stream);
3601     if (err != Z_OK) {
3602         return -1;
3603     }
3604 
3605     stream->avail_in = source_len;
3606     stream->next_in = (uint8_t *)source;
3607     stream->avail_out = dest_len;
3608     stream->next_out = dest;
3609 
3610     err = inflate(stream, Z_NO_FLUSH);
3611     if (err != Z_STREAM_END) {
3612         return -1;
3613     }
3614 
3615     return stream->total_out;
3616 }
3617 
3618 static void *do_data_decompress(void *opaque)
3619 {
3620     DecompressParam *param = opaque;
3621     unsigned long pagesize;
3622     uint8_t *des;
3623     int len, ret;
3624 
3625     qemu_mutex_lock(&param->mutex);
3626     while (!param->quit) {
3627         if (param->des) {
3628             des = param->des;
3629             len = param->len;
3630             param->des = 0;
3631             qemu_mutex_unlock(&param->mutex);
3632 
3633             pagesize = TARGET_PAGE_SIZE;
3634 
3635             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3636                                        param->compbuf, len);
3637             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3638                 error_report("decompress data failed");
3639                 qemu_file_set_error(decomp_file, ret);
3640             }
3641 
3642             qemu_mutex_lock(&decomp_done_lock);
3643             param->done = true;
3644             qemu_cond_signal(&decomp_done_cond);
3645             qemu_mutex_unlock(&decomp_done_lock);
3646 
3647             qemu_mutex_lock(&param->mutex);
3648         } else {
3649             qemu_cond_wait(&param->cond, &param->mutex);
3650         }
3651     }
3652     qemu_mutex_unlock(&param->mutex);
3653 
3654     return NULL;
3655 }
3656 
3657 static int wait_for_decompress_done(void)
3658 {
3659     int idx, thread_count;
3660 
3661     if (!migrate_use_compression()) {
3662         return 0;
3663     }
3664 
3665     thread_count = migrate_decompress_threads();
3666     qemu_mutex_lock(&decomp_done_lock);
3667     for (idx = 0; idx < thread_count; idx++) {
3668         while (!decomp_param[idx].done) {
3669             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3670         }
3671     }
3672     qemu_mutex_unlock(&decomp_done_lock);
3673     return qemu_file_get_error(decomp_file);
3674 }
3675 
3676 static void compress_threads_load_cleanup(void)
3677 {
3678     int i, thread_count;
3679 
3680     if (!migrate_use_compression()) {
3681         return;
3682     }
3683     thread_count = migrate_decompress_threads();
3684     for (i = 0; i < thread_count; i++) {
3685         /*
3686          * we use it as a indicator which shows if the thread is
3687          * properly init'd or not
3688          */
3689         if (!decomp_param[i].compbuf) {
3690             break;
3691         }
3692 
3693         qemu_mutex_lock(&decomp_param[i].mutex);
3694         decomp_param[i].quit = true;
3695         qemu_cond_signal(&decomp_param[i].cond);
3696         qemu_mutex_unlock(&decomp_param[i].mutex);
3697     }
3698     for (i = 0; i < thread_count; i++) {
3699         if (!decomp_param[i].compbuf) {
3700             break;
3701         }
3702 
3703         qemu_thread_join(decompress_threads + i);
3704         qemu_mutex_destroy(&decomp_param[i].mutex);
3705         qemu_cond_destroy(&decomp_param[i].cond);
3706         inflateEnd(&decomp_param[i].stream);
3707         g_free(decomp_param[i].compbuf);
3708         decomp_param[i].compbuf = NULL;
3709     }
3710     g_free(decompress_threads);
3711     g_free(decomp_param);
3712     decompress_threads = NULL;
3713     decomp_param = NULL;
3714     decomp_file = NULL;
3715 }
3716 
3717 static int compress_threads_load_setup(QEMUFile *f)
3718 {
3719     int i, thread_count;
3720 
3721     if (!migrate_use_compression()) {
3722         return 0;
3723     }
3724 
3725     thread_count = migrate_decompress_threads();
3726     decompress_threads = g_new0(QemuThread, thread_count);
3727     decomp_param = g_new0(DecompressParam, thread_count);
3728     qemu_mutex_init(&decomp_done_lock);
3729     qemu_cond_init(&decomp_done_cond);
3730     decomp_file = f;
3731     for (i = 0; i < thread_count; i++) {
3732         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3733             goto exit;
3734         }
3735 
3736         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3737         qemu_mutex_init(&decomp_param[i].mutex);
3738         qemu_cond_init(&decomp_param[i].cond);
3739         decomp_param[i].done = true;
3740         decomp_param[i].quit = false;
3741         qemu_thread_create(decompress_threads + i, "decompress",
3742                            do_data_decompress, decomp_param + i,
3743                            QEMU_THREAD_JOINABLE);
3744     }
3745     return 0;
3746 exit:
3747     compress_threads_load_cleanup();
3748     return -1;
3749 }
3750 
3751 static void decompress_data_with_multi_threads(QEMUFile *f,
3752                                                void *host, int len)
3753 {
3754     int idx, thread_count;
3755 
3756     thread_count = migrate_decompress_threads();
3757     QEMU_LOCK_GUARD(&decomp_done_lock);
3758     while (true) {
3759         for (idx = 0; idx < thread_count; idx++) {
3760             if (decomp_param[idx].done) {
3761                 decomp_param[idx].done = false;
3762                 qemu_mutex_lock(&decomp_param[idx].mutex);
3763                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3764                 decomp_param[idx].des = host;
3765                 decomp_param[idx].len = len;
3766                 qemu_cond_signal(&decomp_param[idx].cond);
3767                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3768                 break;
3769             }
3770         }
3771         if (idx < thread_count) {
3772             break;
3773         } else {
3774             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3775         }
3776     }
3777 }
3778 
3779 static void colo_init_ram_state(void)
3780 {
3781     ram_state_init(&ram_state);
3782 }
3783 
3784 /*
3785  * colo cache: this is for secondary VM, we cache the whole
3786  * memory of the secondary VM, it is need to hold the global lock
3787  * to call this helper.
3788  */
3789 int colo_init_ram_cache(void)
3790 {
3791     RAMBlock *block;
3792 
3793     WITH_RCU_READ_LOCK_GUARD() {
3794         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3795             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3796                                                     NULL, false, false);
3797             if (!block->colo_cache) {
3798                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3799                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3800                              block->used_length);
3801                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3802                     if (block->colo_cache) {
3803                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3804                         block->colo_cache = NULL;
3805                     }
3806                 }
3807                 return -errno;
3808             }
3809             if (!machine_dump_guest_core(current_machine)) {
3810                 qemu_madvise(block->colo_cache, block->used_length,
3811                              QEMU_MADV_DONTDUMP);
3812             }
3813         }
3814     }
3815 
3816     /*
3817     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3818     * with to decide which page in cache should be flushed into SVM's RAM. Here
3819     * we use the same name 'ram_bitmap' as for migration.
3820     */
3821     if (ram_bytes_total()) {
3822         RAMBlock *block;
3823 
3824         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3825             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3826             block->bmap = bitmap_new(pages);
3827         }
3828     }
3829 
3830     colo_init_ram_state();
3831     return 0;
3832 }
3833 
3834 /* TODO: duplicated with ram_init_bitmaps */
3835 void colo_incoming_start_dirty_log(void)
3836 {
3837     RAMBlock *block = NULL;
3838     /* For memory_global_dirty_log_start below. */
3839     qemu_mutex_lock_iothread();
3840     qemu_mutex_lock_ramlist();
3841 
3842     memory_global_dirty_log_sync();
3843     WITH_RCU_READ_LOCK_GUARD() {
3844         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3845             ramblock_sync_dirty_bitmap(ram_state, block);
3846             /* Discard this dirty bitmap record */
3847             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3848         }
3849         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3850     }
3851     ram_state->migration_dirty_pages = 0;
3852     qemu_mutex_unlock_ramlist();
3853     qemu_mutex_unlock_iothread();
3854 }
3855 
3856 /* It is need to hold the global lock to call this helper */
3857 void colo_release_ram_cache(void)
3858 {
3859     RAMBlock *block;
3860 
3861     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3862     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3863         g_free(block->bmap);
3864         block->bmap = NULL;
3865     }
3866 
3867     WITH_RCU_READ_LOCK_GUARD() {
3868         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3869             if (block->colo_cache) {
3870                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3871                 block->colo_cache = NULL;
3872             }
3873         }
3874     }
3875     ram_state_cleanup(&ram_state);
3876 }
3877 
3878 /**
3879  * ram_load_setup: Setup RAM for migration incoming side
3880  *
3881  * Returns zero to indicate success and negative for error
3882  *
3883  * @f: QEMUFile where to receive the data
3884  * @opaque: RAMState pointer
3885  */
3886 static int ram_load_setup(QEMUFile *f, void *opaque)
3887 {
3888     if (compress_threads_load_setup(f)) {
3889         return -1;
3890     }
3891 
3892     xbzrle_load_setup();
3893     ramblock_recv_map_init();
3894 
3895     return 0;
3896 }
3897 
3898 static int ram_load_cleanup(void *opaque)
3899 {
3900     RAMBlock *rb;
3901 
3902     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3903         qemu_ram_block_writeback(rb);
3904     }
3905 
3906     xbzrle_load_cleanup();
3907     compress_threads_load_cleanup();
3908 
3909     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3910         g_free(rb->receivedmap);
3911         rb->receivedmap = NULL;
3912     }
3913 
3914     return 0;
3915 }
3916 
3917 /**
3918  * ram_postcopy_incoming_init: allocate postcopy data structures
3919  *
3920  * Returns 0 for success and negative if there was one error
3921  *
3922  * @mis: current migration incoming state
3923  *
3924  * Allocate data structures etc needed by incoming migration with
3925  * postcopy-ram. postcopy-ram's similarly names
3926  * postcopy_ram_incoming_init does the work.
3927  */
3928 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3929 {
3930     return postcopy_ram_incoming_init(mis);
3931 }
3932 
3933 /**
3934  * ram_load_postcopy: load a page in postcopy case
3935  *
3936  * Returns 0 for success or -errno in case of error
3937  *
3938  * Called in postcopy mode by ram_load().
3939  * rcu_read_lock is taken prior to this being called.
3940  *
3941  * @f: QEMUFile where to send the data
3942  * @channel: the channel to use for loading
3943  */
3944 int ram_load_postcopy(QEMUFile *f, int channel)
3945 {
3946     int flags = 0, ret = 0;
3947     bool place_needed = false;
3948     bool matches_target_page_size = false;
3949     MigrationIncomingState *mis = migration_incoming_get_current();
3950     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3951 
3952     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3953         ram_addr_t addr;
3954         void *page_buffer = NULL;
3955         void *place_source = NULL;
3956         RAMBlock *block = NULL;
3957         uint8_t ch;
3958         int len;
3959 
3960         addr = qemu_get_be64(f);
3961 
3962         /*
3963          * If qemu file error, we should stop here, and then "addr"
3964          * may be invalid
3965          */
3966         ret = qemu_file_get_error(f);
3967         if (ret) {
3968             break;
3969         }
3970 
3971         flags = addr & ~TARGET_PAGE_MASK;
3972         addr &= TARGET_PAGE_MASK;
3973 
3974         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3975         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3976                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3977             block = ram_block_from_stream(mis, f, flags, channel);
3978             if (!block) {
3979                 ret = -EINVAL;
3980                 break;
3981             }
3982 
3983             /*
3984              * Relying on used_length is racy and can result in false positives.
3985              * We might place pages beyond used_length in case RAM was shrunk
3986              * while in postcopy, which is fine - trying to place via
3987              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3988              */
3989             if (!block->host || addr >= block->postcopy_length) {
3990                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3991                 ret = -EINVAL;
3992                 break;
3993             }
3994             tmp_page->target_pages++;
3995             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3996             /*
3997              * Postcopy requires that we place whole host pages atomically;
3998              * these may be huge pages for RAMBlocks that are backed by
3999              * hugetlbfs.
4000              * To make it atomic, the data is read into a temporary page
4001              * that's moved into place later.
4002              * The migration protocol uses,  possibly smaller, target-pages
4003              * however the source ensures it always sends all the components
4004              * of a host page in one chunk.
4005              */
4006             page_buffer = tmp_page->tmp_huge_page +
4007                           host_page_offset_from_ram_block_offset(block, addr);
4008             /* If all TP are zero then we can optimise the place */
4009             if (tmp_page->target_pages == 1) {
4010                 tmp_page->host_addr =
4011                     host_page_from_ram_block_offset(block, addr);
4012             } else if (tmp_page->host_addr !=
4013                        host_page_from_ram_block_offset(block, addr)) {
4014                 /* not the 1st TP within the HP */
4015                 error_report("Non-same host page detected on channel %d: "
4016                              "Target host page %p, received host page %p "
4017                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4018                              channel, tmp_page->host_addr,
4019                              host_page_from_ram_block_offset(block, addr),
4020                              block->idstr, addr, tmp_page->target_pages);
4021                 ret = -EINVAL;
4022                 break;
4023             }
4024 
4025             /*
4026              * If it's the last part of a host page then we place the host
4027              * page
4028              */
4029             if (tmp_page->target_pages ==
4030                 (block->page_size / TARGET_PAGE_SIZE)) {
4031                 place_needed = true;
4032             }
4033             place_source = tmp_page->tmp_huge_page;
4034         }
4035 
4036         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4037         case RAM_SAVE_FLAG_ZERO:
4038             ch = qemu_get_byte(f);
4039             /*
4040              * Can skip to set page_buffer when
4041              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4042              */
4043             if (ch || !matches_target_page_size) {
4044                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4045             }
4046             if (ch) {
4047                 tmp_page->all_zero = false;
4048             }
4049             break;
4050 
4051         case RAM_SAVE_FLAG_PAGE:
4052             tmp_page->all_zero = false;
4053             if (!matches_target_page_size) {
4054                 /* For huge pages, we always use temporary buffer */
4055                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4056             } else {
4057                 /*
4058                  * For small pages that matches target page size, we
4059                  * avoid the qemu_file copy.  Instead we directly use
4060                  * the buffer of QEMUFile to place the page.  Note: we
4061                  * cannot do any QEMUFile operation before using that
4062                  * buffer to make sure the buffer is valid when
4063                  * placing the page.
4064                  */
4065                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4066                                          TARGET_PAGE_SIZE);
4067             }
4068             break;
4069         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4070             tmp_page->all_zero = false;
4071             len = qemu_get_be32(f);
4072             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4073                 error_report("Invalid compressed data length: %d", len);
4074                 ret = -EINVAL;
4075                 break;
4076             }
4077             decompress_data_with_multi_threads(f, page_buffer, len);
4078             break;
4079 
4080         case RAM_SAVE_FLAG_EOS:
4081             /* normal exit */
4082             multifd_recv_sync_main();
4083             break;
4084         default:
4085             error_report("Unknown combination of migration flags: 0x%x"
4086                          " (postcopy mode)", flags);
4087             ret = -EINVAL;
4088             break;
4089         }
4090 
4091         /* Got the whole host page, wait for decompress before placing. */
4092         if (place_needed) {
4093             ret |= wait_for_decompress_done();
4094         }
4095 
4096         /* Detect for any possible file errors */
4097         if (!ret && qemu_file_get_error(f)) {
4098             ret = qemu_file_get_error(f);
4099         }
4100 
4101         if (!ret && place_needed) {
4102             if (tmp_page->all_zero) {
4103                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4104             } else {
4105                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4106                                           place_source, block);
4107             }
4108             place_needed = false;
4109             postcopy_temp_page_reset(tmp_page);
4110         }
4111     }
4112 
4113     return ret;
4114 }
4115 
4116 static bool postcopy_is_advised(void)
4117 {
4118     PostcopyState ps = postcopy_state_get();
4119     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4120 }
4121 
4122 static bool postcopy_is_running(void)
4123 {
4124     PostcopyState ps = postcopy_state_get();
4125     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4126 }
4127 
4128 /*
4129  * Flush content of RAM cache into SVM's memory.
4130  * Only flush the pages that be dirtied by PVM or SVM or both.
4131  */
4132 void colo_flush_ram_cache(void)
4133 {
4134     RAMBlock *block = NULL;
4135     void *dst_host;
4136     void *src_host;
4137     unsigned long offset = 0;
4138 
4139     memory_global_dirty_log_sync();
4140     WITH_RCU_READ_LOCK_GUARD() {
4141         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4142             ramblock_sync_dirty_bitmap(ram_state, block);
4143         }
4144     }
4145 
4146     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4147     WITH_RCU_READ_LOCK_GUARD() {
4148         block = QLIST_FIRST_RCU(&ram_list.blocks);
4149 
4150         while (block) {
4151             unsigned long num = 0;
4152 
4153             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4154             if (!offset_in_ramblock(block,
4155                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4156                 offset = 0;
4157                 num = 0;
4158                 block = QLIST_NEXT_RCU(block, next);
4159             } else {
4160                 unsigned long i = 0;
4161 
4162                 for (i = 0; i < num; i++) {
4163                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4164                 }
4165                 dst_host = block->host
4166                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4167                 src_host = block->colo_cache
4168                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4169                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4170                 offset += num;
4171             }
4172         }
4173     }
4174     trace_colo_flush_ram_cache_end();
4175 }
4176 
4177 /**
4178  * ram_load_precopy: load pages in precopy case
4179  *
4180  * Returns 0 for success or -errno in case of error
4181  *
4182  * Called in precopy mode by ram_load().
4183  * rcu_read_lock is taken prior to this being called.
4184  *
4185  * @f: QEMUFile where to send the data
4186  */
4187 static int ram_load_precopy(QEMUFile *f)
4188 {
4189     MigrationIncomingState *mis = migration_incoming_get_current();
4190     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4191     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4192     bool postcopy_advised = postcopy_is_advised();
4193     if (!migrate_use_compression()) {
4194         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4195     }
4196 
4197     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4198         ram_addr_t addr, total_ram_bytes;
4199         void *host = NULL, *host_bak = NULL;
4200         uint8_t ch;
4201 
4202         /*
4203          * Yield periodically to let main loop run, but an iteration of
4204          * the main loop is expensive, so do it each some iterations
4205          */
4206         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4207             aio_co_schedule(qemu_get_current_aio_context(),
4208                             qemu_coroutine_self());
4209             qemu_coroutine_yield();
4210         }
4211         i++;
4212 
4213         addr = qemu_get_be64(f);
4214         flags = addr & ~TARGET_PAGE_MASK;
4215         addr &= TARGET_PAGE_MASK;
4216 
4217         if (flags & invalid_flags) {
4218             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4219                 error_report("Received an unexpected compressed page");
4220             }
4221 
4222             ret = -EINVAL;
4223             break;
4224         }
4225 
4226         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4227                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4228             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4229                                                     RAM_CHANNEL_PRECOPY);
4230 
4231             host = host_from_ram_block_offset(block, addr);
4232             /*
4233              * After going into COLO stage, we should not load the page
4234              * into SVM's memory directly, we put them into colo_cache firstly.
4235              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4236              * Previously, we copied all these memory in preparing stage of COLO
4237              * while we need to stop VM, which is a time-consuming process.
4238              * Here we optimize it by a trick, back-up every page while in
4239              * migration process while COLO is enabled, though it affects the
4240              * speed of the migration, but it obviously reduce the downtime of
4241              * back-up all SVM'S memory in COLO preparing stage.
4242              */
4243             if (migration_incoming_colo_enabled()) {
4244                 if (migration_incoming_in_colo_state()) {
4245                     /* In COLO stage, put all pages into cache temporarily */
4246                     host = colo_cache_from_block_offset(block, addr, true);
4247                 } else {
4248                    /*
4249                     * In migration stage but before COLO stage,
4250                     * Put all pages into both cache and SVM's memory.
4251                     */
4252                     host_bak = colo_cache_from_block_offset(block, addr, false);
4253                 }
4254             }
4255             if (!host) {
4256                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4257                 ret = -EINVAL;
4258                 break;
4259             }
4260             if (!migration_incoming_in_colo_state()) {
4261                 ramblock_recv_bitmap_set(block, host);
4262             }
4263 
4264             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4265         }
4266 
4267         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4268         case RAM_SAVE_FLAG_MEM_SIZE:
4269             /* Synchronize RAM block list */
4270             total_ram_bytes = addr;
4271             while (!ret && total_ram_bytes) {
4272                 RAMBlock *block;
4273                 char id[256];
4274                 ram_addr_t length;
4275 
4276                 len = qemu_get_byte(f);
4277                 qemu_get_buffer(f, (uint8_t *)id, len);
4278                 id[len] = 0;
4279                 length = qemu_get_be64(f);
4280 
4281                 block = qemu_ram_block_by_name(id);
4282                 if (block && !qemu_ram_is_migratable(block)) {
4283                     error_report("block %s should not be migrated !", id);
4284                     ret = -EINVAL;
4285                 } else if (block) {
4286                     if (length != block->used_length) {
4287                         Error *local_err = NULL;
4288 
4289                         ret = qemu_ram_resize(block, length,
4290                                               &local_err);
4291                         if (local_err) {
4292                             error_report_err(local_err);
4293                         }
4294                     }
4295                     /* For postcopy we need to check hugepage sizes match */
4296                     if (postcopy_advised && migrate_postcopy_ram() &&
4297                         block->page_size != qemu_host_page_size) {
4298                         uint64_t remote_page_size = qemu_get_be64(f);
4299                         if (remote_page_size != block->page_size) {
4300                             error_report("Mismatched RAM page size %s "
4301                                          "(local) %zd != %" PRId64,
4302                                          id, block->page_size,
4303                                          remote_page_size);
4304                             ret = -EINVAL;
4305                         }
4306                     }
4307                     if (migrate_ignore_shared()) {
4308                         hwaddr addr = qemu_get_be64(f);
4309                         if (ramblock_is_ignored(block) &&
4310                             block->mr->addr != addr) {
4311                             error_report("Mismatched GPAs for block %s "
4312                                          "%" PRId64 "!= %" PRId64,
4313                                          id, (uint64_t)addr,
4314                                          (uint64_t)block->mr->addr);
4315                             ret = -EINVAL;
4316                         }
4317                     }
4318                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4319                                           block->idstr);
4320                 } else {
4321                     error_report("Unknown ramblock \"%s\", cannot "
4322                                  "accept migration", id);
4323                     ret = -EINVAL;
4324                 }
4325 
4326                 total_ram_bytes -= length;
4327             }
4328             break;
4329 
4330         case RAM_SAVE_FLAG_ZERO:
4331             ch = qemu_get_byte(f);
4332             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4333             break;
4334 
4335         case RAM_SAVE_FLAG_PAGE:
4336             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4337             break;
4338 
4339         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4340             len = qemu_get_be32(f);
4341             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4342                 error_report("Invalid compressed data length: %d", len);
4343                 ret = -EINVAL;
4344                 break;
4345             }
4346             decompress_data_with_multi_threads(f, host, len);
4347             break;
4348 
4349         case RAM_SAVE_FLAG_XBZRLE:
4350             if (load_xbzrle(f, addr, host) < 0) {
4351                 error_report("Failed to decompress XBZRLE page at "
4352                              RAM_ADDR_FMT, addr);
4353                 ret = -EINVAL;
4354                 break;
4355             }
4356             break;
4357         case RAM_SAVE_FLAG_EOS:
4358             /* normal exit */
4359             multifd_recv_sync_main();
4360             break;
4361         default:
4362             if (flags & RAM_SAVE_FLAG_HOOK) {
4363                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4364             } else {
4365                 error_report("Unknown combination of migration flags: 0x%x",
4366                              flags);
4367                 ret = -EINVAL;
4368             }
4369         }
4370         if (!ret) {
4371             ret = qemu_file_get_error(f);
4372         }
4373         if (!ret && host_bak) {
4374             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4375         }
4376     }
4377 
4378     ret |= wait_for_decompress_done();
4379     return ret;
4380 }
4381 
4382 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4383 {
4384     int ret = 0;
4385     static uint64_t seq_iter;
4386     /*
4387      * If system is running in postcopy mode, page inserts to host memory must
4388      * be atomic
4389      */
4390     bool postcopy_running = postcopy_is_running();
4391 
4392     seq_iter++;
4393 
4394     if (version_id != 4) {
4395         return -EINVAL;
4396     }
4397 
4398     /*
4399      * This RCU critical section can be very long running.
4400      * When RCU reclaims in the code start to become numerous,
4401      * it will be necessary to reduce the granularity of this
4402      * critical section.
4403      */
4404     WITH_RCU_READ_LOCK_GUARD() {
4405         if (postcopy_running) {
4406             /*
4407              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4408              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4409              * service fast page faults.
4410              */
4411             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4412         } else {
4413             ret = ram_load_precopy(f);
4414         }
4415     }
4416     trace_ram_load_complete(ret, seq_iter);
4417 
4418     return ret;
4419 }
4420 
4421 static bool ram_has_postcopy(void *opaque)
4422 {
4423     RAMBlock *rb;
4424     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4425         if (ramblock_is_pmem(rb)) {
4426             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4427                          "is not supported now!", rb->idstr, rb->host);
4428             return false;
4429         }
4430     }
4431 
4432     return migrate_postcopy_ram();
4433 }
4434 
4435 /* Sync all the dirty bitmap with destination VM.  */
4436 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4437 {
4438     RAMBlock *block;
4439     QEMUFile *file = s->to_dst_file;
4440     int ramblock_count = 0;
4441 
4442     trace_ram_dirty_bitmap_sync_start();
4443 
4444     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4445         qemu_savevm_send_recv_bitmap(file, block->idstr);
4446         trace_ram_dirty_bitmap_request(block->idstr);
4447         ramblock_count++;
4448     }
4449 
4450     trace_ram_dirty_bitmap_sync_wait();
4451 
4452     /* Wait until all the ramblocks' dirty bitmap synced */
4453     while (ramblock_count--) {
4454         qemu_sem_wait(&s->rp_state.rp_sem);
4455     }
4456 
4457     trace_ram_dirty_bitmap_sync_complete();
4458 
4459     return 0;
4460 }
4461 
4462 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4463 {
4464     qemu_sem_post(&s->rp_state.rp_sem);
4465 }
4466 
4467 /*
4468  * Read the received bitmap, revert it as the initial dirty bitmap.
4469  * This is only used when the postcopy migration is paused but wants
4470  * to resume from a middle point.
4471  */
4472 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4473 {
4474     int ret = -EINVAL;
4475     /* from_dst_file is always valid because we're within rp_thread */
4476     QEMUFile *file = s->rp_state.from_dst_file;
4477     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4478     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4479     uint64_t size, end_mark;
4480 
4481     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4482 
4483     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4484         error_report("%s: incorrect state %s", __func__,
4485                      MigrationStatus_str(s->state));
4486         return -EINVAL;
4487     }
4488 
4489     /*
4490      * Note: see comments in ramblock_recv_bitmap_send() on why we
4491      * need the endianness conversion, and the paddings.
4492      */
4493     local_size = ROUND_UP(local_size, 8);
4494 
4495     /* Add paddings */
4496     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4497 
4498     size = qemu_get_be64(file);
4499 
4500     /* The size of the bitmap should match with our ramblock */
4501     if (size != local_size) {
4502         error_report("%s: ramblock '%s' bitmap size mismatch "
4503                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4504                      block->idstr, size, local_size);
4505         ret = -EINVAL;
4506         goto out;
4507     }
4508 
4509     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4510     end_mark = qemu_get_be64(file);
4511 
4512     ret = qemu_file_get_error(file);
4513     if (ret || size != local_size) {
4514         error_report("%s: read bitmap failed for ramblock '%s': %d"
4515                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4516                      __func__, block->idstr, ret, local_size, size);
4517         ret = -EIO;
4518         goto out;
4519     }
4520 
4521     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4522         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4523                      __func__, block->idstr, end_mark);
4524         ret = -EINVAL;
4525         goto out;
4526     }
4527 
4528     /*
4529      * Endianness conversion. We are during postcopy (though paused).
4530      * The dirty bitmap won't change. We can directly modify it.
4531      */
4532     bitmap_from_le(block->bmap, le_bitmap, nbits);
4533 
4534     /*
4535      * What we received is "received bitmap". Revert it as the initial
4536      * dirty bitmap for this ramblock.
4537      */
4538     bitmap_complement(block->bmap, block->bmap, nbits);
4539 
4540     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4541     ramblock_dirty_bitmap_clear_discarded_pages(block);
4542 
4543     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4544     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4545 
4546     /*
4547      * We succeeded to sync bitmap for current ramblock. If this is
4548      * the last one to sync, we need to notify the main send thread.
4549      */
4550     ram_dirty_bitmap_reload_notify(s);
4551 
4552     ret = 0;
4553 out:
4554     g_free(le_bitmap);
4555     return ret;
4556 }
4557 
4558 static int ram_resume_prepare(MigrationState *s, void *opaque)
4559 {
4560     RAMState *rs = *(RAMState **)opaque;
4561     int ret;
4562 
4563     ret = ram_dirty_bitmap_sync_all(s, rs);
4564     if (ret) {
4565         return ret;
4566     }
4567 
4568     ram_state_resume_prepare(rs, s->to_dst_file);
4569 
4570     return 0;
4571 }
4572 
4573 void postcopy_preempt_shutdown_file(MigrationState *s)
4574 {
4575     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4576     qemu_fflush(s->postcopy_qemufile_src);
4577 }
4578 
4579 static SaveVMHandlers savevm_ram_handlers = {
4580     .save_setup = ram_save_setup,
4581     .save_live_iterate = ram_save_iterate,
4582     .save_live_complete_postcopy = ram_save_complete,
4583     .save_live_complete_precopy = ram_save_complete,
4584     .has_postcopy = ram_has_postcopy,
4585     .save_live_pending = ram_save_pending,
4586     .load_state = ram_load,
4587     .save_cleanup = ram_save_cleanup,
4588     .load_setup = ram_load_setup,
4589     .load_cleanup = ram_load_cleanup,
4590     .resume_prepare = ram_resume_prepare,
4591 };
4592 
4593 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4594                                       size_t old_size, size_t new_size)
4595 {
4596     PostcopyState ps = postcopy_state_get();
4597     ram_addr_t offset;
4598     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4599     Error *err = NULL;
4600 
4601     if (ramblock_is_ignored(rb)) {
4602         return;
4603     }
4604 
4605     if (!migration_is_idle()) {
4606         /*
4607          * Precopy code on the source cannot deal with the size of RAM blocks
4608          * changing at random points in time - especially after sending the
4609          * RAM block sizes in the migration stream, they must no longer change.
4610          * Abort and indicate a proper reason.
4611          */
4612         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4613         migration_cancel(err);
4614         error_free(err);
4615     }
4616 
4617     switch (ps) {
4618     case POSTCOPY_INCOMING_ADVISE:
4619         /*
4620          * Update what ram_postcopy_incoming_init()->init_range() does at the
4621          * time postcopy was advised. Syncing RAM blocks with the source will
4622          * result in RAM resizes.
4623          */
4624         if (old_size < new_size) {
4625             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4626                 error_report("RAM block '%s' discard of resized RAM failed",
4627                              rb->idstr);
4628             }
4629         }
4630         rb->postcopy_length = new_size;
4631         break;
4632     case POSTCOPY_INCOMING_NONE:
4633     case POSTCOPY_INCOMING_RUNNING:
4634     case POSTCOPY_INCOMING_END:
4635         /*
4636          * Once our guest is running, postcopy does no longer care about
4637          * resizes. When growing, the new memory was not available on the
4638          * source, no handler needed.
4639          */
4640         break;
4641     default:
4642         error_report("RAM block '%s' resized during postcopy state: %d",
4643                      rb->idstr, ps);
4644         exit(-1);
4645     }
4646 }
4647 
4648 static RAMBlockNotifier ram_mig_ram_notifier = {
4649     .ram_block_resized = ram_mig_ram_block_resized,
4650 };
4651 
4652 void ram_mig_init(void)
4653 {
4654     qemu_mutex_init(&XBZRLE.lock);
4655     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4656     ram_block_notifier_add(&ram_mig_ram_notifier);
4657 }
4658