xref: /openbmc/qemu/migration/ram.c (revision 623d7e3551a6fc5693c06ea938c60fe281b52e27)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram-compress.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration-stats.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-types-migration.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/cpu-throttle.h"
57 #include "savevm.h"
58 #include "qemu/iov.h"
59 #include "multifd.h"
60 #include "sysemu/runstate.h"
61 #include "options.h"
62 
63 #include "hw/boards.h" /* for machine_dump_guest_core() */
64 
65 #if defined(__linux__)
66 #include "qemu/userfaultfd.h"
67 #endif /* defined(__linux__) */
68 
69 /***********************************************************/
70 /* ram save/restore */
71 
/*
 * Flags that are OR-ed into the low bits of the 64-bit page offset
 * written at the start of each page record (see save_page_header()).
 *
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, so it can be reused now.
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/* We can't use any flag that is bigger than 0x200 */
92 
/*
 * XBZRLE statistics.  save_xbzrle_page() updates the cache_miss, pages,
 * overflow and bytes members.
 */
XBZRLECacheStats xbzrle_counters;
94 
/*
 * Cursor used by the search for pages to send.  RAMState keeps one of
 * these per channel (RAMState.pss[]).
 */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile    *pss_channel;
    /*
     * Last block from where we have sent data.  save_page_header() compares
     * against it to decide whether the block id string must be written.
     */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from (a page index, not a ram_addr_t) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
    /* Whether we're sending a host page */
    bool          host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;
114 
/*
 * XBZRLE working state: the page cache, the scratch buffers used for
 * encoding/decoding, and a static page of zeros.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock() / released via XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
130 
131 static void XBZRLE_cache_lock(void)
132 {
133     if (migrate_xbzrle()) {
134         qemu_mutex_lock(&XBZRLE.lock);
135     }
136 }
137 
138 static void XBZRLE_cache_unlock(void)
139 {
140     if (migrate_xbzrle()) {
141         qemu_mutex_unlock(&XBZRLE.lock);
142     }
143 }
144 
145 /**
146  * xbzrle_cache_resize: resize the xbzrle cache
147  *
148  * This function is called from migrate_params_apply in main
149  * thread, possibly while a migration is in progress.  A running
150  * migration may be using the cache and might finish during this call,
151  * hence changes to the cache are protected by XBZRLE.lock().
152  *
153  * Returns 0 for success or -1 for error
154  *
155  * @new_size: new cache size
156  * @errp: set *errp if the check failed, with reason
157  */
158 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
159 {
160     PageCache *new_cache;
161     int64_t ret = 0;
162 
163     /* Check for truncation */
164     if (new_size != (size_t)new_size) {
165         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
166                    "exceeding address space");
167         return -1;
168     }
169 
170     if (new_size == migrate_xbzrle_cache_size()) {
171         /* nothing to do */
172         return 0;
173     }
174 
175     XBZRLE_cache_lock();
176 
177     if (XBZRLE.cache != NULL) {
178         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
179         if (!new_cache) {
180             ret = -1;
181             goto out;
182         }
183 
184         cache_fini(XBZRLE.cache);
185         XBZRLE.cache = new_cache;
186     }
187 out:
188     XBZRLE_cache_unlock();
189     return ret;
190 }
191 
/* True when postcopy-preempt is enabled and migration is in postcopy. */
static bool postcopy_preempt_active(void)
{
    if (!migrate_postcopy_preempt()) {
        return false;
    }
    return migration_in_postcopy();
}
196 
197 bool ramblock_is_ignored(RAMBlock *block)
198 {
199     return !qemu_ram_is_migratable(block) ||
200            (migrate_ignore_shared() && qemu_ram_is_shared(block)
201                                     && qemu_ram_is_named_file(block));
202 }
203 
/*
 * NOTE(review): removes the unfiltered RAMBLOCK_FOREACH iterator for the
 * rest of this file; presumably so code below has to pick a filtered
 * variant such as RAMBLOCK_FOREACH_NOT_IGNORED -- confirm.
 */
#undef RAMBLOCK_FOREACH
205 
206 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
207 {
208     RAMBlock *block;
209     int ret = 0;
210 
211     RCU_READ_LOCK_GUARD();
212 
213     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
214         ret = func(block, opaque);
215         if (ret) {
216             break;
217         }
218     }
219     return ret;
220 }
221 
222 static void ramblock_recv_map_init(void)
223 {
224     RAMBlock *rb;
225 
226     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
227         assert(!rb->receivedmap);
228         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
229     }
230 }
231 
232 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
233 {
234     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
235                     rb->receivedmap);
236 }
237 
238 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
239 {
240     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
241 }
242 
/* Atomically set the receivedmap bit of @rb that corresponds to @host_addr. */
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}
247 
/*
 * Atomically set @nr consecutive receivedmap bits of @rb, starting at the
 * bit that corresponds to @host_addr.
 */
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}
255 
256 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
257 
258 /*
259  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
260  *
261  * Returns >0 if success with sent bytes, or <0 if error.
262  */
263 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
264                                   const char *block_name)
265 {
266     RAMBlock *block = qemu_ram_block_by_name(block_name);
267     unsigned long *le_bitmap, nbits;
268     uint64_t size;
269 
270     if (!block) {
271         error_report("%s: invalid block name: %s", __func__, block_name);
272         return -1;
273     }
274 
275     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
276 
277     /*
278      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
279      * machines we may need 4 more bytes for padding (see below
280      * comment). So extend it a bit before hand.
281      */
282     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
283 
284     /*
285      * Always use little endian when sending the bitmap. This is
286      * required that when source and destination VMs are not using the
287      * same endianness. (Note: big endian won't work.)
288      */
289     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
290 
291     /* Size of the bitmap, in bytes */
292     size = DIV_ROUND_UP(nbits, 8);
293 
294     /*
295      * size is always aligned to 8 bytes for 64bit machines, but it
296      * may not be true for 32bit machines. We need this padding to
297      * make sure the migration can survive even between 32bit and
298      * 64bit machines.
299      */
300     size = ROUND_UP(size, 8);
301 
302     qemu_put_be64(file, size);
303     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
304     /*
305      * Mark as an end, in case the middle part is screwed up due to
306      * some "mysterious" reason.
307      */
308     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
309     qemu_fflush(file);
310 
311     g_free(le_bitmap);
312 
313     if (qemu_file_get_error(file)) {
314         return qemu_file_get_error(file);
315     }
316 
317     return size + sizeof(size);
318 }
319 
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* NOTE(review): presumably the block and the byte range requested
     * within it -- confirm against the code that queues requests */
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    /* Linkage for RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
331 
/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures, one per channel, used when sending
     * pages.  Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times the dirty rate has been found too high */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Are we really using XBZRLE (e.g., after the first round). */
    bool xbzrle_started;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
396 
/*
 * The RAM migration state.  May be NULL: ram_bytes_remaining() checks for
 * that before dereferencing it.
 */
static RAMState *ram_state;

/*
 * Notifiers run by precopy_notify(); the list is initialised by
 * precopy_infrastructure_init().
 */
static NotifierWithReturnList precopy_notifier_list;
400 
401 /* Whether postcopy has queued requests? */
402 static bool postcopy_has_request(RAMState *rs)
403 {
404     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
405 }
406 
/* Initialise the (initially empty) list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
411 
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
416 
/* Unregister @n; the notifier is removed from whichever list holds it. */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
421 
422 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
423 {
424     PrecopyNotifyData pnd;
425     pnd.reason = reason;
426     pnd.errp = errp;
427 
428     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
429 }
430 
431 uint64_t ram_bytes_remaining(void)
432 {
433     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
434                        0;
435 }
436 
437 void ram_transferred_add(uint64_t bytes)
438 {
439     if (runstate_is_running()) {
440         stat64_add(&mig_stats.precopy_bytes, bytes);
441     } else if (migration_in_postcopy()) {
442         stat64_add(&mig_stats.postcopy_bytes, bytes);
443     } else {
444         stat64_add(&mig_stats.downtime_bytes, bytes);
445     }
446     stat64_add(&mig_stats.transferred, bytes);
447 }
448 
/* Table of hooks used by the RAM migration code */
struct MigrationOps {
    /*
     * Hook receiving the RAM state and a page search status.
     * NOTE(review): presumably saves the target page @pss points at --
     * confirm at the call site.
     */
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

/* NOTE(review): where this is assigned is not visible in this part of the file */
MigrationOps *migration_ops;
455 
/* Forward declaration; the definition is not in this part of the file. */
static int ram_save_host_page_urgent(PageSearchStatus *pss);
457 
458 /* NOTE: page is the PFN not real ram_addr_t. */
459 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
460 {
461     pss->block = rb;
462     pss->page = page;
463     pss->complete_round = false;
464 }
465 
466 /*
467  * Check whether two PSSs are actively sending the same page.  Return true
468  * if it is, false otherwise.
469  */
470 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
471 {
472     return pss1->host_page_sending && pss2->host_page_sending &&
473         (pss1->host_page_start == pss2->host_page_start);
474 }
475 
476 /**
477  * save_page_header: write page header to wire
478  *
479  * If this is the 1st block, it also writes the block identification
480  *
481  * Returns the number of bytes written
482  *
483  * @pss: current PSS channel status
484  * @block: block that contains the page we want to send
485  * @offset: offset inside the block for the page
486  *          in the lower bits, it contains flags
487  */
488 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
489                                RAMBlock *block, ram_addr_t offset)
490 {
491     size_t size, len;
492     bool same_block = (block == pss->last_sent_block);
493 
494     if (same_block) {
495         offset |= RAM_SAVE_FLAG_CONTINUE;
496     }
497     qemu_put_be64(f, offset);
498     size = 8;
499 
500     if (!same_block) {
501         len = strlen(block->idstr);
502         qemu_put_byte(f, len);
503         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
504         size += 1 + len;
505         pss->last_sent_block = block;
506     }
507     return size;
508 }
509 
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce the amount of guest cpu execution to hopefully slow down memory
 * writes.  Once the guest dirties memory more slowly than we can transfer
 * pages to the destination, migration should be able to complete.  Some
 * workloads dirty memory way too fast and will not effectively converge,
 * even with auto-converge.
 *
 * The first call starts throttling at the initial percentage; later calls
 * raise it, never beyond the configured maximum.
 *
 * @bytes_dirty_period: bytes dirtied during the last period
 * @bytes_dirty_threshold: dirty-bytes threshold to compare against
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();
    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    if (!cpu_throttle_active()) {
        /* Not throttling yet: start at the initial percentage */
        cpu_throttle_set(pct_initial);
        return;
    }

    /* Already throttling: work out by how much to raise it */
    if (pct_tailslow) {
        /*
         * Estimate the guest CPU share at which the dirty rate would meet
         * the threshold, and step no further than that.
         */
        cpu_now = 100 - throttle_now;
        cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                    bytes_dirty_period);
        throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
    } else {
        throttle_inc = pct_increment;
    }

    cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
}
548 
549 void mig_throttle_counter_reset(void)
550 {
551     RAMState *rs = ram_state;
552 
553     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
554     rs->num_dirty_pages_period = 0;
555     rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
556 }
557 
558 /**
559  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
560  *
561  * @rs: current RAM state
562  * @current_addr: address for the zero page
563  *
564  * Update the xbzrle cache to reflect a page that's been sent as all 0.
565  * The important thing is that a stale (not-yet-0'd) page be replaced
566  * by the new data.
567  * As a bonus, if the page wasn't in the cache it gets added so that
568  * when a small write is made into the 0'd page it gets XBZRLE sent.
569  */
570 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
571 {
572     /* We don't care if this fails to allocate a new cache page
573      * as long as it updated an old one */
574     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
575                  stat64_get(&mig_stats.dirty_sync_count));
576 }
577 
/* Encoding byte written right after the page header of an XBZRLE page */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that the page was not sent here: either it was not in
 *             the cache (cache miss), or xbzrle would be longer than normal
 *
 * On a cache miss outside the last stage, the page is inserted into the
 * cache and, on success, *@current_data is redirected to the cached copy.
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        /* Cache miss: nothing was sent by this function */
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        /* Encoder produced nothing: page is unchanged, nothing to send */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        /* Encoded form did not fit: count a full page and report -1 */
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    /* payload + 1 encoding byte + 2 length bytes */
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}
681 
682 /**
683  * pss_find_next_dirty: find the next dirty page of current ramblock
684  *
685  * This function updates pss->page to point to the next dirty page index
686  * within the ramblock to migrate, or the end of ramblock when nothing
687  * found.  Note that when pss->host_page_sending==true it means we're
688  * during sending a host page, so we won't look for dirty page that is
689  * outside the host page boundary.
690  *
691  * @pss: the current page search status
692  */
693 static void pss_find_next_dirty(PageSearchStatus *pss)
694 {
695     RAMBlock *rb = pss->block;
696     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
697     unsigned long *bitmap = rb->bmap;
698 
699     if (ramblock_is_ignored(rb)) {
700         /* Points directly to the end, so we know no dirty page */
701         pss->page = size;
702         return;
703     }
704 
705     /*
706      * If during sending a host page, only look for dirty pages within the
707      * current host page being send.
708      */
709     if (pss->host_page_sending) {
710         assert(pss->host_page_end);
711         size = MIN(size, pss->host_page_end);
712     }
713 
714     pss->page = find_next_bit(bitmap, size, pss->page);
715 }
716 
717 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
718                                                        unsigned long page)
719 {
720     uint8_t shift;
721     hwaddr size, start;
722 
723     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
724         return;
725     }
726 
727     shift = rb->clear_bmap_shift;
728     /*
729      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
730      * can make things easier sometimes since then start address
731      * of the small chunk will always be 64 pages aligned so the
732      * bitmap will always be aligned to unsigned long. We should
733      * even be able to remove this restriction but I'm simply
734      * keeping it.
735      */
736     assert(shift >= 6);
737 
738     size = 1ULL << (TARGET_PAGE_BITS + shift);
739     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
740     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
741     memory_region_clear_dirty_bitmap(rb->mr, start, size);
742 }
743 
744 static void
745 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
746                                                  unsigned long start,
747                                                  unsigned long npages)
748 {
749     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
750     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
751     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
752 
753     /*
754      * Clear pages from start to start + npages - 1, so the end boundary is
755      * exclusive.
756      */
757     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
758         migration_clear_memory_region_dirty_bitmap(rb, i);
759     }
760 }
761 
762 /*
763  * colo_bitmap_find_diry:find contiguous dirty pages from start
764  *
765  * Returns the page offset within memory region of the start of the contiguout
766  * dirty page
767  *
768  * @rs: current RAM state
769  * @rb: RAMBlock where to search for dirty pages
770  * @start: page where we start the search
771  * @num: the number of contiguous dirty pages
772  */
773 static inline
774 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
775                                      unsigned long start, unsigned long *num)
776 {
777     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
778     unsigned long *bitmap = rb->bmap;
779     unsigned long first, next;
780 
781     *num = 0;
782 
783     if (ramblock_is_ignored(rb)) {
784         return size;
785     }
786 
787     first = find_next_bit(bitmap, size, start);
788     if (first >= size) {
789         return first;
790     }
791     next = find_next_zero_bit(bitmap, size, first + 1);
792     assert(next >= first);
793     *num = next - first;
794     return first;
795 }
796 
797 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
798                                                 RAMBlock *rb,
799                                                 unsigned long page)
800 {
801     bool ret;
802 
803     /*
804      * Clear dirty bitmap if needed.  This _must_ be called before we
805      * send any of the page in the chunk because we need to make sure
806      * we can capture further page content changes when we sync dirty
807      * log the next time.  So as long as we are going to send any of
808      * the page in the chunk we clear the remote dirty bitmap for all.
809      * Clearing it earlier won't be a problem, but too late will.
810      */
811     migration_clear_memory_region_dirty_bitmap(rb, page);
812 
813     ret = test_and_clear_bit(page, rb->bmap);
814     if (ret) {
815         rs->migration_dirty_pages--;
816     }
817 
818     return ret;
819 }
820 
821 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
822                                        void *opaque)
823 {
824     const hwaddr offset = section->offset_within_region;
825     const hwaddr size = int128_get64(section->size);
826     const unsigned long start = offset >> TARGET_PAGE_BITS;
827     const unsigned long npages = size >> TARGET_PAGE_BITS;
828     RAMBlock *rb = section->mr->ram_block;
829     uint64_t *cleared_bits = opaque;
830 
831     /*
832      * We don't grab ram_state->bitmap_mutex because we expect to run
833      * only when starting migration or during postcopy recovery where
834      * we don't have concurrent access.
835      */
836     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
837         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
838     }
839     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
840     bitmap_clear(rb->bmap, start, npages);
841 }
842 
843 /*
844  * Exclude all dirty pages from migration that fall into a discarded range as
845  * managed by a RamDiscardManager responsible for the mapped memory region of
846  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
847  *
848  * Discarded pages ("logically unplugged") have undefined content and must
849  * not get migrated, because even reading these pages for migration might
850  * result in undesired behavior.
851  *
852  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
853  *
854  * Note: The result is only stable while migrating (precopy/postcopy).
855  */
856 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
857 {
858     uint64_t cleared_bits = 0;
859 
860     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
861         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
862         MemoryRegionSection section = {
863             .mr = rb->mr,
864             .offset_within_region = 0,
865             .size = int128_make64(qemu_ram_get_used_length(rb)),
866         };
867 
868         ram_discard_manager_replay_discarded(rdm, &section,
869                                              dirty_bitmap_clear_section,
870                                              &cleared_bits);
871     }
872     return cleared_bits;
873 }
874 
875 /*
876  * Check if a host-page aligned page falls into a discarded range as managed by
877  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
878  *
879  * Note: The result is only stable while migrating (precopy/postcopy).
880  */
881 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
882 {
883     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
884         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
885         MemoryRegionSection section = {
886             .mr = rb->mr,
887             .offset_within_region = start,
888             .size = int128_make64(qemu_ram_pagesize(rb)),
889         };
890 
891         return !ram_discard_manager_is_populated(rdm, &section);
892     }
893     return false;
894 }
895 
896 /* Called with RCU critical section */
897 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
898 {
899     uint64_t new_dirty_pages =
900         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
901 
902     rs->migration_dirty_pages += new_dirty_pages;
903     rs->num_dirty_pages_period += new_dirty_pages;
904 }
905 
906 /**
907  * ram_pagesize_summary: calculate all the pagesizes of a VM
908  *
909  * Returns a summary bitmap of the page sizes of all RAMBlocks
910  *
911  * For VMs with just normal pages this is equivalent to the host page
912  * size. If it's got some huge pages then it's the OR of all the
913  * different page sizes.
914  */
915 uint64_t ram_pagesize_summary(void)
916 {
917     RAMBlock *block;
918     uint64_t summary = 0;
919 
920     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
921         summary |= block->page_size;
922     }
923 
924     return summary;
925 }
926 
927 uint64_t ram_get_total_transferred_pages(void)
928 {
929     return stat64_get(&mig_stats.normal_pages) +
930         stat64_get(&mig_stats.zero_pages) +
931         compression_counters.pages + xbzrle_counters.pages;
932 }
933 
934 static void migration_update_rates(RAMState *rs, int64_t end_time)
935 {
936     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
937     double compressed_size;
938 
939     /* calculate period counters */
940     stat64_set(&mig_stats.dirty_pages_rate,
941                rs->num_dirty_pages_period * 1000 /
942                (end_time - rs->time_last_bitmap_sync));
943 
944     if (!page_count) {
945         return;
946     }
947 
948     if (migrate_xbzrle()) {
949         double encoded_size, unencoded_size;
950 
951         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
952             rs->xbzrle_cache_miss_prev) / page_count;
953         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
954         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
955                          TARGET_PAGE_SIZE;
956         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
957         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
958             xbzrle_counters.encoding_rate = 0;
959         } else {
960             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
961         }
962         rs->xbzrle_pages_prev = xbzrle_counters.pages;
963         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
964     }
965 
966     if (migrate_compress()) {
967         compression_counters.busy_rate = (double)(compression_counters.busy -
968             rs->compress_thread_busy_prev) / page_count;
969         rs->compress_thread_busy_prev = compression_counters.busy;
970 
971         compressed_size = compression_counters.compressed_size -
972                           rs->compressed_size_prev;
973         if (compressed_size) {
974             double uncompressed_size = (compression_counters.pages -
975                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
976 
977             /* Compression-Ratio = Uncompressed-size / Compressed-size */
978             compression_counters.compression_rate =
979                                         uncompressed_size / compressed_size;
980 
981             rs->compress_pages_prev = compression_counters.pages;
982             rs->compressed_size_prev = compression_counters.compressed_size;
983         }
984     }
985 }
986 
987 static void migration_trigger_throttle(RAMState *rs)
988 {
989     uint64_t threshold = migrate_throttle_trigger_threshold();
990     uint64_t bytes_xfer_period =
991         stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
992     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
993     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
994 
995     /* During block migration the auto-converge logic incorrectly detects
996      * that ram migration makes no progress. Avoid this by disabling the
997      * throttling logic during the bulk phase of block migration. */
998     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
999         /* The following detection logic can be refined later. For now:
1000            Check to see if the ratio between dirtied bytes and the approx.
1001            amount of bytes that just got transferred since the last time
1002            we were in this routine reaches the threshold. If that happens
1003            twice, start or increase throttling. */
1004 
1005         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1006             (++rs->dirty_rate_high_cnt >= 2)) {
1007             trace_migration_throttle();
1008             rs->dirty_rate_high_cnt = 0;
1009             mig_throttle_guest_down(bytes_dirty_period,
1010                                     bytes_dirty_threshold);
1011         }
1012     }
1013 }
1014 
1015 static void migration_bitmap_sync(RAMState *rs, bool last_stage)
1016 {
1017     RAMBlock *block;
1018     int64_t end_time;
1019 
1020     stat64_add(&mig_stats.dirty_sync_count, 1);
1021 
1022     if (!rs->time_last_bitmap_sync) {
1023         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1024     }
1025 
1026     trace_migration_bitmap_sync_start();
1027     memory_global_dirty_log_sync(last_stage);
1028 
1029     qemu_mutex_lock(&rs->bitmap_mutex);
1030     WITH_RCU_READ_LOCK_GUARD() {
1031         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1032             ramblock_sync_dirty_bitmap(rs, block);
1033         }
1034         stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
1035     }
1036     qemu_mutex_unlock(&rs->bitmap_mutex);
1037 
1038     memory_global_after_dirty_log_sync();
1039     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1040 
1041     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1042 
1043     /* more than 1 second = 1000 millisecons */
1044     if (end_time > rs->time_last_bitmap_sync + 1000) {
1045         migration_trigger_throttle(rs);
1046 
1047         migration_update_rates(rs, end_time);
1048 
1049         rs->target_page_count_prev = rs->target_page_count;
1050 
1051         /* reset period counters */
1052         rs->time_last_bitmap_sync = end_time;
1053         rs->num_dirty_pages_period = 0;
1054         rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
1055     }
1056     if (migrate_events()) {
1057         uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
1058         qapi_event_send_migration_pass(generation);
1059     }
1060 }
1061 
1062 static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
1063 {
1064     Error *local_err = NULL;
1065 
1066     /*
1067      * The current notifier usage is just an optimization to migration, so we
1068      * don't stop the normal migration process in the error case.
1069      */
1070     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1071         error_report_err(local_err);
1072         local_err = NULL;
1073     }
1074 
1075     migration_bitmap_sync(rs, last_stage);
1076 
1077     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1078         error_report_err(local_err);
1079     }
1080 }
1081 
1082 void ram_release_page(const char *rbname, uint64_t offset)
1083 {
1084     if (!migrate_release_ram() || !migration_in_postcopy()) {
1085         return;
1086     }
1087 
1088     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1089 }
1090 
1091 /**
1092  * save_zero_page_to_file: send the zero page to the file
1093  *
1094  * Returns the size of data written to the file, 0 means the page is not
1095  * a zero page
1096  *
1097  * @pss: current PSS channel
1098  * @block: block that contains the page we want to send
1099  * @offset: offset inside the block for the page
1100  */
1101 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1102                                   RAMBlock *block, ram_addr_t offset)
1103 {
1104     uint8_t *p = block->host + offset;
1105     int len = 0;
1106 
1107     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1108         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1109         qemu_put_byte(file, 0);
1110         len += 1;
1111         ram_release_page(block->idstr, offset);
1112     }
1113     return len;
1114 }
1115 
1116 /**
1117  * save_zero_page: send the zero page to the stream
1118  *
1119  * Returns the number of pages written.
1120  *
1121  * @pss: current PSS channel
1122  * @block: block that contains the page we want to send
1123  * @offset: offset inside the block for the page
1124  */
1125 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1126                           ram_addr_t offset)
1127 {
1128     int len = save_zero_page_to_file(pss, f, block, offset);
1129 
1130     if (len) {
1131         stat64_add(&mig_stats.zero_pages, 1);
1132         ram_transferred_add(len);
1133         return 1;
1134     }
1135     return -1;
1136 }
1137 
1138 /*
1139  * @pages: the number of pages written by the control path,
1140  *        < 0 - error
1141  *        > 0 - number of pages written
1142  *
1143  * Return true if the pages has been saved, otherwise false is returned.
1144  */
1145 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1146                               ram_addr_t offset, int *pages)
1147 {
1148     uint64_t bytes_xmit = 0;
1149     int ret;
1150 
1151     *pages = -1;
1152     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1153                                 TARGET_PAGE_SIZE, &bytes_xmit);
1154     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1155         return false;
1156     }
1157 
1158     if (bytes_xmit) {
1159         ram_transferred_add(bytes_xmit);
1160         *pages = 1;
1161     }
1162 
1163     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1164         return true;
1165     }
1166 
1167     if (bytes_xmit > 0) {
1168         stat64_add(&mig_stats.normal_pages, 1);
1169     } else if (bytes_xmit == 0) {
1170         stat64_add(&mig_stats.zero_pages, 1);
1171     }
1172 
1173     return true;
1174 }
1175 
1176 /*
1177  * directly send the page to the stream
1178  *
1179  * Returns the number of pages written.
1180  *
1181  * @pss: current PSS channel
1182  * @block: block that contains the page we want to send
1183  * @offset: offset inside the block for the page
1184  * @buf: the page to be sent
1185  * @async: send to page asyncly
1186  */
1187 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1188                             ram_addr_t offset, uint8_t *buf, bool async)
1189 {
1190     QEMUFile *file = pss->pss_channel;
1191 
1192     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1193                                          offset | RAM_SAVE_FLAG_PAGE));
1194     if (async) {
1195         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1196                               migrate_release_ram() &&
1197                               migration_in_postcopy());
1198     } else {
1199         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1200     }
1201     ram_transferred_add(TARGET_PAGE_SIZE);
1202     stat64_add(&mig_stats.normal_pages, 1);
1203     return 1;
1204 }
1205 
1206 /**
1207  * ram_save_page: send the given page to the stream
1208  *
1209  * Returns the number of pages written.
1210  *          < 0 - error
1211  *          >=0 - Number of pages written - this might legally be 0
1212  *                if xbzrle noticed the page was the same.
1213  *
1214  * @rs: current RAM state
1215  * @block: block that contains the page we want to send
1216  * @offset: offset inside the block for the page
1217  */
1218 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1219 {
1220     int pages = -1;
1221     uint8_t *p;
1222     bool send_async = true;
1223     RAMBlock *block = pss->block;
1224     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1225     ram_addr_t current_addr = block->offset + offset;
1226 
1227     p = block->host + offset;
1228     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1229 
1230     XBZRLE_cache_lock();
1231     if (rs->xbzrle_started && !migration_in_postcopy()) {
1232         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1233                                  block, offset);
1234         if (!rs->last_stage) {
1235             /* Can't send this cached data async, since the cache page
1236              * might get updated before it gets to the wire
1237              */
1238             send_async = false;
1239         }
1240     }
1241 
1242     /* XBZRLE overflow or normal page */
1243     if (pages == -1) {
1244         pages = save_normal_page(pss, block, offset, p, send_async);
1245     }
1246 
1247     XBZRLE_cache_unlock();
1248 
1249     return pages;
1250 }
1251 
1252 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1253                                  ram_addr_t offset)
1254 {
1255     if (multifd_queue_page(file, block, offset) < 0) {
1256         return -1;
1257     }
1258     stat64_add(&mig_stats.normal_pages, 1);
1259 
1260     return 1;
1261 }
1262 
1263 static void
1264 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1265 {
1266     ram_transferred_add(bytes_xmit);
1267 
1268     if (param->result == RES_ZEROPAGE) {
1269         stat64_add(&mig_stats.zero_pages, 1);
1270         return;
1271     }
1272 
1273     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1274     compression_counters.compressed_size += bytes_xmit - 8;
1275     compression_counters.pages++;
1276 }
1277 
1278 static bool save_page_use_compression(RAMState *rs);
1279 
1280 static int send_queued_data(CompressParam *param)
1281 {
1282     PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1283     MigrationState *ms = migrate_get_current();
1284     QEMUFile *file = ms->to_dst_file;
1285     int len = 0;
1286 
1287     RAMBlock *block = param->block;
1288     ram_addr_t offset = param->offset;
1289 
1290     if (param->result == RES_NONE) {
1291         return 0;
1292     }
1293 
1294     assert(block == pss->last_sent_block);
1295 
1296     if (param->result == RES_ZEROPAGE) {
1297         assert(qemu_file_buffer_empty(param->file));
1298         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1299         qemu_put_byte(file, 0);
1300         len += 1;
1301         ram_release_page(block->idstr, offset);
1302     } else if (param->result == RES_COMPRESS) {
1303         assert(!qemu_file_buffer_empty(param->file));
1304         len += save_page_header(pss, file, block,
1305                                 offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1306         len += qemu_put_qemu_file(file, param->file);
1307     } else {
1308         abort();
1309     }
1310 
1311     update_compress_thread_counts(param, len);
1312 
1313     return len;
1314 }
1315 
1316 static void ram_flush_compressed_data(RAMState *rs)
1317 {
1318     if (!save_page_use_compression(rs)) {
1319         return;
1320     }
1321 
1322     flush_compressed_data(send_queued_data);
1323 }
1324 
1325 #define PAGE_ALL_CLEAN 0
1326 #define PAGE_TRY_AGAIN 1
1327 #define PAGE_DIRTY_FOUND 2
1328 /**
1329  * find_dirty_block: find the next dirty page and update any state
1330  * associated with the search process.
1331  *
1332  * Returns:
1333  *         <0: An error happened
1334  *         PAGE_ALL_CLEAN: no dirty page found, give up
1335  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1336  *         PAGE_DIRTY_FOUND: dirty page found
1337  *
1338  * @rs: current RAM state
1339  * @pss: data about the state of the current dirty page scan
1340  * @again: set to false if the search has scanned the whole of RAM
1341  */
1342 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1343 {
1344     /* Update pss->page for the next dirty bit in ramblock */
1345     pss_find_next_dirty(pss);
1346 
1347     if (pss->complete_round && pss->block == rs->last_seen_block &&
1348         pss->page >= rs->last_page) {
1349         /*
1350          * We've been once around the RAM and haven't found anything.
1351          * Give up.
1352          */
1353         return PAGE_ALL_CLEAN;
1354     }
1355     if (!offset_in_ramblock(pss->block,
1356                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1357         /* Didn't find anything in this RAM Block */
1358         pss->page = 0;
1359         pss->block = QLIST_NEXT_RCU(pss->block, next);
1360         if (!pss->block) {
1361             if (!migrate_multifd_flush_after_each_section()) {
1362                 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1363                 int ret = multifd_send_sync_main(f);
1364                 if (ret < 0) {
1365                     return ret;
1366                 }
1367                 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1368                 qemu_fflush(f);
1369             }
1370             /*
1371              * If memory migration starts over, we will meet a dirtied page
1372              * which may still exists in compression threads's ring, so we
1373              * should flush the compressed data to make sure the new page
1374              * is not overwritten by the old one in the destination.
1375              *
1376              * Also If xbzrle is on, stop using the data compression at this
1377              * point. In theory, xbzrle can do better than compression.
1378              */
1379             ram_flush_compressed_data(rs);
1380 
1381             /* Hit the end of the list */
1382             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1383             /* Flag that we've looped */
1384             pss->complete_round = true;
1385             /* After the first round, enable XBZRLE. */
1386             if (migrate_xbzrle()) {
1387                 rs->xbzrle_started = true;
1388             }
1389         }
1390         /* Didn't find anything this time, but try again on the new block */
1391         return PAGE_TRY_AGAIN;
1392     } else {
1393         /* We've found something */
1394         return PAGE_DIRTY_FOUND;
1395     }
1396 }
1397 
1398 /**
1399  * unqueue_page: gets a page of the queue
1400  *
1401  * Helper for 'get_queued_page' - gets a page off the queue
1402  *
1403  * Returns the block of the page (or NULL if none available)
1404  *
1405  * @rs: current RAM state
1406  * @offset: used to return the offset within the RAMBlock
1407  */
1408 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1409 {
1410     struct RAMSrcPageRequest *entry;
1411     RAMBlock *block = NULL;
1412 
1413     if (!postcopy_has_request(rs)) {
1414         return NULL;
1415     }
1416 
1417     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1418 
1419     /*
1420      * This should _never_ change even after we take the lock, because no one
1421      * should be taking anything off the request list other than us.
1422      */
1423     assert(postcopy_has_request(rs));
1424 
1425     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1426     block = entry->rb;
1427     *offset = entry->offset;
1428 
1429     if (entry->len > TARGET_PAGE_SIZE) {
1430         entry->len -= TARGET_PAGE_SIZE;
1431         entry->offset += TARGET_PAGE_SIZE;
1432     } else {
1433         memory_region_unref(block->mr);
1434         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435         g_free(entry);
1436         migration_consume_urgent_request();
1437     }
1438 
1439     return block;
1440 }
1441 
1442 #if defined(__linux__)
1443 /**
1444  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1445  *   is found, return RAM block pointer and page offset
1446  *
1447  * Returns pointer to the RAMBlock containing faulting page,
1448  *   NULL if no write faults are pending
1449  *
1450  * @rs: current RAM state
1451  * @offset: page offset from the beginning of the block
1452  */
1453 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1454 {
1455     struct uffd_msg uffd_msg;
1456     void *page_address;
1457     RAMBlock *block;
1458     int res;
1459 
1460     if (!migrate_background_snapshot()) {
1461         return NULL;
1462     }
1463 
1464     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1465     if (res <= 0) {
1466         return NULL;
1467     }
1468 
1469     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1470     block = qemu_ram_block_from_host(page_address, false, offset);
1471     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1472     return block;
1473 }
1474 
1475 /**
1476  * ram_save_release_protection: release UFFD write protection after
1477  *   a range of pages has been saved
1478  *
1479  * @rs: current RAM state
1480  * @pss: page-search-status structure
1481  * @start_page: index of the first page in the range relative to pss->block
1482  *
1483  * Returns 0 on success, negative value in case of an error
1484 */
1485 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1486         unsigned long start_page)
1487 {
1488     int res = 0;
1489 
1490     /* Check if page is from UFFD-managed region. */
1491     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1492         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1493         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1494 
1495         /* Flush async buffers before un-protect. */
1496         qemu_fflush(pss->pss_channel);
1497         /* Un-protect memory range. */
1498         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1499                 false, false);
1500     }
1501 
1502     return res;
1503 }
1504 
1505 /* ram_write_tracking_available: check if kernel supports required UFFD features
1506  *
1507  * Returns true if supports, false otherwise
1508  */
1509 bool ram_write_tracking_available(void)
1510 {
1511     uint64_t uffd_features;
1512     int res;
1513 
1514     res = uffd_query_features(&uffd_features);
1515     return (res == 0 &&
1516             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1517 }
1518 
1519 /* ram_write_tracking_compatible: check if guest configuration is
1520  *   compatible with 'write-tracking'
1521  *
1522  * Returns true if compatible, false otherwise
1523  */
1524 bool ram_write_tracking_compatible(void)
1525 {
1526     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1527     int uffd_fd;
1528     RAMBlock *block;
1529     bool ret = false;
1530 
1531     /* Open UFFD file descriptor */
1532     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1533     if (uffd_fd < 0) {
1534         return false;
1535     }
1536 
1537     RCU_READ_LOCK_GUARD();
1538 
1539     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1540         uint64_t uffd_ioctls;
1541 
1542         /* Nothing to do with read-only and MMIO-writable regions */
1543         if (block->mr->readonly || block->mr->rom_device) {
1544             continue;
1545         }
1546         /* Try to register block memory via UFFD-IO to track writes */
1547         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1548                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1549             goto out;
1550         }
1551         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1552             goto out;
1553         }
1554     }
1555     ret = true;
1556 
1557 out:
1558     uffd_close_fd(uffd_fd);
1559     return ret;
1560 }
1561 
1562 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1563                                        ram_addr_t size)
1564 {
1565     const ram_addr_t end = offset + size;
1566 
1567     /*
1568      * We read one byte of each page; this will preallocate page tables if
1569      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1570      * where no page was populated yet. This might require adaption when
1571      * supporting other mappings, like shmem.
1572      */
1573     for (; offset < end; offset += block->page_size) {
1574         char tmp = *((char *)block->host + offset);
1575 
1576         /* Don't optimize the read out */
1577         asm volatile("" : "+r" (tmp));
1578     }
1579 }
1580 
1581 static inline int populate_read_section(MemoryRegionSection *section,
1582                                         void *opaque)
1583 {
1584     const hwaddr size = int128_get64(section->size);
1585     hwaddr offset = section->offset_within_region;
1586     RAMBlock *block = section->mr->ram_block;
1587 
1588     populate_read_range(block, offset, size);
1589     return 0;
1590 }
1591 
1592 /*
1593  * ram_block_populate_read: preallocate page tables and populate pages in the
1594  *   RAM block by reading a byte of each page.
1595  *
1596  * Since it's solely used for userfault_fd WP feature, here we just
1597  *   hardcode page size to qemu_real_host_page_size.
1598  *
1599  * @block: RAM block to populate
1600  */
1601 static void ram_block_populate_read(RAMBlock *rb)
1602 {
1603     /*
1604      * Skip populating all pages that fall into a discarded range as managed by
1605      * a RamDiscardManager responsible for the mapped memory region of the
1606      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1607      * must not get populated automatically. We don't have to track
1608      * modifications via userfaultfd WP reliably, because these pages will
1609      * not be part of the migration stream either way -- see
1610      * ramblock_dirty_bitmap_exclude_discarded_pages().
1611      *
1612      * Note: The result is only stable while migrating (precopy/postcopy).
1613      */
1614     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1615         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1616         MemoryRegionSection section = {
1617             .mr = rb->mr,
1618             .offset_within_region = 0,
1619             .size = rb->mr->size,
1620         };
1621 
1622         ram_discard_manager_replay_populated(rdm, &section,
1623                                              populate_read_section, NULL);
1624     } else {
1625         populate_read_range(rb, 0, rb->used_length);
1626     }
1627 }
1628 
1629 /*
1630  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1631  */
1632 void ram_write_tracking_prepare(void)
1633 {
1634     RAMBlock *block;
1635 
1636     RCU_READ_LOCK_GUARD();
1637 
1638     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639         /* Nothing to do with read-only and MMIO-writable regions */
1640         if (block->mr->readonly || block->mr->rom_device) {
1641             continue;
1642         }
1643 
1644         /*
1645          * Populate pages of the RAM block before enabling userfault_fd
1646          * write protection.
1647          *
1648          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1649          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1650          * pages with pte_none() entries in page table.
1651          */
1652         ram_block_populate_read(block);
1653     }
1654 }
1655 
1656 static inline int uffd_protect_section(MemoryRegionSection *section,
1657                                        void *opaque)
1658 {
1659     const hwaddr size = int128_get64(section->size);
1660     const hwaddr offset = section->offset_within_region;
1661     RAMBlock *rb = section->mr->ram_block;
1662     int uffd_fd = (uintptr_t)opaque;
1663 
1664     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1665                                   false);
1666 }
1667 
1668 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1669 {
1670     assert(rb->flags & RAM_UF_WRITEPROTECT);
1671 
1672     /* See ram_block_populate_read() */
1673     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1674         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1675         MemoryRegionSection section = {
1676             .mr = rb->mr,
1677             .offset_within_region = 0,
1678             .size = rb->mr->size,
1679         };
1680 
1681         return ram_discard_manager_replay_populated(rdm, &section,
1682                                                     uffd_protect_section,
1683                                                     (void *)(uintptr_t)uffd_fd);
1684     }
1685     return uffd_change_protection(uffd_fd, rb->host,
1686                                   rb->used_length, true, false);
1687 }
1688 
1689 /*
1690  * ram_write_tracking_start: start UFFD-WP memory tracking
1691  *
1692  * Returns 0 for success or negative value in case of error
1693  */
1694 int ram_write_tracking_start(void)
1695 {
1696     int uffd_fd;
1697     RAMState *rs = ram_state;
1698     RAMBlock *block;
1699 
1700     /* Open UFFD file descriptor */
1701     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1702     if (uffd_fd < 0) {
1703         return uffd_fd;
1704     }
1705     rs->uffdio_fd = uffd_fd;
1706 
1707     RCU_READ_LOCK_GUARD();
1708 
1709     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1710         /* Nothing to do with read-only and MMIO-writable regions */
1711         if (block->mr->readonly || block->mr->rom_device) {
1712             continue;
1713         }
1714 
1715         /* Register block memory with UFFD to track writes */
1716         if (uffd_register_memory(rs->uffdio_fd, block->host,
1717                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1718             goto fail;
1719         }
1720         block->flags |= RAM_UF_WRITEPROTECT;
1721         memory_region_ref(block->mr);
1722 
1723         /* Apply UFFD write protection to the block memory range */
1724         if (ram_block_uffd_protect(block, uffd_fd)) {
1725             goto fail;
1726         }
1727 
1728         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1729                 block->host, block->max_length);
1730     }
1731 
1732     return 0;
1733 
1734 fail:
1735     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1736 
1737     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1738         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1739             continue;
1740         }
1741         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1742         /* Cleanup flags and remove reference */
1743         block->flags &= ~RAM_UF_WRITEPROTECT;
1744         memory_region_unref(block->mr);
1745     }
1746 
1747     uffd_close_fd(uffd_fd);
1748     rs->uffdio_fd = -1;
1749     return -1;
1750 }
1751 
1752 /**
1753  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1754  */
1755 void ram_write_tracking_stop(void)
1756 {
1757     RAMState *rs = ram_state;
1758     RAMBlock *block;
1759 
1760     RCU_READ_LOCK_GUARD();
1761 
1762     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1763         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1764             continue;
1765         }
1766         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1767 
1768         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1769                 block->host, block->max_length);
1770 
1771         /* Cleanup flags and remove reference */
1772         block->flags &= ~RAM_UF_WRITEPROTECT;
1773         memory_region_unref(block->mr);
1774     }
1775 
1776     /* Finally close UFFD file descriptor */
1777     uffd_close_fd(rs->uffdio_fd);
1778     rs->uffdio_fd = -1;
1779 }
1780 
1781 #else
1782 /* No target OS support, stubs just fail or ignore */
1783 
1784 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1785 {
1786     (void) rs;
1787     (void) offset;
1788 
1789     return NULL;
1790 }
1791 
1792 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1793         unsigned long start_page)
1794 {
1795     (void) rs;
1796     (void) pss;
1797     (void) start_page;
1798 
1799     return 0;
1800 }
1801 
/* Stub: write tracking is never available in this build configuration. */
bool ram_write_tracking_available(void)
{
    return false;
}
1806 
/*
 * Stub: must never be reached in this build configuration, hence the
 * unconditional assertion.  (Presumably callers consult
 * ram_write_tracking_available() first.)
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);

    /* Only reached if assertions are compiled out */
    return false;
}
1812 
/*
 * Stub: must never be reached in this build configuration, hence the
 * unconditional assertion.  Reports failure (-1) should it return at all.
 */
int ram_write_tracking_start(void)
{
    assert(0);

    /* Only reached if assertions are compiled out */
    return -1;
}
1818 
/*
 * Stub: must never be reached in this build configuration, hence the
 * unconditional assertion.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1823 #endif /* defined(__linux__) */
1824 
1825 /**
1826  * get_queued_page: unqueue a page from the postcopy requests
1827  *
1828  * Skips pages that are already sent (!dirty)
1829  *
1830  * Returns true if a queued page is found
1831  *
1832  * @rs: current RAM state
1833  * @pss: data about the state of the current dirty page scan
1834  */
1835 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1836 {
1837     RAMBlock  *block;
1838     ram_addr_t offset;
1839     bool dirty;
1840 
1841     do {
1842         block = unqueue_page(rs, &offset);
1843         /*
1844          * We're sending this page, and since it's postcopy nothing else
1845          * will dirty it, and we must make sure it doesn't get sent again
1846          * even if this queue request was received after the background
1847          * search already sent it.
1848          */
1849         if (block) {
1850             unsigned long page;
1851 
1852             page = offset >> TARGET_PAGE_BITS;
1853             dirty = test_bit(page, block->bmap);
1854             if (!dirty) {
1855                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1856                                                 page);
1857             } else {
1858                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1859             }
1860         }
1861 
1862     } while (block && !dirty);
1863 
1864     if (!block) {
1865         /*
1866          * Poll write faults too if background snapshot is enabled; that's
1867          * when we have vcpus got blocked by the write protected pages.
1868          */
1869         block = poll_fault_page(rs, &offset);
1870     }
1871 
1872     if (block) {
1873         /*
1874          * We want the background search to continue from the queued page
1875          * since the guest is likely to want other pages near to the page
1876          * it just requested.
1877          */
1878         pss->block = block;
1879         pss->page = offset >> TARGET_PAGE_BITS;
1880 
1881         /*
1882          * This unqueued page would break the "one round" check, even is
1883          * really rare.
1884          */
1885         pss->complete_round = false;
1886     }
1887 
1888     return !!block;
1889 }
1890 
1891 /**
1892  * migration_page_queue_free: drop any remaining pages in the ram
1893  * request queue
1894  *
1895  * It should be empty at the end anyway, but in error cases there may
1896  * be some left.  in case that there is any page left, we drop it.
1897  *
1898  */
1899 static void migration_page_queue_free(RAMState *rs)
1900 {
1901     struct RAMSrcPageRequest *mspr, *next_mspr;
1902     /* This queue generally should be empty - but in the case of a failed
1903      * migration might have some droppings in.
1904      */
1905     RCU_READ_LOCK_GUARD();
1906     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1907         memory_region_unref(mspr->rb->mr);
1908         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1909         g_free(mspr);
1910     }
1911 }
1912 
1913 /**
1914  * ram_save_queue_pages: queue the page for transmission
1915  *
1916  * A request from postcopy destination for example.
1917  *
1918  * Returns zero on success or negative on error
1919  *
1920  * @rbname: Name of the RAMBLock of the request. NULL means the
1921  *          same that last one.
1922  * @start: starting address from the start of the RAMBlock
1923  * @len: length (in bytes) to send
1924  */
1925 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1926 {
1927     RAMBlock *ramblock;
1928     RAMState *rs = ram_state;
1929 
1930     stat64_add(&mig_stats.postcopy_requests, 1);
1931     RCU_READ_LOCK_GUARD();
1932 
1933     if (!rbname) {
1934         /* Reuse last RAMBlock */
1935         ramblock = rs->last_req_rb;
1936 
1937         if (!ramblock) {
1938             /*
1939              * Shouldn't happen, we can't reuse the last RAMBlock if
1940              * it's the 1st request.
1941              */
1942             error_report("ram_save_queue_pages no previous block");
1943             return -1;
1944         }
1945     } else {
1946         ramblock = qemu_ram_block_by_name(rbname);
1947 
1948         if (!ramblock) {
1949             /* We shouldn't be asked for a non-existent RAMBlock */
1950             error_report("ram_save_queue_pages no block '%s'", rbname);
1951             return -1;
1952         }
1953         rs->last_req_rb = ramblock;
1954     }
1955     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1956     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1957         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1958                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1959                      __func__, start, len, ramblock->used_length);
1960         return -1;
1961     }
1962 
1963     /*
1964      * When with postcopy preempt, we send back the page directly in the
1965      * rp-return thread.
1966      */
1967     if (postcopy_preempt_active()) {
1968         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1969         size_t page_size = qemu_ram_pagesize(ramblock);
1970         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1971         int ret = 0;
1972 
1973         qemu_mutex_lock(&rs->bitmap_mutex);
1974 
1975         pss_init(pss, ramblock, page_start);
1976         /*
1977          * Always use the preempt channel, and make sure it's there.  It's
1978          * safe to access without lock, because when rp-thread is running
1979          * we should be the only one who operates on the qemufile
1980          */
1981         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
1982         assert(pss->pss_channel);
1983 
1984         /*
1985          * It must be either one or multiple of host page size.  Just
1986          * assert; if something wrong we're mostly split brain anyway.
1987          */
1988         assert(len % page_size == 0);
1989         while (len) {
1990             if (ram_save_host_page_urgent(pss)) {
1991                 error_report("%s: ram_save_host_page_urgent() failed: "
1992                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
1993                              __func__, ramblock->idstr, start);
1994                 ret = -1;
1995                 break;
1996             }
1997             /*
1998              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
1999              * will automatically be moved and point to the next host page
2000              * we're going to send, so no need to update here.
2001              *
2002              * Normally QEMU never sends >1 host page in requests, so
2003              * logically we don't even need that as the loop should only
2004              * run once, but just to be consistent.
2005              */
2006             len -= page_size;
2007         };
2008         qemu_mutex_unlock(&rs->bitmap_mutex);
2009 
2010         return ret;
2011     }
2012 
2013     struct RAMSrcPageRequest *new_entry =
2014         g_new0(struct RAMSrcPageRequest, 1);
2015     new_entry->rb = ramblock;
2016     new_entry->offset = start;
2017     new_entry->len = len;
2018 
2019     memory_region_ref(ramblock->mr);
2020     qemu_mutex_lock(&rs->src_page_req_mutex);
2021     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2022     migration_make_urgent_request();
2023     qemu_mutex_unlock(&rs->src_page_req_mutex);
2024 
2025     return 0;
2026 }
2027 
2028 static bool save_page_use_compression(RAMState *rs)
2029 {
2030     if (!migrate_compress()) {
2031         return false;
2032     }
2033 
2034     /*
2035      * If xbzrle is enabled (e.g., after first round of migration), stop
2036      * using the data compression. In theory, xbzrle can do better than
2037      * compression.
2038      */
2039     if (rs->xbzrle_started) {
2040         return false;
2041     }
2042 
2043     return true;
2044 }
2045 
2046 /*
2047  * try to compress the page before posting it out, return true if the page
2048  * has been properly handled by compression, otherwise needs other
2049  * paths to handle it
2050  */
2051 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2052                                RAMBlock *block, ram_addr_t offset)
2053 {
2054     if (!save_page_use_compression(rs)) {
2055         return false;
2056     }
2057 
2058     /*
2059      * When starting the process of a new block, the first page of
2060      * the block should be sent out before other pages in the same
2061      * block, and all the pages in last block should have been sent
2062      * out, keeping this order is important, because the 'cont' flag
2063      * is used to avoid resending the block name.
2064      *
2065      * We post the fist page as normal page as compression will take
2066      * much CPU resource.
2067      */
2068     if (block != pss->last_sent_block) {
2069         ram_flush_compressed_data(rs);
2070         return false;
2071     }
2072 
2073     if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
2074         return true;
2075     }
2076 
2077     compression_counters.busy++;
2078     return false;
2079 }
2080 
2081 /**
2082  * ram_save_target_page_legacy: save one target page
2083  *
2084  * Returns the number of pages written
2085  *
2086  * @rs: current RAM state
2087  * @pss: data about the page we want to send
2088  */
2089 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2090 {
2091     RAMBlock *block = pss->block;
2092     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2093     int res;
2094 
2095     if (control_save_page(pss, block, offset, &res)) {
2096         return res;
2097     }
2098 
2099     if (save_compress_page(rs, pss, block, offset)) {
2100         return 1;
2101     }
2102 
2103     res = save_zero_page(pss, pss->pss_channel, block, offset);
2104     if (res > 0) {
2105         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2106          * page would be stale
2107          */
2108         if (rs->xbzrle_started) {
2109             XBZRLE_cache_lock();
2110             xbzrle_cache_zero_page(rs, block->offset + offset);
2111             XBZRLE_cache_unlock();
2112         }
2113         return res;
2114     }
2115 
2116     /*
2117      * Do not use multifd in postcopy as one whole host page should be
2118      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2119      * if host page size == guest page size the dest guest during run may
2120      * still see partially copied pages which is data corruption.
2121      */
2122     if (migrate_multifd() && !migration_in_postcopy()) {
2123         return ram_save_multifd_page(pss->pss_channel, block, offset);
2124     }
2125 
2126     return ram_save_page(rs, pss);
2127 }
2128 
2129 /* Should be called before sending a host page */
2130 static void pss_host_page_prepare(PageSearchStatus *pss)
2131 {
2132     /* How many guest pages are there in one host page? */
2133     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2134 
2135     pss->host_page_sending = true;
2136     if (guest_pfns <= 1) {
2137         /*
2138          * This covers both when guest psize == host psize, or when guest
2139          * has larger psize than the host (guest_pfns==0).
2140          *
2141          * For the latter, we always send one whole guest page per
2142          * iteration of the host page (example: an Alpha VM on x86 host
2143          * will have guest psize 8K while host psize 4K).
2144          */
2145         pss->host_page_start = pss->page;
2146         pss->host_page_end = pss->page + 1;
2147     } else {
2148         /*
2149          * The host page spans over multiple guest pages, we send them
2150          * within the same host page iteration.
2151          */
2152         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2153         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2154     }
2155 }
2156 
2157 /*
2158  * Whether the page pointed by PSS is within the host page being sent.
2159  * Must be called after a previous pss_host_page_prepare().
2160  */
2161 static bool pss_within_range(PageSearchStatus *pss)
2162 {
2163     ram_addr_t ram_addr;
2164 
2165     assert(pss->host_page_sending);
2166 
2167     /* Over host-page boundary? */
2168     if (pss->page >= pss->host_page_end) {
2169         return false;
2170     }
2171 
2172     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2173 
2174     return offset_in_ramblock(pss->block, ram_addr);
2175 }
2176 
2177 static void pss_host_page_finish(PageSearchStatus *pss)
2178 {
2179     pss->host_page_sending = false;
2180     /* This is not needed, but just to reset it */
2181     pss->host_page_start = pss->host_page_end = 0;
2182 }
2183 
2184 /*
2185  * Send an urgent host page specified by `pss'.  Need to be called with
2186  * bitmap_mutex held.
2187  *
2188  * Returns 0 if save host page succeeded, false otherwise.
2189  */
2190 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2191 {
2192     bool page_dirty, sent = false;
2193     RAMState *rs = ram_state;
2194     int ret = 0;
2195 
2196     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2197     pss_host_page_prepare(pss);
2198 
2199     /*
2200      * If precopy is sending the same page, let it be done in precopy, or
2201      * we could send the same page in two channels and none of them will
2202      * receive the whole page.
2203      */
2204     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2205         trace_postcopy_preempt_hit(pss->block->idstr,
2206                                    pss->page << TARGET_PAGE_BITS);
2207         return 0;
2208     }
2209 
2210     do {
2211         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2212 
2213         if (page_dirty) {
2214             /* Be strict to return code; it must be 1, or what else? */
2215             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2216                 error_report_once("%s: ram_save_target_page failed", __func__);
2217                 ret = -1;
2218                 goto out;
2219             }
2220             sent = true;
2221         }
2222         pss_find_next_dirty(pss);
2223     } while (pss_within_range(pss));
2224 out:
2225     pss_host_page_finish(pss);
2226     /* For urgent requests, flush immediately if sent */
2227     if (sent) {
2228         qemu_fflush(pss->pss_channel);
2229     }
2230     return ret;
2231 }
2232 
2233 /**
2234  * ram_save_host_page: save a whole host page
2235  *
2236  * Starting at *offset send pages up to the end of the current host
2237  * page. It's valid for the initial offset to point into the middle of
2238  * a host page in which case the remainder of the hostpage is sent.
2239  * Only dirty target pages are sent. Note that the host page size may
2240  * be a huge page for this block.
2241  *
2242  * The saving stops at the boundary of the used_length of the block
2243  * if the RAMBlock isn't a multiple of the host page size.
2244  *
2245  * The caller must be with ram_state.bitmap_mutex held to call this
2246  * function.  Note that this function can temporarily release the lock, but
2247  * when the function is returned it'll make sure the lock is still held.
2248  *
2249  * Returns the number of pages written or negative on error
2250  *
2251  * @rs: current RAM state
2252  * @pss: data about the page we want to send
2253  */
2254 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2255 {
2256     bool page_dirty, preempt_active = postcopy_preempt_active();
2257     int tmppages, pages = 0;
2258     size_t pagesize_bits =
2259         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2260     unsigned long start_page = pss->page;
2261     int res;
2262 
2263     if (ramblock_is_ignored(pss->block)) {
2264         error_report("block %s should not be migrated !", pss->block->idstr);
2265         return 0;
2266     }
2267 
2268     /* Update host page boundary information */
2269     pss_host_page_prepare(pss);
2270 
2271     do {
2272         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2273 
2274         /* Check the pages is dirty and if it is send it */
2275         if (page_dirty) {
2276             /*
2277              * Properly yield the lock only in postcopy preempt mode
2278              * because both migration thread and rp-return thread can
2279              * operate on the bitmaps.
2280              */
2281             if (preempt_active) {
2282                 qemu_mutex_unlock(&rs->bitmap_mutex);
2283             }
2284             tmppages = migration_ops->ram_save_target_page(rs, pss);
2285             if (tmppages >= 0) {
2286                 pages += tmppages;
2287                 /*
2288                  * Allow rate limiting to happen in the middle of huge pages if
2289                  * something is sent in the current iteration.
2290                  */
2291                 if (pagesize_bits > 1 && tmppages > 0) {
2292                     migration_rate_limit();
2293                 }
2294             }
2295             if (preempt_active) {
2296                 qemu_mutex_lock(&rs->bitmap_mutex);
2297             }
2298         } else {
2299             tmppages = 0;
2300         }
2301 
2302         if (tmppages < 0) {
2303             pss_host_page_finish(pss);
2304             return tmppages;
2305         }
2306 
2307         pss_find_next_dirty(pss);
2308     } while (pss_within_range(pss));
2309 
2310     pss_host_page_finish(pss);
2311 
2312     res = ram_save_release_protection(rs, pss, start_page);
2313     return (res < 0 ? res : pages);
2314 }
2315 
2316 /**
2317  * ram_find_and_save_block: finds a dirty page and sends it to f
2318  *
2319  * Called within an RCU critical section.
2320  *
2321  * Returns the number of pages written where zero means no dirty pages,
2322  * or negative on error
2323  *
2324  * @rs: current RAM state
2325  *
2326  * On systems where host-page-size > target-page-size it will send all the
2327  * pages in a host page that are dirty.
2328  */
2329 static int ram_find_and_save_block(RAMState *rs)
2330 {
2331     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2332     int pages = 0;
2333 
2334     /* No dirty page as there is zero RAM */
2335     if (!rs->ram_bytes_total) {
2336         return pages;
2337     }
2338 
2339     /*
2340      * Always keep last_seen_block/last_page valid during this procedure,
2341      * because find_dirty_block() relies on these values (e.g., we compare
2342      * last_seen_block with pss.block to see whether we searched all the
2343      * ramblocks) to detect the completion of migration.  Having NULL value
2344      * of last_seen_block can conditionally cause below loop to run forever.
2345      */
2346     if (!rs->last_seen_block) {
2347         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2348         rs->last_page = 0;
2349     }
2350 
2351     pss_init(pss, rs->last_seen_block, rs->last_page);
2352 
2353     while (true){
2354         if (!get_queued_page(rs, pss)) {
2355             /* priority queue empty, so just search for something dirty */
2356             int res = find_dirty_block(rs, pss);
2357             if (res != PAGE_DIRTY_FOUND) {
2358                 if (res == PAGE_ALL_CLEAN) {
2359                     break;
2360                 } else if (res == PAGE_TRY_AGAIN) {
2361                     continue;
2362                 } else if (res < 0) {
2363                     pages = res;
2364                     break;
2365                 }
2366             }
2367         }
2368         pages = ram_save_host_page(rs, pss);
2369         if (pages) {
2370             break;
2371         }
2372     }
2373 
2374     rs->last_seen_block = pss->block;
2375     rs->last_page = pss->page;
2376 
2377     return pages;
2378 }
2379 
2380 static uint64_t ram_bytes_total_with_ignored(void)
2381 {
2382     RAMBlock *block;
2383     uint64_t total = 0;
2384 
2385     RCU_READ_LOCK_GUARD();
2386 
2387     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2388         total += block->used_length;
2389     }
2390     return total;
2391 }
2392 
2393 uint64_t ram_bytes_total(void)
2394 {
2395     RAMBlock *block;
2396     uint64_t total = 0;
2397 
2398     RCU_READ_LOCK_GUARD();
2399 
2400     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2401         total += block->used_length;
2402     }
2403     return total;
2404 }
2405 
2406 static void xbzrle_load_setup(void)
2407 {
2408     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2409 }
2410 
2411 static void xbzrle_load_cleanup(void)
2412 {
2413     g_free(XBZRLE.decoded_buf);
2414     XBZRLE.decoded_buf = NULL;
2415 }
2416 
2417 static void ram_state_cleanup(RAMState **rsp)
2418 {
2419     if (*rsp) {
2420         migration_page_queue_free(*rsp);
2421         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2422         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2423         g_free(*rsp);
2424         *rsp = NULL;
2425     }
2426 }
2427 
2428 static void xbzrle_cleanup(void)
2429 {
2430     XBZRLE_cache_lock();
2431     if (XBZRLE.cache) {
2432         cache_fini(XBZRLE.cache);
2433         g_free(XBZRLE.encoded_buf);
2434         g_free(XBZRLE.current_buf);
2435         g_free(XBZRLE.zero_target_page);
2436         XBZRLE.cache = NULL;
2437         XBZRLE.encoded_buf = NULL;
2438         XBZRLE.current_buf = NULL;
2439         XBZRLE.zero_target_page = NULL;
2440     }
2441     XBZRLE_cache_unlock();
2442 }
2443 
2444 static void ram_save_cleanup(void *opaque)
2445 {
2446     RAMState **rsp = opaque;
2447     RAMBlock *block;
2448 
2449     /* We don't use dirty log with background snapshots */
2450     if (!migrate_background_snapshot()) {
2451         /* caller have hold iothread lock or is in a bh, so there is
2452          * no writing race against the migration bitmap
2453          */
2454         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2455             /*
2456              * do not stop dirty log without starting it, since
2457              * memory_global_dirty_log_stop will assert that
2458              * memory_global_dirty_log_start/stop used in pairs
2459              */
2460             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2461         }
2462     }
2463 
2464     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2465         g_free(block->clear_bmap);
2466         block->clear_bmap = NULL;
2467         g_free(block->bmap);
2468         block->bmap = NULL;
2469     }
2470 
2471     xbzrle_cleanup();
2472     compress_threads_save_cleanup();
2473     ram_state_cleanup(rsp);
2474     g_free(migration_ops);
2475     migration_ops = NULL;
2476 }
2477 
2478 static void ram_state_reset(RAMState *rs)
2479 {
2480     int i;
2481 
2482     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2483         rs->pss[i].last_sent_block = NULL;
2484     }
2485 
2486     rs->last_seen_block = NULL;
2487     rs->last_page = 0;
2488     rs->last_version = ram_list.version;
2489     rs->xbzrle_started = false;
2490 }
2491 
2492 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2493 
2494 /* **** functions for postcopy ***** */
2495 
2496 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2497 {
2498     struct RAMBlock *block;
2499 
2500     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2501         unsigned long *bitmap = block->bmap;
2502         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2503         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2504 
2505         while (run_start < range) {
2506             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2507             ram_discard_range(block->idstr,
2508                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2509                               ((ram_addr_t)(run_end - run_start))
2510                                 << TARGET_PAGE_BITS);
2511             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2512         }
2513     }
2514 }
2515 
2516 /**
2517  * postcopy_send_discard_bm_ram: discard a RAMBlock
2518  *
2519  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2520  *
2521  * @ms: current migration state
2522  * @block: RAMBlock to discard
2523  */
2524 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2525 {
2526     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2527     unsigned long current;
2528     unsigned long *bitmap = block->bmap;
2529 
2530     for (current = 0; current < end; ) {
2531         unsigned long one = find_next_bit(bitmap, end, current);
2532         unsigned long zero, discard_length;
2533 
2534         if (one >= end) {
2535             break;
2536         }
2537 
2538         zero = find_next_zero_bit(bitmap, end, one + 1);
2539 
2540         if (zero >= end) {
2541             discard_length = end - one;
2542         } else {
2543             discard_length = zero - one;
2544         }
2545         postcopy_discard_send_range(ms, one, discard_length);
2546         current = one + discard_length;
2547     }
2548 }
2549 
2550 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2551 
2552 /**
2553  * postcopy_each_ram_send_discard: discard all RAMBlocks
2554  *
2555  * Utility for the outgoing postcopy code.
2556  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2557  *   passing it bitmap indexes and name.
2558  * (qemu_ram_foreach_block ends up passing unscaled lengths
2559  *  which would mean postcopy code would have to deal with target page)
2560  *
2561  * @ms: current migration state
2562  */
2563 static void postcopy_each_ram_send_discard(MigrationState *ms)
2564 {
2565     struct RAMBlock *block;
2566 
2567     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2568         postcopy_discard_send_init(ms, block->idstr);
2569 
2570         /*
2571          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2572          * host-page size chunks, mark any partially dirty host-page size
2573          * chunks as all dirty.  In this case the host-page is the host-page
2574          * for the particular RAMBlock, i.e. it might be a huge page.
2575          */
2576         postcopy_chunk_hostpages_pass(ms, block);
2577 
2578         /*
2579          * Postcopy sends chunks of bitmap over the wire, but it
2580          * just needs indexes at this point, avoids it having
2581          * target page specific code.
2582          */
2583         postcopy_send_discard_bm_ram(ms, block);
2584         postcopy_discard_send_finish(ms);
2585     }
2586 }
2587 
2588 /**
2589  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2590  *
2591  * Helper for postcopy_chunk_hostpages; it's called twice to
2592  * canonicalize the two bitmaps, that are similar, but one is
2593  * inverted.
2594  *
2595  * Postcopy requires that all target pages in a hostpage are dirty or
2596  * clean, not a mix.  This function canonicalizes the bitmaps.
2597  *
2598  * @ms: current migration state
2599  * @block: block that contains the page we want to canonicalize
2600  */
2601 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2602 {
2603     RAMState *rs = ram_state;
2604     unsigned long *bitmap = block->bmap;
2605     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2606     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2607     unsigned long run_start;
2608 
2609     if (block->page_size == TARGET_PAGE_SIZE) {
2610         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2611         return;
2612     }
2613 
2614     /* Find a dirty page */
2615     run_start = find_next_bit(bitmap, pages, 0);
2616 
2617     while (run_start < pages) {
2618 
2619         /*
2620          * If the start of this run of pages is in the middle of a host
2621          * page, then we need to fixup this host page.
2622          */
2623         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2624             /* Find the end of this run */
2625             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2626             /*
2627              * If the end isn't at the start of a host page, then the
2628              * run doesn't finish at the end of a host page
2629              * and we need to discard.
2630              */
2631         }
2632 
2633         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2634             unsigned long page;
2635             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2636                                                              host_ratio);
2637             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2638 
2639             /* Clean up the bitmap */
2640             for (page = fixup_start_addr;
2641                  page < fixup_start_addr + host_ratio; page++) {
2642                 /*
2643                  * Remark them as dirty, updating the count for any pages
2644                  * that weren't previously dirty.
2645                  */
2646                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2647             }
2648         }
2649 
2650         /* Find the next dirty page for the next iteration */
2651         run_start = find_next_bit(bitmap, pages, run_start);
2652     }
2653 }
2654 
2655 /**
2656  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2657  *
2658  * Transmit the set of pages to be discarded after precopy to the target
2659  * these are pages that:
2660  *     a) Have been previously transmitted but are now dirty again
2661  *     b) Pages that have never been transmitted, this ensures that
2662  *        any pages on the destination that have been mapped by background
2663  *        tasks get discarded (transparent huge pages is the specific concern)
2664  * Hopefully this is pretty sparse
2665  *
2666  * @ms: current migration state
2667  */
2668 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2669 {
2670     RAMState *rs = ram_state;
2671 
2672     RCU_READ_LOCK_GUARD();
2673 
2674     /* This should be our last sync, the src is now paused */
2675     migration_bitmap_sync(rs, false);
2676 
2677     /* Easiest way to make sure we don't resume in the middle of a host-page */
2678     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2679     rs->last_seen_block = NULL;
2680     rs->last_page = 0;
2681 
2682     postcopy_each_ram_send_discard(ms);
2683 
2684     trace_ram_postcopy_send_discard_bitmap();
2685 }
2686 
2687 /**
2688  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2689  *
2690  * Returns zero on success
2691  *
2692  * @rbname: name of the RAMBlock of the request. NULL means the
2693  *          same that last one.
2694  * @start: RAMBlock starting page
2695  * @length: RAMBlock size
2696  */
2697 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2698 {
2699     trace_ram_discard_range(rbname, start, length);
2700 
2701     RCU_READ_LOCK_GUARD();
2702     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2703 
2704     if (!rb) {
2705         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2706         return -1;
2707     }
2708 
2709     /*
2710      * On source VM, we don't need to update the received bitmap since
2711      * we don't even have one.
2712      */
2713     if (rb->receivedmap) {
2714         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2715                      length >> qemu_target_page_bits());
2716     }
2717 
2718     return ram_block_discard_range(rb, start, length);
2719 }
2720 
2721 /*
2722  * For every allocation, we will try not to crash the VM if the
2723  * allocation failed.
2724  */
2725 static int xbzrle_init(void)
2726 {
2727     Error *local_err = NULL;
2728 
2729     if (!migrate_xbzrle()) {
2730         return 0;
2731     }
2732 
2733     XBZRLE_cache_lock();
2734 
2735     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2736     if (!XBZRLE.zero_target_page) {
2737         error_report("%s: Error allocating zero page", __func__);
2738         goto err_out;
2739     }
2740 
2741     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2742                               TARGET_PAGE_SIZE, &local_err);
2743     if (!XBZRLE.cache) {
2744         error_report_err(local_err);
2745         goto free_zero_page;
2746     }
2747 
2748     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2749     if (!XBZRLE.encoded_buf) {
2750         error_report("%s: Error allocating encoded_buf", __func__);
2751         goto free_cache;
2752     }
2753 
2754     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2755     if (!XBZRLE.current_buf) {
2756         error_report("%s: Error allocating current_buf", __func__);
2757         goto free_encoded_buf;
2758     }
2759 
2760     /* We are all good */
2761     XBZRLE_cache_unlock();
2762     return 0;
2763 
2764 free_encoded_buf:
2765     g_free(XBZRLE.encoded_buf);
2766     XBZRLE.encoded_buf = NULL;
2767 free_cache:
2768     cache_fini(XBZRLE.cache);
2769     XBZRLE.cache = NULL;
2770 free_zero_page:
2771     g_free(XBZRLE.zero_target_page);
2772     XBZRLE.zero_target_page = NULL;
2773 err_out:
2774     XBZRLE_cache_unlock();
2775     return -ENOMEM;
2776 }
2777 
2778 static int ram_state_init(RAMState **rsp)
2779 {
2780     *rsp = g_try_new0(RAMState, 1);
2781 
2782     if (!*rsp) {
2783         error_report("%s: Init ramstate fail", __func__);
2784         return -1;
2785     }
2786 
2787     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2788     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2789     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2790     (*rsp)->ram_bytes_total = ram_bytes_total();
2791 
2792     /*
2793      * Count the total number of pages used by ram blocks not including any
2794      * gaps due to alignment or unplugs.
2795      * This must match with the initial values of dirty bitmap.
2796      */
2797     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
2798     ram_state_reset(*rsp);
2799 
2800     return 0;
2801 }
2802 
2803 static void ram_list_init_bitmaps(void)
2804 {
2805     MigrationState *ms = migrate_get_current();
2806     RAMBlock *block;
2807     unsigned long pages;
2808     uint8_t shift;
2809 
2810     /* Skip setting bitmap if there is no RAM */
2811     if (ram_bytes_total()) {
2812         shift = ms->clear_bitmap_shift;
2813         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2814             error_report("clear_bitmap_shift (%u) too big, using "
2815                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2816             shift = CLEAR_BITMAP_SHIFT_MAX;
2817         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2818             error_report("clear_bitmap_shift (%u) too small, using "
2819                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2820             shift = CLEAR_BITMAP_SHIFT_MIN;
2821         }
2822 
2823         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2824             pages = block->max_length >> TARGET_PAGE_BITS;
2825             /*
2826              * The initial dirty bitmap for migration must be set with all
2827              * ones to make sure we'll migrate every guest RAM page to
2828              * destination.
2829              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2830              * new migration after a failed migration, ram_list.
2831              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2832              * guest memory.
2833              */
2834             block->bmap = bitmap_new(pages);
2835             bitmap_set(block->bmap, 0, pages);
2836             block->clear_bmap_shift = shift;
2837             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2838         }
2839     }
2840 }
2841 
2842 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2843 {
2844     unsigned long pages;
2845     RAMBlock *rb;
2846 
2847     RCU_READ_LOCK_GUARD();
2848 
2849     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2850             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2851             rs->migration_dirty_pages -= pages;
2852     }
2853 }
2854 
2855 static void ram_init_bitmaps(RAMState *rs)
2856 {
2857     /* For memory_global_dirty_log_start below.  */
2858     qemu_mutex_lock_iothread();
2859     qemu_mutex_lock_ramlist();
2860 
2861     WITH_RCU_READ_LOCK_GUARD() {
2862         ram_list_init_bitmaps();
2863         /* We don't use dirty log with background snapshots */
2864         if (!migrate_background_snapshot()) {
2865             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2866             migration_bitmap_sync_precopy(rs, false);
2867         }
2868     }
2869     qemu_mutex_unlock_ramlist();
2870     qemu_mutex_unlock_iothread();
2871 
2872     /*
2873      * After an eventual first bitmap sync, fixup the initial bitmap
2874      * containing all 1s to exclude any discarded pages from migration.
2875      */
2876     migration_bitmap_clear_discarded_pages(rs);
2877 }
2878 
2879 static int ram_init_all(RAMState **rsp)
2880 {
2881     if (ram_state_init(rsp)) {
2882         return -1;
2883     }
2884 
2885     if (xbzrle_init()) {
2886         ram_state_cleanup(rsp);
2887         return -1;
2888     }
2889 
2890     ram_init_bitmaps(*rsp);
2891 
2892     return 0;
2893 }
2894 
2895 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2896 {
2897     RAMBlock *block;
2898     uint64_t pages = 0;
2899 
2900     /*
2901      * Postcopy is not using xbzrle/compression, so no need for that.
2902      * Also, since source are already halted, we don't need to care
2903      * about dirty page logging as well.
2904      */
2905 
2906     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2907         pages += bitmap_count_one(block->bmap,
2908                                   block->used_length >> TARGET_PAGE_BITS);
2909     }
2910 
2911     /* This may not be aligned with current bitmaps. Recalculate. */
2912     rs->migration_dirty_pages = pages;
2913 
2914     ram_state_reset(rs);
2915 
2916     /* Update RAMState cache of output QEMUFile */
2917     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
2918 
2919     trace_ram_state_resume_prepare(pages);
2920 }
2921 
2922 /*
2923  * This function clears bits of the free pages reported by the caller from the
2924  * migration dirty bitmap. @addr is the host address corresponding to the
2925  * start of the continuous guest free pages, and @len is the total bytes of
2926  * those pages.
2927  */
2928 void qemu_guest_free_page_hint(void *addr, size_t len)
2929 {
2930     RAMBlock *block;
2931     ram_addr_t offset;
2932     size_t used_len, start, npages;
2933     MigrationState *s = migrate_get_current();
2934 
2935     /* This function is currently expected to be used during live migration */
2936     if (!migration_is_setup_or_active(s->state)) {
2937         return;
2938     }
2939 
2940     for (; len > 0; len -= used_len, addr += used_len) {
2941         block = qemu_ram_block_from_host(addr, false, &offset);
2942         if (unlikely(!block || offset >= block->used_length)) {
2943             /*
2944              * The implementation might not support RAMBlock resize during
2945              * live migration, but it could happen in theory with future
2946              * updates. So we add a check here to capture that case.
2947              */
2948             error_report_once("%s unexpected error", __func__);
2949             return;
2950         }
2951 
2952         if (len <= block->used_length - offset) {
2953             used_len = len;
2954         } else {
2955             used_len = block->used_length - offset;
2956         }
2957 
2958         start = offset >> TARGET_PAGE_BITS;
2959         npages = used_len >> TARGET_PAGE_BITS;
2960 
2961         qemu_mutex_lock(&ram_state->bitmap_mutex);
2962         /*
2963          * The skipped free pages are equavalent to be sent from clear_bmap's
2964          * perspective, so clear the bits from the memory region bitmap which
2965          * are initially set. Otherwise those skipped pages will be sent in
2966          * the next round after syncing from the memory region bitmap.
2967          */
2968         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2969         ram_state->migration_dirty_pages -=
2970                       bitmap_count_one_with_offset(block->bmap, start, npages);
2971         bitmap_clear(block->bmap, start, npages);
2972         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2973     }
2974 }
2975 
2976 /*
2977  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2978  * long-running RCU critical section.  When rcu-reclaims in the code
2979  * start to become numerous it will be necessary to reduce the
2980  * granularity of these critical sections.
2981  */
2982 
2983 /**
2984  * ram_save_setup: Setup RAM for migration
2985  *
2986  * Returns zero to indicate success and negative for error
2987  *
2988  * @f: QEMUFile where to send the data
2989  * @opaque: RAMState pointer
2990  */
2991 static int ram_save_setup(QEMUFile *f, void *opaque)
2992 {
2993     RAMState **rsp = opaque;
2994     RAMBlock *block;
2995     int ret;
2996 
2997     if (compress_threads_save_setup()) {
2998         return -1;
2999     }
3000 
3001     /* migration has already setup the bitmap, reuse it. */
3002     if (!migration_in_colo_state()) {
3003         if (ram_init_all(rsp) != 0) {
3004             compress_threads_save_cleanup();
3005             return -1;
3006         }
3007     }
3008     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3009 
3010     WITH_RCU_READ_LOCK_GUARD() {
3011         qemu_put_be64(f, ram_bytes_total_with_ignored()
3012                          | RAM_SAVE_FLAG_MEM_SIZE);
3013 
3014         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3015             qemu_put_byte(f, strlen(block->idstr));
3016             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3017             qemu_put_be64(f, block->used_length);
3018             if (migrate_postcopy_ram() && block->page_size !=
3019                                           qemu_host_page_size) {
3020                 qemu_put_be64(f, block->page_size);
3021             }
3022             if (migrate_ignore_shared()) {
3023                 qemu_put_be64(f, block->mr->addr);
3024             }
3025         }
3026     }
3027 
3028     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3029     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3030 
3031     migration_ops = g_malloc0(sizeof(MigrationOps));
3032     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3033     ret = multifd_send_sync_main(f);
3034     if (ret < 0) {
3035         return ret;
3036     }
3037 
3038     if (!migrate_multifd_flush_after_each_section()) {
3039         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3040     }
3041 
3042     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3043     qemu_fflush(f);
3044 
3045     return 0;
3046 }
3047 
3048 /**
3049  * ram_save_iterate: iterative stage for migration
3050  *
3051  * Returns zero to indicate success and negative for error
3052  *
3053  * @f: QEMUFile where to send the data
3054  * @opaque: RAMState pointer
3055  */
3056 static int ram_save_iterate(QEMUFile *f, void *opaque)
3057 {
3058     RAMState **temp = opaque;
3059     RAMState *rs = *temp;
3060     int ret = 0;
3061     int i;
3062     int64_t t0;
3063     int done = 0;
3064 
3065     if (blk_mig_bulk_active()) {
3066         /* Avoid transferring ram during bulk phase of block migration as
3067          * the bulk phase will usually take a long time and transferring
3068          * ram updates during that time is pointless. */
3069         goto out;
3070     }
3071 
3072     /*
3073      * We'll take this lock a little bit long, but it's okay for two reasons.
3074      * Firstly, the only possible other thread to take it is who calls
3075      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3076      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3077      * guarantees that we'll at least released it in a regular basis.
3078      */
3079     qemu_mutex_lock(&rs->bitmap_mutex);
3080     WITH_RCU_READ_LOCK_GUARD() {
3081         if (ram_list.version != rs->last_version) {
3082             ram_state_reset(rs);
3083         }
3084 
3085         /* Read version before ram_list.blocks */
3086         smp_rmb();
3087 
3088         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3089 
3090         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3091         i = 0;
3092         while ((ret = migration_rate_exceeded(f)) == 0 ||
3093                postcopy_has_request(rs)) {
3094             int pages;
3095 
3096             if (qemu_file_get_error(f)) {
3097                 break;
3098             }
3099 
3100             pages = ram_find_and_save_block(rs);
3101             /* no more pages to sent */
3102             if (pages == 0) {
3103                 done = 1;
3104                 break;
3105             }
3106 
3107             if (pages < 0) {
3108                 qemu_file_set_error(f, pages);
3109                 break;
3110             }
3111 
3112             rs->target_page_count += pages;
3113 
3114             /*
3115              * During postcopy, it is necessary to make sure one whole host
3116              * page is sent in one chunk.
3117              */
3118             if (migrate_postcopy_ram()) {
3119                 ram_flush_compressed_data(rs);
3120             }
3121 
3122             /*
3123              * we want to check in the 1st loop, just in case it was the 1st
3124              * time and we had to sync the dirty bitmap.
3125              * qemu_clock_get_ns() is a bit expensive, so we only check each
3126              * some iterations
3127              */
3128             if ((i & 63) == 0) {
3129                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3130                               1000000;
3131                 if (t1 > MAX_WAIT) {
3132                     trace_ram_save_iterate_big_wait(t1, i);
3133                     break;
3134                 }
3135             }
3136             i++;
3137         }
3138     }
3139     qemu_mutex_unlock(&rs->bitmap_mutex);
3140 
3141     /*
3142      * Must occur before EOS (or any QEMUFile operation)
3143      * because of RDMA protocol.
3144      */
3145     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3146 
3147 out:
3148     if (ret >= 0
3149         && migration_is_setup_or_active(migrate_get_current()->state)) {
3150         if (migrate_multifd_flush_after_each_section()) {
3151             ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3152             if (ret < 0) {
3153                 return ret;
3154             }
3155         }
3156 
3157         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3158         qemu_fflush(f);
3159         ram_transferred_add(8);
3160 
3161         ret = qemu_file_get_error(f);
3162     }
3163     if (ret < 0) {
3164         return ret;
3165     }
3166 
3167     return done;
3168 }
3169 
3170 /**
3171  * ram_save_complete: function called to send the remaining amount of ram
3172  *
3173  * Returns zero to indicate success or negative on error
3174  *
3175  * Called with iothread lock
3176  *
3177  * @f: QEMUFile where to send the data
3178  * @opaque: RAMState pointer
3179  */
3180 static int ram_save_complete(QEMUFile *f, void *opaque)
3181 {
3182     RAMState **temp = opaque;
3183     RAMState *rs = *temp;
3184     int ret = 0;
3185 
3186     rs->last_stage = !migration_in_colo_state();
3187 
3188     WITH_RCU_READ_LOCK_GUARD() {
3189         if (!migration_in_postcopy()) {
3190             migration_bitmap_sync_precopy(rs, true);
3191         }
3192 
3193         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3194 
3195         /* try transferring iterative blocks of memory */
3196 
3197         /* flush all remaining blocks regardless of rate limiting */
3198         qemu_mutex_lock(&rs->bitmap_mutex);
3199         while (true) {
3200             int pages;
3201 
3202             pages = ram_find_and_save_block(rs);
3203             /* no more blocks to sent */
3204             if (pages == 0) {
3205                 break;
3206             }
3207             if (pages < 0) {
3208                 ret = pages;
3209                 break;
3210             }
3211         }
3212         qemu_mutex_unlock(&rs->bitmap_mutex);
3213 
3214         ram_flush_compressed_data(rs);
3215         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3216     }
3217 
3218     if (ret < 0) {
3219         return ret;
3220     }
3221 
3222     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3223     if (ret < 0) {
3224         return ret;
3225     }
3226 
3227     if (!migrate_multifd_flush_after_each_section()) {
3228         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3229     }
3230     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3231     qemu_fflush(f);
3232 
3233     return 0;
3234 }
3235 
3236 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3237                                        uint64_t *can_postcopy)
3238 {
3239     RAMState **temp = opaque;
3240     RAMState *rs = *temp;
3241 
3242     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3243 
3244     if (migrate_postcopy_ram()) {
3245         /* We can do postcopy, and all the data is postcopiable */
3246         *can_postcopy += remaining_size;
3247     } else {
3248         *must_precopy += remaining_size;
3249     }
3250 }
3251 
3252 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3253                                     uint64_t *can_postcopy)
3254 {
3255     MigrationState *s = migrate_get_current();
3256     RAMState **temp = opaque;
3257     RAMState *rs = *temp;
3258 
3259     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3260 
3261     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3262         qemu_mutex_lock_iothread();
3263         WITH_RCU_READ_LOCK_GUARD() {
3264             migration_bitmap_sync_precopy(rs, false);
3265         }
3266         qemu_mutex_unlock_iothread();
3267         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3268     }
3269 
3270     if (migrate_postcopy_ram()) {
3271         /* We can do postcopy, and all the data is postcopiable */
3272         *can_postcopy += remaining_size;
3273     } else {
3274         *must_precopy += remaining_size;
3275     }
3276 }
3277 
3278 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3279 {
3280     unsigned int xh_len;
3281     int xh_flags;
3282     uint8_t *loaded_data;
3283 
3284     /* extract RLE header */
3285     xh_flags = qemu_get_byte(f);
3286     xh_len = qemu_get_be16(f);
3287 
3288     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3289         error_report("Failed to load XBZRLE page - wrong compression!");
3290         return -1;
3291     }
3292 
3293     if (xh_len > TARGET_PAGE_SIZE) {
3294         error_report("Failed to load XBZRLE page - len overflow!");
3295         return -1;
3296     }
3297     loaded_data = XBZRLE.decoded_buf;
3298     /* load data and decode */
3299     /* it can change loaded_data to point to an internal buffer */
3300     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3301 
3302     /* decode RLE */
3303     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3304                              TARGET_PAGE_SIZE) == -1) {
3305         error_report("Failed to load XBZRLE page - decode error!");
3306         return -1;
3307     }
3308 
3309     return 0;
3310 }
3311 
3312 /**
3313  * ram_block_from_stream: read a RAMBlock id from the migration stream
3314  *
3315  * Must be called from within a rcu critical section.
3316  *
3317  * Returns a pointer from within the RCU-protected ram_list.
3318  *
3319  * @mis: the migration incoming state pointer
3320  * @f: QEMUFile where to read the data from
3321  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3322  * @channel: the channel we're using
3323  */
3324 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3325                                               QEMUFile *f, int flags,
3326                                               int channel)
3327 {
3328     RAMBlock *block = mis->last_recv_block[channel];
3329     char id[256];
3330     uint8_t len;
3331 
3332     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3333         if (!block) {
3334             error_report("Ack, bad migration stream!");
3335             return NULL;
3336         }
3337         return block;
3338     }
3339 
3340     len = qemu_get_byte(f);
3341     qemu_get_buffer(f, (uint8_t *)id, len);
3342     id[len] = 0;
3343 
3344     block = qemu_ram_block_by_name(id);
3345     if (!block) {
3346         error_report("Can't find block %s", id);
3347         return NULL;
3348     }
3349 
3350     if (ramblock_is_ignored(block)) {
3351         error_report("block %s should not be migrated !", id);
3352         return NULL;
3353     }
3354 
3355     mis->last_recv_block[channel] = block;
3356 
3357     return block;
3358 }
3359 
3360 static inline void *host_from_ram_block_offset(RAMBlock *block,
3361                                                ram_addr_t offset)
3362 {
3363     if (!offset_in_ramblock(block, offset)) {
3364         return NULL;
3365     }
3366 
3367     return block->host + offset;
3368 }
3369 
3370 static void *host_page_from_ram_block_offset(RAMBlock *block,
3371                                              ram_addr_t offset)
3372 {
3373     /* Note: Explicitly no check against offset_in_ramblock(). */
3374     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3375                                    block->page_size);
3376 }
3377 
3378 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3379                                                          ram_addr_t offset)
3380 {
3381     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3382 }
3383 
3384 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3385 {
3386     qemu_mutex_lock(&ram_state->bitmap_mutex);
3387     for (int i = 0; i < pages; i++) {
3388         ram_addr_t offset = normal[i];
3389         ram_state->migration_dirty_pages += !test_and_set_bit(
3390                                                 offset >> TARGET_PAGE_BITS,
3391                                                 block->bmap);
3392     }
3393     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3394 }
3395 
3396 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3397                              ram_addr_t offset, bool record_bitmap)
3398 {
3399     if (!offset_in_ramblock(block, offset)) {
3400         return NULL;
3401     }
3402     if (!block->colo_cache) {
3403         error_report("%s: colo_cache is NULL in block :%s",
3404                      __func__, block->idstr);
3405         return NULL;
3406     }
3407 
3408     /*
3409     * During colo checkpoint, we need bitmap of these migrated pages.
3410     * It help us to decide which pages in ram cache should be flushed
3411     * into VM's RAM later.
3412     */
3413     if (record_bitmap) {
3414         colo_record_bitmap(block, &offset, 1);
3415     }
3416     return block->colo_cache + offset;
3417 }
3418 
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill a page (or a whole RDMA chunk) with a single byte value.  When
 * the fill byte is zero and the memory already reads as zero, the
 * memory is not written.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write if zero is requested and the page is zero already */
    if (ch == 0 && buffer_is_zero(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3435 
3436 static void colo_init_ram_state(void)
3437 {
3438     ram_state_init(&ram_state);
3439 }
3440 
3441 /*
3442  * colo cache: this is for secondary VM, we cache the whole
3443  * memory of the secondary VM, it is need to hold the global lock
3444  * to call this helper.
3445  */
3446 int colo_init_ram_cache(void)
3447 {
3448     RAMBlock *block;
3449 
3450     WITH_RCU_READ_LOCK_GUARD() {
3451         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3452             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3453                                                     NULL, false, false);
3454             if (!block->colo_cache) {
3455                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3456                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3457                              block->used_length);
3458                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3459                     if (block->colo_cache) {
3460                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3461                         block->colo_cache = NULL;
3462                     }
3463                 }
3464                 return -errno;
3465             }
3466             if (!machine_dump_guest_core(current_machine)) {
3467                 qemu_madvise(block->colo_cache, block->used_length,
3468                              QEMU_MADV_DONTDUMP);
3469             }
3470         }
3471     }
3472 
3473     /*
3474     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3475     * with to decide which page in cache should be flushed into SVM's RAM. Here
3476     * we use the same name 'ram_bitmap' as for migration.
3477     */
3478     if (ram_bytes_total()) {
3479         RAMBlock *block;
3480 
3481         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3482             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3483             block->bmap = bitmap_new(pages);
3484         }
3485     }
3486 
3487     colo_init_ram_state();
3488     return 0;
3489 }
3490 
3491 /* TODO: duplicated with ram_init_bitmaps */
3492 void colo_incoming_start_dirty_log(void)
3493 {
3494     RAMBlock *block = NULL;
3495     /* For memory_global_dirty_log_start below. */
3496     qemu_mutex_lock_iothread();
3497     qemu_mutex_lock_ramlist();
3498 
3499     memory_global_dirty_log_sync(false);
3500     WITH_RCU_READ_LOCK_GUARD() {
3501         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3502             ramblock_sync_dirty_bitmap(ram_state, block);
3503             /* Discard this dirty bitmap record */
3504             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3505         }
3506         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3507     }
3508     ram_state->migration_dirty_pages = 0;
3509     qemu_mutex_unlock_ramlist();
3510     qemu_mutex_unlock_iothread();
3511 }
3512 
3513 /* It is need to hold the global lock to call this helper */
3514 void colo_release_ram_cache(void)
3515 {
3516     RAMBlock *block;
3517 
3518     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3519     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3520         g_free(block->bmap);
3521         block->bmap = NULL;
3522     }
3523 
3524     WITH_RCU_READ_LOCK_GUARD() {
3525         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3526             if (block->colo_cache) {
3527                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3528                 block->colo_cache = NULL;
3529             }
3530         }
3531     }
3532     ram_state_cleanup(&ram_state);
3533 }
3534 
3535 /**
3536  * ram_load_setup: Setup RAM for migration incoming side
3537  *
3538  * Returns zero to indicate success and negative for error
3539  *
3540  * @f: QEMUFile where to receive the data
3541  * @opaque: RAMState pointer
3542  */
3543 static int ram_load_setup(QEMUFile *f, void *opaque)
3544 {
3545     xbzrle_load_setup();
3546     ramblock_recv_map_init();
3547 
3548     return 0;
3549 }
3550 
3551 static int ram_load_cleanup(void *opaque)
3552 {
3553     RAMBlock *rb;
3554 
3555     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3556         qemu_ram_block_writeback(rb);
3557     }
3558 
3559     xbzrle_load_cleanup();
3560 
3561     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3562         g_free(rb->receivedmap);
3563         rb->receivedmap = NULL;
3564     }
3565 
3566     return 0;
3567 }
3568 
3569 /**
3570  * ram_postcopy_incoming_init: allocate postcopy data structures
3571  *
3572  * Returns 0 for success and negative if there was one error
3573  *
3574  * @mis: current migration incoming state
3575  *
3576  * Allocate data structures etc needed by incoming migration with
3577  * postcopy-ram. postcopy-ram's similarly names
3578  * postcopy_ram_incoming_init does the work.
3579  */
3580 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3581 {
3582     return postcopy_ram_incoming_init(mis);
3583 }
3584 
3585 /**
3586  * ram_load_postcopy: load a page in postcopy case
3587  *
3588  * Returns 0 for success or -errno in case of error
3589  *
3590  * Called in postcopy mode by ram_load().
3591  * rcu_read_lock is taken prior to this being called.
3592  *
3593  * @f: QEMUFile where to send the data
3594  * @channel: the channel to use for loading
3595  */
3596 int ram_load_postcopy(QEMUFile *f, int channel)
3597 {
3598     int flags = 0, ret = 0;
3599     bool place_needed = false;
3600     bool matches_target_page_size = false;
3601     MigrationIncomingState *mis = migration_incoming_get_current();
3602     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3603 
3604     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3605         ram_addr_t addr;
3606         void *page_buffer = NULL;
3607         void *place_source = NULL;
3608         RAMBlock *block = NULL;
3609         uint8_t ch;
3610         int len;
3611 
3612         addr = qemu_get_be64(f);
3613 
3614         /*
3615          * If qemu file error, we should stop here, and then "addr"
3616          * may be invalid
3617          */
3618         ret = qemu_file_get_error(f);
3619         if (ret) {
3620             break;
3621         }
3622 
3623         flags = addr & ~TARGET_PAGE_MASK;
3624         addr &= TARGET_PAGE_MASK;
3625 
3626         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3627         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3628                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3629             block = ram_block_from_stream(mis, f, flags, channel);
3630             if (!block) {
3631                 ret = -EINVAL;
3632                 break;
3633             }
3634 
3635             /*
3636              * Relying on used_length is racy and can result in false positives.
3637              * We might place pages beyond used_length in case RAM was shrunk
3638              * while in postcopy, which is fine - trying to place via
3639              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3640              */
3641             if (!block->host || addr >= block->postcopy_length) {
3642                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3643                 ret = -EINVAL;
3644                 break;
3645             }
3646             tmp_page->target_pages++;
3647             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3648             /*
3649              * Postcopy requires that we place whole host pages atomically;
3650              * these may be huge pages for RAMBlocks that are backed by
3651              * hugetlbfs.
3652              * To make it atomic, the data is read into a temporary page
3653              * that's moved into place later.
3654              * The migration protocol uses,  possibly smaller, target-pages
3655              * however the source ensures it always sends all the components
3656              * of a host page in one chunk.
3657              */
3658             page_buffer = tmp_page->tmp_huge_page +
3659                           host_page_offset_from_ram_block_offset(block, addr);
3660             /* If all TP are zero then we can optimise the place */
3661             if (tmp_page->target_pages == 1) {
3662                 tmp_page->host_addr =
3663                     host_page_from_ram_block_offset(block, addr);
3664             } else if (tmp_page->host_addr !=
3665                        host_page_from_ram_block_offset(block, addr)) {
3666                 /* not the 1st TP within the HP */
3667                 error_report("Non-same host page detected on channel %d: "
3668                              "Target host page %p, received host page %p "
3669                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3670                              channel, tmp_page->host_addr,
3671                              host_page_from_ram_block_offset(block, addr),
3672                              block->idstr, addr, tmp_page->target_pages);
3673                 ret = -EINVAL;
3674                 break;
3675             }
3676 
3677             /*
3678              * If it's the last part of a host page then we place the host
3679              * page
3680              */
3681             if (tmp_page->target_pages ==
3682                 (block->page_size / TARGET_PAGE_SIZE)) {
3683                 place_needed = true;
3684             }
3685             place_source = tmp_page->tmp_huge_page;
3686         }
3687 
3688         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3689         case RAM_SAVE_FLAG_ZERO:
3690             ch = qemu_get_byte(f);
3691             /*
3692              * Can skip to set page_buffer when
3693              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3694              */
3695             if (ch || !matches_target_page_size) {
3696                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3697             }
3698             if (ch) {
3699                 tmp_page->all_zero = false;
3700             }
3701             break;
3702 
3703         case RAM_SAVE_FLAG_PAGE:
3704             tmp_page->all_zero = false;
3705             if (!matches_target_page_size) {
3706                 /* For huge pages, we always use temporary buffer */
3707                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3708             } else {
3709                 /*
3710                  * For small pages that matches target page size, we
3711                  * avoid the qemu_file copy.  Instead we directly use
3712                  * the buffer of QEMUFile to place the page.  Note: we
3713                  * cannot do any QEMUFile operation before using that
3714                  * buffer to make sure the buffer is valid when
3715                  * placing the page.
3716                  */
3717                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3718                                          TARGET_PAGE_SIZE);
3719             }
3720             break;
3721         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3722             tmp_page->all_zero = false;
3723             len = qemu_get_be32(f);
3724             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3725                 error_report("Invalid compressed data length: %d", len);
3726                 ret = -EINVAL;
3727                 break;
3728             }
3729             decompress_data_with_multi_threads(f, page_buffer, len);
3730             break;
3731         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3732             multifd_recv_sync_main();
3733             break;
3734         case RAM_SAVE_FLAG_EOS:
3735             /* normal exit */
3736             if (migrate_multifd_flush_after_each_section()) {
3737                 multifd_recv_sync_main();
3738             }
3739             break;
3740         default:
3741             error_report("Unknown combination of migration flags: 0x%x"
3742                          " (postcopy mode)", flags);
3743             ret = -EINVAL;
3744             break;
3745         }
3746 
3747         /* Got the whole host page, wait for decompress before placing. */
3748         if (place_needed) {
3749             ret |= wait_for_decompress_done();
3750         }
3751 
3752         /* Detect for any possible file errors */
3753         if (!ret && qemu_file_get_error(f)) {
3754             ret = qemu_file_get_error(f);
3755         }
3756 
3757         if (!ret && place_needed) {
3758             if (tmp_page->all_zero) {
3759                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3760             } else {
3761                 ret = postcopy_place_page(mis, tmp_page->host_addr,
3762                                           place_source, block);
3763             }
3764             place_needed = false;
3765             postcopy_temp_page_reset(tmp_page);
3766         }
3767     }
3768 
3769     return ret;
3770 }
3771 
3772 static bool postcopy_is_running(void)
3773 {
3774     PostcopyState ps = postcopy_state_get();
3775     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3776 }
3777 
3778 /*
3779  * Flush content of RAM cache into SVM's memory.
3780  * Only flush the pages that be dirtied by PVM or SVM or both.
3781  */
3782 void colo_flush_ram_cache(void)
3783 {
3784     RAMBlock *block = NULL;
3785     void *dst_host;
3786     void *src_host;
3787     unsigned long offset = 0;
3788 
3789     memory_global_dirty_log_sync(false);
3790     qemu_mutex_lock(&ram_state->bitmap_mutex);
3791     WITH_RCU_READ_LOCK_GUARD() {
3792         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3793             ramblock_sync_dirty_bitmap(ram_state, block);
3794         }
3795     }
3796 
3797     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3798     WITH_RCU_READ_LOCK_GUARD() {
3799         block = QLIST_FIRST_RCU(&ram_list.blocks);
3800 
3801         while (block) {
3802             unsigned long num = 0;
3803 
3804             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3805             if (!offset_in_ramblock(block,
3806                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3807                 offset = 0;
3808                 num = 0;
3809                 block = QLIST_NEXT_RCU(block, next);
3810             } else {
3811                 unsigned long i = 0;
3812 
3813                 for (i = 0; i < num; i++) {
3814                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3815                 }
3816                 dst_host = block->host
3817                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3818                 src_host = block->colo_cache
3819                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3820                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3821                 offset += num;
3822             }
3823         }
3824     }
3825     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3826     trace_colo_flush_ram_cache_end();
3827 }
3828 
3829 /**
3830  * ram_load_precopy: load pages in precopy case
3831  *
3832  * Returns 0 for success or -errno in case of error
3833  *
3834  * Called in precopy mode by ram_load().
3835  * rcu_read_lock is taken prior to this being called.
3836  *
3837  * @f: QEMUFile where to send the data
3838  */
3839 static int ram_load_precopy(QEMUFile *f)
3840 {
3841     MigrationIncomingState *mis = migration_incoming_get_current();
3842     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3843     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3844     bool postcopy_advised = migration_incoming_postcopy_advised();
3845     if (!migrate_compress()) {
3846         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3847     }
3848 
3849     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3850         ram_addr_t addr, total_ram_bytes;
3851         void *host = NULL, *host_bak = NULL;
3852         uint8_t ch;
3853 
3854         /*
3855          * Yield periodically to let main loop run, but an iteration of
3856          * the main loop is expensive, so do it each some iterations
3857          */
3858         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3859             aio_co_schedule(qemu_get_current_aio_context(),
3860                             qemu_coroutine_self());
3861             qemu_coroutine_yield();
3862         }
3863         i++;
3864 
3865         addr = qemu_get_be64(f);
3866         flags = addr & ~TARGET_PAGE_MASK;
3867         addr &= TARGET_PAGE_MASK;
3868 
3869         if (flags & invalid_flags) {
3870             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3871                 error_report("Received an unexpected compressed page");
3872             }
3873 
3874             ret = -EINVAL;
3875             break;
3876         }
3877 
3878         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3879                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3880             RAMBlock *block = ram_block_from_stream(mis, f, flags,
3881                                                     RAM_CHANNEL_PRECOPY);
3882 
3883             host = host_from_ram_block_offset(block, addr);
3884             /*
3885              * After going into COLO stage, we should not load the page
3886              * into SVM's memory directly, we put them into colo_cache firstly.
3887              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3888              * Previously, we copied all these memory in preparing stage of COLO
3889              * while we need to stop VM, which is a time-consuming process.
3890              * Here we optimize it by a trick, back-up every page while in
3891              * migration process while COLO is enabled, though it affects the
3892              * speed of the migration, but it obviously reduce the downtime of
3893              * back-up all SVM'S memory in COLO preparing stage.
3894              */
3895             if (migration_incoming_colo_enabled()) {
3896                 if (migration_incoming_in_colo_state()) {
3897                     /* In COLO stage, put all pages into cache temporarily */
3898                     host = colo_cache_from_block_offset(block, addr, true);
3899                 } else {
3900                    /*
3901                     * In migration stage but before COLO stage,
3902                     * Put all pages into both cache and SVM's memory.
3903                     */
3904                     host_bak = colo_cache_from_block_offset(block, addr, false);
3905                 }
3906             }
3907             if (!host) {
3908                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3909                 ret = -EINVAL;
3910                 break;
3911             }
3912             if (!migration_incoming_in_colo_state()) {
3913                 ramblock_recv_bitmap_set(block, host);
3914             }
3915 
3916             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3917         }
3918 
3919         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3920         case RAM_SAVE_FLAG_MEM_SIZE:
3921             /* Synchronize RAM block list */
3922             total_ram_bytes = addr;
3923             while (!ret && total_ram_bytes) {
3924                 RAMBlock *block;
3925                 char id[256];
3926                 ram_addr_t length;
3927 
3928                 len = qemu_get_byte(f);
3929                 qemu_get_buffer(f, (uint8_t *)id, len);
3930                 id[len] = 0;
3931                 length = qemu_get_be64(f);
3932 
3933                 block = qemu_ram_block_by_name(id);
3934                 if (block && !qemu_ram_is_migratable(block)) {
3935                     error_report("block %s should not be migrated !", id);
3936                     ret = -EINVAL;
3937                 } else if (block) {
3938                     if (length != block->used_length) {
3939                         Error *local_err = NULL;
3940 
3941                         ret = qemu_ram_resize(block, length,
3942                                               &local_err);
3943                         if (local_err) {
3944                             error_report_err(local_err);
3945                         }
3946                     }
3947                     /* For postcopy we need to check hugepage sizes match */
3948                     if (postcopy_advised && migrate_postcopy_ram() &&
3949                         block->page_size != qemu_host_page_size) {
3950                         uint64_t remote_page_size = qemu_get_be64(f);
3951                         if (remote_page_size != block->page_size) {
3952                             error_report("Mismatched RAM page size %s "
3953                                          "(local) %zd != %" PRId64,
3954                                          id, block->page_size,
3955                                          remote_page_size);
3956                             ret = -EINVAL;
3957                         }
3958                     }
3959                     if (migrate_ignore_shared()) {
3960                         hwaddr addr = qemu_get_be64(f);
3961                         if (ramblock_is_ignored(block) &&
3962                             block->mr->addr != addr) {
3963                             error_report("Mismatched GPAs for block %s "
3964                                          "%" PRId64 "!= %" PRId64,
3965                                          id, (uint64_t)addr,
3966                                          (uint64_t)block->mr->addr);
3967                             ret = -EINVAL;
3968                         }
3969                     }
3970                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3971                                           block->idstr);
3972                 } else {
3973                     error_report("Unknown ramblock \"%s\", cannot "
3974                                  "accept migration", id);
3975                     ret = -EINVAL;
3976                 }
3977 
3978                 total_ram_bytes -= length;
3979             }
3980             break;
3981 
3982         case RAM_SAVE_FLAG_ZERO:
3983             ch = qemu_get_byte(f);
3984             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3985             break;
3986 
3987         case RAM_SAVE_FLAG_PAGE:
3988             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3989             break;
3990 
3991         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3992             len = qemu_get_be32(f);
3993             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3994                 error_report("Invalid compressed data length: %d", len);
3995                 ret = -EINVAL;
3996                 break;
3997             }
3998             decompress_data_with_multi_threads(f, host, len);
3999             break;
4000 
4001         case RAM_SAVE_FLAG_XBZRLE:
4002             if (load_xbzrle(f, addr, host) < 0) {
4003                 error_report("Failed to decompress XBZRLE page at "
4004                              RAM_ADDR_FMT, addr);
4005                 ret = -EINVAL;
4006                 break;
4007             }
4008             break;
4009         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4010             multifd_recv_sync_main();
4011             break;
4012         case RAM_SAVE_FLAG_EOS:
4013             /* normal exit */
4014             if (migrate_multifd_flush_after_each_section()) {
4015                 multifd_recv_sync_main();
4016             }
4017             break;
4018         case RAM_SAVE_FLAG_HOOK:
4019             ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4020             break;
4021         default:
4022             error_report("Unknown combination of migration flags: 0x%x", flags);
4023             ret = -EINVAL;
4024         }
4025         if (!ret) {
4026             ret = qemu_file_get_error(f);
4027         }
4028         if (!ret && host_bak) {
4029             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4030         }
4031     }
4032 
4033     ret |= wait_for_decompress_done();
4034     return ret;
4035 }
4036 
4037 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4038 {
4039     int ret = 0;
4040     static uint64_t seq_iter;
4041     /*
4042      * If system is running in postcopy mode, page inserts to host memory must
4043      * be atomic
4044      */
4045     bool postcopy_running = postcopy_is_running();
4046 
4047     seq_iter++;
4048 
4049     if (version_id != 4) {
4050         return -EINVAL;
4051     }
4052 
4053     /*
4054      * This RCU critical section can be very long running.
4055      * When RCU reclaims in the code start to become numerous,
4056      * it will be necessary to reduce the granularity of this
4057      * critical section.
4058      */
4059     WITH_RCU_READ_LOCK_GUARD() {
4060         if (postcopy_running) {
4061             /*
4062              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4063              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4064              * service fast page faults.
4065              */
4066             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4067         } else {
4068             ret = ram_load_precopy(f);
4069         }
4070     }
4071     trace_ram_load_complete(ret, seq_iter);
4072 
4073     return ret;
4074 }
4075 
4076 static bool ram_has_postcopy(void *opaque)
4077 {
4078     RAMBlock *rb;
4079     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4080         if (ramblock_is_pmem(rb)) {
4081             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4082                          "is not supported now!", rb->idstr, rb->host);
4083             return false;
4084         }
4085     }
4086 
4087     return migrate_postcopy_ram();
4088 }
4089 
4090 /* Sync all the dirty bitmap with destination VM.  */
4091 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4092 {
4093     RAMBlock *block;
4094     QEMUFile *file = s->to_dst_file;
4095     int ramblock_count = 0;
4096 
4097     trace_ram_dirty_bitmap_sync_start();
4098 
4099     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4100         qemu_savevm_send_recv_bitmap(file, block->idstr);
4101         trace_ram_dirty_bitmap_request(block->idstr);
4102         ramblock_count++;
4103     }
4104 
4105     trace_ram_dirty_bitmap_sync_wait();
4106 
4107     /* Wait until all the ramblocks' dirty bitmap synced */
4108     while (ramblock_count--) {
4109         qemu_sem_wait(&s->rp_state.rp_sem);
4110     }
4111 
4112     trace_ram_dirty_bitmap_sync_complete();
4113 
4114     return 0;
4115 }
4116 
4117 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4118 {
4119     qemu_sem_post(&s->rp_state.rp_sem);
4120 }
4121 
4122 /*
4123  * Read the received bitmap, revert it as the initial dirty bitmap.
4124  * This is only used when the postcopy migration is paused but wants
4125  * to resume from a middle point.
4126  */
4127 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4128 {
4129     int ret = -EINVAL;
4130     /* from_dst_file is always valid because we're within rp_thread */
4131     QEMUFile *file = s->rp_state.from_dst_file;
4132     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4133     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4134     uint64_t size, end_mark;
4135 
4136     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4137 
4138     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4139         error_report("%s: incorrect state %s", __func__,
4140                      MigrationStatus_str(s->state));
4141         return -EINVAL;
4142     }
4143 
4144     /*
4145      * Note: see comments in ramblock_recv_bitmap_send() on why we
4146      * need the endianness conversion, and the paddings.
4147      */
4148     local_size = ROUND_UP(local_size, 8);
4149 
4150     /* Add paddings */
4151     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4152 
4153     size = qemu_get_be64(file);
4154 
4155     /* The size of the bitmap should match with our ramblock */
4156     if (size != local_size) {
4157         error_report("%s: ramblock '%s' bitmap size mismatch "
4158                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4159                      block->idstr, size, local_size);
4160         ret = -EINVAL;
4161         goto out;
4162     }
4163 
4164     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4165     end_mark = qemu_get_be64(file);
4166 
4167     ret = qemu_file_get_error(file);
4168     if (ret || size != local_size) {
4169         error_report("%s: read bitmap failed for ramblock '%s': %d"
4170                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4171                      __func__, block->idstr, ret, local_size, size);
4172         ret = -EIO;
4173         goto out;
4174     }
4175 
4176     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4177         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4178                      __func__, block->idstr, end_mark);
4179         ret = -EINVAL;
4180         goto out;
4181     }
4182 
4183     /*
4184      * Endianness conversion. We are during postcopy (though paused).
4185      * The dirty bitmap won't change. We can directly modify it.
4186      */
4187     bitmap_from_le(block->bmap, le_bitmap, nbits);
4188 
4189     /*
4190      * What we received is "received bitmap". Revert it as the initial
4191      * dirty bitmap for this ramblock.
4192      */
4193     bitmap_complement(block->bmap, block->bmap, nbits);
4194 
4195     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4196     ramblock_dirty_bitmap_clear_discarded_pages(block);
4197 
4198     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4199     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4200 
4201     /*
4202      * We succeeded to sync bitmap for current ramblock. If this is
4203      * the last one to sync, we need to notify the main send thread.
4204      */
4205     ram_dirty_bitmap_reload_notify(s);
4206 
4207     ret = 0;
4208 out:
4209     g_free(le_bitmap);
4210     return ret;
4211 }
4212 
4213 static int ram_resume_prepare(MigrationState *s, void *opaque)
4214 {
4215     RAMState *rs = *(RAMState **)opaque;
4216     int ret;
4217 
4218     ret = ram_dirty_bitmap_sync_all(s, rs);
4219     if (ret) {
4220         return ret;
4221     }
4222 
4223     ram_state_resume_prepare(rs, s->to_dst_file);
4224 
4225     return 0;
4226 }
4227 
4228 void postcopy_preempt_shutdown_file(MigrationState *s)
4229 {
4230     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4231     qemu_fflush(s->postcopy_qemufile_src);
4232 }
4233 
4234 static SaveVMHandlers savevm_ram_handlers = {
4235     .save_setup = ram_save_setup,
4236     .save_live_iterate = ram_save_iterate,
4237     .save_live_complete_postcopy = ram_save_complete,
4238     .save_live_complete_precopy = ram_save_complete,
4239     .has_postcopy = ram_has_postcopy,
4240     .state_pending_exact = ram_state_pending_exact,
4241     .state_pending_estimate = ram_state_pending_estimate,
4242     .load_state = ram_load,
4243     .save_cleanup = ram_save_cleanup,
4244     .load_setup = ram_load_setup,
4245     .load_cleanup = ram_load_cleanup,
4246     .resume_prepare = ram_resume_prepare,
4247 };
4248 
4249 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4250                                       size_t old_size, size_t new_size)
4251 {
4252     PostcopyState ps = postcopy_state_get();
4253     ram_addr_t offset;
4254     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4255     Error *err = NULL;
4256 
4257     if (ramblock_is_ignored(rb)) {
4258         return;
4259     }
4260 
4261     if (!migration_is_idle()) {
4262         /*
4263          * Precopy code on the source cannot deal with the size of RAM blocks
4264          * changing at random points in time - especially after sending the
4265          * RAM block sizes in the migration stream, they must no longer change.
4266          * Abort and indicate a proper reason.
4267          */
4268         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4269         migration_cancel(err);
4270         error_free(err);
4271     }
4272 
4273     switch (ps) {
4274     case POSTCOPY_INCOMING_ADVISE:
4275         /*
4276          * Update what ram_postcopy_incoming_init()->init_range() does at the
4277          * time postcopy was advised. Syncing RAM blocks with the source will
4278          * result in RAM resizes.
4279          */
4280         if (old_size < new_size) {
4281             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4282                 error_report("RAM block '%s' discard of resized RAM failed",
4283                              rb->idstr);
4284             }
4285         }
4286         rb->postcopy_length = new_size;
4287         break;
4288     case POSTCOPY_INCOMING_NONE:
4289     case POSTCOPY_INCOMING_RUNNING:
4290     case POSTCOPY_INCOMING_END:
4291         /*
4292          * Once our guest is running, postcopy does no longer care about
4293          * resizes. When growing, the new memory was not available on the
4294          * source, no handler needed.
4295          */
4296         break;
4297     default:
4298         error_report("RAM block '%s' resized during postcopy state: %d",
4299                      rb->idstr, ps);
4300         exit(-1);
4301     }
4302 }
4303 
4304 static RAMBlockNotifier ram_mig_ram_notifier = {
4305     .ram_block_resized = ram_mig_ram_block_resized,
4306 };
4307 
4308 void ram_mig_init(void)
4309 {
4310     qemu_mutex_init(&XBZRLE.lock);
4311     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4312     ram_block_notifier_add(&ram_mig_ram_notifier);
4313 }
4314